aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig141
-rw-r--r--arch/x86/Kconfig.cpu3
-rw-r--r--arch/x86/Kconfig.debug24
-rw-r--r--arch/x86/boot/video-vga.c4
-rw-r--r--arch/x86/boot/video.c2
-rw-r--r--arch/x86/configs/i386_defconfig4
-rw-r--r--arch/x86/configs/x86_64_defconfig4
-rw-r--r--arch/x86/crypto/crc32c-intel.c121
-rw-r--r--arch/x86/ia32/ia32_aout.c2
-rw-r--r--arch/x86/ia32/ia32_signal.c110
-rw-r--r--arch/x86/ia32/ipc32.c1
-rw-r--r--arch/x86/ia32/sys_ia32.c2
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h61
-rw-r--r--arch/x86/include/asm/apic.h4
-rw-r--r--arch/x86/include/asm/bigsmp/apic.h34
-rw-r--r--arch/x86/include/asm/bigsmp/ipi.h13
-rw-r--r--arch/x86/include/asm/bitops.h10
-rw-r--r--arch/x86/include/asm/bug.h2
-rw-r--r--arch/x86/include/asm/byteorder.h74
-rw-r--r--arch/x86/include/asm/cpufeature.h5
-rw-r--r--arch/x86/include/asm/desc.h10
-rw-r--r--arch/x86/include/asm/dma-mapping.h4
-rw-r--r--arch/x86/include/asm/ds.h312
-rw-r--r--arch/x86/include/asm/dwarf2.h97
-rw-r--r--arch/x86/include/asm/efi.h1
-rw-r--r--arch/x86/include/asm/elf.h2
-rw-r--r--arch/x86/include/asm/emergency-restart.h4
-rw-r--r--arch/x86/include/asm/es7000/apic.h123
-rw-r--r--arch/x86/include/asm/es7000/ipi.h12
-rw-r--r--arch/x86/include/asm/es7000/wakecpu.h41
-rw-r--r--arch/x86/include/asm/ftrace.h61
-rw-r--r--arch/x86/include/asm/gart.h33
-rw-r--r--arch/x86/include/asm/genapic_32.h32
-rw-r--r--arch/x86/include/asm/genapic_64.h16
-rw-r--r--arch/x86/include/asm/hardirq_32.h2
-rw-r--r--arch/x86/include/asm/hardirq_64.h2
-rw-r--r--arch/x86/include/asm/hw_irq.h4
-rw-r--r--arch/x86/include/asm/hypervisor.h26
-rw-r--r--arch/x86/include/asm/ia32.h18
-rw-r--r--arch/x86/include/asm/idle.h5
-rw-r--r--arch/x86/include/asm/io.h37
-rw-r--r--arch/x86/include/asm/io_64.h2
-rw-r--r--arch/x86/include/asm/io_apic.h19
-rw-r--r--arch/x86/include/asm/iommu.h35
-rw-r--r--arch/x86/include/asm/ipi.h23
-rw-r--r--arch/x86/include/asm/irq.h7
-rw-r--r--arch/x86/include/asm/irq_regs_32.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h11
-rw-r--r--arch/x86/include/asm/kexec.h31
-rw-r--r--arch/x86/include/asm/kvm_host.h47
-rw-r--r--arch/x86/include/asm/kvm_x86_emulate.h11
-rw-r--r--arch/x86/include/asm/lguest.h2
-rw-r--r--arch/x86/include/asm/linkage.h60
-rw-r--r--arch/x86/include/asm/mach-default/mach_apic.h30
-rw-r--r--arch/x86/include/asm/mach-default/mach_ipi.h18
-rw-r--r--arch/x86/include/asm/mach-default/mach_wakecpu.h24
-rw-r--r--arch/x86/include/asm/mach-default/smpboot_hooks.h8
-rw-r--r--arch/x86/include/asm/mach-generic/mach_apic.h2
-rw-r--r--arch/x86/include/asm/mach-generic/mach_wakecpu.h12
-rw-r--r--arch/x86/include/asm/mmu_context_32.h13
-rw-r--r--arch/x86/include/asm/mpspec.h2
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/msr.h15
-rw-r--r--arch/x86/include/asm/mtrr.h25
-rw-r--r--arch/x86/include/asm/numaq/apic.h16
-rw-r--r--arch/x86/include/asm/numaq/ipi.h13
-rw-r--r--arch/x86/include/asm/numaq/wakecpu.h24
-rw-r--r--arch/x86/include/asm/pci.h14
-rw-r--r--arch/x86/include/asm/pci_64.h1
-rw-r--r--arch/x86/include/asm/pci_x86.h (renamed from arch/x86/pci/pci.h)18
-rw-r--r--arch/x86/include/asm/pgtable-2level.h50
-rw-r--r--arch/x86/include/asm/pgtable-3level.h1
-rw-r--r--arch/x86/include/asm/pgtable.h28
-rw-r--r--arch/x86/include/asm/pgtable_32.h9
-rw-r--r--arch/x86/include/asm/pgtable_64.h28
-rw-r--r--arch/x86/include/asm/prctl.h3
-rw-r--r--arch/x86/include/asm/processor.h17
-rw-r--r--arch/x86/include/asm/ptrace.h43
-rw-r--r--arch/x86/include/asm/reboot.h5
-rw-r--r--arch/x86/include/asm/setup.h7
-rw-r--r--arch/x86/include/asm/sigframe.h70
-rw-r--r--arch/x86/include/asm/signal.h6
-rw-r--r--arch/x86/include/asm/smp.h6
-rw-r--r--arch/x86/include/asm/sparsemem.h2
-rw-r--r--arch/x86/include/asm/summit/apic.h39
-rw-r--r--arch/x86/include/asm/summit/ipi.h9
-rw-r--r--arch/x86/include/asm/svm.h (renamed from arch/x86/kvm/svm.h)0
-rw-r--r--arch/x86/include/asm/sys_ia32.h101
-rw-r--r--arch/x86/include/asm/syscalls.h16
-rw-r--r--arch/x86/include/asm/system.h6
-rw-r--r--arch/x86/include/asm/thread_info.h9
-rw-r--r--arch/x86/include/asm/topology.h38
-rw-r--r--arch/x86/include/asm/trampoline.h7
-rw-r--r--arch/x86/include/asm/traps.h11
-rw-r--r--arch/x86/include/asm/tsc.h8
-rw-r--r--arch/x86/include/asm/uaccess.h6
-rw-r--r--arch/x86/include/asm/uaccess_32.h8
-rw-r--r--arch/x86/include/asm/uaccess_64.h6
-rw-r--r--arch/x86/include/asm/uv/bios.h34
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h46
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h103
-rw-r--r--arch/x86/include/asm/virtext.h132
-rw-r--r--arch/x86/include/asm/vmi.h8
-rw-r--r--arch/x86/include/asm/vmware.h27
-rw-r--r--arch/x86/include/asm/vmx.h (renamed from arch/x86/kvm/vmx.h)27
-rw-r--r--arch/x86/include/asm/xen/hypercall.h6
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h39
-rw-r--r--arch/x86/include/asm/xen/page.h5
-rw-r--r--arch/x86/kernel/Makefile11
-rw-r--r--arch/x86/kernel/acpi/boot.c42
-rw-r--r--arch/x86/kernel/amd_iommu.c679
-rw-r--r--arch/x86/kernel/amd_iommu_init.c27
-rw-r--r--arch/x86/kernel/aperture_64.c5
-rw-r--r--arch/x86/kernel/apic.c177
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c4
-rw-r--r--arch/x86/kernel/bios_uv.c60
-rw-r--r--arch/x86/kernel/check.c161
-rw-r--r--arch/x86/kernel/cpu/Makefile6
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c8
-rw-r--r--arch/x86/kernel/cpu/amd.c9
-rw-r--r--arch/x86/kernel/cpu/common.c10
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c32
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c24
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c58
-rw-r--r--arch/x86/kernel/cpu/intel.c23
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c62
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c110
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c12
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c356
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h18
-rw-r--r--arch/x86/kernel/cpu/vmware.c112
-rw-r--r--arch/x86/kernel/cpuid.c8
-rw-r--r--arch/x86/kernel/crash.c86
-rw-r--r--arch/x86/kernel/ds.c1147
-rw-r--r--arch/x86/kernel/dumpstack.c351
-rw-r--r--arch/x86/kernel/dumpstack.h39
-rw-r--r--arch/x86/kernel/dumpstack_32.c307
-rw-r--r--arch/x86/kernel/dumpstack_64.c289
-rw-r--r--arch/x86/kernel/e820.c16
-rw-r--r--arch/x86/kernel/early-quirks.c1
-rw-r--r--arch/x86/kernel/early_printk.c49
-rw-r--r--arch/x86/kernel/entry_32.S528
-rw-r--r--arch/x86/kernel/entry_64.S1458
-rw-r--r--arch/x86/kernel/es7000_32.c62
-rw-r--r--arch/x86/kernel/ftrace.c390
-rw-r--r--arch/x86/kernel/genapic_64.c4
-rw-r--r--arch/x86/kernel/genapic_flat_64.c107
-rw-r--r--arch/x86/kernel/genx2apic_cluster.c81
-rw-r--r--arch/x86/kernel/genx2apic_phys.c78
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c172
-rw-r--r--arch/x86/kernel/head.c1
-rw-r--r--arch/x86/kernel/head32.c3
-rw-r--r--arch/x86/kernel/head64.c5
-rw-r--r--arch/x86/kernel/hpet.c19
-rw-r--r--arch/x86/kernel/i8253.c2
-rw-r--r--arch/x86/kernel/init_task.c2
-rw-r--r--arch/x86/kernel/io_apic.c1026
-rw-r--r--arch/x86/kernel/ipi.c28
-rw-r--r--arch/x86/kernel/irq.c6
-rw-r--r--arch/x86/kernel/irq_32.c15
-rw-r--r--arch/x86/kernel/irq_64.c44
-rw-r--r--arch/x86/kernel/irqinit_32.c21
-rw-r--r--arch/x86/kernel/irqinit_64.c82
-rw-r--r--arch/x86/kernel/kvmclock.c10
-rw-r--r--arch/x86/kernel/ldt.c4
-rw-r--r--arch/x86/kernel/machine_kexec_32.c104
-rw-r--r--arch/x86/kernel/mfgpt_32.c2
-rw-r--r--arch/x86/kernel/microcode_amd.c232
-rw-r--r--arch/x86/kernel/microcode_core.c25
-rw-r--r--arch/x86/kernel/microcode_intel.c8
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c3
-rw-r--r--arch/x86/kernel/mpparse.c35
-rw-r--r--arch/x86/kernel/msr.c2
-rw-r--r--arch/x86/kernel/nmi.c61
-rw-r--r--arch/x86/kernel/numaq_32.c10
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c3
-rw-r--r--arch/x86/kernel/pci-dma.c24
-rw-r--r--arch/x86/kernel/pci-gart_64.c6
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c29
-rw-r--r--arch/x86/kernel/process.c35
-rw-r--r--arch/x86/kernel/process_32.c67
-rw-r--r--arch/x86/kernel/process_64.c58
-rw-r--r--arch/x86/kernel/ptrace.c432
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c189
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S115
-rw-r--r--arch/x86/kernel/setup.c183
-rw-r--r--arch/x86/kernel/setup_percpu.c36
-rw-r--r--arch/x86/kernel/sigframe.h42
-rw-r--r--arch/x86/kernel/signal.c (renamed from arch/x86/kernel/signal_32.c)567
-rw-r--r--arch/x86/kernel/signal_64.c516
-rw-r--r--arch/x86/kernel/smp.c39
-rw-r--r--arch/x86/kernel/smpboot.c75
-rw-r--r--arch/x86/kernel/stacktrace.c64
-rw-r--r--arch/x86/kernel/time_32.c2
-rw-r--r--arch/x86/kernel/time_64.c6
-rw-r--r--arch/x86/kernel/tlb_32.c15
-rw-r--r--arch/x86/kernel/tlb_64.c4
-rw-r--r--arch/x86/kernel/tlb_uv.c13
-rw-r--r--arch/x86/kernel/trampoline.c19
-rw-r--r--arch/x86/kernel/traps.c75
-rw-r--r--arch/x86/kernel/tsc.c42
-rw-r--r--arch/x86/kernel/tsc_sync.c8
-rw-r--r--arch/x86/kernel/vmi_32.c135
-rw-r--r--arch/x86/kernel/vmiclock_32.c2
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S1
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S1
-rw-r--r--arch/x86/kernel/vsyscall_64.c12
-rw-r--r--arch/x86/kernel/xsave.c2
-rw-r--r--arch/x86/kvm/Makefile4
-rw-r--r--arch/x86/kvm/i8254.c19
-rw-r--r--arch/x86/kvm/i8259.c52
-rw-r--r--arch/x86/kvm/irq.h6
-rw-r--r--arch/x86/kvm/kvm_svm.h2
-rw-r--r--arch/x86/kvm/lapic.c58
-rw-r--r--arch/x86/kvm/mmu.c444
-rw-r--r--arch/x86/kvm/paging_tmpl.h44
-rw-r--r--arch/x86/kvm/svm.c48
-rw-r--r--arch/x86/kvm/vmx.c350
-rw-r--r--arch/x86/kvm/x86.c120
-rw-r--r--arch/x86/kvm/x86_emulate.c297
-rw-r--r--arch/x86/lguest/boot.c5
-rw-r--r--arch/x86/lguest/i386_head.S15
-rw-r--r--arch/x86/lib/usercopy_32.c8
-rw-r--r--arch/x86/lib/usercopy_64.c4
-rw-r--r--arch/x86/mach-default/setup.c15
-rw-r--r--arch/x86/mach-generic/bigsmp.c6
-rw-r--r--arch/x86/mach-generic/default.c1
-rw-r--r--arch/x86/mach-generic/es7000.c19
-rw-r--r--arch/x86/mach-generic/numaq.c5
-rw-r--r--arch/x86/mach-generic/probe.c16
-rw-r--r--arch/x86/mach-generic/summit.c6
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c16
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/fault.c15
-rw-r--r--arch/x86/mm/init_32.c41
-rw-r--r--arch/x86/mm/init_64.c2
-rw-r--r--arch/x86/mm/ioremap.c3
-rw-r--r--arch/x86/mm/numa_64.c4
-rw-r--r--arch/x86/mm/pat.c236
-rw-r--r--arch/x86/mm/srat_64.c2
-rw-r--r--arch/x86/oprofile/op_model_amd.c89
-rw-r--r--arch/x86/pci/acpi.c2
-rw-r--r--arch/x86/pci/amd_bus.c2
-rw-r--r--arch/x86/pci/common.c20
-rw-r--r--arch/x86/pci/direct.c6
-rw-r--r--arch/x86/pci/early.c2
-rw-r--r--arch/x86/pci/fixup.c3
-rw-r--r--arch/x86/pci/i386.c2
-rw-r--r--arch/x86/pci/init.c2
-rw-r--r--arch/x86/pci/irq.c3
-rw-r--r--arch/x86/pci/legacy.c2
-rw-r--r--arch/x86/pci/mmconfig-shared.c3
-rw-r--r--arch/x86/pci/mmconfig_32.c2
-rw-r--r--arch/x86/pci/mmconfig_64.c3
-rw-r--r--arch/x86/pci/numaq_32.c2
-rw-r--r--arch/x86/pci/olpc.c2
-rw-r--r--arch/x86/pci/pcbios.c5
-rw-r--r--arch/x86/pci/visws.c3
-rw-r--r--arch/x86/scripts/strip-symbols1
-rw-r--r--arch/x86/vdso/vclock_gettime.c3
-rw-r--r--arch/x86/vdso/vdso32-setup.c2
-rw-r--r--arch/x86/vdso/vma.c2
-rw-r--r--arch/x86/xen/enlighten.c17
-rw-r--r--arch/x86/xen/mmu.c37
-rw-r--r--arch/x86/xen/multicalls.c2
-rw-r--r--arch/x86/xen/setup.c9
-rw-r--r--arch/x86/xen/smp.c27
-rw-r--r--arch/x86/xen/suspend.c3
-rw-r--r--arch/x86/xen/time.c12
-rw-r--r--arch/x86/xen/xen-ops.h2
276 files changed, 10069 insertions, 6751 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ac22bb7719f7..862adb9bf0d4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -19,6 +19,8 @@ config X86_64
19config X86 19config X86
20 def_bool y 20 def_bool y
21 select HAVE_AOUT if X86_32 21 select HAVE_AOUT if X86_32
22 select HAVE_READQ
23 select HAVE_WRITEQ
22 select HAVE_UNSTABLE_SCHED_CLOCK 24 select HAVE_UNSTABLE_SCHED_CLOCK
23 select HAVE_IDE 25 select HAVE_IDE
24 select HAVE_OPROFILE 26 select HAVE_OPROFILE
@@ -29,11 +31,14 @@ config X86
29 select HAVE_FTRACE_MCOUNT_RECORD 31 select HAVE_FTRACE_MCOUNT_RECORD
30 select HAVE_DYNAMIC_FTRACE 32 select HAVE_DYNAMIC_FTRACE
31 select HAVE_FUNCTION_TRACER 33 select HAVE_FUNCTION_TRACER
34 select HAVE_FUNCTION_GRAPH_TRACER
35 select HAVE_FUNCTION_TRACE_MCOUNT_TEST
32 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 36 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
33 select HAVE_ARCH_KGDB if !X86_VOYAGER 37 select HAVE_ARCH_KGDB if !X86_VOYAGER
34 select HAVE_ARCH_TRACEHOOK 38 select HAVE_ARCH_TRACEHOOK
35 select HAVE_GENERIC_DMA_COHERENT if X86_32 39 select HAVE_GENERIC_DMA_COHERENT if X86_32
36 select HAVE_EFFICIENT_UNALIGNED_ACCESS 40 select HAVE_EFFICIENT_UNALIGNED_ACCESS
41 select USER_STACKTRACE_SUPPORT
37 42
38config ARCH_DEFCONFIG 43config ARCH_DEFCONFIG
39 string 44 string
@@ -87,6 +92,10 @@ config GENERIC_IOMAP
87config GENERIC_BUG 92config GENERIC_BUG
88 def_bool y 93 def_bool y
89 depends on BUG 94 depends on BUG
95 select GENERIC_BUG_RELATIVE_POINTERS if X86_64
96
97config GENERIC_BUG_RELATIVE_POINTERS
98 bool
90 99
91config GENERIC_HWEIGHT 100config GENERIC_HWEIGHT
92 def_bool y 101 def_bool y
@@ -238,25 +247,39 @@ config X86_HAS_BOOT_CPU_ID
238 def_bool y 247 def_bool y
239 depends on X86_VOYAGER 248 depends on X86_VOYAGER
240 249
250config SPARSE_IRQ
251 bool "Support sparse irq numbering"
252 depends on PCI_MSI || HT_IRQ
253 help
254 This enables support for sparse irqs. This is useful for distro
255 kernels that want to define a high CONFIG_NR_CPUS value but still
256 want to have low kernel memory footprint on smaller machines.
257
258 ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
259 out the irq_desc[] array in a more NUMA-friendly way. )
260
261 If you don't know what to do here, say N.
262
263config NUMA_MIGRATE_IRQ_DESC
264 bool "Move irq desc when changing irq smp_affinity"
265 depends on SPARSE_IRQ && NUMA
266 default n
267 help
268 This enables moving irq_desc to cpu/node that irq will use handled.
269
270 If you don't know what to do here, say N.
271
241config X86_FIND_SMP_CONFIG 272config X86_FIND_SMP_CONFIG
242 def_bool y 273 def_bool y
243 depends on X86_MPPARSE || X86_VOYAGER 274 depends on X86_MPPARSE || X86_VOYAGER
244 275
245if ACPI
246config X86_MPPARSE 276config X86_MPPARSE
247 def_bool y 277 bool "Enable MPS table" if ACPI
248 bool "Enable MPS table" 278 default y
249 depends on X86_LOCAL_APIC 279 depends on X86_LOCAL_APIC
250 help 280 help
251 For old smp systems that do not have proper acpi support. Newer systems 281 For old smp systems that do not have proper acpi support. Newer systems
252 (esp with 64bit cpus) with acpi support, MADT and DSDT will override it 282 (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
253endif
254
255if !ACPI
256config X86_MPPARSE
257 def_bool y
258 depends on X86_LOCAL_APIC
259endif
260 283
261choice 284choice
262 prompt "Subarchitecture Type" 285 prompt "Subarchitecture Type"
@@ -367,10 +390,10 @@ config X86_RDC321X
367 as R-8610-(G). 390 as R-8610-(G).
368 If you don't have one of these chips, you should say N here. 391 If you don't have one of these chips, you should say N here.
369 392
370config SCHED_NO_NO_OMIT_FRAME_POINTER 393config SCHED_OMIT_FRAME_POINTER
371 def_bool y 394 def_bool y
372 prompt "Single-depth WCHAN output" 395 prompt "Single-depth WCHAN output"
373 depends on X86_32 396 depends on X86
374 help 397 help
375 Calculate simpler /proc/<PID>/wchan values. If this option 398 Calculate simpler /proc/<PID>/wchan values. If this option
376 is disabled then wchan values will recurse back to the 399 is disabled then wchan values will recurse back to the
@@ -465,10 +488,6 @@ config X86_CYCLONE_TIMER
465 def_bool y 488 def_bool y
466 depends on X86_GENERICARCH 489 depends on X86_GENERICARCH
467 490
468config ES7000_CLUSTERED_APIC
469 def_bool y
470 depends on SMP && X86_ES7000 && MPENTIUMIII
471
472source "arch/x86/Kconfig.cpu" 491source "arch/x86/Kconfig.cpu"
473 492
474config HPET_TIMER 493config HPET_TIMER
@@ -482,7 +501,7 @@ config HPET_TIMER
482 The HPET provides a stable time base on SMP 501 The HPET provides a stable time base on SMP
483 systems, unlike the TSC, but it is more expensive to access, 502 systems, unlike the TSC, but it is more expensive to access,
484 as it is off-chip. You can find the HPET spec at 503 as it is off-chip. You can find the HPET spec at
485 <http://www.intel.com/hardwaredesign/hpetspec.htm>. 504 <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
486 505
487 You can safely choose Y here. However, HPET will only be 506 You can safely choose Y here. However, HPET will only be
488 activated if the platform and the BIOS support this feature. 507 activated if the platform and the BIOS support this feature.
@@ -567,9 +586,19 @@ config AMD_IOMMU
567 your BIOS for an option to enable it or if you have an IVRS ACPI 586 your BIOS for an option to enable it or if you have an IVRS ACPI
568 table. 587 table.
569 588
589config AMD_IOMMU_STATS
590 bool "Export AMD IOMMU statistics to debugfs"
591 depends on AMD_IOMMU
592 select DEBUG_FS
593 help
594 This option enables code in the AMD IOMMU driver to collect various
595 statistics about whats happening in the driver and exports that
596 information to userspace via debugfs.
597 If unsure, say N.
598
570# need this always selected by IOMMU for the VIA workaround 599# need this always selected by IOMMU for the VIA workaround
571config SWIOTLB 600config SWIOTLB
572 bool 601 def_bool y if X86_64
573 help 602 help
574 Support for software bounce buffers used on x86-64 systems 603 Support for software bounce buffers used on x86-64 systems
575 which don't have a hardware IOMMU (e.g. the current generation 604 which don't have a hardware IOMMU (e.g. the current generation
@@ -580,21 +609,25 @@ config SWIOTLB
580config IOMMU_HELPER 609config IOMMU_HELPER
581 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) 610 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
582 611
612config IOMMU_API
613 def_bool (AMD_IOMMU || DMAR)
614
583config MAXSMP 615config MAXSMP
584 bool "Configure Maximum number of SMP Processors and NUMA Nodes" 616 bool "Configure Maximum number of SMP Processors and NUMA Nodes"
585 depends on X86_64 && SMP && BROKEN 617 depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
618 select CPUMASK_OFFSTACK
586 default n 619 default n
587 help 620 help
588 Configure maximum number of CPUS and NUMA Nodes for this architecture. 621 Configure maximum number of CPUS and NUMA Nodes for this architecture.
589 If unsure, say N. 622 If unsure, say N.
590 623
591config NR_CPUS 624config NR_CPUS
592 int "Maximum number of CPUs (2-512)" if !MAXSMP 625 int "Maximum number of CPUs" if SMP && !MAXSMP
593 range 2 512 626 range 2 512 if SMP && !MAXSMP
594 depends on SMP 627 default "1" if !SMP
595 default "4096" if MAXSMP 628 default "4096" if MAXSMP
596 default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 629 default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
597 default "8" 630 default "8" if SMP
598 help 631 help
599 This allows you to specify the maximum number of CPUs which this 632 This allows you to specify the maximum number of CPUs which this
600 kernel will support. The maximum supported value is 512 and the 633 kernel will support. The maximum supported value is 512 and the
@@ -660,6 +693,30 @@ config X86_VISWS_APIC
660 def_bool y 693 def_bool y
661 depends on X86_32 && X86_VISWS 694 depends on X86_32 && X86_VISWS
662 695
696config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
697 bool "Reroute for broken boot IRQs"
698 default n
699 depends on X86_IO_APIC
700 help
701 This option enables a workaround that fixes a source of
702 spurious interrupts. This is recommended when threaded
703 interrupt handling is used on systems where the generation of
704 superfluous "boot interrupts" cannot be disabled.
705
706 Some chipsets generate a legacy INTx "boot IRQ" when the IRQ
707 entry in the chipset's IO-APIC is masked (as, e.g. the RT
708 kernel does during interrupt handling). On chipsets where this
709 boot IRQ generation cannot be disabled, this workaround keeps
710 the original IRQ line masked so that only the equivalent "boot
711 IRQ" is delivered to the CPUs. The workaround also tells the
712 kernel to set up the IRQ handler on the boot IRQ line. In this
713 way only one interrupt is delivered to the kernel. Otherwise
714 the spurious second interrupt may cause the kernel to bring
715 down (vital) interrupt lines.
716
717 Only affects "broken" chipsets. Interrupt sharing may be
718 increased on these systems.
719
663config X86_MCE 720config X86_MCE
664 bool "Machine Check Exception" 721 bool "Machine Check Exception"
665 depends on !X86_VOYAGER 722 depends on !X86_VOYAGER
@@ -956,24 +1013,37 @@ config X86_PAE
956config ARCH_PHYS_ADDR_T_64BIT 1013config ARCH_PHYS_ADDR_T_64BIT
957 def_bool X86_64 || X86_PAE 1014 def_bool X86_64 || X86_PAE
958 1015
1016config DIRECT_GBPAGES
1017 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
1018 default y
1019 depends on X86_64
1020 help
1021 Allow the kernel linear mapping to use 1GB pages on CPUs that
1022 support it. This can improve the kernel's performance a tiny bit by
1023 reducing TLB pressure. If in doubt, say "Y".
1024
959# Common NUMA Features 1025# Common NUMA Features
960config NUMA 1026config NUMA
961 bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" 1027 bool "Numa Memory Allocation and Scheduler Support"
962 depends on SMP 1028 depends on SMP
963 depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) 1029 depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
964 default n if X86_PC 1030 default n if X86_PC
965 default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) 1031 default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
966 help 1032 help
967 Enable NUMA (Non Uniform Memory Access) support. 1033 Enable NUMA (Non Uniform Memory Access) support.
1034
968 The kernel will try to allocate memory used by a CPU on the 1035 The kernel will try to allocate memory used by a CPU on the
969 local memory controller of the CPU and add some more 1036 local memory controller of the CPU and add some more
970 NUMA awareness to the kernel. 1037 NUMA awareness to the kernel.
971 1038
972 For 32-bit this is currently highly experimental and should be only 1039 For 64-bit this is recommended if the system is Intel Core i7
973 used for kernel development. It might also cause boot failures. 1040 (or later), AMD Opteron, or EM64T NUMA.
974 For 64-bit this is recommended on all multiprocessor Opteron systems. 1041
975 If the system is EM64T, you should say N unless your system is 1042 For 32-bit this is only needed on (rare) 32-bit-only platforms
976 EM64T NUMA. 1043 that support NUMA topologies, such as NUMAQ / Summit, or if you
1044 boot a 32-bit kernel on a 64-bit NUMA platform.
1045
1046 Otherwise, you should say N.
977 1047
978comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" 1048comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
979 depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) 1049 depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
@@ -1493,6 +1563,10 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
1493 def_bool y 1563 def_bool y
1494 depends on X86_64 || (X86_32 && HIGHMEM) 1564 depends on X86_64 || (X86_32 && HIGHMEM)
1495 1565
1566config ARCH_ENABLE_MEMORY_HOTREMOVE
1567 def_bool y
1568 depends on MEMORY_HOTPLUG
1569
1496config HAVE_ARCH_EARLY_PFN_TO_NID 1570config HAVE_ARCH_EARLY_PFN_TO_NID
1497 def_bool X86_64 1571 def_bool X86_64
1498 depends on NUMA 1572 depends on NUMA
@@ -1632,13 +1706,6 @@ config APM_ALLOW_INTS
1632 many of the newer IBM Thinkpads. If you experience hangs when you 1706 many of the newer IBM Thinkpads. If you experience hangs when you
1633 suspend, try setting this to Y. Otherwise, say N. 1707 suspend, try setting this to Y. Otherwise, say N.
1634 1708
1635config APM_REAL_MODE_POWER_OFF
1636 bool "Use real mode APM BIOS call to power off"
1637 help
1638 Use real mode APM BIOS calls to switch off the computer. This is
1639 a work-around for a number of buggy BIOSes. Switch this option on if
1640 your computer crashes instead of powering off properly.
1641
1642endif # APM 1709endif # APM
1643 1710
1644source "arch/x86/kernel/cpu/cpufreq/Kconfig" 1711source "arch/x86/kernel/cpu/cpufreq/Kconfig"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index b815664fe370..8078955845ae 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -408,7 +408,7 @@ config X86_MINIMUM_CPU_FAMILY
408 408
409config X86_DEBUGCTLMSR 409config X86_DEBUGCTLMSR
410 def_bool y 410 def_bool y
411 depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) 411 depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML
412 412
413menuconfig PROCESSOR_SELECT 413menuconfig PROCESSOR_SELECT
414 bool "Supported processor vendors" if EMBEDDED 414 bool "Supported processor vendors" if EMBEDDED
@@ -515,6 +515,7 @@ config CPU_SUP_UMC_32
515config X86_DS 515config X86_DS
516 def_bool X86_PTRACE_BTS 516 def_bool X86_PTRACE_BTS
517 depends on X86_DEBUGCTLMSR 517 depends on X86_DEBUGCTLMSR
518 select HAVE_HW_BRANCH_TRACER
518 519
519config X86_PTRACE_BTS 520config X86_PTRACE_BTS
520 bool "Branch Trace Store" 521 bool "Branch Trace Store"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 2a3dfbd5e677..10d6cc3fd052 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -114,18 +114,6 @@ config DEBUG_RODATA
114 data. This is recommended so that we can catch kernel bugs sooner. 114 data. This is recommended so that we can catch kernel bugs sooner.
115 If in doubt, say "Y". 115 If in doubt, say "Y".
116 116
117config DIRECT_GBPAGES
118 bool "Enable gbpages-mapped kernel pagetables"
119 depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64
120 help
121 Enable gigabyte pages support (if the CPU supports it). This can
122 improve the kernel's performance a tiny bit by reducing TLB
123 pressure.
124
125 This is experimental code.
126
127 If in doubt, say "N".
128
129config DEBUG_RODATA_TEST 117config DEBUG_RODATA_TEST
130 bool "Testcase for the DEBUG_RODATA feature" 118 bool "Testcase for the DEBUG_RODATA feature"
131 depends on DEBUG_RODATA 119 depends on DEBUG_RODATA
@@ -186,14 +174,10 @@ config IOMMU_LEAK
186 Add a simple leak tracer to the IOMMU code. This is useful when you 174 Add a simple leak tracer to the IOMMU code. This is useful when you
187 are debugging a buggy device driver that leaks IOMMU mappings. 175 are debugging a buggy device driver that leaks IOMMU mappings.
188 176
189config MMIOTRACE_HOOKS
190 bool
191
192config MMIOTRACE 177config MMIOTRACE
193 bool "Memory mapped IO tracing" 178 bool "Memory mapped IO tracing"
194 depends on DEBUG_KERNEL && PCI 179 depends on DEBUG_KERNEL && PCI
195 select TRACING 180 select TRACING
196 select MMIOTRACE_HOOKS
197 help 181 help
198 Mmiotrace traces Memory Mapped I/O access and is meant for 182 Mmiotrace traces Memory Mapped I/O access and is meant for
199 debugging and reverse engineering. It is called from the ioremap 183 debugging and reverse engineering. It is called from the ioremap
@@ -307,10 +291,10 @@ config OPTIMIZE_INLINING
307 developers have marked 'inline'. Doing so takes away freedom from gcc to 291 developers have marked 'inline'. Doing so takes away freedom from gcc to
308 do what it thinks is best, which is desirable for the gcc 3.x series of 292 do what it thinks is best, which is desirable for the gcc 3.x series of
309 compilers. The gcc 4.x series have a rewritten inlining algorithm and 293 compilers. The gcc 4.x series have a rewritten inlining algorithm and
310 disabling this option will generate a smaller kernel there. Hopefully 294 enabling this option will generate a smaller kernel there. Hopefully
311 this algorithm is so good that allowing gcc4 to make the decision can 295 this algorithm is so good that allowing gcc 4.x and above to make the
312 become the default in the future, until then this option is there to 296 decision will become the default in the future. Until then this option
313 test gcc for this. 297 is there to test gcc for this.
314 298
315 If unsure, say N. 299 If unsure, say N.
316 300
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index b939cb476dec..5d4742ed4aa2 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -34,7 +34,7 @@ static struct mode_info cga_modes[] = {
34 { VIDEO_80x25, 80, 25, 0 }, 34 { VIDEO_80x25, 80, 25, 0 },
35}; 35};
36 36
37__videocard video_vga; 37static __videocard video_vga;
38 38
39/* Set basic 80x25 mode */ 39/* Set basic 80x25 mode */
40static u8 vga_set_basic_mode(void) 40static u8 vga_set_basic_mode(void)
@@ -259,7 +259,7 @@ static int vga_probe(void)
259 return mode_count[adapter]; 259 return mode_count[adapter];
260} 260}
261 261
262__videocard video_vga = { 262static __videocard video_vga = {
263 .card_name = "VGA", 263 .card_name = "VGA",
264 .probe = vga_probe, 264 .probe = vga_probe,
265 .set_mode = vga_set_mode, 265 .set_mode = vga_set_mode,
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 83598b23093a..3bef2c1febe9 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -226,7 +226,7 @@ static unsigned int mode_menu(void)
226 226
227#ifdef CONFIG_VIDEO_RETAIN 227#ifdef CONFIG_VIDEO_RETAIN
228/* Save screen content to the heap */ 228/* Save screen content to the heap */
229struct saved_screen { 229static struct saved_screen {
230 int x, y; 230 int x, y;
231 int curx, cury; 231 int curx, cury;
232 u16 *data; 232 u16 *data;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 13b8c86ae985..b30a08ed8eb4 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -77,7 +77,7 @@ CONFIG_AUDIT=y
77CONFIG_AUDITSYSCALL=y 77CONFIG_AUDITSYSCALL=y
78CONFIG_AUDIT_TREE=y 78CONFIG_AUDIT_TREE=y
79# CONFIG_IKCONFIG is not set 79# CONFIG_IKCONFIG is not set
80CONFIG_LOG_BUF_SHIFT=17 80CONFIG_LOG_BUF_SHIFT=18
81CONFIG_CGROUPS=y 81CONFIG_CGROUPS=y
82# CONFIG_CGROUP_DEBUG is not set 82# CONFIG_CGROUP_DEBUG is not set
83CONFIG_CGROUP_NS=y 83CONFIG_CGROUP_NS=y
@@ -298,7 +298,7 @@ CONFIG_KEXEC=y
298CONFIG_CRASH_DUMP=y 298CONFIG_CRASH_DUMP=y
299# CONFIG_KEXEC_JUMP is not set 299# CONFIG_KEXEC_JUMP is not set
300CONFIG_PHYSICAL_START=0x1000000 300CONFIG_PHYSICAL_START=0x1000000
301CONFIG_RELOCATABLE=y 301# CONFIG_RELOCATABLE is not set
302CONFIG_PHYSICAL_ALIGN=0x200000 302CONFIG_PHYSICAL_ALIGN=0x200000
303CONFIG_HOTPLUG_CPU=y 303CONFIG_HOTPLUG_CPU=y
304# CONFIG_COMPAT_VDSO is not set 304# CONFIG_COMPAT_VDSO is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index f0a03d7a7d63..0e7dbc0a3e46 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -77,7 +77,7 @@ CONFIG_AUDIT=y
77CONFIG_AUDITSYSCALL=y 77CONFIG_AUDITSYSCALL=y
78CONFIG_AUDIT_TREE=y 78CONFIG_AUDIT_TREE=y
79# CONFIG_IKCONFIG is not set 79# CONFIG_IKCONFIG is not set
80CONFIG_LOG_BUF_SHIFT=17 80CONFIG_LOG_BUF_SHIFT=18
81CONFIG_CGROUPS=y 81CONFIG_CGROUPS=y
82# CONFIG_CGROUP_DEBUG is not set 82# CONFIG_CGROUP_DEBUG is not set
83CONFIG_CGROUP_NS=y 83CONFIG_CGROUP_NS=y
@@ -298,7 +298,7 @@ CONFIG_SCHED_HRTICK=y
298CONFIG_KEXEC=y 298CONFIG_KEXEC=y
299CONFIG_CRASH_DUMP=y 299CONFIG_CRASH_DUMP=y
300CONFIG_PHYSICAL_START=0x1000000 300CONFIG_PHYSICAL_START=0x1000000
301CONFIG_RELOCATABLE=y 301# CONFIG_RELOCATABLE is not set
302CONFIG_PHYSICAL_ALIGN=0x200000 302CONFIG_PHYSICAL_ALIGN=0x200000
303CONFIG_HOTPLUG_CPU=y 303CONFIG_HOTPLUG_CPU=y
304# CONFIG_COMPAT_VDSO is not set 304# CONFIG_COMPAT_VDSO is not set
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c
index 070afc5b6c94..b9d00261703c 100644
--- a/arch/x86/crypto/crc32c-intel.c
+++ b/arch/x86/crypto/crc32c-intel.c
@@ -6,13 +6,22 @@
6 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual 6 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
7 * Volume 2A: Instruction Set Reference, A-M 7 * Volume 2A: Instruction Set Reference, A-M
8 * 8 *
9 * Copyright (c) 2008 Austin Zhang <austin_zhang@linux.intel.com> 9 * Copyright (C) 2008 Intel Corporation
10 * Copyright (c) 2008 Kent Liu <kent.liu@intel.com> 10 * Authors: Austin Zhang <austin_zhang@linux.intel.com>
11 * Kent Liu <kent.liu@intel.com>
11 * 12 *
12 * This program is free software; you can redistribute it and/or modify it 13 * This program is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by the Free 14 * under the terms and conditions of the GNU General Public License,
14 * Software Foundation; either version 2 of the License, or (at your option) 15 * version 2, as published by the Free Software Foundation.
15 * any later version. 16 *
17 * This program is distributed in the hope it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 * more details.
21 *
22 * You should have received a copy of the GNU General Public License along with
23 * this program; if not, write to the Free Software Foundation, Inc.,
24 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
16 * 25 *
17 */ 26 */
18#include <linux/init.h> 27#include <linux/init.h>
@@ -75,99 +84,92 @@ static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len
75 * If your algorithm starts with ~0, then XOR with ~0 before you set 84 * If your algorithm starts with ~0, then XOR with ~0 before you set
76 * the seed. 85 * the seed.
77 */ 86 */
78static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key, 87static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
79 unsigned int keylen) 88 unsigned int keylen)
80{ 89{
81 u32 *mctx = crypto_ahash_ctx(hash); 90 u32 *mctx = crypto_shash_ctx(hash);
82 91
83 if (keylen != sizeof(u32)) { 92 if (keylen != sizeof(u32)) {
84 crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); 93 crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
85 return -EINVAL; 94 return -EINVAL;
86 } 95 }
87 *mctx = le32_to_cpup((__le32 *)key); 96 *mctx = le32_to_cpup((__le32 *)key);
88 return 0; 97 return 0;
89} 98}
90 99
91static int crc32c_intel_init(struct ahash_request *req) 100static int crc32c_intel_init(struct shash_desc *desc)
92{ 101{
93 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); 102 u32 *mctx = crypto_shash_ctx(desc->tfm);
94 u32 *crcp = ahash_request_ctx(req); 103 u32 *crcp = shash_desc_ctx(desc);
95 104
96 *crcp = *mctx; 105 *crcp = *mctx;
97 106
98 return 0; 107 return 0;
99} 108}
100 109
101static int crc32c_intel_update(struct ahash_request *req) 110static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
111 unsigned int len)
102{ 112{
103 struct crypto_hash_walk walk; 113 u32 *crcp = shash_desc_ctx(desc);
104 u32 *crcp = ahash_request_ctx(req);
105 u32 crc = *crcp;
106 int nbytes;
107
108 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
109 nbytes = crypto_hash_walk_done(&walk, 0))
110 crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
111 114
112 *crcp = crc; 115 *crcp = crc32c_intel_le_hw(*crcp, data, len);
113 return 0; 116 return 0;
114} 117}
115 118
116static int crc32c_intel_final(struct ahash_request *req) 119static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
120 u8 *out)
117{ 121{
118 u32 *crcp = ahash_request_ctx(req); 122 *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
119
120 *(__le32 *)req->result = ~cpu_to_le32p(crcp);
121 return 0; 123 return 0;
122} 124}
123 125
124static int crc32c_intel_digest(struct ahash_request *req) 126static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data,
127 unsigned int len, u8 *out)
125{ 128{
126 struct crypto_hash_walk walk; 129 return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out);
127 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); 130}
128 u32 crc = *mctx;
129 int nbytes;
130 131
131 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; 132static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
132 nbytes = crypto_hash_walk_done(&walk, 0)) 133{
133 crc = crc32c_intel_le_hw(crc, walk.data, nbytes); 134 u32 *crcp = shash_desc_ctx(desc);
134 135
135 *(__le32 *)req->result = ~cpu_to_le32(crc); 136 *(__le32 *)out = ~cpu_to_le32p(crcp);
136 return 0; 137 return 0;
137} 138}
138 139
140static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
141 unsigned int len, u8 *out)
142{
143 return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
144 out);
145}
146
139static int crc32c_intel_cra_init(struct crypto_tfm *tfm) 147static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
140{ 148{
141 u32 *key = crypto_tfm_ctx(tfm); 149 u32 *key = crypto_tfm_ctx(tfm);
142 150
143 *key = ~0; 151 *key = ~0;
144 152
145 tfm->crt_ahash.reqsize = sizeof(u32);
146
147 return 0; 153 return 0;
148} 154}
149 155
150static struct crypto_alg alg = { 156static struct shash_alg alg = {
151 .cra_name = "crc32c", 157 .setkey = crc32c_intel_setkey,
152 .cra_driver_name = "crc32c-intel", 158 .init = crc32c_intel_init,
153 .cra_priority = 200, 159 .update = crc32c_intel_update,
154 .cra_flags = CRYPTO_ALG_TYPE_AHASH, 160 .final = crc32c_intel_final,
155 .cra_blocksize = CHKSUM_BLOCK_SIZE, 161 .finup = crc32c_intel_finup,
156 .cra_alignmask = 3, 162 .digest = crc32c_intel_digest,
157 .cra_ctxsize = sizeof(u32), 163 .descsize = sizeof(u32),
158 .cra_module = THIS_MODULE, 164 .digestsize = CHKSUM_DIGEST_SIZE,
159 .cra_list = LIST_HEAD_INIT(alg.cra_list), 165 .base = {
160 .cra_init = crc32c_intel_cra_init, 166 .cra_name = "crc32c",
161 .cra_type = &crypto_ahash_type, 167 .cra_driver_name = "crc32c-intel",
162 .cra_u = { 168 .cra_priority = 200,
163 .ahash = { 169 .cra_blocksize = CHKSUM_BLOCK_SIZE,
164 .digestsize = CHKSUM_DIGEST_SIZE, 170 .cra_ctxsize = sizeof(u32),
165 .setkey = crc32c_intel_setkey, 171 .cra_module = THIS_MODULE,
166 .init = crc32c_intel_init, 172 .cra_init = crc32c_intel_cra_init,
167 .update = crc32c_intel_update,
168 .final = crc32c_intel_final,
169 .digest = crc32c_intel_digest,
170 }
171 } 173 }
172}; 174};
173 175
@@ -175,14 +177,14 @@ static struct crypto_alg alg = {
175static int __init crc32c_intel_mod_init(void) 177static int __init crc32c_intel_mod_init(void)
176{ 178{
177 if (cpu_has_xmm4_2) 179 if (cpu_has_xmm4_2)
178 return crypto_register_alg(&alg); 180 return crypto_register_shash(&alg);
179 else 181 else
180 return -ENODEV; 182 return -ENODEV;
181} 183}
182 184
183static void __exit crc32c_intel_mod_fini(void) 185static void __exit crc32c_intel_mod_fini(void)
184{ 186{
185 crypto_unregister_alg(&alg); 187 crypto_unregister_shash(&alg);
186} 188}
187 189
188module_init(crc32c_intel_mod_init); 190module_init(crc32c_intel_mod_init);
@@ -194,4 +196,3 @@ MODULE_LICENSE("GPL");
194 196
195MODULE_ALIAS("crc32c"); 197MODULE_ALIAS("crc32c");
196MODULE_ALIAS("crc32c-intel"); 198MODULE_ALIAS("crc32c-intel");
197
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 127ec3f07214..2a4d073d2cf1 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -327,7 +327,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
327 current->mm->cached_hole_size = 0; 327 current->mm->cached_hole_size = 0;
328 328
329 current->mm->mmap = NULL; 329 current->mm->mmap = NULL;
330 compute_creds(bprm); 330 install_exec_creds(bprm);
331 current->flags &= ~PF_FORKNOEXEC; 331 current->flags &= ~PF_FORKNOEXEC;
332 332
333 if (N_MAGIC(ex) == OMAGIC) { 333 if (N_MAGIC(ex) == OMAGIC) {
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 4bc02b23674b..9dabd00e9805 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -24,13 +24,14 @@
24#include <asm/ucontext.h> 24#include <asm/ucontext.h>
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26#include <asm/i387.h> 26#include <asm/i387.h>
27#include <asm/ia32.h>
28#include <asm/ptrace.h> 27#include <asm/ptrace.h>
29#include <asm/ia32_unistd.h> 28#include <asm/ia32_unistd.h>
30#include <asm/user32.h> 29#include <asm/user32.h>
31#include <asm/sigcontext32.h> 30#include <asm/sigcontext32.h>
32#include <asm/proto.h> 31#include <asm/proto.h>
33#include <asm/vdso.h> 32#include <asm/vdso.h>
33#include <asm/sigframe.h>
34#include <asm/sys_ia32.h>
34 35
35#define DEBUG_SIG 0 36#define DEBUG_SIG 0
36 37
@@ -41,7 +42,6 @@
41 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ 42 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
42 X86_EFLAGS_CF) 43 X86_EFLAGS_CF)
43 44
44asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
45void signal_fault(struct pt_regs *regs, void __user *frame, char *where); 45void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
46 46
47int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) 47int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
@@ -173,47 +173,28 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
173/* 173/*
174 * Do a signal return; undo the signal stack. 174 * Do a signal return; undo the signal stack.
175 */ 175 */
176#define COPY(x) { \
177 err |= __get_user(regs->x, &sc->x); \
178}
176 179
177struct sigframe 180#define COPY_SEG_CPL3(seg) { \
178{ 181 unsigned short tmp; \
179 u32 pretcode; 182 err |= __get_user(tmp, &sc->seg); \
180 int sig; 183 regs->seg = tmp | 3; \
181 struct sigcontext_ia32 sc;
182 struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */
183 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
184 char retcode[8];
185 /* fp state follows here */
186};
187
188struct rt_sigframe
189{
190 u32 pretcode;
191 int sig;
192 u32 pinfo;
193 u32 puc;
194 compat_siginfo_t info;
195 struct ucontext_ia32 uc;
196 char retcode[8];
197 /* fp state follows here */
198};
199
200#define COPY(x) { \
201 unsigned int reg; \
202 err |= __get_user(reg, &sc->x); \
203 regs->x = reg; \
204} 184}
205 185
206#define RELOAD_SEG(seg,mask) \ 186#define RELOAD_SEG(seg) { \
207 { unsigned int cur; \ 187 unsigned int cur, pre; \
208 unsigned short pre; \ 188 err |= __get_user(pre, &sc->seg); \
209 err |= __get_user(pre, &sc->seg); \ 189 savesegment(seg, cur); \
210 savesegment(seg, cur); \ 190 pre |= 3; \
211 pre |= mask; \ 191 if (pre != cur) \
212 if (pre != cur) loadsegment(seg, pre); } 192 loadsegment(seg, pre); \
193}
213 194
214static int ia32_restore_sigcontext(struct pt_regs *regs, 195static int ia32_restore_sigcontext(struct pt_regs *regs,
215 struct sigcontext_ia32 __user *sc, 196 struct sigcontext_ia32 __user *sc,
216 unsigned int *peax) 197 unsigned int *pax)
217{ 198{
218 unsigned int tmpflags, gs, oldgs, err = 0; 199 unsigned int tmpflags, gs, oldgs, err = 0;
219 void __user *buf; 200 void __user *buf;
@@ -240,18 +221,16 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
240 if (gs != oldgs) 221 if (gs != oldgs)
241 load_gs_index(gs); 222 load_gs_index(gs);
242 223
243 RELOAD_SEG(fs, 3); 224 RELOAD_SEG(fs);
244 RELOAD_SEG(ds, 3); 225 RELOAD_SEG(ds);
245 RELOAD_SEG(es, 3); 226 RELOAD_SEG(es);
246 227
247 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 228 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
248 COPY(dx); COPY(cx); COPY(ip); 229 COPY(dx); COPY(cx); COPY(ip);
249 /* Don't touch extended registers */ 230 /* Don't touch extended registers */
250 231
251 err |= __get_user(regs->cs, &sc->cs); 232 COPY_SEG_CPL3(cs);
252 regs->cs |= 3; 233 COPY_SEG_CPL3(ss);
253 err |= __get_user(regs->ss, &sc->ss);
254 regs->ss |= 3;
255 234
256 err |= __get_user(tmpflags, &sc->flags); 235 err |= __get_user(tmpflags, &sc->flags);
257 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 236 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -262,15 +241,13 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
262 buf = compat_ptr(tmp); 241 buf = compat_ptr(tmp);
263 err |= restore_i387_xstate_ia32(buf); 242 err |= restore_i387_xstate_ia32(buf);
264 243
265 err |= __get_user(tmp, &sc->ax); 244 err |= __get_user(*pax, &sc->ax);
266 *peax = tmp;
267
268 return err; 245 return err;
269} 246}
270 247
271asmlinkage long sys32_sigreturn(struct pt_regs *regs) 248asmlinkage long sys32_sigreturn(struct pt_regs *regs)
272{ 249{
273 struct sigframe __user *frame = (struct sigframe __user *)(regs->sp-8); 250 struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
274 sigset_t set; 251 sigset_t set;
275 unsigned int ax; 252 unsigned int ax;
276 253
@@ -300,12 +277,12 @@ badframe:
300 277
301asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) 278asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
302{ 279{
303 struct rt_sigframe __user *frame; 280 struct rt_sigframe_ia32 __user *frame;
304 sigset_t set; 281 sigset_t set;
305 unsigned int ax; 282 unsigned int ax;
306 struct pt_regs tregs; 283 struct pt_regs tregs;
307 284
308 frame = (struct rt_sigframe __user *)(regs->sp - 4); 285 frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
309 286
310 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 287 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
311 goto badframe; 288 goto badframe;
@@ -359,20 +336,15 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
359 err |= __put_user(regs->dx, &sc->dx); 336 err |= __put_user(regs->dx, &sc->dx);
360 err |= __put_user(regs->cx, &sc->cx); 337 err |= __put_user(regs->cx, &sc->cx);
361 err |= __put_user(regs->ax, &sc->ax); 338 err |= __put_user(regs->ax, &sc->ax);
362 err |= __put_user(regs->cs, &sc->cs);
363 err |= __put_user(regs->ss, &sc->ss);
364 err |= __put_user(current->thread.trap_no, &sc->trapno); 339 err |= __put_user(current->thread.trap_no, &sc->trapno);
365 err |= __put_user(current->thread.error_code, &sc->err); 340 err |= __put_user(current->thread.error_code, &sc->err);
366 err |= __put_user(regs->ip, &sc->ip); 341 err |= __put_user(regs->ip, &sc->ip);
342 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
367 err |= __put_user(regs->flags, &sc->flags); 343 err |= __put_user(regs->flags, &sc->flags);
368 err |= __put_user(regs->sp, &sc->sp_at_signal); 344 err |= __put_user(regs->sp, &sc->sp_at_signal);
345 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
369 346
370 tmp = save_i387_xstate_ia32(fpstate); 347 err |= __put_user(ptr_to_compat(fpstate), &sc->fpstate);
371 if (tmp < 0)
372 err = -EFAULT;
373 else
374 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
375 &sc->fpstate);
376 348
377 /* non-iBCS2 extensions.. */ 349 /* non-iBCS2 extensions.. */
378 err |= __put_user(mask, &sc->oldmask); 350 err |= __put_user(mask, &sc->oldmask);
@@ -400,7 +372,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
400 } 372 }
401 373
402 /* This is the legacy signal stack switching. */ 374 /* This is the legacy signal stack switching. */
403 else if ((regs->ss & 0xffff) != __USER_DS && 375 else if ((regs->ss & 0xffff) != __USER32_DS &&
404 !(ka->sa.sa_flags & SA_RESTORER) && 376 !(ka->sa.sa_flags & SA_RESTORER) &&
405 ka->sa.sa_restorer) 377 ka->sa.sa_restorer)
406 sp = (unsigned long) ka->sa.sa_restorer; 378 sp = (unsigned long) ka->sa.sa_restorer;
@@ -408,6 +380,8 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
408 if (used_math()) { 380 if (used_math()) {
409 sp = sp - sig_xstate_ia32_size; 381 sp = sp - sig_xstate_ia32_size;
410 *fpstate = (struct _fpstate_ia32 *) sp; 382 *fpstate = (struct _fpstate_ia32 *) sp;
383 if (save_i387_xstate_ia32(*fpstate) < 0)
384 return (void __user *) -1L;
411 } 385 }
412 386
413 sp -= frame_size; 387 sp -= frame_size;
@@ -420,7 +394,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
420int ia32_setup_frame(int sig, struct k_sigaction *ka, 394int ia32_setup_frame(int sig, struct k_sigaction *ka,
421 compat_sigset_t *set, struct pt_regs *regs) 395 compat_sigset_t *set, struct pt_regs *regs)
422{ 396{
423 struct sigframe __user *frame; 397 struct sigframe_ia32 __user *frame;
424 void __user *restorer; 398 void __user *restorer;
425 int err = 0; 399 int err = 0;
426 void __user *fpstate = NULL; 400 void __user *fpstate = NULL;
@@ -430,12 +404,10 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
430 u16 poplmovl; 404 u16 poplmovl;
431 u32 val; 405 u32 val;
432 u16 int80; 406 u16 int80;
433 u16 pad;
434 } __attribute__((packed)) code = { 407 } __attribute__((packed)) code = {
435 0xb858, /* popl %eax ; movl $...,%eax */ 408 0xb858, /* popl %eax ; movl $...,%eax */
436 __NR_ia32_sigreturn, 409 __NR_ia32_sigreturn,
437 0x80cd, /* int $0x80 */ 410 0x80cd, /* int $0x80 */
438 0,
439 }; 411 };
440 412
441 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); 413 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
@@ -471,7 +443,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
471 * These are actually not used anymore, but left because some 443 * These are actually not used anymore, but left because some
472 * gdb versions depend on them as a marker. 444 * gdb versions depend on them as a marker.
473 */ 445 */
474 err |= __copy_to_user(frame->retcode, &code, 8); 446 err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode);
475 if (err) 447 if (err)
476 return -EFAULT; 448 return -EFAULT;
477 449
@@ -501,7 +473,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
501int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 473int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
502 compat_sigset_t *set, struct pt_regs *regs) 474 compat_sigset_t *set, struct pt_regs *regs)
503{ 475{
504 struct rt_sigframe __user *frame; 476 struct rt_sigframe_ia32 __user *frame;
505 void __user *restorer; 477 void __user *restorer;
506 int err = 0; 478 int err = 0;
507 void __user *fpstate = NULL; 479 void __user *fpstate = NULL;
@@ -511,8 +483,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
511 u8 movl; 483 u8 movl;
512 u32 val; 484 u32 val;
513 u16 int80; 485 u16 int80;
514 u16 pad; 486 u8 pad;
515 u8 pad2;
516 } __attribute__((packed)) code = { 487 } __attribute__((packed)) code = {
517 0xb8, 488 0xb8,
518 __NR_ia32_rt_sigreturn, 489 __NR_ia32_rt_sigreturn,
@@ -559,7 +530,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
559 * Not actually used anymore, but left because some gdb 530 * Not actually used anymore, but left because some gdb
560 * versions need it. 531 * versions need it.
561 */ 532 */
562 err |= __copy_to_user(frame->retcode, &code, 8); 533 err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode);
563 if (err) 534 if (err)
564 return -EFAULT; 535 return -EFAULT;
565 536
@@ -572,11 +543,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
572 regs->dx = (unsigned long) &frame->info; 543 regs->dx = (unsigned long) &frame->info;
573 regs->cx = (unsigned long) &frame->uc; 544 regs->cx = (unsigned long) &frame->uc;
574 545
575 /* Make -mregparm=3 work */
576 regs->ax = sig;
577 regs->dx = (unsigned long) &frame->info;
578 regs->cx = (unsigned long) &frame->uc;
579
580 loadsegment(ds, __USER32_DS); 546 loadsegment(ds, __USER32_DS);
581 loadsegment(es, __USER32_DS); 547 loadsegment(es, __USER32_DS);
582 548
diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c
index d21991ce606c..29cdcd02ead3 100644
--- a/arch/x86/ia32/ipc32.c
+++ b/arch/x86/ia32/ipc32.c
@@ -8,6 +8,7 @@
8#include <linux/shm.h> 8#include <linux/shm.h>
9#include <linux/ipc.h> 9#include <linux/ipc.h>
10#include <linux/compat.h> 10#include <linux/compat.h>
11#include <asm/sys_ia32.h>
11 12
12asmlinkage long sys32_ipc(u32 call, int first, int second, int third, 13asmlinkage long sys32_ipc(u32 call, int first, int second, int third,
13 compat_uptr_t ptr, u32 fifth) 14 compat_uptr_t ptr, u32 fifth)
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 2e09dcd3c0a6..6c0d7f6231af 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -44,8 +44,8 @@
44#include <asm/types.h> 44#include <asm/types.h>
45#include <asm/uaccess.h> 45#include <asm/uaccess.h>
46#include <asm/atomic.h> 46#include <asm/atomic.h>
47#include <asm/ia32.h>
48#include <asm/vgtod.h> 47#include <asm/vgtod.h>
48#include <asm/sys_ia32.h>
49 49
50#define AA(__x) ((unsigned long)(__x)) 50#define AA(__x) ((unsigned long)(__x))
51 51
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index ac302a2fa339..95c8cd9d22b5 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -190,16 +190,23 @@
190/* FIXME: move this macro to <linux/pci.h> */ 190/* FIXME: move this macro to <linux/pci.h> */
191#define PCI_BUS(x) (((x) >> 8) & 0xff) 191#define PCI_BUS(x) (((x) >> 8) & 0xff)
192 192
193/* Protection domain flags */
194#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
195#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
196 domain for an IOMMU */
197
193/* 198/*
194 * This structure contains generic data for IOMMU protection domains 199 * This structure contains generic data for IOMMU protection domains
195 * independent of their use. 200 * independent of their use.
196 */ 201 */
197struct protection_domain { 202struct protection_domain {
198 spinlock_t lock; /* mostly used to lock the page table*/ 203 spinlock_t lock; /* mostly used to lock the page table*/
199 u16 id; /* the domain id written to the device table */ 204 u16 id; /* the domain id written to the device table */
200 int mode; /* paging mode (0-6 levels) */ 205 int mode; /* paging mode (0-6 levels) */
201 u64 *pt_root; /* page table root pointer */ 206 u64 *pt_root; /* page table root pointer */
202 void *priv; /* private data */ 207 unsigned long flags; /* flags to find out type of domain */
208 unsigned dev_cnt; /* devices assigned to this domain */
209 void *priv; /* private data */
203}; 210};
204 211
205/* 212/*
@@ -295,7 +302,7 @@ struct amd_iommu {
295 bool int_enabled; 302 bool int_enabled;
296 303
297 /* if one, we need to send a completion wait command */ 304 /* if one, we need to send a completion wait command */
298 int need_sync; 305 bool need_sync;
299 306
300 /* default dma_ops domain for that IOMMU */ 307 /* default dma_ops domain for that IOMMU */
301 struct dma_ops_domain *default_dom; 308 struct dma_ops_domain *default_dom;
@@ -374,7 +381,7 @@ extern struct protection_domain **amd_iommu_pd_table;
374extern unsigned long *amd_iommu_pd_alloc_bitmap; 381extern unsigned long *amd_iommu_pd_alloc_bitmap;
375 382
376/* will be 1 if device isolation is enabled */ 383/* will be 1 if device isolation is enabled */
377extern int amd_iommu_isolate; 384extern bool amd_iommu_isolate;
378 385
379/* 386/*
380 * If true, the addresses will be flushed on unmap time, not when 387 * If true, the addresses will be flushed on unmap time, not when
@@ -382,18 +389,6 @@ extern int amd_iommu_isolate;
382 */ 389 */
383extern bool amd_iommu_unmap_flush; 390extern bool amd_iommu_unmap_flush;
384 391
385/* takes a PCI device id and prints it out in a readable form */
386static inline void print_devid(u16 devid, int nl)
387{
388 int bus = devid >> 8;
389 int dev = devid >> 3 & 0x1f;
390 int fn = devid & 0x07;
391
392 printk("%02x:%02x.%x", bus, dev, fn);
393 if (nl)
394 printk("\n");
395}
396
397/* takes bus and device/function and returns the device id 392/* takes bus and device/function and returns the device id
398 * FIXME: should that be in generic PCI code? */ 393 * FIXME: should that be in generic PCI code? */
399static inline u16 calc_devid(u8 bus, u8 devfn) 394static inline u16 calc_devid(u8 bus, u8 devfn)
@@ -401,4 +396,32 @@ static inline u16 calc_devid(u8 bus, u8 devfn)
401 return (((u16)bus) << 8) | devfn; 396 return (((u16)bus) << 8) | devfn;
402} 397}
403 398
399#ifdef CONFIG_AMD_IOMMU_STATS
400
401struct __iommu_counter {
402 char *name;
403 struct dentry *dent;
404 u64 value;
405};
406
407#define DECLARE_STATS_COUNTER(nm) \
408 static struct __iommu_counter nm = { \
409 .name = #nm, \
410 }
411
412#define INC_STATS_COUNTER(name) name.value += 1
413#define ADD_STATS_COUNTER(name, x) name.value += (x)
414#define SUB_STATS_COUNTER(name, x) name.value -= (x)
415
416#else /* CONFIG_AMD_IOMMU_STATS */
417
418#define DECLARE_STATS_COUNTER(name)
419#define INC_STATS_COUNTER(name)
420#define ADD_STATS_COUNTER(name, x)
421#define SUB_STATS_COUNTER(name, x)
422
423static inline void amd_iommu_stats_init(void) { }
424
425#endif /* CONFIG_AMD_IOMMU_STATS */
426
404#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ 427#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3b1510b4fc57..ab1d51a8855e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -54,7 +54,6 @@ extern int disable_apic;
54extern int is_vsmp_box(void); 54extern int is_vsmp_box(void);
55extern void xapic_wait_icr_idle(void); 55extern void xapic_wait_icr_idle(void);
56extern u32 safe_xapic_wait_icr_idle(void); 56extern u32 safe_xapic_wait_icr_idle(void);
57extern u64 xapic_icr_read(void);
58extern void xapic_icr_write(u32, u32); 57extern void xapic_icr_write(u32, u32);
59extern int setup_profiling_timer(unsigned int); 58extern int setup_profiling_timer(unsigned int);
60 59
@@ -93,7 +92,7 @@ static inline u32 native_apic_msr_read(u32 reg)
93} 92}
94 93
95#ifndef CONFIG_X86_32 94#ifndef CONFIG_X86_32
96extern int x2apic, x2apic_preenabled; 95extern int x2apic;
97extern void check_x2apic(void); 96extern void check_x2apic(void);
98extern void enable_x2apic(void); 97extern void enable_x2apic(void);
99extern void enable_IR_x2apic(void); 98extern void enable_IR_x2apic(void);
@@ -193,6 +192,7 @@ extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
193static inline void lapic_shutdown(void) { } 192static inline void lapic_shutdown(void) { }
194#define local_apic_timer_c2_ok 1 193#define local_apic_timer_c2_ok 1
195static inline void init_apic_mappings(void) { } 194static inline void init_apic_mappings(void) { }
195static inline void disable_local_APIC(void) { }
196 196
197#endif /* !CONFIG_X86_LOCAL_APIC */ 197#endif /* !CONFIG_X86_LOCAL_APIC */
198 198
diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h
index 1d9543b9d358..d8dd9f537911 100644
--- a/arch/x86/include/asm/bigsmp/apic.h
+++ b/arch/x86/include/asm/bigsmp/apic.h
@@ -9,12 +9,12 @@ static inline int apic_id_registered(void)
9 return (1); 9 return (1);
10} 10}
11 11
12static inline cpumask_t target_cpus(void) 12static inline const cpumask_t *target_cpus(void)
13{ 13{
14#ifdef CONFIG_SMP 14#ifdef CONFIG_SMP
15 return cpu_online_map; 15 return &cpu_online_map;
16#else 16#else
17 return cpumask_of_cpu(0); 17 return &cpumask_of_cpu(0);
18#endif 18#endif
19} 19}
20 20
@@ -24,8 +24,6 @@ static inline cpumask_t target_cpus(void)
24#define INT_DELIVERY_MODE (dest_Fixed) 24#define INT_DELIVERY_MODE (dest_Fixed)
25#define INT_DEST_MODE (0) /* phys delivery to target proc */ 25#define INT_DEST_MODE (0) /* phys delivery to target proc */
26#define NO_BALANCE_IRQ (0) 26#define NO_BALANCE_IRQ (0)
27#define WAKE_SECONDARY_VIA_INIT
28
29 27
30static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) 28static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
31{ 29{
@@ -81,7 +79,7 @@ static inline int apicid_to_node(int logical_apicid)
81 79
82static inline int cpu_present_to_apicid(int mps_cpu) 80static inline int cpu_present_to_apicid(int mps_cpu)
83{ 81{
84 if (mps_cpu < NR_CPUS) 82 if (mps_cpu < nr_cpu_ids)
85 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); 83 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
86 84
87 return BAD_APICID; 85 return BAD_APICID;
@@ -96,7 +94,7 @@ extern u8 cpu_2_logical_apicid[];
96/* Mapping from cpu number to logical apicid */ 94/* Mapping from cpu number to logical apicid */
97static inline int cpu_to_logical_apicid(int cpu) 95static inline int cpu_to_logical_apicid(int cpu)
98{ 96{
99 if (cpu >= NR_CPUS) 97 if (cpu >= nr_cpu_ids)
100 return BAD_APICID; 98 return BAD_APICID;
101 return cpu_physical_id(cpu); 99 return cpu_physical_id(cpu);
102} 100}
@@ -121,16 +119,34 @@ static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
121} 119}
122 120
123/* As we are using single CPU as destination, pick only one CPU here */ 121/* As we are using single CPU as destination, pick only one CPU here */
124static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) 122static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
125{ 123{
126 int cpu; 124 int cpu;
127 int apicid; 125 int apicid;
128 126
129 cpu = first_cpu(cpumask); 127 cpu = first_cpu(*cpumask);
130 apicid = cpu_to_logical_apicid(cpu); 128 apicid = cpu_to_logical_apicid(cpu);
131 return apicid; 129 return apicid;
132} 130}
133 131
132static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
133 const struct cpumask *andmask)
134{
135 int cpu;
136
137 /*
138 * We're using fixed IRQ delivery, can only return one phys APIC ID.
139 * May as well be the first.
140 */
141 for_each_cpu_and(cpu, cpumask, andmask)
142 if (cpumask_test_cpu(cpu, cpu_online_mask))
143 break;
144 if (cpu < nr_cpu_ids)
145 return cpu_to_logical_apicid(cpu);
146
147 return BAD_APICID;
148}
149
134static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) 150static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
135{ 151{
136 return cpuid_apic >> index_msb; 152 return cpuid_apic >> index_msb;
diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h
index 9404c535b7ec..27fcd01b3ae6 100644
--- a/arch/x86/include/asm/bigsmp/ipi.h
+++ b/arch/x86/include/asm/bigsmp/ipi.h
@@ -1,25 +1,22 @@
1#ifndef __ASM_MACH_IPI_H 1#ifndef __ASM_MACH_IPI_H
2#define __ASM_MACH_IPI_H 2#define __ASM_MACH_IPI_H
3 3
4void send_IPI_mask_sequence(cpumask_t mask, int vector); 4void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
5void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
5 6
6static inline void send_IPI_mask(cpumask_t mask, int vector) 7static inline void send_IPI_mask(const struct cpumask *mask, int vector)
7{ 8{
8 send_IPI_mask_sequence(mask, vector); 9 send_IPI_mask_sequence(mask, vector);
9} 10}
10 11
11static inline void send_IPI_allbutself(int vector) 12static inline void send_IPI_allbutself(int vector)
12{ 13{
13 cpumask_t mask = cpu_online_map; 14 send_IPI_mask_allbutself(cpu_online_mask, vector);
14 cpu_clear(smp_processor_id(), mask);
15
16 if (!cpus_empty(mask))
17 send_IPI_mask(mask, vector);
18} 15}
19 16
20static inline void send_IPI_all(int vector) 17static inline void send_IPI_all(int vector)
21{ 18{
22 send_IPI_mask(cpu_online_map, vector); 19 send_IPI_mask(cpu_online_mask, vector);
23} 20}
24 21
25#endif /* __ASM_MACH_IPI_H */ 22#endif /* __ASM_MACH_IPI_H */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 360010322711..9fa9dcdf344b 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -168,7 +168,15 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
168 */ 168 */
169static inline void change_bit(int nr, volatile unsigned long *addr) 169static inline void change_bit(int nr, volatile unsigned long *addr)
170{ 170{
171 asm volatile(LOCK_PREFIX "btc %1,%0" : ADDR : "Ir" (nr)); 171 if (IS_IMMEDIATE(nr)) {
172 asm volatile(LOCK_PREFIX "xorb %1,%0"
173 : CONST_MASK_ADDR(nr, addr)
174 : "iq" ((u8)CONST_MASK(nr)));
175 } else {
176 asm volatile(LOCK_PREFIX "btc %1,%0"
177 : BITOP_ADDR(addr)
178 : "Ir" (nr));
179 }
172} 180}
173 181
174/** 182/**
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 3def2065fcea..d9cf1cd156d2 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -9,7 +9,7 @@
9#ifdef CONFIG_X86_32 9#ifdef CONFIG_X86_32
10# define __BUG_C0 "2:\t.long 1b, %c0\n" 10# define __BUG_C0 "2:\t.long 1b, %c0\n"
11#else 11#else
12# define __BUG_C0 "2:\t.quad 1b, %c0\n" 12# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n"
13#endif 13#endif
14 14
15#define BUG() \ 15#define BUG() \
diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/asm/byteorder.h
index e02ae2d89acf..f110ad417df3 100644
--- a/arch/x86/include/asm/byteorder.h
+++ b/arch/x86/include/asm/byteorder.h
@@ -4,26 +4,33 @@
4#include <asm/types.h> 4#include <asm/types.h>
5#include <linux/compiler.h> 5#include <linux/compiler.h>
6 6
7#ifdef __GNUC__ 7#define __LITTLE_ENDIAN
8 8
9#ifdef __i386__ 9static inline __attribute_const__ __u32 __arch_swab32(__u32 val)
10
11static inline __attribute_const__ __u32 ___arch__swab32(__u32 x)
12{ 10{
13#ifdef CONFIG_X86_BSWAP 11#ifdef __i386__
14 asm("bswap %0" : "=r" (x) : "0" (x)); 12# ifdef CONFIG_X86_BSWAP
15#else 13 asm("bswap %0" : "=r" (val) : "0" (val));
14# else
16 asm("xchgb %b0,%h0\n\t" /* swap lower bytes */ 15 asm("xchgb %b0,%h0\n\t" /* swap lower bytes */
17 "rorl $16,%0\n\t" /* swap words */ 16 "rorl $16,%0\n\t" /* swap words */
18 "xchgb %b0,%h0" /* swap higher bytes */ 17 "xchgb %b0,%h0" /* swap higher bytes */
19 : "=q" (x) 18 : "=q" (val)
20 : "0" (x)); 19 : "0" (val));
20# endif
21
22#else /* __i386__ */
23 asm("bswapl %0"
24 : "=r" (val)
25 : "0" (val));
21#endif 26#endif
22 return x; 27 return val;
23} 28}
29#define __arch_swab32 __arch_swab32
24 30
25static inline __attribute_const__ __u64 ___arch__swab64(__u64 val) 31static inline __attribute_const__ __u64 __arch_swab64(__u64 val)
26{ 32{
33#ifdef __i386__
27 union { 34 union {
28 struct { 35 struct {
29 __u32 a; 36 __u32 a;
@@ -32,50 +39,27 @@ static inline __attribute_const__ __u64 ___arch__swab64(__u64 val)
32 __u64 u; 39 __u64 u;
33 } v; 40 } v;
34 v.u = val; 41 v.u = val;
35#ifdef CONFIG_X86_BSWAP 42# ifdef CONFIG_X86_BSWAP
36 asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1" 43 asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1"
37 : "=r" (v.s.a), "=r" (v.s.b) 44 : "=r" (v.s.a), "=r" (v.s.b)
38 : "0" (v.s.a), "1" (v.s.b)); 45 : "0" (v.s.a), "1" (v.s.b));
39#else 46# else
40 v.s.a = ___arch__swab32(v.s.a); 47 v.s.a = __arch_swab32(v.s.a);
41 v.s.b = ___arch__swab32(v.s.b); 48 v.s.b = __arch_swab32(v.s.b);
42 asm("xchgl %0,%1" 49 asm("xchgl %0,%1"
43 : "=r" (v.s.a), "=r" (v.s.b) 50 : "=r" (v.s.a), "=r" (v.s.b)
44 : "0" (v.s.a), "1" (v.s.b)); 51 : "0" (v.s.a), "1" (v.s.b));
45#endif 52# endif
46 return v.u; 53 return v.u;
47}
48
49#else /* __i386__ */ 54#else /* __i386__ */
50
51static inline __attribute_const__ __u64 ___arch__swab64(__u64 x)
52{
53 asm("bswapq %0" 55 asm("bswapq %0"
54 : "=r" (x) 56 : "=r" (val)
55 : "0" (x)); 57 : "0" (val));
56 return x; 58 return val;
57}
58
59static inline __attribute_const__ __u32 ___arch__swab32(__u32 x)
60{
61 asm("bswapl %0"
62 : "=r" (x)
63 : "0" (x));
64 return x;
65}
66
67#endif 59#endif
60}
61#define __arch_swab64 __arch_swab64
68 62
69/* Do not define swab16. Gcc is smart enough to recognize "C" version and 63#include <linux/byteorder.h>
70 convert it into rotation or exchange. */
71
72#define __arch__swab64(x) ___arch__swab64(x)
73#define __arch__swab32(x) ___arch__swab32(x)
74
75#define __BYTEORDER_HAS_U64__
76
77#endif /* __GNUC__ */
78
79#include <linux/byteorder/little_endian.h>
80 64
81#endif /* _ASM_X86_BYTEORDER_H */ 65#endif /* _ASM_X86_BYTEORDER_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index cfdf8c2c5c31..ea408dcba513 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -80,7 +80,6 @@
80#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */ 80#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */
81#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */ 81#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */
82#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ 82#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
83#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
84#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ 83#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */
85#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ 84#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */
86#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */ 85#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */
@@ -92,6 +91,8 @@
92#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ 91#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
93#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ 92#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */
94#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ 93#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
95 96
96/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ 97/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
97#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ 98#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
@@ -117,6 +118,7 @@
117#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ 118#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
118#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ 119#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
119#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ 120#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
121#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
120 122
121/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ 123/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
122#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */ 124#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */
@@ -237,6 +239,7 @@ extern const char * const x86_power_flags[32];
237#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) 239#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
238#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) 240#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
239#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) 241#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
242#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
240 243
241#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) 244#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
242# define cpu_has_invlpg 1 245# define cpu_has_invlpg 1
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index e6b82b17b072..dc27705f5443 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -320,16 +320,14 @@ static inline void set_intr_gate(unsigned int n, void *addr)
320 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); 320 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
321} 321}
322 322
323#define SYS_VECTOR_FREE 0
324#define SYS_VECTOR_ALLOCED 1
325
326extern int first_system_vector; 323extern int first_system_vector;
327extern char system_vectors[]; 324/* used_vectors is a BITMAP of the irq vectors that are not managed by percpu vector_irq */
325extern unsigned long used_vectors[];
328 326
329static inline void alloc_system_vector(int vector) 327static inline void alloc_system_vector(int vector)
330{ 328{
331 if (system_vectors[vector] == SYS_VECTOR_FREE) { 329 if (!test_bit(vector, used_vectors)) {
332 system_vectors[vector] = SYS_VECTOR_ALLOCED; 330 set_bit(vector, used_vectors);
333 if (first_system_vector > vector) 331 if (first_system_vector > vector)
334 first_system_vector = vector; 332 first_system_vector = vector;
335 } else 333 } else
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 097794ff6b79..4035357f5b9d 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -65,18 +65,16 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
65 return dma_ops; 65 return dma_ops;
66 else 66 else
67 return dev->archdata.dma_ops; 67 return dev->archdata.dma_ops;
68#endif /* _ASM_X86_DMA_MAPPING_H */ 68#endif
69} 69}
70 70
71/* Make sure we keep the same behaviour */ 71/* Make sure we keep the same behaviour */
72static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) 72static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
73{ 73{
74#ifdef CONFIG_X86_64
75 struct dma_mapping_ops *ops = get_dma_ops(dev); 74 struct dma_mapping_ops *ops = get_dma_ops(dev);
76 if (ops->mapping_error) 75 if (ops->mapping_error)
77 return ops->mapping_error(dev, dma_addr); 76 return ops->mapping_error(dev, dma_addr);
78 77
79#endif
80 return (dma_addr == bad_dma_address); 78 return (dma_addr == bad_dma_address);
81} 79}
82 80
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index a95008457ea4..a8f672ba100c 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -6,14 +6,13 @@
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * It manages: 8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS 9 * - DS and BTS hardware configuration
10 * - buffer memory allocation (optional) 10 * - buffer overflow handling (to be done)
11 * - buffer overflow handling
12 * - buffer access 11 * - buffer access
13 * 12 *
14 * It assumes: 13 * It does not do:
15 * - get_task_struct on all parameter tasks 14 * - security checking (is the caller allowed to trace the task)
16 * - current is allowed to trace parameter tasks 15 * - buffer allocation (memory accounting)
17 * 16 *
18 * 17 *
19 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2008 Intel Corporation.
@@ -26,11 +25,51 @@
26 25
27#include <linux/types.h> 26#include <linux/types.h>
28#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/err.h>
29 29
30 30
31#ifdef CONFIG_X86_DS 31#ifdef CONFIG_X86_DS
32 32
33struct task_struct; 33struct task_struct;
34struct ds_context;
35struct ds_tracer;
36struct bts_tracer;
37struct pebs_tracer;
38
39typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
40typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
41
42
43/*
44 * A list of features plus corresponding macros to talk about them in
45 * the ds_request function's flags parameter.
46 *
47 * We use the enum to index an array of corresponding control bits;
48 * we use the macro to index a flags bit-vector.
49 */
50enum ds_feature {
51 dsf_bts = 0,
52 dsf_bts_kernel,
53#define BTS_KERNEL (1 << dsf_bts_kernel)
54 /* trace kernel-mode branches */
55
56 dsf_bts_user,
57#define BTS_USER (1 << dsf_bts_user)
58 /* trace user-mode branches */
59
60 dsf_bts_overflow,
61 dsf_bts_max,
62 dsf_pebs = dsf_bts_max,
63
64 dsf_pebs_max,
65 dsf_ctl_max = dsf_pebs_max,
66 dsf_bts_timestamps = dsf_ctl_max,
67#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
68 /* add timestamps into BTS trace */
69
70#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
71};
72
34 73
35/* 74/*
36 * Request BTS or PEBS 75 * Request BTS or PEBS
@@ -38,163 +77,169 @@ struct task_struct;
38 * Due to alignment constraints, the actual buffer may be slightly 77 * Due to alignment constraints, the actual buffer may be slightly
39 * smaller than the requested or provided buffer. 78 * smaller than the requested or provided buffer.
40 * 79 *
41 * Returns 0 on success; -Eerrno otherwise 80 * Returns a pointer to a tracer structure on success, or
81 * ERR_PTR(errcode) on failure.
82 *
83 * The interrupt threshold is independent from the overflow callback
84 * to allow users to use their own overflow interrupt handling mechanism.
42 * 85 *
43 * task: the task to request recording for; 86 * task: the task to request recording for;
44 * NULL for per-cpu recording on the current cpu 87 * NULL for per-cpu recording on the current cpu
45 * base: the base pointer for the (non-pageable) buffer; 88 * base: the base pointer for the (non-pageable) buffer;
46 * NULL if buffer allocation requested 89 * size: the size of the provided buffer in bytes
47 * size: the size of the requested or provided buffer
48 * ovfl: pointer to a function to be called on buffer overflow; 90 * ovfl: pointer to a function to be called on buffer overflow;
49 * NULL if cyclic buffer requested 91 * NULL if cyclic buffer requested
92 * th: the interrupt threshold in records from the end of the buffer;
93 * -1 if no interrupt threshold is requested.
94 * flags: a bit-mask of the above flags
50 */ 95 */
51typedef void (*ds_ovfl_callback_t)(struct task_struct *); 96extern struct bts_tracer *ds_request_bts(struct task_struct *task,
52extern int ds_request_bts(struct task_struct *task, void *base, size_t size, 97 void *base, size_t size,
53 ds_ovfl_callback_t ovfl); 98 bts_ovfl_callback_t ovfl,
54extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, 99 size_t th, unsigned int flags);
55 ds_ovfl_callback_t ovfl); 100extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
101 void *base, size_t size,
102 pebs_ovfl_callback_t ovfl,
103 size_t th, unsigned int flags);
56 104
57/* 105/*
58 * Release BTS or PEBS resources 106 * Release BTS or PEBS resources
107 * Suspend and resume BTS or PEBS tracing
59 * 108 *
60 * Frees buffers allocated on ds_request. 109 * tracer: the tracer handle returned from ds_request_~()
61 *
62 * Returns 0 on success; -Eerrno otherwise
63 *
64 * task: the task to release resources for;
65 * NULL to release resources for the current cpu
66 */ 110 */
67extern int ds_release_bts(struct task_struct *task); 111extern void ds_release_bts(struct bts_tracer *tracer);
68extern int ds_release_pebs(struct task_struct *task); 112extern void ds_suspend_bts(struct bts_tracer *tracer);
113extern void ds_resume_bts(struct bts_tracer *tracer);
114extern void ds_release_pebs(struct pebs_tracer *tracer);
115extern void ds_suspend_pebs(struct pebs_tracer *tracer);
116extern void ds_resume_pebs(struct pebs_tracer *tracer);
69 117
70/*
71 * Return the (array) index of the write pointer.
72 * (assuming an array of BTS/PEBS records)
73 *
74 * Returns -Eerrno on error
75 *
76 * task: the task to access;
77 * NULL to access the current cpu
78 * pos (out): if not NULL, will hold the result
79 */
80extern int ds_get_bts_index(struct task_struct *task, size_t *pos);
81extern int ds_get_pebs_index(struct task_struct *task, size_t *pos);
82 118
83/* 119/*
84 * Return the (array) index one record beyond the end of the array. 120 * The raw DS buffer state as it is used for BTS and PEBS recording.
85 * (assuming an array of BTS/PEBS records)
86 * 121 *
87 * Returns -Eerrno on error 122 * This is the low-level, arch-dependent interface for working
88 * 123 * directly on the raw trace data.
89 * task: the task to access;
90 * NULL to access the current cpu
91 * pos (out): if not NULL, will hold the result
92 */ 124 */
93extern int ds_get_bts_end(struct task_struct *task, size_t *pos); 125struct ds_trace {
94extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); 126 /* the number of bts/pebs records */
127 size_t n;
128 /* the size of a bts/pebs record in bytes */
129 size_t size;
130 /* pointers into the raw buffer:
131 - to the first entry */
132 void *begin;
133 /* - one beyond the last entry */
134 void *end;
135 /* - one beyond the newest entry */
136 void *top;
137 /* - the interrupt threshold */
138 void *ith;
139 /* flags given on ds_request() */
140 unsigned int flags;
141};
95 142
96/* 143/*
97 * Provide a pointer to the BTS/PEBS record at parameter index. 144 * An arch-independent view on branch trace data.
98 * (assuming an array of BTS/PEBS records)
99 *
100 * The pointer points directly into the buffer. The user is
101 * responsible for copying the record.
102 *
103 * Returns the size of a single record on success; -Eerrno on error
104 *
105 * task: the task to access;
106 * NULL to access the current cpu
107 * index: the index of the requested record
108 * record (out): pointer to the requested record
109 */ 145 */
110extern int ds_access_bts(struct task_struct *task, 146enum bts_qualifier {
111 size_t index, const void **record); 147 bts_invalid,
112extern int ds_access_pebs(struct task_struct *task, 148#define BTS_INVALID bts_invalid
113 size_t index, const void **record); 149
150 bts_branch,
151#define BTS_BRANCH bts_branch
152
153 bts_task_arrives,
154#define BTS_TASK_ARRIVES bts_task_arrives
155
156 bts_task_departs,
157#define BTS_TASK_DEPARTS bts_task_departs
158
159 bts_qual_bit_size = 4,
160 bts_qual_max = (1 << bts_qual_bit_size),
161};
162
163struct bts_struct {
164 __u64 qualifier;
165 union {
166 /* BTS_BRANCH */
167 struct {
168 __u64 from;
169 __u64 to;
170 } lbr;
171 /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
172 struct {
173 __u64 jiffies;
174 pid_t pid;
175 } timestamp;
176 } variant;
177};
114 178
115/*
116 * Write one or more BTS/PEBS records at the write pointer index and
117 * advance the write pointer.
118 *
119 * If size is not a multiple of the record size, trailing bytes are
120 * zeroed out.
121 *
122 * May result in one or more overflow notifications.
123 *
124 * If called during overflow handling, that is, with index >=
125 * interrupt threshold, the write will wrap around.
126 *
127 * An overflow notification is given if and when the interrupt
128 * threshold is reached during or after the write.
129 *
130 * Returns the number of bytes written or -Eerrno.
131 *
132 * task: the task to access;
133 * NULL to access the current cpu
134 * buffer: the buffer to write
135 * size: the size of the buffer
136 */
137extern int ds_write_bts(struct task_struct *task,
138 const void *buffer, size_t size);
139extern int ds_write_pebs(struct task_struct *task,
140 const void *buffer, size_t size);
141 179
142/* 180/*
143 * Same as ds_write_bts/pebs, but omit ownership checks. 181 * The BTS state.
144 * 182 *
145 * This is needed to have some other task than the owner of the 183 * This gives access to the raw DS state and adds functions to provide
146 * BTS/PEBS buffer or the parameter task itself write into the 184 * an arch-independent view of the BTS data.
147 * respective buffer.
148 */ 185 */
149extern int ds_unchecked_write_bts(struct task_struct *task, 186struct bts_trace {
150 const void *buffer, size_t size); 187 struct ds_trace ds;
151extern int ds_unchecked_write_pebs(struct task_struct *task, 188
152 const void *buffer, size_t size); 189 int (*read)(struct bts_tracer *tracer, const void *at,
190 struct bts_struct *out);
191 int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
192};
193
153 194
154/* 195/*
155 * Reset the write pointer of the BTS/PEBS buffer. 196 * The PEBS state.
156 * 197 *
157 * Returns 0 on success; -Eerrno on error 198 * This gives access to the raw DS state and the PEBS-specific counter
158 * 199 * reset value.
159 * task: the task to access;
160 * NULL to access the current cpu
161 */ 200 */
162extern int ds_reset_bts(struct task_struct *task); 201struct pebs_trace {
163extern int ds_reset_pebs(struct task_struct *task); 202 struct ds_trace ds;
203
204 /* the PEBS reset value */
205 unsigned long long reset_value;
206};
207
164 208
165/* 209/*
166 * Clear the BTS/PEBS buffer and reset the write pointer. 210 * Read the BTS or PEBS trace.
167 * The entire buffer will be zeroed out.
168 * 211 *
169 * Returns 0 on success; -Eerrno on error 212 * Returns a view on the trace collected for the parameter tracer.
213 *
214 * The view remains valid as long as the traced task is not running or
215 * the tracer is suspended.
216 * Writes into the trace buffer are not reflected.
170 * 217 *
171 * task: the task to access; 218 * tracer: the tracer handle returned from ds_request_~()
172 * NULL to access the current cpu
173 */ 219 */
174extern int ds_clear_bts(struct task_struct *task); 220extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
175extern int ds_clear_pebs(struct task_struct *task); 221extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
222
176 223
177/* 224/*
178 * Provide the PEBS counter reset value. 225 * Reset the write pointer of the BTS/PEBS buffer.
179 * 226 *
180 * Returns 0 on success; -Eerrno on error 227 * Returns 0 on success; -Eerrno on error
181 * 228 *
182 * task: the task to access; 229 * tracer: the tracer handle returned from ds_request_~()
183 * NULL to access the current cpu
184 * value (out): the counter reset value
185 */ 230 */
186extern int ds_get_pebs_reset(struct task_struct *task, u64 *value); 231extern int ds_reset_bts(struct bts_tracer *tracer);
232extern int ds_reset_pebs(struct pebs_tracer *tracer);
187 233
188/* 234/*
189 * Set the PEBS counter reset value. 235 * Set the PEBS counter reset value.
190 * 236 *
191 * Returns 0 on success; -Eerrno on error 237 * Returns 0 on success; -Eerrno on error
192 * 238 *
193 * task: the task to access; 239 * tracer: the tracer handle returned from ds_request_pebs()
194 * NULL to access the current cpu
195 * value: the new counter reset value 240 * value: the new counter reset value
196 */ 241 */
197extern int ds_set_pebs_reset(struct task_struct *task, u64 value); 242extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
198 243
199/* 244/*
200 * Initialization 245 * Initialization
@@ -202,39 +247,26 @@ extern int ds_set_pebs_reset(struct task_struct *task, u64 value);
202struct cpuinfo_x86; 247struct cpuinfo_x86;
203extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); 248extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
204 249
205
206
207/* 250/*
208 * The DS context - part of struct thread_struct. 251 * Context switch work
209 */ 252 */
210struct ds_context { 253extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
211 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
212 unsigned char *ds;
213 /* the owner of the BTS and PEBS configuration, respectively */
214 struct task_struct *owner[2];
215 /* buffer overflow notification function for BTS and PEBS */
216 ds_ovfl_callback_t callback[2];
217 /* the original buffer address */
218 void *buffer[2];
219 /* the number of allocated pages for on-request allocated buffers */
220 unsigned int pages[2];
221 /* use count */
222 unsigned long count;
223 /* a pointer to the context location inside the thread_struct
224 * or the per_cpu context array */
225 struct ds_context **this;
226 /* a pointer to the task owning this context, or NULL, if the
227 * context is owned by a cpu */
228 struct task_struct *task;
229};
230 254
231/* called by exit_thread() to free leftover contexts */ 255/*
232extern void ds_free(struct ds_context *context); 256 * Task clone/init and cleanup work
257 */
258extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
259extern void ds_exit_thread(struct task_struct *tsk);
233 260
234#else /* CONFIG_X86_DS */ 261#else /* CONFIG_X86_DS */
235 262
236struct cpuinfo_x86; 263struct cpuinfo_x86;
237static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} 264static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
265static inline void ds_switch_to(struct task_struct *prev,
266 struct task_struct *next) {}
267static inline void ds_copy_thread(struct task_struct *tsk,
268 struct task_struct *father) {}
269static inline void ds_exit_thread(struct task_struct *tsk) {}
238 270
239#endif /* CONFIG_X86_DS */ 271#endif /* CONFIG_X86_DS */
240#endif /* _ASM_X86_DS_H */ 272#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 804b6e6be929..3afc5e87cfdd 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -6,56 +6,91 @@
6#endif 6#endif
7 7
8/* 8/*
9 Macros for dwarf2 CFI unwind table entries. 9 * Macros for dwarf2 CFI unwind table entries.
10 See "as.info" for details on these pseudo ops. Unfortunately 10 * See "as.info" for details on these pseudo ops. Unfortunately
11 they are only supported in very new binutils, so define them 11 * they are only supported in very new binutils, so define them
12 away for older version. 12 * away for older version.
13 */ 13 */
14 14
15#ifdef CONFIG_AS_CFI 15#ifdef CONFIG_AS_CFI
16 16
17#define CFI_STARTPROC .cfi_startproc 17#define CFI_STARTPROC .cfi_startproc
18#define CFI_ENDPROC .cfi_endproc 18#define CFI_ENDPROC .cfi_endproc
19#define CFI_DEF_CFA .cfi_def_cfa 19#define CFI_DEF_CFA .cfi_def_cfa
20#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register 20#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register
21#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset 21#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset
22#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset 22#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset
23#define CFI_OFFSET .cfi_offset 23#define CFI_OFFSET .cfi_offset
24#define CFI_REL_OFFSET .cfi_rel_offset 24#define CFI_REL_OFFSET .cfi_rel_offset
25#define CFI_REGISTER .cfi_register 25#define CFI_REGISTER .cfi_register
26#define CFI_RESTORE .cfi_restore 26#define CFI_RESTORE .cfi_restore
27#define CFI_REMEMBER_STATE .cfi_remember_state 27#define CFI_REMEMBER_STATE .cfi_remember_state
28#define CFI_RESTORE_STATE .cfi_restore_state 28#define CFI_RESTORE_STATE .cfi_restore_state
29#define CFI_UNDEFINED .cfi_undefined 29#define CFI_UNDEFINED .cfi_undefined
30 30
31#ifdef CONFIG_AS_CFI_SIGNAL_FRAME 31#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
32#define CFI_SIGNAL_FRAME .cfi_signal_frame 32#define CFI_SIGNAL_FRAME .cfi_signal_frame
33#else 33#else
34#define CFI_SIGNAL_FRAME 34#define CFI_SIGNAL_FRAME
35#endif 35#endif
36 36
37#else 37#else
38 38
39/* Due to the structure of pre-existing code, don't use assembler line 39/*
40 comment character # to ignore the arguments. Instead, use a dummy macro. */ 40 * Due to the structure of pre-existing code, don't use assembler line
41 * comment character # to ignore the arguments. Instead, use a dummy macro.
42 */
41.macro cfi_ignore a=0, b=0, c=0, d=0 43.macro cfi_ignore a=0, b=0, c=0, d=0
42.endm 44.endm
43 45
44#define CFI_STARTPROC cfi_ignore 46#define CFI_STARTPROC cfi_ignore
45#define CFI_ENDPROC cfi_ignore 47#define CFI_ENDPROC cfi_ignore
46#define CFI_DEF_CFA cfi_ignore 48#define CFI_DEF_CFA cfi_ignore
47#define CFI_DEF_CFA_REGISTER cfi_ignore 49#define CFI_DEF_CFA_REGISTER cfi_ignore
48#define CFI_DEF_CFA_OFFSET cfi_ignore 50#define CFI_DEF_CFA_OFFSET cfi_ignore
49#define CFI_ADJUST_CFA_OFFSET cfi_ignore 51#define CFI_ADJUST_CFA_OFFSET cfi_ignore
50#define CFI_OFFSET cfi_ignore 52#define CFI_OFFSET cfi_ignore
51#define CFI_REL_OFFSET cfi_ignore 53#define CFI_REL_OFFSET cfi_ignore
52#define CFI_REGISTER cfi_ignore 54#define CFI_REGISTER cfi_ignore
53#define CFI_RESTORE cfi_ignore 55#define CFI_RESTORE cfi_ignore
54#define CFI_REMEMBER_STATE cfi_ignore 56#define CFI_REMEMBER_STATE cfi_ignore
55#define CFI_RESTORE_STATE cfi_ignore 57#define CFI_RESTORE_STATE cfi_ignore
56#define CFI_UNDEFINED cfi_ignore 58#define CFI_UNDEFINED cfi_ignore
57#define CFI_SIGNAL_FRAME cfi_ignore 59#define CFI_SIGNAL_FRAME cfi_ignore
58 60
59#endif 61#endif
60 62
63/*
64 * An attempt to make CFI annotations more or less
65 * correct and shorter. It is implied that you know
66 * what you're doing if you use them.
67 */
68#ifdef __ASSEMBLY__
69#ifdef CONFIG_X86_64
70 .macro pushq_cfi reg
71 pushq \reg
72 CFI_ADJUST_CFA_OFFSET 8
73 .endm
74
75 .macro popq_cfi reg
76 popq \reg
77 CFI_ADJUST_CFA_OFFSET -8
78 .endm
79
80 .macro movq_cfi reg offset=0
81 movq %\reg, \offset(%rsp)
82 CFI_REL_OFFSET \reg, \offset
83 .endm
84
85 .macro movq_cfi_restore offset reg
86 movq \offset(%rsp), %\reg
87 CFI_RESTORE \reg
88 .endm
89#else /*!CONFIG_X86_64*/
90
91 /* 32bit definitions are still missing */
92
93#endif /*!CONFIG_X86_64*/
94#endif /*__ASSEMBLY__*/
95
61#endif /* _ASM_X86_DWARF2_H */ 96#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index a2e545c91c35..ca5ffb2856b6 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size);
90 90
91#endif /* CONFIG_X86_32 */ 91#endif /* CONFIG_X86_32 */
92 92
93extern int add_efi_memmap;
93extern void efi_reserve_early(void); 94extern void efi_reserve_early(void);
94extern void efi_call_phys_prelog(void); 95extern void efi_call_phys_prelog(void);
95extern void efi_call_phys_epilog(void); 96extern void efi_call_phys_epilog(void);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 40ca1bea7916..f51a3ddde01a 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -325,7 +325,7 @@ struct linux_binprm;
325 325
326#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 326#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
327extern int arch_setup_additional_pages(struct linux_binprm *bprm, 327extern int arch_setup_additional_pages(struct linux_binprm *bprm,
328 int executable_stack); 328 int uses_interp);
329 329
330extern int syscall32_setup_pages(struct linux_binprm *, int exstack); 330extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
331#define compat_arch_setup_additional_pages syscall32_setup_pages 331#define compat_arch_setup_additional_pages syscall32_setup_pages
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
index 94826cf87455..cc70c1c78ca4 100644
--- a/arch/x86/include/asm/emergency-restart.h
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -8,7 +8,9 @@ enum reboot_type {
8 BOOT_BIOS = 'b', 8 BOOT_BIOS = 'b',
9#endif 9#endif
10 BOOT_ACPI = 'a', 10 BOOT_ACPI = 'a',
11 BOOT_EFI = 'e' 11 BOOT_EFI = 'e',
12 BOOT_CF9 = 'p',
13 BOOT_CF9_COND = 'q',
12}; 14};
13 15
14extern enum reboot_type reboot_type; 16extern enum reboot_type reboot_type;
diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h
index 380f0b4f17ed..bc53d5ef1386 100644
--- a/arch/x86/include/asm/es7000/apic.h
+++ b/arch/x86/include/asm/es7000/apic.h
@@ -9,31 +9,27 @@ static inline int apic_id_registered(void)
9 return (1); 9 return (1);
10} 10}
11 11
12static inline cpumask_t target_cpus(void) 12static inline const cpumask_t *target_cpus_cluster(void)
13{ 13{
14#if defined CONFIG_ES7000_CLUSTERED_APIC 14 return &CPU_MASK_ALL;
15 return CPU_MASK_ALL;
16#else
17 return cpumask_of_cpu(smp_processor_id());
18#endif
19} 15}
20 16
21#if defined CONFIG_ES7000_CLUSTERED_APIC 17static inline const cpumask_t *target_cpus(void)
22#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) 18{
23#define INT_DELIVERY_MODE (dest_LowestPrio) 19 return &cpumask_of_cpu(smp_processor_id());
24#define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */ 20}
25#define NO_BALANCE_IRQ (1) 21
26#undef WAKE_SECONDARY_VIA_INIT 22#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER)
27#define WAKE_SECONDARY_VIA_MIP 23#define INT_DELIVERY_MODE_CLUSTER (dest_LowestPrio)
28#else 24#define INT_DEST_MODE_CLUSTER (1) /* logical delivery broadcast to all procs */
25#define NO_BALANCE_IRQ_CLUSTER (1)
26
29#define APIC_DFR_VALUE (APIC_DFR_FLAT) 27#define APIC_DFR_VALUE (APIC_DFR_FLAT)
30#define INT_DELIVERY_MODE (dest_Fixed) 28#define INT_DELIVERY_MODE (dest_Fixed)
31#define INT_DEST_MODE (0) /* phys delivery to target procs */ 29#define INT_DEST_MODE (0) /* phys delivery to target procs */
32#define NO_BALANCE_IRQ (0) 30#define NO_BALANCE_IRQ (0)
33#undef APIC_DEST_LOGICAL 31#undef APIC_DEST_LOGICAL
34#define APIC_DEST_LOGICAL 0x0 32#define APIC_DEST_LOGICAL 0x0
35#define WAKE_SECONDARY_VIA_INIT
36#endif
37 33
38static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) 34static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
39{ 35{
@@ -60,6 +56,16 @@ static inline unsigned long calculate_ldr(int cpu)
60 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel 56 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
61 * document number 292116). So here it goes... 57 * document number 292116). So here it goes...
62 */ 58 */
59static inline void init_apic_ldr_cluster(void)
60{
61 unsigned long val;
62 int cpu = smp_processor_id();
63
64 apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER);
65 val = calculate_ldr(cpu);
66 apic_write(APIC_LDR, val);
67}
68
63static inline void init_apic_ldr(void) 69static inline void init_apic_ldr(void)
64{ 70{
65 unsigned long val; 71 unsigned long val;
@@ -70,17 +76,14 @@ static inline void init_apic_ldr(void)
70 apic_write(APIC_LDR, val); 76 apic_write(APIC_LDR, val);
71} 77}
72 78
73#ifndef CONFIG_X86_GENERICARCH
74extern void enable_apic_mode(void);
75#endif
76
77extern int apic_version [MAX_APICS]; 79extern int apic_version [MAX_APICS];
78static inline void setup_apic_routing(void) 80static inline void setup_apic_routing(void)
79{ 81{
80 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); 82 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
81 printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", 83 printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
82 (apic_version[apic] == 0x14) ? 84 (apic_version[apic] == 0x14) ?
83 "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]); 85 "Physical Cluster" : "Logical Cluster",
86 nr_ioapics, cpus_addr(*target_cpus())[0]);
84} 87}
85 88
86static inline int multi_timer_check(int apic, int irq) 89static inline int multi_timer_check(int apic, int irq)
@@ -98,7 +101,7 @@ static inline int cpu_present_to_apicid(int mps_cpu)
98{ 101{
99 if (!mps_cpu) 102 if (!mps_cpu)
100 return boot_cpu_physical_apicid; 103 return boot_cpu_physical_apicid;
101 else if (mps_cpu < NR_CPUS) 104 else if (mps_cpu < nr_cpu_ids)
102 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); 105 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
103 else 106 else
104 return BAD_APICID; 107 return BAD_APICID;
@@ -118,9 +121,9 @@ extern u8 cpu_2_logical_apicid[];
118static inline int cpu_to_logical_apicid(int cpu) 121static inline int cpu_to_logical_apicid(int cpu)
119{ 122{
120#ifdef CONFIG_SMP 123#ifdef CONFIG_SMP
121 if (cpu >= NR_CPUS) 124 if (cpu >= nr_cpu_ids)
122 return BAD_APICID; 125 return BAD_APICID;
123 return (int)cpu_2_logical_apicid[cpu]; 126 return (int)cpu_2_logical_apicid[cpu];
124#else 127#else
125 return logical_smp_processor_id(); 128 return logical_smp_processor_id();
126#endif 129#endif
@@ -144,38 +147,64 @@ static inline int check_phys_apicid_present(int cpu_physical_apicid)
144 return (1); 147 return (1);
145} 148}
146 149
147static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) 150static inline unsigned int
151cpu_mask_to_apicid_cluster(const struct cpumask *cpumask)
148{ 152{
149 int num_bits_set; 153 int num_bits_set;
150 int cpus_found = 0; 154 int cpus_found = 0;
151 int cpu; 155 int cpu;
152 int apicid; 156 int apicid;
153 157
154 num_bits_set = cpus_weight(cpumask); 158 num_bits_set = cpumask_weight(cpumask);
155 /* Return id to all */ 159 /* Return id to all */
156 if (num_bits_set == NR_CPUS) 160 if (num_bits_set == nr_cpu_ids)
157#if defined CONFIG_ES7000_CLUSTERED_APIC
158 return 0xFF; 161 return 0xFF;
159#else
160 return cpu_to_logical_apicid(0);
161#endif
162 /* 162 /*
163 * The cpus in the mask must all be on the apic cluster. If are not 163 * The cpus in the mask must all be on the apic cluster. If are not
164 * on the same apicid cluster return default value of TARGET_CPUS. 164 * on the same apicid cluster return default value of TARGET_CPUS.
165 */ 165 */
166 cpu = first_cpu(cpumask); 166 cpu = cpumask_first(cpumask);
167 apicid = cpu_to_logical_apicid(cpu); 167 apicid = cpu_to_logical_apicid(cpu);
168 while (cpus_found < num_bits_set) { 168 while (cpus_found < num_bits_set) {
169 if (cpu_isset(cpu, cpumask)) { 169 if (cpumask_test_cpu(cpu, cpumask)) {
170 int new_apicid = cpu_to_logical_apicid(cpu); 170 int new_apicid = cpu_to_logical_apicid(cpu);
171 if (apicid_cluster(apicid) != 171 if (apicid_cluster(apicid) !=
172 apicid_cluster(new_apicid)){ 172 apicid_cluster(new_apicid)){
173 printk ("%s: Not a valid mask!\n", __func__); 173 printk ("%s: Not a valid mask!\n", __func__);
174#if defined CONFIG_ES7000_CLUSTERED_APIC
175 return 0xFF; 174 return 0xFF;
176#else 175 }
176 apicid = new_apicid;
177 cpus_found++;
178 }
179 cpu++;
180 }
181 return apicid;
182}
183
184static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
185{
186 int num_bits_set;
187 int cpus_found = 0;
188 int cpu;
189 int apicid;
190
191 num_bits_set = cpus_weight(*cpumask);
192 /* Return id to all */
193 if (num_bits_set == nr_cpu_ids)
194 return cpu_to_logical_apicid(0);
195 /*
196 * The cpus in the mask must all be on the apic cluster. If are not
197 * on the same apicid cluster return default value of TARGET_CPUS.
198 */
199 cpu = first_cpu(*cpumask);
200 apicid = cpu_to_logical_apicid(cpu);
201 while (cpus_found < num_bits_set) {
202 if (cpu_isset(cpu, *cpumask)) {
203 int new_apicid = cpu_to_logical_apicid(cpu);
204 if (apicid_cluster(apicid) !=
205 apicid_cluster(new_apicid)){
206 printk ("%s: Not a valid mask!\n", __func__);
177 return cpu_to_logical_apicid(0); 207 return cpu_to_logical_apicid(0);
178#endif
179 } 208 }
180 apicid = new_apicid; 209 apicid = new_apicid;
181 cpus_found++; 210 cpus_found++;
@@ -185,6 +214,24 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
185 return apicid; 214 return apicid;
186} 215}
187 216
217
218static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask,
219 const struct cpumask *andmask)
220{
221 int apicid = cpu_to_logical_apicid(0);
222 cpumask_var_t cpumask;
223
224 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
225 return apicid;
226
227 cpumask_and(cpumask, inmask, andmask);
228 cpumask_and(cpumask, cpumask, cpu_online_mask);
229 apicid = cpu_mask_to_apicid(cpumask);
230
231 free_cpumask_var(cpumask);
232 return apicid;
233}
234
188static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) 235static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
189{ 236{
190 return cpuid_apic >> index_msb; 237 return cpuid_apic >> index_msb;
diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h
index 632a955fcc0a..7e8ed24d4b8a 100644
--- a/arch/x86/include/asm/es7000/ipi.h
+++ b/arch/x86/include/asm/es7000/ipi.h
@@ -1,24 +1,22 @@
1#ifndef __ASM_ES7000_IPI_H 1#ifndef __ASM_ES7000_IPI_H
2#define __ASM_ES7000_IPI_H 2#define __ASM_ES7000_IPI_H
3 3
4void send_IPI_mask_sequence(cpumask_t mask, int vector); 4void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
5void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
5 6
6static inline void send_IPI_mask(cpumask_t mask, int vector) 7static inline void send_IPI_mask(const struct cpumask *mask, int vector)
7{ 8{
8 send_IPI_mask_sequence(mask, vector); 9 send_IPI_mask_sequence(mask, vector);
9} 10}
10 11
11static inline void send_IPI_allbutself(int vector) 12static inline void send_IPI_allbutself(int vector)
12{ 13{
13 cpumask_t mask = cpu_online_map; 14 send_IPI_mask_allbutself(cpu_online_mask, vector);
14 cpu_clear(smp_processor_id(), mask);
15 if (!cpus_empty(mask))
16 send_IPI_mask(mask, vector);
17} 15}
18 16
19static inline void send_IPI_all(int vector) 17static inline void send_IPI_all(int vector)
20{ 18{
21 send_IPI_mask(cpu_online_map, vector); 19 send_IPI_mask(cpu_online_mask, vector);
22} 20}
23 21
24#endif /* __ASM_ES7000_IPI_H */ 22#endif /* __ASM_ES7000_IPI_H */
diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h
index 398493461913..78f0daaee436 100644
--- a/arch/x86/include/asm/es7000/wakecpu.h
+++ b/arch/x86/include/asm/es7000/wakecpu.h
@@ -1,36 +1,12 @@
1#ifndef __ASM_ES7000_WAKECPU_H 1#ifndef __ASM_ES7000_WAKECPU_H
2#define __ASM_ES7000_WAKECPU_H 2#define __ASM_ES7000_WAKECPU_H
3 3
4/* 4#define TRAMPOLINE_PHYS_LOW 0x467
5 * This file copes with machines that wakeup secondary CPUs by the 5#define TRAMPOLINE_PHYS_HIGH 0x469
6 * INIT, INIT, STARTUP sequence.
7 */
8
9#ifdef CONFIG_ES7000_CLUSTERED_APIC
10#define WAKE_SECONDARY_VIA_MIP
11#else
12#define WAKE_SECONDARY_VIA_INIT
13#endif
14
15#ifdef WAKE_SECONDARY_VIA_MIP
16extern int es7000_start_cpu(int cpu, unsigned long eip);
17static inline int
18wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
19{
20 int boot_error = 0;
21 boot_error = es7000_start_cpu(phys_apicid, start_eip);
22 return boot_error;
23}
24#endif
25
26#define TRAMPOLINE_LOW phys_to_virt(0x467)
27#define TRAMPOLINE_HIGH phys_to_virt(0x469)
28
29#define boot_cpu_apicid boot_cpu_physical_apicid
30 6
31static inline void wait_for_init_deassert(atomic_t *deassert) 7static inline void wait_for_init_deassert(atomic_t *deassert)
32{ 8{
33#ifdef WAKE_SECONDARY_VIA_INIT 9#ifndef CONFIG_ES7000_CLUSTERED_APIC
34 while (!atomic_read(deassert)) 10 while (!atomic_read(deassert))
35 cpu_relax(); 11 cpu_relax();
36#endif 12#endif
@@ -50,9 +26,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
50{ 26{
51} 27}
52 28
53#define inquire_remote_apic(apicid) do { \ 29extern void __inquire_remote_apic(int apicid);
54 if (apic_verbosity >= APIC_DEBUG) \ 30
55 __inquire_remote_apic(apicid); \ 31static inline void inquire_remote_apic(int apicid)
56 } while (0) 32{
33 if (apic_verbosity >= APIC_DEBUG)
34 __inquire_remote_apic(apicid);
35}
57 36
58#endif /* __ASM_MACH_WAKECPU_H */ 37#endif /* __ASM_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 9e8bc29b8b17..b55b4a7fbefd 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -1,6 +1,33 @@
1#ifndef _ASM_X86_FTRACE_H 1#ifndef _ASM_X86_FTRACE_H
2#define _ASM_X86_FTRACE_H 2#define _ASM_X86_FTRACE_H
3 3
4#ifdef __ASSEMBLY__
5
6 .macro MCOUNT_SAVE_FRAME
7 /* taken from glibc */
8 subq $0x38, %rsp
9 movq %rax, (%rsp)
10 movq %rcx, 8(%rsp)
11 movq %rdx, 16(%rsp)
12 movq %rsi, 24(%rsp)
13 movq %rdi, 32(%rsp)
14 movq %r8, 40(%rsp)
15 movq %r9, 48(%rsp)
16 .endm
17
18 .macro MCOUNT_RESTORE_FRAME
19 movq 48(%rsp), %r9
20 movq 40(%rsp), %r8
21 movq 32(%rsp), %rdi
22 movq 24(%rsp), %rsi
23 movq 16(%rsp), %rdx
24 movq 8(%rsp), %rcx
25 movq (%rsp), %rax
26 addq $0x38, %rsp
27 .endm
28
29#endif
30
4#ifdef CONFIG_FUNCTION_TRACER 31#ifdef CONFIG_FUNCTION_TRACER
5#define MCOUNT_ADDR ((long)(mcount)) 32#define MCOUNT_ADDR ((long)(mcount))
6#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ 33#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
@@ -17,8 +44,40 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
17 */ 44 */
18 return addr - 1; 45 return addr - 1;
19} 46}
20#endif
21 47
48#ifdef CONFIG_DYNAMIC_FTRACE
49
50struct dyn_arch_ftrace {
51 /* No extra data needed for x86 */
52};
53
54#endif /* CONFIG_DYNAMIC_FTRACE */
55#endif /* __ASSEMBLY__ */
22#endif /* CONFIG_FUNCTION_TRACER */ 56#endif /* CONFIG_FUNCTION_TRACER */
23 57
58#ifdef CONFIG_FUNCTION_GRAPH_TRACER
59
60#ifndef __ASSEMBLY__
61
62/*
63 * Stack of return addresses for functions
64 * of a thread.
65 * Used in struct thread_info
66 */
67struct ftrace_ret_stack {
68 unsigned long ret;
69 unsigned long func;
70 unsigned long long calltime;
71};
72
73/*
74 * Primary handler of a function return.
75 * It relays on ftrace_return_to_handler.
76 * Defined in entry_32/64.S
77 */
78extern void return_to_handler(void);
79
80#endif /* __ASSEMBLY__ */
81#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
82
24#endif /* _ASM_X86_FTRACE_H */ 83#endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 74252264433d..6cfdafa409d8 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -29,6 +29,39 @@ extern int fix_aperture;
29#define AMD64_GARTCACHECTL 0x9c 29#define AMD64_GARTCACHECTL 0x9c
30#define AMD64_GARTEN (1<<0) 30#define AMD64_GARTEN (1<<0)
31 31
32#ifdef CONFIG_GART_IOMMU
33extern int gart_iommu_aperture;
34extern int gart_iommu_aperture_allowed;
35extern int gart_iommu_aperture_disabled;
36
37extern void early_gart_iommu_check(void);
38extern void gart_iommu_init(void);
39extern void gart_iommu_shutdown(void);
40extern void __init gart_parse_options(char *);
41extern void gart_iommu_hole_init(void);
42
43#else
44#define gart_iommu_aperture 0
45#define gart_iommu_aperture_allowed 0
46#define gart_iommu_aperture_disabled 1
47
48static inline void early_gart_iommu_check(void)
49{
50}
51static inline void gart_iommu_init(void)
52{
53}
54static inline void gart_iommu_shutdown(void)
55{
56}
57static inline void gart_parse_options(char *options)
58{
59}
60static inline void gart_iommu_hole_init(void)
61{
62}
63#endif
64
32extern int agp_amd64_init(void); 65extern int agp_amd64_init(void);
33 66
34static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) 67static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
index 5cbd4fcc06fd..746f37a7963a 100644
--- a/arch/x86/include/asm/genapic_32.h
+++ b/arch/x86/include/asm/genapic_32.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_GENAPIC_32_H 2#define _ASM_X86_GENAPIC_32_H
3 3
4#include <asm/mpspec.h> 4#include <asm/mpspec.h>
5#include <asm/atomic.h>
5 6
6/* 7/*
7 * Generic APIC driver interface. 8 * Generic APIC driver interface.
@@ -23,7 +24,7 @@ struct genapic {
23 int (*probe)(void); 24 int (*probe)(void);
24 25
25 int (*apic_id_registered)(void); 26 int (*apic_id_registered)(void);
26 cpumask_t (*target_cpus)(void); 27 const struct cpumask *(*target_cpus)(void);
27 int int_delivery_mode; 28 int int_delivery_mode;
28 int int_dest_mode; 29 int int_dest_mode;
29 int ESR_DISABLE; 30 int ESR_DISABLE;
@@ -56,15 +57,27 @@ struct genapic {
56 57
57 unsigned (*get_apic_id)(unsigned long x); 58 unsigned (*get_apic_id)(unsigned long x);
58 unsigned long apic_id_mask; 59 unsigned long apic_id_mask;
59 unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); 60 unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
60 cpumask_t (*vector_allocation_domain)(int cpu); 61 unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
62 const struct cpumask *andmask);
63 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
61 64
62#ifdef CONFIG_SMP 65#ifdef CONFIG_SMP
63 /* ipi */ 66 /* ipi */
64 void (*send_IPI_mask)(cpumask_t mask, int vector); 67 void (*send_IPI_mask)(const struct cpumask *mask, int vector);
68 void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
69 int vector);
65 void (*send_IPI_allbutself)(int vector); 70 void (*send_IPI_allbutself)(int vector);
66 void (*send_IPI_all)(int vector); 71 void (*send_IPI_all)(int vector);
67#endif 72#endif
73 int (*wakeup_cpu)(int apicid, unsigned long start_eip);
74 int trampoline_phys_low;
75 int trampoline_phys_high;
76 void (*wait_for_init_deassert)(atomic_t *deassert);
77 void (*smp_callin_clear_local_apic)(void);
78 void (*store_NMI_vector)(unsigned short *high, unsigned short *low);
79 void (*restore_NMI_vector)(unsigned short *high, unsigned short *low);
80 void (*inquire_remote_apic)(int apicid);
68}; 81};
69 82
70#define APICFUNC(x) .x = x, 83#define APICFUNC(x) .x = x,
@@ -105,16 +118,25 @@ struct genapic {
105 APICFUNC(get_apic_id) \ 118 APICFUNC(get_apic_id) \
106 .apic_id_mask = APIC_ID_MASK, \ 119 .apic_id_mask = APIC_ID_MASK, \
107 APICFUNC(cpu_mask_to_apicid) \ 120 APICFUNC(cpu_mask_to_apicid) \
108 APICFUNC(vector_allocation_domain) \ 121 APICFUNC(cpu_mask_to_apicid_and) \
122 APICFUNC(vector_allocation_domain) \
109 APICFUNC(acpi_madt_oem_check) \ 123 APICFUNC(acpi_madt_oem_check) \
110 IPIFUNC(send_IPI_mask) \ 124 IPIFUNC(send_IPI_mask) \
111 IPIFUNC(send_IPI_allbutself) \ 125 IPIFUNC(send_IPI_allbutself) \
112 IPIFUNC(send_IPI_all) \ 126 IPIFUNC(send_IPI_all) \
113 APICFUNC(enable_apic_mode) \ 127 APICFUNC(enable_apic_mode) \
114 APICFUNC(phys_pkg_id) \ 128 APICFUNC(phys_pkg_id) \
129 .trampoline_phys_low = TRAMPOLINE_PHYS_LOW, \
130 .trampoline_phys_high = TRAMPOLINE_PHYS_HIGH, \
131 APICFUNC(wait_for_init_deassert) \
132 APICFUNC(smp_callin_clear_local_apic) \
133 APICFUNC(store_NMI_vector) \
134 APICFUNC(restore_NMI_vector) \
135 APICFUNC(inquire_remote_apic) \
115} 136}
116 137
117extern struct genapic *genapic; 138extern struct genapic *genapic;
139extern void es7000_update_genapic_to_cluster(void);
118 140
119enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; 141enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
120#define get_uv_system_type() UV_NONE 142#define get_uv_system_type() UV_NONE
diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h
index 13c4e96199ea..adf32fb56aa6 100644
--- a/arch/x86/include/asm/genapic_64.h
+++ b/arch/x86/include/asm/genapic_64.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_GENAPIC_64_H 1#ifndef _ASM_X86_GENAPIC_64_H
2#define _ASM_X86_GENAPIC_64_H 2#define _ASM_X86_GENAPIC_64_H
3 3
4#include <linux/cpumask.h>
5
4/* 6/*
5 * Copyright 2004 James Cleverdon, IBM. 7 * Copyright 2004 James Cleverdon, IBM.
6 * Subject to the GNU Public License, v.2 8 * Subject to the GNU Public License, v.2
@@ -18,20 +20,26 @@ struct genapic {
18 u32 int_delivery_mode; 20 u32 int_delivery_mode;
19 u32 int_dest_mode; 21 u32 int_dest_mode;
20 int (*apic_id_registered)(void); 22 int (*apic_id_registered)(void);
21 cpumask_t (*target_cpus)(void); 23 const struct cpumask *(*target_cpus)(void);
22 cpumask_t (*vector_allocation_domain)(int cpu); 24 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
23 void (*init_apic_ldr)(void); 25 void (*init_apic_ldr)(void);
24 /* ipi */ 26 /* ipi */
25 void (*send_IPI_mask)(cpumask_t mask, int vector); 27 void (*send_IPI_mask)(const struct cpumask *mask, int vector);
28 void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
29 int vector);
26 void (*send_IPI_allbutself)(int vector); 30 void (*send_IPI_allbutself)(int vector);
27 void (*send_IPI_all)(int vector); 31 void (*send_IPI_all)(int vector);
28 void (*send_IPI_self)(int vector); 32 void (*send_IPI_self)(int vector);
29 /* */ 33 /* */
30 unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); 34 unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
35 unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
36 const struct cpumask *andmask);
31 unsigned int (*phys_pkg_id)(int index_msb); 37 unsigned int (*phys_pkg_id)(int index_msb);
32 unsigned int (*get_apic_id)(unsigned long x); 38 unsigned int (*get_apic_id)(unsigned long x);
33 unsigned long (*set_apic_id)(unsigned int id); 39 unsigned long (*set_apic_id)(unsigned int id);
34 unsigned long apic_id_mask; 40 unsigned long apic_id_mask;
41 /* wakeup_secondary_cpu */
42 int (*wakeup_cpu)(int apicid, unsigned long start_eip);
35}; 43};
36 44
37extern struct genapic *genapic; 45extern struct genapic *genapic;
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
index 5ca135e72f2b..cf7954d1405f 100644
--- a/arch/x86/include/asm/hardirq_32.h
+++ b/arch/x86/include/asm/hardirq_32.h
@@ -22,6 +22,8 @@ DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
22#define __ARCH_IRQ_STAT 22#define __ARCH_IRQ_STAT
23#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member) 23#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member)
24 24
25#define inc_irq_stat(member) (__get_cpu_var(irq_stat).member++)
26
25void ack_bad_irq(unsigned int irq); 27void ack_bad_irq(unsigned int irq);
26#include <linux/irq_cpustat.h> 28#include <linux/irq_cpustat.h>
27 29
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
index 1ba381fc51d3..b5a6b5d56704 100644
--- a/arch/x86/include/asm/hardirq_64.h
+++ b/arch/x86/include/asm/hardirq_64.h
@@ -11,6 +11,8 @@
11 11
12#define __ARCH_IRQ_STAT 1 12#define __ARCH_IRQ_STAT 1
13 13
14#define inc_irq_stat(member) add_pda(member, 1)
15
14#define local_softirq_pending() read_pda(__softirq_pending) 16#define local_softirq_pending() read_pda(__softirq_pending)
15 17
16#define __ARCH_SET_SOFTIRQ_PENDING 1 18#define __ARCH_SET_SOFTIRQ_PENDING 1
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b97aecb0b61d..8de644b6b959 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -109,9 +109,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
109#endif 109#endif
110#endif 110#endif
111 111
112#ifdef CONFIG_X86_32 112extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
113extern void (*const interrupt[NR_VECTORS])(void);
114#endif
115 113
116typedef int vector_irq_t[NR_VECTORS]; 114typedef int vector_irq_t[NR_VECTORS];
117DECLARE_PER_CPU(vector_irq_t, vector_irq); 115DECLARE_PER_CPU(vector_irq_t, vector_irq);
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
new file mode 100644
index 000000000000..369f5c5d09a1
--- /dev/null
+++ b/arch/x86/include/asm/hypervisor.h
@@ -0,0 +1,26 @@
1/*
2 * Copyright (C) 2008, VMware, Inc.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
12 * NON INFRINGEMENT. See the GNU General Public License for more
13 * details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20#ifndef ASM_X86__HYPERVISOR_H
21#define ASM_X86__HYPERVISOR_H
22
23extern unsigned long get_hypervisor_tsc_freq(void);
24extern void init_hypervisor(struct cpuinfo_x86 *c);
25
26#endif
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 97989c0e534c..50ca486fd88c 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -129,24 +129,6 @@ typedef struct compat_siginfo {
129 } _sifields; 129 } _sifields;
130} compat_siginfo_t; 130} compat_siginfo_t;
131 131
132struct sigframe32 {
133 u32 pretcode;
134 int sig;
135 struct sigcontext_ia32 sc;
136 struct _fpstate_ia32 fpstate;
137 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
138};
139
140struct rt_sigframe32 {
141 u32 pretcode;
142 int sig;
143 u32 pinfo;
144 u32 puc;
145 compat_siginfo_t info;
146 struct ucontext_ia32 uc;
147 struct _fpstate_ia32 fpstate;
148};
149
150struct ustat32 { 132struct ustat32 {
151 __u32 f_tfree; 133 __u32 f_tfree;
152 compat_ino_t f_tinode; 134 compat_ino_t f_tinode;
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index 44c89c3a23e9..38d87379e270 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -8,8 +8,13 @@ struct notifier_block;
8void idle_notifier_register(struct notifier_block *n); 8void idle_notifier_register(struct notifier_block *n);
9void idle_notifier_unregister(struct notifier_block *n); 9void idle_notifier_unregister(struct notifier_block *n);
10 10
11#ifdef CONFIG_X86_64
11void enter_idle(void); 12void enter_idle(void);
12void exit_idle(void); 13void exit_idle(void);
14#else /* !CONFIG_X86_64 */
15static inline void enter_idle(void) { }
16static inline void exit_idle(void) { }
17#endif /* CONFIG_X86_64 */
13 18
14void c1e_remove_cpu(int cpu); 19void c1e_remove_cpu(int cpu);
15 20
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index ac2abc88cd95..05cfed4485fa 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -4,6 +4,7 @@
4#define ARCH_HAS_IOREMAP_WC 4#define ARCH_HAS_IOREMAP_WC
5 5
6#include <linux/compiler.h> 6#include <linux/compiler.h>
7#include <asm-generic/int-ll64.h>
7 8
8#define build_mmio_read(name, size, type, reg, barrier) \ 9#define build_mmio_read(name, size, type, reg, barrier) \
9static inline type name(const volatile void __iomem *addr) \ 10static inline type name(const volatile void __iomem *addr) \
@@ -45,21 +46,39 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
45#define mmiowb() barrier() 46#define mmiowb() barrier()
46 47
47#ifdef CONFIG_X86_64 48#ifdef CONFIG_X86_64
49
48build_mmio_read(readq, "q", unsigned long, "=r", :"memory") 50build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
49build_mmio_read(__readq, "q", unsigned long, "=r", )
50build_mmio_write(writeq, "q", unsigned long, "r", :"memory") 51build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
51build_mmio_write(__writeq, "q", unsigned long, "r", )
52 52
53#define readq_relaxed(a) __readq(a) 53#else
54#define __raw_readq __readq 54
55#define __raw_writeq writeq 55static inline __u64 readq(const volatile void __iomem *addr)
56{
57 const volatile u32 __iomem *p = addr;
58 u32 low, high;
59
60 low = readl(p);
61 high = readl(p + 1);
62
63 return low + ((u64)high << 32);
64}
65
66static inline void writeq(__u64 val, volatile void __iomem *addr)
67{
68 writel(val, addr);
69 writel(val >> 32, addr+4);
70}
56 71
57/* Let people know we have them */
58#define readq readq
59#define writeq writeq
60#endif 72#endif
61 73
62extern int iommu_bio_merge; 74#define readq_relaxed(a) readq(a)
75
76#define __raw_readq(a) readq(a)
77#define __raw_writeq(val, addr) writeq(val, addr)
78
79/* Let people know that we have them */
80#define readq readq
81#define writeq writeq
63 82
64#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
65# include "io_32.h" 84# include "io_32.h"
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
index fea325a1122f..563c16270ba6 100644
--- a/arch/x86/include/asm/io_64.h
+++ b/arch/x86/include/asm/io_64.h
@@ -232,8 +232,6 @@ void memset_io(volatile void __iomem *a, int b, size_t c);
232 232
233#define flush_write_buffers() 233#define flush_write_buffers()
234 234
235#define BIO_VMERGE_BOUNDARY iommu_bio_merge
236
237/* 235/*
238 * Convert a virtual cached pointer to an uncached pointer 236 * Convert a virtual cached pointer to an uncached pointer
239 */ 237 */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 6afd9933a7dd..7a1f44ac1f17 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -156,11 +156,21 @@ extern int sis_apic_bug;
156/* 1 if "noapic" boot option passed */ 156/* 1 if "noapic" boot option passed */
157extern int skip_ioapic_setup; 157extern int skip_ioapic_setup;
158 158
159/* 1 if "noapic" boot option passed */
160extern int noioapicquirk;
161
162/* -1 if "noapic" boot option passed */
163extern int noioapicreroute;
164
159/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ 165/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
160extern int timer_through_8259; 166extern int timer_through_8259;
161 167
162static inline void disable_ioapic_setup(void) 168static inline void disable_ioapic_setup(void)
163{ 169{
170#ifdef CONFIG_PCI
171 noioapicquirk = 1;
172 noioapicreroute = -1;
173#endif
164 skip_ioapic_setup = 1; 174 skip_ioapic_setup = 1;
165} 175}
166 176
@@ -188,17 +198,14 @@ extern void restore_IO_APIC_setup(void);
188extern void reinit_intr_remapped_IO_APIC(int); 198extern void reinit_intr_remapped_IO_APIC(int);
189#endif 199#endif
190 200
191extern int probe_nr_irqs(void); 201extern void probe_nr_irqs_gsi(void);
192 202
193#else /* !CONFIG_X86_IO_APIC */ 203#else /* !CONFIG_X86_IO_APIC */
194#define io_apic_assign_pci_irqs 0 204#define io_apic_assign_pci_irqs 0
195static const int timer_through_8259 = 0; 205static const int timer_through_8259 = 0;
196static inline void ioapic_init_mappings(void) { } 206static inline void ioapic_init_mappings(void) { }
197 207
198static inline int probe_nr_irqs(void) 208static inline void probe_nr_irqs_gsi(void) { }
199{
200 return NR_IRQS;
201}
202#endif 209#endif
203 210
204#endif /* _ASM_X86_IO_APIC_H */ 211#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 0b500c5b6446..a6ee9e6f530f 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -7,42 +7,7 @@ extern struct dma_mapping_ops nommu_dma_ops;
7extern int force_iommu, no_iommu; 7extern int force_iommu, no_iommu;
8extern int iommu_detected; 8extern int iommu_detected;
9 9
10extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len);
11
12/* 10 seconds */ 10/* 10 seconds */
13#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) 11#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
14 12
15#ifdef CONFIG_GART_IOMMU
16extern int gart_iommu_aperture;
17extern int gart_iommu_aperture_allowed;
18extern int gart_iommu_aperture_disabled;
19
20extern void early_gart_iommu_check(void);
21extern void gart_iommu_init(void);
22extern void gart_iommu_shutdown(void);
23extern void __init gart_parse_options(char *);
24extern void gart_iommu_hole_init(void);
25
26#else
27#define gart_iommu_aperture 0
28#define gart_iommu_aperture_allowed 0
29#define gart_iommu_aperture_disabled 1
30
31static inline void early_gart_iommu_check(void)
32{
33}
34static inline void gart_iommu_init(void)
35{
36}
37static inline void gart_iommu_shutdown(void)
38{
39}
40static inline void gart_parse_options(char *options)
41{
42}
43static inline void gart_iommu_hole_init(void)
44{
45}
46#endif
47
48#endif /* _ASM_X86_IOMMU_H */ 13#endif /* _ASM_X86_IOMMU_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index f89dffb28aa9..c745a306f7d3 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -117,7 +117,8 @@ static inline void __send_IPI_dest_field(unsigned int mask, int vector,
117 native_apic_mem_write(APIC_ICR, cfg); 117 native_apic_mem_write(APIC_ICR, cfg);
118} 118}
119 119
120static inline void send_IPI_mask_sequence(cpumask_t mask, int vector) 120static inline void send_IPI_mask_sequence(const struct cpumask *mask,
121 int vector)
121{ 122{
122 unsigned long flags; 123 unsigned long flags;
123 unsigned long query_cpu; 124 unsigned long query_cpu;
@@ -128,11 +129,29 @@ static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
128 * - mbligh 129 * - mbligh
129 */ 130 */
130 local_irq_save(flags); 131 local_irq_save(flags);
131 for_each_cpu_mask_nr(query_cpu, mask) { 132 for_each_cpu(query_cpu, mask) {
132 __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), 133 __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu),
133 vector, APIC_DEST_PHYSICAL); 134 vector, APIC_DEST_PHYSICAL);
134 } 135 }
135 local_irq_restore(flags); 136 local_irq_restore(flags);
136} 137}
137 138
139static inline void send_IPI_mask_allbutself(const struct cpumask *mask,
140 int vector)
141{
142 unsigned long flags;
143 unsigned int query_cpu;
144 unsigned int this_cpu = smp_processor_id();
145
146 /* See Hack comment above */
147
148 local_irq_save(flags);
149 for_each_cpu(query_cpu, mask)
150 if (query_cpu != this_cpu)
151 __send_IPI_dest_field(
152 per_cpu(x86_cpu_to_apicid, query_cpu),
153 vector, APIC_DEST_PHYSICAL);
154 local_irq_restore(flags);
155}
156
138#endif /* _ASM_X86_IPI_H */ 157#endif /* _ASM_X86_IPI_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index bae0eda95486..592688ed04d3 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -31,13 +31,9 @@ static inline int irq_canonicalize(int irq)
31# endif 31# endif
32#endif 32#endif
33 33
34#ifdef CONFIG_IRQBALANCE
35extern int irqbalance_disable(char *str);
36#endif
37
38#ifdef CONFIG_HOTPLUG_CPU 34#ifdef CONFIG_HOTPLUG_CPU
39#include <linux/cpumask.h> 35#include <linux/cpumask.h>
40extern void fixup_irqs(cpumask_t map); 36extern void fixup_irqs(void);
41#endif 37#endif
42 38
43extern unsigned int do_IRQ(struct pt_regs *regs); 39extern unsigned int do_IRQ(struct pt_regs *regs);
@@ -46,5 +42,6 @@ extern void native_init_IRQ(void);
46 42
47/* Interrupt vector management */ 43/* Interrupt vector management */
48extern DECLARE_BITMAP(used_vectors, NR_VECTORS); 44extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
45extern int vector_used_by_percpu_irq(unsigned int vector);
49 46
50#endif /* _ASM_X86_IRQ_H */ 47#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
index af2f02d27fc7..86afd7473457 100644
--- a/arch/x86/include/asm/irq_regs_32.h
+++ b/arch/x86/include/asm/irq_regs_32.h
@@ -9,6 +9,8 @@
9 9
10#include <asm/percpu.h> 10#include <asm/percpu.h>
11 11
12#define ARCH_HAS_OWN_IRQ_REGS
13
12DECLARE_PER_CPU(struct pt_regs *, irq_regs); 14DECLARE_PER_CPU(struct pt_regs *, irq_regs);
13 15
14static inline struct pt_regs *get_irq_regs(void) 16static inline struct pt_regs *get_irq_regs(void)
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f941..f7ff65032b9d 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -101,12 +101,23 @@
101#define LAST_VM86_IRQ 15 101#define LAST_VM86_IRQ 15
102#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) 102#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
103 103
104#define NR_IRQS_LEGACY 16
105
104#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER) 106#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
107
108#ifndef CONFIG_SPARSE_IRQ
105# if NR_CPUS < MAX_IO_APICS 109# if NR_CPUS < MAX_IO_APICS
106# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) 110# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
107# else 111# else
108# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) 112# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
109# endif 113# endif
114#else
115# if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
116# define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
117# else
118# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
119# endif
120#endif
110 121
111#elif defined(CONFIG_X86_VOYAGER) 122#elif defined(CONFIG_X86_VOYAGER)
112 123
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index a1f22771a15a..c61d8b2ab8b9 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -5,21 +5,8 @@
5# define PA_CONTROL_PAGE 0 5# define PA_CONTROL_PAGE 0
6# define VA_CONTROL_PAGE 1 6# define VA_CONTROL_PAGE 1
7# define PA_PGD 2 7# define PA_PGD 2
8# define VA_PGD 3 8# define PA_SWAP_PAGE 3
9# define PA_PTE_0 4 9# define PAGES_NR 4
10# define VA_PTE_0 5
11# define PA_PTE_1 6
12# define VA_PTE_1 7
13# define PA_SWAP_PAGE 8
14# ifdef CONFIG_X86_PAE
15# define PA_PMD_0 9
16# define VA_PMD_0 10
17# define PA_PMD_1 11
18# define VA_PMD_1 12
19# define PAGES_NR 13
20# else
21# define PAGES_NR 9
22# endif
23#else 10#else
24# define PA_CONTROL_PAGE 0 11# define PA_CONTROL_PAGE 0
25# define VA_CONTROL_PAGE 1 12# define VA_CONTROL_PAGE 1
@@ -170,6 +157,20 @@ relocate_kernel(unsigned long indirection_page,
170 unsigned long start_address) ATTRIB_NORET; 157 unsigned long start_address) ATTRIB_NORET;
171#endif 158#endif
172 159
160#ifdef CONFIG_X86_32
161#define ARCH_HAS_KIMAGE_ARCH
162
163struct kimage_arch {
164 pgd_t *pgd;
165#ifdef CONFIG_X86_PAE
166 pmd_t *pmd0;
167 pmd_t *pmd1;
168#endif
169 pte_t *pte0;
170 pte_t *pte1;
171};
172#endif
173
173#endif /* __ASSEMBLY__ */ 174#endif /* __ASSEMBLY__ */
174 175
175#endif /* _ASM_X86_KEXEC_H */ 176#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8346be87cfa1..730843d1d2fb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -21,6 +21,7 @@
21 21
22#include <asm/pvclock-abi.h> 22#include <asm/pvclock-abi.h>
23#include <asm/desc.h> 23#include <asm/desc.h>
24#include <asm/mtrr.h>
24 25
25#define KVM_MAX_VCPUS 16 26#define KVM_MAX_VCPUS 16
26#define KVM_MEMORY_SLOTS 32 27#define KVM_MEMORY_SLOTS 32
@@ -86,6 +87,7 @@
86#define KVM_MIN_FREE_MMU_PAGES 5 87#define KVM_MIN_FREE_MMU_PAGES 5
87#define KVM_REFILL_PAGES 25 88#define KVM_REFILL_PAGES 25
88#define KVM_MAX_CPUID_ENTRIES 40 89#define KVM_MAX_CPUID_ENTRIES 40
90#define KVM_NR_FIXED_MTRR_REGION 88
89#define KVM_NR_VAR_MTRR 8 91#define KVM_NR_VAR_MTRR 8
90 92
91extern spinlock_t kvm_lock; 93extern spinlock_t kvm_lock;
@@ -180,6 +182,8 @@ struct kvm_mmu_page {
180 struct list_head link; 182 struct list_head link;
181 struct hlist_node hash_link; 183 struct hlist_node hash_link;
182 184
185 struct list_head oos_link;
186
183 /* 187 /*
184 * The following two entries are used to key the shadow page in the 188 * The following two entries are used to key the shadow page in the
185 * hash table. 189 * hash table.
@@ -190,13 +194,16 @@ struct kvm_mmu_page {
190 u64 *spt; 194 u64 *spt;
191 /* hold the gfn of each spte inside spt */ 195 /* hold the gfn of each spte inside spt */
192 gfn_t *gfns; 196 gfn_t *gfns;
193 unsigned long slot_bitmap; /* One bit set per slot which has memory 197 /*
194 * in this shadow page. 198 * One bit set per slot which has memory
195 */ 199 * in this shadow page.
200 */
201 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
196 int multimapped; /* More than one parent_pte? */ 202 int multimapped; /* More than one parent_pte? */
197 int root_count; /* Currently serving as active root */ 203 int root_count; /* Currently serving as active root */
198 bool unsync; 204 bool unsync;
199 bool unsync_children; 205 bool global;
206 unsigned int unsync_children;
200 union { 207 union {
201 u64 *parent_pte; /* !multimapped */ 208 u64 *parent_pte; /* !multimapped */
202 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ 209 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
@@ -327,8 +334,10 @@ struct kvm_vcpu_arch {
327 334
328 bool nmi_pending; 335 bool nmi_pending;
329 bool nmi_injected; 336 bool nmi_injected;
337 bool nmi_window_open;
330 338
331 u64 mtrr[0x100]; 339 struct mtrr_state_type mtrr_state;
340 u32 pat;
332}; 341};
333 342
334struct kvm_mem_alias { 343struct kvm_mem_alias {
@@ -350,11 +359,13 @@ struct kvm_arch{
350 */ 359 */
351 struct list_head active_mmu_pages; 360 struct list_head active_mmu_pages;
352 struct list_head assigned_dev_head; 361 struct list_head assigned_dev_head;
353 struct dmar_domain *intel_iommu_domain; 362 struct list_head oos_global_pages;
363 struct iommu_domain *iommu_domain;
354 struct kvm_pic *vpic; 364 struct kvm_pic *vpic;
355 struct kvm_ioapic *vioapic; 365 struct kvm_ioapic *vioapic;
356 struct kvm_pit *vpit; 366 struct kvm_pit *vpit;
357 struct hlist_head irq_ack_notifier_list; 367 struct hlist_head irq_ack_notifier_list;
368 int vapics_in_nmi_mode;
358 369
359 int round_robin_prev_vcpu; 370 int round_robin_prev_vcpu;
360 unsigned int tss_addr; 371 unsigned int tss_addr;
@@ -378,6 +389,7 @@ struct kvm_vm_stat {
378 u32 mmu_recycled; 389 u32 mmu_recycled;
379 u32 mmu_cache_miss; 390 u32 mmu_cache_miss;
380 u32 mmu_unsync; 391 u32 mmu_unsync;
392 u32 mmu_unsync_global;
381 u32 remote_tlb_flush; 393 u32 remote_tlb_flush;
382 u32 lpages; 394 u32 lpages;
383}; 395};
@@ -397,6 +409,7 @@ struct kvm_vcpu_stat {
397 u32 halt_exits; 409 u32 halt_exits;
398 u32 halt_wakeup; 410 u32 halt_wakeup;
399 u32 request_irq_exits; 411 u32 request_irq_exits;
412 u32 request_nmi_exits;
400 u32 irq_exits; 413 u32 irq_exits;
401 u32 host_state_reload; 414 u32 host_state_reload;
402 u32 efer_reload; 415 u32 efer_reload;
@@ -405,6 +418,7 @@ struct kvm_vcpu_stat {
405 u32 insn_emulation_fail; 418 u32 insn_emulation_fail;
406 u32 hypercalls; 419 u32 hypercalls;
407 u32 irq_injections; 420 u32 irq_injections;
421 u32 nmi_injections;
408}; 422};
409 423
410struct descriptor_table { 424struct descriptor_table {
@@ -477,6 +491,7 @@ struct kvm_x86_ops {
477 491
478 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 492 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
479 int (*get_tdp_level)(void); 493 int (*get_tdp_level)(void);
494 int (*get_mt_mask_shift)(void);
480}; 495};
481 496
482extern struct kvm_x86_ops *kvm_x86_ops; 497extern struct kvm_x86_ops *kvm_x86_ops;
@@ -490,7 +505,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
490void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); 505void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
491void kvm_mmu_set_base_ptes(u64 base_pte); 506void kvm_mmu_set_base_ptes(u64 base_pte);
492void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 507void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
493 u64 dirty_mask, u64 nx_mask, u64 x_mask); 508 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
494 509
495int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 510int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
496void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 511void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -587,12 +602,14 @@ unsigned long segment_base(u16 selector);
587 602
588void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 603void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
589void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 604void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
590 const u8 *new, int bytes); 605 const u8 *new, int bytes,
606 bool guest_initiated);
591int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); 607int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
592void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); 608void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
593int kvm_mmu_load(struct kvm_vcpu *vcpu); 609int kvm_mmu_load(struct kvm_vcpu *vcpu);
594void kvm_mmu_unload(struct kvm_vcpu *vcpu); 610void kvm_mmu_unload(struct kvm_vcpu *vcpu);
595void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 611void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
612void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
596 613
597int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 614int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
598 615
@@ -607,6 +624,8 @@ void kvm_disable_tdp(void);
607int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); 624int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
608int complete_pio(struct kvm_vcpu *vcpu); 625int complete_pio(struct kvm_vcpu *vcpu);
609 626
627struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
628
610static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 629static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
611{ 630{
612 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); 631 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -702,18 +721,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
702 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 721 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
703} 722}
704 723
705#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
706#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
707#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
708#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
709#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
710#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
711#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
712#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
713#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
714#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
715#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
716
717#define MSR_IA32_TIME_STAMP_COUNTER 0x010 724#define MSR_IA32_TIME_STAMP_COUNTER 0x010
718 725
719#define TSS_IOPB_BASE_OFFSET 0x66 726#define TSS_IOPB_BASE_OFFSET 0x66
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 25179a29f208..6a159732881a 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -123,6 +123,7 @@ struct decode_cache {
123 u8 ad_bytes; 123 u8 ad_bytes;
124 u8 rex_prefix; 124 u8 rex_prefix;
125 struct operand src; 125 struct operand src;
126 struct operand src2;
126 struct operand dst; 127 struct operand dst;
127 bool has_seg_override; 128 bool has_seg_override;
128 u8 seg_override; 129 u8 seg_override;
@@ -146,22 +147,18 @@ struct x86_emulate_ctxt {
146 /* Register state before/after emulation. */ 147 /* Register state before/after emulation. */
147 struct kvm_vcpu *vcpu; 148 struct kvm_vcpu *vcpu;
148 149
149 /* Linear faulting address (if emulating a page-faulting instruction) */
150 unsigned long eflags; 150 unsigned long eflags;
151
152 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 151 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
153 int mode; 152 int mode;
154
155 u32 cs_base; 153 u32 cs_base;
156 154
157 /* decode cache */ 155 /* decode cache */
158
159 struct decode_cache decode; 156 struct decode_cache decode;
160}; 157};
161 158
162/* Repeat String Operation Prefix */ 159/* Repeat String Operation Prefix */
163#define REPE_PREFIX 1 160#define REPE_PREFIX 1
164#define REPNE_PREFIX 2 161#define REPNE_PREFIX 2
165 162
166/* Execution mode, passed to the emulator. */ 163/* Execution mode, passed to the emulator. */
167#define X86EMUL_MODE_REAL 0 /* Real mode. */ 164#define X86EMUL_MODE_REAL 0 /* Real mode. */
@@ -170,7 +167,7 @@ struct x86_emulate_ctxt {
170#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ 167#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
171 168
172/* Host execution mode. */ 169/* Host execution mode. */
173#if defined(__i386__) 170#if defined(CONFIG_X86_32)
174#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 171#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
175#elif defined(CONFIG_X86_64) 172#elif defined(CONFIG_X86_64)
176#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 173#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index d28a507cef39..1caf57628b9c 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -15,7 +15,7 @@
15#define SHARED_SWITCHER_PAGES \ 15#define SHARED_SWITCHER_PAGES \
16 DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) 16 DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
17/* Pages for switcher itself, then two pages per cpu */ 17/* Pages for switcher itself, then two pages per cpu */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS) 18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
19 19
20/* We map at -4M for ease of mapping into the guest (one PTE page). */ 20/* We map at -4M for ease of mapping into the guest (one PTE page). */
21#define SWITCHER_ADDR 0xFFC00000 21#define SWITCHER_ADDR 0xFFC00000
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index f61ee8f937e4..5d98d0b68ffc 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -57,5 +57,65 @@
57#define __ALIGN_STR ".align 16,0x90" 57#define __ALIGN_STR ".align 16,0x90"
58#endif 58#endif
59 59
60/*
61 * to check ENTRY_X86/END_X86 and
62 * KPROBE_ENTRY_X86/KPROBE_END_X86
63 * unbalanced-missed-mixed appearance
64 */
65#define __set_entry_x86 .set ENTRY_X86_IN, 0
66#define __unset_entry_x86 .set ENTRY_X86_IN, 1
67#define __set_kprobe_x86 .set KPROBE_X86_IN, 0
68#define __unset_kprobe_x86 .set KPROBE_X86_IN, 1
69
70#define __macro_err_x86 .error "ENTRY_X86/KPROBE_X86 unbalanced,missed,mixed"
71
72#define __check_entry_x86 \
73 .ifdef ENTRY_X86_IN; \
74 .ifeq ENTRY_X86_IN; \
75 __macro_err_x86; \
76 .abort; \
77 .endif; \
78 .endif
79
80#define __check_kprobe_x86 \
81 .ifdef KPROBE_X86_IN; \
82 .ifeq KPROBE_X86_IN; \
83 __macro_err_x86; \
84 .abort; \
85 .endif; \
86 .endif
87
88#define __check_entry_kprobe_x86 \
89 __check_entry_x86; \
90 __check_kprobe_x86
91
92#define ENTRY_KPROBE_FINAL_X86 __check_entry_kprobe_x86
93
94#define ENTRY_X86(name) \
95 __check_entry_kprobe_x86; \
96 __set_entry_x86; \
97 .globl name; \
98 __ALIGN; \
99 name:
100
101#define END_X86(name) \
102 __unset_entry_x86; \
103 __check_entry_kprobe_x86; \
104 .size name, .-name
105
106#define KPROBE_ENTRY_X86(name) \
107 __check_entry_kprobe_x86; \
108 __set_kprobe_x86; \
109 .pushsection .kprobes.text, "ax"; \
110 .globl name; \
111 __ALIGN; \
112 name:
113
114#define KPROBE_END_X86(name) \
115 __unset_kprobe_x86; \
116 __check_entry_kprobe_x86; \
117 .size name, .-name; \
118 .popsection
119
60#endif /* _ASM_X86_LINKAGE_H */ 120#endif /* _ASM_X86_LINKAGE_H */
61 121
diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h
index ff3a6c236c00..cc09cbbee27e 100644
--- a/arch/x86/include/asm/mach-default/mach_apic.h
+++ b/arch/x86/include/asm/mach-default/mach_apic.h
@@ -8,12 +8,12 @@
8 8
9#define APIC_DFR_VALUE (APIC_DFR_FLAT) 9#define APIC_DFR_VALUE (APIC_DFR_FLAT)
10 10
11static inline cpumask_t target_cpus(void) 11static inline const struct cpumask *target_cpus(void)
12{ 12{
13#ifdef CONFIG_SMP 13#ifdef CONFIG_SMP
14 return cpu_online_map; 14 return cpu_online_mask;
15#else 15#else
16 return cpumask_of_cpu(0); 16 return cpumask_of(0);
17#endif 17#endif
18} 18}
19 19
@@ -28,15 +28,18 @@ static inline cpumask_t target_cpus(void)
28#define apic_id_registered (genapic->apic_id_registered) 28#define apic_id_registered (genapic->apic_id_registered)
29#define init_apic_ldr (genapic->init_apic_ldr) 29#define init_apic_ldr (genapic->init_apic_ldr)
30#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) 30#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
31#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and)
31#define phys_pkg_id (genapic->phys_pkg_id) 32#define phys_pkg_id (genapic->phys_pkg_id)
32#define vector_allocation_domain (genapic->vector_allocation_domain) 33#define vector_allocation_domain (genapic->vector_allocation_domain)
33#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID))) 34#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID)))
34#define send_IPI_self (genapic->send_IPI_self) 35#define send_IPI_self (genapic->send_IPI_self)
36#define wakeup_secondary_cpu (genapic->wakeup_cpu)
35extern void setup_apic_routing(void); 37extern void setup_apic_routing(void);
36#else 38#else
37#define INT_DELIVERY_MODE dest_LowestPrio 39#define INT_DELIVERY_MODE dest_LowestPrio
38#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ 40#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
39#define TARGET_CPUS (target_cpus()) 41#define TARGET_CPUS (target_cpus())
42#define wakeup_secondary_cpu wakeup_secondary_cpu_via_init
40/* 43/*
41 * Set up the logical destination ID. 44 * Set up the logical destination ID.
42 * 45 *
@@ -59,9 +62,19 @@ static inline int apic_id_registered(void)
59 return physid_isset(read_apic_id(), phys_cpu_present_map); 62 return physid_isset(read_apic_id(), phys_cpu_present_map);
60} 63}
61 64
62static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) 65static inline unsigned int cpu_mask_to_apicid(const struct cpumask *cpumask)
63{ 66{
64 return cpus_addr(cpumask)[0]; 67 return cpumask_bits(cpumask)[0];
68}
69
70static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
71 const struct cpumask *andmask)
72{
73 unsigned long mask1 = cpumask_bits(cpumask)[0];
74 unsigned long mask2 = cpumask_bits(andmask)[0];
75 unsigned long mask3 = cpumask_bits(cpu_online_mask)[0];
76
77 return (unsigned int)(mask1 & mask2 & mask3);
65} 78}
66 79
67static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) 80static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
@@ -86,7 +99,7 @@ static inline int apicid_to_node(int logical_apicid)
86#endif 99#endif
87} 100}
88 101
89static inline cpumask_t vector_allocation_domain(int cpu) 102static inline void vector_allocation_domain(int cpu, struct cpumask *retmask)
90{ 103{
91 /* Careful. Some cpus do not strictly honor the set of cpus 104 /* Careful. Some cpus do not strictly honor the set of cpus
92 * specified in the interrupt destination when using lowest 105 * specified in the interrupt destination when using lowest
@@ -96,8 +109,7 @@ static inline cpumask_t vector_allocation_domain(int cpu)
96 * deliver interrupts to the wrong hyperthread when only one 109 * deliver interrupts to the wrong hyperthread when only one
97 * hyperthread was specified in the interrupt desitination. 110 * hyperthread was specified in the interrupt desitination.
98 */ 111 */
99 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; 112 *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } };
100 return domain;
101} 113}
102#endif 114#endif
103 115
@@ -129,7 +141,7 @@ static inline int cpu_to_logical_apicid(int cpu)
129 141
130static inline int cpu_present_to_apicid(int mps_cpu) 142static inline int cpu_present_to_apicid(int mps_cpu)
131{ 143{
132 if (mps_cpu < NR_CPUS && cpu_present(mps_cpu)) 144 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
133 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); 145 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
134 else 146 else
135 return BAD_APICID; 147 return BAD_APICID;
diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h
index fabca01ebacf..191312d155da 100644
--- a/arch/x86/include/asm/mach-default/mach_ipi.h
+++ b/arch/x86/include/asm/mach-default/mach_ipi.h
@@ -4,7 +4,8 @@
4/* Avoid include hell */ 4/* Avoid include hell */
5#define NMI_VECTOR 0x02 5#define NMI_VECTOR 0x02
6 6
7void send_IPI_mask_bitmask(cpumask_t mask, int vector); 7void send_IPI_mask_bitmask(const struct cpumask *mask, int vector);
8void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
8void __send_IPI_shortcut(unsigned int shortcut, int vector); 9void __send_IPI_shortcut(unsigned int shortcut, int vector);
9 10
10extern int no_broadcast; 11extern int no_broadcast;
@@ -12,28 +13,27 @@ extern int no_broadcast;
12#ifdef CONFIG_X86_64 13#ifdef CONFIG_X86_64
13#include <asm/genapic.h> 14#include <asm/genapic.h>
14#define send_IPI_mask (genapic->send_IPI_mask) 15#define send_IPI_mask (genapic->send_IPI_mask)
16#define send_IPI_mask_allbutself (genapic->send_IPI_mask_allbutself)
15#else 17#else
16static inline void send_IPI_mask(cpumask_t mask, int vector) 18static inline void send_IPI_mask(const struct cpumask *mask, int vector)
17{ 19{
18 send_IPI_mask_bitmask(mask, vector); 20 send_IPI_mask_bitmask(mask, vector);
19} 21}
22void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
20#endif 23#endif
21 24
22static inline void __local_send_IPI_allbutself(int vector) 25static inline void __local_send_IPI_allbutself(int vector)
23{ 26{
24 if (no_broadcast || vector == NMI_VECTOR) { 27 if (no_broadcast || vector == NMI_VECTOR)
25 cpumask_t mask = cpu_online_map; 28 send_IPI_mask_allbutself(cpu_online_mask, vector);
26 29 else
27 cpu_clear(smp_processor_id(), mask);
28 send_IPI_mask(mask, vector);
29 } else
30 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector); 30 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
31} 31}
32 32
33static inline void __local_send_IPI_all(int vector) 33static inline void __local_send_IPI_all(int vector)
34{ 34{
35 if (no_broadcast || vector == NMI_VECTOR) 35 if (no_broadcast || vector == NMI_VECTOR)
36 send_IPI_mask(cpu_online_map, vector); 36 send_IPI_mask(cpu_online_mask, vector);
37 else 37 else
38 __send_IPI_shortcut(APIC_DEST_ALLINC, vector); 38 __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
39} 39}
diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h
index 9d80db91e992..ceb013660146 100644
--- a/arch/x86/include/asm/mach-default/mach_wakecpu.h
+++ b/arch/x86/include/asm/mach-default/mach_wakecpu.h
@@ -1,17 +1,8 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H 1#ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
2#define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H 2#define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
3 3
4/* 4#define TRAMPOLINE_PHYS_LOW (0x467)
5 * This file copes with machines that wakeup secondary CPUs by the 5#define TRAMPOLINE_PHYS_HIGH (0x469)
6 * INIT, INIT, STARTUP sequence.
7 */
8
9#define WAKE_SECONDARY_VIA_INIT
10
11#define TRAMPOLINE_LOW phys_to_virt(0x467)
12#define TRAMPOLINE_HIGH phys_to_virt(0x469)
13
14#define boot_cpu_apicid boot_cpu_physical_apicid
15 6
16static inline void wait_for_init_deassert(atomic_t *deassert) 7static inline void wait_for_init_deassert(atomic_t *deassert)
17{ 8{
@@ -33,9 +24,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
33{ 24{
34} 25}
35 26
36#define inquire_remote_apic(apicid) do { \ 27extern void __inquire_remote_apic(int apicid);
37 if (apic_verbosity >= APIC_DEBUG) \ 28
38 __inquire_remote_apic(apicid); \ 29static inline void inquire_remote_apic(int apicid)
39 } while (0) 30{
31 if (apic_verbosity >= APIC_DEBUG)
32 __inquire_remote_apic(apicid);
33}
40 34
41#endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */ 35#endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/mach-default/smpboot_hooks.h
index dbab36d64d48..23bf52103b89 100644
--- a/arch/x86/include/asm/mach-default/smpboot_hooks.h
+++ b/arch/x86/include/asm/mach-default/smpboot_hooks.h
@@ -13,9 +13,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
13 CMOS_WRITE(0xa, 0xf); 13 CMOS_WRITE(0xa, 0xf);
14 local_flush_tlb(); 14 local_flush_tlb();
15 pr_debug("1.\n"); 15 pr_debug("1.\n");
16 *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4; 16 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
17 start_eip >> 4;
17 pr_debug("2.\n"); 18 pr_debug("2.\n");
18 *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf; 19 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
20 start_eip & 0xf;
19 pr_debug("3.\n"); 21 pr_debug("3.\n");
20} 22}
21 23
@@ -32,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
32 */ 34 */
33 CMOS_WRITE(0, 0xf); 35 CMOS_WRITE(0, 0xf);
34 36
35 *((volatile long *) phys_to_virt(0x467)) = 0; 37 *((volatile long *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
36} 38}
37 39
38static inline void __init smpboot_setup_io_apic(void) 40static inline void __init smpboot_setup_io_apic(void)
diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h
index 5180bd7478fb..48553e958ad5 100644
--- a/arch/x86/include/asm/mach-generic/mach_apic.h
+++ b/arch/x86/include/asm/mach-generic/mach_apic.h
@@ -24,9 +24,11 @@
24#define check_phys_apicid_present (genapic->check_phys_apicid_present) 24#define check_phys_apicid_present (genapic->check_phys_apicid_present)
25#define check_apicid_used (genapic->check_apicid_used) 25#define check_apicid_used (genapic->check_apicid_used)
26#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) 26#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
27#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and)
27#define vector_allocation_domain (genapic->vector_allocation_domain) 28#define vector_allocation_domain (genapic->vector_allocation_domain)
28#define enable_apic_mode (genapic->enable_apic_mode) 29#define enable_apic_mode (genapic->enable_apic_mode)
29#define phys_pkg_id (genapic->phys_pkg_id) 30#define phys_pkg_id (genapic->phys_pkg_id)
31#define wakeup_secondary_cpu (genapic->wakeup_cpu)
30 32
31extern void generic_bigsmp_probe(void); 33extern void generic_bigsmp_probe(void);
32 34
diff --git a/arch/x86/include/asm/mach-generic/mach_wakecpu.h b/arch/x86/include/asm/mach-generic/mach_wakecpu.h
new file mode 100644
index 000000000000..1ab16b168c8a
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_wakecpu.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H
2#define _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H
3
4#define TRAMPOLINE_PHYS_LOW (genapic->trampoline_phys_low)
5#define TRAMPOLINE_PHYS_HIGH (genapic->trampoline_phys_high)
6#define wait_for_init_deassert (genapic->wait_for_init_deassert)
7#define smp_callin_clear_local_apic (genapic->smp_callin_clear_local_apic)
8#define store_NMI_vector (genapic->store_NMI_vector)
9#define restore_NMI_vector (genapic->restore_NMI_vector)
10#define inquire_remote_apic (genapic->inquire_remote_apic)
11
12#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
index 8e10015781fb..7e98ce1d2c0e 100644
--- a/arch/x86/include/asm/mmu_context_32.h
+++ b/arch/x86/include/asm/mmu_context_32.h
@@ -4,9 +4,8 @@
4static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 4static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
5{ 5{
6#ifdef CONFIG_SMP 6#ifdef CONFIG_SMP
7 unsigned cpu = smp_processor_id(); 7 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
8 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) 8 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
9 per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
10#endif 9#endif
11} 10}
12 11
@@ -20,8 +19,8 @@ static inline void switch_mm(struct mm_struct *prev,
20 /* stop flush ipis for the previous mm */ 19 /* stop flush ipis for the previous mm */
21 cpu_clear(cpu, prev->cpu_vm_mask); 20 cpu_clear(cpu, prev->cpu_vm_mask);
22#ifdef CONFIG_SMP 21#ifdef CONFIG_SMP
23 per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; 22 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
24 per_cpu(cpu_tlbstate, cpu).active_mm = next; 23 x86_write_percpu(cpu_tlbstate.active_mm, next);
25#endif 24#endif
26 cpu_set(cpu, next->cpu_vm_mask); 25 cpu_set(cpu, next->cpu_vm_mask);
27 26
@@ -36,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev,
36 } 35 }
37#ifdef CONFIG_SMP 36#ifdef CONFIG_SMP
38 else { 37 else {
39 per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; 38 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
40 BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); 39 BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
41 40
42 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { 41 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
43 /* We were in lazy tlb mode and leave_mm disabled 42 /* We were in lazy tlb mode and leave_mm disabled
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 91885c28f66b..62d14ce3cd00 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -6,13 +6,13 @@
6#include <asm/mpspec_def.h> 6#include <asm/mpspec_def.h>
7 7
8extern int apic_version[MAX_APICS]; 8extern int apic_version[MAX_APICS];
9extern int pic_mode;
9 10
10#ifdef CONFIG_X86_32 11#ifdef CONFIG_X86_32
11#include <mach_mpspec.h> 12#include <mach_mpspec.h>
12 13
13extern unsigned int def_to_bigsmp; 14extern unsigned int def_to_bigsmp;
14extern u8 apicid_2_node[]; 15extern u8 apicid_2_node[];
15extern int pic_mode;
16 16
17#ifdef CONFIG_X86_NUMAQ 17#ifdef CONFIG_X86_NUMAQ
18extern int mp_bus_id_to_node[MAX_MP_BUSSES]; 18extern int mp_bus_id_to_node[MAX_MP_BUSSES];
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e38859d577a1..cb58643947b9 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -85,7 +85,9 @@
85/* AMD64 MSRs. Not complete. See the architecture manual for a more 85/* AMD64 MSRs. Not complete. See the architecture manual for a more
86 complete list. */ 86 complete list. */
87 87
88#define MSR_AMD64_PATCH_LEVEL 0x0000008b
88#define MSR_AMD64_NB_CFG 0xc001001f 89#define MSR_AMD64_NB_CFG 0xc001001f
90#define MSR_AMD64_PATCH_LOADER 0xc0010020
89#define MSR_AMD64_IBSFETCHCTL 0xc0011030 91#define MSR_AMD64_IBSFETCHCTL 0xc0011030
90#define MSR_AMD64_IBSFETCHLINAD 0xc0011031 92#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
91#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 93#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index c2a812ebde89..638bf6241807 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -22,10 +22,10 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
22} 22}
23 23
24/* 24/*
25 * i386 calling convention returns 64-bit value in edx:eax, while 25 * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A"
26 * x86_64 returns at rax. Also, the "A" constraint does not really 26 * constraint has different meanings. For i386, "A" means exactly
27 * mean rdx:rax in x86_64, so we need specialized behaviour for each 27 * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead,
28 * architecture 28 * it means rax *or* rdx.
29 */ 29 */
30#ifdef CONFIG_X86_64 30#ifdef CONFIG_X86_64
31#define DECLARE_ARGS(val, low, high) unsigned low, high 31#define DECLARE_ARGS(val, low, high) unsigned low, high
@@ -85,7 +85,8 @@ static inline void native_write_msr(unsigned int msr,
85 asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); 85 asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
86} 86}
87 87
88static inline int native_write_msr_safe(unsigned int msr, 88/* Can be uninlined because referenced by paravirt */
89notrace static inline int native_write_msr_safe(unsigned int msr,
89 unsigned low, unsigned high) 90 unsigned low, unsigned high)
90{ 91{
91 int err; 92 int err;
@@ -181,10 +182,10 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
181} 182}
182 183
183#define rdtscl(low) \ 184#define rdtscl(low) \
184 ((low) = (u32)native_read_tsc()) 185 ((low) = (u32)__native_read_tsc())
185 186
186#define rdtscll(val) \ 187#define rdtscll(val) \
187 ((val) = native_read_tsc()) 188 ((val) = __native_read_tsc())
188 189
189#define rdpmc(counter, low, high) \ 190#define rdpmc(counter, low, high) \
190do { \ 191do { \
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 7c1e4258b31e..cb988aab716d 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -57,6 +57,31 @@ struct mtrr_gentry {
57}; 57};
58#endif /* !__i386__ */ 58#endif /* !__i386__ */
59 59
60struct mtrr_var_range {
61 u32 base_lo;
62 u32 base_hi;
63 u32 mask_lo;
64 u32 mask_hi;
65};
66
67/* In the Intel processor's MTRR interface, the MTRR type is always held in
68 an 8 bit field: */
69typedef u8 mtrr_type;
70
71#define MTRR_NUM_FIXED_RANGES 88
72#define MTRR_MAX_VAR_RANGES 256
73
74struct mtrr_state_type {
75 struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
76 mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
77 unsigned char enabled;
78 unsigned char have_fixed;
79 mtrr_type def_type;
80};
81
82#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
83#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
84
60/* These are the various ioctls */ 85/* These are the various ioctls */
61#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry) 86#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry)
62#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry) 87#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry)
diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h
index 0bf2a06b7a4e..bf37bc49bd8e 100644
--- a/arch/x86/include/asm/numaq/apic.h
+++ b/arch/x86/include/asm/numaq/apic.h
@@ -7,9 +7,9 @@
7 7
8#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) 8#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
9 9
10static inline cpumask_t target_cpus(void) 10static inline const cpumask_t *target_cpus(void)
11{ 11{
12 return CPU_MASK_ALL; 12 return &CPU_MASK_ALL;
13} 13}
14 14
15#define NO_BALANCE_IRQ (1) 15#define NO_BALANCE_IRQ (1)
@@ -63,8 +63,8 @@ static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
63extern u8 cpu_2_logical_apicid[]; 63extern u8 cpu_2_logical_apicid[];
64static inline int cpu_to_logical_apicid(int cpu) 64static inline int cpu_to_logical_apicid(int cpu)
65{ 65{
66 if (cpu >= NR_CPUS) 66 if (cpu >= nr_cpu_ids)
67 return BAD_APICID; 67 return BAD_APICID;
68 return (int)cpu_2_logical_apicid[cpu]; 68 return (int)cpu_2_logical_apicid[cpu];
69} 69}
70 70
@@ -122,7 +122,13 @@ static inline void enable_apic_mode(void)
122 * We use physical apicids here, not logical, so just return the default 122 * We use physical apicids here, not logical, so just return the default
123 * physical broadcast to stop people from breaking us 123 * physical broadcast to stop people from breaking us
124 */ 124 */
125static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) 125static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
126{
127 return (int) 0xF;
128}
129
130static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
131 const struct cpumask *andmask)
126{ 132{
127 return (int) 0xF; 133 return (int) 0xF;
128} 134}
diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h
index 935588d286cf..a8374c652778 100644
--- a/arch/x86/include/asm/numaq/ipi.h
+++ b/arch/x86/include/asm/numaq/ipi.h
@@ -1,25 +1,22 @@
1#ifndef __ASM_NUMAQ_IPI_H 1#ifndef __ASM_NUMAQ_IPI_H
2#define __ASM_NUMAQ_IPI_H 2#define __ASM_NUMAQ_IPI_H
3 3
4void send_IPI_mask_sequence(cpumask_t, int vector); 4void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
5void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
5 6
6static inline void send_IPI_mask(cpumask_t mask, int vector) 7static inline void send_IPI_mask(const struct cpumask *mask, int vector)
7{ 8{
8 send_IPI_mask_sequence(mask, vector); 9 send_IPI_mask_sequence(mask, vector);
9} 10}
10 11
11static inline void send_IPI_allbutself(int vector) 12static inline void send_IPI_allbutself(int vector)
12{ 13{
13 cpumask_t mask = cpu_online_map; 14 send_IPI_mask_allbutself(cpu_online_mask, vector);
14 cpu_clear(smp_processor_id(), mask);
15
16 if (!cpus_empty(mask))
17 send_IPI_mask(mask, vector);
18} 15}
19 16
20static inline void send_IPI_all(int vector) 17static inline void send_IPI_all(int vector)
21{ 18{
22 send_IPI_mask(cpu_online_map, vector); 19 send_IPI_mask(cpu_online_mask, vector);
23} 20}
24 21
25#endif /* __ASM_NUMAQ_IPI_H */ 22#endif /* __ASM_NUMAQ_IPI_H */
diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h
index c577bda5b1c5..6f499df8eddb 100644
--- a/arch/x86/include/asm/numaq/wakecpu.h
+++ b/arch/x86/include/asm/numaq/wakecpu.h
@@ -3,12 +3,8 @@
3 3
4/* This file copes with machines that wakeup secondary CPUs by NMIs */ 4/* This file copes with machines that wakeup secondary CPUs by NMIs */
5 5
6#define WAKE_SECONDARY_VIA_NMI 6#define TRAMPOLINE_PHYS_LOW (0x8)
7 7#define TRAMPOLINE_PHYS_HIGH (0xa)
8#define TRAMPOLINE_LOW phys_to_virt(0x8)
9#define TRAMPOLINE_HIGH phys_to_virt(0xa)
10
11#define boot_cpu_apicid boot_cpu_logical_apicid
12 8
13/* We don't do anything here because we use NMI's to boot instead */ 9/* We don't do anything here because we use NMI's to boot instead */
14static inline void wait_for_init_deassert(atomic_t *deassert) 10static inline void wait_for_init_deassert(atomic_t *deassert)
@@ -27,17 +23,23 @@ static inline void smp_callin_clear_local_apic(void)
27static inline void store_NMI_vector(unsigned short *high, unsigned short *low) 23static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
28{ 24{
29 printk("Storing NMI vector\n"); 25 printk("Storing NMI vector\n");
30 *high = *((volatile unsigned short *) TRAMPOLINE_HIGH); 26 *high =
31 *low = *((volatile unsigned short *) TRAMPOLINE_LOW); 27 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH));
28 *low =
29 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW));
32} 30}
33 31
34static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) 32static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
35{ 33{
36 printk("Restoring NMI vector\n"); 34 printk("Restoring NMI vector\n");
37 *((volatile unsigned short *) TRAMPOLINE_HIGH) = *high; 35 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
38 *((volatile unsigned short *) TRAMPOLINE_LOW) = *low; 36 *high;
37 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
38 *low;
39} 39}
40 40
41#define inquire_remote_apic(apicid) {} 41static inline void inquire_remote_apic(int apicid)
42{
43}
42 44
43#endif /* __ASM_NUMAQ_WAKECPU_H */ 45#endif /* __ASM_NUMAQ_WAKECPU_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 875b38edf193..a977de23cb4d 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -19,6 +19,8 @@ struct pci_sysdata {
19}; 19};
20 20
21extern int pci_routeirq; 21extern int pci_routeirq;
22extern int noioapicquirk;
23extern int noioapicreroute;
22 24
23/* scan a bus after allocating a pci_sysdata for it */ 25/* scan a bus after allocating a pci_sysdata for it */
24extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, 26extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
@@ -82,6 +84,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
82static inline void early_quirks(void) { } 84static inline void early_quirks(void) { }
83#endif 85#endif
84 86
87extern void pci_iommu_alloc(void);
88
85#endif /* __KERNEL__ */ 89#endif /* __KERNEL__ */
86 90
87#ifdef CONFIG_X86_32 91#ifdef CONFIG_X86_32
@@ -98,9 +102,9 @@ static inline void early_quirks(void) { }
98 102
99#ifdef CONFIG_NUMA 103#ifdef CONFIG_NUMA
100/* Returns the node based on pci bus */ 104/* Returns the node based on pci bus */
101static inline int __pcibus_to_node(struct pci_bus *bus) 105static inline int __pcibus_to_node(const struct pci_bus *bus)
102{ 106{
103 struct pci_sysdata *sd = bus->sysdata; 107 const struct pci_sysdata *sd = bus->sysdata;
104 108
105 return sd->node; 109 return sd->node;
106} 110}
@@ -109,6 +113,12 @@ static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
109{ 113{
110 return node_to_cpumask(__pcibus_to_node(bus)); 114 return node_to_cpumask(__pcibus_to_node(bus));
111} 115}
116
117static inline const struct cpumask *
118cpumask_of_pcibus(const struct pci_bus *bus)
119{
120 return cpumask_of_node(__pcibus_to_node(bus));
121}
112#endif 122#endif
113 123
114#endif /* _ASM_X86_PCI_H */ 124#endif /* _ASM_X86_PCI_H */
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index d02d936840a3..4da207982777 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -23,7 +23,6 @@ extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
23 int reg, int len, u32 value); 23 int reg, int len, u32 value);
24 24
25extern void dma32_reserve_bootmem(void); 25extern void dma32_reserve_bootmem(void);
26extern void pci_iommu_alloc(void);
27 26
28/* The PCI address space does equal the physical memory 27/* The PCI address space does equal the physical memory
29 * address space. The networking and block device layers use 28 * address space. The networking and block device layers use
diff --git a/arch/x86/pci/pci.h b/arch/x86/include/asm/pci_x86.h
index 15b9cf6be729..e60fd3e14bdf 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -57,7 +57,8 @@ extern struct pci_ops pci_root_ops;
57struct irq_info { 57struct irq_info {
58 u8 bus, devfn; /* Bus, device and function */ 58 u8 bus, devfn; /* Bus, device and function */
59 struct { 59 struct {
60 u8 link; /* IRQ line ID, chipset dependent, 0=not routed */ 60 u8 link; /* IRQ line ID, chipset dependent,
61 0 = not routed */
61 u16 bitmap; /* Available IRQs */ 62 u16 bitmap; /* Available IRQs */
62 } __attribute__((packed)) irq[4]; 63 } __attribute__((packed)) irq[4];
63 u8 slot; /* Slot number, 0=onboard */ 64 u8 slot; /* Slot number, 0=onboard */
@@ -69,11 +70,13 @@ struct irq_routing_table {
69 u16 version; /* PIRQ_VERSION */ 70 u16 version; /* PIRQ_VERSION */
70 u16 size; /* Table size in bytes */ 71 u16 size; /* Table size in bytes */
71 u8 rtr_bus, rtr_devfn; /* Where the interrupt router lies */ 72 u8 rtr_bus, rtr_devfn; /* Where the interrupt router lies */
72 u16 exclusive_irqs; /* IRQs devoted exclusively to PCI usage */ 73 u16 exclusive_irqs; /* IRQs devoted exclusively to
73 u16 rtr_vendor, rtr_device; /* Vendor and device ID of interrupt router */ 74 PCI usage */
75 u16 rtr_vendor, rtr_device; /* Vendor and device ID of
76 interrupt router */
74 u32 miniport_data; /* Crap */ 77 u32 miniport_data; /* Crap */
75 u8 rfu[11]; 78 u8 rfu[11];
76 u8 checksum; /* Modulo 256 checksum must give zero */ 79 u8 checksum; /* Modulo 256 checksum must give 0 */
77 struct irq_info slots[0]; 80 struct irq_info slots[0];
78} __attribute__((packed)); 81} __attribute__((packed));
79 82
@@ -96,6 +99,7 @@ extern struct pci_raw_ops *raw_pci_ops;
96extern struct pci_raw_ops *raw_pci_ext_ops; 99extern struct pci_raw_ops *raw_pci_ext_ops;
97 100
98extern struct pci_raw_ops pci_direct_conf1; 101extern struct pci_raw_ops pci_direct_conf1;
102extern bool port_cf9_safe;
99 103
100/* arch_initcall level */ 104/* arch_initcall level */
101extern int pci_direct_probe(void); 105extern int pci_direct_probe(void);
@@ -147,15 +151,15 @@ static inline unsigned int mmio_config_readl(void __iomem *pos)
147 151
148static inline void mmio_config_writeb(void __iomem *pos, u8 val) 152static inline void mmio_config_writeb(void __iomem *pos, u8 val)
149{ 153{
150 asm volatile("movb %%al,(%1)" :: "a" (val), "r" (pos) : "memory"); 154 asm volatile("movb %%al,(%1)" : : "a" (val), "r" (pos) : "memory");
151} 155}
152 156
153static inline void mmio_config_writew(void __iomem *pos, u16 val) 157static inline void mmio_config_writew(void __iomem *pos, u16 val)
154{ 158{
155 asm volatile("movw %%ax,(%1)" :: "a" (val), "r" (pos) : "memory"); 159 asm volatile("movw %%ax,(%1)" : : "a" (val), "r" (pos) : "memory");
156} 160}
157 161
158static inline void mmio_config_writel(void __iomem *pos, u32 val) 162static inline void mmio_config_writel(void __iomem *pos, u32 val)
159{ 163{
160 asm volatile("movl %%eax,(%1)" :: "a" (val), "r" (pos) : "memory"); 164 asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory");
161} 165}
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index b17edfd23628..e0d199fe1d83 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -56,23 +56,55 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
56#define pte_none(x) (!(x).pte_low) 56#define pte_none(x) (!(x).pte_low)
57 57
58/* 58/*
59 * Bits 0, 6 and 7 are taken, split up the 29 bits of offset 59 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
60 * into this range: 60 * split up the 29 bits of offset into this range:
61 */ 61 */
62#define PTE_FILE_MAX_BITS 29 62#define PTE_FILE_MAX_BITS 29
63#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
64#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
65#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1)
66#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1)
67#else
68#define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1)
69#define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1)
70#endif
71#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
72#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
63 73
64#define pte_to_pgoff(pte) \ 74#define pte_to_pgoff(pte) \
65 ((((pte).pte_low >> 1) & 0x1f) + (((pte).pte_low >> 8) << 5)) 75 ((((pte).pte_low >> PTE_FILE_SHIFT1) \
76 & ((1U << PTE_FILE_BITS1) - 1)) \
77 + ((((pte).pte_low >> PTE_FILE_SHIFT2) \
78 & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \
79 + (((pte).pte_low >> PTE_FILE_SHIFT3) \
80 << (PTE_FILE_BITS1 + PTE_FILE_BITS2)))
66 81
67#define pgoff_to_pte(off) \ 82#define pgoff_to_pte(off) \
68 ((pte_t) { .pte_low = (((off) & 0x1f) << 1) + \ 83 ((pte_t) { .pte_low = \
69 (((off) >> 5) << 8) + _PAGE_FILE }) 84 (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \
85 + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \
86 << PTE_FILE_SHIFT2) \
87 + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \
88 << PTE_FILE_SHIFT3) \
89 + _PAGE_FILE })
70 90
71/* Encode and de-code a swap entry */ 91/* Encode and de-code a swap entry */
72#define __swp_type(x) (((x).val >> 1) & 0x1f) 92#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
73#define __swp_offset(x) ((x).val >> 8) 93#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
74#define __swp_entry(type, offset) \ 94#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
75 ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) 95#else
96#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
97#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
98#endif
99
100#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
101
102#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
103 & ((1U << SWP_TYPE_BITS) - 1))
104#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
105#define __swp_entry(type, offset) ((swp_entry_t) { \
106 ((type) << (_PAGE_BIT_PRESENT + 1)) \
107 | ((offset) << SWP_OFFSET_SHIFT) })
76#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) 108#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
77#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) 109#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
78 110
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 52597aeadfff..447da43cddb3 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -166,6 +166,7 @@ static inline int pte_none(pte_t pte)
166#define PTE_FILE_MAX_BITS 32 166#define PTE_FILE_MAX_BITS 32
167 167
168/* Encode and de-code a swap entry */ 168/* Encode and de-code a swap entry */
169#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
169#define __swp_type(x) (((x).val) & 0x1f) 170#define __swp_type(x) (((x).val) & 0x1f)
170#define __swp_offset(x) ((x).val >> 5) 171#define __swp_offset(x) ((x).val >> 5)
171#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) 172#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index c012f3b11671..83e69f4a37f0 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -10,7 +10,6 @@
10#define _PAGE_BIT_PCD 4 /* page cache disabled */ 10#define _PAGE_BIT_PCD 4 /* page cache disabled */
11#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ 11#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
12#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ 12#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
13#define _PAGE_BIT_FILE 6
14#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ 13#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
15#define _PAGE_BIT_PAT 7 /* on 4KB pages */ 14#define _PAGE_BIT_PAT 7 /* on 4KB pages */
16#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ 15#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
@@ -22,6 +21,12 @@
22#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 21#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
23#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 22#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
24 23
24/* If _PAGE_BIT_PRESENT is clear, we use these: */
25/* - if the user mapped it with PROT_NONE; pte_present gives true */
26#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
27/* - set: nonlinear file mapping, saved PTE; unset:swap */
28#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
29
25#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) 30#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
26#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) 31#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
27#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) 32#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
@@ -46,11 +51,8 @@
46#define _PAGE_NX (_AT(pteval_t, 0)) 51#define _PAGE_NX (_AT(pteval_t, 0))
47#endif 52#endif
48 53
49/* If _PAGE_PRESENT is clear, we use these: */ 54#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
50#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, 55#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
51 * saved PTE; unset:swap */
52#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
53 pte_present gives true */
54 56
55#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 57#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
56 _PAGE_ACCESSED | _PAGE_DIRTY) 58 _PAGE_ACCESSED | _PAGE_DIRTY)
@@ -158,8 +160,19 @@
158#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ 160#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
159#endif 161#endif
160 162
163/*
164 * Macro to mark a page protection value as UC-
165 */
166#define pgprot_noncached(prot) \
167 ((boot_cpu_data.x86 > 3) \
168 ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \
169 : (prot))
170
161#ifndef __ASSEMBLY__ 171#ifndef __ASSEMBLY__
162 172
173#define pgprot_writecombine pgprot_writecombine
174extern pgprot_t pgprot_writecombine(pgprot_t prot);
175
163/* 176/*
164 * ZERO_PAGE is a global shared page that is always zero: used 177 * ZERO_PAGE is a global shared page that is always zero: used
165 * for zero-mapped memory areas etc.. 178 * for zero-mapped memory areas etc..
@@ -329,6 +342,9 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
329#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) 342#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
330 343
331#ifndef __ASSEMBLY__ 344#ifndef __ASSEMBLY__
345/* Indicate that x86 has its own track and untrack pfn vma functions */
346#define __HAVE_PFNMAP_TRACKING
347
332#define __HAVE_PHYS_MEM_ACCESS_PROT 348#define __HAVE_PHYS_MEM_ACCESS_PROT
333struct file; 349struct file;
334pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 350pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index f9d5889b336b..72b020deb46b 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -101,15 +101,6 @@ extern unsigned long pg0[];
101#endif 101#endif
102 102
103/* 103/*
104 * Macro to mark a page protection value as "uncacheable".
105 * On processors which do not support it, this is a no-op.
106 */
107#define pgprot_noncached(prot) \
108 ((boot_cpu_data.x86 > 3) \
109 ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
110 : (prot))
111
112/*
113 * Conversion functions: convert a page and protection to a page entry, 104 * Conversion functions: convert a page and protection to a page entry,
114 * and a page entry and page directory to the page they refer to. 105 * and a page entry and page directory to the page they refer to.
115 */ 106 */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 545a0e042bb2..ba09289accaa 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -146,7 +146,7 @@ static inline void native_pgd_clear(pgd_t *pgd)
146#define PGDIR_MASK (~(PGDIR_SIZE - 1)) 146#define PGDIR_MASK (~(PGDIR_SIZE - 1))
147 147
148 148
149#define MAXMEM _AC(0x00003fffffffffff, UL) 149#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
150#define VMALLOC_START _AC(0xffffc20000000000, UL) 150#define VMALLOC_START _AC(0xffffc20000000000, UL)
151#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) 151#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
152#define VMEMMAP_START _AC(0xffffe20000000000, UL) 152#define VMEMMAP_START _AC(0xffffe20000000000, UL)
@@ -177,12 +177,6 @@ static inline int pmd_bad(pmd_t pmd)
177#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */ 177#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
178 178
179/* 179/*
180 * Macro to mark a page protection value as "uncacheable".
181 */
182#define pgprot_noncached(prot) \
183 (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
184
185/*
186 * Conversion functions: convert a page and protection to a page entry, 180 * Conversion functions: convert a page and protection to a page entry,
187 * and a page entry and page directory to the page they refer to. 181 * and a page entry and page directory to the page they refer to.
188 */ 182 */
@@ -250,10 +244,22 @@ static inline int pud_large(pud_t pte)
250extern int direct_gbpages; 244extern int direct_gbpages;
251 245
252/* Encode and de-code a swap entry */ 246/* Encode and de-code a swap entry */
253#define __swp_type(x) (((x).val >> 1) & 0x3f) 247#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
254#define __swp_offset(x) ((x).val >> 8) 248#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
255#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | \ 249#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
256 ((offset) << 8) }) 250#else
251#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
252#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
253#endif
254
255#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
256
257#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
258 & ((1U << SWP_TYPE_BITS) - 1))
259#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
260#define __swp_entry(type, offset) ((swp_entry_t) { \
261 ((type) << (_PAGE_BIT_PRESENT + 1)) \
262 | ((offset) << SWP_OFFSET_SHIFT) })
257#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) 263#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
258#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) 264#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
259 265
diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h
index fe681147a4f7..a8894647dd9a 100644
--- a/arch/x86/include/asm/prctl.h
+++ b/arch/x86/include/asm/prctl.h
@@ -6,5 +6,8 @@
6#define ARCH_GET_FS 0x1003 6#define ARCH_GET_FS 0x1003
7#define ARCH_GET_GS 0x1004 7#define ARCH_GET_GS 0x1004
8 8
9#ifdef CONFIG_X86_64
10extern long sys_arch_prctl(int, unsigned long);
11#endif /* CONFIG_X86_64 */
9 12
10#endif /* _ASM_X86_PRCTL_H */ 13#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5ca01e383269..091cd8855f2e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -110,6 +110,7 @@ struct cpuinfo_x86 {
110 /* Index into per_cpu list: */ 110 /* Index into per_cpu list: */
111 u16 cpu_index; 111 u16 cpu_index;
112#endif 112#endif
113 unsigned int x86_hyper_vendor;
113} __attribute__((__aligned__(SMP_CACHE_BYTES))); 114} __attribute__((__aligned__(SMP_CACHE_BYTES)));
114 115
115#define X86_VENDOR_INTEL 0 116#define X86_VENDOR_INTEL 0
@@ -123,6 +124,9 @@ struct cpuinfo_x86 {
123 124
124#define X86_VENDOR_UNKNOWN 0xff 125#define X86_VENDOR_UNKNOWN 0xff
125 126
127#define X86_HYPER_VENDOR_NONE 0
128#define X86_HYPER_VENDOR_VMWARE 1
129
126/* 130/*
127 * capabilities of CPUs 131 * capabilities of CPUs
128 */ 132 */
@@ -752,6 +756,19 @@ extern void switch_to_new_gdt(void);
752extern void cpu_init(void); 756extern void cpu_init(void);
753extern void init_gdt(int cpu); 757extern void init_gdt(int cpu);
754 758
759static inline unsigned long get_debugctlmsr(void)
760{
761 unsigned long debugctlmsr = 0;
762
763#ifndef CONFIG_X86_DEBUGCTLMSR
764 if (boot_cpu_data.x86 < 6)
765 return 0;
766#endif
767 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
768
769 return debugctlmsr;
770}
771
755static inline void update_debugctlmsr(unsigned long debugctlmsr) 772static inline void update_debugctlmsr(unsigned long debugctlmsr)
756{ 773{
757#ifndef CONFIG_X86_DEBUGCTLMSR 774#ifndef CONFIG_X86_DEBUGCTLMSR
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index eefb0594b058..6d34d954c228 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -6,7 +6,6 @@
6#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7 7
8#ifdef __KERNEL__ 8#ifdef __KERNEL__
9#include <asm/ds.h> /* the DS BTS struct is used for ptrace too */
10#include <asm/segment.h> 9#include <asm/segment.h>
11#endif 10#endif
12 11
@@ -128,34 +127,6 @@ struct pt_regs {
128#endif /* !__i386__ */ 127#endif /* !__i386__ */
129 128
130 129
131#ifdef CONFIG_X86_PTRACE_BTS
132/* a branch trace record entry
133 *
134 * In order to unify the interface between various processor versions,
135 * we use the below data structure for all processors.
136 */
137enum bts_qualifier {
138 BTS_INVALID = 0,
139 BTS_BRANCH,
140 BTS_TASK_ARRIVES,
141 BTS_TASK_DEPARTS
142};
143
144struct bts_struct {
145 __u64 qualifier;
146 union {
147 /* BTS_BRANCH */
148 struct {
149 __u64 from_ip;
150 __u64 to_ip;
151 } lbr;
152 /* BTS_TASK_ARRIVES or
153 BTS_TASK_DEPARTS */
154 __u64 jiffies;
155 } variant;
156};
157#endif /* CONFIG_X86_PTRACE_BTS */
158
159#ifdef __KERNEL__ 130#ifdef __KERNEL__
160 131
161#include <linux/init.h> 132#include <linux/init.h>
@@ -163,13 +134,6 @@ struct bts_struct {
163struct cpuinfo_x86; 134struct cpuinfo_x86;
164struct task_struct; 135struct task_struct;
165 136
166#ifdef CONFIG_X86_PTRACE_BTS
167extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *);
168extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier);
169#else
170#define ptrace_bts_init_intel(config) do {} while (0)
171#endif /* CONFIG_X86_PTRACE_BTS */
172
173extern unsigned long profile_pc(struct pt_regs *regs); 137extern unsigned long profile_pc(struct pt_regs *regs);
174 138
175extern unsigned long 139extern unsigned long
@@ -271,6 +235,13 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
271extern int do_set_thread_area(struct task_struct *p, int idx, 235extern int do_set_thread_area(struct task_struct *p, int idx,
272 struct user_desc __user *info, int can_allocate); 236 struct user_desc __user *info, int can_allocate);
273 237
238extern void x86_ptrace_untrace(struct task_struct *);
239extern void x86_ptrace_fork(struct task_struct *child,
240 unsigned long clone_flags);
241
242#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk)
243#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags)
244
274#endif /* __KERNEL__ */ 245#endif /* __KERNEL__ */
275 246
276#endif /* !__ASSEMBLY__ */ 247#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index df7710354f85..562d4fd31ba8 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_REBOOT_H 1#ifndef _ASM_X86_REBOOT_H
2#define _ASM_X86_REBOOT_H 2#define _ASM_X86_REBOOT_H
3 3
4#include <linux/kdebug.h>
5
4struct pt_regs; 6struct pt_regs;
5 7
6struct machine_ops { 8struct machine_ops {
@@ -18,4 +20,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs);
18void native_machine_shutdown(void); 20void native_machine_shutdown(void);
19void machine_real_restart(const unsigned char *code, int length); 21void machine_real_restart(const unsigned char *code, int length);
20 22
23typedef void (*nmi_shootdown_cb)(int, struct die_args*);
24void nmi_shootdown_cpus(nmi_shootdown_cb callback);
25
21#endif /* _ASM_X86_REBOOT_H */ 26#endif /* _ASM_X86_REBOOT_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index f12d37237465..4fcd53fd5f43 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -8,6 +8,10 @@
8/* Interrupt control for vSMPowered x86_64 systems */ 8/* Interrupt control for vSMPowered x86_64 systems */
9void vsmp_init(void); 9void vsmp_init(void);
10 10
11
12void setup_bios_corruption_check(void);
13
14
11#ifdef CONFIG_X86_VISWS 15#ifdef CONFIG_X86_VISWS
12extern void visws_early_detect(void); 16extern void visws_early_detect(void);
13extern int is_visws_box(void); 17extern int is_visws_box(void);
@@ -16,6 +20,8 @@ static inline void visws_early_detect(void) { }
16static inline int is_visws_box(void) { return 0; } 20static inline int is_visws_box(void) { return 0; }
17#endif 21#endif
18 22
23extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
24extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip);
19/* 25/*
20 * Any setup quirks to be performed? 26 * Any setup quirks to be performed?
21 */ 27 */
@@ -39,6 +45,7 @@ struct x86_quirks {
39 void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable, 45 void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,
40 unsigned short oemsize); 46 unsigned short oemsize);
41 int (*setup_ioapic_ids)(void); 47 int (*setup_ioapic_ids)(void);
48 int (*update_genapic)(void);
42}; 49};
43 50
44extern struct x86_quirks *x86_quirks; 51extern struct x86_quirks *x86_quirks;
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
new file mode 100644
index 000000000000..4e0fe26d27d3
--- /dev/null
+++ b/arch/x86/include/asm/sigframe.h
@@ -0,0 +1,70 @@
1#ifndef _ASM_X86_SIGFRAME_H
2#define _ASM_X86_SIGFRAME_H
3
4#include <asm/sigcontext.h>
5#include <asm/siginfo.h>
6#include <asm/ucontext.h>
7
8#ifdef CONFIG_X86_32
9#define sigframe_ia32 sigframe
10#define rt_sigframe_ia32 rt_sigframe
11#define sigcontext_ia32 sigcontext
12#define _fpstate_ia32 _fpstate
13#define ucontext_ia32 ucontext
14#else /* !CONFIG_X86_32 */
15
16#ifdef CONFIG_IA32_EMULATION
17#include <asm/ia32.h>
18#endif /* CONFIG_IA32_EMULATION */
19
20#endif /* CONFIG_X86_32 */
21
22#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
23struct sigframe_ia32 {
24 u32 pretcode;
25 int sig;
26 struct sigcontext_ia32 sc;
27 /*
28 * fpstate is unused. fpstate is moved/allocated after
29 * retcode[] below. This movement allows to have the FP state and the
30 * future state extensions (xsave) stay together.
31 * And at the same time retaining the unused fpstate, prevents changing
32 * the offset of extramask[] in the sigframe and thus prevent any
33 * legacy application accessing/modifying it.
34 */
35 struct _fpstate_ia32 fpstate_unused;
36#ifdef CONFIG_IA32_EMULATION
37 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
38#else /* !CONFIG_IA32_EMULATION */
39 unsigned long extramask[_NSIG_WORDS-1];
40#endif /* CONFIG_IA32_EMULATION */
41 char retcode[8];
42 /* fp state follows here */
43};
44
45struct rt_sigframe_ia32 {
46 u32 pretcode;
47 int sig;
48 u32 pinfo;
49 u32 puc;
50#ifdef CONFIG_IA32_EMULATION
51 compat_siginfo_t info;
52#else /* !CONFIG_IA32_EMULATION */
53 struct siginfo info;
54#endif /* CONFIG_IA32_EMULATION */
55 struct ucontext_ia32 uc;
56 char retcode[8];
57 /* fp state follows here */
58};
59#endif /* defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) */
60
61#ifdef CONFIG_X86_64
62struct rt_sigframe {
63 char __user *pretcode;
64 struct ucontext uc;
65 struct siginfo info;
66 /* fp state follows here */
67};
68#endif /* CONFIG_X86_64 */
69
70#endif /* _ASM_X86_SIGFRAME_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 96ac44f275da..7761a5d554bb 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -121,6 +121,10 @@ typedef unsigned long sigset_t;
121 121
122#ifndef __ASSEMBLY__ 122#ifndef __ASSEMBLY__
123 123
124# ifdef __KERNEL__
125extern void do_notify_resume(struct pt_regs *, void *, __u32);
126# endif /* __KERNEL__ */
127
124#ifdef __i386__ 128#ifdef __i386__
125# ifdef __KERNEL__ 129# ifdef __KERNEL__
126struct old_sigaction { 130struct old_sigaction {
@@ -141,8 +145,6 @@ struct k_sigaction {
141 struct sigaction sa; 145 struct sigaction sa;
142}; 146};
143 147
144extern void do_notify_resume(struct pt_regs *, void *, __u32);
145
146# else /* __KERNEL__ */ 148# else /* __KERNEL__ */
147/* Here we must cater to libcs that poke about in kernel headers. */ 149/* Here we must cater to libcs that poke about in kernel headers. */
148 150
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index d12811ce51d9..830b9fcb6427 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -60,7 +60,7 @@ struct smp_ops {
60 void (*cpu_die)(unsigned int cpu); 60 void (*cpu_die)(unsigned int cpu);
61 void (*play_dead)(void); 61 void (*play_dead)(void);
62 62
63 void (*send_call_func_ipi)(cpumask_t mask); 63 void (*send_call_func_ipi)(const struct cpumask *mask);
64 void (*send_call_func_single_ipi)(int cpu); 64 void (*send_call_func_single_ipi)(int cpu);
65}; 65};
66 66
@@ -125,7 +125,7 @@ static inline void arch_send_call_function_single_ipi(int cpu)
125 125
126static inline void arch_send_call_function_ipi(cpumask_t mask) 126static inline void arch_send_call_function_ipi(cpumask_t mask)
127{ 127{
128 smp_ops.send_call_func_ipi(mask); 128 smp_ops.send_call_func_ipi(&mask);
129} 129}
130 130
131void cpu_disable_common(void); 131void cpu_disable_common(void);
@@ -138,7 +138,7 @@ void native_cpu_die(unsigned int cpu);
138void native_play_dead(void); 138void native_play_dead(void);
139void play_dead_common(void); 139void play_dead_common(void);
140 140
141void native_send_call_func_ipi(cpumask_t mask); 141void native_send_call_func_ipi(const struct cpumask *mask);
142void native_send_call_func_single_ipi(int cpu); 142void native_send_call_func_single_ipi(int cpu);
143 143
144extern void prefill_possible_map(void); 144extern void prefill_possible_map(void);
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index be44f7dab395..e3cc3c063ec5 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,7 +27,7 @@
27#else /* CONFIG_X86_32 */ 27#else /* CONFIG_X86_32 */
28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ 28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
29# define MAX_PHYSADDR_BITS 44 29# define MAX_PHYSADDR_BITS 44
30# define MAX_PHYSMEM_BITS 44 30# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */
31#endif 31#endif
32 32
33#endif /* CONFIG_SPARSEMEM */ 33#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h
index 9b3070f1c2ac..4bb5fb34f030 100644
--- a/arch/x86/include/asm/summit/apic.h
+++ b/arch/x86/include/asm/summit/apic.h
@@ -14,13 +14,13 @@
14 14
15#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) 15#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
16 16
17static inline cpumask_t target_cpus(void) 17static inline const cpumask_t *target_cpus(void)
18{ 18{
19 /* CPU_MASK_ALL (0xff) has undefined behaviour with 19 /* CPU_MASK_ALL (0xff) has undefined behaviour with
20 * dest_LowestPrio mode logical clustered apic interrupt routing 20 * dest_LowestPrio mode logical clustered apic interrupt routing
21 * Just start on cpu 0. IRQ balancing will spread load 21 * Just start on cpu 0. IRQ balancing will spread load
22 */ 22 */
23 return cpumask_of_cpu(0); 23 return &cpumask_of_cpu(0);
24} 24}
25 25
26#define INT_DELIVERY_MODE (dest_LowestPrio) 26#define INT_DELIVERY_MODE (dest_LowestPrio)
@@ -52,7 +52,7 @@ static inline void init_apic_ldr(void)
52 int i; 52 int i;
53 53
54 /* Create logical APIC IDs by counting CPUs already in cluster. */ 54 /* Create logical APIC IDs by counting CPUs already in cluster. */
55 for (count = 0, i = NR_CPUS; --i >= 0; ) { 55 for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
56 lid = cpu_2_logical_apicid[i]; 56 lid = cpu_2_logical_apicid[i];
57 if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster) 57 if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster)
58 ++count; 58 ++count;
@@ -97,8 +97,8 @@ static inline int apicid_to_node(int logical_apicid)
97static inline int cpu_to_logical_apicid(int cpu) 97static inline int cpu_to_logical_apicid(int cpu)
98{ 98{
99#ifdef CONFIG_SMP 99#ifdef CONFIG_SMP
100 if (cpu >= NR_CPUS) 100 if (cpu >= nr_cpu_ids)
101 return BAD_APICID; 101 return BAD_APICID;
102 return (int)cpu_2_logical_apicid[cpu]; 102 return (int)cpu_2_logical_apicid[cpu];
103#else 103#else
104 return logical_smp_processor_id(); 104 return logical_smp_processor_id();
@@ -107,7 +107,7 @@ static inline int cpu_to_logical_apicid(int cpu)
107 107
108static inline int cpu_present_to_apicid(int mps_cpu) 108static inline int cpu_present_to_apicid(int mps_cpu)
109{ 109{
110 if (mps_cpu < NR_CPUS) 110 if (mps_cpu < nr_cpu_ids)
111 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); 111 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
112 else 112 else
113 return BAD_APICID; 113 return BAD_APICID;
@@ -137,25 +137,25 @@ static inline void enable_apic_mode(void)
137{ 137{
138} 138}
139 139
140static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) 140static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
141{ 141{
142 int num_bits_set; 142 int num_bits_set;
143 int cpus_found = 0; 143 int cpus_found = 0;
144 int cpu; 144 int cpu;
145 int apicid; 145 int apicid;
146 146
147 num_bits_set = cpus_weight(cpumask); 147 num_bits_set = cpus_weight(*cpumask);
148 /* Return id to all */ 148 /* Return id to all */
149 if (num_bits_set == NR_CPUS) 149 if (num_bits_set >= nr_cpu_ids)
150 return (int) 0xFF; 150 return (int) 0xFF;
151 /* 151 /*
152 * The cpus in the mask must all be on the apic cluster. If are not 152 * The cpus in the mask must all be on the apic cluster. If are not
153 * on the same apicid cluster return default value of TARGET_CPUS. 153 * on the same apicid cluster return default value of TARGET_CPUS.
154 */ 154 */
155 cpu = first_cpu(cpumask); 155 cpu = first_cpu(*cpumask);
156 apicid = cpu_to_logical_apicid(cpu); 156 apicid = cpu_to_logical_apicid(cpu);
157 while (cpus_found < num_bits_set) { 157 while (cpus_found < num_bits_set) {
158 if (cpu_isset(cpu, cpumask)) { 158 if (cpu_isset(cpu, *cpumask)) {
159 int new_apicid = cpu_to_logical_apicid(cpu); 159 int new_apicid = cpu_to_logical_apicid(cpu);
160 if (apicid_cluster(apicid) != 160 if (apicid_cluster(apicid) !=
161 apicid_cluster(new_apicid)){ 161 apicid_cluster(new_apicid)){
@@ -170,6 +170,23 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
170 return apicid; 170 return apicid;
171} 171}
172 172
173static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask,
174 const struct cpumask *andmask)
175{
176 int apicid = cpu_to_logical_apicid(0);
177 cpumask_var_t cpumask;
178
179 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
180 return apicid;
181
182 cpumask_and(cpumask, inmask, andmask);
183 cpumask_and(cpumask, cpumask, cpu_online_mask);
184 apicid = cpu_mask_to_apicid(cpumask);
185
186 free_cpumask_var(cpumask);
187 return apicid;
188}
189
173/* cpuid returns the value latched in the HW at reset, not the APIC ID 190/* cpuid returns the value latched in the HW at reset, not the APIC ID
174 * register's value. For any box whose BIOS changes APIC IDs, like 191 * register's value. For any box whose BIOS changes APIC IDs, like
175 * clustered APIC systems, we must use hard_smp_processor_id. 192 * clustered APIC systems, we must use hard_smp_processor_id.
diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h
index 53bd1e7bd7b4..a8a2c24f50cc 100644
--- a/arch/x86/include/asm/summit/ipi.h
+++ b/arch/x86/include/asm/summit/ipi.h
@@ -1,9 +1,10 @@
1#ifndef __ASM_SUMMIT_IPI_H 1#ifndef __ASM_SUMMIT_IPI_H
2#define __ASM_SUMMIT_IPI_H 2#define __ASM_SUMMIT_IPI_H
3 3
4void send_IPI_mask_sequence(cpumask_t mask, int vector); 4void send_IPI_mask_sequence(const cpumask_t *mask, int vector);
5void send_IPI_mask_allbutself(const cpumask_t *mask, int vector);
5 6
6static inline void send_IPI_mask(cpumask_t mask, int vector) 7static inline void send_IPI_mask(const cpumask_t *mask, int vector)
7{ 8{
8 send_IPI_mask_sequence(mask, vector); 9 send_IPI_mask_sequence(mask, vector);
9} 10}
@@ -14,12 +15,12 @@ static inline void send_IPI_allbutself(int vector)
14 cpu_clear(smp_processor_id(), mask); 15 cpu_clear(smp_processor_id(), mask);
15 16
16 if (!cpus_empty(mask)) 17 if (!cpus_empty(mask))
17 send_IPI_mask(mask, vector); 18 send_IPI_mask(&mask, vector);
18} 19}
19 20
20static inline void send_IPI_all(int vector) 21static inline void send_IPI_all(int vector)
21{ 22{
22 send_IPI_mask(cpu_online_map, vector); 23 send_IPI_mask(&cpu_online_map, vector);
23} 24}
24 25
25#endif /* __ASM_SUMMIT_IPI_H */ 26#endif /* __ASM_SUMMIT_IPI_H */
diff --git a/arch/x86/kvm/svm.h b/arch/x86/include/asm/svm.h
index 1b8afa78e869..1b8afa78e869 100644
--- a/arch/x86/kvm/svm.h
+++ b/arch/x86/include/asm/svm.h
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
new file mode 100644
index 000000000000..ffb08be2a530
--- /dev/null
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -0,0 +1,101 @@
1/*
2 * sys_ia32.h - Linux ia32 syscall interfaces
3 *
4 * Copyright (c) 2008 Jaswinder Singh Rajput
5 *
6 * This file is released under the GPLv2.
7 * See the file COPYING for more details.
8 */
9
10#ifndef _ASM_X86_SYS_IA32_H
11#define _ASM_X86_SYS_IA32_H
12
13#include <linux/compiler.h>
14#include <linux/linkage.h>
15#include <linux/types.h>
16#include <linux/signal.h>
17#include <asm/compat.h>
18#include <asm/ia32.h>
19
20/* ia32/sys_ia32.c */
21asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long);
22asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long);
23
24asmlinkage long sys32_stat64(char __user *, struct stat64 __user *);
25asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
26asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
27asmlinkage long sys32_fstatat(unsigned int, char __user *,
28 struct stat64 __user *, int);
29struct mmap_arg_struct;
30asmlinkage long sys32_mmap(struct mmap_arg_struct __user *);
31asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);
32
33asmlinkage long sys32_pipe(int __user *);
34struct sigaction32;
35struct old_sigaction32;
36asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *,
37 struct sigaction32 __user *, unsigned int);
38asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *,
39 struct old_sigaction32 __user *);
40asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *,
41 compat_sigset_t __user *, unsigned int);
42asmlinkage long sys32_alarm(unsigned int);
43
44struct sel_arg_struct;
45asmlinkage long sys32_old_select(struct sel_arg_struct __user *);
46asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int);
47asmlinkage long sys32_sysfs(int, u32, u32);
48
49asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
50 struct compat_timespec __user *);
51asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
52asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
53
54#ifdef CONFIG_SYSCTL_SYSCALL
55struct sysctl_ia32;
56asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *);
57#endif
58
59asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
60asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
61
62asmlinkage long sys32_personality(unsigned long);
63asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
64
65asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long,
66 unsigned long, unsigned long, unsigned long);
67
68struct oldold_utsname;
69struct old_utsname;
70asmlinkage long sys32_olduname(struct oldold_utsname __user *);
71long sys32_uname(struct old_utsname __user *);
72
73long sys32_ustat(unsigned, struct ustat32 __user *);
74
75asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
76 compat_uptr_t __user *, struct pt_regs *);
77asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
78
79long sys32_lseek(unsigned int, int, unsigned int);
80long sys32_kill(int, int);
81long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int);
82long sys32_vm86_warning(void);
83long sys32_lookup_dcookie(u32, u32, char __user *, size_t);
84
85asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t);
86asmlinkage long sys32_sync_file_range(int, unsigned, unsigned,
87 unsigned, unsigned, int);
88asmlinkage long sys32_fadvise64(int, unsigned, unsigned, size_t, int);
89asmlinkage long sys32_fallocate(int, int, unsigned,
90 unsigned, unsigned, unsigned);
91
92/* ia32/ia32_signal.c */
93asmlinkage long sys32_sigsuspend(int, int, old_sigset_t);
94asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *,
95 stack_ia32_t __user *, struct pt_regs *);
96asmlinkage long sys32_sigreturn(struct pt_regs *);
97asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
98
99/* ia32/ipc32.c */
100asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
101#endif /* _ASM_X86_SYS_IA32_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 87803da44010..9c6797c3e56c 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -19,6 +19,13 @@
19/* kernel/ioport.c */ 19/* kernel/ioport.c */
20asmlinkage long sys_ioperm(unsigned long, unsigned long, int); 20asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
21 21
22/* kernel/ldt.c */
23asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
24
25/* kernel/tls.c */
26asmlinkage int sys_set_thread_area(struct user_desc __user *);
27asmlinkage int sys_get_thread_area(struct user_desc __user *);
28
22/* X86_32 only */ 29/* X86_32 only */
23#ifdef CONFIG_X86_32 30#ifdef CONFIG_X86_32
24/* kernel/process_32.c */ 31/* kernel/process_32.c */
@@ -33,14 +40,11 @@ asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
33 struct old_sigaction __user *); 40 struct old_sigaction __user *);
34asmlinkage int sys_sigaltstack(unsigned long); 41asmlinkage int sys_sigaltstack(unsigned long);
35asmlinkage unsigned long sys_sigreturn(unsigned long); 42asmlinkage unsigned long sys_sigreturn(unsigned long);
36asmlinkage int sys_rt_sigreturn(unsigned long); 43asmlinkage int sys_rt_sigreturn(struct pt_regs);
37 44
38/* kernel/ioport.c */ 45/* kernel/ioport.c */
39asmlinkage long sys_iopl(unsigned long); 46asmlinkage long sys_iopl(unsigned long);
40 47
41/* kernel/ldt.c */
42asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
43
44/* kernel/sys_i386_32.c */ 48/* kernel/sys_i386_32.c */
45asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, 49asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
46 unsigned long, unsigned long, unsigned long); 50 unsigned long, unsigned long, unsigned long);
@@ -54,10 +58,6 @@ asmlinkage int sys_uname(struct old_utsname __user *);
54struct oldold_utsname; 58struct oldold_utsname;
55asmlinkage int sys_olduname(struct oldold_utsname __user *); 59asmlinkage int sys_olduname(struct oldold_utsname __user *);
56 60
57/* kernel/tls.c */
58asmlinkage int sys_set_thread_area(struct user_desc __user *);
59asmlinkage int sys_get_thread_area(struct user_desc __user *);
60
61/* kernel/vm86_32.c */ 61/* kernel/vm86_32.c */
62asmlinkage int sys_vm86old(struct pt_regs); 62asmlinkage int sys_vm86old(struct pt_regs);
63asmlinkage int sys_vm86(struct pt_regs); 63asmlinkage int sys_vm86(struct pt_regs);
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 2ed3f0f44ff7..8e626ea33a1a 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -17,12 +17,12 @@
17# define AT_VECTOR_SIZE_ARCH 1 17# define AT_VECTOR_SIZE_ARCH 1
18#endif 18#endif
19 19
20#ifdef CONFIG_X86_32
21
22struct task_struct; /* one of the stranger aspects of C forward declarations */ 20struct task_struct; /* one of the stranger aspects of C forward declarations */
23struct task_struct *__switch_to(struct task_struct *prev, 21struct task_struct *__switch_to(struct task_struct *prev,
24 struct task_struct *next); 22 struct task_struct *next);
25 23
24#ifdef CONFIG_X86_32
25
26/* 26/*
27 * Saving eflags is important. It switches not only IOPL between tasks, 27 * Saving eflags is important. It switches not only IOPL between tasks,
28 * it also protects other tasks from NT leaking through sysenter etc. 28 * it also protects other tasks from NT leaking through sysenter etc.
@@ -314,6 +314,8 @@ extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
314 314
315void default_idle(void); 315void default_idle(void);
316 316
317void stop_this_cpu(void *dummy);
318
317/* 319/*
318 * Force strict CPU ordering. 320 * Force strict CPU ordering.
319 * And yes, this is required on UP too when we're talking 321 * And yes, this is required on UP too when we're talking
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379faad2..98789647baa9 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -20,11 +20,13 @@
20struct task_struct; 20struct task_struct;
21struct exec_domain; 21struct exec_domain;
22#include <asm/processor.h> 22#include <asm/processor.h>
23#include <asm/ftrace.h>
24#include <asm/atomic.h>
23 25
24struct thread_info { 26struct thread_info {
25 struct task_struct *task; /* main task structure */ 27 struct task_struct *task; /* main task structure */
26 struct exec_domain *exec_domain; /* execution domain */ 28 struct exec_domain *exec_domain; /* execution domain */
27 unsigned long flags; /* low level flags */ 29 __u32 flags; /* low level flags */
28 __u32 status; /* thread synchronous flags */ 30 __u32 status; /* thread synchronous flags */
29 __u32 cpu; /* current CPU */ 31 __u32 cpu; /* current CPU */
30 int preempt_count; /* 0 => preemptable, 32 int preempt_count; /* 0 => preemptable,
@@ -91,7 +93,6 @@ struct thread_info {
91#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 93#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
92#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 94#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
93#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ 95#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
94#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
95 96
96#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 97#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
97#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 98#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -113,7 +114,6 @@ struct thread_info {
113#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 114#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
114#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 115#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
115#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) 116#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
116#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
117 117
118/* work to do in syscall_trace_enter() */ 118/* work to do in syscall_trace_enter() */
119#define _TIF_WORK_SYSCALL_ENTRY \ 119#define _TIF_WORK_SYSCALL_ENTRY \
@@ -139,8 +139,7 @@ struct thread_info {
139 139
140/* flags to check in __switch_to() */ 140/* flags to check in __switch_to() */
141#define _TIF_WORK_CTXSW \ 141#define _TIF_WORK_CTXSW \
142 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ 142 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
143 _TIF_NOTSC)
144 143
145#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW 144#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
146#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) 145#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index ff386ff50ed7..4e2f2e0aab27 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -61,13 +61,19 @@ static inline int cpu_to_node(int cpu)
61 * 61 *
62 * Side note: this function creates the returned cpumask on the stack 62 * Side note: this function creates the returned cpumask on the stack
63 * so with a high NR_CPUS count, excessive stack space is used. The 63 * so with a high NR_CPUS count, excessive stack space is used. The
64 * node_to_cpumask_ptr function should be used whenever possible. 64 * cpumask_of_node function should be used whenever possible.
65 */ 65 */
66static inline cpumask_t node_to_cpumask(int node) 66static inline cpumask_t node_to_cpumask(int node)
67{ 67{
68 return node_to_cpumask_map[node]; 68 return node_to_cpumask_map[node];
69} 69}
70 70
71/* Returns a bitmask of CPUs on Node 'node'. */
72static inline const struct cpumask *cpumask_of_node(int node)
73{
74 return &node_to_cpumask_map[node];
75}
76
71#else /* CONFIG_X86_64 */ 77#else /* CONFIG_X86_64 */
72 78
73/* Mappings between node number and cpus on that node. */ 79/* Mappings between node number and cpus on that node. */
@@ -82,7 +88,7 @@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
82#ifdef CONFIG_DEBUG_PER_CPU_MAPS 88#ifdef CONFIG_DEBUG_PER_CPU_MAPS
83extern int cpu_to_node(int cpu); 89extern int cpu_to_node(int cpu);
84extern int early_cpu_to_node(int cpu); 90extern int early_cpu_to_node(int cpu);
85extern const cpumask_t *_node_to_cpumask_ptr(int node); 91extern const cpumask_t *cpumask_of_node(int node);
86extern cpumask_t node_to_cpumask(int node); 92extern cpumask_t node_to_cpumask(int node);
87 93
88#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ 94#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
@@ -103,7 +109,7 @@ static inline int early_cpu_to_node(int cpu)
103} 109}
104 110
105/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ 111/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
106static inline const cpumask_t *_node_to_cpumask_ptr(int node) 112static inline const cpumask_t *cpumask_of_node(int node)
107{ 113{
108 return &node_to_cpumask_map[node]; 114 return &node_to_cpumask_map[node];
109} 115}
@@ -116,12 +122,15 @@ static inline cpumask_t node_to_cpumask(int node)
116 122
117#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 123#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
118 124
119/* Replace default node_to_cpumask_ptr with optimized version */ 125/*
126 * Replace default node_to_cpumask_ptr with optimized version
127 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
128 */
120#define node_to_cpumask_ptr(v, node) \ 129#define node_to_cpumask_ptr(v, node) \
121 const cpumask_t *v = _node_to_cpumask_ptr(node) 130 const cpumask_t *v = cpumask_of_node(node)
122 131
123#define node_to_cpumask_ptr_next(v, node) \ 132#define node_to_cpumask_ptr_next(v, node) \
124 v = _node_to_cpumask_ptr(node) 133 v = cpumask_of_node(node)
125 134
126#endif /* CONFIG_X86_64 */ 135#endif /* CONFIG_X86_64 */
127 136
@@ -187,7 +196,7 @@ extern int __node_distance(int, int);
187#define cpu_to_node(cpu) 0 196#define cpu_to_node(cpu) 0
188#define early_cpu_to_node(cpu) 0 197#define early_cpu_to_node(cpu) 0
189 198
190static inline const cpumask_t *_node_to_cpumask_ptr(int node) 199static inline const cpumask_t *cpumask_of_node(int node)
191{ 200{
192 return &cpu_online_map; 201 return &cpu_online_map;
193} 202}
@@ -200,12 +209,15 @@ static inline int node_to_first_cpu(int node)
200 return first_cpu(cpu_online_map); 209 return first_cpu(cpu_online_map);
201} 210}
202 211
203/* Replace default node_to_cpumask_ptr with optimized version */ 212/*
213 * Replace default node_to_cpumask_ptr with optimized version
214 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
215 */
204#define node_to_cpumask_ptr(v, node) \ 216#define node_to_cpumask_ptr(v, node) \
205 const cpumask_t *v = _node_to_cpumask_ptr(node) 217 const cpumask_t *v = cpumask_of_node(node)
206 218
207#define node_to_cpumask_ptr_next(v, node) \ 219#define node_to_cpumask_ptr_next(v, node) \
208 v = _node_to_cpumask_ptr(node) 220 v = cpumask_of_node(node)
209#endif 221#endif
210 222
211#include <asm-generic/topology.h> 223#include <asm-generic/topology.h>
@@ -214,18 +226,20 @@ static inline int node_to_first_cpu(int node)
214/* Returns the number of the first CPU on Node 'node'. */ 226/* Returns the number of the first CPU on Node 'node'. */
215static inline int node_to_first_cpu(int node) 227static inline int node_to_first_cpu(int node)
216{ 228{
217 node_to_cpumask_ptr(mask, node); 229 return cpumask_first(cpumask_of_node(node));
218 return first_cpu(*mask);
219} 230}
220#endif 231#endif
221 232
222extern cpumask_t cpu_coregroup_map(int cpu); 233extern cpumask_t cpu_coregroup_map(int cpu);
234extern const struct cpumask *cpu_coregroup_mask(int cpu);
223 235
224#ifdef ENABLE_TOPO_DEFINES 236#ifdef ENABLE_TOPO_DEFINES
225#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) 237#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
226#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) 238#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
227#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) 239#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu))
228#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) 240#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu))
241#define topology_core_cpumask(cpu) (&per_cpu(cpu_core_map, cpu))
242#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu))
229 243
230/* indicates that pointers to the topology cpumask_t maps are valid */ 244/* indicates that pointers to the topology cpumask_t maps are valid */
231#define arch_provides_topology_pointers yes 245#define arch_provides_topology_pointers yes
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index fa0d79facdbc..780ba0ab94f9 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -3,6 +3,7 @@
3 3
4#ifndef __ASSEMBLY__ 4#ifndef __ASSEMBLY__
5 5
6#ifdef CONFIG_X86_TRAMPOLINE
6/* 7/*
7 * Trampoline 80x86 program as an array. 8 * Trampoline 80x86 program as an array.
8 */ 9 */
@@ -13,8 +14,14 @@ extern unsigned char *trampoline_base;
13extern unsigned long init_rsp; 14extern unsigned long init_rsp;
14extern unsigned long initial_code; 15extern unsigned long initial_code;
15 16
17#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
16#define TRAMPOLINE_BASE 0x6000 18#define TRAMPOLINE_BASE 0x6000
19
17extern unsigned long setup_trampoline(void); 20extern unsigned long setup_trampoline(void);
21extern void __init reserve_trampoline_memory(void);
22#else
23static inline void reserve_trampoline_memory(void) {};
24#endif /* CONFIG_X86_TRAMPOLINE */
18 25
19#endif /* __ASSEMBLY__ */ 26#endif /* __ASSEMBLY__ */
20 27
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 45dee286e45c..2ee0a3bceedf 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -46,6 +46,10 @@ dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long);
46dotraplinkage void do_invalid_TSS(struct pt_regs *, long); 46dotraplinkage void do_invalid_TSS(struct pt_regs *, long);
47dotraplinkage void do_segment_not_present(struct pt_regs *, long); 47dotraplinkage void do_segment_not_present(struct pt_regs *, long);
48dotraplinkage void do_stack_segment(struct pt_regs *, long); 48dotraplinkage void do_stack_segment(struct pt_regs *, long);
49#ifdef CONFIG_X86_64
50dotraplinkage void do_double_fault(struct pt_regs *, long);
51asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *);
52#endif
49dotraplinkage void do_general_protection(struct pt_regs *, long); 53dotraplinkage void do_general_protection(struct pt_regs *, long);
50dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); 54dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
51dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); 55dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
@@ -72,10 +76,13 @@ static inline int get_si_code(unsigned long condition)
72extern int panic_on_unrecovered_nmi; 76extern int panic_on_unrecovered_nmi;
73extern int kstack_depth_to_print; 77extern int kstack_depth_to_print;
74 78
75#ifdef CONFIG_X86_32
76void math_error(void __user *); 79void math_error(void __user *);
77unsigned long patch_espfix_desc(unsigned long, unsigned long);
78asmlinkage void math_emulate(long); 80asmlinkage void math_emulate(long);
81#ifdef CONFIG_X86_32
82unsigned long patch_espfix_desc(unsigned long, unsigned long);
83#else
84asmlinkage void smp_thermal_interrupt(void);
85asmlinkage void mce_threshold_interrupt(void);
79#endif 86#endif
80 87
81#endif /* _ASM_X86_TRAPS_H */ 88#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 9cd83a8e40d5..38ae163cc91b 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -34,8 +34,6 @@ static inline cycles_t get_cycles(void)
34 34
35static __always_inline cycles_t vget_cycles(void) 35static __always_inline cycles_t vget_cycles(void)
36{ 36{
37 cycles_t cycles;
38
39 /* 37 /*
40 * We only do VDSOs on TSC capable CPUs, so this shouldnt 38 * We only do VDSOs on TSC capable CPUs, so this shouldnt
41 * access boot_cpu_data (which is not VDSO-safe): 39 * access boot_cpu_data (which is not VDSO-safe):
@@ -44,11 +42,7 @@ static __always_inline cycles_t vget_cycles(void)
44 if (!cpu_has_tsc) 42 if (!cpu_has_tsc)
45 return 0; 43 return 0;
46#endif 44#endif
47 rdtsc_barrier(); 45 return (cycles_t)__native_read_tsc();
48 cycles = (cycles_t)__native_read_tsc();
49 rdtsc_barrier();
50
51 return cycles;
52} 46}
53 47
54extern void tsc_init(void); 48extern void tsc_init(void);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 35c54921b2e4..4340055b7559 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -157,6 +157,7 @@ extern int __get_user_bad(void);
157 int __ret_gu; \ 157 int __ret_gu; \
158 unsigned long __val_gu; \ 158 unsigned long __val_gu; \
159 __chk_user_ptr(ptr); \ 159 __chk_user_ptr(ptr); \
160 might_fault(); \
160 switch (sizeof(*(ptr))) { \ 161 switch (sizeof(*(ptr))) { \
161 case 1: \ 162 case 1: \
162 __get_user_x(1, __ret_gu, __val_gu, ptr); \ 163 __get_user_x(1, __ret_gu, __val_gu, ptr); \
@@ -241,6 +242,7 @@ extern void __put_user_8(void);
241 int __ret_pu; \ 242 int __ret_pu; \
242 __typeof__(*(ptr)) __pu_val; \ 243 __typeof__(*(ptr)) __pu_val; \
243 __chk_user_ptr(ptr); \ 244 __chk_user_ptr(ptr); \
245 might_fault(); \
244 __pu_val = x; \ 246 __pu_val = x; \
245 switch (sizeof(*(ptr))) { \ 247 switch (sizeof(*(ptr))) { \
246 case 1: \ 248 case 1: \
@@ -350,14 +352,14 @@ do { \
350 352
351#define __put_user_nocheck(x, ptr, size) \ 353#define __put_user_nocheck(x, ptr, size) \
352({ \ 354({ \
353 long __pu_err; \ 355 int __pu_err; \
354 __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ 356 __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
355 __pu_err; \ 357 __pu_err; \
356}) 358})
357 359
358#define __get_user_nocheck(x, ptr, size) \ 360#define __get_user_nocheck(x, ptr, size) \
359({ \ 361({ \
360 long __gu_err; \ 362 int __gu_err; \
361 unsigned long __gu_val; \ 363 unsigned long __gu_val; \
362 __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ 364 __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
363 (x) = (__force __typeof__(*(ptr)))__gu_val; \ 365 (x) = (__force __typeof__(*(ptr)))__gu_val; \
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index d095a3aeea1b..5e06259e90e5 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -82,8 +82,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
82static __always_inline unsigned long __must_check 82static __always_inline unsigned long __must_check
83__copy_to_user(void __user *to, const void *from, unsigned long n) 83__copy_to_user(void __user *to, const void *from, unsigned long n)
84{ 84{
85 might_sleep(); 85 might_fault();
86 return __copy_to_user_inatomic(to, from, n); 86 return __copy_to_user_inatomic(to, from, n);
87} 87}
88 88
89static __always_inline unsigned long 89static __always_inline unsigned long
@@ -137,7 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
137static __always_inline unsigned long 137static __always_inline unsigned long
138__copy_from_user(void *to, const void __user *from, unsigned long n) 138__copy_from_user(void *to, const void __user *from, unsigned long n)
139{ 139{
140 might_sleep(); 140 might_fault();
141 if (__builtin_constant_p(n)) { 141 if (__builtin_constant_p(n)) {
142 unsigned long ret; 142 unsigned long ret;
143 143
@@ -159,7 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
159static __always_inline unsigned long __copy_from_user_nocache(void *to, 159static __always_inline unsigned long __copy_from_user_nocache(void *to,
160 const void __user *from, unsigned long n) 160 const void __user *from, unsigned long n)
161{ 161{
162 might_sleep(); 162 might_fault();
163 if (__builtin_constant_p(n)) { 163 if (__builtin_constant_p(n)) {
164 unsigned long ret; 164 unsigned long ret;
165 165
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index f8cfd00db450..84210c479fca 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -29,6 +29,8 @@ static __always_inline __must_check
29int __copy_from_user(void *dst, const void __user *src, unsigned size) 29int __copy_from_user(void *dst, const void __user *src, unsigned size)
30{ 30{
31 int ret = 0; 31 int ret = 0;
32
33 might_fault();
32 if (!__builtin_constant_p(size)) 34 if (!__builtin_constant_p(size))
33 return copy_user_generic(dst, (__force void *)src, size); 35 return copy_user_generic(dst, (__force void *)src, size);
34 switch (size) { 36 switch (size) {
@@ -71,6 +73,8 @@ static __always_inline __must_check
71int __copy_to_user(void __user *dst, const void *src, unsigned size) 73int __copy_to_user(void __user *dst, const void *src, unsigned size)
72{ 74{
73 int ret = 0; 75 int ret = 0;
76
77 might_fault();
74 if (!__builtin_constant_p(size)) 78 if (!__builtin_constant_p(size))
75 return copy_user_generic((__force void *)dst, src, size); 79 return copy_user_generic((__force void *)dst, src, size);
76 switch (size) { 80 switch (size) {
@@ -113,6 +117,8 @@ static __always_inline __must_check
113int __copy_in_user(void __user *dst, const void __user *src, unsigned size) 117int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
114{ 118{
115 int ret = 0; 119 int ret = 0;
120
121 might_fault();
116 if (!__builtin_constant_p(size)) 122 if (!__builtin_constant_p(size))
117 return copy_user_generic((__force void *)dst, 123 return copy_user_generic((__force void *)dst,
118 (__force void *)src, size); 124 (__force void *)src, size);
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index d931d3b7e6f7..7ed17ff502b9 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -32,13 +32,18 @@
32enum uv_bios_cmd { 32enum uv_bios_cmd {
33 UV_BIOS_COMMON, 33 UV_BIOS_COMMON,
34 UV_BIOS_GET_SN_INFO, 34 UV_BIOS_GET_SN_INFO,
35 UV_BIOS_FREQ_BASE 35 UV_BIOS_FREQ_BASE,
36 UV_BIOS_WATCHLIST_ALLOC,
37 UV_BIOS_WATCHLIST_FREE,
38 UV_BIOS_MEMPROTECT,
39 UV_BIOS_GET_PARTITION_ADDR
36}; 40};
37 41
38/* 42/*
39 * Status values returned from a BIOS call. 43 * Status values returned from a BIOS call.
40 */ 44 */
41enum { 45enum {
46 BIOS_STATUS_MORE_PASSES = 1,
42 BIOS_STATUS_SUCCESS = 0, 47 BIOS_STATUS_SUCCESS = 0,
43 BIOS_STATUS_UNIMPLEMENTED = -ENOSYS, 48 BIOS_STATUS_UNIMPLEMENTED = -ENOSYS,
44 BIOS_STATUS_EINVAL = -EINVAL, 49 BIOS_STATUS_EINVAL = -EINVAL,
@@ -71,6 +76,21 @@ union partition_info_u {
71 }; 76 };
72}; 77};
73 78
79union uv_watchlist_u {
80 u64 val;
81 struct {
82 u64 blade : 16,
83 size : 32,
84 filler : 16;
85 };
86};
87
88enum uv_memprotect {
89 UV_MEMPROT_RESTRICT_ACCESS,
90 UV_MEMPROT_ALLOW_AMO,
91 UV_MEMPROT_ALLOW_RW
92};
93
74/* 94/*
75 * bios calls have 6 parameters 95 * bios calls have 6 parameters
76 */ 96 */
@@ -80,14 +100,20 @@ extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
80 100
81extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); 101extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *);
82extern s64 uv_bios_freq_base(u64, u64 *); 102extern s64 uv_bios_freq_base(u64, u64 *);
103extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int,
104 unsigned long *);
105extern int uv_bios_mq_watchlist_free(int, int);
106extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
107extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
83 108
84extern void uv_bios_init(void); 109extern void uv_bios_init(void);
85 110
111extern unsigned long sn_rtc_cycles_per_second;
86extern int uv_type; 112extern int uv_type;
87extern long sn_partition_id; 113extern long sn_partition_id;
88extern long uv_coherency_id; 114extern long sn_coherency_id;
89extern long uv_region_size; 115extern long sn_region_size;
90#define partition_coherence_id() (uv_coherency_id) 116#define partition_coherence_id() (sn_coherency_id)
91 117
92extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ 118extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
93 119
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index e2363253bbbf..50423c7b56b2 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -133,61 +133,61 @@ struct bau_msg_payload {
133 * see table 4.2.3.0.1 in broacast_assist spec. 133 * see table 4.2.3.0.1 in broacast_assist spec.
134 */ 134 */
135struct bau_msg_header { 135struct bau_msg_header {
136 int dest_subnodeid:6; /* must be zero */ 136 unsigned int dest_subnodeid:6; /* must be zero */
137 /* bits 5:0 */ 137 /* bits 5:0 */
138 int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */ 138 unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */
139 /* bits 20:6 */ 139 /* bits 20:6 */ /* first bit in node_map */
140 int command:8; /* message type */ 140 unsigned int command:8; /* message type */
141 /* bits 28:21 */ 141 /* bits 28:21 */
142 /* 0x38: SN3net EndPoint Message */ 142 /* 0x38: SN3net EndPoint Message */
143 int rsvd_1:3; /* must be zero */ 143 unsigned int rsvd_1:3; /* must be zero */
144 /* bits 31:29 */ 144 /* bits 31:29 */
145 /* int will align on 32 bits */ 145 /* int will align on 32 bits */
146 int rsvd_2:9; /* must be zero */ 146 unsigned int rsvd_2:9; /* must be zero */
147 /* bits 40:32 */ 147 /* bits 40:32 */
148 /* Suppl_A is 56-41 */ 148 /* Suppl_A is 56-41 */
149 int payload_2a:8; /* becomes byte 16 of msg */ 149 unsigned int payload_2a:8;/* becomes byte 16 of msg */
150 /* bits 48:41 */ /* not currently using */ 150 /* bits 48:41 */ /* not currently using */
151 int payload_2b:8; /* becomes byte 17 of msg */ 151 unsigned int payload_2b:8;/* becomes byte 17 of msg */
152 /* bits 56:49 */ /* not currently using */ 152 /* bits 56:49 */ /* not currently using */
153 /* Address field (96:57) is never used as an 153 /* Address field (96:57) is never used as an
154 address (these are address bits 42:3) */ 154 address (these are address bits 42:3) */
155 int rsvd_3:1; /* must be zero */ 155 unsigned int rsvd_3:1; /* must be zero */
156 /* bit 57 */ 156 /* bit 57 */
157 /* address bits 27:4 are payload */ 157 /* address bits 27:4 are payload */
158 /* these 24 bits become bytes 12-14 of msg */ 158 /* these 24 bits become bytes 12-14 of msg */
159 int replied_to:1; /* sent as 0 by the source to byte 12 */ 159 unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */
160 /* bit 58 */ 160 /* bit 58 */
161 161
162 int payload_1a:5; /* not currently used */ 162 unsigned int payload_1a:5;/* not currently used */
163 /* bits 63:59 */ 163 /* bits 63:59 */
164 int payload_1b:8; /* not currently used */ 164 unsigned int payload_1b:8;/* not currently used */
165 /* bits 71:64 */ 165 /* bits 71:64 */
166 int payload_1c:8; /* not currently used */ 166 unsigned int payload_1c:8;/* not currently used */
167 /* bits 79:72 */ 167 /* bits 79:72 */
168 int payload_1d:2; /* not currently used */ 168 unsigned int payload_1d:2;/* not currently used */
169 /* bits 81:80 */ 169 /* bits 81:80 */
170 170
171 int rsvd_4:7; /* must be zero */ 171 unsigned int rsvd_4:7; /* must be zero */
172 /* bits 88:82 */ 172 /* bits 88:82 */
173 int sw_ack_flag:1; /* software acknowledge flag */ 173 unsigned int sw_ack_flag:1;/* software acknowledge flag */
174 /* bit 89 */ 174 /* bit 89 */
175 /* INTD trasactions at destination are to 175 /* INTD trasactions at destination are to
176 wait for software acknowledge */ 176 wait for software acknowledge */
177 int rsvd_5:6; /* must be zero */ 177 unsigned int rsvd_5:6; /* must be zero */
178 /* bits 95:90 */ 178 /* bits 95:90 */
179 int rsvd_6:5; /* must be zero */ 179 unsigned int rsvd_6:5; /* must be zero */
180 /* bits 100:96 */ 180 /* bits 100:96 */
181 int int_both:1; /* if 1, interrupt both sockets on the blade */ 181 unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */
182 /* bit 101*/ 182 /* bit 101*/
183 int fairness:3; /* usually zero */ 183 unsigned int fairness:3;/* usually zero */
184 /* bits 104:102 */ 184 /* bits 104:102 */
185 int multilevel:1; /* multi-level multicast format */ 185 unsigned int multilevel:1; /* multi-level multicast format */
186 /* bit 105 */ 186 /* bit 105 */
187 /* 0 for TLB: endpoint multi-unicast messages */ 187 /* 0 for TLB: endpoint multi-unicast messages */
188 int chaining:1; /* next descriptor is part of this activation*/ 188 unsigned int chaining:1;/* next descriptor is part of this activation*/
189 /* bit 106 */ 189 /* bit 106 */
190 int rsvd_7:21; /* must be zero */ 190 unsigned int rsvd_7:21; /* must be zero */
191 /* bits 127:107 */ 191 /* bits 127:107 */
192}; 192};
193 193
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 7a5782610b2b..777327ef05c1 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -113,25 +113,37 @@
113 */ 113 */
114#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2) 114#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2)
115 115
116struct uv_scir_s {
117 struct timer_list timer;
118 unsigned long offset;
119 unsigned long last;
120 unsigned long idle_on;
121 unsigned long idle_off;
122 unsigned char state;
123 unsigned char enabled;
124};
125
116/* 126/*
117 * The following defines attributes of the HUB chip. These attributes are 127 * The following defines attributes of the HUB chip. These attributes are
118 * frequently referenced and are kept in the per-cpu data areas of each cpu. 128 * frequently referenced and are kept in the per-cpu data areas of each cpu.
119 * They are kept together in a struct to minimize cache misses. 129 * They are kept together in a struct to minimize cache misses.
120 */ 130 */
121struct uv_hub_info_s { 131struct uv_hub_info_s {
122 unsigned long global_mmr_base; 132 unsigned long global_mmr_base;
123 unsigned long gpa_mask; 133 unsigned long gpa_mask;
124 unsigned long gnode_upper; 134 unsigned long gnode_upper;
125 unsigned long lowmem_remap_top; 135 unsigned long lowmem_remap_top;
126 unsigned long lowmem_remap_base; 136 unsigned long lowmem_remap_base;
127 unsigned short pnode; 137 unsigned short pnode;
128 unsigned short pnode_mask; 138 unsigned short pnode_mask;
129 unsigned short coherency_domain_number; 139 unsigned short coherency_domain_number;
130 unsigned short numa_blade_id; 140 unsigned short numa_blade_id;
131 unsigned char blade_processor_id; 141 unsigned char blade_processor_id;
132 unsigned char m_val; 142 unsigned char m_val;
133 unsigned char n_val; 143 unsigned char n_val;
144 struct uv_scir_s scir;
134}; 145};
146
135DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 147DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
136#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) 148#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
137#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) 149#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
@@ -163,6 +175,30 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
163 175
164#define UV_APIC_PNODE_SHIFT 6 176#define UV_APIC_PNODE_SHIFT 6
165 177
178/* Local Bus from cpu's perspective */
179#define LOCAL_BUS_BASE 0x1c00000
180#define LOCAL_BUS_SIZE (4 * 1024 * 1024)
181
182/*
183 * System Controller Interface Reg
184 *
185 * Note there are NO leds on a UV system. This register is only
186 * used by the system controller to monitor system-wide operation.
187 * There are 64 regs per node. With Nahelem cpus (2 cores per node,
188 * 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on
189 * a node.
190 *
191 * The window is located at top of ACPI MMR space
192 */
193#define SCIR_WINDOW_COUNT 64
194#define SCIR_LOCAL_MMR_BASE (LOCAL_BUS_BASE + \
195 LOCAL_BUS_SIZE - \
196 SCIR_WINDOW_COUNT)
197
198#define SCIR_CPU_HEARTBEAT 0x01 /* timer interrupt */
199#define SCIR_CPU_ACTIVITY 0x02 /* not idle */
200#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */
201
166/* 202/*
167 * Macros for converting between kernel virtual addresses, socket local physical 203 * Macros for converting between kernel virtual addresses, socket local physical
168 * addresses, and UV global physical addresses. 204 * addresses, and UV global physical addresses.
@@ -174,7 +210,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
174static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) 210static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
175{ 211{
176 if (paddr < uv_hub_info->lowmem_remap_top) 212 if (paddr < uv_hub_info->lowmem_remap_top)
177 paddr += uv_hub_info->lowmem_remap_base; 213 paddr |= uv_hub_info->lowmem_remap_base;
178 return paddr | uv_hub_info->gnode_upper; 214 return paddr | uv_hub_info->gnode_upper;
179} 215}
180 216
@@ -182,19 +218,7 @@ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
182/* socket virtual --> UV global physical address */ 218/* socket virtual --> UV global physical address */
183static inline unsigned long uv_gpa(void *v) 219static inline unsigned long uv_gpa(void *v)
184{ 220{
185 return __pa(v) | uv_hub_info->gnode_upper; 221 return uv_soc_phys_ram_to_gpa(__pa(v));
186}
187
188/* socket virtual --> UV global physical address */
189static inline void *uv_vgpa(void *v)
190{
191 return (void *)uv_gpa(v);
192}
193
194/* UV global physical address --> socket virtual */
195static inline void *uv_va(unsigned long gpa)
196{
197 return __va(gpa & uv_hub_info->gpa_mask);
198} 222}
199 223
200/* pnode, offset --> socket virtual */ 224/* pnode, offset --> socket virtual */
@@ -277,6 +301,16 @@ static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
277 *uv_local_mmr_address(offset) = val; 301 *uv_local_mmr_address(offset) = val;
278} 302}
279 303
304static inline unsigned char uv_read_local_mmr8(unsigned long offset)
305{
306 return *((unsigned char *)uv_local_mmr_address(offset));
307}
308
309static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val)
310{
311 *((unsigned char *)uv_local_mmr_address(offset)) = val;
312}
313
280/* 314/*
281 * Structures and definitions for converting between cpu, node, pnode, and blade 315 * Structures and definitions for converting between cpu, node, pnode, and blade
282 * numbers. 316 * numbers.
@@ -351,5 +385,20 @@ static inline int uv_num_possible_blades(void)
351 return uv_possible_blades; 385 return uv_possible_blades;
352} 386}
353 387
354#endif /* _ASM_X86_UV_UV_HUB_H */ 388/* Update SCIR state */
389static inline void uv_set_scir_bits(unsigned char value)
390{
391 if (uv_hub_info->scir.state != value) {
392 uv_hub_info->scir.state = value;
393 uv_write_local_mmr8(uv_hub_info->scir.offset, value);
394 }
395}
396static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
397{
398 if (uv_cpu_hub_info(cpu)->scir.state != value) {
399 uv_cpu_hub_info(cpu)->scir.state = value;
400 uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value);
401 }
402}
355 403
404#endif /* _ASM_X86_UV_UV_HUB_H */
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
new file mode 100644
index 000000000000..593636275238
--- /dev/null
+++ b/arch/x86/include/asm/virtext.h
@@ -0,0 +1,132 @@
1/* CPU virtualization extensions handling
2 *
3 * This should carry the code for handling CPU virtualization extensions
4 * that needs to live in the kernel core.
5 *
6 * Author: Eduardo Habkost <ehabkost@redhat.com>
7 *
8 * Copyright (C) 2008, Red Hat Inc.
9 *
10 * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc.
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 */
15#ifndef _ASM_X86_VIRTEX_H
16#define _ASM_X86_VIRTEX_H
17
18#include <asm/processor.h>
19#include <asm/system.h>
20
21#include <asm/vmx.h>
22#include <asm/svm.h>
23
24/*
25 * VMX functions:
26 */
27
28static inline int cpu_has_vmx(void)
29{
30 unsigned long ecx = cpuid_ecx(1);
31 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
32}
33
34
35/** Disable VMX on the current CPU
36 *
37 * vmxoff causes a undefined-opcode exception if vmxon was not run
38 * on the CPU previously. Only call this function if you know VMX
39 * is enabled.
40 */
41static inline void cpu_vmxoff(void)
42{
43 asm volatile (ASM_VMX_VMXOFF : : : "cc");
44 write_cr4(read_cr4() & ~X86_CR4_VMXE);
45}
46
47static inline int cpu_vmx_enabled(void)
48{
49 return read_cr4() & X86_CR4_VMXE;
50}
51
52/** Disable VMX if it is enabled on the current CPU
53 *
54 * You shouldn't call this if cpu_has_vmx() returns 0.
55 */
56static inline void __cpu_emergency_vmxoff(void)
57{
58 if (cpu_vmx_enabled())
59 cpu_vmxoff();
60}
61
62/** Disable VMX if it is supported and enabled on the current CPU
63 */
64static inline void cpu_emergency_vmxoff(void)
65{
66 if (cpu_has_vmx())
67 __cpu_emergency_vmxoff();
68}
69
70
71
72
73/*
74 * SVM functions:
75 */
76
77/** Check if the CPU has SVM support
78 *
79 * You can use the 'msg' arg to get a message describing the problem,
80 * if the function returns zero. Simply pass NULL if you are not interested
81 * on the messages; gcc should take care of not generating code for
82 * the messages on this case.
83 */
84static inline int cpu_has_svm(const char **msg)
85{
86 uint32_t eax, ebx, ecx, edx;
87
88 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
89 if (msg)
90 *msg = "not amd";
91 return 0;
92 }
93
94 cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
95 if (eax < SVM_CPUID_FUNC) {
96 if (msg)
97 *msg = "can't execute cpuid_8000000a";
98 return 0;
99 }
100
101 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
102 if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
103 if (msg)
104 *msg = "svm not available";
105 return 0;
106 }
107 return 1;
108}
109
110
111/** Disable SVM on the current CPU
112 *
113 * You should call this only if cpu_has_svm() returned true.
114 */
115static inline void cpu_svm_disable(void)
116{
117 uint64_t efer;
118
119 wrmsrl(MSR_VM_HSAVE_PA, 0);
120 rdmsrl(MSR_EFER, efer);
121 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
122}
123
124/** Makes sure SVM is disabled, if it is supported on the CPU
125 */
126static inline void cpu_emergency_svm_disable(void)
127{
128 if (cpu_has_svm(NULL))
129 cpu_svm_disable();
130}
131
132#endif /* _ASM_X86_VIRTEX_H */
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
index b7c0dea119fe..61e08c0a2907 100644
--- a/arch/x86/include/asm/vmi.h
+++ b/arch/x86/include/asm/vmi.h
@@ -223,9 +223,15 @@ struct pci_header {
223} __attribute__((packed)); 223} __attribute__((packed));
224 224
225/* Function prototypes for bootstrapping */ 225/* Function prototypes for bootstrapping */
226#ifdef CONFIG_VMI
226extern void vmi_init(void); 227extern void vmi_init(void);
228extern void vmi_activate(void);
227extern void vmi_bringup(void); 229extern void vmi_bringup(void);
228extern void vmi_apply_boot_page_allocations(void); 230#else
231static inline void vmi_init(void) {}
232static inline void vmi_activate(void) {}
233static inline void vmi_bringup(void) {}
234#endif
229 235
230/* State needed to start an application processor in an SMP system. */ 236/* State needed to start an application processor in an SMP system. */
231struct vmi_ap_state { 237struct vmi_ap_state {
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
new file mode 100644
index 000000000000..c11b7e100d83
--- /dev/null
+++ b/arch/x86/include/asm/vmware.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2008, VMware, Inc.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
12 * NON INFRINGEMENT. See the GNU General Public License for more
13 * details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20#ifndef ASM_X86__VMWARE_H
21#define ASM_X86__VMWARE_H
22
23extern unsigned long vmware_get_tsc_khz(void);
24extern int vmware_platform(void);
25extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
26
27#endif
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/include/asm/vmx.h
index ec5edc339da6..d0238e6151d8 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -63,10 +63,13 @@
63 63
64#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 64#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
65#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 65#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
66#define VM_EXIT_SAVE_IA32_PAT 0x00040000
67#define VM_EXIT_LOAD_IA32_PAT 0x00080000
66 68
67#define VM_ENTRY_IA32E_MODE 0x00000200 69#define VM_ENTRY_IA32E_MODE 0x00000200
68#define VM_ENTRY_SMM 0x00000400 70#define VM_ENTRY_SMM 0x00000400
69#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 71#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
72#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
70 73
71/* VMCS Encodings */ 74/* VMCS Encodings */
72enum vmcs_field { 75enum vmcs_field {
@@ -112,6 +115,8 @@ enum vmcs_field {
112 VMCS_LINK_POINTER_HIGH = 0x00002801, 115 VMCS_LINK_POINTER_HIGH = 0x00002801,
113 GUEST_IA32_DEBUGCTL = 0x00002802, 116 GUEST_IA32_DEBUGCTL = 0x00002802,
114 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, 117 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
118 GUEST_IA32_PAT = 0x00002804,
119 GUEST_IA32_PAT_HIGH = 0x00002805,
115 GUEST_PDPTR0 = 0x0000280a, 120 GUEST_PDPTR0 = 0x0000280a,
116 GUEST_PDPTR0_HIGH = 0x0000280b, 121 GUEST_PDPTR0_HIGH = 0x0000280b,
117 GUEST_PDPTR1 = 0x0000280c, 122 GUEST_PDPTR1 = 0x0000280c,
@@ -120,6 +125,8 @@ enum vmcs_field {
120 GUEST_PDPTR2_HIGH = 0x0000280f, 125 GUEST_PDPTR2_HIGH = 0x0000280f,
121 GUEST_PDPTR3 = 0x00002810, 126 GUEST_PDPTR3 = 0x00002810,
122 GUEST_PDPTR3_HIGH = 0x00002811, 127 GUEST_PDPTR3_HIGH = 0x00002811,
128 HOST_IA32_PAT = 0x00002c00,
129 HOST_IA32_PAT_HIGH = 0x00002c01,
123 PIN_BASED_VM_EXEC_CONTROL = 0x00004000, 130 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
124 CPU_BASED_VM_EXEC_CONTROL = 0x00004002, 131 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
125 EXCEPTION_BITMAP = 0x00004004, 132 EXCEPTION_BITMAP = 0x00004004,
@@ -331,8 +338,9 @@ enum vmcs_field {
331 338
332#define AR_RESERVD_MASK 0xfffe0f00 339#define AR_RESERVD_MASK 0xfffe0f00
333 340
334#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 341#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0)
335#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 342#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1)
343#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2)
336 344
337#define VMX_NR_VPIDS (1 << 16) 345#define VMX_NR_VPIDS (1 << 16)
338#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 346#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
@@ -356,4 +364,19 @@ enum vmcs_field {
356 364
357#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 365#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
358 366
367
368#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
369#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
370#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
371#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
372#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
373#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
374#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
375#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
376#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
377#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
378#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
379
380
381
359#endif 382#endif
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 3f6000d95fe2..5e79ca694326 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -33,8 +33,14 @@
33#ifndef _ASM_X86_XEN_HYPERCALL_H 33#ifndef _ASM_X86_XEN_HYPERCALL_H
34#define _ASM_X86_XEN_HYPERCALL_H 34#define _ASM_X86_XEN_HYPERCALL_H
35 35
36#include <linux/kernel.h>
37#include <linux/spinlock.h>
36#include <linux/errno.h> 38#include <linux/errno.h>
37#include <linux/string.h> 39#include <linux/string.h>
40#include <linux/types.h>
41
42#include <asm/page.h>
43#include <asm/pgtable.h>
38 44
39#include <xen/interface/xen.h> 45#include <xen/interface/xen.h>
40#include <xen/interface/sched.h> 46#include <xen/interface/sched.h>
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index a38d25ac87d2..81fbd735aec4 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -33,39 +33,10 @@
33#ifndef _ASM_X86_XEN_HYPERVISOR_H 33#ifndef _ASM_X86_XEN_HYPERVISOR_H
34#define _ASM_X86_XEN_HYPERVISOR_H 34#define _ASM_X86_XEN_HYPERVISOR_H
35 35
36#include <linux/types.h>
37#include <linux/kernel.h>
38
39#include <xen/interface/xen.h>
40#include <xen/interface/version.h>
41
42#include <asm/ptrace.h>
43#include <asm/page.h>
44#include <asm/desc.h>
45#if defined(__i386__)
46# ifdef CONFIG_X86_PAE
47# include <asm-generic/pgtable-nopud.h>
48# else
49# include <asm-generic/pgtable-nopmd.h>
50# endif
51#endif
52#include <asm/xen/hypercall.h>
53
54/* arch/i386/kernel/setup.c */ 36/* arch/i386/kernel/setup.c */
55extern struct shared_info *HYPERVISOR_shared_info; 37extern struct shared_info *HYPERVISOR_shared_info;
56extern struct start_info *xen_start_info; 38extern struct start_info *xen_start_info;
57 39
58/* arch/i386/mach-xen/evtchn.c */
59/* Force a proper event-channel callback from Xen. */
60extern void force_evtchn_callback(void);
61
62/* Turn jiffies into Xen system time. */
63u64 jiffies_to_st(unsigned long jiffies);
64
65
66#define MULTI_UVMFLAGS_INDEX 3
67#define MULTI_UVMDOMID_INDEX 4
68
69enum xen_domain_type { 40enum xen_domain_type {
70 XEN_NATIVE, 41 XEN_NATIVE,
71 XEN_PV_DOMAIN, 42 XEN_PV_DOMAIN,
@@ -74,9 +45,15 @@ enum xen_domain_type {
74 45
75extern enum xen_domain_type xen_domain_type; 46extern enum xen_domain_type xen_domain_type;
76 47
48#ifdef CONFIG_XEN
77#define xen_domain() (xen_domain_type != XEN_NATIVE) 49#define xen_domain() (xen_domain_type != XEN_NATIVE)
78#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) 50#else
51#define xen_domain() (0)
52#endif
53
54#define xen_pv_domain() (xen_domain() && xen_domain_type == XEN_PV_DOMAIN)
55#define xen_hvm_domain() (xen_domain() && xen_domain_type == XEN_HVM_DOMAIN)
56
79#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN) 57#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN)
80#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN)
81 58
82#endif /* _ASM_X86_XEN_HYPERVISOR_H */ 59#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index bc628998a1b9..7ef617ef1df3 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -1,11 +1,16 @@
1#ifndef _ASM_X86_XEN_PAGE_H 1#ifndef _ASM_X86_XEN_PAGE_H
2#define _ASM_X86_XEN_PAGE_H 2#define _ASM_X86_XEN_PAGE_H
3 3
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/spinlock.h>
4#include <linux/pfn.h> 7#include <linux/pfn.h>
5 8
6#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <asm/page.h>
7#include <asm/pgtable.h> 11#include <asm/pgtable.h>
8 12
13#include <xen/interface/xen.h>
9#include <xen/features.h> 14#include <xen/features.h>
10 15
11/* Xen machine address */ 16/* Xen machine address */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b62a7667828e..d364df03c1d6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -12,6 +12,7 @@ CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg 12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt-spinlocks.o = -pg 13CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
14CFLAGS_REMOVE_ftrace.o = -pg 14CFLAGS_REMOVE_ftrace.o = -pg
15CFLAGS_REMOVE_early_printk.o = -pg
15endif 16endif
16 17
17# 18#
@@ -23,9 +24,9 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
23CFLAGS_hpet.o := $(nostackp) 24CFLAGS_hpet.o := $(nostackp)
24CFLAGS_tsc.o := $(nostackp) 25CFLAGS_tsc.o := $(nostackp)
25 26
26obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o 27obj-y := process_$(BITS).o signal.o entry_$(BITS).o
27obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 28obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
28obj-y += time_$(BITS).o ioport.o ldt.o 29obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
29obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o 30obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
30obj-$(CONFIG_X86_VISWS) += visws_quirks.o 31obj-$(CONFIG_X86_VISWS) += visws_quirks.o
31obj-$(CONFIG_X86_32) += probe_roms_32.o 32obj-$(CONFIG_X86_32) += probe_roms_32.o
@@ -65,6 +66,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
65obj-$(CONFIG_X86_IO_APIC) += io_apic.o 66obj-$(CONFIG_X86_IO_APIC) += io_apic.o
66obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 67obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
67obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 68obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
69obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
68obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 70obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
69obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 71obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
70obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 72obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
@@ -105,6 +107,10 @@ microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
105microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o 107microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
106obj-$(CONFIG_MICROCODE) += microcode.o 108obj-$(CONFIG_MICROCODE) += microcode.o
107 109
110obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
111
112obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
113
108### 114###
109# 64 bit specific files 115# 64 bit specific files
110ifeq ($(CONFIG_X86_64),y) 116ifeq ($(CONFIG_X86_64),y)
@@ -118,7 +124,6 @@ ifeq ($(CONFIG_X86_64),y)
118 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 124 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
119 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o 125 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
120 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o 126 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
121 obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
122 127
123 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o 128 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
124endif 129endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4c51a2f8fd31..29dc0c89d4af 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -538,9 +538,10 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
538 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; 538 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
539 union acpi_object *obj; 539 union acpi_object *obj;
540 struct acpi_madt_local_apic *lapic; 540 struct acpi_madt_local_apic *lapic;
541 cpumask_t tmp_map, new_map; 541 cpumask_var_t tmp_map, new_map;
542 u8 physid; 542 u8 physid;
543 int cpu; 543 int cpu;
544 int retval = -ENOMEM;
544 545
545 if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) 546 if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
546 return -EINVAL; 547 return -EINVAL;
@@ -569,23 +570,37 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
569 buffer.length = ACPI_ALLOCATE_BUFFER; 570 buffer.length = ACPI_ALLOCATE_BUFFER;
570 buffer.pointer = NULL; 571 buffer.pointer = NULL;
571 572
572 tmp_map = cpu_present_map; 573 if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL))
574 goto out;
575
576 if (!alloc_cpumask_var(&new_map, GFP_KERNEL))
577 goto free_tmp_map;
578
579 cpumask_copy(tmp_map, cpu_present_mask);
573 acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED); 580 acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
574 581
575 /* 582 /*
576 * If mp_register_lapic successfully generates a new logical cpu 583 * If mp_register_lapic successfully generates a new logical cpu
577 * number, then the following will get us exactly what was mapped 584 * number, then the following will get us exactly what was mapped
578 */ 585 */
579 cpus_andnot(new_map, cpu_present_map, tmp_map); 586 cpumask_andnot(new_map, cpu_present_mask, tmp_map);
580 if (cpus_empty(new_map)) { 587 if (cpumask_empty(new_map)) {
581 printk ("Unable to map lapic to logical cpu number\n"); 588 printk ("Unable to map lapic to logical cpu number\n");
582 return -EINVAL; 589 retval = -EINVAL;
590 goto free_new_map;
583 } 591 }
584 592
585 cpu = first_cpu(new_map); 593 cpu = cpumask_first(new_map);
586 594
587 *pcpu = cpu; 595 *pcpu = cpu;
588 return 0; 596 retval = 0;
597
598free_new_map:
599 free_cpumask_var(new_map);
600free_tmp_map:
601 free_cpumask_var(tmp_map);
602out:
603 return retval;
589} 604}
590 605
591/* wrapper to silence section mismatch warning */ 606/* wrapper to silence section mismatch warning */
@@ -598,7 +613,7 @@ EXPORT_SYMBOL(acpi_map_lsapic);
598int acpi_unmap_lsapic(int cpu) 613int acpi_unmap_lsapic(int cpu)
599{ 614{
600 per_cpu(x86_cpu_to_apicid, cpu) = -1; 615 per_cpu(x86_cpu_to_apicid, cpu) = -1;
601 cpu_clear(cpu, cpu_present_map); 616 set_cpu_present(cpu, false);
602 num_processors--; 617 num_processors--;
603 618
604 return (0); 619 return (0);
@@ -1360,6 +1375,17 @@ static void __init acpi_process_madt(void)
1360 disable_acpi(); 1375 disable_acpi();
1361 } 1376 }
1362 } 1377 }
1378
1379 /*
1380 * ACPI supports both logical (e.g. Hyper-Threading) and physical
1381 * processors, where MPS only supports physical.
1382 */
1383 if (acpi_lapic && acpi_ioapic)
1384 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
1385 "information\n");
1386 else if (acpi_lapic)
1387 printk(KERN_INFO "Using ACPI for processor (LAPIC) "
1388 "configuration information\n");
1363#endif 1389#endif
1364 return; 1390 return;
1365} 1391}
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 5662e226b0c9..5113c080f0c4 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -20,10 +20,15 @@
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/gfp.h> 21#include <linux/gfp.h>
22#include <linux/bitops.h> 22#include <linux/bitops.h>
23#include <linux/debugfs.h>
23#include <linux/scatterlist.h> 24#include <linux/scatterlist.h>
24#include <linux/iommu-helper.h> 25#include <linux/iommu-helper.h>
26#ifdef CONFIG_IOMMU_API
27#include <linux/iommu.h>
28#endif
25#include <asm/proto.h> 29#include <asm/proto.h>
26#include <asm/iommu.h> 30#include <asm/iommu.h>
31#include <asm/gart.h>
27#include <asm/amd_iommu_types.h> 32#include <asm/amd_iommu_types.h>
28#include <asm/amd_iommu.h> 33#include <asm/amd_iommu.h>
29 34
@@ -37,6 +42,10 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
37static LIST_HEAD(iommu_pd_list); 42static LIST_HEAD(iommu_pd_list);
38static DEFINE_SPINLOCK(iommu_pd_list_lock); 43static DEFINE_SPINLOCK(iommu_pd_list_lock);
39 44
45#ifdef CONFIG_IOMMU_API
46static struct iommu_ops amd_iommu_ops;
47#endif
48
40/* 49/*
41 * general struct to manage commands send to an IOMMU 50 * general struct to manage commands send to an IOMMU
42 */ 51 */
@@ -46,6 +55,68 @@ struct iommu_cmd {
46 55
47static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 56static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
48 struct unity_map_entry *e); 57 struct unity_map_entry *e);
58static struct dma_ops_domain *find_protection_domain(u16 devid);
59
60
61#ifdef CONFIG_AMD_IOMMU_STATS
62
63/*
64 * Initialization code for statistics collection
65 */
66
67DECLARE_STATS_COUNTER(compl_wait);
68DECLARE_STATS_COUNTER(cnt_map_single);
69DECLARE_STATS_COUNTER(cnt_unmap_single);
70DECLARE_STATS_COUNTER(cnt_map_sg);
71DECLARE_STATS_COUNTER(cnt_unmap_sg);
72DECLARE_STATS_COUNTER(cnt_alloc_coherent);
73DECLARE_STATS_COUNTER(cnt_free_coherent);
74DECLARE_STATS_COUNTER(cross_page);
75DECLARE_STATS_COUNTER(domain_flush_single);
76DECLARE_STATS_COUNTER(domain_flush_all);
77DECLARE_STATS_COUNTER(alloced_io_mem);
78DECLARE_STATS_COUNTER(total_map_requests);
79
80static struct dentry *stats_dir;
81static struct dentry *de_isolate;
82static struct dentry *de_fflush;
83
84static void amd_iommu_stats_add(struct __iommu_counter *cnt)
85{
86 if (stats_dir == NULL)
87 return;
88
89 cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
90 &cnt->value);
91}
92
93static void amd_iommu_stats_init(void)
94{
95 stats_dir = debugfs_create_dir("amd-iommu", NULL);
96 if (stats_dir == NULL)
97 return;
98
99 de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
100 (u32 *)&amd_iommu_isolate);
101
102 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
103 (u32 *)&amd_iommu_unmap_flush);
104
105 amd_iommu_stats_add(&compl_wait);
106 amd_iommu_stats_add(&cnt_map_single);
107 amd_iommu_stats_add(&cnt_unmap_single);
108 amd_iommu_stats_add(&cnt_map_sg);
109 amd_iommu_stats_add(&cnt_unmap_sg);
110 amd_iommu_stats_add(&cnt_alloc_coherent);
111 amd_iommu_stats_add(&cnt_free_coherent);
112 amd_iommu_stats_add(&cross_page);
113 amd_iommu_stats_add(&domain_flush_single);
114 amd_iommu_stats_add(&domain_flush_all);
115 amd_iommu_stats_add(&alloced_io_mem);
116 amd_iommu_stats_add(&total_map_requests);
117}
118
119#endif
49 120
50/* returns !0 if the IOMMU is caching non-present entries in its TLB */ 121/* returns !0 if the IOMMU is caching non-present entries in its TLB */
51static int iommu_has_npcache(struct amd_iommu *iommu) 122static int iommu_has_npcache(struct amd_iommu *iommu)
@@ -188,13 +259,55 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
188 spin_lock_irqsave(&iommu->lock, flags); 259 spin_lock_irqsave(&iommu->lock, flags);
189 ret = __iommu_queue_command(iommu, cmd); 260 ret = __iommu_queue_command(iommu, cmd);
190 if (!ret) 261 if (!ret)
191 iommu->need_sync = 1; 262 iommu->need_sync = true;
192 spin_unlock_irqrestore(&iommu->lock, flags); 263 spin_unlock_irqrestore(&iommu->lock, flags);
193 264
194 return ret; 265 return ret;
195} 266}
196 267
197/* 268/*
269 * This function waits until an IOMMU has completed a completion
270 * wait command
271 */
272static void __iommu_wait_for_completion(struct amd_iommu *iommu)
273{
274 int ready = 0;
275 unsigned status = 0;
276 unsigned long i = 0;
277
278 INC_STATS_COUNTER(compl_wait);
279
280 while (!ready && (i < EXIT_LOOP_COUNT)) {
281 ++i;
282 /* wait for the bit to become one */
283 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
284 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
285 }
286
287 /* set bit back to zero */
288 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
289 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
290
291 if (unlikely(i == EXIT_LOOP_COUNT))
292 panic("AMD IOMMU: Completion wait loop failed\n");
293}
294
295/*
296 * This function queues a completion wait command into the command
297 * buffer of an IOMMU
298 */
299static int __iommu_completion_wait(struct amd_iommu *iommu)
300{
301 struct iommu_cmd cmd;
302
303 memset(&cmd, 0, sizeof(cmd));
304 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
305 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
306
307 return __iommu_queue_command(iommu, &cmd);
308}
309
310/*
198 * This function is called whenever we need to ensure that the IOMMU has 311 * This function is called whenever we need to ensure that the IOMMU has
199 * completed execution of all commands we sent. It sends a 312 * completed execution of all commands we sent. It sends a
200 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs 313 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
@@ -203,40 +316,23 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
203 */ 316 */
204static int iommu_completion_wait(struct amd_iommu *iommu) 317static int iommu_completion_wait(struct amd_iommu *iommu)
205{ 318{
206 int ret = 0, ready = 0; 319 int ret = 0;
207 unsigned status = 0; 320 unsigned long flags;
208 struct iommu_cmd cmd;
209 unsigned long flags, i = 0;
210
211 memset(&cmd, 0, sizeof(cmd));
212 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
213 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
214 321
215 spin_lock_irqsave(&iommu->lock, flags); 322 spin_lock_irqsave(&iommu->lock, flags);
216 323
217 if (!iommu->need_sync) 324 if (!iommu->need_sync)
218 goto out; 325 goto out;
219 326
220 iommu->need_sync = 0; 327 ret = __iommu_completion_wait(iommu);
221 328
222 ret = __iommu_queue_command(iommu, &cmd); 329 iommu->need_sync = false;
223 330
224 if (ret) 331 if (ret)
225 goto out; 332 goto out;
226 333
227 while (!ready && (i < EXIT_LOOP_COUNT)) { 334 __iommu_wait_for_completion(iommu);
228 ++i;
229 /* wait for the bit to become one */
230 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
231 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
232 }
233
234 /* set bit back to zero */
235 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
236 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
237 335
238 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
239 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
240out: 336out:
241 spin_unlock_irqrestore(&iommu->lock, flags); 337 spin_unlock_irqrestore(&iommu->lock, flags);
242 338
@@ -262,6 +358,21 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
262 return ret; 358 return ret;
263} 359}
264 360
361static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
362 u16 domid, int pde, int s)
363{
364 memset(cmd, 0, sizeof(*cmd));
365 address &= PAGE_MASK;
366 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
367 cmd->data[1] |= domid;
368 cmd->data[2] = lower_32_bits(address);
369 cmd->data[3] = upper_32_bits(address);
370 if (s) /* size bit - we flush more than one 4kb page */
371 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
372 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
373 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
374}
375
265/* 376/*
266 * Generic command send function for invalidaing TLB entries 377 * Generic command send function for invalidaing TLB entries
267 */ 378 */
@@ -271,16 +382,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
271 struct iommu_cmd cmd; 382 struct iommu_cmd cmd;
272 int ret; 383 int ret;
273 384
274 memset(&cmd, 0, sizeof(cmd)); 385 __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s);
275 address &= PAGE_MASK;
276 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
277 cmd.data[1] |= domid;
278 cmd.data[2] = lower_32_bits(address);
279 cmd.data[3] = upper_32_bits(address);
280 if (s) /* size bit - we flush more than one 4kb page */
281 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
282 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
283 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
284 386
285 ret = iommu_queue_command(iommu, &cmd); 387 ret = iommu_queue_command(iommu, &cmd);
286 388
@@ -319,9 +421,35 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
319{ 421{
320 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 422 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
321 423
424 INC_STATS_COUNTER(domain_flush_single);
425
322 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); 426 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
323} 427}
324 428
429/*
430 * This function is used to flush the IO/TLB for a given protection domain
431 * on every IOMMU in the system
432 */
433static void iommu_flush_domain(u16 domid)
434{
435 unsigned long flags;
436 struct amd_iommu *iommu;
437 struct iommu_cmd cmd;
438
439 INC_STATS_COUNTER(domain_flush_all);
440
441 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
442 domid, 1, 1);
443
444 list_for_each_entry(iommu, &amd_iommu_list, list) {
445 spin_lock_irqsave(&iommu->lock, flags);
446 __iommu_queue_command(iommu, &cmd);
447 __iommu_completion_wait(iommu);
448 __iommu_wait_for_completion(iommu);
449 spin_unlock_irqrestore(&iommu->lock, flags);
450 }
451}
452
325/**************************************************************************** 453/****************************************************************************
326 * 454 *
327 * The functions below are used the create the page table mappings for 455 * The functions below are used the create the page table mappings for
@@ -336,15 +464,15 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
336 * supporting all features of AMD IOMMU page tables like level skipping 464 * supporting all features of AMD IOMMU page tables like level skipping
337 * and full 64 bit address spaces. 465 * and full 64 bit address spaces.
338 */ 466 */
339static int iommu_map(struct protection_domain *dom, 467static int iommu_map_page(struct protection_domain *dom,
340 unsigned long bus_addr, 468 unsigned long bus_addr,
341 unsigned long phys_addr, 469 unsigned long phys_addr,
342 int prot) 470 int prot)
343{ 471{
344 u64 __pte, *pte, *page; 472 u64 __pte, *pte, *page;
345 473
346 bus_addr = PAGE_ALIGN(bus_addr); 474 bus_addr = PAGE_ALIGN(bus_addr);
347 phys_addr = PAGE_ALIGN(bus_addr); 475 phys_addr = PAGE_ALIGN(phys_addr);
348 476
349 /* only support 512GB address spaces for now */ 477 /* only support 512GB address spaces for now */
350 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) 478 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
@@ -386,6 +514,28 @@ static int iommu_map(struct protection_domain *dom,
386 return 0; 514 return 0;
387} 515}
388 516
517static void iommu_unmap_page(struct protection_domain *dom,
518 unsigned long bus_addr)
519{
520 u64 *pte;
521
522 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
523
524 if (!IOMMU_PTE_PRESENT(*pte))
525 return;
526
527 pte = IOMMU_PTE_PAGE(*pte);
528 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
529
530 if (!IOMMU_PTE_PRESENT(*pte))
531 return;
532
533 pte = IOMMU_PTE_PAGE(*pte);
534 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
535
536 *pte = 0;
537}
538
389/* 539/*
390 * This function checks if a specific unity mapping entry is needed for 540 * This function checks if a specific unity mapping entry is needed for
391 * this specific IOMMU. 541 * this specific IOMMU.
@@ -438,7 +588,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
438 588
439 for (addr = e->address_start; addr < e->address_end; 589 for (addr = e->address_start; addr < e->address_end;
440 addr += PAGE_SIZE) { 590 addr += PAGE_SIZE) {
441 ret = iommu_map(&dma_dom->domain, addr, addr, e->prot); 591 ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot);
442 if (ret) 592 if (ret)
443 return ret; 593 return ret;
444 /* 594 /*
@@ -569,6 +719,16 @@ static u16 domain_id_alloc(void)
569 return id; 719 return id;
570} 720}
571 721
722static void domain_id_free(int id)
723{
724 unsigned long flags;
725
726 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
727 if (id > 0 && id < MAX_DOMAIN_ID)
728 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
729 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
730}
731
572/* 732/*
573 * Used to reserve address ranges in the aperture (e.g. for exclusion 733 * Used to reserve address ranges in the aperture (e.g. for exclusion
574 * ranges. 734 * ranges.
@@ -585,12 +745,12 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
585 iommu_area_reserve(dom->bitmap, start_page, pages); 745 iommu_area_reserve(dom->bitmap, start_page, pages);
586} 746}
587 747
588static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) 748static void free_pagetable(struct protection_domain *domain)
589{ 749{
590 int i, j; 750 int i, j;
591 u64 *p1, *p2, *p3; 751 u64 *p1, *p2, *p3;
592 752
593 p1 = dma_dom->domain.pt_root; 753 p1 = domain->pt_root;
594 754
595 if (!p1) 755 if (!p1)
596 return; 756 return;
@@ -600,7 +760,7 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
600 continue; 760 continue;
601 761
602 p2 = IOMMU_PTE_PAGE(p1[i]); 762 p2 = IOMMU_PTE_PAGE(p1[i]);
603 for (j = 0; j < 512; ++i) { 763 for (j = 0; j < 512; ++j) {
604 if (!IOMMU_PTE_PRESENT(p2[j])) 764 if (!IOMMU_PTE_PRESENT(p2[j]))
605 continue; 765 continue;
606 p3 = IOMMU_PTE_PAGE(p2[j]); 766 p3 = IOMMU_PTE_PAGE(p2[j]);
@@ -611,6 +771,8 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
611 } 771 }
612 772
613 free_page((unsigned long)p1); 773 free_page((unsigned long)p1);
774
775 domain->pt_root = NULL;
614} 776}
615 777
616/* 778/*
@@ -622,7 +784,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
622 if (!dom) 784 if (!dom)
623 return; 785 return;
624 786
625 dma_ops_free_pagetable(dom); 787 free_pagetable(&dom->domain);
626 788
627 kfree(dom->pte_pages); 789 kfree(dom->pte_pages);
628 790
@@ -661,6 +823,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
661 goto free_dma_dom; 823 goto free_dma_dom;
662 dma_dom->domain.mode = PAGE_MODE_3_LEVEL; 824 dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
663 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 825 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
826 dma_dom->domain.flags = PD_DMA_OPS_MASK;
664 dma_dom->domain.priv = dma_dom; 827 dma_dom->domain.priv = dma_dom;
665 if (!dma_dom->domain.pt_root) 828 if (!dma_dom->domain.pt_root)
666 goto free_dma_dom; 829 goto free_dma_dom;
@@ -723,6 +886,15 @@ free_dma_dom:
723} 886}
724 887
725/* 888/*
889 * little helper function to check whether a given protection domain is a
890 * dma_ops domain
891 */
892static bool dma_ops_domain(struct protection_domain *domain)
893{
894 return domain->flags & PD_DMA_OPS_MASK;
895}
896
897/*
726 * Find out the protection domain structure for a given PCI device. This 898 * Find out the protection domain structure for a given PCI device. This
727 * will give us the pointer to the page table root for example. 899 * will give us the pointer to the page table root for example.
728 */ 900 */
@@ -742,14 +914,15 @@ static struct protection_domain *domain_for_device(u16 devid)
742 * If a device is not yet associated with a domain, this function does 914 * If a device is not yet associated with a domain, this function does
743 * assigns it visible for the hardware 915 * assigns it visible for the hardware
744 */ 916 */
745static void set_device_domain(struct amd_iommu *iommu, 917static void attach_device(struct amd_iommu *iommu,
746 struct protection_domain *domain, 918 struct protection_domain *domain,
747 u16 devid) 919 u16 devid)
748{ 920{
749 unsigned long flags; 921 unsigned long flags;
750
751 u64 pte_root = virt_to_phys(domain->pt_root); 922 u64 pte_root = virt_to_phys(domain->pt_root);
752 923
924 domain->dev_cnt += 1;
925
753 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) 926 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
754 << DEV_ENTRY_MODE_SHIFT; 927 << DEV_ENTRY_MODE_SHIFT;
755 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; 928 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
@@ -765,6 +938,116 @@ static void set_device_domain(struct amd_iommu *iommu,
765 iommu_queue_inv_dev_entry(iommu, devid); 938 iommu_queue_inv_dev_entry(iommu, devid);
766} 939}
767 940
941/*
942 * Removes a device from a protection domain (unlocked)
943 */
944static void __detach_device(struct protection_domain *domain, u16 devid)
945{
946
947 /* lock domain */
948 spin_lock(&domain->lock);
949
950 /* remove domain from the lookup table */
951 amd_iommu_pd_table[devid] = NULL;
952
953 /* remove entry from the device table seen by the hardware */
954 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
955 amd_iommu_dev_table[devid].data[1] = 0;
956 amd_iommu_dev_table[devid].data[2] = 0;
957
958 /* decrease reference counter */
959 domain->dev_cnt -= 1;
960
961 /* ready */
962 spin_unlock(&domain->lock);
963}
964
965/*
966 * Removes a device from a protection domain (with devtable_lock held)
967 */
968static void detach_device(struct protection_domain *domain, u16 devid)
969{
970 unsigned long flags;
971
972 /* lock device table */
973 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
974 __detach_device(domain, devid);
975 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
976}
977
978static int device_change_notifier(struct notifier_block *nb,
979 unsigned long action, void *data)
980{
981 struct device *dev = data;
982 struct pci_dev *pdev = to_pci_dev(dev);
983 u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
984 struct protection_domain *domain;
985 struct dma_ops_domain *dma_domain;
986 struct amd_iommu *iommu;
987 int order = amd_iommu_aperture_order;
988 unsigned long flags;
989
990 if (devid > amd_iommu_last_bdf)
991 goto out;
992
993 devid = amd_iommu_alias_table[devid];
994
995 iommu = amd_iommu_rlookup_table[devid];
996 if (iommu == NULL)
997 goto out;
998
999 domain = domain_for_device(devid);
1000
1001 if (domain && !dma_ops_domain(domain))
1002 WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound "
1003 "to a non-dma-ops domain\n", dev_name(dev));
1004
1005 switch (action) {
1006 case BUS_NOTIFY_BOUND_DRIVER:
1007 if (domain)
1008 goto out;
1009 dma_domain = find_protection_domain(devid);
1010 if (!dma_domain)
1011 dma_domain = iommu->default_dom;
1012 attach_device(iommu, &dma_domain->domain, devid);
1013 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
1014 "device %s\n", dma_domain->domain.id, dev_name(dev));
1015 break;
1016 case BUS_NOTIFY_UNBIND_DRIVER:
1017 if (!domain)
1018 goto out;
1019 detach_device(domain, devid);
1020 break;
1021 case BUS_NOTIFY_ADD_DEVICE:
1022 /* allocate a protection domain if a device is added */
1023 dma_domain = find_protection_domain(devid);
1024 if (dma_domain)
1025 goto out;
1026 dma_domain = dma_ops_domain_alloc(iommu, order);
1027 if (!dma_domain)
1028 goto out;
1029 dma_domain->target_dev = devid;
1030
1031 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1032 list_add_tail(&dma_domain->list, &iommu_pd_list);
1033 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1034
1035 break;
1036 default:
1037 goto out;
1038 }
1039
1040 iommu_queue_inv_dev_entry(iommu, devid);
1041 iommu_completion_wait(iommu);
1042
1043out:
1044 return 0;
1045}
1046
1047struct notifier_block device_nb = {
1048 .notifier_call = device_change_notifier,
1049};
1050
768/***************************************************************************** 1051/*****************************************************************************
769 * 1052 *
770 * The next functions belong to the dma_ops mapping/unmapping code. 1053 * The next functions belong to the dma_ops mapping/unmapping code.
@@ -800,7 +1083,6 @@ static struct dma_ops_domain *find_protection_domain(u16 devid)
800 list_for_each_entry(entry, &iommu_pd_list, list) { 1083 list_for_each_entry(entry, &iommu_pd_list, list) {
801 if (entry->target_dev == devid) { 1084 if (entry->target_dev == devid) {
802 ret = entry; 1085 ret = entry;
803 list_del(&ret->list);
804 break; 1086 break;
805 } 1087 }
806 } 1088 }
@@ -851,14 +1133,13 @@ static int get_device_resources(struct device *dev,
851 if (!dma_dom) 1133 if (!dma_dom)
852 dma_dom = (*iommu)->default_dom; 1134 dma_dom = (*iommu)->default_dom;
853 *domain = &dma_dom->domain; 1135 *domain = &dma_dom->domain;
854 set_device_domain(*iommu, *domain, *bdf); 1136 attach_device(*iommu, *domain, *bdf);
855 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 1137 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
856 "device ", (*domain)->id); 1138 "device %s\n", (*domain)->id, dev_name(dev));
857 print_devid(_bdf, 1);
858 } 1139 }
859 1140
860 if (domain_for_device(_bdf) == NULL) 1141 if (domain_for_device(_bdf) == NULL)
861 set_device_domain(*iommu, *domain, _bdf); 1142 attach_device(*iommu, *domain, _bdf);
862 1143
863 return 1; 1144 return 1;
864} 1145}
@@ -910,7 +1191,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
910 if (address >= dom->aperture_size) 1191 if (address >= dom->aperture_size)
911 return; 1192 return;
912 1193
913 WARN_ON(address & 0xfffULL || address > dom->aperture_size); 1194 WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
914 1195
915 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; 1196 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
916 pte += IOMMU_PTE_L0_INDEX(address); 1197 pte += IOMMU_PTE_L0_INDEX(address);
@@ -922,8 +1203,8 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
922 1203
923/* 1204/*
924 * This function contains common code for mapping of a physically 1205 * This function contains common code for mapping of a physically
925 * contiguous memory region into DMA address space. It is uses by all 1206 * contiguous memory region into DMA address space. It is used by all
926 * mapping functions provided by this IOMMU driver. 1207 * mapping functions provided with this IOMMU driver.
927 * Must be called with the domain lock held. 1208 * Must be called with the domain lock held.
928 */ 1209 */
929static dma_addr_t __map_single(struct device *dev, 1210static dma_addr_t __map_single(struct device *dev,
@@ -944,6 +1225,11 @@ static dma_addr_t __map_single(struct device *dev,
944 pages = iommu_num_pages(paddr, size, PAGE_SIZE); 1225 pages = iommu_num_pages(paddr, size, PAGE_SIZE);
945 paddr &= PAGE_MASK; 1226 paddr &= PAGE_MASK;
946 1227
1228 INC_STATS_COUNTER(total_map_requests);
1229
1230 if (pages > 1)
1231 INC_STATS_COUNTER(cross_page);
1232
947 if (align) 1233 if (align)
948 align_mask = (1UL << get_order(size)) - 1; 1234 align_mask = (1UL << get_order(size)) - 1;
949 1235
@@ -960,6 +1246,8 @@ static dma_addr_t __map_single(struct device *dev,
960 } 1246 }
961 address += offset; 1247 address += offset;
962 1248
1249 ADD_STATS_COUNTER(alloced_io_mem, size);
1250
963 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 1251 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
964 iommu_flush_tlb(iommu, dma_dom->domain.id); 1252 iommu_flush_tlb(iommu, dma_dom->domain.id);
965 dma_dom->need_flush = false; 1253 dma_dom->need_flush = false;
@@ -983,7 +1271,8 @@ static void __unmap_single(struct amd_iommu *iommu,
983 dma_addr_t i, start; 1271 dma_addr_t i, start;
984 unsigned int pages; 1272 unsigned int pages;
985 1273
986 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) 1274 if ((dma_addr == bad_dma_address) ||
1275 (dma_addr + size > dma_dom->aperture_size))
987 return; 1276 return;
988 1277
989 pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); 1278 pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
@@ -995,6 +1284,8 @@ static void __unmap_single(struct amd_iommu *iommu,
995 start += PAGE_SIZE; 1284 start += PAGE_SIZE;
996 } 1285 }
997 1286
1287 SUB_STATS_COUNTER(alloced_io_mem, size);
1288
998 dma_ops_free_addresses(dma_dom, dma_addr, pages); 1289 dma_ops_free_addresses(dma_dom, dma_addr, pages);
999 1290
1000 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 1291 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
@@ -1016,6 +1307,8 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
1016 dma_addr_t addr; 1307 dma_addr_t addr;
1017 u64 dma_mask; 1308 u64 dma_mask;
1018 1309
1310 INC_STATS_COUNTER(cnt_map_single);
1311
1019 if (!check_device(dev)) 1312 if (!check_device(dev))
1020 return bad_dma_address; 1313 return bad_dma_address;
1021 1314
@@ -1027,6 +1320,9 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
1027 /* device not handled by any AMD IOMMU */ 1320 /* device not handled by any AMD IOMMU */
1028 return (dma_addr_t)paddr; 1321 return (dma_addr_t)paddr;
1029 1322
1323 if (!dma_ops_domain(domain))
1324 return bad_dma_address;
1325
1030 spin_lock_irqsave(&domain->lock, flags); 1326 spin_lock_irqsave(&domain->lock, flags);
1031 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, 1327 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
1032 dma_mask); 1328 dma_mask);
@@ -1052,11 +1348,16 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
1052 struct protection_domain *domain; 1348 struct protection_domain *domain;
1053 u16 devid; 1349 u16 devid;
1054 1350
1351 INC_STATS_COUNTER(cnt_unmap_single);
1352
1055 if (!check_device(dev) || 1353 if (!check_device(dev) ||
1056 !get_device_resources(dev, &iommu, &domain, &devid)) 1354 !get_device_resources(dev, &iommu, &domain, &devid))
1057 /* device not handled by any AMD IOMMU */ 1355 /* device not handled by any AMD IOMMU */
1058 return; 1356 return;
1059 1357
1358 if (!dma_ops_domain(domain))
1359 return;
1360
1060 spin_lock_irqsave(&domain->lock, flags); 1361 spin_lock_irqsave(&domain->lock, flags);
1061 1362
1062 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1363 __unmap_single(iommu, domain->priv, dma_addr, size, dir);
@@ -1101,6 +1402,8 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1101 int mapped_elems = 0; 1402 int mapped_elems = 0;
1102 u64 dma_mask; 1403 u64 dma_mask;
1103 1404
1405 INC_STATS_COUNTER(cnt_map_sg);
1406
1104 if (!check_device(dev)) 1407 if (!check_device(dev))
1105 return 0; 1408 return 0;
1106 1409
@@ -1111,6 +1414,9 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1111 if (!iommu || !domain) 1414 if (!iommu || !domain)
1112 return map_sg_no_iommu(dev, sglist, nelems, dir); 1415 return map_sg_no_iommu(dev, sglist, nelems, dir);
1113 1416
1417 if (!dma_ops_domain(domain))
1418 return 0;
1419
1114 spin_lock_irqsave(&domain->lock, flags); 1420 spin_lock_irqsave(&domain->lock, flags);
1115 1421
1116 for_each_sg(sglist, s, nelems, i) { 1422 for_each_sg(sglist, s, nelems, i) {
@@ -1160,10 +1466,15 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
1160 u16 devid; 1466 u16 devid;
1161 int i; 1467 int i;
1162 1468
1469 INC_STATS_COUNTER(cnt_unmap_sg);
1470
1163 if (!check_device(dev) || 1471 if (!check_device(dev) ||
1164 !get_device_resources(dev, &iommu, &domain, &devid)) 1472 !get_device_resources(dev, &iommu, &domain, &devid))
1165 return; 1473 return;
1166 1474
1475 if (!dma_ops_domain(domain))
1476 return;
1477
1167 spin_lock_irqsave(&domain->lock, flags); 1478 spin_lock_irqsave(&domain->lock, flags);
1168 1479
1169 for_each_sg(sglist, s, nelems, i) { 1480 for_each_sg(sglist, s, nelems, i) {
@@ -1191,6 +1502,8 @@ static void *alloc_coherent(struct device *dev, size_t size,
1191 phys_addr_t paddr; 1502 phys_addr_t paddr;
1192 u64 dma_mask = dev->coherent_dma_mask; 1503 u64 dma_mask = dev->coherent_dma_mask;
1193 1504
1505 INC_STATS_COUNTER(cnt_alloc_coherent);
1506
1194 if (!check_device(dev)) 1507 if (!check_device(dev))
1195 return NULL; 1508 return NULL;
1196 1509
@@ -1209,6 +1522,9 @@ static void *alloc_coherent(struct device *dev, size_t size,
1209 return virt_addr; 1522 return virt_addr;
1210 } 1523 }
1211 1524
1525 if (!dma_ops_domain(domain))
1526 goto out_free;
1527
1212 if (!dma_mask) 1528 if (!dma_mask)
1213 dma_mask = *dev->dma_mask; 1529 dma_mask = *dev->dma_mask;
1214 1530
@@ -1217,18 +1533,20 @@ static void *alloc_coherent(struct device *dev, size_t size,
1217 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1533 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1218 size, DMA_BIDIRECTIONAL, true, dma_mask); 1534 size, DMA_BIDIRECTIONAL, true, dma_mask);
1219 1535
1220 if (*dma_addr == bad_dma_address) { 1536 if (*dma_addr == bad_dma_address)
1221 free_pages((unsigned long)virt_addr, get_order(size)); 1537 goto out_free;
1222 virt_addr = NULL;
1223 goto out;
1224 }
1225 1538
1226 iommu_completion_wait(iommu); 1539 iommu_completion_wait(iommu);
1227 1540
1228out:
1229 spin_unlock_irqrestore(&domain->lock, flags); 1541 spin_unlock_irqrestore(&domain->lock, flags);
1230 1542
1231 return virt_addr; 1543 return virt_addr;
1544
1545out_free:
1546
1547 free_pages((unsigned long)virt_addr, get_order(size));
1548
1549 return NULL;
1232} 1550}
1233 1551
1234/* 1552/*
@@ -1242,6 +1560,8 @@ static void free_coherent(struct device *dev, size_t size,
1242 struct protection_domain *domain; 1560 struct protection_domain *domain;
1243 u16 devid; 1561 u16 devid;
1244 1562
1563 INC_STATS_COUNTER(cnt_free_coherent);
1564
1245 if (!check_device(dev)) 1565 if (!check_device(dev))
1246 return; 1566 return;
1247 1567
@@ -1250,6 +1570,9 @@ static void free_coherent(struct device *dev, size_t size,
1250 if (!iommu || !domain) 1570 if (!iommu || !domain)
1251 goto free_mem; 1571 goto free_mem;
1252 1572
1573 if (!dma_ops_domain(domain))
1574 goto free_mem;
1575
1253 spin_lock_irqsave(&domain->lock, flags); 1576 spin_lock_irqsave(&domain->lock, flags);
1254 1577
1255 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 1578 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
@@ -1293,7 +1616,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask)
1293 * we don't need to preallocate the protection domains anymore. 1616 * we don't need to preallocate the protection domains anymore.
1294 * For now we have to. 1617 * For now we have to.
1295 */ 1618 */
1296void prealloc_protection_domains(void) 1619static void prealloc_protection_domains(void)
1297{ 1620{
1298 struct pci_dev *dev = NULL; 1621 struct pci_dev *dev = NULL;
1299 struct dma_ops_domain *dma_dom; 1622 struct dma_ops_domain *dma_dom;
@@ -1302,7 +1625,7 @@ void prealloc_protection_domains(void)
1302 u16 devid; 1625 u16 devid;
1303 1626
1304 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1627 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
1305 devid = (dev->bus->number << 8) | dev->devfn; 1628 devid = calc_devid(dev->bus->number, dev->devfn);
1306 if (devid > amd_iommu_last_bdf) 1629 if (devid > amd_iommu_last_bdf)
1307 continue; 1630 continue;
1308 devid = amd_iommu_alias_table[devid]; 1631 devid = amd_iommu_alias_table[devid];
@@ -1349,6 +1672,7 @@ int __init amd_iommu_init_dma_ops(void)
1349 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1672 iommu->default_dom = dma_ops_domain_alloc(iommu, order);
1350 if (iommu->default_dom == NULL) 1673 if (iommu->default_dom == NULL)
1351 return -ENOMEM; 1674 return -ENOMEM;
1675 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
1352 ret = iommu_init_unity_mappings(iommu); 1676 ret = iommu_init_unity_mappings(iommu);
1353 if (ret) 1677 if (ret)
1354 goto free_domains; 1678 goto free_domains;
@@ -1372,6 +1696,12 @@ int __init amd_iommu_init_dma_ops(void)
1372 /* Make the driver finally visible to the drivers */ 1696 /* Make the driver finally visible to the drivers */
1373 dma_ops = &amd_iommu_dma_ops; 1697 dma_ops = &amd_iommu_dma_ops;
1374 1698
1699 register_iommu(&amd_iommu_ops);
1700
1701 bus_register_notifier(&pci_bus_type, &device_nb);
1702
1703 amd_iommu_stats_init();
1704
1375 return 0; 1705 return 0;
1376 1706
1377free_domains: 1707free_domains:
@@ -1383,3 +1713,224 @@ free_domains:
1383 1713
1384 return ret; 1714 return ret;
1385} 1715}
1716
1717/*****************************************************************************
1718 *
1719 * The following functions belong to the exported interface of AMD IOMMU
1720 *
1721 * This interface allows access to lower level functions of the IOMMU
1722 * like protection domain handling and assignment of devices to domains
1723 * which is not possible with the dma_ops interface.
1724 *
1725 *****************************************************************************/
1726
1727static void cleanup_domain(struct protection_domain *domain)
1728{
1729 unsigned long flags;
1730 u16 devid;
1731
1732 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1733
1734 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid)
1735 if (amd_iommu_pd_table[devid] == domain)
1736 __detach_device(domain, devid);
1737
1738 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1739}
1740
1741static int amd_iommu_domain_init(struct iommu_domain *dom)
1742{
1743 struct protection_domain *domain;
1744
1745 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1746 if (!domain)
1747 return -ENOMEM;
1748
1749 spin_lock_init(&domain->lock);
1750 domain->mode = PAGE_MODE_3_LEVEL;
1751 domain->id = domain_id_alloc();
1752 if (!domain->id)
1753 goto out_free;
1754 domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1755 if (!domain->pt_root)
1756 goto out_free;
1757
1758 dom->priv = domain;
1759
1760 return 0;
1761
1762out_free:
1763 kfree(domain);
1764
1765 return -ENOMEM;
1766}
1767
1768static void amd_iommu_domain_destroy(struct iommu_domain *dom)
1769{
1770 struct protection_domain *domain = dom->priv;
1771
1772 if (!domain)
1773 return;
1774
1775 if (domain->dev_cnt > 0)
1776 cleanup_domain(domain);
1777
1778 BUG_ON(domain->dev_cnt != 0);
1779
1780 free_pagetable(domain);
1781
1782 domain_id_free(domain->id);
1783
1784 kfree(domain);
1785
1786 dom->priv = NULL;
1787}
1788
1789static void amd_iommu_detach_device(struct iommu_domain *dom,
1790 struct device *dev)
1791{
1792 struct protection_domain *domain = dom->priv;
1793 struct amd_iommu *iommu;
1794 struct pci_dev *pdev;
1795 u16 devid;
1796
1797 if (dev->bus != &pci_bus_type)
1798 return;
1799
1800 pdev = to_pci_dev(dev);
1801
1802 devid = calc_devid(pdev->bus->number, pdev->devfn);
1803
1804 if (devid > 0)
1805 detach_device(domain, devid);
1806
1807 iommu = amd_iommu_rlookup_table[devid];
1808 if (!iommu)
1809 return;
1810
1811 iommu_queue_inv_dev_entry(iommu, devid);
1812 iommu_completion_wait(iommu);
1813}
1814
1815static int amd_iommu_attach_device(struct iommu_domain *dom,
1816 struct device *dev)
1817{
1818 struct protection_domain *domain = dom->priv;
1819 struct protection_domain *old_domain;
1820 struct amd_iommu *iommu;
1821 struct pci_dev *pdev;
1822 u16 devid;
1823
1824 if (dev->bus != &pci_bus_type)
1825 return -EINVAL;
1826
1827 pdev = to_pci_dev(dev);
1828
1829 devid = calc_devid(pdev->bus->number, pdev->devfn);
1830
1831 if (devid >= amd_iommu_last_bdf ||
1832 devid != amd_iommu_alias_table[devid])
1833 return -EINVAL;
1834
1835 iommu = amd_iommu_rlookup_table[devid];
1836 if (!iommu)
1837 return -EINVAL;
1838
1839 old_domain = domain_for_device(devid);
1840 if (old_domain)
1841 return -EBUSY;
1842
1843 attach_device(iommu, domain, devid);
1844
1845 iommu_completion_wait(iommu);
1846
1847 return 0;
1848}
1849
1850static int amd_iommu_map_range(struct iommu_domain *dom,
1851 unsigned long iova, phys_addr_t paddr,
1852 size_t size, int iommu_prot)
1853{
1854 struct protection_domain *domain = dom->priv;
1855 unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE);
1856 int prot = 0;
1857 int ret;
1858
1859 if (iommu_prot & IOMMU_READ)
1860 prot |= IOMMU_PROT_IR;
1861 if (iommu_prot & IOMMU_WRITE)
1862 prot |= IOMMU_PROT_IW;
1863
1864 iova &= PAGE_MASK;
1865 paddr &= PAGE_MASK;
1866
1867 for (i = 0; i < npages; ++i) {
1868 ret = iommu_map_page(domain, iova, paddr, prot);
1869 if (ret)
1870 return ret;
1871
1872 iova += PAGE_SIZE;
1873 paddr += PAGE_SIZE;
1874 }
1875
1876 return 0;
1877}
1878
1879static void amd_iommu_unmap_range(struct iommu_domain *dom,
1880 unsigned long iova, size_t size)
1881{
1882
1883 struct protection_domain *domain = dom->priv;
1884 unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE);
1885
1886 iova &= PAGE_MASK;
1887
1888 for (i = 0; i < npages; ++i) {
1889 iommu_unmap_page(domain, iova);
1890 iova += PAGE_SIZE;
1891 }
1892
1893 iommu_flush_domain(domain->id);
1894}
1895
1896static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
1897 unsigned long iova)
1898{
1899 struct protection_domain *domain = dom->priv;
1900 unsigned long offset = iova & ~PAGE_MASK;
1901 phys_addr_t paddr;
1902 u64 *pte;
1903
1904 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)];
1905
1906 if (!IOMMU_PTE_PRESENT(*pte))
1907 return 0;
1908
1909 pte = IOMMU_PTE_PAGE(*pte);
1910 pte = &pte[IOMMU_PTE_L1_INDEX(iova)];
1911
1912 if (!IOMMU_PTE_PRESENT(*pte))
1913 return 0;
1914
1915 pte = IOMMU_PTE_PAGE(*pte);
1916 pte = &pte[IOMMU_PTE_L0_INDEX(iova)];
1917
1918 if (!IOMMU_PTE_PRESENT(*pte))
1919 return 0;
1920
1921 paddr = *pte & IOMMU_PAGE_MASK;
1922 paddr |= offset;
1923
1924 return paddr;
1925}
1926
1927static struct iommu_ops amd_iommu_ops = {
1928 .domain_init = amd_iommu_domain_init,
1929 .domain_destroy = amd_iommu_domain_destroy,
1930 .attach_dev = amd_iommu_attach_device,
1931 .detach_dev = amd_iommu_detach_device,
1932 .map = amd_iommu_map_range,
1933 .unmap = amd_iommu_unmap_range,
1934 .iova_to_phys = amd_iommu_iova_to_phys,
1935};
1936
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 30ae2701b3df..42c33cebf00f 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -28,6 +28,7 @@
28#include <asm/amd_iommu_types.h> 28#include <asm/amd_iommu_types.h>
29#include <asm/amd_iommu.h> 29#include <asm/amd_iommu.h>
30#include <asm/iommu.h> 30#include <asm/iommu.h>
31#include <asm/gart.h>
31 32
32/* 33/*
33 * definitions for the ACPI scanning code 34 * definitions for the ACPI scanning code
@@ -121,7 +122,8 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
121LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 122LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
122 we find in ACPI */ 123 we find in ACPI */
123unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 124unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
124int amd_iommu_isolate = 1; /* if 1, device isolation is enabled */ 125bool amd_iommu_isolate = true; /* if true, device isolation is
126 enabled */
125bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 127bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
126 128
127LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 129LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -242,20 +244,16 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
242} 244}
243 245
244/* Function to enable the hardware */ 246/* Function to enable the hardware */
245void __init iommu_enable(struct amd_iommu *iommu) 247static void __init iommu_enable(struct amd_iommu *iommu)
246{ 248{
247 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU " 249 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
248 "at %02x:%02x.%x cap 0x%hx\n", 250 dev_name(&iommu->dev->dev), iommu->cap_ptr);
249 iommu->dev->bus->number,
250 PCI_SLOT(iommu->dev->devfn),
251 PCI_FUNC(iommu->dev->devfn),
252 iommu->cap_ptr);
253 251
254 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 252 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
255} 253}
256 254
257/* Function to enable IOMMU event logging and event interrupts */ 255/* Function to enable IOMMU event logging and event interrupts */
258void __init iommu_enable_event_logging(struct amd_iommu *iommu) 256static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
259{ 257{
260 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); 258 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
261 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); 259 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
@@ -427,6 +425,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
427 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 425 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
428 &entry, sizeof(entry)); 426 &entry, sizeof(entry));
429 427
428 /* set head and tail to zero manually */
429 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
430 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
431
430 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); 432 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
431 433
432 return cmd_buf; 434 return cmd_buf;
@@ -1074,7 +1076,8 @@ int __init amd_iommu_init(void)
1074 goto free; 1076 goto free;
1075 1077
1076 /* IOMMU rlookup table - find the IOMMU for a specific device */ 1078 /* IOMMU rlookup table - find the IOMMU for a specific device */
1077 amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL, 1079 amd_iommu_rlookup_table = (void *)__get_free_pages(
1080 GFP_KERNEL | __GFP_ZERO,
1078 get_order(rlookup_table_size)); 1081 get_order(rlookup_table_size));
1079 if (amd_iommu_rlookup_table == NULL) 1082 if (amd_iommu_rlookup_table == NULL)
1080 goto free; 1083 goto free;
@@ -1212,9 +1215,9 @@ static int __init parse_amd_iommu_options(char *str)
1212{ 1215{
1213 for (; *str; ++str) { 1216 for (; *str; ++str) {
1214 if (strncmp(str, "isolate", 7) == 0) 1217 if (strncmp(str, "isolate", 7) == 0)
1215 amd_iommu_isolate = 1; 1218 amd_iommu_isolate = true;
1216 if (strncmp(str, "share", 5) == 0) 1219 if (strncmp(str, "share", 5) == 0)
1217 amd_iommu_isolate = 0; 1220 amd_iommu_isolate = false;
1218 if (strncmp(str, "fullflush", 9) == 0) 1221 if (strncmp(str, "fullflush", 9) == 0)
1219 amd_iommu_unmap_flush = true; 1222 amd_iommu_unmap_flush = true;
1220 } 1223 }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9a32b37ee2ee..676debfc1702 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -1,8 +1,9 @@
1/* 1/*
2 * Firmware replacement code. 2 * Firmware replacement code.
3 * 3 *
4 * Work around broken BIOSes that don't set an aperture or only set the 4 * Work around broken BIOSes that don't set an aperture, only set the
5 * aperture in the AGP bridge. 5 * aperture in the AGP bridge, or set too small aperture.
6 *
6 * If all fails map the aperture over some low memory. This is cheaper than 7 * If all fails map the aperture over some low memory. This is cheaper than
7 * doing bounce buffering. The memory is lost. This is done at early boot 8 * doing bounce buffering. The memory is lost. This is done at early boot
8 * because only the bootmem allocator can allocate 32+MB. 9 * because only the bootmem allocator can allocate 32+MB.
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 16f94879b525..b13d3c4dbd42 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -30,6 +30,7 @@
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/dmi.h> 31#include <linux/dmi.h>
32#include <linux/dmar.h> 32#include <linux/dmar.h>
33#include <linux/ftrace.h>
33 34
34#include <asm/atomic.h> 35#include <asm/atomic.h>
35#include <asm/smp.h> 36#include <asm/smp.h>
@@ -97,8 +98,8 @@ __setup("apicpmtimer", setup_apicpmtimer);
97#ifdef HAVE_X2APIC 98#ifdef HAVE_X2APIC
98int x2apic; 99int x2apic;
99/* x2apic enabled before OS handover */ 100/* x2apic enabled before OS handover */
100int x2apic_preenabled; 101static int x2apic_preenabled;
101int disable_x2apic; 102static int disable_x2apic;
102static __init int setup_nox2apic(char *str) 103static __init int setup_nox2apic(char *str)
103{ 104{
104 disable_x2apic = 1; 105 disable_x2apic = 1;
@@ -118,8 +119,6 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
118 119
119int first_system_vector = 0xfe; 120int first_system_vector = 0xfe;
120 121
121char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
122
123/* 122/*
124 * Debug level, exported for io_apic.c 123 * Debug level, exported for io_apic.c
125 */ 124 */
@@ -141,7 +140,7 @@ static int lapic_next_event(unsigned long delta,
141 struct clock_event_device *evt); 140 struct clock_event_device *evt);
142static void lapic_timer_setup(enum clock_event_mode mode, 141static void lapic_timer_setup(enum clock_event_mode mode,
143 struct clock_event_device *evt); 142 struct clock_event_device *evt);
144static void lapic_timer_broadcast(cpumask_t mask); 143static void lapic_timer_broadcast(const struct cpumask *mask);
145static void apic_pm_activate(void); 144static void apic_pm_activate(void);
146 145
147/* 146/*
@@ -227,7 +226,7 @@ void xapic_icr_write(u32 low, u32 id)
227 apic_write(APIC_ICR, low); 226 apic_write(APIC_ICR, low);
228} 227}
229 228
230u64 xapic_icr_read(void) 229static u64 xapic_icr_read(void)
231{ 230{
232 u32 icr1, icr2; 231 u32 icr1, icr2;
233 232
@@ -267,7 +266,7 @@ void x2apic_icr_write(u32 low, u32 id)
267 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); 266 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
268} 267}
269 268
270u64 x2apic_icr_read(void) 269static u64 x2apic_icr_read(void)
271{ 270{
272 unsigned long val; 271 unsigned long val;
273 272
@@ -441,6 +440,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
441 v = apic_read(APIC_LVTT); 440 v = apic_read(APIC_LVTT);
442 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 441 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
443 apic_write(APIC_LVTT, v); 442 apic_write(APIC_LVTT, v);
443 apic_write(APIC_TMICT, 0xffffffff);
444 break; 444 break;
445 case CLOCK_EVT_MODE_RESUME: 445 case CLOCK_EVT_MODE_RESUME:
446 /* Nothing to do here */ 446 /* Nothing to do here */
@@ -453,7 +453,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
453/* 453/*
454 * Local APIC timer broadcast function 454 * Local APIC timer broadcast function
455 */ 455 */
456static void lapic_timer_broadcast(cpumask_t mask) 456static void lapic_timer_broadcast(const struct cpumask *mask)
457{ 457{
458#ifdef CONFIG_SMP 458#ifdef CONFIG_SMP
459 send_IPI_mask(mask, LOCAL_TIMER_VECTOR); 459 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
@@ -469,7 +469,7 @@ static void __cpuinit setup_APIC_timer(void)
469 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 469 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
470 470
471 memcpy(levt, &lapic_clockevent, sizeof(*levt)); 471 memcpy(levt, &lapic_clockevent, sizeof(*levt));
472 levt->cpumask = cpumask_of_cpu(smp_processor_id()); 472 levt->cpumask = cpumask_of(smp_processor_id());
473 473
474 clockevents_register_device(levt); 474 clockevents_register_device(levt);
475} 475}
@@ -559,13 +559,13 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta)
559 } else { 559 } else {
560 res = (((u64)deltapm) * mult) >> 22; 560 res = (((u64)deltapm) * mult) >> 22;
561 do_div(res, 1000000); 561 do_div(res, 1000000);
562 printk(KERN_WARNING "APIC calibration not consistent " 562 pr_warning("APIC calibration not consistent "
563 "with PM Timer: %ldms instead of 100ms\n", 563 "with PM Timer: %ldms instead of 100ms\n",
564 (long)res); 564 (long)res);
565 /* Correct the lapic counter value */ 565 /* Correct the lapic counter value */
566 res = (((u64)(*delta)) * pm_100ms); 566 res = (((u64)(*delta)) * pm_100ms);
567 do_div(res, deltapm); 567 do_div(res, deltapm);
568 printk(KERN_INFO "APIC delta adjusted to PM-Timer: " 568 pr_info("APIC delta adjusted to PM-Timer: "
569 "%lu (%ld)\n", (unsigned long)res, *delta); 569 "%lu (%ld)\n", (unsigned long)res, *delta);
570 *delta = (long)res; 570 *delta = (long)res;
571 } 571 }
@@ -645,8 +645,7 @@ static int __init calibrate_APIC_clock(void)
645 */ 645 */
646 if (calibration_result < (1000000 / HZ)) { 646 if (calibration_result < (1000000 / HZ)) {
647 local_irq_enable(); 647 local_irq_enable();
648 printk(KERN_WARNING 648 pr_warning("APIC frequency too slow, disabling apic timer\n");
649 "APIC frequency too slow, disabling apic timer\n");
650 return -1; 649 return -1;
651 } 650 }
652 651
@@ -672,13 +671,9 @@ static int __init calibrate_APIC_clock(void)
672 while (lapic_cal_loops <= LAPIC_CAL_LOOPS) 671 while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
673 cpu_relax(); 672 cpu_relax();
674 673
675 local_irq_disable();
676
677 /* Stop the lapic timer */ 674 /* Stop the lapic timer */
678 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); 675 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
679 676
680 local_irq_enable();
681
682 /* Jiffies delta */ 677 /* Jiffies delta */
683 deltaj = lapic_cal_j2 - lapic_cal_j1; 678 deltaj = lapic_cal_j2 - lapic_cal_j1;
684 apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); 679 apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
@@ -692,8 +687,7 @@ static int __init calibrate_APIC_clock(void)
692 local_irq_enable(); 687 local_irq_enable();
693 688
694 if (levt->features & CLOCK_EVT_FEAT_DUMMY) { 689 if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
695 printk(KERN_WARNING 690 pr_warning("APIC timer disabled due to verification failure.\n");
696 "APIC timer disabled due to verification failure.\n");
697 return -1; 691 return -1;
698 } 692 }
699 693
@@ -714,7 +708,7 @@ void __init setup_boot_APIC_clock(void)
714 * broadcast mechanism is used. On UP systems simply ignore it. 708 * broadcast mechanism is used. On UP systems simply ignore it.
715 */ 709 */
716 if (disable_apic_timer) { 710 if (disable_apic_timer) {
717 printk(KERN_INFO "Disabling APIC timer\n"); 711 pr_info("Disabling APIC timer\n");
718 /* No broadcast on UP ! */ 712 /* No broadcast on UP ! */
719 if (num_possible_cpus() > 1) { 713 if (num_possible_cpus() > 1) {
720 lapic_clockevent.mult = 1; 714 lapic_clockevent.mult = 1;
@@ -741,7 +735,7 @@ void __init setup_boot_APIC_clock(void)
741 if (nmi_watchdog != NMI_IO_APIC) 735 if (nmi_watchdog != NMI_IO_APIC)
742 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; 736 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
743 else 737 else
744 printk(KERN_WARNING "APIC timer registered as dummy," 738 pr_warning("APIC timer registered as dummy,"
745 " due to nmi_watchdog=%d!\n", nmi_watchdog); 739 " due to nmi_watchdog=%d!\n", nmi_watchdog);
746 740
747 /* Setup the lapic or request the broadcast */ 741 /* Setup the lapic or request the broadcast */
@@ -773,8 +767,7 @@ static void local_apic_timer_interrupt(void)
773 * spurious. 767 * spurious.
774 */ 768 */
775 if (!evt->event_handler) { 769 if (!evt->event_handler) {
776 printk(KERN_WARNING 770 pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
777 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
778 /* Switch it off */ 771 /* Switch it off */
779 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); 772 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
780 return; 773 return;
@@ -783,11 +776,7 @@ static void local_apic_timer_interrupt(void)
783 /* 776 /*
784 * the NMI deadlock-detector uses this. 777 * the NMI deadlock-detector uses this.
785 */ 778 */
786#ifdef CONFIG_X86_64 779 inc_irq_stat(apic_timer_irqs);
787 add_pda(apic_timer_irqs, 1);
788#else
789 per_cpu(irq_stat, cpu).apic_timer_irqs++;
790#endif
791 780
792 evt->event_handler(evt); 781 evt->event_handler(evt);
793} 782}
@@ -800,7 +789,7 @@ static void local_apic_timer_interrupt(void)
800 * [ if a single-CPU system runs an SMP kernel then we call the local 789 * [ if a single-CPU system runs an SMP kernel then we call the local
801 * interrupt as well. Thus we cannot inline the local irq ... ] 790 * interrupt as well. Thus we cannot inline the local irq ... ]
802 */ 791 */
803void smp_apic_timer_interrupt(struct pt_regs *regs) 792void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
804{ 793{
805 struct pt_regs *old_regs = set_irq_regs(regs); 794 struct pt_regs *old_regs = set_irq_regs(regs);
806 795
@@ -814,9 +803,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
814 * Besides, if we don't timer interrupts ignore the global 803 * Besides, if we don't timer interrupts ignore the global
815 * interrupt lock, which is the WrongThing (tm) to do. 804 * interrupt lock, which is the WrongThing (tm) to do.
816 */ 805 */
817#ifdef CONFIG_X86_64
818 exit_idle(); 806 exit_idle();
819#endif
820 irq_enter(); 807 irq_enter();
821 local_apic_timer_interrupt(); 808 local_apic_timer_interrupt();
822 irq_exit(); 809 irq_exit();
@@ -1093,7 +1080,7 @@ static void __cpuinit lapic_setup_esr(void)
1093 unsigned int oldvalue, value, maxlvt; 1080 unsigned int oldvalue, value, maxlvt;
1094 1081
1095 if (!lapic_is_integrated()) { 1082 if (!lapic_is_integrated()) {
1096 printk(KERN_INFO "No ESR for 82489DX.\n"); 1083 pr_info("No ESR for 82489DX.\n");
1097 return; 1084 return;
1098 } 1085 }
1099 1086
@@ -1104,7 +1091,7 @@ static void __cpuinit lapic_setup_esr(void)
1104 * ESR disabled - we can't do anything useful with the 1091 * ESR disabled - we can't do anything useful with the
1105 * errors anyway - mbligh 1092 * errors anyway - mbligh
1106 */ 1093 */
1107 printk(KERN_INFO "Leaving ESR disabled.\n"); 1094 pr_info("Leaving ESR disabled.\n");
1108 return; 1095 return;
1109 } 1096 }
1110 1097
@@ -1298,7 +1285,7 @@ void check_x2apic(void)
1298 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1285 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1299 1286
1300 if (msr & X2APIC_ENABLE) { 1287 if (msr & X2APIC_ENABLE) {
1301 printk("x2apic enabled by BIOS, switching to x2apic ops\n"); 1288 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
1302 x2apic_preenabled = x2apic = 1; 1289 x2apic_preenabled = x2apic = 1;
1303 apic_ops = &x2apic_ops; 1290 apic_ops = &x2apic_ops;
1304 } 1291 }
@@ -1310,7 +1297,7 @@ void enable_x2apic(void)
1310 1297
1311 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1298 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1312 if (!(msr & X2APIC_ENABLE)) { 1299 if (!(msr & X2APIC_ENABLE)) {
1313 printk("Enabling x2apic\n"); 1300 pr_info("Enabling x2apic\n");
1314 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1301 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1315 } 1302 }
1316} 1303}
@@ -1325,9 +1312,8 @@ void __init enable_IR_x2apic(void)
1325 return; 1312 return;
1326 1313
1327 if (!x2apic_preenabled && disable_x2apic) { 1314 if (!x2apic_preenabled && disable_x2apic) {
1328 printk(KERN_INFO 1315 pr_info("Skipped enabling x2apic and Interrupt-remapping "
1329 "Skipped enabling x2apic and Interrupt-remapping " 1316 "because of nox2apic\n");
1330 "because of nox2apic\n");
1331 return; 1317 return;
1332 } 1318 }
1333 1319
@@ -1335,22 +1321,19 @@ void __init enable_IR_x2apic(void)
1335 panic("Bios already enabled x2apic, can't enforce nox2apic"); 1321 panic("Bios already enabled x2apic, can't enforce nox2apic");
1336 1322
1337 if (!x2apic_preenabled && skip_ioapic_setup) { 1323 if (!x2apic_preenabled && skip_ioapic_setup) {
1338 printk(KERN_INFO 1324 pr_info("Skipped enabling x2apic and Interrupt-remapping "
1339 "Skipped enabling x2apic and Interrupt-remapping " 1325 "because of skipping io-apic setup\n");
1340 "because of skipping io-apic setup\n");
1341 return; 1326 return;
1342 } 1327 }
1343 1328
1344 ret = dmar_table_init(); 1329 ret = dmar_table_init();
1345 if (ret) { 1330 if (ret) {
1346 printk(KERN_INFO 1331 pr_info("dmar_table_init() failed with %d:\n", ret);
1347 "dmar_table_init() failed with %d:\n", ret);
1348 1332
1349 if (x2apic_preenabled) 1333 if (x2apic_preenabled)
1350 panic("x2apic enabled by bios. But IR enabling failed"); 1334 panic("x2apic enabled by bios. But IR enabling failed");
1351 else 1335 else
1352 printk(KERN_INFO 1336 pr_info("Not enabling x2apic,Intr-remapping\n");
1353 "Not enabling x2apic,Intr-remapping\n");
1354 return; 1337 return;
1355 } 1338 }
1356 1339
@@ -1359,7 +1342,7 @@ void __init enable_IR_x2apic(void)
1359 1342
1360 ret = save_mask_IO_APIC_setup(); 1343 ret = save_mask_IO_APIC_setup();
1361 if (ret) { 1344 if (ret) {
1362 printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret); 1345 pr_info("Saving IO-APIC state failed: %d\n", ret);
1363 goto end; 1346 goto end;
1364 } 1347 }
1365 1348
@@ -1394,14 +1377,11 @@ end:
1394 1377
1395 if (!ret) { 1378 if (!ret) {
1396 if (!x2apic_preenabled) 1379 if (!x2apic_preenabled)
1397 printk(KERN_INFO 1380 pr_info("Enabled x2apic and interrupt-remapping\n");
1398 "Enabled x2apic and interrupt-remapping\n");
1399 else 1381 else
1400 printk(KERN_INFO 1382 pr_info("Enabled Interrupt-remapping\n");
1401 "Enabled Interrupt-remapping\n");
1402 } else 1383 } else
1403 printk(KERN_ERR 1384 pr_err("Failed to enable Interrupt-remapping and x2apic\n");
1404 "Failed to enable Interrupt-remapping and x2apic\n");
1405#else 1385#else
1406 if (!cpu_has_x2apic) 1386 if (!cpu_has_x2apic)
1407 return; 1387 return;
@@ -1410,8 +1390,8 @@ end:
1410 panic("x2apic enabled prior OS handover," 1390 panic("x2apic enabled prior OS handover,"
1411 " enable CONFIG_INTR_REMAP"); 1391 " enable CONFIG_INTR_REMAP");
1412 1392
1413 printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " 1393 pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1414 " and x2apic\n"); 1394 " and x2apic\n");
1415#endif 1395#endif
1416 1396
1417 return; 1397 return;
@@ -1428,7 +1408,7 @@ end:
1428static int __init detect_init_APIC(void) 1408static int __init detect_init_APIC(void)
1429{ 1409{
1430 if (!cpu_has_apic) { 1410 if (!cpu_has_apic) {
1431 printk(KERN_INFO "No local APIC present\n"); 1411 pr_info("No local APIC present\n");
1432 return -1; 1412 return -1;
1433 } 1413 }
1434 1414
@@ -1469,8 +1449,8 @@ static int __init detect_init_APIC(void)
1469 * "lapic" specified. 1449 * "lapic" specified.
1470 */ 1450 */
1471 if (!force_enable_local_apic) { 1451 if (!force_enable_local_apic) {
1472 printk(KERN_INFO "Local APIC disabled by BIOS -- " 1452 pr_info("Local APIC disabled by BIOS -- "
1473 "you can enable it with \"lapic\"\n"); 1453 "you can enable it with \"lapic\"\n");
1474 return -1; 1454 return -1;
1475 } 1455 }
1476 /* 1456 /*
@@ -1480,8 +1460,7 @@ static int __init detect_init_APIC(void)
1480 */ 1460 */
1481 rdmsr(MSR_IA32_APICBASE, l, h); 1461 rdmsr(MSR_IA32_APICBASE, l, h);
1482 if (!(l & MSR_IA32_APICBASE_ENABLE)) { 1462 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1483 printk(KERN_INFO 1463 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1484 "Local APIC disabled by BIOS -- reenabling.\n");
1485 l &= ~MSR_IA32_APICBASE_BASE; 1464 l &= ~MSR_IA32_APICBASE_BASE;
1486 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; 1465 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
1487 wrmsr(MSR_IA32_APICBASE, l, h); 1466 wrmsr(MSR_IA32_APICBASE, l, h);
@@ -1494,7 +1473,7 @@ static int __init detect_init_APIC(void)
1494 */ 1473 */
1495 features = cpuid_edx(1); 1474 features = cpuid_edx(1);
1496 if (!(features & (1 << X86_FEATURE_APIC))) { 1475 if (!(features & (1 << X86_FEATURE_APIC))) {
1497 printk(KERN_WARNING "Could not enable APIC!\n"); 1476 pr_warning("Could not enable APIC!\n");
1498 return -1; 1477 return -1;
1499 } 1478 }
1500 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1479 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
@@ -1505,14 +1484,14 @@ static int __init detect_init_APIC(void)
1505 if (l & MSR_IA32_APICBASE_ENABLE) 1484 if (l & MSR_IA32_APICBASE_ENABLE)
1506 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; 1485 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1507 1486
1508 printk(KERN_INFO "Found and enabled local APIC!\n"); 1487 pr_info("Found and enabled local APIC!\n");
1509 1488
1510 apic_pm_activate(); 1489 apic_pm_activate();
1511 1490
1512 return 0; 1491 return 0;
1513 1492
1514no_apic: 1493no_apic:
1515 printk(KERN_INFO "No local APIC present or hardware disabled\n"); 1494 pr_info("No local APIC present or hardware disabled\n");
1516 return -1; 1495 return -1;
1517} 1496}
1518#endif 1497#endif
@@ -1588,12 +1567,12 @@ int __init APIC_init_uniprocessor(void)
1588{ 1567{
1589#ifdef CONFIG_X86_64 1568#ifdef CONFIG_X86_64
1590 if (disable_apic) { 1569 if (disable_apic) {
1591 printk(KERN_INFO "Apic disabled\n"); 1570 pr_info("Apic disabled\n");
1592 return -1; 1571 return -1;
1593 } 1572 }
1594 if (!cpu_has_apic) { 1573 if (!cpu_has_apic) {
1595 disable_apic = 1; 1574 disable_apic = 1;
1596 printk(KERN_INFO "Apic disabled by BIOS\n"); 1575 pr_info("Apic disabled by BIOS\n");
1597 return -1; 1576 return -1;
1598 } 1577 }
1599#else 1578#else
@@ -1605,8 +1584,8 @@ int __init APIC_init_uniprocessor(void)
1605 */ 1584 */
1606 if (!cpu_has_apic && 1585 if (!cpu_has_apic &&
1607 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { 1586 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1608 printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n", 1587 pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
1609 boot_cpu_physical_apicid); 1588 boot_cpu_physical_apicid);
1610 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1589 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1611 return -1; 1590 return -1;
1612 } 1591 }
@@ -1682,9 +1661,7 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1682{ 1661{
1683 u32 v; 1662 u32 v;
1684 1663
1685#ifdef CONFIG_X86_64
1686 exit_idle(); 1664 exit_idle();
1687#endif
1688 irq_enter(); 1665 irq_enter();
1689 /* 1666 /*
1690 * Check if this really is a spurious interrupt and ACK it 1667 * Check if this really is a spurious interrupt and ACK it
@@ -1695,14 +1672,11 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1695 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) 1672 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1696 ack_APIC_irq(); 1673 ack_APIC_irq();
1697 1674
1698#ifdef CONFIG_X86_64 1675 inc_irq_stat(irq_spurious_count);
1699 add_pda(irq_spurious_count, 1); 1676
1700#else
1701 /* see sw-dev-man vol 3, chapter 7.4.13.5 */ 1677 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
1702 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " 1678 pr_info("spurious APIC interrupt on CPU#%d, "
1703 "should never happen.\n", smp_processor_id()); 1679 "should never happen.\n", smp_processor_id());
1704 __get_cpu_var(irq_stat).irq_spurious_count++;
1705#endif
1706 irq_exit(); 1680 irq_exit();
1707} 1681}
1708 1682
@@ -1713,9 +1687,7 @@ void smp_error_interrupt(struct pt_regs *regs)
1713{ 1687{
1714 u32 v, v1; 1688 u32 v, v1;
1715 1689
1716#ifdef CONFIG_X86_64
1717 exit_idle(); 1690 exit_idle();
1718#endif
1719 irq_enter(); 1691 irq_enter();
1720 /* First tickle the hardware, only then report what went on. -- REW */ 1692 /* First tickle the hardware, only then report what went on. -- REW */
1721 v = apic_read(APIC_ESR); 1693 v = apic_read(APIC_ESR);
@@ -1724,17 +1696,18 @@ void smp_error_interrupt(struct pt_regs *regs)
1724 ack_APIC_irq(); 1696 ack_APIC_irq();
1725 atomic_inc(&irq_err_count); 1697 atomic_inc(&irq_err_count);
1726 1698
1727 /* Here is what the APIC error bits mean: 1699 /*
1728 0: Send CS error 1700 * Here is what the APIC error bits mean:
1729 1: Receive CS error 1701 * 0: Send CS error
1730 2: Send accept error 1702 * 1: Receive CS error
1731 3: Receive accept error 1703 * 2: Send accept error
1732 4: Reserved 1704 * 3: Receive accept error
1733 5: Send illegal vector 1705 * 4: Reserved
1734 6: Received illegal vector 1706 * 5: Send illegal vector
1735 7: Illegal register address 1707 * 6: Received illegal vector
1736 */ 1708 * 7: Illegal register address
1737 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", 1709 */
1710 pr_debug("APIC error on CPU%d: %02x(%02x)\n",
1738 smp_processor_id(), v , v1); 1711 smp_processor_id(), v , v1);
1739 irq_exit(); 1712 irq_exit();
1740} 1713}
@@ -1832,28 +1805,32 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1832void __cpuinit generic_processor_info(int apicid, int version) 1805void __cpuinit generic_processor_info(int apicid, int version)
1833{ 1806{
1834 int cpu; 1807 int cpu;
1835 cpumask_t tmp_map;
1836 1808
1837 /* 1809 /*
1838 * Validate version 1810 * Validate version
1839 */ 1811 */
1840 if (version == 0x0) { 1812 if (version == 0x0) {
1841 printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " 1813 pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
1842 "fixing up to 0x10. (tell your hw vendor)\n", 1814 "fixing up to 0x10. (tell your hw vendor)\n",
1843 version); 1815 version);
1844 version = 0x10; 1816 version = 0x10;
1845 } 1817 }
1846 apic_version[apicid] = version; 1818 apic_version[apicid] = version;
1847 1819
1848 if (num_processors >= NR_CPUS) { 1820 if (num_processors >= nr_cpu_ids) {
1849 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1821 int max = nr_cpu_ids;
1850 " Processor ignored.\n", NR_CPUS); 1822 int thiscpu = max + disabled_cpus;
1823
1824 pr_warning(
1825 "ACPI: NR_CPUS/possible_cpus limit of %i reached."
1826 " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
1827
1828 disabled_cpus++;
1851 return; 1829 return;
1852 } 1830 }
1853 1831
1854 num_processors++; 1832 num_processors++;
1855 cpus_complement(tmp_map, cpu_present_map); 1833 cpu = cpumask_next_zero(-1, cpu_present_mask);
1856 cpu = first_cpu(tmp_map);
1857 1834
1858 physid_set(apicid, phys_cpu_present_map); 1835 physid_set(apicid, phys_cpu_present_map);
1859 if (apicid == boot_cpu_physical_apicid) { 1836 if (apicid == boot_cpu_physical_apicid) {
@@ -1903,8 +1880,8 @@ void __cpuinit generic_processor_info(int apicid, int version)
1903 } 1880 }
1904#endif 1881#endif
1905 1882
1906 cpu_set(cpu, cpu_possible_map); 1883 set_cpu_possible(cpu, true);
1907 cpu_set(cpu, cpu_present_map); 1884 set_cpu_present(cpu, true);
1908} 1885}
1909 1886
1910#ifdef CONFIG_X86_64 1887#ifdef CONFIG_X86_64
@@ -2106,7 +2083,7 @@ __cpuinit int apic_is_clustered_box(void)
2106 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); 2083 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
2107 bitmap_zero(clustermap, NUM_APIC_CLUSTERS); 2084 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
2108 2085
2109 for (i = 0; i < NR_CPUS; i++) { 2086 for (i = 0; i < nr_cpu_ids; i++) {
2110 /* are we being called early in kernel startup? */ 2087 /* are we being called early in kernel startup? */
2111 if (bios_cpu_apicid) { 2088 if (bios_cpu_apicid) {
2112 id = bios_cpu_apicid[i]; 2089 id = bios_cpu_apicid[i];
@@ -2209,7 +2186,7 @@ static int __init apic_set_verbosity(char *arg)
2209 else if (strcmp("verbose", arg) == 0) 2186 else if (strcmp("verbose", arg) == 0)
2210 apic_verbosity = APIC_VERBOSE; 2187 apic_verbosity = APIC_VERBOSE;
2211 else { 2188 else {
2212 printk(KERN_WARNING "APIC Verbosity level %s not recognised" 2189 pr_warning("APIC Verbosity level %s not recognised"
2213 " use apic=verbose or apic=debug\n", arg); 2190 " use apic=verbose or apic=debug\n", arg);
2214 return -EINVAL; 2191 return -EINVAL;
2215 } 2192 }
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5145a6e72bbb..3a26525a3f31 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -391,11 +391,7 @@ static int power_off;
391#else 391#else
392static int power_off = 1; 392static int power_off = 1;
393#endif 393#endif
394#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
395static int realmode_power_off = 1;
396#else
397static int realmode_power_off; 394static int realmode_power_off;
398#endif
399#ifdef CONFIG_APM_ALLOW_INTS 395#ifdef CONFIG_APM_ALLOW_INTS
400static int allow_ints = 1; 396static int allow_ints = 1;
401#else 397#else
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6649d09ad88f..ee4df08feee6 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -11,7 +11,7 @@
11#include <linux/suspend.h> 11#include <linux/suspend.h>
12#include <linux/kbuild.h> 12#include <linux/kbuild.h>
13#include <asm/ucontext.h> 13#include <asm/ucontext.h>
14#include "sigframe.h" 14#include <asm/sigframe.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/processor.h> 17#include <asm/processor.h>
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 7fcf63d22f8b..1d41d3f1edbc 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -20,6 +20,8 @@
20 20
21#include <xen/interface/xen.h> 21#include <xen/interface/xen.h>
22 22
23#include <asm/sigframe.h>
24
23#define __NO_STUBS 1 25#define __NO_STUBS 1
24#undef __SYSCALL 26#undef __SYSCALL
25#undef _ASM_X86_UNISTD_64_H 27#undef _ASM_X86_UNISTD_64_H
@@ -87,7 +89,7 @@ int main(void)
87 BLANK(); 89 BLANK();
88#undef ENTRY 90#undef ENTRY
89 DEFINE(IA32_RT_SIGFRAME_sigcontext, 91 DEFINE(IA32_RT_SIGFRAME_sigcontext,
90 offsetof (struct rt_sigframe32, uc.uc_mcontext)); 92 offsetof (struct rt_sigframe_ia32, uc.uc_mcontext));
91 BLANK(); 93 BLANK();
92#endif 94#endif
93 DEFINE(pbe_address, offsetof(struct pbe, address)); 95 DEFINE(pbe_address, offsetof(struct pbe, address));
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index f0dfe6f17e7e..f63882728d91 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -25,7 +25,7 @@
25#include <asm/uv/bios.h> 25#include <asm/uv/bios.h>
26#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
27 27
28struct uv_systab uv_systab; 28static struct uv_systab uv_systab;
29 29
30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) 30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
31{ 31{
@@ -69,10 +69,10 @@ s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
69 69
70long sn_partition_id; 70long sn_partition_id;
71EXPORT_SYMBOL_GPL(sn_partition_id); 71EXPORT_SYMBOL_GPL(sn_partition_id);
72long uv_coherency_id; 72long sn_coherency_id;
73EXPORT_SYMBOL_GPL(uv_coherency_id); 73EXPORT_SYMBOL_GPL(sn_coherency_id);
74long uv_region_size; 74long sn_region_size;
75EXPORT_SYMBOL_GPL(uv_region_size); 75EXPORT_SYMBOL_GPL(sn_region_size);
76int uv_type; 76int uv_type;
77 77
78 78
@@ -100,6 +100,56 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
100 return ret; 100 return ret;
101} 101}
102 102
103int
104uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size,
105 unsigned long *intr_mmr_offset)
106{
107 union uv_watchlist_u size_blade;
108 u64 watchlist;
109 s64 ret;
110
111 size_blade.size = mq_size;
112 size_blade.blade = blade;
113
114 /*
115 * bios returns watchlist number or negative error number.
116 */
117 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
118 size_blade.val, (u64)intr_mmr_offset,
119 (u64)&watchlist, 0);
120 if (ret < BIOS_STATUS_SUCCESS)
121 return ret;
122
123 return watchlist;
124}
125EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc);
126
127int
128uv_bios_mq_watchlist_free(int blade, int watchlist_num)
129{
130 return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE,
131 blade, watchlist_num, 0, 0, 0);
132}
133EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free);
134
135s64
136uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms)
137{
138 return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len,
139 perms, 0, 0);
140}
141EXPORT_SYMBOL_GPL(uv_bios_change_memprotect);
142
143s64
144uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
145{
146 s64 ret;
147
148 ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
149 (u64)addr, buf, (u64)len, 0);
150 return ret;
151}
152EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa);
103 153
104s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) 154s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
105{ 155{
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
new file mode 100644
index 000000000000..2ac0ab71412a
--- /dev/null
+++ b/arch/x86/kernel/check.c
@@ -0,0 +1,161 @@
1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/kthread.h>
4#include <linux/workqueue.h>
5#include <asm/e820.h>
6#include <asm/proto.h>
7
8/*
9 * Some BIOSes seem to corrupt the low 64k of memory during events
10 * like suspend/resume and unplugging an HDMI cable. Reserve all
11 * remaining free memory in that area and fill it with a distinct
12 * pattern.
13 */
14#define MAX_SCAN_AREAS 8
15
16static int __read_mostly memory_corruption_check = -1;
17
18static unsigned __read_mostly corruption_check_size = 64*1024;
19static unsigned __read_mostly corruption_check_period = 60; /* seconds */
20
21static struct e820entry scan_areas[MAX_SCAN_AREAS];
22static int num_scan_areas;
23
24
25static __init int set_corruption_check(char *arg)
26{
27 char *end;
28
29 memory_corruption_check = simple_strtol(arg, &end, 10);
30
31 return (*end == 0) ? 0 : -EINVAL;
32}
33early_param("memory_corruption_check", set_corruption_check);
34
35static __init int set_corruption_check_period(char *arg)
36{
37 char *end;
38
39 corruption_check_period = simple_strtoul(arg, &end, 10);
40
41 return (*end == 0) ? 0 : -EINVAL;
42}
43early_param("memory_corruption_check_period", set_corruption_check_period);
44
45static __init int set_corruption_check_size(char *arg)
46{
47 char *end;
48 unsigned size;
49
50 size = memparse(arg, &end);
51
52 if (*end == '\0')
53 corruption_check_size = size;
54
55 return (size == corruption_check_size) ? 0 : -EINVAL;
56}
57early_param("memory_corruption_check_size", set_corruption_check_size);
58
59
60void __init setup_bios_corruption_check(void)
61{
62 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
63
64 if (memory_corruption_check == -1) {
65 memory_corruption_check =
66#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
67 1
68#else
69 0
70#endif
71 ;
72 }
73
74 if (corruption_check_size == 0)
75 memory_corruption_check = 0;
76
77 if (!memory_corruption_check)
78 return;
79
80 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
81
82 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
83 u64 size;
84 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
85
86 if (addr == 0)
87 break;
88
89 if ((addr + size) > corruption_check_size)
90 size = corruption_check_size - addr;
91
92 if (size == 0)
93 break;
94
95 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
96 scan_areas[num_scan_areas].addr = addr;
97 scan_areas[num_scan_areas].size = size;
98 num_scan_areas++;
99
100 /* Assume we've already mapped this early memory */
101 memset(__va(addr), 0, size);
102
103 addr += size;
104 }
105
106 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
107 num_scan_areas);
108 update_e820();
109}
110
111
112void check_for_bios_corruption(void)
113{
114 int i;
115 int corruption = 0;
116
117 if (!memory_corruption_check)
118 return;
119
120 for (i = 0; i < num_scan_areas; i++) {
121 unsigned long *addr = __va(scan_areas[i].addr);
122 unsigned long size = scan_areas[i].size;
123
124 for (; size; addr++, size -= sizeof(unsigned long)) {
125 if (!*addr)
126 continue;
127 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
128 addr, __pa(addr), *addr);
129 corruption = 1;
130 *addr = 0;
131 }
132 }
133
134 WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n");
135}
136
137static void check_corruption(struct work_struct *dummy);
138static DECLARE_DELAYED_WORK(bios_check_work, check_corruption);
139
140static void check_corruption(struct work_struct *dummy)
141{
142 check_for_bios_corruption();
143 schedule_delayed_work(&bios_check_work,
144 round_jiffies_relative(corruption_check_period*HZ));
145}
146
147static int start_periodic_check_for_corruption(void)
148{
149 if (!memory_corruption_check || corruption_check_period == 0)
150 return 0;
151
152 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
153 corruption_check_period);
154
155 /* First time we run the checks right away */
156 schedule_delayed_work(&bios_check_work, 0);
157 return 0;
158}
159
160module_init(start_periodic_check_for_corruption);
161
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82ec6075c057..82db7f45e2de 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -2,8 +2,14 @@
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot
6ifdef CONFIG_FUNCTION_TRACER
7CFLAGS_REMOVE_common.o = -pg
8endif
9
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 10obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += proc.o capflags.o powerflags.o common.o 11obj-y += proc.o capflags.o powerflags.o common.o
12obj-y += vmware.o hypervisor.o
7 13
8obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 14obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
9obj-$(CONFIG_X86_64) += bugs_64.o 15obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index ef8f831af823..2cf23634b6d9 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -120,9 +120,17 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width) 120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
121 & core_select_mask; 121 & core_select_mask;
122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); 122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
123 /*
124 * Reinit the apicid, now that we have extended initial_apicid.
125 */
126 c->apicid = phys_pkg_id(c->initial_apicid, 0);
123#else 127#else
124 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask; 128 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
125 c->phys_proc_id = phys_pkg_id(core_plus_mask_width); 129 c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
130 /*
131 * Reinit the apicid, now that we have extended initial_apicid.
132 */
133 c->apicid = phys_pkg_id(0);
126#endif 134#endif
127 c->x86_max_cores = (core_level_siblings / smp_num_siblings); 135 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
128 136
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 8f1e31db2ad5..7c878f6aa919 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -283,9 +283,14 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
283{ 283{
284 early_init_amd_mc(c); 284 early_init_amd_mc(c);
285 285
286 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ 286 /*
287 if (c->x86_power & (1<<8)) 287 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
288 * with P/T states and does not stop in deep C-states
289 */
290 if (c->x86_power & (1 << 8)) {
288 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 291 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
292 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
293 }
289 294
290#ifdef CONFIG_X86_64 295#ifdef CONFIG_X86_64
291 set_cpu_cap(c, X86_FEATURE_SYSCALL32); 296 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b9c9ea0217a9..3f95a40f718a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -36,6 +36,7 @@
36#include <asm/proto.h> 36#include <asm/proto.h>
37#include <asm/sections.h> 37#include <asm/sections.h>
38#include <asm/setup.h> 38#include <asm/setup.h>
39#include <asm/hypervisor.h>
39 40
40#include "cpu.h" 41#include "cpu.h"
41 42
@@ -354,7 +355,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
354 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 355 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
355 } else if (smp_num_siblings > 1) { 356 } else if (smp_num_siblings > 1) {
356 357
357 if (smp_num_siblings > NR_CPUS) { 358 if (smp_num_siblings > nr_cpu_ids) {
358 printk(KERN_WARNING "CPU: Unsupported number of siblings %d", 359 printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
359 smp_num_siblings); 360 smp_num_siblings);
360 smp_num_siblings = 1; 361 smp_num_siblings = 1;
@@ -703,6 +704,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
703 detect_ht(c); 704 detect_ht(c);
704#endif 705#endif
705 706
707 init_hypervisor(c);
706 /* 708 /*
707 * On SMP, boot_cpu_data holds the common feature set between 709 * On SMP, boot_cpu_data holds the common feature set between
708 * all CPUs; so make sure that we indicate which features are 710 * all CPUs; so make sure that we indicate which features are
@@ -862,7 +864,7 @@ EXPORT_SYMBOL(_cpu_pda);
862 864
863struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; 865struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
864 866
865char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; 867static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
866 868
867void __cpuinit pda_init(int cpu) 869void __cpuinit pda_init(int cpu)
868{ 870{
@@ -903,8 +905,8 @@ void __cpuinit pda_init(int cpu)
903 } 905 }
904} 906}
905 907
906char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + 908static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
907 DEBUG_STKSZ] __page_aligned_bss; 909 DEBUG_STKSZ] __page_aligned_bss;
908 910
909extern asmlinkage void ignore_sysret(void); 911extern asmlinkage void ignore_sysret(void);
910 912
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8e48c5d4467d..28102ad1a363 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/ftrace.h>
36 37
37#include <linux/acpi.h> 38#include <linux/acpi.h>
38#include <acpi/processor.h> 39#include <acpi/processor.h>
@@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
391 unsigned int next_perf_state = 0; /* Index into perf table */ 392 unsigned int next_perf_state = 0; /* Index into perf table */
392 unsigned int i; 393 unsigned int i;
393 int result = 0; 394 int result = 0;
395 struct power_trace it;
394 396
395 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); 397 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
396 398
@@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
427 } 429 }
428 } 430 }
429 431
432 trace_power_mark(&it, POWER_PSTATE, next_perf_state);
433
430 switch (data->cpu_feature) { 434 switch (data->cpu_feature) {
431 case SYSTEM_INTEL_MSR_CAPABLE: 435 case SYSTEM_INTEL_MSR_CAPABLE:
432 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 436 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
@@ -513,6 +517,17 @@ acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
513 } 517 }
514} 518}
515 519
520static void free_acpi_perf_data(void)
521{
522 unsigned int i;
523
524 /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
525 for_each_possible_cpu(i)
526 free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
527 ->shared_cpu_map);
528 free_percpu(acpi_perf_data);
529}
530
516/* 531/*
517 * acpi_cpufreq_early_init - initialize ACPI P-States library 532 * acpi_cpufreq_early_init - initialize ACPI P-States library
518 * 533 *
@@ -523,6 +538,7 @@ acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
523 */ 538 */
524static int __init acpi_cpufreq_early_init(void) 539static int __init acpi_cpufreq_early_init(void)
525{ 540{
541 unsigned int i;
526 dprintk("acpi_cpufreq_early_init\n"); 542 dprintk("acpi_cpufreq_early_init\n");
527 543
528 acpi_perf_data = alloc_percpu(struct acpi_processor_performance); 544 acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
@@ -530,6 +546,16 @@ static int __init acpi_cpufreq_early_init(void)
530 dprintk("Memory allocation error for acpi_perf_data.\n"); 546 dprintk("Memory allocation error for acpi_perf_data.\n");
531 return -ENOMEM; 547 return -ENOMEM;
532 } 548 }
549 for_each_possible_cpu(i) {
550 if (!alloc_cpumask_var_node(
551 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
552 GFP_KERNEL, cpu_to_node(i))) {
553
554 /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
555 free_acpi_perf_data();
556 return -ENOMEM;
557 }
558 }
533 559
534 /* Do initialization in ACPI core */ 560 /* Do initialization in ACPI core */
535 acpi_processor_preregister_performance(acpi_perf_data); 561 acpi_processor_preregister_performance(acpi_perf_data);
@@ -600,9 +626,9 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
600 */ 626 */
601 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || 627 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
602 policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { 628 policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
603 policy->cpus = perf->shared_cpu_map; 629 cpumask_copy(&policy->cpus, perf->shared_cpu_map);
604 } 630 }
605 policy->related_cpus = perf->shared_cpu_map; 631 cpumask_copy(&policy->related_cpus, perf->shared_cpu_map);
606 632
607#ifdef CONFIG_SMP 633#ifdef CONFIG_SMP
608 dmi_check_system(sw_any_bug_dmi_table); 634 dmi_check_system(sw_any_bug_dmi_table);
@@ -791,7 +817,7 @@ static int __init acpi_cpufreq_init(void)
791 817
792 ret = cpufreq_register_driver(&acpi_cpufreq_driver); 818 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
793 if (ret) 819 if (ret)
794 free_percpu(acpi_perf_data); 820 free_acpi_perf_data();
795 821
796 return ret; 822 return ret;
797} 823}
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 7c7d56b43136..1b446d79a8fd 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -310,6 +310,12 @@ static int powernow_acpi_init(void)
310 goto err0; 310 goto err0;
311 } 311 }
312 312
313 if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
314 GFP_KERNEL)) {
315 retval = -ENOMEM;
316 goto err05;
317 }
318
313 if (acpi_processor_register_performance(acpi_processor_perf, 0)) { 319 if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
314 retval = -EIO; 320 retval = -EIO;
315 goto err1; 321 goto err1;
@@ -412,6 +418,8 @@ static int powernow_acpi_init(void)
412err2: 418err2:
413 acpi_processor_unregister_performance(acpi_processor_perf, 0); 419 acpi_processor_unregister_performance(acpi_processor_perf, 0);
414err1: 420err1:
421 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
422err05:
415 kfree(acpi_processor_perf); 423 kfree(acpi_processor_perf);
416err0: 424err0:
417 printk(KERN_WARNING PFX "ACPI perflib can not be used in this platform\n"); 425 printk(KERN_WARNING PFX "ACPI perflib can not be used in this platform\n");
@@ -652,6 +660,7 @@ static int powernow_cpu_exit (struct cpufreq_policy *policy) {
652#ifdef CONFIG_X86_POWERNOW_K7_ACPI 660#ifdef CONFIG_X86_POWERNOW_K7_ACPI
653 if (acpi_processor_perf) { 661 if (acpi_processor_perf) {
654 acpi_processor_unregister_performance(acpi_processor_perf, 0); 662 acpi_processor_unregister_performance(acpi_processor_perf, 0);
663 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
655 kfree(acpi_processor_perf); 664 kfree(acpi_processor_perf);
656 } 665 }
657#endif 666#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 7f05f44b97e9..c3c9adbaa26f 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -766,7 +766,7 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned
766static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) 766static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
767{ 767{
768 struct cpufreq_frequency_table *powernow_table; 768 struct cpufreq_frequency_table *powernow_table;
769 int ret_val; 769 int ret_val = -ENODEV;
770 770
771 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 771 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
772 dprintk("register performance failed: bad ACPI data\n"); 772 dprintk("register performance failed: bad ACPI data\n");
@@ -815,6 +815,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
815 /* notify BIOS that we exist */ 815 /* notify BIOS that we exist */
816 acpi_processor_notify_smm(THIS_MODULE); 816 acpi_processor_notify_smm(THIS_MODULE);
817 817
818 if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
819 printk(KERN_ERR PFX
820 "unable to alloc powernow_k8_data cpumask\n");
821 ret_val = -ENOMEM;
822 goto err_out_mem;
823 }
824
818 return 0; 825 return 0;
819 826
820err_out_mem: 827err_out_mem:
@@ -826,7 +833,7 @@ err_out:
826 /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ 833 /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */
827 data->acpi_data.state_count = 0; 834 data->acpi_data.state_count = 0;
828 835
829 return -ENODEV; 836 return ret_val;
830} 837}
831 838
832static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) 839static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table)
@@ -929,6 +936,7 @@ static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
929{ 936{
930 if (data->acpi_data.state_count) 937 if (data->acpi_data.state_count)
931 acpi_processor_unregister_performance(&data->acpi_data, data->cpu); 938 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
939 free_cpumask_var(data->acpi_data.shared_cpu_map);
932} 940}
933 941
934#else 942#else
@@ -1134,7 +1142,8 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1134 data->cpu = pol->cpu; 1142 data->cpu = pol->cpu;
1135 data->currpstate = HW_PSTATE_INVALID; 1143 data->currpstate = HW_PSTATE_INVALID;
1136 1144
1137 if (powernow_k8_cpu_init_acpi(data)) { 1145 rc = powernow_k8_cpu_init_acpi(data);
1146 if (rc) {
1138 /* 1147 /*
1139 * Use the PSB BIOS structure. This is only availabe on 1148 * Use the PSB BIOS structure. This is only availabe on
1140 * an UP version, and is deprecated by AMD. 1149 * an UP version, and is deprecated by AMD.
@@ -1152,20 +1161,17 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1152 "ACPI maintainers and complain to your BIOS " 1161 "ACPI maintainers and complain to your BIOS "
1153 "vendor.\n"); 1162 "vendor.\n");
1154#endif 1163#endif
1155 kfree(data); 1164 goto err_out;
1156 return -ENODEV;
1157 } 1165 }
1158 if (pol->cpu != 0) { 1166 if (pol->cpu != 0) {
1159 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " 1167 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
1160 "CPU other than CPU0. Complain to your BIOS " 1168 "CPU other than CPU0. Complain to your BIOS "
1161 "vendor.\n"); 1169 "vendor.\n");
1162 kfree(data); 1170 goto err_out;
1163 return -ENODEV;
1164 } 1171 }
1165 rc = find_psb_table(data); 1172 rc = find_psb_table(data);
1166 if (rc) { 1173 if (rc) {
1167 kfree(data); 1174 goto err_out;
1168 return -ENODEV;
1169 } 1175 }
1170 } 1176 }
1171 1177
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
new file mode 100644
index 000000000000..fb5b86af0b01
--- /dev/null
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -0,0 +1,58 @@
1/*
2 * Common hypervisor code
3 *
4 * Copyright (C) 2008, VMware, Inc.
5 * Author : Alok N Kataria <akataria@vmware.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
15 * NON INFRINGEMENT. See the GNU General Public License for more
16 * details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 */
23
24#include <asm/processor.h>
25#include <asm/vmware.h>
26#include <asm/hypervisor.h>
27
28static inline void __cpuinit
29detect_hypervisor_vendor(struct cpuinfo_x86 *c)
30{
31 if (vmware_platform()) {
32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
33 } else {
34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
35 }
36}
37
38unsigned long get_hypervisor_tsc_freq(void)
39{
40 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
41 return vmware_get_tsc_khz();
42 return 0;
43}
44
45static inline void __cpuinit
46hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
47{
48 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
49 vmware_set_feature_bits(c);
50 return;
51 }
52}
53
54void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
55{
56 detect_hypervisor_vendor(c);
57 hypervisor_set_feature_bits(c);
58}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index cce0b6118d55..8ea6929e974c 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -11,7 +11,6 @@
11#include <asm/pgtable.h> 11#include <asm/pgtable.h>
12#include <asm/msr.h> 12#include <asm/msr.h>
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14#include <asm/ptrace.h>
15#include <asm/ds.h> 14#include <asm/ds.h>
16#include <asm/bugs.h> 15#include <asm/bugs.h>
17 16
@@ -41,6 +40,16 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
41 if (c->x86 == 15 && c->x86_cache_alignment == 64) 40 if (c->x86 == 15 && c->x86_cache_alignment == 64)
42 c->x86_cache_alignment = 128; 41 c->x86_cache_alignment = 128;
43#endif 42#endif
43
44 /*
45 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
46 * with P/T states and does not stop in deep C-states
47 */
48 if (c->x86_power & (1 << 8)) {
49 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
50 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
51 }
52
44} 53}
45 54
46#ifdef CONFIG_X86_32 55#ifdef CONFIG_X86_32
@@ -242,6 +251,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
242 251
243 intel_workarounds(c); 252 intel_workarounds(c);
244 253
254 /*
255 * Detect the extended topology information if available. This
256 * will reinitialise the initial_apicid which will be used
257 * in init_intel_cacheinfo()
258 */
259 detect_extended_topology(c);
260
245 l2 = init_intel_cacheinfo(c); 261 l2 = init_intel_cacheinfo(c);
246 if (c->cpuid_level > 9) { 262 if (c->cpuid_level > 9) {
247 unsigned eax = cpuid_eax(10); 263 unsigned eax = cpuid_eax(10);
@@ -307,13 +323,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
307 set_cpu_cap(c, X86_FEATURE_P4); 323 set_cpu_cap(c, X86_FEATURE_P4);
308 if (c->x86 == 6) 324 if (c->x86 == 6)
309 set_cpu_cap(c, X86_FEATURE_P3); 325 set_cpu_cap(c, X86_FEATURE_P3);
310
311 if (cpu_has_bts)
312 ptrace_bts_init_intel(c);
313
314#endif 326#endif
315 327
316 detect_extended_topology(c);
317 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { 328 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
318 /* 329 /*
319 * let's use the legacy cpuid vector 0x1 and 0x4 for topology 330 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 3f46afbb1cf1..48533d77be78 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -534,31 +534,16 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
534 per_cpu(cpuid4_info, cpu) = NULL; 534 per_cpu(cpuid4_info, cpu) = NULL;
535} 535}
536 536
537static int __cpuinit detect_cache_attributes(unsigned int cpu) 537static void __cpuinit get_cpu_leaves(void *_retval)
538{ 538{
539 struct _cpuid4_info *this_leaf; 539 int j, *retval = _retval, cpu = smp_processor_id();
540 unsigned long j;
541 int retval;
542 cpumask_t oldmask;
543
544 if (num_cache_leaves == 0)
545 return -ENOENT;
546
547 per_cpu(cpuid4_info, cpu) = kzalloc(
548 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
549 if (per_cpu(cpuid4_info, cpu) == NULL)
550 return -ENOMEM;
551
552 oldmask = current->cpus_allowed;
553 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
554 if (retval)
555 goto out;
556 540
557 /* Do cpuid and store the results */ 541 /* Do cpuid and store the results */
558 for (j = 0; j < num_cache_leaves; j++) { 542 for (j = 0; j < num_cache_leaves; j++) {
543 struct _cpuid4_info *this_leaf;
559 this_leaf = CPUID4_INFO_IDX(cpu, j); 544 this_leaf = CPUID4_INFO_IDX(cpu, j);
560 retval = cpuid4_cache_lookup(j, this_leaf); 545 *retval = cpuid4_cache_lookup(j, this_leaf);
561 if (unlikely(retval < 0)) { 546 if (unlikely(*retval < 0)) {
562 int i; 547 int i;
563 548
564 for (i = 0; i < j; i++) 549 for (i = 0; i < j; i++)
@@ -567,9 +552,21 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
567 } 552 }
568 cache_shared_cpu_map_setup(cpu, j); 553 cache_shared_cpu_map_setup(cpu, j);
569 } 554 }
570 set_cpus_allowed_ptr(current, &oldmask); 555}
556
557static int __cpuinit detect_cache_attributes(unsigned int cpu)
558{
559 int retval;
560
561 if (num_cache_leaves == 0)
562 return -ENOENT;
571 563
572out: 564 per_cpu(cpuid4_info, cpu) = kzalloc(
565 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
566 if (per_cpu(cpuid4_info, cpu) == NULL)
567 return -ENOMEM;
568
569 smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
573 if (retval) { 570 if (retval) {
574 kfree(per_cpu(cpuid4_info, cpu)); 571 kfree(per_cpu(cpuid4_info, cpu));
575 per_cpu(cpuid4_info, cpu) = NULL; 572 per_cpu(cpuid4_info, cpu) = NULL;
@@ -626,8 +623,8 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
626 cpumask_t *mask = &this_leaf->shared_cpu_map; 623 cpumask_t *mask = &this_leaf->shared_cpu_map;
627 624
628 n = type? 625 n = type?
629 cpulist_scnprintf(buf, len-2, *mask): 626 cpulist_scnprintf(buf, len-2, mask) :
630 cpumask_scnprintf(buf, len-2, *mask); 627 cpumask_scnprintf(buf, len-2, mask);
631 buf[n++] = '\n'; 628 buf[n++] = '\n';
632 buf[n] = '\0'; 629 buf[n] = '\0';
633 } 630 }
@@ -644,20 +641,17 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf)
644 return show_shared_cpu_map_func(leaf, 1, buf); 641 return show_shared_cpu_map_func(leaf, 1, buf);
645} 642}
646 643
647static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { 644static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
648 switch(this_leaf->eax.split.type) { 645{
649 case CACHE_TYPE_DATA: 646 switch (this_leaf->eax.split.type) {
647 case CACHE_TYPE_DATA:
650 return sprintf(buf, "Data\n"); 648 return sprintf(buf, "Data\n");
651 break; 649 case CACHE_TYPE_INST:
652 case CACHE_TYPE_INST:
653 return sprintf(buf, "Instruction\n"); 650 return sprintf(buf, "Instruction\n");
654 break; 651 case CACHE_TYPE_UNIFIED:
655 case CACHE_TYPE_UNIFIED:
656 return sprintf(buf, "Unified\n"); 652 return sprintf(buf, "Unified\n");
657 break; 653 default:
658 default:
659 return sprintf(buf, "Unknown\n"); 654 return sprintf(buf, "Unknown\n");
660 break;
661 } 655 }
662} 656}
663 657
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 4b031a4ac856..1c838032fd37 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -510,12 +510,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
510 */ 510 */
511void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 511void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
512{ 512{
513 static cpumask_t mce_cpus = CPU_MASK_NONE;
514
515 mce_cpu_quirks(c); 513 mce_cpu_quirks(c);
516 514
517 if (mce_dont_init || 515 if (mce_dont_init ||
518 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
519 !mce_available(c)) 516 !mce_available(c))
520 return; 517 return;
521 518
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 5eb390a4b2e9..a5a5e0530370 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -83,34 +83,41 @@ static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
83 * CPU Initialization 83 * CPU Initialization
84 */ 84 */
85 85
86struct thresh_restart {
87 struct threshold_block *b;
88 int reset;
89 u16 old_limit;
90};
91
86/* must be called with correct cpu affinity */ 92/* must be called with correct cpu affinity */
87static void threshold_restart_bank(struct threshold_block *b, 93static long threshold_restart_bank(void *_tr)
88 int reset, u16 old_limit)
89{ 94{
95 struct thresh_restart *tr = _tr;
90 u32 mci_misc_hi, mci_misc_lo; 96 u32 mci_misc_hi, mci_misc_lo;
91 97
92 rdmsr(b->address, mci_misc_lo, mci_misc_hi); 98 rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
93 99
94 if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) 100 if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
95 reset = 1; /* limit cannot be lower than err count */ 101 tr->reset = 1; /* limit cannot be lower than err count */
96 102
97 if (reset) { /* reset err count and overflow bit */ 103 if (tr->reset) { /* reset err count and overflow bit */
98 mci_misc_hi = 104 mci_misc_hi =
99 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | 105 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
100 (THRESHOLD_MAX - b->threshold_limit); 106 (THRESHOLD_MAX - tr->b->threshold_limit);
101 } else if (old_limit) { /* change limit w/o reset */ 107 } else if (tr->old_limit) { /* change limit w/o reset */
102 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 108 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
103 (old_limit - b->threshold_limit); 109 (tr->old_limit - tr->b->threshold_limit);
104 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 110 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
105 (new_count & THRESHOLD_MAX); 111 (new_count & THRESHOLD_MAX);
106 } 112 }
107 113
108 b->interrupt_enable ? 114 tr->b->interrupt_enable ?
109 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : 115 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
110 (mci_misc_hi &= ~MASK_INT_TYPE_HI); 116 (mci_misc_hi &= ~MASK_INT_TYPE_HI);
111 117
112 mci_misc_hi |= MASK_COUNT_EN_HI; 118 mci_misc_hi |= MASK_COUNT_EN_HI;
113 wrmsr(b->address, mci_misc_lo, mci_misc_hi); 119 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
120 return 0;
114} 121}
115 122
116/* cpu init entry point, called from mce.c with preempt off */ 123/* cpu init entry point, called from mce.c with preempt off */
@@ -120,6 +127,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
120 unsigned int cpu = smp_processor_id(); 127 unsigned int cpu = smp_processor_id();
121 u8 lvt_off; 128 u8 lvt_off;
122 u32 low = 0, high = 0, address = 0; 129 u32 low = 0, high = 0, address = 0;
130 struct thresh_restart tr;
123 131
124 for (bank = 0; bank < NR_BANKS; ++bank) { 132 for (bank = 0; bank < NR_BANKS; ++bank) {
125 for (block = 0; block < NR_BLOCKS; ++block) { 133 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,7 +170,10 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
162 wrmsr(address, low, high); 170 wrmsr(address, low, high);
163 171
164 threshold_defaults.address = address; 172 threshold_defaults.address = address;
165 threshold_restart_bank(&threshold_defaults, 0, 0); 173 tr.b = &threshold_defaults;
174 tr.reset = 0;
175 tr.old_limit = 0;
176 threshold_restart_bank(&tr);
166 } 177 }
167 } 178 }
168} 179}
@@ -237,7 +248,7 @@ asmlinkage void mce_threshold_interrupt(void)
237 } 248 }
238 } 249 }
239out: 250out:
240 add_pda(irq_threshold_count, 1); 251 inc_irq_stat(irq_threshold_count);
241 irq_exit(); 252 irq_exit();
242} 253}
243 254
@@ -251,20 +262,6 @@ struct threshold_attr {
251 ssize_t(*store) (struct threshold_block *, const char *, size_t count); 262 ssize_t(*store) (struct threshold_block *, const char *, size_t count);
252}; 263};
253 264
254static void affinity_set(unsigned int cpu, cpumask_t *oldmask,
255 cpumask_t *newmask)
256{
257 *oldmask = current->cpus_allowed;
258 cpus_clear(*newmask);
259 cpu_set(cpu, *newmask);
260 set_cpus_allowed_ptr(current, newmask);
261}
262
263static void affinity_restore(const cpumask_t *oldmask)
264{
265 set_cpus_allowed_ptr(current, oldmask);
266}
267
268#define SHOW_FIELDS(name) \ 265#define SHOW_FIELDS(name) \
269static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ 266static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
270{ \ 267{ \
@@ -277,15 +274,16 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
277 const char *buf, size_t count) 274 const char *buf, size_t count)
278{ 275{
279 char *end; 276 char *end;
280 cpumask_t oldmask, newmask; 277 struct thresh_restart tr;
281 unsigned long new = simple_strtoul(buf, &end, 0); 278 unsigned long new = simple_strtoul(buf, &end, 0);
282 if (end == buf) 279 if (end == buf)
283 return -EINVAL; 280 return -EINVAL;
284 b->interrupt_enable = !!new; 281 b->interrupt_enable = !!new;
285 282
286 affinity_set(b->cpu, &oldmask, &newmask); 283 tr.b = b;
287 threshold_restart_bank(b, 0, 0); 284 tr.reset = 0;
288 affinity_restore(&oldmask); 285 tr.old_limit = 0;
286 work_on_cpu(b->cpu, threshold_restart_bank, &tr);
289 287
290 return end - buf; 288 return end - buf;
291} 289}
@@ -294,8 +292,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
294 const char *buf, size_t count) 292 const char *buf, size_t count)
295{ 293{
296 char *end; 294 char *end;
297 cpumask_t oldmask, newmask; 295 struct thresh_restart tr;
298 u16 old;
299 unsigned long new = simple_strtoul(buf, &end, 0); 296 unsigned long new = simple_strtoul(buf, &end, 0);
300 if (end == buf) 297 if (end == buf)
301 return -EINVAL; 298 return -EINVAL;
@@ -303,34 +300,36 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
303 new = THRESHOLD_MAX; 300 new = THRESHOLD_MAX;
304 if (new < 1) 301 if (new < 1)
305 new = 1; 302 new = 1;
306 old = b->threshold_limit; 303 tr.old_limit = b->threshold_limit;
307 b->threshold_limit = new; 304 b->threshold_limit = new;
305 tr.b = b;
306 tr.reset = 0;
308 307
309 affinity_set(b->cpu, &oldmask, &newmask); 308 work_on_cpu(b->cpu, threshold_restart_bank, &tr);
310 threshold_restart_bank(b, 0, old);
311 affinity_restore(&oldmask);
312 309
313 return end - buf; 310 return end - buf;
314} 311}
315 312
316static ssize_t show_error_count(struct threshold_block *b, char *buf) 313static long local_error_count(void *_b)
317{ 314{
318 u32 high, low; 315 struct threshold_block *b = _b;
319 cpumask_t oldmask, newmask; 316 u32 low, high;
320 affinity_set(b->cpu, &oldmask, &newmask); 317
321 rdmsr(b->address, low, high); 318 rdmsr(b->address, low, high);
322 affinity_restore(&oldmask); 319 return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
323 return sprintf(buf, "%x\n", 320}
324 (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); 321
322static ssize_t show_error_count(struct threshold_block *b, char *buf)
323{
324 return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
325} 325}
326 326
327static ssize_t store_error_count(struct threshold_block *b, 327static ssize_t store_error_count(struct threshold_block *b,
328 const char *buf, size_t count) 328 const char *buf, size_t count)
329{ 329{
330 cpumask_t oldmask, newmask; 330 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
331 affinity_set(b->cpu, &oldmask, &newmask); 331
332 threshold_restart_bank(b, 1, 0); 332 work_on_cpu(b->cpu, threshold_restart_bank, &tr);
333 affinity_restore(&oldmask);
334 return 1; 333 return 1;
335} 334}
336 335
@@ -463,12 +462,19 @@ out_free:
463 return err; 462 return err;
464} 463}
465 464
465static long local_allocate_threshold_blocks(void *_bank)
466{
467 unsigned int *bank = _bank;
468
469 return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
470 MSR_IA32_MC0_MISC + *bank * 4);
471}
472
466/* symlinks sibling shared banks to first core. first core owns dir/files. */ 473/* symlinks sibling shared banks to first core. first core owns dir/files. */
467static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) 474static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
468{ 475{
469 int i, err = 0; 476 int i, err = 0;
470 struct threshold_bank *b = NULL; 477 struct threshold_bank *b = NULL;
471 cpumask_t oldmask, newmask;
472 char name[32]; 478 char name[32];
473 479
474 sprintf(name, "threshold_bank%i", bank); 480 sprintf(name, "threshold_bank%i", bank);
@@ -519,11 +525,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
519 525
520 per_cpu(threshold_banks, cpu)[bank] = b; 526 per_cpu(threshold_banks, cpu)[bank] = b;
521 527
522 affinity_set(cpu, &oldmask, &newmask); 528 err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
523 err = allocate_threshold_blocks(cpu, bank, 0,
524 MSR_IA32_MC0_MISC + bank * 4);
525 affinity_restore(&oldmask);
526
527 if (err) 529 if (err)
528 goto out_free; 530 goto out_free;
529 531
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index c17eaf5dd6dd..4b48f251fd39 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -26,7 +26,7 @@ asmlinkage void smp_thermal_interrupt(void)
26 if (therm_throt_process(msr_val & 1)) 26 if (therm_throt_process(msr_val & 1))
27 mce_log_therm_throt_event(smp_processor_id(), msr_val); 27 mce_log_therm_throt_event(smp_processor_id(), msr_val);
28 28
29 add_pda(irq_thermal_count, 1); 29 inc_irq_stat(irq_thermal_count);
30 irq_exit(); 30 irq_exit();
31} 31}
32 32
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 4e8d77f01eeb..b59ddcc88cd8 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -14,14 +14,6 @@
14#include <asm/pat.h> 14#include <asm/pat.h>
15#include "mtrr.h" 15#include "mtrr.h"
16 16
17struct mtrr_state {
18 struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
19 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
20 unsigned char enabled;
21 unsigned char have_fixed;
22 mtrr_type def_type;
23};
24
25struct fixed_range_block { 17struct fixed_range_block {
26 int base_msr; /* start address of an MTRR block */ 18 int base_msr; /* start address of an MTRR block */
27 int ranges; /* number of MTRRs in this block */ 19 int ranges; /* number of MTRRs in this block */
@@ -35,10 +27,12 @@ static struct fixed_range_block fixed_range_blocks[] = {
35}; 27};
36 28
37static unsigned long smp_changes_mask; 29static unsigned long smp_changes_mask;
38static struct mtrr_state mtrr_state = {};
39static int mtrr_state_set; 30static int mtrr_state_set;
40u64 mtrr_tom2; 31u64 mtrr_tom2;
41 32
33struct mtrr_state_type mtrr_state = {};
34EXPORT_SYMBOL_GPL(mtrr_state);
35
42#undef MODULE_PARAM_PREFIX 36#undef MODULE_PARAM_PREFIX
43#define MODULE_PARAM_PREFIX "mtrr." 37#define MODULE_PARAM_PREFIX "mtrr."
44 38
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index c78c04821ea1..d259e5d2e054 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -49,7 +49,7 @@
49 49
50u32 num_var_ranges = 0; 50u32 num_var_ranges = 0;
51 51
52unsigned int mtrr_usage_table[MAX_VAR_RANGES]; 52unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
53static DEFINE_MUTEX(mtrr_mutex); 53static DEFINE_MUTEX(mtrr_mutex);
54 54
55u64 size_or_mask, size_and_mask; 55u64 size_or_mask, size_and_mask;
@@ -574,7 +574,7 @@ struct mtrr_value {
574 unsigned long lsize; 574 unsigned long lsize;
575}; 575};
576 576
577static struct mtrr_value mtrr_state[MAX_VAR_RANGES]; 577static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
578 578
579static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 579static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
580{ 580{
@@ -803,6 +803,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
803} 803}
804 804
805static struct res_range __initdata range[RANGE_NUM]; 805static struct res_range __initdata range[RANGE_NUM];
806static int __initdata nr_range;
806 807
807#ifdef CONFIG_MTRR_SANITIZER 808#ifdef CONFIG_MTRR_SANITIZER
808 809
@@ -823,16 +824,14 @@ static int enable_mtrr_cleanup __initdata =
823 824
824static int __init disable_mtrr_cleanup_setup(char *str) 825static int __init disable_mtrr_cleanup_setup(char *str)
825{ 826{
826 if (enable_mtrr_cleanup != -1) 827 enable_mtrr_cleanup = 0;
827 enable_mtrr_cleanup = 0;
828 return 0; 828 return 0;
829} 829}
830early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); 830early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
831 831
832static int __init enable_mtrr_cleanup_setup(char *str) 832static int __init enable_mtrr_cleanup_setup(char *str)
833{ 833{
834 if (enable_mtrr_cleanup != -1) 834 enable_mtrr_cleanup = 1;
835 enable_mtrr_cleanup = 1;
836 return 0; 835 return 0;
837} 836}
838early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); 837early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
@@ -1206,39 +1205,43 @@ struct mtrr_cleanup_result {
1206#define PSHIFT (PAGE_SHIFT - 10) 1205#define PSHIFT (PAGE_SHIFT - 10)
1207 1206
1208static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; 1207static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
1209static struct res_range __initdata range_new[RANGE_NUM];
1210static unsigned long __initdata min_loss_pfn[RANGE_NUM]; 1208static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1211 1209
1212static int __init mtrr_cleanup(unsigned address_bits) 1210static void __init print_out_mtrr_range_state(void)
1213{ 1211{
1214 unsigned long extra_remove_base, extra_remove_size;
1215 unsigned long base, size, def, dummy;
1216 mtrr_type type;
1217 int nr_range, nr_range_new;
1218 u64 chunk_size, gran_size;
1219 unsigned long range_sums, range_sums_new;
1220 int index_good;
1221 int num_reg_good;
1222 int i; 1212 int i;
1213 char start_factor = 'K', size_factor = 'K';
1214 unsigned long start_base, size_base;
1215 mtrr_type type;
1223 1216
1224 /* extra one for all 0 */ 1217 for (i = 0; i < num_var_ranges; i++) {
1225 int num[MTRR_NUM_TYPES + 1];
1226 1218
1227 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) 1219 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1228 return 0; 1220 if (!size_base)
1229 rdmsr(MTRRdefType_MSR, def, dummy); 1221 continue;
1230 def &= 0xff;
1231 if (def != MTRR_TYPE_UNCACHABLE)
1232 return 0;
1233 1222
1234 /* get it and store it aside */ 1223 size_base = to_size_factor(size_base, &size_factor),
1235 memset(range_state, 0, sizeof(range_state)); 1224 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1236 for (i = 0; i < num_var_ranges; i++) { 1225 start_base = to_size_factor(start_base, &start_factor),
1237 mtrr_if->get(i, &base, &size, &type); 1226 type = range_state[i].type;
1238 range_state[i].base_pfn = base; 1227
1239 range_state[i].size_pfn = size; 1228 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1240 range_state[i].type = type; 1229 i, start_base, start_factor,
1230 size_base, size_factor,
1231 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1232 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1233 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1234 );
1241 } 1235 }
1236}
1237
1238static int __init mtrr_need_cleanup(void)
1239{
1240 int i;
1241 mtrr_type type;
1242 unsigned long size;
1243 /* extra one for all 0 */
1244 int num[MTRR_NUM_TYPES + 1];
1242 1245
1243 /* check entries number */ 1246 /* check entries number */
1244 memset(num, 0, sizeof(num)); 1247 memset(num, 0, sizeof(num));
@@ -1263,29 +1266,133 @@ static int __init mtrr_cleanup(unsigned address_bits)
1263 num_var_ranges - num[MTRR_NUM_TYPES]) 1266 num_var_ranges - num[MTRR_NUM_TYPES])
1264 return 0; 1267 return 0;
1265 1268
1266 /* print original var MTRRs at first, for debugging: */ 1269 return 1;
1267 printk(KERN_DEBUG "original variable MTRRs\n"); 1270}
1268 for (i = 0; i < num_var_ranges; i++) {
1269 char start_factor = 'K', size_factor = 'K';
1270 unsigned long start_base, size_base;
1271 1271
1272 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); 1272static unsigned long __initdata range_sums;
1273 if (!size_base) 1273static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
1274 continue; 1274 unsigned long extra_remove_base,
1275 unsigned long extra_remove_size,
1276 int i)
1277{
1278 int num_reg;
1279 static struct res_range range_new[RANGE_NUM];
1280 static int nr_range_new;
1281 unsigned long range_sums_new;
1282
1283 /* convert ranges to var ranges state */
1284 num_reg = x86_setup_var_mtrrs(range, nr_range,
1285 chunk_size, gran_size);
1286
1287 /* we got new setting in range_state, check it */
1288 memset(range_new, 0, sizeof(range_new));
1289 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1290 extra_remove_base, extra_remove_size);
1291 range_sums_new = sum_ranges(range_new, nr_range_new);
1292
1293 result[i].chunk_sizek = chunk_size >> 10;
1294 result[i].gran_sizek = gran_size >> 10;
1295 result[i].num_reg = num_reg;
1296 if (range_sums < range_sums_new) {
1297 result[i].lose_cover_sizek =
1298 (range_sums_new - range_sums) << PSHIFT;
1299 result[i].bad = 1;
1300 } else
1301 result[i].lose_cover_sizek =
1302 (range_sums - range_sums_new) << PSHIFT;
1275 1303
1276 size_base = to_size_factor(size_base, &size_factor), 1304 /* double check it */
1277 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); 1305 if (!result[i].bad && !result[i].lose_cover_sizek) {
1278 start_base = to_size_factor(start_base, &start_factor), 1306 if (nr_range_new != nr_range ||
1279 type = range_state[i].type; 1307 memcmp(range, range_new, sizeof(range)))
1308 result[i].bad = 1;
1309 }
1280 1310
1281 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", 1311 if (!result[i].bad && (range_sums - range_sums_new <
1282 i, start_base, start_factor, 1312 min_loss_pfn[num_reg])) {
1283 size_base, size_factor, 1313 min_loss_pfn[num_reg] =
1284 (type == MTRR_TYPE_UNCACHABLE) ? "UC" : 1314 range_sums - range_sums_new;
1285 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1286 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1287 );
1288 } 1315 }
1316}
1317
1318static void __init mtrr_print_out_one_result(int i)
1319{
1320 char gran_factor, chunk_factor, lose_factor;
1321 unsigned long gran_base, chunk_base, lose_base;
1322
1323 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1324 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1325 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1326 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1327 result[i].bad ? "*BAD*" : " ",
1328 gran_base, gran_factor, chunk_base, chunk_factor);
1329 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1330 result[i].num_reg, result[i].bad ? "-" : "",
1331 lose_base, lose_factor);
1332}
1333
1334static int __init mtrr_search_optimal_index(void)
1335{
1336 int i;
1337 int num_reg_good;
1338 int index_good;
1339
1340 if (nr_mtrr_spare_reg >= num_var_ranges)
1341 nr_mtrr_spare_reg = num_var_ranges - 1;
1342 num_reg_good = -1;
1343 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1344 if (!min_loss_pfn[i])
1345 num_reg_good = i;
1346 }
1347
1348 index_good = -1;
1349 if (num_reg_good != -1) {
1350 for (i = 0; i < NUM_RESULT; i++) {
1351 if (!result[i].bad &&
1352 result[i].num_reg == num_reg_good &&
1353 !result[i].lose_cover_sizek) {
1354 index_good = i;
1355 break;
1356 }
1357 }
1358 }
1359
1360 return index_good;
1361}
1362
1363
1364static int __init mtrr_cleanup(unsigned address_bits)
1365{
1366 unsigned long extra_remove_base, extra_remove_size;
1367 unsigned long base, size, def, dummy;
1368 mtrr_type type;
1369 u64 chunk_size, gran_size;
1370 int index_good;
1371 int i;
1372
1373 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
1374 return 0;
1375 rdmsr(MTRRdefType_MSR, def, dummy);
1376 def &= 0xff;
1377 if (def != MTRR_TYPE_UNCACHABLE)
1378 return 0;
1379
1380 /* get it and store it aside */
1381 memset(range_state, 0, sizeof(range_state));
1382 for (i = 0; i < num_var_ranges; i++) {
1383 mtrr_if->get(i, &base, &size, &type);
1384 range_state[i].base_pfn = base;
1385 range_state[i].size_pfn = size;
1386 range_state[i].type = type;
1387 }
1388
1389 /* check if we need handle it and can handle it */
1390 if (!mtrr_need_cleanup())
1391 return 0;
1392
1393 /* print original var MTRRs at first, for debugging: */
1394 printk(KERN_DEBUG "original variable MTRRs\n");
1395 print_out_mtrr_range_state();
1289 1396
1290 memset(range, 0, sizeof(range)); 1397 memset(range, 0, sizeof(range));
1291 extra_remove_size = 0; 1398 extra_remove_size = 0;
@@ -1309,176 +1416,64 @@ static int __init mtrr_cleanup(unsigned address_bits)
1309 range_sums >> (20 - PAGE_SHIFT)); 1416 range_sums >> (20 - PAGE_SHIFT));
1310 1417
1311 if (mtrr_chunk_size && mtrr_gran_size) { 1418 if (mtrr_chunk_size && mtrr_gran_size) {
1312 int num_reg; 1419 i = 0;
1313 char gran_factor, chunk_factor, lose_factor; 1420 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
1314 unsigned long gran_base, chunk_base, lose_base; 1421 extra_remove_base, extra_remove_size, i);
1315
1316 debug_print++;
1317 /* convert ranges to var ranges state */
1318 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1319 mtrr_gran_size);
1320 1422
1321 /* we got new setting in range_state, check it */ 1423 mtrr_print_out_one_result(i);
1322 memset(range_new, 0, sizeof(range_new));
1323 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1324 extra_remove_base,
1325 extra_remove_size);
1326 range_sums_new = sum_ranges(range_new, nr_range_new);
1327 1424
1328 i = 0;
1329 result[i].chunk_sizek = mtrr_chunk_size >> 10;
1330 result[i].gran_sizek = mtrr_gran_size >> 10;
1331 result[i].num_reg = num_reg;
1332 if (range_sums < range_sums_new) {
1333 result[i].lose_cover_sizek =
1334 (range_sums_new - range_sums) << PSHIFT;
1335 result[i].bad = 1;
1336 } else
1337 result[i].lose_cover_sizek =
1338 (range_sums - range_sums_new) << PSHIFT;
1339
1340 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1341 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1342 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1343 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1344 result[i].bad?"*BAD*":" ",
1345 gran_base, gran_factor, chunk_base, chunk_factor);
1346 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1347 result[i].num_reg, result[i].bad?"-":"",
1348 lose_base, lose_factor);
1349 if (!result[i].bad) { 1425 if (!result[i].bad) {
1350 set_var_mtrr_all(address_bits); 1426 set_var_mtrr_all(address_bits);
1351 return 1; 1427 return 1;
1352 } 1428 }
1353 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " 1429 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1354 "will find optimal one\n"); 1430 "will find optimal one\n");
1355 debug_print--;
1356 memset(result, 0, sizeof(result[0]));
1357 } 1431 }
1358 1432
1359 i = 0; 1433 i = 0;
1360 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); 1434 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1361 memset(result, 0, sizeof(result)); 1435 memset(result, 0, sizeof(result));
1362 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { 1436 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1363 char gran_factor;
1364 unsigned long gran_base;
1365
1366 if (debug_print)
1367 gran_base = to_size_factor(gran_size >> 10, &gran_factor);
1368 1437
1369 for (chunk_size = gran_size; chunk_size < (1ULL<<32); 1438 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1370 chunk_size <<= 1) { 1439 chunk_size <<= 1) {
1371 int num_reg;
1372 1440
1373 if (debug_print) {
1374 char chunk_factor;
1375 unsigned long chunk_base;
1376
1377 chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
1378 printk(KERN_INFO "\n");
1379 printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
1380 gran_base, gran_factor, chunk_base, chunk_factor);
1381 }
1382 if (i >= NUM_RESULT) 1441 if (i >= NUM_RESULT)
1383 continue; 1442 continue;
1384 1443
1385 /* convert ranges to var ranges state */ 1444 mtrr_calc_range_state(chunk_size, gran_size,
1386 num_reg = x86_setup_var_mtrrs(range, nr_range, 1445 extra_remove_base, extra_remove_size, i);
1387 chunk_size, gran_size); 1446 if (debug_print) {
1388 1447 mtrr_print_out_one_result(i);
1389 /* we got new setting in range_state, check it */ 1448 printk(KERN_INFO "\n");
1390 memset(range_new, 0, sizeof(range_new));
1391 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1392 extra_remove_base, extra_remove_size);
1393 range_sums_new = sum_ranges(range_new, nr_range_new);
1394
1395 result[i].chunk_sizek = chunk_size >> 10;
1396 result[i].gran_sizek = gran_size >> 10;
1397 result[i].num_reg = num_reg;
1398 if (range_sums < range_sums_new) {
1399 result[i].lose_cover_sizek =
1400 (range_sums_new - range_sums) << PSHIFT;
1401 result[i].bad = 1;
1402 } else
1403 result[i].lose_cover_sizek =
1404 (range_sums - range_sums_new) << PSHIFT;
1405
1406 /* double check it */
1407 if (!result[i].bad && !result[i].lose_cover_sizek) {
1408 if (nr_range_new != nr_range ||
1409 memcmp(range, range_new, sizeof(range)))
1410 result[i].bad = 1;
1411 } 1449 }
1412 1450
1413 if (!result[i].bad && (range_sums - range_sums_new <
1414 min_loss_pfn[num_reg])) {
1415 min_loss_pfn[num_reg] =
1416 range_sums - range_sums_new;
1417 }
1418 i++; 1451 i++;
1419 } 1452 }
1420 } 1453 }
1421 1454
1422 /* print out all */
1423 for (i = 0; i < NUM_RESULT; i++) {
1424 char gran_factor, chunk_factor, lose_factor;
1425 unsigned long gran_base, chunk_base, lose_base;
1426
1427 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1428 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1429 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1430 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1431 result[i].bad?"*BAD*":" ",
1432 gran_base, gran_factor, chunk_base, chunk_factor);
1433 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1434 result[i].num_reg, result[i].bad?"-":"",
1435 lose_base, lose_factor);
1436 }
1437
1438 /* try to find the optimal index */ 1455 /* try to find the optimal index */
1439 if (nr_mtrr_spare_reg >= num_var_ranges) 1456 index_good = mtrr_search_optimal_index();
1440 nr_mtrr_spare_reg = num_var_ranges - 1;
1441 num_reg_good = -1;
1442 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1443 if (!min_loss_pfn[i])
1444 num_reg_good = i;
1445 }
1446
1447 index_good = -1;
1448 if (num_reg_good != -1) {
1449 for (i = 0; i < NUM_RESULT; i++) {
1450 if (!result[i].bad &&
1451 result[i].num_reg == num_reg_good &&
1452 !result[i].lose_cover_sizek) {
1453 index_good = i;
1454 break;
1455 }
1456 }
1457 }
1458 1457
1459 if (index_good != -1) { 1458 if (index_good != -1) {
1460 char gran_factor, chunk_factor, lose_factor;
1461 unsigned long gran_base, chunk_base, lose_base;
1462
1463 printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); 1459 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1464 i = index_good; 1460 i = index_good;
1465 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 1461 mtrr_print_out_one_result(i);
1466 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 1462
1467 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1468 printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
1469 gran_base, gran_factor, chunk_base, chunk_factor);
1470 printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
1471 result[i].num_reg, lose_base, lose_factor);
1472 /* convert ranges to var ranges state */ 1463 /* convert ranges to var ranges state */
1473 chunk_size = result[i].chunk_sizek; 1464 chunk_size = result[i].chunk_sizek;
1474 chunk_size <<= 10; 1465 chunk_size <<= 10;
1475 gran_size = result[i].gran_sizek; 1466 gran_size = result[i].gran_sizek;
1476 gran_size <<= 10; 1467 gran_size <<= 10;
1477 debug_print++;
1478 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); 1468 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1479 debug_print--;
1480 set_var_mtrr_all(address_bits); 1469 set_var_mtrr_all(address_bits);
1470 printk(KERN_DEBUG "New variable MTRRs\n");
1471 print_out_mtrr_range_state();
1481 return 1; 1472 return 1;
1473 } else {
1474 /* print out all */
1475 for (i = 0; i < NUM_RESULT; i++)
1476 mtrr_print_out_one_result(i);
1482 } 1477 }
1483 1478
1484 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); 1479 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
@@ -1562,7 +1557,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1562{ 1557{
1563 unsigned long i, base, size, highest_pfn = 0, def, dummy; 1558 unsigned long i, base, size, highest_pfn = 0, def, dummy;
1564 mtrr_type type; 1559 mtrr_type type;
1565 int nr_range;
1566 u64 total_trim_size; 1560 u64 total_trim_size;
1567 1561
1568 /* extra one for all 0 */ 1562 /* extra one for all 0 */
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2dc4ec656b23..ffd60409cc6d 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -8,11 +8,6 @@
8#define MTRRcap_MSR 0x0fe 8#define MTRRcap_MSR 0x0fe
9#define MTRRdefType_MSR 0x2ff 9#define MTRRdefType_MSR 0x2ff
10 10
11#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
12#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
13
14#define NUM_FIXED_RANGES 88
15#define MAX_VAR_RANGES 256
16#define MTRRfix64K_00000_MSR 0x250 11#define MTRRfix64K_00000_MSR 0x250
17#define MTRRfix16K_80000_MSR 0x258 12#define MTRRfix16K_80000_MSR 0x258
18#define MTRRfix16K_A0000_MSR 0x259 13#define MTRRfix16K_A0000_MSR 0x259
@@ -29,11 +24,7 @@
29#define MTRR_CHANGE_MASK_VARIABLE 0x02 24#define MTRR_CHANGE_MASK_VARIABLE 0x02
30#define MTRR_CHANGE_MASK_DEFTYPE 0x04 25#define MTRR_CHANGE_MASK_DEFTYPE 0x04
31 26
32/* In the Intel processor's MTRR interface, the MTRR type is always held in 27extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
33 an 8 bit field: */
34typedef u8 mtrr_type;
35
36extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
37 28
38struct mtrr_ops { 29struct mtrr_ops {
39 u32 vendor; 30 u32 vendor;
@@ -70,13 +61,6 @@ struct set_mtrr_context {
70 u32 ccr3; 61 u32 ccr3;
71}; 62};
72 63
73struct mtrr_var_range {
74 u32 base_lo;
75 u32 base_hi;
76 u32 mask_lo;
77 u32 mask_hi;
78};
79
80void set_mtrr_done(struct set_mtrr_context *ctxt); 64void set_mtrr_done(struct set_mtrr_context *ctxt);
81void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); 65void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
82void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); 66void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
new file mode 100644
index 000000000000..284c399e3234
--- /dev/null
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -0,0 +1,112 @@
1/*
2 * VMware Detection code.
3 *
4 * Copyright (C) 2008, VMware, Inc.
5 * Author : Alok N Kataria <akataria@vmware.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
15 * NON INFRINGEMENT. See the GNU General Public License for more
16 * details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 */
23
24#include <linux/dmi.h>
25#include <asm/div64.h>
26#include <asm/vmware.h>
27
28#define CPUID_VMWARE_INFO_LEAF 0x40000000
29#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
30#define VMWARE_HYPERVISOR_PORT 0x5658
31
32#define VMWARE_PORT_CMD_GETVERSION 10
33#define VMWARE_PORT_CMD_GETHZ 45
34
35#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
36 __asm__("inl (%%dx)" : \
37 "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
38 "0"(VMWARE_HYPERVISOR_MAGIC), \
39 "1"(VMWARE_PORT_CMD_##cmd), \
40 "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
41 "memory");
42
43static inline int __vmware_platform(void)
44{
45 uint32_t eax, ebx, ecx, edx;
46 VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx);
47 return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
48}
49
50static unsigned long __vmware_get_tsc_khz(void)
51{
52 uint64_t tsc_hz;
53 uint32_t eax, ebx, ecx, edx;
54
55 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
56
57 if (ebx == UINT_MAX)
58 return 0;
59 tsc_hz = eax | (((uint64_t)ebx) << 32);
60 do_div(tsc_hz, 1000);
61 BUG_ON(tsc_hz >> 32);
62 return tsc_hz;
63}
64
65/*
66 * While checking the dmi string infomation, just checking the product
67 * serial key should be enough, as this will always have a VMware
68 * specific string when running under VMware hypervisor.
69 */
70int vmware_platform(void)
71{
72 if (cpu_has_hypervisor) {
73 unsigned int eax, ebx, ecx, edx;
74 char hyper_vendor_id[13];
75
76 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
77 memcpy(hyper_vendor_id + 0, &ebx, 4);
78 memcpy(hyper_vendor_id + 4, &ecx, 4);
79 memcpy(hyper_vendor_id + 8, &edx, 4);
80 hyper_vendor_id[12] = '\0';
81 if (!strcmp(hyper_vendor_id, "VMwareVMware"))
82 return 1;
83 } else if (dmi_available && dmi_name_in_serial("VMware") &&
84 __vmware_platform())
85 return 1;
86
87 return 0;
88}
89
90unsigned long vmware_get_tsc_khz(void)
91{
92 BUG_ON(!vmware_platform());
93 return __vmware_get_tsc_khz();
94}
95
96/*
97 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
98 * Still, due to timing difference when running on virtual cpus, the TSC can
99 * be marked as unstable in some cases. For example, the TSC sync check at
100 * bootup can fail due to a marginal offset between vcpus' TSCs (though the
101 * TSCs do not drift from each other). Also, the ACPI PM timer clocksource
102 * is not suitable as a watchdog when running on a hypervisor because the
103 * kernel may miss a wrap of the counter if the vcpu is descheduled for a
104 * long time. To skip these checks at runtime we set these capability bits,
105 * so that the kernel could just trust the hypervisor with providing a
106 * reliable virtual TSC that is suitable for timekeeping.
107 */
108void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
109{
110 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
111 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
112}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 72cefd1e649b..2ac1f0c2beb3 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -39,10 +39,10 @@
39#include <linux/device.h> 39#include <linux/device.h>
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/uaccess.h>
42 43
43#include <asm/processor.h> 44#include <asm/processor.h>
44#include <asm/msr.h> 45#include <asm/msr.h>
45#include <asm/uaccess.h>
46#include <asm/system.h> 46#include <asm/system.h>
47 47
48static struct class *cpuid_class; 48static struct class *cpuid_class;
@@ -82,7 +82,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
82} 82}
83 83
84static ssize_t cpuid_read(struct file *file, char __user *buf, 84static ssize_t cpuid_read(struct file *file, char __user *buf,
85 size_t count, loff_t * ppos) 85 size_t count, loff_t *ppos)
86{ 86{
87 char __user *tmp = buf; 87 char __user *tmp = buf;
88 struct cpuid_regs cmd; 88 struct cpuid_regs cmd;
@@ -117,11 +117,11 @@ static int cpuid_open(struct inode *inode, struct file *file)
117 unsigned int cpu; 117 unsigned int cpu;
118 struct cpuinfo_x86 *c; 118 struct cpuinfo_x86 *c;
119 int ret = 0; 119 int ret = 0;
120 120
121 lock_kernel(); 121 lock_kernel();
122 122
123 cpu = iminor(file->f_path.dentry->d_inode); 123 cpu = iminor(file->f_path.dentry->d_inode);
124 if (cpu >= NR_CPUS || !cpu_online(cpu)) { 124 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
125 ret = -ENXIO; /* No such CPU */ 125 ret = -ENXIO; /* No such CPU */
126 goto out; 126 goto out;
127 } 127 }
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 268553817909..c689d19e35ab 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -26,37 +26,21 @@
26#include <linux/kdebug.h> 26#include <linux/kdebug.h>
27#include <asm/smp.h> 27#include <asm/smp.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h>
29 30
30#include <mach_ipi.h> 31#include <mach_ipi.h>
31 32
32/* This keeps a track of which one is crashing cpu. */
33static int crashing_cpu;
34 33
35#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 34#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
36static atomic_t waiting_for_crash_ipi;
37 35
38static int crash_nmi_callback(struct notifier_block *self, 36static void kdump_nmi_callback(int cpu, struct die_args *args)
39 unsigned long val, void *data)
40{ 37{
41 struct pt_regs *regs; 38 struct pt_regs *regs;
42#ifdef CONFIG_X86_32 39#ifdef CONFIG_X86_32
43 struct pt_regs fixed_regs; 40 struct pt_regs fixed_regs;
44#endif 41#endif
45 int cpu;
46 42
47 if (val != DIE_NMI_IPI) 43 regs = args->regs;
48 return NOTIFY_OK;
49
50 regs = ((struct die_args *)data)->regs;
51 cpu = raw_smp_processor_id();
52
53 /* Don't do anything if this handler is invoked on crashing cpu.
54 * Otherwise, system will completely hang. Crashing cpu can get
55 * an NMI if system was initially booted with nmi_watchdog parameter.
56 */
57 if (cpu == crashing_cpu)
58 return NOTIFY_STOP;
59 local_irq_disable();
60 44
61#ifdef CONFIG_X86_32 45#ifdef CONFIG_X86_32
62 if (!user_mode_vm(regs)) { 46 if (!user_mode_vm(regs)) {
@@ -65,54 +49,28 @@ static int crash_nmi_callback(struct notifier_block *self,
65 } 49 }
66#endif 50#endif
67 crash_save_cpu(regs, cpu); 51 crash_save_cpu(regs, cpu);
68 disable_local_APIC();
69 atomic_dec(&waiting_for_crash_ipi);
70 /* Assume hlt works */
71 halt();
72 for (;;)
73 cpu_relax();
74 52
75 return 1; 53 /* Disable VMX or SVM if needed.
76} 54 *
55 * We need to disable virtualization on all CPUs.
56 * Having VMX or SVM enabled on any CPU may break rebooting
57 * after the kdump kernel has finished its task.
58 */
59 cpu_emergency_vmxoff();
60 cpu_emergency_svm_disable();
77 61
78static void smp_send_nmi_allbutself(void) 62 disable_local_APIC();
79{
80 cpumask_t mask = cpu_online_map;
81 cpu_clear(safe_smp_processor_id(), mask);
82 if (!cpus_empty(mask))
83 send_IPI_mask(mask, NMI_VECTOR);
84} 63}
85 64
86static struct notifier_block crash_nmi_nb = { 65static void kdump_nmi_shootdown_cpus(void)
87 .notifier_call = crash_nmi_callback,
88};
89
90static void nmi_shootdown_cpus(void)
91{ 66{
92 unsigned long msecs; 67 nmi_shootdown_cpus(kdump_nmi_callback);
93
94 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
95 /* Would it be better to replace the trap vector here? */
96 if (register_die_notifier(&crash_nmi_nb))
97 return; /* return what? */
98 /* Ensure the new callback function is set before sending
99 * out the NMI
100 */
101 wmb();
102
103 smp_send_nmi_allbutself();
104
105 msecs = 1000; /* Wait at most a second for the other cpus to stop */
106 while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
107 mdelay(1);
108 msecs--;
109 }
110 68
111 /* Leave the nmi callback set */
112 disable_local_APIC(); 69 disable_local_APIC();
113} 70}
71
114#else 72#else
115static void nmi_shootdown_cpus(void) 73static void kdump_nmi_shootdown_cpus(void)
116{ 74{
117 /* There are no cpus to shootdown */ 75 /* There are no cpus to shootdown */
118} 76}
@@ -131,9 +89,15 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
131 /* The kernel is broken so disable interrupts */ 89 /* The kernel is broken so disable interrupts */
132 local_irq_disable(); 90 local_irq_disable();
133 91
134 /* Make a note of crashing cpu. Will be used in NMI callback.*/ 92 kdump_nmi_shootdown_cpus();
135 crashing_cpu = safe_smp_processor_id(); 93
136 nmi_shootdown_cpus(); 94 /* Booting kdump kernel with VMX or SVM enabled won't work,
95 * because (among other limitations) we can't disable paging
96 * with the virt flags.
97 */
98 cpu_emergency_vmxoff();
99 cpu_emergency_svm_disable();
100
137 lapic_shutdown(); 101 lapic_shutdown();
138#if defined(CONFIG_X86_IO_APIC) 102#if defined(CONFIG_X86_IO_APIC)
139 disable_IO_APIC(); 103 disable_IO_APIC();
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index a2d1176c38ee..da91701a2348 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -6,14 +6,13 @@
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * It manages: 8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS 9 * - DS and BTS hardware configuration
10 * - buffer memory allocation (optional) 10 * - buffer overflow handling (to be done)
11 * - buffer overflow handling
12 * - buffer access 11 * - buffer access
13 * 12 *
14 * It assumes: 13 * It does not do:
15 * - get_task_struct on all parameter tasks 14 * - security checking (is the caller allowed to trace the task)
16 * - current is allowed to trace parameter tasks 15 * - buffer allocation (memory accounting)
17 * 16 *
18 * 17 *
19 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2008 Intel Corporation.
@@ -28,22 +27,69 @@
28#include <linux/slab.h> 27#include <linux/slab.h>
29#include <linux/sched.h> 28#include <linux/sched.h>
30#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/kernel.h>
31 31
32 32
33/* 33/*
34 * The configuration for a particular DS hardware implementation. 34 * The configuration for a particular DS hardware implementation.
35 */ 35 */
36struct ds_configuration { 36struct ds_configuration {
37 /* the size of the DS structure in bytes */ 37 /* the name of the configuration */
38 unsigned char sizeof_ds; 38 const char *name;
39 /* the size of one pointer-typed field in the DS structure in bytes; 39 /* the size of one pointer-typed field in the DS structure and
40 this covers the first 8 fields related to buffer management. */ 40 in the BTS and PEBS buffers in bytes;
41 this covers the first 8 DS fields related to buffer management. */
41 unsigned char sizeof_field; 42 unsigned char sizeof_field;
42 /* the size of a BTS/PEBS record in bytes */ 43 /* the size of a BTS/PEBS record in bytes */
43 unsigned char sizeof_rec[2]; 44 unsigned char sizeof_rec[2];
45 /* a series of bit-masks to control various features indexed
46 * by enum ds_feature */
47 unsigned long ctl[dsf_ctl_max];
44}; 48};
45static struct ds_configuration ds_cfg; 49static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
46 50
51#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
52
53#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */
54#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */
55#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
56
57#define BTS_CONTROL \
58 (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
59 ds_cfg.ctl[dsf_bts_overflow])
60
61
62/*
63 * A BTS or PEBS tracer.
64 *
65 * This holds the configuration of the tracer and serves as a handle
66 * to identify tracers.
67 */
68struct ds_tracer {
69 /* the DS context (partially) owned by this tracer */
70 struct ds_context *context;
71 /* the buffer provided on ds_request() and its size in bytes */
72 void *buffer;
73 size_t size;
74};
75
76struct bts_tracer {
77 /* the common DS part */
78 struct ds_tracer ds;
79 /* the trace including the DS configuration */
80 struct bts_trace trace;
81 /* buffer overflow notification function */
82 bts_ovfl_callback_t ovfl;
83};
84
85struct pebs_tracer {
86 /* the common DS part */
87 struct ds_tracer ds;
88 /* the trace including the DS configuration */
89 struct pebs_trace trace;
90 /* buffer overflow notification function */
91 pebs_ovfl_callback_t ovfl;
92};
47 93
48/* 94/*
49 * Debug Store (DS) save area configuration (see Intel64 and IA32 95 * Debug Store (DS) save area configuration (see Intel64 and IA32
@@ -109,32 +155,9 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
109 155
110 156
111/* 157/*
112 * Locking is done only for allocating BTS or PEBS resources and for 158 * Locking is done only for allocating BTS or PEBS resources.
113 * guarding context and buffer memory allocation.
114 *
115 * Most functions require the current task to own the ds context part
116 * they are going to access. All the locking is done when validating
117 * access to the context.
118 */ 159 */
119static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); 160static DEFINE_SPINLOCK(ds_lock);
120
121/*
122 * Validate that the current task is allowed to access the BTS/PEBS
123 * buffer of the parameter task.
124 *
125 * Returns 0, if access is granted; -Eerrno, otherwise.
126 */
127static inline int ds_validate_access(struct ds_context *context,
128 enum ds_qualifier qual)
129{
130 if (!context)
131 return -EPERM;
132
133 if (context->owner[qual] == current)
134 return 0;
135
136 return -EPERM;
137}
138 161
139 162
140/* 163/*
@@ -150,27 +173,32 @@ static inline int ds_validate_access(struct ds_context *context,
150 * >0 number of per-thread tracers 173 * >0 number of per-thread tracers
151 * <0 number of per-cpu tracers 174 * <0 number of per-cpu tracers
152 * 175 *
153 * The below functions to get and put tracers and to check the
154 * allocation type require the ds_lock to be held by the caller.
155 *
156 * Tracers essentially gives the number of ds contexts for a certain 176 * Tracers essentially gives the number of ds contexts for a certain
157 * type of allocation. 177 * type of allocation.
158 */ 178 */
159static long tracers; 179static atomic_t tracers = ATOMIC_INIT(0);
160 180
161static inline void get_tracer(struct task_struct *task) 181static inline void get_tracer(struct task_struct *task)
162{ 182{
163 tracers += (task ? 1 : -1); 183 if (task)
184 atomic_inc(&tracers);
185 else
186 atomic_dec(&tracers);
164} 187}
165 188
166static inline void put_tracer(struct task_struct *task) 189static inline void put_tracer(struct task_struct *task)
167{ 190{
168 tracers -= (task ? 1 : -1); 191 if (task)
192 atomic_dec(&tracers);
193 else
194 atomic_inc(&tracers);
169} 195}
170 196
171static inline int check_tracer(struct task_struct *task) 197static inline int check_tracer(struct task_struct *task)
172{ 198{
173 return (task ? (tracers >= 0) : (tracers <= 0)); 199 return task ?
200 (atomic_read(&tracers) >= 0) :
201 (atomic_read(&tracers) <= 0);
174} 202}
175 203
176 204
@@ -183,99 +211,70 @@ static inline int check_tracer(struct task_struct *task)
183 * 211 *
184 * Contexts are use-counted. They are allocated on first access and 212 * Contexts are use-counted. They are allocated on first access and
185 * deallocated when the last user puts the context. 213 * deallocated when the last user puts the context.
186 *
187 * We distinguish between an allocating and a non-allocating get of a
188 * context:
189 * - the allocating get is used for requesting BTS/PEBS resources. It
190 * requires the caller to hold the global ds_lock.
191 * - the non-allocating get is used for all other cases. A
192 * non-existing context indicates an error. It acquires and releases
193 * the ds_lock itself for obtaining the context.
194 *
195 * A context and its DS configuration are allocated and deallocated
196 * together. A context always has a DS configuration of the
197 * appropriate size.
198 */
199static DEFINE_PER_CPU(struct ds_context *, system_context);
200
201#define this_system_context per_cpu(system_context, smp_processor_id())
202
203/*
204 * Returns the pointer to the parameter task's context or to the
205 * system-wide context, if task is NULL.
206 *
207 * Increases the use count of the returned context, if not NULL.
208 */ 214 */
209static inline struct ds_context *ds_get_context(struct task_struct *task) 215struct ds_context {
210{ 216 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
211 struct ds_context *context; 217 unsigned char ds[MAX_SIZEOF_DS];
212 unsigned long irq; 218 /* the owner of the BTS and PEBS configuration, respectively */
219 struct bts_tracer *bts_master;
220 struct pebs_tracer *pebs_master;
221 /* use count */
222 unsigned long count;
223 /* a pointer to the context location inside the thread_struct
224 * or the per_cpu context array */
225 struct ds_context **this;
226 /* a pointer to the task owning this context, or NULL, if the
227 * context is owned by a cpu */
228 struct task_struct *task;
229};
213 230
214 spin_lock_irqsave(&ds_lock, irq); 231static DEFINE_PER_CPU(struct ds_context *, system_context_array);
215 232
216 context = (task ? task->thread.ds_ctx : this_system_context); 233#define system_context per_cpu(system_context_array, smp_processor_id())
217 if (context)
218 context->count++;
219 234
220 spin_unlock_irqrestore(&ds_lock, irq);
221
222 return context;
223}
224 235
225/* 236static inline struct ds_context *ds_get_context(struct task_struct *task)
226 * Same as ds_get_context, but allocates the context and it's DS
227 * structure, if necessary; returns NULL; if out of memory.
228 */
229static inline struct ds_context *ds_alloc_context(struct task_struct *task)
230{ 237{
231 struct ds_context **p_context = 238 struct ds_context **p_context =
232 (task ? &task->thread.ds_ctx : &this_system_context); 239 (task ? &task->thread.ds_ctx : &system_context);
233 struct ds_context *context = *p_context; 240 struct ds_context *context = NULL;
241 struct ds_context *new_context = NULL;
234 unsigned long irq; 242 unsigned long irq;
235 243
236 if (!context) { 244 /* Chances are small that we already have a context. */
237 context = kzalloc(sizeof(*context), GFP_KERNEL); 245 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
238 if (!context) 246 if (!new_context)
239 return NULL; 247 return NULL;
240
241 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
242 if (!context->ds) {
243 kfree(context);
244 return NULL;
245 }
246 248
247 spin_lock_irqsave(&ds_lock, irq); 249 spin_lock_irqsave(&ds_lock, irq);
248 250
249 if (*p_context) { 251 context = *p_context;
250 kfree(context->ds); 252 if (!context) {
251 kfree(context); 253 context = new_context;
252 254
253 context = *p_context; 255 context->this = p_context;
254 } else { 256 context->task = task;
255 *p_context = context; 257 context->count = 0;
256 258
257 context->this = p_context; 259 if (task)
258 context->task = task; 260 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
259 261
260 if (task) 262 if (!task || (task == current))
261 set_tsk_thread_flag(task, TIF_DS_AREA_MSR); 263 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
262 264
263 if (!task || (task == current)) 265 *p_context = context;
264 wrmsrl(MSR_IA32_DS_AREA,
265 (unsigned long)context->ds);
266 }
267 spin_unlock_irqrestore(&ds_lock, irq);
268 } 266 }
269 267
270 context->count++; 268 context->count++;
271 269
270 spin_unlock_irqrestore(&ds_lock, irq);
271
272 if (context != new_context)
273 kfree(new_context);
274
272 return context; 275 return context;
273} 276}
274 277
275/*
276 * Decreases the use count of the parameter context, if not NULL.
277 * Deallocates the context, if the use count reaches zero.
278 */
279static inline void ds_put_context(struct ds_context *context) 278static inline void ds_put_context(struct ds_context *context)
280{ 279{
281 unsigned long irq; 280 unsigned long irq;
@@ -285,8 +284,10 @@ static inline void ds_put_context(struct ds_context *context)
285 284
286 spin_lock_irqsave(&ds_lock, irq); 285 spin_lock_irqsave(&ds_lock, irq);
287 286
288 if (--context->count) 287 if (--context->count) {
289 goto out; 288 spin_unlock_irqrestore(&ds_lock, irq);
289 return;
290 }
290 291
291 *(context->this) = NULL; 292 *(context->this) = NULL;
292 293
@@ -296,135 +297,263 @@ static inline void ds_put_context(struct ds_context *context)
296 if (!context->task || (context->task == current)) 297 if (!context->task || (context->task == current))
297 wrmsrl(MSR_IA32_DS_AREA, 0); 298 wrmsrl(MSR_IA32_DS_AREA, 0);
298 299
299 put_tracer(context->task); 300 spin_unlock_irqrestore(&ds_lock, irq);
300 301
301 /* free any leftover buffers from tracers that did not
302 * deallocate them properly. */
303 kfree(context->buffer[ds_bts]);
304 kfree(context->buffer[ds_pebs]);
305 kfree(context->ds);
306 kfree(context); 302 kfree(context);
307 out:
308 spin_unlock_irqrestore(&ds_lock, irq);
309} 303}
310 304
311 305
312/* 306/*
313 * Handle a buffer overflow 307 * Call the tracer's callback on a buffer overflow.
314 * 308 *
315 * task: the task whose buffers are overflowing;
316 * NULL for a buffer overflow on the current cpu
317 * context: the ds context 309 * context: the ds context
318 * qual: the buffer type 310 * qual: the buffer type
319 */ 311 */
320static void ds_overflow(struct task_struct *task, struct ds_context *context, 312static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
321 enum ds_qualifier qual)
322{ 313{
323 if (!context) 314 switch (qual) {
324 return; 315 case ds_bts:
325 316 if (context->bts_master &&
326 if (context->callback[qual]) 317 context->bts_master->ovfl)
327 (*context->callback[qual])(task); 318 context->bts_master->ovfl(context->bts_master);
328 319 break;
329 /* todo: do some more overflow handling */ 320 case ds_pebs:
321 if (context->pebs_master &&
322 context->pebs_master->ovfl)
323 context->pebs_master->ovfl(context->pebs_master);
324 break;
325 }
330} 326}
331 327
332 328
333/* 329/*
334 * Allocate a non-pageable buffer of the parameter size. 330 * Write raw data into the BTS or PEBS buffer.
335 * Checks the memory and the locked memory rlimit.
336 * 331 *
337 * Returns the buffer, if successful; 332 * The remainder of any partially written record is zeroed out.
338 * NULL, if out of memory or rlimit exceeded.
339 * 333 *
340 * size: the requested buffer size in bytes 334 * context: the DS context
341 * pages (out): if not NULL, contains the number of pages reserved 335 * qual: the buffer type
336 * record: the data to write
337 * size: the size of the data
342 */ 338 */
343static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) 339static int ds_write(struct ds_context *context, enum ds_qualifier qual,
340 const void *record, size_t size)
344{ 341{
345 unsigned long rlim, vm, pgsz; 342 int bytes_written = 0;
346 void *buffer;
347 343
348 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; 344 if (!record)
345 return -EINVAL;
349 346
350 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 347 while (size) {
351 vm = current->mm->total_vm + pgsz; 348 unsigned long base, index, end, write_end, int_th;
352 if (rlim < vm) 349 unsigned long write_size, adj_write_size;
353 return NULL;
354 350
355 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 351 /*
356 vm = current->mm->locked_vm + pgsz; 352 * write as much as possible without producing an
357 if (rlim < vm) 353 * overflow interrupt.
358 return NULL; 354 *
355 * interrupt_threshold must either be
356 * - bigger than absolute_maximum or
357 * - point to a record between buffer_base and absolute_maximum
358 *
359 * index points to a valid record.
360 */
361 base = ds_get(context->ds, qual, ds_buffer_base);
362 index = ds_get(context->ds, qual, ds_index);
363 end = ds_get(context->ds, qual, ds_absolute_maximum);
364 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
359 365
360 buffer = kzalloc(size, GFP_KERNEL); 366 write_end = min(end, int_th);
361 if (!buffer)
362 return NULL;
363 367
364 current->mm->total_vm += pgsz; 368 /* if we are already beyond the interrupt threshold,
365 current->mm->locked_vm += pgsz; 369 * we fill the entire buffer */
370 if (write_end <= index)
371 write_end = end;
366 372
367 if (pages) 373 if (write_end <= index)
368 *pages = pgsz; 374 break;
375
376 write_size = min((unsigned long) size, write_end - index);
377 memcpy((void *)index, record, write_size);
369 378
370 return buffer; 379 record = (const char *)record + write_size;
380 size -= write_size;
381 bytes_written += write_size;
382
383 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
384 adj_write_size *= ds_cfg.sizeof_rec[qual];
385
386 /* zero out trailing bytes */
387 memset((char *)index + write_size, 0,
388 adj_write_size - write_size);
389 index += adj_write_size;
390
391 if (index >= end)
392 index = base;
393 ds_set(context->ds, qual, ds_index, index);
394
395 if (index >= int_th)
396 ds_overflow(context, qual);
397 }
398
399 return bytes_written;
371} 400}
372 401
373static int ds_request(struct task_struct *task, void *base, size_t size, 402
374 ds_ovfl_callback_t ovfl, enum ds_qualifier qual) 403/*
404 * Branch Trace Store (BTS) uses the following format. Different
405 * architectures vary in the size of those fields.
406 * - source linear address
407 * - destination linear address
408 * - flags
409 *
410 * Later architectures use 64bit pointers throughout, whereas earlier
411 * architectures use 32bit pointers in 32bit mode.
412 *
413 * We compute the base address for the first 8 fields based on:
414 * - the field size stored in the DS configuration
415 * - the relative field position
416 *
417 * In order to store additional information in the BTS buffer, we use
418 * a special source address to indicate that the record requires
419 * special interpretation.
420 *
421 * Netburst indicated via a bit in the flags field whether the branch
422 * was predicted; this is ignored.
423 *
424 * We use two levels of abstraction:
425 * - the raw data level defined here
426 * - an arch-independent level defined in ds.h
427 */
428
429enum bts_field {
430 bts_from,
431 bts_to,
432 bts_flags,
433
434 bts_qual = bts_from,
435 bts_jiffies = bts_to,
436 bts_pid = bts_flags,
437
438 bts_qual_mask = (bts_qual_max - 1),
439 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
440};
441
442static inline unsigned long bts_get(const char *base, enum bts_field field)
375{ 443{
376 struct ds_context *context; 444 base += (ds_cfg.sizeof_field * field);
377 unsigned long buffer, adj; 445 return *(unsigned long *)base;
378 const unsigned long alignment = (1 << 3); 446}
379 unsigned long irq; 447
380 int error = 0; 448static inline void bts_set(char *base, enum bts_field field, unsigned long val)
449{
450 base += (ds_cfg.sizeof_field * field);;
451 (*(unsigned long *)base) = val;
452}
381 453
382 if (!ds_cfg.sizeof_ds)
383 return -EOPNOTSUPP;
384 454
385 /* we require some space to do alignment adjustments below */ 455/*
386 if (size < (alignment + ds_cfg.sizeof_rec[qual])) 456 * The raw BTS data is architecture dependent.
457 *
458 * For higher-level users, we give an arch-independent view.
459 * - ds.h defines struct bts_struct
460 * - bts_read translates one raw bts record into a bts_struct
461 * - bts_write translates one bts_struct into the raw format and
462 * writes it into the top of the parameter tracer's buffer.
463 *
464 * return: bytes read/written on success; -Eerrno, otherwise
465 */
466static int bts_read(struct bts_tracer *tracer, const void *at,
467 struct bts_struct *out)
468{
469 if (!tracer)
387 return -EINVAL; 470 return -EINVAL;
388 471
389 /* buffer overflow notification is not yet implemented */ 472 if (at < tracer->trace.ds.begin)
390 if (ovfl) 473 return -EINVAL;
391 return -EOPNOTSUPP;
392 474
475 if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
476 return -EINVAL;
393 477
394 context = ds_alloc_context(task); 478 memset(out, 0, sizeof(*out));
395 if (!context) 479 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
396 return -ENOMEM; 480 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
481 out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
482 out->variant.timestamp.pid = bts_get(at, bts_pid);
483 } else {
484 out->qualifier = bts_branch;
485 out->variant.lbr.from = bts_get(at, bts_from);
486 out->variant.lbr.to = bts_get(at, bts_to);
487
488 if (!out->variant.lbr.from && !out->variant.lbr.to)
489 out->qualifier = bts_invalid;
490 }
397 491
398 spin_lock_irqsave(&ds_lock, irq); 492 return ds_cfg.sizeof_rec[ds_bts];
493}
399 494
400 error = -EPERM; 495static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
401 if (!check_tracer(task)) 496{
402 goto out_unlock; 497 unsigned char raw[MAX_SIZEOF_BTS];
403 498
404 get_tracer(task); 499 if (!tracer)
500 return -EINVAL;
405 501
406 error = -EALREADY; 502 if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
407 if (context->owner[qual] == current) 503 return -EOVERFLOW;
408 goto out_put_tracer;
409 error = -EPERM;
410 if (context->owner[qual] != NULL)
411 goto out_put_tracer;
412 context->owner[qual] = current;
413 504
414 spin_unlock_irqrestore(&ds_lock, irq); 505 switch (in->qualifier) {
506 case bts_invalid:
507 bts_set(raw, bts_from, 0);
508 bts_set(raw, bts_to, 0);
509 bts_set(raw, bts_flags, 0);
510 break;
511 case bts_branch:
512 bts_set(raw, bts_from, in->variant.lbr.from);
513 bts_set(raw, bts_to, in->variant.lbr.to);
514 bts_set(raw, bts_flags, 0);
515 break;
516 case bts_task_arrives:
517 case bts_task_departs:
518 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
519 bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
520 bts_set(raw, bts_pid, in->variant.timestamp.pid);
521 break;
522 default:
523 return -EINVAL;
524 }
415 525
526 return ds_write(tracer->ds.context, ds_bts, raw,
527 ds_cfg.sizeof_rec[ds_bts]);
528}
416 529
417 error = -ENOMEM;
418 if (!base) {
419 base = ds_allocate_buffer(size, &context->pages[qual]);
420 if (!base)
421 goto out_release;
422 530
423 context->buffer[qual] = base; 531static void ds_write_config(struct ds_context *context,
424 } 532 struct ds_trace *cfg, enum ds_qualifier qual)
425 error = 0; 533{
534 unsigned char *ds = context->ds;
535
536 ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
537 ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
538 ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
539 ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
540}
541
542static void ds_read_config(struct ds_context *context,
543 struct ds_trace *cfg, enum ds_qualifier qual)
544{
545 unsigned char *ds = context->ds;
426 546
427 context->callback[qual] = ovfl; 547 cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
548 cfg->top = (void *)ds_get(ds, qual, ds_index);
549 cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
550 cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
551}
552
553static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
554 void *base, size_t size, size_t ith,
555 unsigned int flags) {
556 unsigned long buffer, adj;
428 557
429 /* adjust the buffer address and size to meet alignment 558 /* adjust the buffer address and size to meet alignment
430 * constraints: 559 * constraints:
@@ -436,410 +565,383 @@ static int ds_request(struct task_struct *task, void *base, size_t size,
436 */ 565 */
437 buffer = (unsigned long)base; 566 buffer = (unsigned long)base;
438 567
439 adj = ALIGN(buffer, alignment) - buffer; 568 adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
440 buffer += adj; 569 buffer += adj;
441 size -= adj; 570 size -= adj;
442 571
443 size /= ds_cfg.sizeof_rec[qual]; 572 trace->n = size / ds_cfg.sizeof_rec[qual];
444 size *= ds_cfg.sizeof_rec[qual]; 573 trace->size = ds_cfg.sizeof_rec[qual];
445
446 ds_set(context->ds, qual, ds_buffer_base, buffer);
447 ds_set(context->ds, qual, ds_index, buffer);
448 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
449 574
450 if (ovfl) { 575 size = (trace->n * trace->size);
451 /* todo: select a suitable interrupt threshold */
452 } else
453 ds_set(context->ds, qual,
454 ds_interrupt_threshold, buffer + size + 1);
455 576
456 /* we keep the context until ds_release */ 577 trace->begin = (void *)buffer;
457 return error; 578 trace->top = trace->begin;
458 579 trace->end = (void *)(buffer + size);
459 out_release: 580 /* The value for 'no threshold' is -1, which will set the
460 context->owner[qual] = NULL; 581 * threshold outside of the buffer, just like we want it.
461 ds_put_context(context); 582 */
462 put_tracer(task); 583 trace->ith = (void *)(buffer + size - ith);
463 return error;
464
465 out_put_tracer:
466 spin_unlock_irqrestore(&ds_lock, irq);
467 ds_put_context(context);
468 put_tracer(task);
469 return error;
470 584
471 out_unlock: 585 trace->flags = flags;
472 spin_unlock_irqrestore(&ds_lock, irq);
473 ds_put_context(context);
474 return error;
475} 586}
476 587
477int ds_request_bts(struct task_struct *task, void *base, size_t size,
478 ds_ovfl_callback_t ovfl)
479{
480 return ds_request(task, base, size, ovfl, ds_bts);
481}
482 588
483int ds_request_pebs(struct task_struct *task, void *base, size_t size, 589static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
484 ds_ovfl_callback_t ovfl) 590 enum ds_qualifier qual, struct task_struct *task,
485{ 591 void *base, size_t size, size_t th, unsigned int flags)
486 return ds_request(task, base, size, ovfl, ds_pebs);
487}
488
489static int ds_release(struct task_struct *task, enum ds_qualifier qual)
490{ 592{
491 struct ds_context *context; 593 struct ds_context *context;
492 int error; 594 int error;
493 595
494 context = ds_get_context(task); 596 error = -EINVAL;
495 error = ds_validate_access(context, qual); 597 if (!base)
496 if (error < 0)
497 goto out; 598 goto out;
498 599
499 kfree(context->buffer[qual]); 600 /* we require some space to do alignment adjustments below */
500 context->buffer[qual] = NULL; 601 error = -EINVAL;
501 602 if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
502 current->mm->total_vm -= context->pages[qual]; 603 goto out;
503 current->mm->locked_vm -= context->pages[qual];
504 context->pages[qual] = 0;
505 context->owner[qual] = NULL;
506
507 /*
508 * we put the context twice:
509 * once for the ds_get_context
510 * once for the corresponding ds_request
511 */
512 ds_put_context(context);
513 out:
514 ds_put_context(context);
515 return error;
516}
517 604
518int ds_release_bts(struct task_struct *task) 605 if (th != (size_t)-1) {
519{ 606 th *= ds_cfg.sizeof_rec[qual];
520 return ds_release(task, ds_bts);
521}
522 607
523int ds_release_pebs(struct task_struct *task) 608 error = -EINVAL;
524{ 609 if (size <= th)
525 return ds_release(task, ds_pebs); 610 goto out;
526} 611 }
527 612
528static int ds_get_index(struct task_struct *task, size_t *pos, 613 tracer->buffer = base;
529 enum ds_qualifier qual) 614 tracer->size = size;
530{
531 struct ds_context *context;
532 unsigned long base, index;
533 int error;
534 615
616 error = -ENOMEM;
535 context = ds_get_context(task); 617 context = ds_get_context(task);
536 error = ds_validate_access(context, qual); 618 if (!context)
537 if (error < 0)
538 goto out; 619 goto out;
620 tracer->context = context;
539 621
540 base = ds_get(context->ds, qual, ds_buffer_base); 622 ds_init_ds_trace(trace, qual, base, size, th, flags);
541 index = ds_get(context->ds, qual, ds_index);
542 623
543 error = ((index - base) / ds_cfg.sizeof_rec[qual]); 624 error = 0;
544 if (pos)
545 *pos = error;
546 out: 625 out:
547 ds_put_context(context);
548 return error; 626 return error;
549} 627}
550 628
551int ds_get_bts_index(struct task_struct *task, size_t *pos) 629struct bts_tracer *ds_request_bts(struct task_struct *task,
552{ 630 void *base, size_t size,
553 return ds_get_index(task, pos, ds_bts); 631 bts_ovfl_callback_t ovfl, size_t th,
554} 632 unsigned int flags)
555
556int ds_get_pebs_index(struct task_struct *task, size_t *pos)
557{ 633{
558 return ds_get_index(task, pos, ds_pebs); 634 struct bts_tracer *tracer;
559} 635 unsigned long irq;
560
561static int ds_get_end(struct task_struct *task, size_t *pos,
562 enum ds_qualifier qual)
563{
564 struct ds_context *context;
565 unsigned long base, end;
566 int error; 636 int error;
567 637
568 context = ds_get_context(task); 638 error = -EOPNOTSUPP;
569 error = ds_validate_access(context, qual); 639 if (!ds_cfg.ctl[dsf_bts])
570 if (error < 0)
571 goto out; 640 goto out;
572 641
573 base = ds_get(context->ds, qual, ds_buffer_base); 642 /* buffer overflow notification is not yet implemented */
574 end = ds_get(context->ds, qual, ds_absolute_maximum); 643 error = -EOPNOTSUPP;
644 if (ovfl)
645 goto out;
575 646
576 error = ((end - base) / ds_cfg.sizeof_rec[qual]); 647 error = -ENOMEM;
577 if (pos) 648 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
578 *pos = error; 649 if (!tracer)
579 out: 650 goto out;
580 ds_put_context(context); 651 tracer->ovfl = ovfl;
581 return error;
582}
583 652
584int ds_get_bts_end(struct task_struct *task, size_t *pos) 653 error = ds_request(&tracer->ds, &tracer->trace.ds,
585{ 654 ds_bts, task, base, size, th, flags);
586 return ds_get_end(task, pos, ds_bts); 655 if (error < 0)
587} 656 goto out_tracer;
588 657
589int ds_get_pebs_end(struct task_struct *task, size_t *pos)
590{
591 return ds_get_end(task, pos, ds_pebs);
592}
593 658
594static int ds_access(struct task_struct *task, size_t index, 659 spin_lock_irqsave(&ds_lock, irq);
595 const void **record, enum ds_qualifier qual)
596{
597 struct ds_context *context;
598 unsigned long base, idx;
599 int error;
600 660
601 if (!record) 661 error = -EPERM;
602 return -EINVAL; 662 if (!check_tracer(task))
663 goto out_unlock;
664 get_tracer(task);
603 665
604 context = ds_get_context(task); 666 error = -EPERM;
605 error = ds_validate_access(context, qual); 667 if (tracer->ds.context->bts_master)
606 if (error < 0) 668 goto out_put_tracer;
607 goto out; 669 tracer->ds.context->bts_master = tracer;
608 670
609 base = ds_get(context->ds, qual, ds_buffer_base); 671 spin_unlock_irqrestore(&ds_lock, irq);
610 idx = base + (index * ds_cfg.sizeof_rec[qual]);
611 672
612 error = -EINVAL;
613 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
614 goto out;
615 673
616 *record = (const void *)idx; 674 tracer->trace.read = bts_read;
617 error = ds_cfg.sizeof_rec[qual]; 675 tracer->trace.write = bts_write;
618 out:
619 ds_put_context(context);
620 return error;
621}
622 676
623int ds_access_bts(struct task_struct *task, size_t index, const void **record) 677 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
624{ 678 ds_resume_bts(tracer);
625 return ds_access(task, index, record, ds_bts);
626}
627 679
628int ds_access_pebs(struct task_struct *task, size_t index, const void **record) 680 return tracer;
629{ 681
630 return ds_access(task, index, record, ds_pebs); 682 out_put_tracer:
683 put_tracer(task);
684 out_unlock:
685 spin_unlock_irqrestore(&ds_lock, irq);
686 ds_put_context(tracer->ds.context);
687 out_tracer:
688 kfree(tracer);
689 out:
690 return ERR_PTR(error);
631} 691}
632 692
633static int ds_write(struct task_struct *task, const void *record, size_t size, 693struct pebs_tracer *ds_request_pebs(struct task_struct *task,
634 enum ds_qualifier qual, int force) 694 void *base, size_t size,
695 pebs_ovfl_callback_t ovfl, size_t th,
696 unsigned int flags)
635{ 697{
636 struct ds_context *context; 698 struct pebs_tracer *tracer;
699 unsigned long irq;
637 int error; 700 int error;
638 701
639 if (!record) 702 /* buffer overflow notification is not yet implemented */
640 return -EINVAL; 703 error = -EOPNOTSUPP;
704 if (ovfl)
705 goto out;
641 706
642 error = -EPERM; 707 error = -ENOMEM;
643 context = ds_get_context(task); 708 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
644 if (!context) 709 if (!tracer)
645 goto out; 710 goto out;
711 tracer->ovfl = ovfl;
646 712
647 if (!force) { 713 error = ds_request(&tracer->ds, &tracer->trace.ds,
648 error = ds_validate_access(context, qual); 714 ds_pebs, task, base, size, th, flags);
649 if (error < 0) 715 if (error < 0)
650 goto out; 716 goto out_tracer;
651 }
652 717
653 error = 0; 718 spin_lock_irqsave(&ds_lock, irq);
654 while (size) {
655 unsigned long base, index, end, write_end, int_th;
656 unsigned long write_size, adj_write_size;
657 719
658 /* 720 error = -EPERM;
659 * write as much as possible without producing an 721 if (!check_tracer(task))
660 * overflow interrupt. 722 goto out_unlock;
661 * 723 get_tracer(task);
662 * interrupt_threshold must either be
663 * - bigger than absolute_maximum or
664 * - point to a record between buffer_base and absolute_maximum
665 *
666 * index points to a valid record.
667 */
668 base = ds_get(context->ds, qual, ds_buffer_base);
669 index = ds_get(context->ds, qual, ds_index);
670 end = ds_get(context->ds, qual, ds_absolute_maximum);
671 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
672 724
673 write_end = min(end, int_th); 725 error = -EPERM;
726 if (tracer->ds.context->pebs_master)
727 goto out_put_tracer;
728 tracer->ds.context->pebs_master = tracer;
674 729
675 /* if we are already beyond the interrupt threshold, 730 spin_unlock_irqrestore(&ds_lock, irq);
676 * we fill the entire buffer */
677 if (write_end <= index)
678 write_end = end;
679 731
680 if (write_end <= index) 732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
681 goto out; 733 ds_resume_pebs(tracer);
682 734
683 write_size = min((unsigned long) size, write_end - index); 735 return tracer;
684 memcpy((void *)index, record, write_size);
685 736
686 record = (const char *)record + write_size; 737 out_put_tracer:
687 size -= write_size; 738 put_tracer(task);
688 error += write_size; 739 out_unlock:
740 spin_unlock_irqrestore(&ds_lock, irq);
741 ds_put_context(tracer->ds.context);
742 out_tracer:
743 kfree(tracer);
744 out:
745 return ERR_PTR(error);
746}
689 747
690 adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; 748void ds_release_bts(struct bts_tracer *tracer)
691 adj_write_size *= ds_cfg.sizeof_rec[qual]; 749{
750 if (!tracer)
751 return;
692 752
693 /* zero out trailing bytes */ 753 ds_suspend_bts(tracer);
694 memset((char *)index + write_size, 0,
695 adj_write_size - write_size);
696 index += adj_write_size;
697 754
698 if (index >= end) 755 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
699 index = base; 756 tracer->ds.context->bts_master = NULL;
700 ds_set(context->ds, qual, ds_index, index);
701 757
702 if (index >= int_th) 758 put_tracer(tracer->ds.context->task);
703 ds_overflow(task, context, qual); 759 ds_put_context(tracer->ds.context);
704 }
705 760
706 out: 761 kfree(tracer);
707 ds_put_context(context);
708 return error;
709} 762}
710 763
711int ds_write_bts(struct task_struct *task, const void *record, size_t size) 764void ds_suspend_bts(struct bts_tracer *tracer)
712{ 765{
713 return ds_write(task, record, size, ds_bts, /* force = */ 0); 766 struct task_struct *task;
714}
715 767
716int ds_write_pebs(struct task_struct *task, const void *record, size_t size) 768 if (!tracer)
717{ 769 return;
718 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
719}
720 770
721int ds_unchecked_write_bts(struct task_struct *task, 771 task = tracer->ds.context->task;
722 const void *record, size_t size)
723{
724 return ds_write(task, record, size, ds_bts, /* force = */ 1);
725}
726 772
727int ds_unchecked_write_pebs(struct task_struct *task, 773 if (!task || (task == current))
728 const void *record, size_t size) 774 update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
729{ 775
730 return ds_write(task, record, size, ds_pebs, /* force = */ 1); 776 if (task) {
777 task->thread.debugctlmsr &= ~BTS_CONTROL;
778
779 if (!task->thread.debugctlmsr)
780 clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
781 }
731} 782}
732 783
733static int ds_reset_or_clear(struct task_struct *task, 784void ds_resume_bts(struct bts_tracer *tracer)
734 enum ds_qualifier qual, int clear)
735{ 785{
736 struct ds_context *context; 786 struct task_struct *task;
737 unsigned long base, end; 787 unsigned long control;
738 int error;
739 788
740 context = ds_get_context(task); 789 if (!tracer)
741 error = ds_validate_access(context, qual); 790 return;
742 if (error < 0)
743 goto out;
744 791
745 base = ds_get(context->ds, qual, ds_buffer_base); 792 task = tracer->ds.context->task;
746 end = ds_get(context->ds, qual, ds_absolute_maximum);
747 793
748 if (clear) 794 control = ds_cfg.ctl[dsf_bts];
749 memset((void *)base, 0, end - base); 795 if (!(tracer->trace.ds.flags & BTS_KERNEL))
796 control |= ds_cfg.ctl[dsf_bts_kernel];
797 if (!(tracer->trace.ds.flags & BTS_USER))
798 control |= ds_cfg.ctl[dsf_bts_user];
750 799
751 ds_set(context->ds, qual, ds_index, base); 800 if (task) {
801 task->thread.debugctlmsr |= control;
802 set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
803 }
752 804
753 error = 0; 805 if (!task || (task == current))
754 out: 806 update_debugctlmsr(get_debugctlmsr() | control);
755 ds_put_context(context);
756 return error;
757} 807}
758 808
759int ds_reset_bts(struct task_struct *task) 809void ds_release_pebs(struct pebs_tracer *tracer)
760{ 810{
761 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); 811 if (!tracer)
812 return;
813
814 ds_suspend_pebs(tracer);
815
816 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
817 tracer->ds.context->pebs_master = NULL;
818
819 put_tracer(tracer->ds.context->task);
820 ds_put_context(tracer->ds.context);
821
822 kfree(tracer);
762} 823}
763 824
764int ds_reset_pebs(struct task_struct *task) 825void ds_suspend_pebs(struct pebs_tracer *tracer)
765{ 826{
766 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); 827
767} 828}
768 829
769int ds_clear_bts(struct task_struct *task) 830void ds_resume_pebs(struct pebs_tracer *tracer)
770{ 831{
771 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); 832
772} 833}
773 834
774int ds_clear_pebs(struct task_struct *task) 835const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
775{ 836{
776 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); 837 if (!tracer)
838 return NULL;
839
840 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
841 return &tracer->trace;
777} 842}
778 843
779int ds_get_pebs_reset(struct task_struct *task, u64 *value) 844const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
780{ 845{
781 struct ds_context *context; 846 if (!tracer)
782 int error; 847 return NULL;
848
849 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
850 tracer->trace.reset_value =
851 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
783 852
784 if (!value) 853 return &tracer->trace;
854}
855
856int ds_reset_bts(struct bts_tracer *tracer)
857{
858 if (!tracer)
785 return -EINVAL; 859 return -EINVAL;
786 860
787 context = ds_get_context(task); 861 tracer->trace.ds.top = tracer->trace.ds.begin;
788 error = ds_validate_access(context, ds_pebs);
789 if (error < 0)
790 goto out;
791 862
792 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); 863 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
864 (unsigned long)tracer->trace.ds.top);
793 865
794 error = 0; 866 return 0;
795 out:
796 ds_put_context(context);
797 return error;
798} 867}
799 868
800int ds_set_pebs_reset(struct task_struct *task, u64 value) 869int ds_reset_pebs(struct pebs_tracer *tracer)
801{ 870{
802 struct ds_context *context; 871 if (!tracer)
803 int error; 872 return -EINVAL;
804 873
805 context = ds_get_context(task); 874 tracer->trace.ds.top = tracer->trace.ds.begin;
806 error = ds_validate_access(context, ds_pebs);
807 if (error < 0)
808 goto out;
809 875
810 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; 876 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
877 (unsigned long)tracer->trace.ds.top);
811 878
812 error = 0; 879 return 0;
813 out: 880}
814 ds_put_context(context); 881
815 return error; 882int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
883{
884 if (!tracer)
885 return -EINVAL;
886
887 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
888
889 return 0;
816} 890}
817 891
818static const struct ds_configuration ds_cfg_var = { 892static const struct ds_configuration ds_cfg_netburst = {
819 .sizeof_ds = sizeof(long) * 12, 893 .name = "netburst",
820 .sizeof_field = sizeof(long), 894 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
821 .sizeof_rec[ds_bts] = sizeof(long) * 3, 895 .ctl[dsf_bts_kernel] = (1 << 5),
896 .ctl[dsf_bts_user] = (1 << 6),
897
898 .sizeof_field = sizeof(long),
899 .sizeof_rec[ds_bts] = sizeof(long) * 3,
822#ifdef __i386__ 900#ifdef __i386__
823 .sizeof_rec[ds_pebs] = sizeof(long) * 10 901 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
824#else 902#else
825 .sizeof_rec[ds_pebs] = sizeof(long) * 18 903 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
826#endif 904#endif
827}; 905};
828static const struct ds_configuration ds_cfg_64 = { 906static const struct ds_configuration ds_cfg_pentium_m = {
829 .sizeof_ds = 8 * 12, 907 .name = "pentium m",
830 .sizeof_field = 8, 908 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
831 .sizeof_rec[ds_bts] = 8 * 3, 909
910 .sizeof_field = sizeof(long),
911 .sizeof_rec[ds_bts] = sizeof(long) * 3,
832#ifdef __i386__ 912#ifdef __i386__
833 .sizeof_rec[ds_pebs] = 8 * 10 913 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
834#else 914#else
835 .sizeof_rec[ds_pebs] = 8 * 18 915 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
836#endif 916#endif
837}; 917};
918static const struct ds_configuration ds_cfg_core2 = {
919 .name = "core 2",
920 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
921 .ctl[dsf_bts_kernel] = (1 << 9),
922 .ctl[dsf_bts_user] = (1 << 10),
923
924 .sizeof_field = 8,
925 .sizeof_rec[ds_bts] = 8 * 3,
926 .sizeof_rec[ds_pebs] = 8 * 18,
927};
838 928
839static inline void 929static void
840ds_configure(const struct ds_configuration *cfg) 930ds_configure(const struct ds_configuration *cfg)
841{ 931{
932 memset(&ds_cfg, 0, sizeof(ds_cfg));
842 ds_cfg = *cfg; 933 ds_cfg = *cfg;
934
935 printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
936
937 if (!cpu_has_bts) {
938 ds_cfg.ctl[dsf_bts] = 0;
939 printk(KERN_INFO "[ds] bts not available\n");
940 }
941 if (!cpu_has_pebs)
942 printk(KERN_INFO "[ds] pebs not available\n");
943
944 WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
843} 945}
844 946
845void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) 947void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -847,16 +949,15 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
847 switch (c->x86) { 949 switch (c->x86) {
848 case 0x6: 950 case 0x6:
849 switch (c->x86_model) { 951 switch (c->x86_model) {
952 case 0 ... 0xC:
953 /* sorry, don't know about them */
954 break;
850 case 0xD: 955 case 0xD:
851 case 0xE: /* Pentium M */ 956 case 0xE: /* Pentium M */
852 ds_configure(&ds_cfg_var); 957 ds_configure(&ds_cfg_pentium_m);
853 break; 958 break;
854 case 0xF: /* Core2 */ 959 default: /* Core2, Atom, ... */
855 case 0x1C: /* Atom */ 960 ds_configure(&ds_cfg_core2);
856 ds_configure(&ds_cfg_64);
857 break;
858 default:
859 /* sorry, don't know about them */
860 break; 961 break;
861 } 962 }
862 break; 963 break;
@@ -865,7 +966,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
865 case 0x0: 966 case 0x0:
866 case 0x1: 967 case 0x1:
867 case 0x2: /* Netburst */ 968 case 0x2: /* Netburst */
868 ds_configure(&ds_cfg_var); 969 ds_configure(&ds_cfg_netburst);
869 break; 970 break;
870 default: 971 default:
871 /* sorry, don't know about them */ 972 /* sorry, don't know about them */
@@ -878,12 +979,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
878 } 979 }
879} 980}
880 981
881void ds_free(struct ds_context *context) 982/*
983 * Change the DS configuration from tracing prev to tracing next.
984 */
985void ds_switch_to(struct task_struct *prev, struct task_struct *next)
986{
987 struct ds_context *prev_ctx = prev->thread.ds_ctx;
988 struct ds_context *next_ctx = next->thread.ds_ctx;
989
990 if (prev_ctx) {
991 update_debugctlmsr(0);
992
993 if (prev_ctx->bts_master &&
994 (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
995 struct bts_struct ts = {
996 .qualifier = bts_task_departs,
997 .variant.timestamp.jiffies = jiffies_64,
998 .variant.timestamp.pid = prev->pid
999 };
1000 bts_write(prev_ctx->bts_master, &ts);
1001 }
1002 }
1003
1004 if (next_ctx) {
1005 if (next_ctx->bts_master &&
1006 (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
1007 struct bts_struct ts = {
1008 .qualifier = bts_task_arrives,
1009 .variant.timestamp.jiffies = jiffies_64,
1010 .variant.timestamp.pid = next->pid
1011 };
1012 bts_write(next_ctx->bts_master, &ts);
1013 }
1014
1015 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
1016 }
1017
1018 update_debugctlmsr(next->thread.debugctlmsr);
1019}
1020
1021void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
1022{
1023 clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
1024 tsk->thread.ds_ctx = NULL;
1025}
1026
1027void ds_exit_thread(struct task_struct *tsk)
882{ 1028{
883 /* This is called when the task owning the parameter context 1029 WARN_ON(tsk->thread.ds_ctx);
884 * is dying. There should not be any user of that context left
885 * to disturb us, anymore. */
886 unsigned long leftovers = context->count;
887 while (leftovers--)
888 ds_put_context(context);
889} 1030}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
new file mode 100644
index 000000000000..6b1f6f6f8661
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.c
@@ -0,0 +1,351 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16#include <linux/sysfs.h>
17
18#include <asm/stacktrace.h>
19
20#include "dumpstack.h"
21
22int panic_on_unrecovered_nmi;
23unsigned int code_bytes = 64;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static int die_counter;
26
27void printk_address(unsigned long address, int reliable)
28{
29 printk(" [<%p>] %s%pS\n", (void *) address,
30 reliable ? "" : "? ", (void *) address);
31}
32
33#ifdef CONFIG_FUNCTION_GRAPH_TRACER
34static void
35print_ftrace_graph_addr(unsigned long addr, void *data,
36 const struct stacktrace_ops *ops,
37 struct thread_info *tinfo, int *graph)
38{
39 struct task_struct *task = tinfo->task;
40 unsigned long ret_addr;
41 int index = task->curr_ret_stack;
42
43 if (addr != (unsigned long)return_to_handler)
44 return;
45
46 if (!task->ret_stack || index < *graph)
47 return;
48
49 index -= *graph;
50 ret_addr = task->ret_stack[index].ret;
51
52 ops->address(data, ret_addr, 1);
53
54 (*graph)++;
55}
56#else
57static inline void
58print_ftrace_graph_addr(unsigned long addr, void *data,
59 const struct stacktrace_ops *ops,
60 struct thread_info *tinfo, int *graph)
61{ }
62#endif
63
64/*
65 * x86-64 can have up to three kernel stacks:
66 * process stack
67 * interrupt stack
68 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
69 */
70
71static inline int valid_stack_ptr(struct thread_info *tinfo,
72 void *p, unsigned int size, void *end)
73{
74 void *t = tinfo;
75 if (end) {
76 if (p < end && p >= (end-THREAD_SIZE))
77 return 1;
78 else
79 return 0;
80 }
81 return p > t && p < t + THREAD_SIZE - size;
82}
83
84unsigned long
85print_context_stack(struct thread_info *tinfo,
86 unsigned long *stack, unsigned long bp,
87 const struct stacktrace_ops *ops, void *data,
88 unsigned long *end, int *graph)
89{
90 struct stack_frame *frame = (struct stack_frame *)bp;
91
92 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
93 unsigned long addr;
94
95 addr = *stack;
96 if (__kernel_text_address(addr)) {
97 if ((unsigned long) stack == bp + sizeof(long)) {
98 ops->address(data, addr, 1);
99 frame = frame->next_frame;
100 bp = (unsigned long) frame;
101 } else {
102 ops->address(data, addr, bp == 0);
103 }
104 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
105 }
106 stack++;
107 }
108 return bp;
109}
110
111
112static void
113print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
114{
115 printk(data);
116 print_symbol(msg, symbol);
117 printk("\n");
118}
119
120static void print_trace_warning(void *data, char *msg)
121{
122 printk("%s%s\n", (char *)data, msg);
123}
124
125static int print_trace_stack(void *data, char *name)
126{
127 printk("%s <%s> ", (char *)data, name);
128 return 0;
129}
130
131/*
132 * Print one address/symbol entries per line.
133 */
134static void print_trace_address(void *data, unsigned long addr, int reliable)
135{
136 touch_nmi_watchdog();
137 printk(data);
138 printk_address(addr, reliable);
139}
140
141static const struct stacktrace_ops print_trace_ops = {
142 .warning = print_trace_warning,
143 .warning_symbol = print_trace_warning_symbol,
144 .stack = print_trace_stack,
145 .address = print_trace_address,
146};
147
148void
149show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
150 unsigned long *stack, unsigned long bp, char *log_lvl)
151{
152 printk("%sCall Trace:\n", log_lvl);
153 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
154}
155
156void show_trace(struct task_struct *task, struct pt_regs *regs,
157 unsigned long *stack, unsigned long bp)
158{
159 show_trace_log_lvl(task, regs, stack, bp, "");
160}
161
162void show_stack(struct task_struct *task, unsigned long *sp)
163{
164 show_stack_log_lvl(task, NULL, sp, 0, "");
165}
166
167/*
168 * The architecture-independent dump_stack generator
169 */
170void dump_stack(void)
171{
172 unsigned long bp = 0;
173 unsigned long stack;
174
175#ifdef CONFIG_FRAME_POINTER
176 if (!bp)
177 get_bp(bp);
178#endif
179
180 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
181 current->pid, current->comm, print_tainted(),
182 init_utsname()->release,
183 (int)strcspn(init_utsname()->version, " "),
184 init_utsname()->version);
185 show_trace(NULL, NULL, &stack, bp);
186}
187EXPORT_SYMBOL(dump_stack);
188
189static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
190static int die_owner = -1;
191static unsigned int die_nest_count;
192
193unsigned __kprobes long oops_begin(void)
194{
195 int cpu;
196 unsigned long flags;
197
198 oops_enter();
199
200 /* racy, but better than risking deadlock. */
201 raw_local_irq_save(flags);
202 cpu = smp_processor_id();
203 if (!__raw_spin_trylock(&die_lock)) {
204 if (cpu == die_owner)
205 /* nested oops. should stop eventually */;
206 else
207 __raw_spin_lock(&die_lock);
208 }
209 die_nest_count++;
210 die_owner = cpu;
211 console_verbose();
212 bust_spinlocks(1);
213 return flags;
214}
215
216void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
217{
218 if (regs && kexec_should_crash(current))
219 crash_kexec(regs);
220
221 bust_spinlocks(0);
222 die_owner = -1;
223 add_taint(TAINT_DIE);
224 die_nest_count--;
225 if (!die_nest_count)
226 /* Nest count reaches zero, release the lock. */
227 __raw_spin_unlock(&die_lock);
228 raw_local_irq_restore(flags);
229 oops_exit();
230
231 if (!signr)
232 return;
233 if (in_interrupt())
234 panic("Fatal exception in interrupt");
235 if (panic_on_oops)
236 panic("Fatal exception");
237 do_exit(signr);
238}
239
240int __kprobes __die(const char *str, struct pt_regs *regs, long err)
241{
242#ifdef CONFIG_X86_32
243 unsigned short ss;
244 unsigned long sp;
245#endif
246 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
247#ifdef CONFIG_PREEMPT
248 printk("PREEMPT ");
249#endif
250#ifdef CONFIG_SMP
251 printk("SMP ");
252#endif
253#ifdef CONFIG_DEBUG_PAGEALLOC
254 printk("DEBUG_PAGEALLOC");
255#endif
256 printk("\n");
257 sysfs_printk_last_file();
258 if (notify_die(DIE_OOPS, str, regs, err,
259 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
260 return 1;
261
262 show_registers(regs);
263#ifdef CONFIG_X86_32
264 sp = (unsigned long) (&regs->sp);
265 savesegment(ss, ss);
266 if (user_mode(regs)) {
267 sp = regs->sp;
268 ss = regs->ss & 0xffff;
269 }
270 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
271 print_symbol("%s", regs->ip);
272 printk(" SS:ESP %04x:%08lx\n", ss, sp);
273#else
274 /* Executive summary in case the oops scrolled away */
275 printk(KERN_ALERT "RIP ");
276 printk_address(regs->ip, 1);
277 printk(" RSP <%016lx>\n", regs->sp);
278#endif
279 return 0;
280}
281
282/*
283 * This is gone through when something in the kernel has done something bad
284 * and is about to be terminated:
285 */
286void die(const char *str, struct pt_regs *regs, long err)
287{
288 unsigned long flags = oops_begin();
289 int sig = SIGSEGV;
290
291 if (!user_mode_vm(regs))
292 report_bug(regs->ip, regs);
293
294 if (__die(str, regs, err))
295 sig = 0;
296 oops_end(flags, regs, sig);
297}
298
299void notrace __kprobes
300die_nmi(char *str, struct pt_regs *regs, int do_panic)
301{
302 unsigned long flags;
303
304 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
305 return;
306
307 /*
308 * We are in trouble anyway, lets at least try
309 * to get a message out.
310 */
311 flags = oops_begin();
312 printk(KERN_EMERG "%s", str);
313 printk(" on CPU%d, ip %08lx, registers:\n",
314 smp_processor_id(), regs->ip);
315 show_registers(regs);
316 oops_end(flags, regs, 0);
317 if (do_panic || panic_on_oops)
318 panic("Non maskable interrupt");
319 nmi_exit();
320 local_irq_enable();
321 do_exit(SIGBUS);
322}
323
324static int __init oops_setup(char *s)
325{
326 if (!s)
327 return -EINVAL;
328 if (!strcmp(s, "panic"))
329 panic_on_oops = 1;
330 return 0;
331}
332early_param("oops", oops_setup);
333
334static int __init kstack_setup(char *s)
335{
336 if (!s)
337 return -EINVAL;
338 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
339 return 0;
340}
341early_param("kstack", kstack_setup);
342
343static int __init code_bytes_setup(char *s)
344{
345 code_bytes = simple_strtoul(s, NULL, 0);
346 if (code_bytes > 8192)
347 code_bytes = 8192;
348
349 return 1;
350}
351__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
new file mode 100644
index 000000000000..da87590b8698
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5
6#ifndef DUMPSTACK_H
7#define DUMPSTACK_H
8
9#ifdef CONFIG_X86_32
10#define STACKSLOTS_PER_LINE 8
11#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
12#else
13#define STACKSLOTS_PER_LINE 4
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif
16
17extern unsigned long
18print_context_stack(struct thread_info *tinfo,
19 unsigned long *stack, unsigned long bp,
20 const struct stacktrace_ops *ops, void *data,
21 unsigned long *end, int *graph);
22
23extern void
24show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
25 unsigned long *stack, unsigned long bp, char *log_lvl);
26
27extern void
28show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
29 unsigned long *sp, unsigned long bp, char *log_lvl);
30
31extern unsigned int code_bytes;
32extern int kstack_depth_to_print;
33
34/* The form of the top of the frame on the stack */
35struct stack_frame {
36 struct stack_frame *next_frame;
37 unsigned long return_address;
38};
39#endif
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index b3614752197b..d593cd1f58dc 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,69 +17,14 @@
17 17
18#include <asm/stacktrace.h> 18#include <asm/stacktrace.h>
19 19
20#define STACKSLOTS_PER_LINE 8 20#include "dumpstack.h"
21#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33
34static inline int valid_stack_ptr(struct thread_info *tinfo,
35 void *p, unsigned int size, void *end)
36{
37 void *t = tinfo;
38 if (end) {
39 if (p < end && p >= (end-THREAD_SIZE))
40 return 1;
41 else
42 return 0;
43 }
44 return p > t && p < t + THREAD_SIZE - size;
45}
46
47/* The form of the top of the frame on the stack */
48struct stack_frame {
49 struct stack_frame *next_frame;
50 unsigned long return_address;
51};
52
53static inline unsigned long
54print_context_stack(struct thread_info *tinfo,
55 unsigned long *stack, unsigned long bp,
56 const struct stacktrace_ops *ops, void *data,
57 unsigned long *end)
58{
59 struct stack_frame *frame = (struct stack_frame *)bp;
60
61 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
62 unsigned long addr;
63
64 addr = *stack;
65 if (__kernel_text_address(addr)) {
66 if ((unsigned long) stack == bp + sizeof(long)) {
67 ops->address(data, addr, 1);
68 frame = frame->next_frame;
69 bp = (unsigned long) frame;
70 } else {
71 ops->address(data, addr, bp == 0);
72 }
73 }
74 stack++;
75 }
76 return bp;
77}
78 21
79void dump_trace(struct task_struct *task, struct pt_regs *regs, 22void dump_trace(struct task_struct *task, struct pt_regs *regs,
80 unsigned long *stack, unsigned long bp, 23 unsigned long *stack, unsigned long bp,
81 const struct stacktrace_ops *ops, void *data) 24 const struct stacktrace_ops *ops, void *data)
82{ 25{
26 int graph = 0;
27
83 if (!task) 28 if (!task)
84 task = current; 29 task = current;
85 30
@@ -107,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
107 52
108 context = (struct thread_info *) 53 context = (struct thread_info *)
109 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 54 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
110 bp = print_context_stack(context, stack, bp, ops, data, NULL); 55 bp = print_context_stack(context, stack, bp, ops,
56 data, NULL, &graph);
111 57
112 stack = (unsigned long *)context->previous_esp; 58 stack = (unsigned long *)context->previous_esp;
113 if (!stack) 59 if (!stack)
@@ -119,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
119} 65}
120EXPORT_SYMBOL(dump_trace); 66EXPORT_SYMBOL(dump_trace);
121 67
122static void 68void
123print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
124{
125 printk(data);
126 print_symbol(msg, symbol);
127 printk("\n");
128}
129
130static void print_trace_warning(void *data, char *msg)
131{
132 printk("%s%s\n", (char *)data, msg);
133}
134
135static int print_trace_stack(void *data, char *name)
136{
137 printk("%s <%s> ", (char *)data, name);
138 return 0;
139}
140
141/*
142 * Print one address/symbol entries per line.
143 */
144static void print_trace_address(void *data, unsigned long addr, int reliable)
145{
146 touch_nmi_watchdog();
147 printk(data);
148 printk_address(addr, reliable);
149}
150
151static const struct stacktrace_ops print_trace_ops = {
152 .warning = print_trace_warning,
153 .warning_symbol = print_trace_warning_symbol,
154 .stack = print_trace_stack,
155 .address = print_trace_address,
156};
157
158static void
159show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
160 unsigned long *stack, unsigned long bp, char *log_lvl)
161{
162 printk("%sCall Trace:\n", log_lvl);
163 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
164}
165
166void show_trace(struct task_struct *task, struct pt_regs *regs,
167 unsigned long *stack, unsigned long bp)
168{
169 show_trace_log_lvl(task, regs, stack, bp, "");
170}
171
172static void
173show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 69show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
174 unsigned long *sp, unsigned long bp, char *log_lvl) 70 unsigned long *sp, unsigned long bp, char *log_lvl)
175{ 71{
@@ -196,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
196 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 92 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
197} 93}
198 94
199void show_stack(struct task_struct *task, unsigned long *sp)
200{
201 show_stack_log_lvl(task, NULL, sp, 0, "");
202}
203
204/*
205 * The architecture-independent dump_stack generator
206 */
207void dump_stack(void)
208{
209 unsigned long bp = 0;
210 unsigned long stack;
211
212#ifdef CONFIG_FRAME_POINTER
213 if (!bp)
214 get_bp(bp);
215#endif
216
217 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
218 current->pid, current->comm, print_tainted(),
219 init_utsname()->release,
220 (int)strcspn(init_utsname()->version, " "),
221 init_utsname()->version);
222 show_trace(NULL, NULL, &stack, bp);
223}
224
225EXPORT_SYMBOL(dump_stack);
226 95
227void show_registers(struct pt_regs *regs) 96void show_registers(struct pt_regs *regs)
228{ 97{
@@ -283,167 +152,3 @@ int is_valid_bugaddr(unsigned long ip)
283 return ud2 == 0x0b0f; 152 return ud2 == 0x0b0f;
284} 153}
285 154
286static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
287static int die_owner = -1;
288static unsigned int die_nest_count;
289
290unsigned __kprobes long oops_begin(void)
291{
292 unsigned long flags;
293
294 oops_enter();
295
296 if (die_owner != raw_smp_processor_id()) {
297 console_verbose();
298 raw_local_irq_save(flags);
299 __raw_spin_lock(&die_lock);
300 die_owner = smp_processor_id();
301 die_nest_count = 0;
302 bust_spinlocks(1);
303 } else {
304 raw_local_irq_save(flags);
305 }
306 die_nest_count++;
307 return flags;
308}
309
310void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
311{
312 bust_spinlocks(0);
313 die_owner = -1;
314 add_taint(TAINT_DIE);
315 __raw_spin_unlock(&die_lock);
316 raw_local_irq_restore(flags);
317
318 if (!regs)
319 return;
320
321 if (kexec_should_crash(current))
322 crash_kexec(regs);
323 if (in_interrupt())
324 panic("Fatal exception in interrupt");
325 if (panic_on_oops)
326 panic("Fatal exception");
327 oops_exit();
328 do_exit(signr);
329}
330
331int __kprobes __die(const char *str, struct pt_regs *regs, long err)
332{
333 unsigned short ss;
334 unsigned long sp;
335
336 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
337#ifdef CONFIG_PREEMPT
338 printk("PREEMPT ");
339#endif
340#ifdef CONFIG_SMP
341 printk("SMP ");
342#endif
343#ifdef CONFIG_DEBUG_PAGEALLOC
344 printk("DEBUG_PAGEALLOC");
345#endif
346 printk("\n");
347 sysfs_printk_last_file();
348 if (notify_die(DIE_OOPS, str, regs, err,
349 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
350 return 1;
351
352 show_registers(regs);
353 /* Executive summary in case the oops scrolled away */
354 sp = (unsigned long) (&regs->sp);
355 savesegment(ss, ss);
356 if (user_mode(regs)) {
357 sp = regs->sp;
358 ss = regs->ss & 0xffff;
359 }
360 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
361 print_symbol("%s", regs->ip);
362 printk(" SS:ESP %04x:%08lx\n", ss, sp);
363 return 0;
364}
365
366/*
367 * This is gone through when something in the kernel has done something bad
368 * and is about to be terminated:
369 */
370void die(const char *str, struct pt_regs *regs, long err)
371{
372 unsigned long flags = oops_begin();
373
374 if (die_nest_count < 3) {
375 report_bug(regs->ip, regs);
376
377 if (__die(str, regs, err))
378 regs = NULL;
379 } else {
380 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
381 }
382
383 oops_end(flags, regs, SIGSEGV);
384}
385
386static DEFINE_SPINLOCK(nmi_print_lock);
387
388void notrace __kprobes
389die_nmi(char *str, struct pt_regs *regs, int do_panic)
390{
391 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
392 return;
393
394 spin_lock(&nmi_print_lock);
395 /*
396 * We are in trouble anyway, lets at least try
397 * to get a message out:
398 */
399 bust_spinlocks(1);
400 printk(KERN_EMERG "%s", str);
401 printk(" on CPU%d, ip %08lx, registers:\n",
402 smp_processor_id(), regs->ip);
403 show_registers(regs);
404 if (do_panic)
405 panic("Non maskable interrupt");
406 console_silent();
407 spin_unlock(&nmi_print_lock);
408
409 /*
410 * If we are in kernel we are probably nested up pretty bad
411 * and might aswell get out now while we still can:
412 */
413 if (!user_mode_vm(regs)) {
414 current->thread.trap_no = 2;
415 crash_kexec(regs);
416 }
417
418 bust_spinlocks(0);
419 do_exit(SIGSEGV);
420}
421
422static int __init oops_setup(char *s)
423{
424 if (!s)
425 return -EINVAL;
426 if (!strcmp(s, "panic"))
427 panic_on_oops = 1;
428 return 0;
429}
430early_param("oops", oops_setup);
431
432static int __init kstack_setup(char *s)
433{
434 if (!s)
435 return -EINVAL;
436 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
437 return 0;
438}
439early_param("kstack", kstack_setup);
440
441static int __init code_bytes_setup(char *s)
442{
443 code_bytes = simple_strtoul(s, NULL, 0);
444 if (code_bytes > 8192)
445 code_bytes = 8192;
446
447 return 1;
448}
449__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 96a5db7da8a7..c302d0707048 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -17,19 +17,7 @@
17 17
18#include <asm/stacktrace.h> 18#include <asm/stacktrace.h>
19 19
20#define STACKSLOTS_PER_LINE 4 20#include "dumpstack.h"
21#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33 21
34static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 22static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
35 unsigned *usedp, char **idp) 23 unsigned *usedp, char **idp)
@@ -113,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
113 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 101 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
114 */ 102 */
115 103
116static inline int valid_stack_ptr(struct thread_info *tinfo,
117 void *p, unsigned int size, void *end)
118{
119 void *t = tinfo;
120 if (end) {
121 if (p < end && p >= (end-THREAD_SIZE))
122 return 1;
123 else
124 return 0;
125 }
126 return p > t && p < t + THREAD_SIZE - size;
127}
128
129/* The form of the top of the frame on the stack */
130struct stack_frame {
131 struct stack_frame *next_frame;
132 unsigned long return_address;
133};
134
135static inline unsigned long
136print_context_stack(struct thread_info *tinfo,
137 unsigned long *stack, unsigned long bp,
138 const struct stacktrace_ops *ops, void *data,
139 unsigned long *end)
140{
141 struct stack_frame *frame = (struct stack_frame *)bp;
142
143 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
144 unsigned long addr;
145
146 addr = *stack;
147 if (__kernel_text_address(addr)) {
148 if ((unsigned long) stack == bp + sizeof(long)) {
149 ops->address(data, addr, 1);
150 frame = frame->next_frame;
151 bp = (unsigned long) frame;
152 } else {
153 ops->address(data, addr, bp == 0);
154 }
155 }
156 stack++;
157 }
158 return bp;
159}
160
161void dump_trace(struct task_struct *task, struct pt_regs *regs, 104void dump_trace(struct task_struct *task, struct pt_regs *regs,
162 unsigned long *stack, unsigned long bp, 105 unsigned long *stack, unsigned long bp,
163 const struct stacktrace_ops *ops, void *data) 106 const struct stacktrace_ops *ops, void *data)
@@ -166,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
166 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; 109 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
167 unsigned used = 0; 110 unsigned used = 0;
168 struct thread_info *tinfo; 111 struct thread_info *tinfo;
112 int graph = 0;
169 113
170 if (!task) 114 if (!task)
171 task = current; 115 task = current;
@@ -206,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
206 break; 150 break;
207 151
208 bp = print_context_stack(tinfo, stack, bp, ops, 152 bp = print_context_stack(tinfo, stack, bp, ops,
209 data, estack_end); 153 data, estack_end, &graph);
210 ops->stack(data, "<EOE>"); 154 ops->stack(data, "<EOE>");
211 /* 155 /*
212 * We link to the next stack via the 156 * We link to the next stack via the
@@ -225,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
225 if (ops->stack(data, "IRQ") < 0) 169 if (ops->stack(data, "IRQ") < 0)
226 break; 170 break;
227 bp = print_context_stack(tinfo, stack, bp, 171 bp = print_context_stack(tinfo, stack, bp,
228 ops, data, irqstack_end); 172 ops, data, irqstack_end, &graph);
229 /* 173 /*
230 * We link to the next stack (which would be 174 * We link to the next stack (which would be
231 * the process stack normally) the last 175 * the process stack normally) the last
@@ -243,62 +187,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
243 /* 187 /*
244 * This handles the process stack: 188 * This handles the process stack:
245 */ 189 */
246 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); 190 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
247 put_cpu(); 191 put_cpu();
248} 192}
249EXPORT_SYMBOL(dump_trace); 193EXPORT_SYMBOL(dump_trace);
250 194
251static void 195void
252print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
253{
254 printk(data);
255 print_symbol(msg, symbol);
256 printk("\n");
257}
258
259static void print_trace_warning(void *data, char *msg)
260{
261 printk("%s%s\n", (char *)data, msg);
262}
263
264static int print_trace_stack(void *data, char *name)
265{
266 printk("%s <%s> ", (char *)data, name);
267 return 0;
268}
269
270/*
271 * Print one address/symbol entries per line.
272 */
273static void print_trace_address(void *data, unsigned long addr, int reliable)
274{
275 touch_nmi_watchdog();
276 printk(data);
277 printk_address(addr, reliable);
278}
279
280static const struct stacktrace_ops print_trace_ops = {
281 .warning = print_trace_warning,
282 .warning_symbol = print_trace_warning_symbol,
283 .stack = print_trace_stack,
284 .address = print_trace_address,
285};
286
287static void
288show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
289 unsigned long *stack, unsigned long bp, char *log_lvl)
290{
291 printk("%sCall Trace:\n", log_lvl);
292 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
293}
294
295void show_trace(struct task_struct *task, struct pt_regs *regs,
296 unsigned long *stack, unsigned long bp)
297{
298 show_trace_log_lvl(task, regs, stack, bp, "");
299}
300
301static void
302show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 196show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
303 unsigned long *sp, unsigned long bp, char *log_lvl) 197 unsigned long *sp, unsigned long bp, char *log_lvl)
304{ 198{
@@ -342,33 +236,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
342 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 236 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
343} 237}
344 238
345void show_stack(struct task_struct *task, unsigned long *sp)
346{
347 show_stack_log_lvl(task, NULL, sp, 0, "");
348}
349
350/*
351 * The architecture-independent dump_stack generator
352 */
353void dump_stack(void)
354{
355 unsigned long bp = 0;
356 unsigned long stack;
357
358#ifdef CONFIG_FRAME_POINTER
359 if (!bp)
360 get_bp(bp);
361#endif
362
363 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
364 current->pid, current->comm, print_tainted(),
365 init_utsname()->release,
366 (int)strcspn(init_utsname()->version, " "),
367 init_utsname()->version);
368 show_trace(NULL, NULL, &stack, bp);
369}
370EXPORT_SYMBOL(dump_stack);
371
372void show_registers(struct pt_regs *regs) 239void show_registers(struct pt_regs *regs)
373{ 240{
374 int i; 241 int i;
@@ -429,147 +296,3 @@ int is_valid_bugaddr(unsigned long ip)
429 return ud2 == 0x0b0f; 296 return ud2 == 0x0b0f;
430} 297}
431 298
432static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
433static int die_owner = -1;
434static unsigned int die_nest_count;
435
436unsigned __kprobes long oops_begin(void)
437{
438 int cpu;
439 unsigned long flags;
440
441 oops_enter();
442
443 /* racy, but better than risking deadlock. */
444 raw_local_irq_save(flags);
445 cpu = smp_processor_id();
446 if (!__raw_spin_trylock(&die_lock)) {
447 if (cpu == die_owner)
448 /* nested oops. should stop eventually */;
449 else
450 __raw_spin_lock(&die_lock);
451 }
452 die_nest_count++;
453 die_owner = cpu;
454 console_verbose();
455 bust_spinlocks(1);
456 return flags;
457}
458
459void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
460{
461 die_owner = -1;
462 bust_spinlocks(0);
463 die_nest_count--;
464 if (!die_nest_count)
465 /* Nest count reaches zero, release the lock. */
466 __raw_spin_unlock(&die_lock);
467 raw_local_irq_restore(flags);
468 if (!regs) {
469 oops_exit();
470 return;
471 }
472 if (in_interrupt())
473 panic("Fatal exception in interrupt");
474 if (panic_on_oops)
475 panic("Fatal exception");
476 oops_exit();
477 do_exit(signr);
478}
479
480int __kprobes __die(const char *str, struct pt_regs *regs, long err)
481{
482 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
483#ifdef CONFIG_PREEMPT
484 printk("PREEMPT ");
485#endif
486#ifdef CONFIG_SMP
487 printk("SMP ");
488#endif
489#ifdef CONFIG_DEBUG_PAGEALLOC
490 printk("DEBUG_PAGEALLOC");
491#endif
492 printk("\n");
493 sysfs_printk_last_file();
494 if (notify_die(DIE_OOPS, str, regs, err,
495 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
496 return 1;
497
498 show_registers(regs);
499 add_taint(TAINT_DIE);
500 /* Executive summary in case the oops scrolled away */
501 printk(KERN_ALERT "RIP ");
502 printk_address(regs->ip, 1);
503 printk(" RSP <%016lx>\n", regs->sp);
504 if (kexec_should_crash(current))
505 crash_kexec(regs);
506 return 0;
507}
508
509void die(const char *str, struct pt_regs *regs, long err)
510{
511 unsigned long flags = oops_begin();
512
513 if (!user_mode(regs))
514 report_bug(regs->ip, regs);
515
516 if (__die(str, regs, err))
517 regs = NULL;
518 oops_end(flags, regs, SIGSEGV);
519}
520
521notrace __kprobes void
522die_nmi(char *str, struct pt_regs *regs, int do_panic)
523{
524 unsigned long flags;
525
526 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
527 return;
528
529 flags = oops_begin();
530 /*
531 * We are in trouble anyway, lets at least try
532 * to get a message out.
533 */
534 printk(KERN_EMERG "%s", str);
535 printk(" on CPU%d, ip %08lx, registers:\n",
536 smp_processor_id(), regs->ip);
537 show_registers(regs);
538 if (kexec_should_crash(current))
539 crash_kexec(regs);
540 if (do_panic || panic_on_oops)
541 panic("Non maskable interrupt");
542 oops_end(flags, NULL, SIGBUS);
543 nmi_exit();
544 local_irq_enable();
545 do_exit(SIGBUS);
546}
547
548static int __init oops_setup(char *s)
549{
550 if (!s)
551 return -EINVAL;
552 if (!strcmp(s, "panic"))
553 panic_on_oops = 1;
554 return 0;
555}
556early_param("oops", oops_setup);
557
558static int __init kstack_setup(char *s)
559{
560 if (!s)
561 return -EINVAL;
562 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
563 return 0;
564}
565early_param("kstack", kstack_setup);
566
567static int __init code_bytes_setup(char *s)
568{
569 code_bytes = simple_strtoul(s, NULL, 0);
570 if (code_bytes > 8192)
571 code_bytes = 8192;
572
573 return 1;
574}
575__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7aafeb5263ef..65a13943e098 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -677,22 +677,6 @@ struct early_res {
677}; 677};
678static struct early_res early_res[MAX_EARLY_RES] __initdata = { 678static struct early_res early_res[MAX_EARLY_RES] __initdata = {
679 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ 679 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
680#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
681 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
682#endif
683#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
684 /*
685 * But first pinch a few for the stack/trampoline stuff
686 * FIXME: Don't need the extra page at 4K, but need to fix
687 * trampoline before removing it. (see the GDT stuff)
688 */
689 { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
690 /*
691 * Has to be in very low memory so we can execute
692 * real-mode AP code.
693 */
694 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
695#endif
696 {} 680 {}
697}; 681};
698 682
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 1b894b72c0f5..744aa7fc49d5 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -17,6 +17,7 @@
17#include <asm/io_apic.h> 17#include <asm/io_apic.h>
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <asm/iommu.h> 19#include <asm/iommu.h>
20#include <asm/gart.h>
20 21
21static void __init fix_hypertransport_config(int num, int slot, int func) 22static void __init fix_hypertransport_config(int num, int slot, int func)
22{ 23{
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 34ad997d3834..504ad198e4ad 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -875,49 +875,6 @@ static struct console early_dbgp_console = {
875}; 875};
876#endif 876#endif
877 877
878/* Console interface to a host file on AMD's SimNow! */
879
880static int simnow_fd;
881
882enum {
883 MAGIC1 = 0xBACCD00A,
884 MAGIC2 = 0xCA110000,
885 XOPEN = 5,
886 XWRITE = 4,
887};
888
889static noinline long simnow(long cmd, long a, long b, long c)
890{
891 long ret;
892
893 asm volatile("cpuid" :
894 "=a" (ret) :
895 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
896 return ret;
897}
898
899static void __init simnow_init(char *str)
900{
901 char *fn = "klog";
902
903 if (*str == '=')
904 fn = ++str;
905 /* error ignored */
906 simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
907}
908
909static void simnow_write(struct console *con, const char *s, unsigned n)
910{
911 simnow(XWRITE, simnow_fd, (unsigned long)s, n);
912}
913
914static struct console simnow_console = {
915 .name = "simnow",
916 .write = simnow_write,
917 .flags = CON_PRINTBUFFER,
918 .index = -1,
919};
920
921/* Direct interface for emergencies */ 878/* Direct interface for emergencies */
922static struct console *early_console = &early_vga_console; 879static struct console *early_console = &early_vga_console;
923static int __initdata early_console_initialized; 880static int __initdata early_console_initialized;
@@ -929,7 +886,7 @@ asmlinkage void early_printk(const char *fmt, ...)
929 va_list ap; 886 va_list ap;
930 887
931 va_start(ap, fmt); 888 va_start(ap, fmt);
932 n = vscnprintf(buf, 512, fmt, ap); 889 n = vscnprintf(buf, sizeof(buf), fmt, ap);
933 early_console->write(early_console, buf, n); 890 early_console->write(early_console, buf, n);
934 va_end(ap); 891 va_end(ap);
935} 892}
@@ -960,10 +917,6 @@ static int __init setup_early_printk(char *buf)
960 max_ypos = boot_params.screen_info.orig_video_lines; 917 max_ypos = boot_params.screen_info.orig_video_lines;
961 current_ypos = boot_params.screen_info.orig_y; 918 current_ypos = boot_params.screen_info.orig_y;
962 early_console = &early_vga_console; 919 early_console = &early_vga_console;
963 } else if (!strncmp(buf, "simnow", 6)) {
964 simnow_init(buf + 6);
965 early_console = &simnow_console;
966 keep_early = 1;
967#ifdef CONFIG_EARLY_PRINTK_DBGP 920#ifdef CONFIG_EARLY_PRINTK_DBGP
968 } else if (!strncmp(buf, "dbgp", 4)) { 921 } else if (!strncmp(buf, "dbgp", 4)) {
969 if (early_dbgp_init(buf+4) < 0) 922 if (early_dbgp_init(buf+4) < 0)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 28b597ef9ca1..d6f0490a7391 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -619,28 +619,37 @@ END(syscall_badsys)
61927:; 61927:;
620 620
621/* 621/*
622 * Build the entry stubs and pointer table with 622 * Build the entry stubs and pointer table with some assembler magic.
623 * some assembler magic. 623 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
624 * single cache line on all modern x86 implementations.
624 */ 625 */
625.section .rodata,"a" 626.section .init.rodata,"a"
626ENTRY(interrupt) 627ENTRY(interrupt)
627.text 628.text
628 629 .p2align 5
630 .p2align CONFIG_X86_L1_CACHE_SHIFT
629ENTRY(irq_entries_start) 631ENTRY(irq_entries_start)
630 RING0_INT_FRAME 632 RING0_INT_FRAME
631vector=0 633vector=FIRST_EXTERNAL_VECTOR
632.rept NR_VECTORS 634.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
633 ALIGN 635 .balign 32
634 .if vector 636 .rept 7
637 .if vector < NR_VECTORS
638 .if vector <> FIRST_EXTERNAL_VECTOR
635 CFI_ADJUST_CFA_OFFSET -4 639 CFI_ADJUST_CFA_OFFSET -4
636 .endif 640 .endif
6371: pushl $~(vector) 6411: pushl $(~vector+0x80) /* Note: always in signed byte range */
638 CFI_ADJUST_CFA_OFFSET 4 642 CFI_ADJUST_CFA_OFFSET 4
639 jmp common_interrupt 643 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
640 .previous 644 jmp 2f
645 .endif
646 .previous
641 .long 1b 647 .long 1b
642 .text 648 .text
643vector=vector+1 649vector=vector+1
650 .endif
651 .endr
6522: jmp common_interrupt
644.endr 653.endr
645END(irq_entries_start) 654END(irq_entries_start)
646 655
@@ -652,8 +661,9 @@ END(interrupt)
652 * the CPU automatically disables interrupts when executing an IRQ vector, 661 * the CPU automatically disables interrupts when executing an IRQ vector,
653 * so IRQ-flags tracing has to follow that: 662 * so IRQ-flags tracing has to follow that:
654 */ 663 */
655 ALIGN 664 .p2align CONFIG_X86_L1_CACHE_SHIFT
656common_interrupt: 665common_interrupt:
666 addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
657 SAVE_ALL 667 SAVE_ALL
658 TRACE_IRQS_OFF 668 TRACE_IRQS_OFF
659 movl %esp,%eax 669 movl %esp,%eax
@@ -678,65 +688,6 @@ ENDPROC(name)
678/* The include is where all of the SMP etc. interrupts come from */ 688/* The include is where all of the SMP etc. interrupts come from */
679#include "entry_arch.h" 689#include "entry_arch.h"
680 690
681KPROBE_ENTRY(page_fault)
682 RING0_EC_FRAME
683 pushl $do_page_fault
684 CFI_ADJUST_CFA_OFFSET 4
685 ALIGN
686error_code:
687 /* the function address is in %fs's slot on the stack */
688 pushl %es
689 CFI_ADJUST_CFA_OFFSET 4
690 /*CFI_REL_OFFSET es, 0*/
691 pushl %ds
692 CFI_ADJUST_CFA_OFFSET 4
693 /*CFI_REL_OFFSET ds, 0*/
694 pushl %eax
695 CFI_ADJUST_CFA_OFFSET 4
696 CFI_REL_OFFSET eax, 0
697 pushl %ebp
698 CFI_ADJUST_CFA_OFFSET 4
699 CFI_REL_OFFSET ebp, 0
700 pushl %edi
701 CFI_ADJUST_CFA_OFFSET 4
702 CFI_REL_OFFSET edi, 0
703 pushl %esi
704 CFI_ADJUST_CFA_OFFSET 4
705 CFI_REL_OFFSET esi, 0
706 pushl %edx
707 CFI_ADJUST_CFA_OFFSET 4
708 CFI_REL_OFFSET edx, 0
709 pushl %ecx
710 CFI_ADJUST_CFA_OFFSET 4
711 CFI_REL_OFFSET ecx, 0
712 pushl %ebx
713 CFI_ADJUST_CFA_OFFSET 4
714 CFI_REL_OFFSET ebx, 0
715 cld
716 pushl %fs
717 CFI_ADJUST_CFA_OFFSET 4
718 /*CFI_REL_OFFSET fs, 0*/
719 movl $(__KERNEL_PERCPU), %ecx
720 movl %ecx, %fs
721 UNWIND_ESPFIX_STACK
722 popl %ecx
723 CFI_ADJUST_CFA_OFFSET -4
724 /*CFI_REGISTER es, ecx*/
725 movl PT_FS(%esp), %edi # get the function address
726 movl PT_ORIG_EAX(%esp), %edx # get the error code
727 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
728 mov %ecx, PT_FS(%esp)
729 /*CFI_REL_OFFSET fs, ES*/
730 movl $(__USER_DS), %ecx
731 movl %ecx, %ds
732 movl %ecx, %es
733 TRACE_IRQS_OFF
734 movl %esp,%eax # pt_regs pointer
735 call *%edi
736 jmp ret_from_exception
737 CFI_ENDPROC
738KPROBE_END(page_fault)
739
740ENTRY(coprocessor_error) 691ENTRY(coprocessor_error)
741 RING0_INT_FRAME 692 RING0_INT_FRAME
742 pushl $0 693 pushl $0
@@ -767,140 +718,6 @@ ENTRY(device_not_available)
767 CFI_ENDPROC 718 CFI_ENDPROC
768END(device_not_available) 719END(device_not_available)
769 720
770/*
771 * Debug traps and NMI can happen at the one SYSENTER instruction
772 * that sets up the real kernel stack. Check here, since we can't
773 * allow the wrong stack to be used.
774 *
775 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
776 * already pushed 3 words if it hits on the sysenter instruction:
777 * eflags, cs and eip.
778 *
779 * We just load the right stack, and push the three (known) values
780 * by hand onto the new stack - while updating the return eip past
781 * the instruction that would have done it for sysenter.
782 */
783#define FIX_STACK(offset, ok, label) \
784 cmpw $__KERNEL_CS,4(%esp); \
785 jne ok; \
786label: \
787 movl TSS_sysenter_sp0+offset(%esp),%esp; \
788 CFI_DEF_CFA esp, 0; \
789 CFI_UNDEFINED eip; \
790 pushfl; \
791 CFI_ADJUST_CFA_OFFSET 4; \
792 pushl $__KERNEL_CS; \
793 CFI_ADJUST_CFA_OFFSET 4; \
794 pushl $sysenter_past_esp; \
795 CFI_ADJUST_CFA_OFFSET 4; \
796 CFI_REL_OFFSET eip, 0
797
798KPROBE_ENTRY(debug)
799 RING0_INT_FRAME
800 cmpl $ia32_sysenter_target,(%esp)
801 jne debug_stack_correct
802 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
803debug_stack_correct:
804 pushl $-1 # mark this as an int
805 CFI_ADJUST_CFA_OFFSET 4
806 SAVE_ALL
807 TRACE_IRQS_OFF
808 xorl %edx,%edx # error code 0
809 movl %esp,%eax # pt_regs pointer
810 call do_debug
811 jmp ret_from_exception
812 CFI_ENDPROC
813KPROBE_END(debug)
814
815/*
816 * NMI is doubly nasty. It can happen _while_ we're handling
817 * a debug fault, and the debug fault hasn't yet been able to
818 * clear up the stack. So we first check whether we got an
819 * NMI on the sysenter entry path, but after that we need to
820 * check whether we got an NMI on the debug path where the debug
821 * fault happened on the sysenter path.
822 */
823KPROBE_ENTRY(nmi)
824 RING0_INT_FRAME
825 pushl %eax
826 CFI_ADJUST_CFA_OFFSET 4
827 movl %ss, %eax
828 cmpw $__ESPFIX_SS, %ax
829 popl %eax
830 CFI_ADJUST_CFA_OFFSET -4
831 je nmi_espfix_stack
832 cmpl $ia32_sysenter_target,(%esp)
833 je nmi_stack_fixup
834 pushl %eax
835 CFI_ADJUST_CFA_OFFSET 4
836 movl %esp,%eax
837 /* Do not access memory above the end of our stack page,
838 * it might not exist.
839 */
840 andl $(THREAD_SIZE-1),%eax
841 cmpl $(THREAD_SIZE-20),%eax
842 popl %eax
843 CFI_ADJUST_CFA_OFFSET -4
844 jae nmi_stack_correct
845 cmpl $ia32_sysenter_target,12(%esp)
846 je nmi_debug_stack_check
847nmi_stack_correct:
848 /* We have a RING0_INT_FRAME here */
849 pushl %eax
850 CFI_ADJUST_CFA_OFFSET 4
851 SAVE_ALL
852 TRACE_IRQS_OFF
853 xorl %edx,%edx # zero error code
854 movl %esp,%eax # pt_regs pointer
855 call do_nmi
856 jmp restore_nocheck_notrace
857 CFI_ENDPROC
858
859nmi_stack_fixup:
860 RING0_INT_FRAME
861 FIX_STACK(12,nmi_stack_correct, 1)
862 jmp nmi_stack_correct
863
864nmi_debug_stack_check:
865 /* We have a RING0_INT_FRAME here */
866 cmpw $__KERNEL_CS,16(%esp)
867 jne nmi_stack_correct
868 cmpl $debug,(%esp)
869 jb nmi_stack_correct
870 cmpl $debug_esp_fix_insn,(%esp)
871 ja nmi_stack_correct
872 FIX_STACK(24,nmi_stack_correct, 1)
873 jmp nmi_stack_correct
874
875nmi_espfix_stack:
876 /* We have a RING0_INT_FRAME here.
877 *
878 * create the pointer to lss back
879 */
880 pushl %ss
881 CFI_ADJUST_CFA_OFFSET 4
882 pushl %esp
883 CFI_ADJUST_CFA_OFFSET 4
884 addw $4, (%esp)
885 /* copy the iret frame of 12 bytes */
886 .rept 3
887 pushl 16(%esp)
888 CFI_ADJUST_CFA_OFFSET 4
889 .endr
890 pushl %eax
891 CFI_ADJUST_CFA_OFFSET 4
892 SAVE_ALL
893 TRACE_IRQS_OFF
894 FIXUP_ESPFIX_STACK # %eax == %esp
895 xorl %edx,%edx # zero error code
896 call do_nmi
897 RESTORE_REGS
898 lss 12+4(%esp), %esp # back to espfix stack
899 CFI_ADJUST_CFA_OFFSET -24
900 jmp irq_return
901 CFI_ENDPROC
902KPROBE_END(nmi)
903
904#ifdef CONFIG_PARAVIRT 721#ifdef CONFIG_PARAVIRT
905ENTRY(native_iret) 722ENTRY(native_iret)
906 iret 723 iret
@@ -916,19 +733,6 @@ ENTRY(native_irq_enable_sysexit)
916END(native_irq_enable_sysexit) 733END(native_irq_enable_sysexit)
917#endif 734#endif
918 735
919KPROBE_ENTRY(int3)
920 RING0_INT_FRAME
921 pushl $-1 # mark this as an int
922 CFI_ADJUST_CFA_OFFSET 4
923 SAVE_ALL
924 TRACE_IRQS_OFF
925 xorl %edx,%edx # zero error code
926 movl %esp,%eax # pt_regs pointer
927 call do_int3
928 jmp ret_from_exception
929 CFI_ENDPROC
930KPROBE_END(int3)
931
932ENTRY(overflow) 736ENTRY(overflow)
933 RING0_INT_FRAME 737 RING0_INT_FRAME
934 pushl $0 738 pushl $0
@@ -993,14 +797,6 @@ ENTRY(stack_segment)
993 CFI_ENDPROC 797 CFI_ENDPROC
994END(stack_segment) 798END(stack_segment)
995 799
996KPROBE_ENTRY(general_protection)
997 RING0_EC_FRAME
998 pushl $do_general_protection
999 CFI_ADJUST_CFA_OFFSET 4
1000 jmp error_code
1001 CFI_ENDPROC
1002KPROBE_END(general_protection)
1003
1004ENTRY(alignment_check) 800ENTRY(alignment_check)
1005 RING0_EC_FRAME 801 RING0_EC_FRAME
1006 pushl $do_alignment_check 802 pushl $do_alignment_check
@@ -1051,6 +847,7 @@ ENTRY(kernel_thread_helper)
1051 push %eax 847 push %eax
1052 CFI_ADJUST_CFA_OFFSET 4 848 CFI_ADJUST_CFA_OFFSET 4
1053 call do_exit 849 call do_exit
850 ud2 # padding for call trace
1054 CFI_ENDPROC 851 CFI_ENDPROC
1055ENDPROC(kernel_thread_helper) 852ENDPROC(kernel_thread_helper)
1056 853
@@ -1157,6 +954,9 @@ ENTRY(mcount)
1157END(mcount) 954END(mcount)
1158 955
1159ENTRY(ftrace_caller) 956ENTRY(ftrace_caller)
957 cmpl $0, function_trace_stop
958 jne ftrace_stub
959
1160 pushl %eax 960 pushl %eax
1161 pushl %ecx 961 pushl %ecx
1162 pushl %edx 962 pushl %edx
@@ -1171,6 +971,11 @@ ftrace_call:
1171 popl %edx 971 popl %edx
1172 popl %ecx 972 popl %ecx
1173 popl %eax 973 popl %eax
974#ifdef CONFIG_FUNCTION_GRAPH_TRACER
975.globl ftrace_graph_call
976ftrace_graph_call:
977 jmp ftrace_stub
978#endif
1174 979
1175.globl ftrace_stub 980.globl ftrace_stub
1176ftrace_stub: 981ftrace_stub:
@@ -1180,8 +985,18 @@ END(ftrace_caller)
1180#else /* ! CONFIG_DYNAMIC_FTRACE */ 985#else /* ! CONFIG_DYNAMIC_FTRACE */
1181 986
1182ENTRY(mcount) 987ENTRY(mcount)
988 cmpl $0, function_trace_stop
989 jne ftrace_stub
990
1183 cmpl $ftrace_stub, ftrace_trace_function 991 cmpl $ftrace_stub, ftrace_trace_function
1184 jnz trace 992 jnz trace
993#ifdef CONFIG_FUNCTION_GRAPH_TRACER
994 cmpl $ftrace_stub, ftrace_graph_return
995 jnz ftrace_graph_caller
996
997 cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
998 jnz ftrace_graph_caller
999#endif
1185.globl ftrace_stub 1000.globl ftrace_stub
1186ftrace_stub: 1001ftrace_stub:
1187 ret 1002 ret
@@ -1200,13 +1015,268 @@ trace:
1200 popl %edx 1015 popl %edx
1201 popl %ecx 1016 popl %ecx
1202 popl %eax 1017 popl %eax
1203
1204 jmp ftrace_stub 1018 jmp ftrace_stub
1205END(mcount) 1019END(mcount)
1206#endif /* CONFIG_DYNAMIC_FTRACE */ 1020#endif /* CONFIG_DYNAMIC_FTRACE */
1207#endif /* CONFIG_FUNCTION_TRACER */ 1021#endif /* CONFIG_FUNCTION_TRACER */
1208 1022
1023#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/*
 * ftrace_graph_caller: hand the current call over to prepare_ftrace_return().
 *
 * Does nothing (jumps to ftrace_stub) while function_trace_stop is non-zero.
 * Otherwise %eax/%ecx/%edx are saved around the call and restored after it.
 * Arguments as set up below:
 *   %eax = address of 4(%ebp)
 *   %edx = the word just above the three saved registers, minus
 *          MCOUNT_INSN_SIZE
 * NOTE(review): presumably %eax points at the parent's return-address slot
 * and %edx is the address of the mcount call site - confirm against
 * prepare_ftrace_return().
 */
1024ENTRY(ftrace_graph_caller)
1025 cmpl $0, function_trace_stop
1026 jne ftrace_stub
1027
1028 pushl %eax
1029 pushl %ecx
1030 pushl %edx
1031 movl 0xc(%esp), %edx
1032 lea 0x4(%ebp), %eax
1033 subl $MCOUNT_INSN_SIZE, %edx
1034 call prepare_ftrace_return
1035 popl %edx
1036 popl %ecx
1037 popl %eax
1038 ret
1039END(ftrace_graph_caller)
1040
/*
 * return_to_handler: call ftrace_return_to_handler() and then "return" to
 * whatever address it hands back.
 *
 * A zero placeholder is pushed first; after the call, the value returned
 * in %eax is written into that placeholder (0xc(%esp), i.e. just above the
 * three saved registers).  Once %edx/%ecx/%eax are restored, the final
 * ret pops that stored value as its target.
 */
1041.globl return_to_handler
1042return_to_handler:
1043 pushl $0
1044 pushl %eax
1045 pushl %ecx
1046 pushl %edx
1047 call ftrace_return_to_handler
1048 movl %eax, 0xc(%esp)
1049 popl %edx
1050 popl %ecx
1051 popl %eax
1052 ret
1053#endif
1054
1209.section .rodata,"a" 1055.section .rodata,"a"
1210#include "syscall_table_32.S" 1056#include "syscall_table_32.S"
1211 1057
1212syscall_table_size=(.-sys_call_table) 1058syscall_table_size=(.-sys_call_table)
1059
1060/*
1061 * Some functions should be protected against kprobes
1062 */
1063 .pushsection .kprobes.text, "ax"
1064
/*
 * page_fault: push the address of do_page_fault and fall through into
 * error_code.
 *
 * error_code is the shared tail: it expects the handler address on top of
 * the stack (in what becomes the %fs slot), saves the registers, loads the
 * kernel per-cpu and user data segments, and calls the handler with
 *   %eax = pt_regs pointer, %edx = error code,
 * then leaves through ret_from_exception.
 */
1065ENTRY(page_fault)
1066 RING0_EC_FRAME
1067 pushl $do_page_fault
1068 CFI_ADJUST_CFA_OFFSET 4
1069 ALIGN
1070error_code:
1071 /* the function address is in %fs's slot on the stack */
1072 pushl %es
1073 CFI_ADJUST_CFA_OFFSET 4
1074 /*CFI_REL_OFFSET es, 0*/
1075 pushl %ds
1076 CFI_ADJUST_CFA_OFFSET 4
1077 /*CFI_REL_OFFSET ds, 0*/
1078 pushl %eax
1079 CFI_ADJUST_CFA_OFFSET 4
1080 CFI_REL_OFFSET eax, 0
1081 pushl %ebp
1082 CFI_ADJUST_CFA_OFFSET 4
1083 CFI_REL_OFFSET ebp, 0
1084 pushl %edi
1085 CFI_ADJUST_CFA_OFFSET 4
1086 CFI_REL_OFFSET edi, 0
1087 pushl %esi
1088 CFI_ADJUST_CFA_OFFSET 4
1089 CFI_REL_OFFSET esi, 0
1090 pushl %edx
1091 CFI_ADJUST_CFA_OFFSET 4
1092 CFI_REL_OFFSET edx, 0
1093 pushl %ecx
1094 CFI_ADJUST_CFA_OFFSET 4
1095 CFI_REL_OFFSET ecx, 0
1096 pushl %ebx
1097 CFI_ADJUST_CFA_OFFSET 4
1098 CFI_REL_OFFSET ebx, 0
1099 cld
 /*
  * %fs is pushed only temporarily: it is popped into %ecx below and
  * then stored into the PT_FS slot once the handler address has been
  * read out of that slot.
  */
1100 pushl %fs
1101 CFI_ADJUST_CFA_OFFSET 4
1102 /*CFI_REL_OFFSET fs, 0*/
1103 movl $(__KERNEL_PERCPU), %ecx
1104 movl %ecx, %fs
1105 UNWIND_ESPFIX_STACK
1106 popl %ecx
1107 CFI_ADJUST_CFA_OFFSET -4
1108 /*CFI_REGISTER es, ecx*/
1109 movl PT_FS(%esp), %edi # get the function address
1110 movl PT_ORIG_EAX(%esp), %edx # get the error code
1111 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
1112 mov %ecx, PT_FS(%esp)
1113 /*CFI_REL_OFFSET fs, ES*/
1114 movl $(__USER_DS), %ecx
1115 movl %ecx, %ds
1116 movl %ecx, %es
1117 TRACE_IRQS_OFF
1118 movl %esp,%eax # pt_regs pointer
1119 call *%edi
1120 jmp ret_from_exception
1121 CFI_ENDPROC
1122END(page_fault)
1123
1124/*
1125 * Debug traps and NMI can happen at the one SYSENTER instruction
1126 * that sets up the real kernel stack. Check here, since we can't
1127 * allow the wrong stack to be used.
1128 *
1129 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
1130 * already pushed 3 words if it hits on the sysenter instruction:
1131 * eflags, cs and eip.
1132 *
1133 * We just load the right stack, and push the three (known) values
1134 * by hand onto the new stack - while updating the return eip past
1135 * the instruction that would have done it for sysenter.
 *
 * Macro arguments:
 *   offset - added to TSS_sysenter_sp0 in the displacement used to
 *            load the new %esp
 *   ok     - branch target taken when the word at 4(%esp) is not
 *            __KERNEL_CS (no fixup is done in that case)
 *   label  - emitted as a label on the instruction that loads %esp
1136 */
1137#define FIX_STACK(offset, ok, label) \
1138 cmpw $__KERNEL_CS,4(%esp); \
1139 jne ok; \
1140label: \
1141 movl TSS_sysenter_sp0+offset(%esp),%esp; \
1142 CFI_DEF_CFA esp, 0; \
1143 CFI_UNDEFINED eip; \
1144 pushfl; \
1145 CFI_ADJUST_CFA_OFFSET 4; \
1146 pushl $__KERNEL_CS; \
1147 CFI_ADJUST_CFA_OFFSET 4; \
1148 pushl $sysenter_past_esp; \
1149 CFI_ADJUST_CFA_OFFSET 4; \
1150 CFI_REL_OFFSET eip, 0
1151
/*
 * debug: entry stub that calls do_debug.
 *
 * If the word on top of the stack equals ia32_sysenter_target, FIX_STACK
 * is applied first (with debug_esp_fix_insn as the label on its stack
 * load).  After that the stub pushes -1, saves all registers and calls
 * do_debug with %eax = pt_regs pointer and %edx = 0, leaving through
 * ret_from_exception.
 */
1152ENTRY(debug)
1153 RING0_INT_FRAME
1154 cmpl $ia32_sysenter_target,(%esp)
1155 jne debug_stack_correct
1156 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
1157debug_stack_correct:
1158 pushl $-1 # mark this as an int
1159 CFI_ADJUST_CFA_OFFSET 4
1160 SAVE_ALL
1161 TRACE_IRQS_OFF
1162 xorl %edx,%edx # error code 0
1163 movl %esp,%eax # pt_regs pointer
1164 call do_debug
1165 jmp ret_from_exception
1166 CFI_ENDPROC
1167END(debug)
1168
1169/*
1170 * NMI is doubly nasty. It can happen _while_ we're handling
1171 * a debug fault, and the debug fault hasn't yet been able to
1172 * clear up the stack. So we first check whether we got an
1173 * NMI on the sysenter entry path, but after that we need to
1174 * check whether we got an NMI on the debug path where the debug
1175 * fault happened on the sysenter path.
1176 */
/*
 * nmi: entry stub that calls do_nmi.
 *
 * Dispatch order, as coded below:
 *   1. %ss == __ESPFIX_SS                     -> nmi_espfix_stack
 *   2. (%esp) == ia32_sysenter_target         -> nmi_stack_fixup
 *   3. %esp within 20 bytes of the end of the
 *      THREAD_SIZE-aligned stack area         -> nmi_stack_correct
 *      (avoids reading 12(%esp) past the stack)
 *   4. 12(%esp) == ia32_sysenter_target       -> nmi_debug_stack_check
 *   5. otherwise                              -> nmi_stack_correct
 *
 * nmi_stack_correct calls do_nmi with %eax = pt_regs pointer and %edx = 0
 * and leaves through restore_nocheck_notrace.
 */
1177ENTRY(nmi)
1178 RING0_INT_FRAME
1179 pushl %eax
1180 CFI_ADJUST_CFA_OFFSET 4
1181 movl %ss, %eax
1182 cmpw $__ESPFIX_SS, %ax
1183 popl %eax
1184 CFI_ADJUST_CFA_OFFSET -4
1185 je nmi_espfix_stack
1186 cmpl $ia32_sysenter_target,(%esp)
1187 je nmi_stack_fixup
1188 pushl %eax
1189 CFI_ADJUST_CFA_OFFSET 4
1190 movl %esp,%eax
1191 /* Do not access memory above the end of our stack page,
1192 * it might not exist.
1193 */
1194 andl $(THREAD_SIZE-1),%eax
1195 cmpl $(THREAD_SIZE-20),%eax
1196 popl %eax
1197 CFI_ADJUST_CFA_OFFSET -4
1198 jae nmi_stack_correct
1199 cmpl $ia32_sysenter_target,12(%esp)
1200 je nmi_debug_stack_check
1201nmi_stack_correct:
1202 /* We have a RING0_INT_FRAME here */
1203 pushl %eax
1204 CFI_ADJUST_CFA_OFFSET 4
1205 SAVE_ALL
1206 TRACE_IRQS_OFF
1207 xorl %edx,%edx # zero error code
1208 movl %esp,%eax # pt_regs pointer
1209 call do_nmi
1210 jmp restore_nocheck_notrace
1211 CFI_ENDPROC
1212
 /* Top of stack is ia32_sysenter_target: apply FIX_STACK, then proceed. */
1213nmi_stack_fixup:
1214 RING0_INT_FRAME
1215 FIX_STACK(12,nmi_stack_correct, 1)
1216 jmp nmi_stack_correct
1217
 /*
  * Only fix the stack when the interrupted code ran with __KERNEL_CS
  * and its address lies in the range [debug, debug_esp_fix_insn].
  */
1218nmi_debug_stack_check:
1219 /* We have a RING0_INT_FRAME here */
1220 cmpw $__KERNEL_CS,16(%esp)
1221 jne nmi_stack_correct
1222 cmpl $debug,(%esp)
1223 jb nmi_stack_correct
1224 cmpl $debug_esp_fix_insn,(%esp)
1225 ja nmi_stack_correct
1226 FIX_STACK(24,nmi_stack_correct, 1)
1227 jmp nmi_stack_correct
1228
1229nmi_espfix_stack:
1230 /* We have a RING0_INT_FRAME here.
1231 *
1232 * create the pointer to lss back
1233 */
1234 pushl %ss
1235 CFI_ADJUST_CFA_OFFSET 4
1236 pushl %esp
1237 CFI_ADJUST_CFA_OFFSET 4
1238 addw $4, (%esp)
1239 /* copy the iret frame of 12 bytes */
1240 .rept 3
1241 pushl 16(%esp)
1242 CFI_ADJUST_CFA_OFFSET 4
1243 .endr
1244 pushl %eax
1245 CFI_ADJUST_CFA_OFFSET 4
1246 SAVE_ALL
1247 TRACE_IRQS_OFF
1248 FIXUP_ESPFIX_STACK # %eax == %esp
1249 xorl %edx,%edx # zero error code
1250 call do_nmi
1251 RESTORE_REGS
1252 lss 12+4(%esp), %esp # back to espfix stack
1253 CFI_ADJUST_CFA_OFFSET -24
1254 jmp irq_return
1255 CFI_ENDPROC
1256END(nmi)
1257
/*
 * int3: entry stub that calls do_int3.
 *
 * Pushes -1, saves all registers, then calls do_int3 with
 * %eax = pt_regs pointer and %edx = 0 (error code), leaving through
 * ret_from_exception.
 */
1258ENTRY(int3)
1259 RING0_INT_FRAME
1260 pushl $-1 # mark this as an int
1261 CFI_ADJUST_CFA_OFFSET 4
1262 SAVE_ALL
1263 TRACE_IRQS_OFF
1264 xorl %edx,%edx # zero error code
1265 movl %esp,%eax # pt_regs pointer
1266 call do_int3
1267 jmp ret_from_exception
1268 CFI_ENDPROC
1269END(int3)
1270
/*
 * general_protection: push the address of do_general_protection and jump
 * to the shared error_code path, which picks the handler up from the
 * stack.
 */
1271ENTRY(general_protection)
1272 RING0_EC_FRAME
1273 pushl $do_general_protection
1274 CFI_ADJUST_CFA_OFFSET 4
1275 jmp error_code
1276 CFI_ENDPROC
1277END(general_protection)
1278
1279/*
1280 * End of kprobes section
1281 */
1282 .popsection
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b86f332c96a6..e28c7a987793 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -11,15 +11,15 @@
11 * 11 *
12 * NOTE: This code handles signal-recognition, which happens every time 12 * NOTE: This code handles signal-recognition, which happens every time
13 * after an interrupt and after each system call. 13 * after an interrupt and after each system call.
14 * 14 *
15 * Normal syscalls and interrupts don't save a full stack frame, this is 15 * Normal syscalls and interrupts don't save a full stack frame, this is
16 * only done for syscall tracing, signals or fork/exec et.al. 16 * only done for syscall tracing, signals or fork/exec et.al.
17 * 17 *
18 * A note on terminology: 18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP 19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack. 20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers up to R11. 21 * - partial stack frame: partially saved registers up to R11.
22 * - full stack frame: Like partial stack frame, but all register saved. 22 * - full stack frame: Like partial stack frame, but all register saved.
23 * 23 *
24 * Some macro usage: 24 * Some macro usage:
25 * - CFI macros are used to generate dwarf2 unwind information for better 25 * - CFI macros are used to generate dwarf2 unwind information for better
@@ -60,7 +60,6 @@
60#define __AUDIT_ARCH_LE 0x40000000 60#define __AUDIT_ARCH_LE 0x40000000
61 61
62 .code64 62 .code64
63
64#ifdef CONFIG_FUNCTION_TRACER 63#ifdef CONFIG_FUNCTION_TRACER
65#ifdef CONFIG_DYNAMIC_FTRACE 64#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount) 65ENTRY(mcount)
@@ -68,16 +67,10 @@ ENTRY(mcount)
68END(mcount) 67END(mcount)
69 68
70ENTRY(ftrace_caller) 69ENTRY(ftrace_caller)
70 cmpl $0, function_trace_stop
71 jne ftrace_stub
71 72
72 /* taken from glibc */ 73 MCOUNT_SAVE_FRAME
73 subq $0x38, %rsp
74 movq %rax, (%rsp)
75 movq %rcx, 8(%rsp)
76 movq %rdx, 16(%rsp)
77 movq %rsi, 24(%rsp)
78 movq %rdi, 32(%rsp)
79 movq %r8, 40(%rsp)
80 movq %r9, 48(%rsp)
81 74
82 movq 0x38(%rsp), %rdi 75 movq 0x38(%rsp), %rdi
83 movq 8(%rbp), %rsi 76 movq 8(%rbp), %rsi
@@ -87,14 +80,13 @@ ENTRY(ftrace_caller)
87ftrace_call: 80ftrace_call:
88 call ftrace_stub 81 call ftrace_stub
89 82
90 movq 48(%rsp), %r9 83 MCOUNT_RESTORE_FRAME
91 movq 40(%rsp), %r8 84
92 movq 32(%rsp), %rdi 85#ifdef CONFIG_FUNCTION_GRAPH_TRACER
93 movq 24(%rsp), %rsi 86.globl ftrace_graph_call
94 movq 16(%rsp), %rdx 87ftrace_graph_call:
95 movq 8(%rsp), %rcx 88 jmp ftrace_stub
96 movq (%rsp), %rax 89#endif
97 addq $0x38, %rsp
98 90
99.globl ftrace_stub 91.globl ftrace_stub
100ftrace_stub: 92ftrace_stub:
@@ -103,15 +95,63 @@ END(ftrace_caller)
103 95
104#else /* ! CONFIG_DYNAMIC_FTRACE */ 96#else /* ! CONFIG_DYNAMIC_FTRACE */
105ENTRY(mcount) 97ENTRY(mcount)
98 cmpl $0, function_trace_stop
99 jne ftrace_stub
100
106 cmpq $ftrace_stub, ftrace_trace_function 101 cmpq $ftrace_stub, ftrace_trace_function
107 jnz trace 102 jnz trace
103
104#ifdef CONFIG_FUNCTION_GRAPH_TRACER
105 cmpq $ftrace_stub, ftrace_graph_return
106 jnz ftrace_graph_caller
107
108 cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
109 jnz ftrace_graph_caller
110#endif
111
108.globl ftrace_stub 112.globl ftrace_stub
109ftrace_stub: 113ftrace_stub:
110 retq 114 retq
111 115
112trace: 116trace:
113 /* taken from glibc */ 117 MCOUNT_SAVE_FRAME
114 subq $0x38, %rsp 118
119 movq 0x38(%rsp), %rdi
120 movq 8(%rbp), %rsi
121 subq $MCOUNT_INSN_SIZE, %rdi
122
123 call *ftrace_trace_function
124
125 MCOUNT_RESTORE_FRAME
126
127 jmp ftrace_stub
128END(mcount)
129#endif /* CONFIG_DYNAMIC_FTRACE */
130#endif /* CONFIG_FUNCTION_TRACER */
131
132#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/*
 * ftrace_graph_caller: hand the current call over to prepare_ftrace_return().
 *
 * Does nothing (jumps to ftrace_stub) while function_trace_stop is non-zero.
 * Registers are preserved with MCOUNT_SAVE_FRAME/MCOUNT_RESTORE_FRAME.
 * Arguments as set up below:
 *   %rdi = address of 8(%rbp)
 *   %rsi = the word at 0x38(%rsp) after the save, minus MCOUNT_INSN_SIZE
 * NOTE(review): presumably %rdi points at the parent's return-address slot
 * and %rsi is the address of the mcount call site - confirm against
 * prepare_ftrace_return() and the MCOUNT_SAVE_FRAME layout.
 */
133ENTRY(ftrace_graph_caller)
134 cmpl $0, function_trace_stop
135 jne ftrace_stub
136
137 MCOUNT_SAVE_FRAME
138
139 leaq 8(%rbp), %rdi
140 movq 0x38(%rsp), %rsi
141 subq $MCOUNT_INSN_SIZE, %rsi
142
143 call prepare_ftrace_return
144
145 MCOUNT_RESTORE_FRAME
146
147 retq
148END(ftrace_graph_caller)
149
150
151.globl return_to_handler
152return_to_handler:
153 subq $80, %rsp
154
115 movq %rax, (%rsp) 155 movq %rax, (%rsp)
116 movq %rcx, 8(%rsp) 156 movq %rcx, 8(%rsp)
117 movq %rdx, 16(%rsp) 157 movq %rdx, 16(%rsp)
@@ -119,13 +159,14 @@ trace:
119 movq %rdi, 32(%rsp) 159 movq %rdi, 32(%rsp)
120 movq %r8, 40(%rsp) 160 movq %r8, 40(%rsp)
121 movq %r9, 48(%rsp) 161 movq %r9, 48(%rsp)
162 movq %r10, 56(%rsp)
163 movq %r11, 64(%rsp)
122 164
123 movq 0x38(%rsp), %rdi 165 call ftrace_return_to_handler
124 movq 8(%rbp), %rsi
125 subq $MCOUNT_INSN_SIZE, %rdi
126
127 call *ftrace_trace_function
128 166
167 movq %rax, 72(%rsp)
168 movq 64(%rsp), %r11
169 movq 56(%rsp), %r10
129 movq 48(%rsp), %r9 170 movq 48(%rsp), %r9
130 movq 40(%rsp), %r8 171 movq 40(%rsp), %r8
131 movq 32(%rsp), %rdi 172 movq 32(%rsp), %rdi
@@ -133,16 +174,14 @@ trace:
133 movq 16(%rsp), %rdx 174 movq 16(%rsp), %rdx
134 movq 8(%rsp), %rcx 175 movq 8(%rsp), %rcx
135 movq (%rsp), %rax 176 movq (%rsp), %rax
136 addq $0x38, %rsp 177 addq $72, %rsp
178 retq
179#endif
137 180
138 jmp ftrace_stub
139END(mcount)
140#endif /* CONFIG_DYNAMIC_FTRACE */
141#endif /* CONFIG_FUNCTION_TRACER */
142 181
143#ifndef CONFIG_PREEMPT 182#ifndef CONFIG_PREEMPT
144#define retint_kernel retint_restore_args 183#define retint_kernel retint_restore_args
145#endif 184#endif
146 185
147#ifdef CONFIG_PARAVIRT 186#ifdef CONFIG_PARAVIRT
148ENTRY(native_usergs_sysret64) 187ENTRY(native_usergs_sysret64)
@@ -161,29 +200,29 @@ ENTRY(native_usergs_sysret64)
161.endm 200.endm
162 201
163/* 202/*
164 * C code is not supposed to know about undefined top of stack. Every time 203 * C code is not supposed to know about undefined top of stack. Every time
165 * a C function with a pt_regs argument is called from the SYSCALL based 204 * a C function with a pt_regs argument is called from the SYSCALL based
166 * fast path FIXUP_TOP_OF_STACK is needed. 205 * fast path FIXUP_TOP_OF_STACK is needed.
167 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs 206 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
168 * manipulation. 207 * manipulation.
169 */ 208 */
170 209
171 /* %rsp:at FRAMEEND */ 210 /* %rsp:at FRAMEEND */
172 .macro FIXUP_TOP_OF_STACK tmp 211 .macro FIXUP_TOP_OF_STACK tmp offset=0
173 movq %gs:pda_oldrsp,\tmp 212 movq %gs:pda_oldrsp,\tmp
174 movq \tmp,RSP(%rsp) 213 movq \tmp,RSP+\offset(%rsp)
175 movq $__USER_DS,SS(%rsp) 214 movq $__USER_DS,SS+\offset(%rsp)
176 movq $__USER_CS,CS(%rsp) 215 movq $__USER_CS,CS+\offset(%rsp)
177 movq $-1,RCX(%rsp) 216 movq $-1,RCX+\offset(%rsp)
178 movq R11(%rsp),\tmp /* get eflags */ 217 movq R11+\offset(%rsp),\tmp /* get eflags */
179 movq \tmp,EFLAGS(%rsp) 218 movq \tmp,EFLAGS+\offset(%rsp)
180 .endm 219 .endm
181 220
182 .macro RESTORE_TOP_OF_STACK tmp,offset=0 221 .macro RESTORE_TOP_OF_STACK tmp offset=0
183 movq RSP-\offset(%rsp),\tmp 222 movq RSP+\offset(%rsp),\tmp
184 movq \tmp,%gs:pda_oldrsp 223 movq \tmp,%gs:pda_oldrsp
185 movq EFLAGS-\offset(%rsp),\tmp 224 movq EFLAGS+\offset(%rsp),\tmp
186 movq \tmp,R11-\offset(%rsp) 225 movq \tmp,R11+\offset(%rsp)
187 .endm 226 .endm
188 227
189 .macro FAKE_STACK_FRAME child_rip 228 .macro FAKE_STACK_FRAME child_rip
@@ -195,7 +234,7 @@ ENTRY(native_usergs_sysret64)
195 pushq %rax /* rsp */ 234 pushq %rax /* rsp */
196 CFI_ADJUST_CFA_OFFSET 8 235 CFI_ADJUST_CFA_OFFSET 8
197 CFI_REL_OFFSET rsp,0 236 CFI_REL_OFFSET rsp,0
198 pushq $(1<<9) /* eflags - interrupts on */ 237 pushq $X86_EFLAGS_IF /* eflags - interrupts on */
199 CFI_ADJUST_CFA_OFFSET 8 238 CFI_ADJUST_CFA_OFFSET 8
200 /*CFI_REL_OFFSET rflags,0*/ 239 /*CFI_REL_OFFSET rflags,0*/
201 pushq $__KERNEL_CS /* cs */ 240 pushq $__KERNEL_CS /* cs */
@@ -213,62 +252,184 @@ ENTRY(native_usergs_sysret64)
213 CFI_ADJUST_CFA_OFFSET -(6*8) 252 CFI_ADJUST_CFA_OFFSET -(6*8)
214 .endm 253 .endm
215 254
216 .macro CFI_DEFAULT_STACK start=1 255/*
256 * empty frame: only the CFA is defined (rsp + 8 + offset), no register locations are recorded
257 */
258 .macro EMPTY_FRAME start=1 offset=0
217 .if \start 259 .if \start
218 CFI_STARTPROC simple 260 CFI_STARTPROC simple
219 CFI_SIGNAL_FRAME 261 CFI_SIGNAL_FRAME
220 CFI_DEF_CFA rsp,SS+8 262 CFI_DEF_CFA rsp,8+\offset
221 .else 263 .else
222 CFI_DEF_CFA_OFFSET SS+8 264 CFI_DEF_CFA_OFFSET 8+\offset
223 .endif 265 .endif
224 CFI_REL_OFFSET r15,R15
225 CFI_REL_OFFSET r14,R14
226 CFI_REL_OFFSET r13,R13
227 CFI_REL_OFFSET r12,R12
228 CFI_REL_OFFSET rbp,RBP
229 CFI_REL_OFFSET rbx,RBX
230 CFI_REL_OFFSET r11,R11
231 CFI_REL_OFFSET r10,R10
232 CFI_REL_OFFSET r9,R9
233 CFI_REL_OFFSET r8,R8
234 CFI_REL_OFFSET rax,RAX
235 CFI_REL_OFFSET rcx,RCX
236 CFI_REL_OFFSET rdx,RDX
237 CFI_REL_OFFSET rsi,RSI
238 CFI_REL_OFFSET rdi,RDI
239 CFI_REL_OFFSET rip,RIP
240 /*CFI_REL_OFFSET cs,CS*/
241 /*CFI_REL_OFFSET rflags,EFLAGS*/
242 CFI_REL_OFFSET rsp,RSP
243 /*CFI_REL_OFFSET ss,SS*/
244 .endm 266 .endm
267
268/*
269 * initial frame state for interrupts (and exceptions without error code)
270 */
271 .macro INTR_FRAME start=1 offset=0
272 EMPTY_FRAME \start, SS+8+\offset-RIP
273 /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
274 CFI_REL_OFFSET rsp, RSP+\offset-RIP
275 /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
276 /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
277 CFI_REL_OFFSET rip, RIP+\offset-RIP
278 .endm
279
280/*
281 * initial frame state for exceptions with error code (and interrupts
282 * with vector already pushed)
283 */
284 .macro XCPT_FRAME start=1 offset=0
285 INTR_FRAME \start, RIP+\offset-ORIG_RAX
286 /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
287 .endm
288
245/* 289/*
246 * A newly forked process directly context switches into this. 290 * frame that enables calling into C.
247 */ 291 */
248/* rdi: prev */ 292 .macro PARTIAL_FRAME start=1 offset=0
293 XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
294 CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
295 CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
296 CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
297 CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
298 CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
299 CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
300 CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
301 CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
302 CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
303 .endm
304
305/*
306 * frame that enables passing a complete pt_regs to a C function.
307 */
308 .macro DEFAULT_FRAME start=1 offset=0
309 PARTIAL_FRAME \start, R11+\offset-R15
310 CFI_REL_OFFSET rbx, RBX+\offset
311 CFI_REL_OFFSET rbp, RBP+\offset
312 CFI_REL_OFFSET r12, R12+\offset
313 CFI_REL_OFFSET r13, R13+\offset
314 CFI_REL_OFFSET r14, R14+\offset
315 CFI_REL_OFFSET r15, R15+\offset
316 .endm
317
318/* save partial stack frame */
319ENTRY(save_args)
320 XCPT_FRAME
321 cld
322 movq_cfi rdi, RDI+16-ARGOFFSET
323 movq_cfi rsi, RSI+16-ARGOFFSET
324 movq_cfi rdx, RDX+16-ARGOFFSET
325 movq_cfi rcx, RCX+16-ARGOFFSET
326 movq_cfi rax, RAX+16-ARGOFFSET
327 movq_cfi r8, R8+16-ARGOFFSET
328 movq_cfi r9, R9+16-ARGOFFSET
329 movq_cfi r10, R10+16-ARGOFFSET
330 movq_cfi r11, R11+16-ARGOFFSET
331
332 leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
333 movq_cfi rbp, 8 /* push %rbp */
334 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
335 testl $3, CS(%rdi)
336 je 1f
337 SWAPGS
338 /*
339 * irqcount is used to check if a CPU is already on an interrupt stack
340 * or not. While this is essentially redundant with preempt_count it is
341 * a little cheaper to use a separate counter in the PDA (short of
342 * moving irq_enter into assembly, which would be too much work)
343 */
3441: incl %gs:pda_irqcount
345 jne 2f
346 popq_cfi %rax /* move return address... */
347 mov %gs:pda_irqstackptr,%rsp
348 EMPTY_FRAME 0
349 pushq_cfi %rax /* ... to the new stack */
350 /*
351 * We entered an interrupt context - irqs are off:
352 */
3532: TRACE_IRQS_OFF
354 ret
355 CFI_ENDPROC
356END(save_args)
357
358ENTRY(save_rest)
359 PARTIAL_FRAME 1 REST_SKIP+8
360 movq 5*8+16(%rsp), %r11 /* save return address */
361 movq_cfi rbx, RBX+16
362 movq_cfi rbp, RBP+16
363 movq_cfi r12, R12+16
364 movq_cfi r13, R13+16
365 movq_cfi r14, R14+16
366 movq_cfi r15, R15+16
367 movq %r11, 8(%rsp) /* return address */
368 FIXUP_TOP_OF_STACK %r11, 16
369 ret
370 CFI_ENDPROC
371END(save_rest)
372
373/* save complete stack frame */
374ENTRY(save_paranoid)
375 XCPT_FRAME 1 RDI+8
376 cld
377 movq_cfi rdi, RDI+8
378 movq_cfi rsi, RSI+8
379 movq_cfi rdx, RDX+8
380 movq_cfi rcx, RCX+8
381 movq_cfi rax, RAX+8
382 movq_cfi r8, R8+8
383 movq_cfi r9, R9+8
384 movq_cfi r10, R10+8
385 movq_cfi r11, R11+8
386 movq_cfi rbx, RBX+8
387 movq_cfi rbp, RBP+8
388 movq_cfi r12, R12+8
389 movq_cfi r13, R13+8
390 movq_cfi r14, R14+8
391 movq_cfi r15, R15+8
392 movl $1,%ebx
393 movl $MSR_GS_BASE,%ecx
394 rdmsr
395 testl %edx,%edx
396 js 1f /* negative -> in kernel */
397 SWAPGS
398 xorl %ebx,%ebx
3991: ret
400 CFI_ENDPROC
401END(save_paranoid)
402
403/*
404 * A newly forked process directly context switches into this address.
405 *
406 * rdi: prev task we switched from
407 */
249ENTRY(ret_from_fork) 408ENTRY(ret_from_fork)
250 CFI_DEFAULT_STACK 409 DEFAULT_FRAME
410
251 push kernel_eflags(%rip) 411 push kernel_eflags(%rip)
252 CFI_ADJUST_CFA_OFFSET 8 412 CFI_ADJUST_CFA_OFFSET 8
253 popf # reset kernel eflags 413 popf # reset kernel eflags
254 CFI_ADJUST_CFA_OFFSET -8 414 CFI_ADJUST_CFA_OFFSET -8
255 call schedule_tail 415
416 call schedule_tail # rdi: 'prev' task parameter
417
256 GET_THREAD_INFO(%rcx) 418 GET_THREAD_INFO(%rcx)
257 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 419
258 jnz rff_trace 420 CFI_REMEMBER_STATE
259rff_action:
260 RESTORE_REST 421 RESTORE_REST
261 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? 422
423 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
262 je int_ret_from_sys_call 424 je int_ret_from_sys_call
263 testl $_TIF_IA32,TI_flags(%rcx) 425
426 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
264 jnz int_ret_from_sys_call 427 jnz int_ret_from_sys_call
265 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET 428
266 jmp ret_from_sys_call 429 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
267rff_trace: 430 jmp ret_from_sys_call # go to the SYSRET fastpath
268 movq %rsp,%rdi 431
269 call syscall_trace_leave 432 CFI_RESTORE_STATE
270 GET_THREAD_INFO(%rcx)
271 jmp rff_action
272 CFI_ENDPROC 433 CFI_ENDPROC
273END(ret_from_fork) 434END(ret_from_fork)
274 435
@@ -278,20 +439,20 @@ END(ret_from_fork)
278 * SYSCALL does not save anything on the stack and does not change the 439 * SYSCALL does not save anything on the stack and does not change the
279 * stack pointer. 440 * stack pointer.
280 */ 441 */
281 442
282/* 443/*
283 * Register setup: 444 * Register setup:
284 * rax system call number 445 * rax system call number
285 * rdi arg0 446 * rdi arg0
286 * rcx return address for syscall/sysret, C arg3 447 * rcx return address for syscall/sysret, C arg3
287 * rsi arg1 448 * rsi arg1
288 * rdx arg2 449 * rdx arg2
289 * r10 arg3 (--> moved to rcx for C) 450 * r10 arg3 (--> moved to rcx for C)
290 * r8 arg4 451 * r8 arg4
291 * r9 arg5 452 * r9 arg5
292 * r11 eflags for syscall/sysret, temporary for C 453 * r11 eflags for syscall/sysret, temporary for C
293 * r12-r15,rbp,rbx saved by C code, not touched. 454 * r12-r15,rbp,rbx saved by C code, not touched.
294 * 455 *
295 * Interrupts are off on entry. 456 * Interrupts are off on entry.
296 * Only called from user space. 457 * Only called from user space.
297 * 458 *
@@ -301,7 +462,7 @@ END(ret_from_fork)
301 * When user can change the frames always force IRET. That is because 462 * When user can change the frames always force IRET. That is because
302 * it deals with non-canonical addresses better. SYSRET has trouble 463 * it deals with non-canonical addresses better. SYSRET has trouble
303 * with them due to bugs in both AMD and Intel CPUs. 464 * with them due to bugs in both AMD and Intel CPUs.
304 */ 465 */
305 466
306ENTRY(system_call) 467ENTRY(system_call)
307 CFI_STARTPROC simple 468 CFI_STARTPROC simple
@@ -317,7 +478,7 @@ ENTRY(system_call)
317 */ 478 */
318ENTRY(system_call_after_swapgs) 479ENTRY(system_call_after_swapgs)
319 480
320 movq %rsp,%gs:pda_oldrsp 481 movq %rsp,%gs:pda_oldrsp
321 movq %gs:pda_kernelstack,%rsp 482 movq %gs:pda_kernelstack,%rsp
322 /* 483 /*
323 * No need to follow this irqs off/on section - it's straight 484 * No need to follow this irqs off/on section - it's straight
@@ -325,7 +486,7 @@ ENTRY(system_call_after_swapgs)
325 */ 486 */
326 ENABLE_INTERRUPTS(CLBR_NONE) 487 ENABLE_INTERRUPTS(CLBR_NONE)
327 SAVE_ARGS 8,1 488 SAVE_ARGS 8,1
328 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 489 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
329 movq %rcx,RIP-ARGOFFSET(%rsp) 490 movq %rcx,RIP-ARGOFFSET(%rsp)
330 CFI_REL_OFFSET rip,RIP-ARGOFFSET 491 CFI_REL_OFFSET rip,RIP-ARGOFFSET
331 GET_THREAD_INFO(%rcx) 492 GET_THREAD_INFO(%rcx)
@@ -339,19 +500,19 @@ system_call_fastpath:
339 movq %rax,RAX-ARGOFFSET(%rsp) 500 movq %rax,RAX-ARGOFFSET(%rsp)
340/* 501/*
341 * Syscall return path ending with SYSRET (fast path) 502 * Syscall return path ending with SYSRET (fast path)
342 * Has incomplete stack frame and undefined top of stack. 503 * Has incomplete stack frame and undefined top of stack.
343 */ 504 */
344ret_from_sys_call: 505ret_from_sys_call:
345 movl $_TIF_ALLWORK_MASK,%edi 506 movl $_TIF_ALLWORK_MASK,%edi
346 /* edi: flagmask */ 507 /* edi: flagmask */
347sysret_check: 508sysret_check:
348 LOCKDEP_SYS_EXIT 509 LOCKDEP_SYS_EXIT
349 GET_THREAD_INFO(%rcx) 510 GET_THREAD_INFO(%rcx)
350 DISABLE_INTERRUPTS(CLBR_NONE) 511 DISABLE_INTERRUPTS(CLBR_NONE)
351 TRACE_IRQS_OFF 512 TRACE_IRQS_OFF
352 movl TI_flags(%rcx),%edx 513 movl TI_flags(%rcx),%edx
353 andl %edi,%edx 514 andl %edi,%edx
354 jnz sysret_careful 515 jnz sysret_careful
355 CFI_REMEMBER_STATE 516 CFI_REMEMBER_STATE
356 /* 517 /*
357 * sysretq will re-enable interrupts: 518 * sysretq will re-enable interrupts:
@@ -366,7 +527,7 @@ sysret_check:
366 527
367 CFI_RESTORE_STATE 528 CFI_RESTORE_STATE
368 /* Handle reschedules */ 529 /* Handle reschedules */
369 /* edx: work, edi: workmask */ 530 /* edx: work, edi: workmask */
370sysret_careful: 531sysret_careful:
371 bt $TIF_NEED_RESCHED,%edx 532 bt $TIF_NEED_RESCHED,%edx
372 jnc sysret_signal 533 jnc sysret_signal
@@ -379,7 +540,7 @@ sysret_careful:
379 CFI_ADJUST_CFA_OFFSET -8 540 CFI_ADJUST_CFA_OFFSET -8
380 jmp sysret_check 541 jmp sysret_check
381 542
382 /* Handle a signal */ 543 /* Handle a signal */
383sysret_signal: 544sysret_signal:
384 TRACE_IRQS_ON 545 TRACE_IRQS_ON
385 ENABLE_INTERRUPTS(CLBR_NONE) 546 ENABLE_INTERRUPTS(CLBR_NONE)
@@ -388,17 +549,20 @@ sysret_signal:
388 jc sysret_audit 549 jc sysret_audit
389#endif 550#endif
390 /* edx: work flags (arg3) */ 551 /* edx: work flags (arg3) */
391 leaq do_notify_resume(%rip),%rax
392 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 552 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
393 xorl %esi,%esi # oldset -> arg2 553 xorl %esi,%esi # oldset -> arg2
394 call ptregscall_common 554 SAVE_REST
555 FIXUP_TOP_OF_STACK %r11
556 call do_notify_resume
557 RESTORE_TOP_OF_STACK %r11
558 RESTORE_REST
395 movl $_TIF_WORK_MASK,%edi 559 movl $_TIF_WORK_MASK,%edi
396 /* Use IRET because user could have changed frame. This 560 /* Use IRET because user could have changed frame. This
397 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 561 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
398 DISABLE_INTERRUPTS(CLBR_NONE) 562 DISABLE_INTERRUPTS(CLBR_NONE)
399 TRACE_IRQS_OFF 563 TRACE_IRQS_OFF
400 jmp int_with_check 564 jmp int_with_check
401 565
402badsys: 566badsys:
403 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 567 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
404 jmp ret_from_sys_call 568 jmp ret_from_sys_call
@@ -437,7 +601,7 @@ sysret_audit:
437#endif /* CONFIG_AUDITSYSCALL */ 601#endif /* CONFIG_AUDITSYSCALL */
438 602
439 /* Do syscall tracing */ 603 /* Do syscall tracing */
440tracesys: 604tracesys:
441#ifdef CONFIG_AUDITSYSCALL 605#ifdef CONFIG_AUDITSYSCALL
442 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 606 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
443 jz auditsys 607 jz auditsys
@@ -460,8 +624,8 @@ tracesys:
460 call *sys_call_table(,%rax,8) 624 call *sys_call_table(,%rax,8)
461 movq %rax,RAX-ARGOFFSET(%rsp) 625 movq %rax,RAX-ARGOFFSET(%rsp)
462 /* Use IRET because user could have changed frame */ 626 /* Use IRET because user could have changed frame */
463 627
464/* 628/*
465 * Syscall return path ending with IRET. 629 * Syscall return path ending with IRET.
466 * Has correct top of stack, but partial stack frame. 630 * Has correct top of stack, but partial stack frame.
467 */ 631 */
@@ -505,18 +669,18 @@ int_very_careful:
505 TRACE_IRQS_ON 669 TRACE_IRQS_ON
506 ENABLE_INTERRUPTS(CLBR_NONE) 670 ENABLE_INTERRUPTS(CLBR_NONE)
507 SAVE_REST 671 SAVE_REST
508 /* Check for syscall exit trace */ 672 /* Check for syscall exit trace */
509 testl $_TIF_WORK_SYSCALL_EXIT,%edx 673 testl $_TIF_WORK_SYSCALL_EXIT,%edx
510 jz int_signal 674 jz int_signal
511 pushq %rdi 675 pushq %rdi
512 CFI_ADJUST_CFA_OFFSET 8 676 CFI_ADJUST_CFA_OFFSET 8
513 leaq 8(%rsp),%rdi # &ptregs -> arg1 677 leaq 8(%rsp),%rdi # &ptregs -> arg1
514 call syscall_trace_leave 678 call syscall_trace_leave
515 popq %rdi 679 popq %rdi
516 CFI_ADJUST_CFA_OFFSET -8 680 CFI_ADJUST_CFA_OFFSET -8
517 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi 681 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
518 jmp int_restore_rest 682 jmp int_restore_rest
519 683
520int_signal: 684int_signal:
521 testl $_TIF_DO_NOTIFY_MASK,%edx 685 testl $_TIF_DO_NOTIFY_MASK,%edx
522 jz 1f 686 jz 1f
@@ -531,22 +695,24 @@ int_restore_rest:
531 jmp int_with_check 695 jmp int_with_check
532 CFI_ENDPROC 696 CFI_ENDPROC
533END(system_call) 697END(system_call)
534 698
535/* 699/*
536 * Certain special system calls that need to save a complete full stack frame. 700 * Certain special system calls that need to save a complete full stack frame.
537 */ 701 */
538
539 .macro PTREGSCALL label,func,arg 702 .macro PTREGSCALL label,func,arg
540 .globl \label 703ENTRY(\label)
541\label: 704 PARTIAL_FRAME 1 8 /* offset 8: return address */
542 leaq \func(%rip),%rax 705 subq $REST_SKIP, %rsp
543 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ 706 CFI_ADJUST_CFA_OFFSET REST_SKIP
544 jmp ptregscall_common 707 call save_rest
708 DEFAULT_FRAME 0 8 /* offset 8: return address */
709 leaq 8(%rsp), \arg /* pt_regs pointer */
710 call \func
711 jmp ptregscall_common
712 CFI_ENDPROC
545END(\label) 713END(\label)
546 .endm 714 .endm
547 715
548 CFI_STARTPROC
549
550 PTREGSCALL stub_clone, sys_clone, %r8 716 PTREGSCALL stub_clone, sys_clone, %r8
551 PTREGSCALL stub_fork, sys_fork, %rdi 717 PTREGSCALL stub_fork, sys_fork, %rdi
552 PTREGSCALL stub_vfork, sys_vfork, %rdi 718 PTREGSCALL stub_vfork, sys_vfork, %rdi
@@ -554,25 +720,18 @@ END(\label)
554 PTREGSCALL stub_iopl, sys_iopl, %rsi 720 PTREGSCALL stub_iopl, sys_iopl, %rsi
555 721
556ENTRY(ptregscall_common) 722ENTRY(ptregscall_common)
557 popq %r11 723 DEFAULT_FRAME 1 8 /* offset 8: return address */
558 CFI_ADJUST_CFA_OFFSET -8 724 RESTORE_TOP_OF_STACK %r11, 8
559 CFI_REGISTER rip, r11 725 movq_cfi_restore R15+8, r15
560 SAVE_REST 726 movq_cfi_restore R14+8, r14
561 movq %r11, %r15 727 movq_cfi_restore R13+8, r13
562 CFI_REGISTER rip, r15 728 movq_cfi_restore R12+8, r12
563 FIXUP_TOP_OF_STACK %r11 729 movq_cfi_restore RBP+8, rbp
564 call *%rax 730 movq_cfi_restore RBX+8, rbx
565 RESTORE_TOP_OF_STACK %r11 731 ret $REST_SKIP /* pop extended registers */
566 movq %r15, %r11
567 CFI_REGISTER rip, r11
568 RESTORE_REST
569 pushq %r11
570 CFI_ADJUST_CFA_OFFSET 8
571 CFI_REL_OFFSET rip, 0
572 ret
573 CFI_ENDPROC 732 CFI_ENDPROC
574END(ptregscall_common) 733END(ptregscall_common)
575 734
576ENTRY(stub_execve) 735ENTRY(stub_execve)
577 CFI_STARTPROC 736 CFI_STARTPROC
578 popq %r11 737 popq %r11
@@ -588,11 +747,11 @@ ENTRY(stub_execve)
588 jmp int_ret_from_sys_call 747 jmp int_ret_from_sys_call
589 CFI_ENDPROC 748 CFI_ENDPROC
590END(stub_execve) 749END(stub_execve)
591 750
592/* 751/*
593 * sigreturn is special because it needs to restore all registers on return. 752 * sigreturn is special because it needs to restore all registers on return.
594 * This cannot be done with SYSRET, so use the IRET return path instead. 753 * This cannot be done with SYSRET, so use the IRET return path instead.
595 */ 754 */
596ENTRY(stub_rt_sigreturn) 755ENTRY(stub_rt_sigreturn)
597 CFI_STARTPROC 756 CFI_STARTPROC
598 addq $8, %rsp 757 addq $8, %rsp
@@ -608,70 +767,70 @@ ENTRY(stub_rt_sigreturn)
608END(stub_rt_sigreturn) 767END(stub_rt_sigreturn)
609 768
610/* 769/*
611 * initial frame state for interrupts and exceptions 770 * Build the entry stubs and pointer table with some assembler magic.
771 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
772 * single cache line on all modern x86 implementations.
612 */ 773 */
613 .macro _frame ref 774 .section .init.rodata,"a"
614 CFI_STARTPROC simple 775ENTRY(interrupt)
615 CFI_SIGNAL_FRAME 776 .text
616 CFI_DEF_CFA rsp,SS+8-\ref 777 .p2align 5
617 /*CFI_REL_OFFSET ss,SS-\ref*/ 778 .p2align CONFIG_X86_L1_CACHE_SHIFT
618 CFI_REL_OFFSET rsp,RSP-\ref 779ENTRY(irq_entries_start)
619 /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ 780 INTR_FRAME
620 /*CFI_REL_OFFSET cs,CS-\ref*/ 781vector=FIRST_EXTERNAL_VECTOR
621 CFI_REL_OFFSET rip,RIP-\ref 782.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
622 .endm 783 .balign 32
784 .rept 7
785 .if vector < NR_VECTORS
786 .if vector <> FIRST_EXTERNAL_VECTOR
787 CFI_ADJUST_CFA_OFFSET -8
788 .endif
7891: pushq $(~vector+0x80) /* Note: always in signed byte range */
790 CFI_ADJUST_CFA_OFFSET 8
791 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
792 jmp 2f
793 .endif
794 .previous
795 .quad 1b
796 .text
797vector=vector+1
798 .endif
799 .endr
8002: jmp common_interrupt
801.endr
802 CFI_ENDPROC
803END(irq_entries_start)
623 804
624/* initial frame state for interrupts (and exceptions without error code) */ 805.previous
625#define INTR_FRAME _frame RIP 806END(interrupt)
626/* initial frame state for exceptions with error code (and interrupts with 807.previous
627 vector already pushed) */
628#define XCPT_FRAME _frame ORIG_RAX
629 808
630/* 809/*
631 * Interrupt entry/exit. 810 * Interrupt entry/exit.
632 * 811 *
633 * Interrupt entry points save only callee clobbered registers in fast path. 812 * Interrupt entry points save only callee clobbered registers in fast path.
634 * 813 *
635 * Entry runs with interrupts off. 814 * Entry runs with interrupts off.
636 */ 815 */
637 816
638/* 0(%rsp): interrupt number */ 817/* 0(%rsp): ~(interrupt number) */
639 .macro interrupt func 818 .macro interrupt func
640 cld 819 subq $10*8, %rsp
641 SAVE_ARGS 820 CFI_ADJUST_CFA_OFFSET 10*8
642 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler 821 call save_args
643 pushq %rbp 822 PARTIAL_FRAME 0
644 /*
645 * Save rbp twice: One is for marking the stack frame, as usual, and the
646 * other, to fill pt_regs properly. This is because bx comes right
647 * before the last saved register in that structure, and not bp. If the
648 * base pointer were in the place bx is today, this would not be needed.
649 */
650 movq %rbp, -8(%rsp)
651 CFI_ADJUST_CFA_OFFSET 8
652 CFI_REL_OFFSET rbp, 0
653 movq %rsp,%rbp
654 CFI_DEF_CFA_REGISTER rbp
655 testl $3,CS(%rdi)
656 je 1f
657 SWAPGS
658 /* irqcount is used to check if a CPU is already on an interrupt
659 stack or not. While this is essentially redundant with preempt_count
660 it is a little cheaper to use a separate counter in the PDA
661 (short of moving irq_enter into assembly, which would be too
662 much work) */
6631: incl %gs:pda_irqcount
664 cmoveq %gs:pda_irqstackptr,%rsp
665 push %rbp # backlink for old unwinder
666 /*
667 * We entered an interrupt context - irqs are off:
668 */
669 TRACE_IRQS_OFF
670 call \func 823 call \func
671 .endm 824 .endm
672 825
673ENTRY(common_interrupt) 826 /*
827 * The interrupt stubs push (~vector+0x80) onto the stack and
828 * then jump to common_interrupt.
829 */
830 .p2align CONFIG_X86_L1_CACHE_SHIFT
831common_interrupt:
674 XCPT_FRAME 832 XCPT_FRAME
833 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
675 interrupt do_IRQ 834 interrupt do_IRQ
676 /* 0(%rsp): oldrsp-ARGOFFSET */ 835 /* 0(%rsp): oldrsp-ARGOFFSET */
677ret_from_intr: 836ret_from_intr:
@@ -685,12 +844,12 @@ exit_intr:
685 GET_THREAD_INFO(%rcx) 844 GET_THREAD_INFO(%rcx)
686 testl $3,CS-ARGOFFSET(%rsp) 845 testl $3,CS-ARGOFFSET(%rsp)
687 je retint_kernel 846 je retint_kernel
688 847
689 /* Interrupt came from user space */ 848 /* Interrupt came from user space */
690 /* 849 /*
691 * Has a correct top of stack, but a partial stack frame 850 * Has a correct top of stack, but a partial stack frame
692 * %rcx: thread info. Interrupts off. 851 * %rcx: thread info. Interrupts off.
693 */ 852 */
694retint_with_reschedule: 853retint_with_reschedule:
695 movl $_TIF_WORK_MASK,%edi 854 movl $_TIF_WORK_MASK,%edi
696retint_check: 855retint_check:
@@ -763,20 +922,20 @@ retint_careful:
763 pushq %rdi 922 pushq %rdi
764 CFI_ADJUST_CFA_OFFSET 8 923 CFI_ADJUST_CFA_OFFSET 8
765 call schedule 924 call schedule
766 popq %rdi 925 popq %rdi
767 CFI_ADJUST_CFA_OFFSET -8 926 CFI_ADJUST_CFA_OFFSET -8
768 GET_THREAD_INFO(%rcx) 927 GET_THREAD_INFO(%rcx)
769 DISABLE_INTERRUPTS(CLBR_NONE) 928 DISABLE_INTERRUPTS(CLBR_NONE)
770 TRACE_IRQS_OFF 929 TRACE_IRQS_OFF
771 jmp retint_check 930 jmp retint_check
772 931
773retint_signal: 932retint_signal:
774 testl $_TIF_DO_NOTIFY_MASK,%edx 933 testl $_TIF_DO_NOTIFY_MASK,%edx
775 jz retint_swapgs 934 jz retint_swapgs
776 TRACE_IRQS_ON 935 TRACE_IRQS_ON
777 ENABLE_INTERRUPTS(CLBR_NONE) 936 ENABLE_INTERRUPTS(CLBR_NONE)
778 SAVE_REST 937 SAVE_REST
779 movq $-1,ORIG_RAX(%rsp) 938 movq $-1,ORIG_RAX(%rsp)
780 xorl %esi,%esi # oldset 939 xorl %esi,%esi # oldset
781 movq %rsp,%rdi # &pt_regs 940 movq %rsp,%rdi # &pt_regs
782 call do_notify_resume 941 call do_notify_resume
@@ -798,324 +957,211 @@ ENTRY(retint_kernel)
798 jnc retint_restore_args 957 jnc retint_restore_args
799 call preempt_schedule_irq 958 call preempt_schedule_irq
800 jmp exit_intr 959 jmp exit_intr
801#endif 960#endif
802 961
803 CFI_ENDPROC 962 CFI_ENDPROC
804END(common_interrupt) 963END(common_interrupt)
805 964
806/* 965/*
807 * APIC interrupts. 966 * APIC interrupts.
808 */ 967 */
809 .macro apicinterrupt num,func 968.macro apicinterrupt num sym do_sym
969ENTRY(\sym)
810 INTR_FRAME 970 INTR_FRAME
811 pushq $~(\num) 971 pushq $~(\num)
812 CFI_ADJUST_CFA_OFFSET 8 972 CFI_ADJUST_CFA_OFFSET 8
813 interrupt \func 973 interrupt \do_sym
814 jmp ret_from_intr 974 jmp ret_from_intr
815 CFI_ENDPROC 975 CFI_ENDPROC
816 .endm 976END(\sym)
817 977.endm
818ENTRY(thermal_interrupt)
819 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
820END(thermal_interrupt)
821
822ENTRY(threshold_interrupt)
823 apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
824END(threshold_interrupt)
825
826#ifdef CONFIG_SMP
827ENTRY(reschedule_interrupt)
828 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
829END(reschedule_interrupt)
830
831 .macro INVALIDATE_ENTRY num
832ENTRY(invalidate_interrupt\num)
833 apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
834END(invalidate_interrupt\num)
835 .endm
836 978
837 INVALIDATE_ENTRY 0 979#ifdef CONFIG_SMP
838 INVALIDATE_ENTRY 1 980apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
839 INVALIDATE_ENTRY 2 981 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
840 INVALIDATE_ENTRY 3
841 INVALIDATE_ENTRY 4
842 INVALIDATE_ENTRY 5
843 INVALIDATE_ENTRY 6
844 INVALIDATE_ENTRY 7
845
846ENTRY(call_function_interrupt)
847 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
848END(call_function_interrupt)
849ENTRY(call_function_single_interrupt)
850 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
851END(call_function_single_interrupt)
852ENTRY(irq_move_cleanup_interrupt)
853 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
854END(irq_move_cleanup_interrupt)
855#endif 982#endif
856 983
857ENTRY(apic_timer_interrupt) 984apicinterrupt UV_BAU_MESSAGE \
858 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt 985 uv_bau_message_intr1 uv_bau_message_interrupt
859END(apic_timer_interrupt) 986apicinterrupt LOCAL_TIMER_VECTOR \
987 apic_timer_interrupt smp_apic_timer_interrupt
988
989#ifdef CONFIG_SMP
990apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
991 invalidate_interrupt0 smp_invalidate_interrupt
992apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
993 invalidate_interrupt1 smp_invalidate_interrupt
994apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
995 invalidate_interrupt2 smp_invalidate_interrupt
996apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
997 invalidate_interrupt3 smp_invalidate_interrupt
998apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
999 invalidate_interrupt4 smp_invalidate_interrupt
1000apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
1001 invalidate_interrupt5 smp_invalidate_interrupt
1002apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
1003 invalidate_interrupt6 smp_invalidate_interrupt
1004apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
1005 invalidate_interrupt7 smp_invalidate_interrupt
1006#endif
860 1007
861ENTRY(uv_bau_message_intr1) 1008apicinterrupt THRESHOLD_APIC_VECTOR \
862 apicinterrupt 220,uv_bau_message_interrupt 1009 threshold_interrupt mce_threshold_interrupt
863END(uv_bau_message_intr1) 1010apicinterrupt THERMAL_APIC_VECTOR \
1011 thermal_interrupt smp_thermal_interrupt
1012
1013#ifdef CONFIG_SMP
1014apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1015 call_function_single_interrupt smp_call_function_single_interrupt
1016apicinterrupt CALL_FUNCTION_VECTOR \
1017 call_function_interrupt smp_call_function_interrupt
1018apicinterrupt RESCHEDULE_VECTOR \
1019 reschedule_interrupt smp_reschedule_interrupt
1020#endif
864 1021
865ENTRY(error_interrupt) 1022apicinterrupt ERROR_APIC_VECTOR \
866 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt 1023 error_interrupt smp_error_interrupt
867END(error_interrupt) 1024apicinterrupt SPURIOUS_APIC_VECTOR \
1025 spurious_interrupt smp_spurious_interrupt
868 1026
869ENTRY(spurious_interrupt)
870 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
871END(spurious_interrupt)
872
873/* 1027/*
874 * Exception entry points. 1028 * Exception entry points.
875 */ 1029 */
876 .macro zeroentry sym 1030.macro zeroentry sym do_sym
1031ENTRY(\sym)
877 INTR_FRAME 1032 INTR_FRAME
878 PARAVIRT_ADJUST_EXCEPTION_FRAME 1033 PARAVIRT_ADJUST_EXCEPTION_FRAME
879 pushq $0 /* push error code/oldrax */ 1034 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
880 CFI_ADJUST_CFA_OFFSET 8 1035 subq $15*8,%rsp
881 pushq %rax /* push real oldrax to the rdi slot */ 1036 CFI_ADJUST_CFA_OFFSET 15*8
882 CFI_ADJUST_CFA_OFFSET 8 1037 call error_entry
883 CFI_REL_OFFSET rax,0 1038 DEFAULT_FRAME 0
884 leaq \sym(%rip),%rax 1039 movq %rsp,%rdi /* pt_regs pointer */
885 jmp error_entry 1040 xorl %esi,%esi /* no error code */
1041 call \do_sym
1042 jmp error_exit /* %ebx: no swapgs flag */
886 CFI_ENDPROC 1043 CFI_ENDPROC
887 .endm 1044END(\sym)
1045.endm
888 1046
889 .macro errorentry sym 1047.macro paranoidzeroentry sym do_sym
890 XCPT_FRAME 1048ENTRY(\sym)
1049 INTR_FRAME
891 PARAVIRT_ADJUST_EXCEPTION_FRAME 1050 PARAVIRT_ADJUST_EXCEPTION_FRAME
892 pushq %rax 1051 pushq $-1 /* ORIG_RAX: no syscall to restart */
893 CFI_ADJUST_CFA_OFFSET 8 1052 CFI_ADJUST_CFA_OFFSET 8
894 CFI_REL_OFFSET rax,0 1053 subq $15*8, %rsp
895 leaq \sym(%rip),%rax 1054 call save_paranoid
896 jmp error_entry 1055 TRACE_IRQS_OFF
1056 movq %rsp,%rdi /* pt_regs pointer */
1057 xorl %esi,%esi /* no error code */
1058 call \do_sym
1059 jmp paranoid_exit /* %ebx: no swapgs flag */
897 CFI_ENDPROC 1060 CFI_ENDPROC
898 .endm 1061END(\sym)
1062.endm
899 1063
900 /* error code is on the stack already */ 1064.macro paranoidzeroentry_ist sym do_sym ist
901 /* handle NMI like exceptions that can happen everywhere */ 1065ENTRY(\sym)
902 .macro paranoidentry sym, ist=0, irqtrace=1 1066 INTR_FRAME
903 SAVE_ALL 1067 PARAVIRT_ADJUST_EXCEPTION_FRAME
904 cld 1068 pushq $-1 /* ORIG_RAX: no syscall to restart */
905 movl $1,%ebx 1069 CFI_ADJUST_CFA_OFFSET 8
906 movl $MSR_GS_BASE,%ecx 1070 subq $15*8, %rsp
907 rdmsr 1071 call save_paranoid
908 testl %edx,%edx
909 js 1f
910 SWAPGS
911 xorl %ebx,%ebx
9121:
913 .if \ist
914 movq %gs:pda_data_offset, %rbp
915 .endif
916 .if \irqtrace
917 TRACE_IRQS_OFF
918 .endif
919 movq %rsp,%rdi
920 movq ORIG_RAX(%rsp),%rsi
921 movq $-1,ORIG_RAX(%rsp)
922 .if \ist
923 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
924 .endif
925 call \sym
926 .if \ist
927 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
928 .endif
929 DISABLE_INTERRUPTS(CLBR_NONE)
930 .if \irqtrace
931 TRACE_IRQS_OFF 1072 TRACE_IRQS_OFF
932 .endif 1073 movq %rsp,%rdi /* pt_regs pointer */
933 .endm 1074 xorl %esi,%esi /* no error code */
1075 movq %gs:pda_data_offset, %rbp
1076 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1077 call \do_sym
1078 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1079 jmp paranoid_exit /* %ebx: no swapgs flag */
1080 CFI_ENDPROC
1081END(\sym)
1082.endm
934 1083
935 /* 1084.macro errorentry sym do_sym
936 * "Paranoid" exit path from exception stack. 1085ENTRY(\sym)
937 * Paranoid because this is used by NMIs and cannot take 1086 XCPT_FRAME
938 * any kernel state for granted. 1087 PARAVIRT_ADJUST_EXCEPTION_FRAME
939 * We don't do kernel preemption checks here, because only 1088 subq $15*8,%rsp
940 * NMI should be common and it does not enable IRQs and 1089 CFI_ADJUST_CFA_OFFSET 15*8
941 * cannot get reschedule ticks. 1090 call error_entry
942 * 1091 DEFAULT_FRAME 0
943 * "trace" is 0 for the NMI handler only, because irq-tracing 1092 movq %rsp,%rdi /* pt_regs pointer */
944 * is fundamentally NMI-unsafe. (we cannot change the soft and 1093 movq ORIG_RAX(%rsp),%rsi /* get error code */
945 * hard flags at once, atomically) 1094 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
946 */ 1095 call \do_sym
947 .macro paranoidexit trace=1 1096 jmp error_exit /* %ebx: no swapgs flag */
948 /* ebx: no swapgs flag */
949paranoid_exit\trace:
950 testl %ebx,%ebx /* swapgs needed? */
951 jnz paranoid_restore\trace
952 testl $3,CS(%rsp)
953 jnz paranoid_userspace\trace
954paranoid_swapgs\trace:
955 .if \trace
956 TRACE_IRQS_IRETQ 0
957 .endif
958 SWAPGS_UNSAFE_STACK
959paranoid_restore\trace:
960 RESTORE_ALL 8
961 jmp irq_return
962paranoid_userspace\trace:
963 GET_THREAD_INFO(%rcx)
964 movl TI_flags(%rcx),%ebx
965 andl $_TIF_WORK_MASK,%ebx
966 jz paranoid_swapgs\trace
967 movq %rsp,%rdi /* &pt_regs */
968 call sync_regs
969 movq %rax,%rsp /* switch stack for scheduling */
970 testl $_TIF_NEED_RESCHED,%ebx
971 jnz paranoid_schedule\trace
972 movl %ebx,%edx /* arg3: thread flags */
973 .if \trace
974 TRACE_IRQS_ON
975 .endif
976 ENABLE_INTERRUPTS(CLBR_NONE)
977 xorl %esi,%esi /* arg2: oldset */
978 movq %rsp,%rdi /* arg1: &pt_regs */
979 call do_notify_resume
980 DISABLE_INTERRUPTS(CLBR_NONE)
981 .if \trace
982 TRACE_IRQS_OFF
983 .endif
984 jmp paranoid_userspace\trace
985paranoid_schedule\trace:
986 .if \trace
987 TRACE_IRQS_ON
988 .endif
989 ENABLE_INTERRUPTS(CLBR_ANY)
990 call schedule
991 DISABLE_INTERRUPTS(CLBR_ANY)
992 .if \trace
993 TRACE_IRQS_OFF
994 .endif
995 jmp paranoid_userspace\trace
996 CFI_ENDPROC 1097 CFI_ENDPROC
997 .endm 1098END(\sym)
1099.endm
998 1100
999/* 1101 /* error code is on the stack already */
1000 * Exception entry point. This expects an error code/orig_rax on the stack 1102.macro paranoiderrorentry sym do_sym
1001 * and the exception handler in %rax. 1103ENTRY(\sym)
1002 */ 1104 XCPT_FRAME
1003KPROBE_ENTRY(error_entry) 1105 PARAVIRT_ADJUST_EXCEPTION_FRAME
1004 _frame RDI 1106 subq $15*8,%rsp
1005 CFI_REL_OFFSET rax,0 1107 CFI_ADJUST_CFA_OFFSET 15*8
1006 /* rdi slot contains rax, oldrax contains error code */ 1108 call save_paranoid
1007 cld 1109 DEFAULT_FRAME 0
1008 subq $14*8,%rsp
1009 CFI_ADJUST_CFA_OFFSET (14*8)
1010 movq %rsi,13*8(%rsp)
1011 CFI_REL_OFFSET rsi,RSI
1012 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
1013 CFI_REGISTER rax,rsi
1014 movq %rdx,12*8(%rsp)
1015 CFI_REL_OFFSET rdx,RDX
1016 movq %rcx,11*8(%rsp)
1017 CFI_REL_OFFSET rcx,RCX
1018 movq %rsi,10*8(%rsp) /* store rax */
1019 CFI_REL_OFFSET rax,RAX
1020 movq %r8, 9*8(%rsp)
1021 CFI_REL_OFFSET r8,R8
1022 movq %r9, 8*8(%rsp)
1023 CFI_REL_OFFSET r9,R9
1024 movq %r10,7*8(%rsp)
1025 CFI_REL_OFFSET r10,R10
1026 movq %r11,6*8(%rsp)
1027 CFI_REL_OFFSET r11,R11
1028 movq %rbx,5*8(%rsp)
1029 CFI_REL_OFFSET rbx,RBX
1030 movq %rbp,4*8(%rsp)
1031 CFI_REL_OFFSET rbp,RBP
1032 movq %r12,3*8(%rsp)
1033 CFI_REL_OFFSET r12,R12
1034 movq %r13,2*8(%rsp)
1035 CFI_REL_OFFSET r13,R13
1036 movq %r14,1*8(%rsp)
1037 CFI_REL_OFFSET r14,R14
1038 movq %r15,(%rsp)
1039 CFI_REL_OFFSET r15,R15
1040 xorl %ebx,%ebx
1041 testl $3,CS(%rsp)
1042 je error_kernelspace
1043error_swapgs:
1044 SWAPGS
1045error_sti:
1046 TRACE_IRQS_OFF
1047 movq %rdi,RDI(%rsp)
1048 CFI_REL_OFFSET rdi,RDI
1049 movq %rsp,%rdi
1050 movq ORIG_RAX(%rsp),%rsi /* get error code */
1051 movq $-1,ORIG_RAX(%rsp)
1052 call *%rax
1053 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
1054error_exit:
1055 movl %ebx,%eax
1056 RESTORE_REST
1057 DISABLE_INTERRUPTS(CLBR_NONE)
1058 TRACE_IRQS_OFF 1110 TRACE_IRQS_OFF
1059 GET_THREAD_INFO(%rcx) 1111 movq %rsp,%rdi /* pt_regs pointer */
1060 testl %eax,%eax 1112 movq ORIG_RAX(%rsp),%rsi /* get error code */
1061 jne retint_kernel 1113 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
1062 LOCKDEP_SYS_EXIT_IRQ 1114 call \do_sym
1063 movl TI_flags(%rcx),%edx 1115 jmp paranoid_exit /* %ebx: no swapgs flag */
1064 movl $_TIF_WORK_MASK,%edi
1065 andl %edi,%edx
1066 jnz retint_careful
1067 jmp retint_swapgs
1068 CFI_ENDPROC 1116 CFI_ENDPROC
1117END(\sym)
1118.endm
1069 1119
1070error_kernelspace: 1120zeroentry divide_error do_divide_error
1071 incl %ebx 1121zeroentry overflow do_overflow
1072 /* There are two places in the kernel that can potentially fault with 1122zeroentry bounds do_bounds
1073 usergs. Handle them here. The exception handlers after 1123zeroentry invalid_op do_invalid_op
1074 iret run with kernel gs again, so don't set the user space flag. 1124zeroentry device_not_available do_device_not_available
1075 B stepping K8s sometimes report an truncated RIP for IRET 1125paranoiderrorentry double_fault do_double_fault
1076 exceptions returning to compat mode. Check for these here too. */ 1126zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
1077 leaq irq_return(%rip),%rcx 1127errorentry invalid_TSS do_invalid_TSS
1078 cmpq %rcx,RIP(%rsp) 1128errorentry segment_not_present do_segment_not_present
1079 je error_swapgs 1129zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
1080 movl %ecx,%ecx /* zero extend */ 1130zeroentry coprocessor_error do_coprocessor_error
1081 cmpq %rcx,RIP(%rsp) 1131errorentry alignment_check do_alignment_check
1082 je error_swapgs 1132zeroentry simd_coprocessor_error do_simd_coprocessor_error
1083 cmpq $gs_change,RIP(%rsp) 1133
1084 je error_swapgs 1134 /* Reload gs selector with exception handling */
1085 jmp error_sti 1135 /* edi: new selector */
1086KPROBE_END(error_entry)
1087
1088 /* Reload gs selector with exception handling */
1089 /* edi: new selector */
1090ENTRY(native_load_gs_index) 1136ENTRY(native_load_gs_index)
1091 CFI_STARTPROC 1137 CFI_STARTPROC
1092 pushf 1138 pushf
1093 CFI_ADJUST_CFA_OFFSET 8 1139 CFI_ADJUST_CFA_OFFSET 8
1094 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) 1140 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
1095 SWAPGS 1141 SWAPGS
1096gs_change: 1142gs_change:
1097 movl %edi,%gs 1143 movl %edi,%gs
10982: mfence /* workaround */ 11442: mfence /* workaround */
1099 SWAPGS 1145 SWAPGS
1100 popf 1146 popf
1101 CFI_ADJUST_CFA_OFFSET -8 1147 CFI_ADJUST_CFA_OFFSET -8
1102 ret 1148 ret
1103 CFI_ENDPROC 1149 CFI_ENDPROC
1104ENDPROC(native_load_gs_index) 1150END(native_load_gs_index)
1105 1151
1106 .section __ex_table,"a" 1152 .section __ex_table,"a"
1107 .align 8 1153 .align 8
1108 .quad gs_change,bad_gs 1154 .quad gs_change,bad_gs
1109 .previous 1155 .previous
1110 .section .fixup,"ax" 1156 .section .fixup,"ax"
1111 /* running with kernelgs */ 1157 /* running with kernelgs */
1112bad_gs: 1158bad_gs:
1113 SWAPGS /* switch back to user gs */ 1159 SWAPGS /* switch back to user gs */
1114 xorl %eax,%eax 1160 xorl %eax,%eax
1115 movl %eax,%gs 1161 movl %eax,%gs
1116 jmp 2b 1162 jmp 2b
1117 .previous 1163 .previous
1118 1164
1119/* 1165/*
1120 * Create a kernel thread. 1166 * Create a kernel thread.
1121 * 1167 *
@@ -1138,7 +1184,7 @@ ENTRY(kernel_thread)
1138 1184
1139 xorl %r8d,%r8d 1185 xorl %r8d,%r8d
1140 xorl %r9d,%r9d 1186 xorl %r9d,%r9d
1141 1187
1142 # clone now 1188 # clone now
1143 call do_fork 1189 call do_fork
1144 movq %rax,RAX(%rsp) 1190 movq %rax,RAX(%rsp)
@@ -1149,15 +1195,15 @@ ENTRY(kernel_thread)
1149 * so internally to the x86_64 port you can rely on kernel_thread() 1195 * so internally to the x86_64 port you can rely on kernel_thread()
1150 * not to reschedule the child before returning, this avoids the need 1196 * not to reschedule the child before returning, this avoids the need
1151 * of hacks for example to fork off the per-CPU idle tasks. 1197 * of hacks for example to fork off the per-CPU idle tasks.
1152 * [Hopefully no generic code relies on the reschedule -AK] 1198 * [Hopefully no generic code relies on the reschedule -AK]
1153 */ 1199 */
1154 RESTORE_ALL 1200 RESTORE_ALL
1155 UNFAKE_STACK_FRAME 1201 UNFAKE_STACK_FRAME
1156 ret 1202 ret
1157 CFI_ENDPROC 1203 CFI_ENDPROC
1158ENDPROC(kernel_thread) 1204END(kernel_thread)
1159 1205
1160child_rip: 1206ENTRY(child_rip)
1161 pushq $0 # fake return address 1207 pushq $0 # fake return address
1162 CFI_STARTPROC 1208 CFI_STARTPROC
1163 /* 1209 /*
@@ -1170,8 +1216,9 @@ child_rip:
1170 # exit 1216 # exit
1171 mov %eax, %edi 1217 mov %eax, %edi
1172 call do_exit 1218 call do_exit
1219 ud2 # padding for call trace
1173 CFI_ENDPROC 1220 CFI_ENDPROC
1174ENDPROC(child_rip) 1221END(child_rip)
1175 1222
1176/* 1223/*
1177 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 1224 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1191,10 +1238,10 @@ ENDPROC(child_rip)
1191ENTRY(kernel_execve) 1238ENTRY(kernel_execve)
1192 CFI_STARTPROC 1239 CFI_STARTPROC
1193 FAKE_STACK_FRAME $0 1240 FAKE_STACK_FRAME $0
1194 SAVE_ALL 1241 SAVE_ALL
1195 movq %rsp,%rcx 1242 movq %rsp,%rcx
1196 call sys_execve 1243 call sys_execve
1197 movq %rax, RAX(%rsp) 1244 movq %rax, RAX(%rsp)
1198 RESTORE_REST 1245 RESTORE_REST
1199 testq %rax,%rax 1246 testq %rax,%rax
1200 je int_ret_from_sys_call 1247 je int_ret_from_sys_call
@@ -1202,129 +1249,7 @@ ENTRY(kernel_execve)
1202 UNFAKE_STACK_FRAME 1249 UNFAKE_STACK_FRAME
1203 ret 1250 ret
1204 CFI_ENDPROC 1251 CFI_ENDPROC
1205ENDPROC(kernel_execve) 1252END(kernel_execve)
1206
1207KPROBE_ENTRY(page_fault)
1208 errorentry do_page_fault
1209KPROBE_END(page_fault)
1210
1211ENTRY(coprocessor_error)
1212 zeroentry do_coprocessor_error
1213END(coprocessor_error)
1214
1215ENTRY(simd_coprocessor_error)
1216 zeroentry do_simd_coprocessor_error
1217END(simd_coprocessor_error)
1218
1219ENTRY(device_not_available)
1220 zeroentry do_device_not_available
1221END(device_not_available)
1222
1223 /* runs on exception stack */
1224KPROBE_ENTRY(debug)
1225 INTR_FRAME
1226 PARAVIRT_ADJUST_EXCEPTION_FRAME
1227 pushq $0
1228 CFI_ADJUST_CFA_OFFSET 8
1229 paranoidentry do_debug, DEBUG_STACK
1230 paranoidexit
1231KPROBE_END(debug)
1232
1233 /* runs on exception stack */
1234KPROBE_ENTRY(nmi)
1235 INTR_FRAME
1236 PARAVIRT_ADJUST_EXCEPTION_FRAME
1237 pushq $-1
1238 CFI_ADJUST_CFA_OFFSET 8
1239 paranoidentry do_nmi, 0, 0
1240#ifdef CONFIG_TRACE_IRQFLAGS
1241 paranoidexit 0
1242#else
1243 jmp paranoid_exit1
1244 CFI_ENDPROC
1245#endif
1246KPROBE_END(nmi)
1247
1248KPROBE_ENTRY(int3)
1249 INTR_FRAME
1250 PARAVIRT_ADJUST_EXCEPTION_FRAME
1251 pushq $0
1252 CFI_ADJUST_CFA_OFFSET 8
1253 paranoidentry do_int3, DEBUG_STACK
1254 jmp paranoid_exit1
1255 CFI_ENDPROC
1256KPROBE_END(int3)
1257
1258ENTRY(overflow)
1259 zeroentry do_overflow
1260END(overflow)
1261
1262ENTRY(bounds)
1263 zeroentry do_bounds
1264END(bounds)
1265
1266ENTRY(invalid_op)
1267 zeroentry do_invalid_op
1268END(invalid_op)
1269
1270ENTRY(coprocessor_segment_overrun)
1271 zeroentry do_coprocessor_segment_overrun
1272END(coprocessor_segment_overrun)
1273
1274 /* runs on exception stack */
1275ENTRY(double_fault)
1276 XCPT_FRAME
1277 PARAVIRT_ADJUST_EXCEPTION_FRAME
1278 paranoidentry do_double_fault
1279 jmp paranoid_exit1
1280 CFI_ENDPROC
1281END(double_fault)
1282
1283ENTRY(invalid_TSS)
1284 errorentry do_invalid_TSS
1285END(invalid_TSS)
1286
1287ENTRY(segment_not_present)
1288 errorentry do_segment_not_present
1289END(segment_not_present)
1290
1291 /* runs on exception stack */
1292ENTRY(stack_segment)
1293 XCPT_FRAME
1294 PARAVIRT_ADJUST_EXCEPTION_FRAME
1295 paranoidentry do_stack_segment
1296 jmp paranoid_exit1
1297 CFI_ENDPROC
1298END(stack_segment)
1299
1300KPROBE_ENTRY(general_protection)
1301 errorentry do_general_protection
1302KPROBE_END(general_protection)
1303
1304ENTRY(alignment_check)
1305 errorentry do_alignment_check
1306END(alignment_check)
1307
1308ENTRY(divide_error)
1309 zeroentry do_divide_error
1310END(divide_error)
1311
1312ENTRY(spurious_interrupt_bug)
1313 zeroentry do_spurious_interrupt_bug
1314END(spurious_interrupt_bug)
1315
1316#ifdef CONFIG_X86_MCE
1317 /* runs on exception stack */
1318ENTRY(machine_check)
1319 INTR_FRAME
1320 PARAVIRT_ADJUST_EXCEPTION_FRAME
1321 pushq $0
1322 CFI_ADJUST_CFA_OFFSET 8
1323 paranoidentry do_machine_check
1324 jmp paranoid_exit1
1325 CFI_ENDPROC
1326END(machine_check)
1327#endif
1328 1253
1329/* Call softirq on interrupt stack. Interrupts are off. */ 1254/* Call softirq on interrupt stack. Interrupts are off. */
1330ENTRY(call_softirq) 1255ENTRY(call_softirq)
@@ -1344,40 +1269,33 @@ ENTRY(call_softirq)
1344 decl %gs:pda_irqcount 1269 decl %gs:pda_irqcount
1345 ret 1270 ret
1346 CFI_ENDPROC 1271 CFI_ENDPROC
1347ENDPROC(call_softirq) 1272END(call_softirq)
1348
1349KPROBE_ENTRY(ignore_sysret)
1350 CFI_STARTPROC
1351 mov $-ENOSYS,%eax
1352 sysret
1353 CFI_ENDPROC
1354ENDPROC(ignore_sysret)
1355 1273
1356#ifdef CONFIG_XEN 1274#ifdef CONFIG_XEN
1357ENTRY(xen_hypervisor_callback) 1275zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
1358 zeroentry xen_do_hypervisor_callback
1359END(xen_hypervisor_callback)
1360 1276
1361/* 1277/*
1362# A note on the "critical region" in our callback handler. 1278 * A note on the "critical region" in our callback handler.
1363# We want to avoid stacking callback handlers due to events occurring 1279 * We want to avoid stacking callback handlers due to events occurring
1364# during handling of the last event. To do this, we keep events disabled 1280 * during handling of the last event. To do this, we keep events disabled
1365# until we've done all processing. HOWEVER, we must enable events before 1281 * until we've done all processing. HOWEVER, we must enable events before
1366# popping the stack frame (can't be done atomically) and so it would still 1282 * popping the stack frame (can't be done atomically) and so it would still
1367# be possible to get enough handler activations to overflow the stack. 1283 * be possible to get enough handler activations to overflow the stack.
1368# Although unlikely, bugs of that kind are hard to track down, so we'd 1284 * Although unlikely, bugs of that kind are hard to track down, so we'd
1369# like to avoid the possibility. 1285 * like to avoid the possibility.
1370# So, on entry to the handler we detect whether we interrupted an 1286 * So, on entry to the handler we detect whether we interrupted an
1371# existing activation in its critical region -- if so, we pop the current 1287 * existing activation in its critical region -- if so, we pop the current
1372# activation and restart the handler using the previous one. 1288 * activation and restart the handler using the previous one.
1373*/ 1289 */
1374ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) 1290ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1375 CFI_STARTPROC 1291 CFI_STARTPROC
1376/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will 1292/*
1377 see the correct pointer to the pt_regs */ 1293 * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
1294 * see the correct pointer to the pt_regs
1295 */
1378 movq %rdi, %rsp # we don't return, adjust the stack frame 1296 movq %rdi, %rsp # we don't return, adjust the stack frame
1379 CFI_ENDPROC 1297 CFI_ENDPROC
1380 CFI_DEFAULT_STACK 1298 DEFAULT_FRAME
138111: incl %gs:pda_irqcount 129911: incl %gs:pda_irqcount
1382 movq %rsp,%rbp 1300 movq %rsp,%rbp
1383 CFI_DEF_CFA_REGISTER rbp 1301 CFI_DEF_CFA_REGISTER rbp
@@ -1392,23 +1310,26 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1392END(do_hypervisor_callback) 1310END(do_hypervisor_callback)
1393 1311
1394/* 1312/*
1395# Hypervisor uses this for application faults while it executes. 1313 * Hypervisor uses this for application faults while it executes.
1396# We get here for two reasons: 1314 * We get here for two reasons:
1397# 1. Fault while reloading DS, ES, FS or GS 1315 * 1. Fault while reloading DS, ES, FS or GS
1398# 2. Fault while executing IRET 1316 * 2. Fault while executing IRET
1399# Category 1 we do not need to fix up as Xen has already reloaded all segment 1317 * Category 1 we do not need to fix up as Xen has already reloaded all segment
1400# registers that could be reloaded and zeroed the others. 1318 * registers that could be reloaded and zeroed the others.
1401# Category 2 we fix up by killing the current process. We cannot use the 1319 * Category 2 we fix up by killing the current process. We cannot use the
1402# normal Linux return path in this case because if we use the IRET hypercall 1320 * normal Linux return path in this case because if we use the IRET hypercall
1403# to pop the stack frame we end up in an infinite loop of failsafe callbacks. 1321 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1404# We distinguish between categories by comparing each saved segment register 1322 * We distinguish between categories by comparing each saved segment register
1405# with its current contents: any discrepancy means we in category 1. 1323 * with its current contents: any discrepancy means we in category 1.
1406*/ 1324 */
1407ENTRY(xen_failsafe_callback) 1325ENTRY(xen_failsafe_callback)
1408 framesz = (RIP-0x30) /* workaround buggy gas */ 1326 INTR_FRAME 1 (6*8)
1409 _frame framesz 1327 /*CFI_REL_OFFSET gs,GS*/
1410 CFI_REL_OFFSET rcx, 0 1328 /*CFI_REL_OFFSET fs,FS*/
1411 CFI_REL_OFFSET r11, 8 1329 /*CFI_REL_OFFSET es,ES*/
1330 /*CFI_REL_OFFSET ds,DS*/
1331 CFI_REL_OFFSET r11,8
1332 CFI_REL_OFFSET rcx,0
1412 movw %ds,%cx 1333 movw %ds,%cx
1413 cmpw %cx,0x10(%rsp) 1334 cmpw %cx,0x10(%rsp)
1414 CFI_REMEMBER_STATE 1335 CFI_REMEMBER_STATE
@@ -1429,12 +1350,9 @@ ENTRY(xen_failsafe_callback)
1429 CFI_RESTORE r11 1350 CFI_RESTORE r11
1430 addq $0x30,%rsp 1351 addq $0x30,%rsp
1431 CFI_ADJUST_CFA_OFFSET -0x30 1352 CFI_ADJUST_CFA_OFFSET -0x30
1432 pushq $0 1353 pushq_cfi $0 /* RIP */
1433 CFI_ADJUST_CFA_OFFSET 8 1354 pushq_cfi %r11
1434 pushq %r11 1355 pushq_cfi %rcx
1435 CFI_ADJUST_CFA_OFFSET 8
1436 pushq %rcx
1437 CFI_ADJUST_CFA_OFFSET 8
1438 jmp general_protection 1356 jmp general_protection
1439 CFI_RESTORE_STATE 1357 CFI_RESTORE_STATE
14401: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ 13581: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
@@ -1444,11 +1362,223 @@ ENTRY(xen_failsafe_callback)
1444 CFI_RESTORE r11 1362 CFI_RESTORE r11
1445 addq $0x30,%rsp 1363 addq $0x30,%rsp
1446 CFI_ADJUST_CFA_OFFSET -0x30 1364 CFI_ADJUST_CFA_OFFSET -0x30
1447 pushq $0 1365 pushq_cfi $0
1448 CFI_ADJUST_CFA_OFFSET 8
1449 SAVE_ALL 1366 SAVE_ALL
1450 jmp error_exit 1367 jmp error_exit
1451 CFI_ENDPROC 1368 CFI_ENDPROC
1452END(xen_failsafe_callback) 1369END(xen_failsafe_callback)
1453 1370
1454#endif /* CONFIG_XEN */ 1371#endif /* CONFIG_XEN */
1372
1373/*
1374 * Some functions should be protected against kprobes
1375 */
1376 .pushsection .kprobes.text, "ax"
1377
1378paranoidzeroentry_ist debug do_debug DEBUG_STACK
1379paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1380paranoiderrorentry stack_segment do_stack_segment
1381errorentry general_protection do_general_protection
1382errorentry page_fault do_page_fault
1383#ifdef CONFIG_X86_MCE
1384paranoidzeroentry machine_check do_machine_check
1385#endif
1386
1387 /*
1388 * "Paranoid" exit path from exception stack.
1389 * Paranoid because this is used by NMIs and cannot take
1390 * any kernel state for granted.
1391 * We don't do kernel preemption checks here, because only
1392 * NMI should be common and it does not enable IRQs and
1393 * cannot get reschedule ticks.
1394 *
1395 * "trace" is 0 for the NMI handler only, because irq-tracing
1396 * is fundamentally NMI-unsafe. (we cannot change the soft and
1397 * hard flags at once, atomically)
1398 */
1399
1400 /* ebx: no swapgs flag */
1401ENTRY(paranoid_exit)
1402 INTR_FRAME
1403 DISABLE_INTERRUPTS(CLBR_NONE)
1404 TRACE_IRQS_OFF
1405 testl %ebx,%ebx /* swapgs needed? */
1406 jnz paranoid_restore
1407 testl $3,CS(%rsp)
1408 jnz paranoid_userspace
1409paranoid_swapgs:
1410 TRACE_IRQS_IRETQ 0
1411 SWAPGS_UNSAFE_STACK
1412paranoid_restore:
1413 RESTORE_ALL 8
1414 jmp irq_return
1415paranoid_userspace:
1416 GET_THREAD_INFO(%rcx)
1417 movl TI_flags(%rcx),%ebx
1418 andl $_TIF_WORK_MASK,%ebx
1419 jz paranoid_swapgs
1420 movq %rsp,%rdi /* &pt_regs */
1421 call sync_regs
1422 movq %rax,%rsp /* switch stack for scheduling */
1423 testl $_TIF_NEED_RESCHED,%ebx
1424 jnz paranoid_schedule
1425 movl %ebx,%edx /* arg3: thread flags */
1426 TRACE_IRQS_ON
1427 ENABLE_INTERRUPTS(CLBR_NONE)
1428 xorl %esi,%esi /* arg2: oldset */
1429 movq %rsp,%rdi /* arg1: &pt_regs */
1430 call do_notify_resume
1431 DISABLE_INTERRUPTS(CLBR_NONE)
1432 TRACE_IRQS_OFF
1433 jmp paranoid_userspace
1434paranoid_schedule:
1435 TRACE_IRQS_ON
1436 ENABLE_INTERRUPTS(CLBR_ANY)
1437 call schedule
1438 DISABLE_INTERRUPTS(CLBR_ANY)
1439 TRACE_IRQS_OFF
1440 jmp paranoid_userspace
1441 CFI_ENDPROC
1442END(paranoid_exit)
1443
1444/*
1445 * Exception entry point. This expects an error code/orig_rax on the stack.
1446 * Returns the "no swapgs flag" in %ebx.
1447 */
1448ENTRY(error_entry)
1449 XCPT_FRAME
1450 CFI_ADJUST_CFA_OFFSET 15*8
1451 /* oldrax contains error code */
1452 cld
1453 movq_cfi rdi, RDI+8
1454 movq_cfi rsi, RSI+8
1455 movq_cfi rdx, RDX+8
1456 movq_cfi rcx, RCX+8
1457 movq_cfi rax, RAX+8
1458 movq_cfi r8, R8+8
1459 movq_cfi r9, R9+8
1460 movq_cfi r10, R10+8
1461 movq_cfi r11, R11+8
1462 movq_cfi rbx, RBX+8
1463 movq_cfi rbp, RBP+8
1464 movq_cfi r12, R12+8
1465 movq_cfi r13, R13+8
1466 movq_cfi r14, R14+8
1467 movq_cfi r15, R15+8
1468 xorl %ebx,%ebx
1469 testl $3,CS+8(%rsp)
1470 je error_kernelspace
1471error_swapgs:
1472 SWAPGS
1473error_sti:
1474 TRACE_IRQS_OFF
1475 ret
1476 CFI_ENDPROC
1477
1478/*
1479 * There are two places in the kernel that can potentially fault with
1480 * usergs. Handle them here. The exception handlers after iret run with
1481 * kernel gs again, so don't set the user space flag. B stepping K8s
1482 * sometimes report a truncated RIP for IRET exceptions returning to
1483 * compat mode. Check for these here too.
1484 */
1485error_kernelspace:
1486 incl %ebx
1487 leaq irq_return(%rip),%rcx
1488 cmpq %rcx,RIP+8(%rsp)
1489 je error_swapgs
1490 movl %ecx,%ecx /* zero extend */
1491 cmpq %rcx,RIP+8(%rsp)
1492 je error_swapgs
1493 cmpq $gs_change,RIP+8(%rsp)
1494 je error_swapgs
1495 jmp error_sti
1496END(error_entry)
1497
1498
1499/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
1500ENTRY(error_exit)
1501 DEFAULT_FRAME
1502 movl %ebx,%eax
1503 RESTORE_REST
1504 DISABLE_INTERRUPTS(CLBR_NONE)
1505 TRACE_IRQS_OFF
1506 GET_THREAD_INFO(%rcx)
1507 testl %eax,%eax
1508 jne retint_kernel
1509 LOCKDEP_SYS_EXIT_IRQ
1510 movl TI_flags(%rcx),%edx
1511 movl $_TIF_WORK_MASK,%edi
1512 andl %edi,%edx
1513 jnz retint_careful
1514 jmp retint_swapgs
1515 CFI_ENDPROC
1516END(error_exit)
1517
1518
1519 /* runs on exception stack */
1520ENTRY(nmi)
1521 INTR_FRAME
1522 PARAVIRT_ADJUST_EXCEPTION_FRAME
1523 pushq_cfi $-1
1524 subq $15*8, %rsp
1525 CFI_ADJUST_CFA_OFFSET 15*8
1526 call save_paranoid
1527 DEFAULT_FRAME 0
1528 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1529 movq %rsp,%rdi
1530 movq $-1,%rsi
1531 call do_nmi
1532#ifdef CONFIG_TRACE_IRQFLAGS
1533 /* paranoidexit; without TRACE_IRQS_OFF */
1534 /* ebx: no swapgs flag */
1535 DISABLE_INTERRUPTS(CLBR_NONE)
1536 testl %ebx,%ebx /* swapgs needed? */
1537 jnz nmi_restore
1538 testl $3,CS(%rsp)
1539 jnz nmi_userspace
1540nmi_swapgs:
1541 SWAPGS_UNSAFE_STACK
1542nmi_restore:
1543 RESTORE_ALL 8
1544 jmp irq_return
1545nmi_userspace:
1546 GET_THREAD_INFO(%rcx)
1547 movl TI_flags(%rcx),%ebx
1548 andl $_TIF_WORK_MASK,%ebx
1549 jz nmi_swapgs
1550 movq %rsp,%rdi /* &pt_regs */
1551 call sync_regs
1552 movq %rax,%rsp /* switch stack for scheduling */
1553 testl $_TIF_NEED_RESCHED,%ebx
1554 jnz nmi_schedule
1555 movl %ebx,%edx /* arg3: thread flags */
1556 ENABLE_INTERRUPTS(CLBR_NONE)
1557 xorl %esi,%esi /* arg2: oldset */
1558 movq %rsp,%rdi /* arg1: &pt_regs */
1559 call do_notify_resume
1560 DISABLE_INTERRUPTS(CLBR_NONE)
1561 jmp nmi_userspace
1562nmi_schedule:
1563 ENABLE_INTERRUPTS(CLBR_ANY)
1564 call schedule
1565 DISABLE_INTERRUPTS(CLBR_ANY)
1566 jmp nmi_userspace
1567 CFI_ENDPROC
1568#else
1569 jmp paranoid_exit
1570 CFI_ENDPROC
1571#endif
1572END(nmi)
1573
1574ENTRY(ignore_sysret)
1575 CFI_STARTPROC
1576 mov $-ENOSYS,%eax
1577 sysret
1578 CFI_ENDPROC
1579END(ignore_sysret)
1580
1581/*
1582 * End of kprobes section
1583 */
1584 .popsection
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
index 0aa2c443d600..53699c931ad4 100644
--- a/arch/x86/kernel/es7000_32.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -38,8 +38,11 @@
38#include <asm/io.h> 38#include <asm/io.h>
39#include <asm/nmi.h> 39#include <asm/nmi.h>
40#include <asm/smp.h> 40#include <asm/smp.h>
41#include <asm/atomic.h>
41#include <asm/apicdef.h> 42#include <asm/apicdef.h>
42#include <mach_mpparse.h> 43#include <mach_mpparse.h>
44#include <asm/genapic.h>
45#include <asm/setup.h>
43 46
44/* 47/*
45 * ES7000 chipsets 48 * ES7000 chipsets
@@ -161,6 +164,43 @@ es7000_rename_gsi(int ioapic, int gsi)
161 return gsi; 164 return gsi;
162} 165}
163 166
167static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
168{
169 unsigned long vect = 0, psaival = 0;
170
171 if (psai == NULL)
172 return -1;
173
174 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
175 psaival = (0x1000000 | vect | cpu);
176
177 while (*psai & 0x1000000)
178 ;
179
180 *psai = psaival;
181
182 return 0;
183}
184
185static void noop_wait_for_deassert(atomic_t *deassert_not_used)
186{
187}
188
189static int __init es7000_update_genapic(void)
190{
191 genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
192
193 /* MPENTIUMIII */
194 if (boot_cpu_data.x86 == 6 &&
195 (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) {
196 es7000_update_genapic_to_cluster();
197 genapic->wait_for_init_deassert = noop_wait_for_deassert;
198 genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
199 }
200
201 return 0;
202}
203
164void __init 204void __init
165setup_unisys(void) 205setup_unisys(void)
166{ 206{
@@ -176,6 +216,8 @@ setup_unisys(void)
176 else 216 else
177 es7000_plat = ES7000_CLASSIC; 217 es7000_plat = ES7000_CLASSIC;
178 ioapic_renumber_irq = es7000_rename_gsi; 218 ioapic_renumber_irq = es7000_rename_gsi;
219
220 x86_quirks->update_genapic = es7000_update_genapic;
179} 221}
180 222
181/* 223/*
@@ -317,26 +359,6 @@ es7000_mip_write(struct mip_reg *mip_reg)
317 return status; 359 return status;
318} 360}
319 361
320int
321es7000_start_cpu(int cpu, unsigned long eip)
322{
323 unsigned long vect = 0, psaival = 0;
324
325 if (psai == NULL)
326 return -1;
327
328 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
329 psaival = (0x1000000 | vect | cpu);
330
331 while (*psai & 0x1000000)
332 ;
333
334 *psai = psaival;
335
336 return 0;
337
338}
339
340void __init 362void __init
341es7000_sw_apic(void) 363es7000_sw_apic(void)
342{ 364{
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 50ea0ac8c9bf..1b43086b097a 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -14,14 +14,17 @@
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/sched.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/list.h> 19#include <linux/list.h>
19 20
20#include <asm/ftrace.h> 21#include <asm/ftrace.h>
22#include <linux/ftrace.h>
21#include <asm/nops.h> 23#include <asm/nops.h>
24#include <asm/nmi.h>
22 25
23 26
24static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; 27#ifdef CONFIG_DYNAMIC_FTRACE
25 28
26union ftrace_code_union { 29union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE]; 30 char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@ union ftrace_code_union {
31 } __attribute__((packed)); 34 } __attribute__((packed));
32}; 35};
33 36
34
35static int ftrace_calc_offset(long ip, long addr) 37static int ftrace_calc_offset(long ip, long addr)
36{ 38{
37 return (int)(addr - ip); 39 return (int)(addr - ip);
38} 40}
39 41
40unsigned char *ftrace_nop_replace(void) 42static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
41{
42 return ftrace_nop;
43}
44
45unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
46{ 43{
47 static union ftrace_code_union calc; 44 static union ftrace_code_union calc;
48 45
@@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
56 return calc.code; 53 return calc.code;
57} 54}
58 55
59int 56/*
57 * Modifying code must take extra care. On an SMP machine, if
58 * the code being modified is also being executed on another CPU
59 * that CPU will have undefined results and possibly take a GPF.
60 * We use kstop_machine to stop other CPUs from executing code.
61 * But this does not stop NMIs from happening. We still need
62 * to protect against that. We separate out the modification of
63 * the code to take care of this.
64 *
65 * Two buffers are added: An IP buffer and a "code" buffer.
66 *
67 * 1) Put the instruction pointer into the IP buffer
68 * and the new code into the "code" buffer.
69 * 2) Set a flag that says we are modifying code
70 * 3) Wait for any running NMIs to finish.
71 * 4) Write the code
72 * 5) clear the flag.
73 * 6) Wait for any running NMIs to finish.
74 *
75 * If an NMI is executed, the first thing it does is to call
76 * "ftrace_nmi_enter". This will check if the flag is set to write
77 * and if it is, it will write what is in the IP and "code" buffers.
78 *
79 * The trick is, it does not matter if everyone is writing the same
80 * content to the code location. Also, if a CPU is executing code
81 * it is OK to write to that code location if the contents being written
82 * are the same as what exists.
83 */
84
85static atomic_t in_nmi = ATOMIC_INIT(0);
86static int mod_code_status; /* holds return value of text write */
87static int mod_code_write; /* set when NMI should do the write */
88static void *mod_code_ip; /* holds the IP to write to */
89static void *mod_code_newcode; /* holds the text to write to the IP */
90
91static unsigned nmi_wait_count;
92static atomic_t nmi_update_count = ATOMIC_INIT(0);
93
94int ftrace_arch_read_dyn_info(char *buf, int size)
95{
96 int r;
97
98 r = snprintf(buf, size, "%u %u",
99 nmi_wait_count,
100 atomic_read(&nmi_update_count));
101 return r;
102}
103
104static void ftrace_mod_code(void)
105{
106 /*
107 * Yes, more than one CPU process can be writing to mod_code_status.
108 * (and the code itself)
109 * But if one were to fail, then they all should, and if one were
110 * to succeed, then they all should.
111 */
112 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
113 MCOUNT_INSN_SIZE);
114}
115
116void ftrace_nmi_enter(void)
117{
118 atomic_inc(&in_nmi);
119 /* Must have in_nmi seen before reading write flag */
120 smp_mb();
121 if (mod_code_write) {
122 ftrace_mod_code();
123 atomic_inc(&nmi_update_count);
124 }
125}
126
127void ftrace_nmi_exit(void)
128{
129 /* Finish all executions before clearing in_nmi */
130 smp_wmb();
131 atomic_dec(&in_nmi);
132}
133
134static void wait_for_nmi(void)
135{
136 int waited = 0;
137
138 while (atomic_read(&in_nmi)) {
139 waited = 1;
140 cpu_relax();
141 }
142
143 if (waited)
144 nmi_wait_count++;
145}
146
147static int
148do_ftrace_mod_code(unsigned long ip, void *new_code)
149{
150 mod_code_ip = (void *)ip;
151 mod_code_newcode = new_code;
152
153 /* The buffers need to be visible before we let NMIs write them */
154 smp_wmb();
155
156 mod_code_write = 1;
157
158 /* Make sure write bit is visible before we wait on NMIs */
159 smp_mb();
160
161 wait_for_nmi();
162
163 /* Make sure all running NMIs have finished before we write the code */
164 smp_mb();
165
166 ftrace_mod_code();
167
168 /* Make sure the write happens before clearing the bit */
169 smp_wmb();
170
171 mod_code_write = 0;
172
173 /* make sure NMIs see the cleared bit */
174 smp_mb();
175
176 wait_for_nmi();
177
178 return mod_code_status;
179}
180
181
182
183
184static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
185
186static unsigned char *ftrace_nop_replace(void)
187{
188 return ftrace_nop;
189}
190
191static int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code, 192ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code) 193 unsigned char *new_code)
62{ 194{
@@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
81 return -EINVAL; 213 return -EINVAL;
82 214
83 /* replace the text with the new text */ 215 /* replace the text with the new text */
84 if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) 216 if (do_ftrace_mod_code(ip, new_code))
85 return -EPERM; 217 return -EPERM;
86 218
87 sync_core(); 219 sync_core();
@@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
89 return 0; 221 return 0;
90} 222}
91 223
224int ftrace_make_nop(struct module *mod,
225 struct dyn_ftrace *rec, unsigned long addr)
226{
227 unsigned char *new, *old;
228 unsigned long ip = rec->ip;
229
230 old = ftrace_call_replace(ip, addr);
231 new = ftrace_nop_replace();
232
233 return ftrace_modify_code(rec->ip, old, new);
234}
235
236int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
237{
238 unsigned char *new, *old;
239 unsigned long ip = rec->ip;
240
241 old = ftrace_nop_replace();
242 new = ftrace_call_replace(ip, addr);
243
244 return ftrace_modify_code(rec->ip, old, new);
245}
246
92int ftrace_update_ftrace_func(ftrace_func_t func) 247int ftrace_update_ftrace_func(ftrace_func_t func)
93{ 248{
94 unsigned long ip = (unsigned long)(&ftrace_call); 249 unsigned long ip = (unsigned long)(&ftrace_call);
@@ -165,3 +320,218 @@ int __init ftrace_dyn_arch_init(void *data)
165 320
166 return 0; 321 return 0;
167} 322}
323#endif
324
325#ifdef CONFIG_FUNCTION_GRAPH_TRACER
326
327#ifdef CONFIG_DYNAMIC_FTRACE
328extern void ftrace_graph_call(void);
329
330static int ftrace_mod_jmp(unsigned long ip,
331 int old_offset, int new_offset)
332{
333 unsigned char code[MCOUNT_INSN_SIZE];
334
335 if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
336 return -EFAULT;
337
338 if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
339 return -EINVAL;
340
341 *(int *)(&code[1]) = new_offset;
342
343 if (do_ftrace_mod_code(ip, &code))
344 return -EPERM;
345
346 return 0;
347}
348
349int ftrace_enable_ftrace_graph_caller(void)
350{
351 unsigned long ip = (unsigned long)(&ftrace_graph_call);
352 int old_offset, new_offset;
353
354 old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
355 new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
356
357 return ftrace_mod_jmp(ip, old_offset, new_offset);
358}
359
360int ftrace_disable_ftrace_graph_caller(void)
361{
362 unsigned long ip = (unsigned long)(&ftrace_graph_call);
363 int old_offset, new_offset;
364
365 old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
366 new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
367
368 return ftrace_mod_jmp(ip, old_offset, new_offset);
369}
370
371#else /* CONFIG_DYNAMIC_FTRACE */
372
373/*
374 * These functions are picked from those used on
375 * this page for dynamic ftrace. They have been
376 * simplified to ignore all traces in NMI context.
377 */
378static atomic_t in_nmi;
379
380void ftrace_nmi_enter(void)
381{
382 atomic_inc(&in_nmi);
383}
384
385void ftrace_nmi_exit(void)
386{
387 atomic_dec(&in_nmi);
388}
389
390#endif /* !CONFIG_DYNAMIC_FTRACE */
391
392/* Add a function return address to the trace stack on thread info.*/
393static int push_return_trace(unsigned long ret, unsigned long long time,
394 unsigned long func, int *depth)
395{
396 int index;
397
398 if (!current->ret_stack)
399 return -EBUSY;
400
401 /* The return trace stack is full */
402 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
403 atomic_inc(&current->trace_overrun);
404 return -EBUSY;
405 }
406
407 index = ++current->curr_ret_stack;
408 barrier();
409 current->ret_stack[index].ret = ret;
410 current->ret_stack[index].func = func;
411 current->ret_stack[index].calltime = time;
412 *depth = index;
413
414 return 0;
415}
416
417/* Retrieve a function return address to the trace stack on thread info.*/
418static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
419{
420 int index;
421
422 index = current->curr_ret_stack;
423
424 if (unlikely(index < 0)) {
425 ftrace_graph_stop();
426 WARN_ON(1);
427 /* Might as well panic, otherwise we have no where to go */
428 *ret = (unsigned long)panic;
429 return;
430 }
431
432 *ret = current->ret_stack[index].ret;
433 trace->func = current->ret_stack[index].func;
434 trace->calltime = current->ret_stack[index].calltime;
435 trace->overrun = atomic_read(&current->trace_overrun);
436 trace->depth = index;
437 barrier();
438 current->curr_ret_stack--;
439
440}
441
442/*
443 * Send the trace to the ring-buffer.
444 * @return the original return address.
445 */
446unsigned long ftrace_return_to_handler(void)
447{
448 struct ftrace_graph_ret trace;
449 unsigned long ret;
450
451 pop_return_trace(&trace, &ret);
452 trace.rettime = cpu_clock(raw_smp_processor_id());
453 ftrace_graph_return(&trace);
454
455 if (unlikely(!ret)) {
456 ftrace_graph_stop();
457 WARN_ON(1);
458 /* Might as well panic. What else to do? */
459 ret = (unsigned long)panic;
460 }
461
462 return ret;
463}
464
465/*
466 * Hook the return address and push it in the stack of return addrs
467 * in current thread info.
468 */
469void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
470{
471 unsigned long old;
472 unsigned long long calltime;
473 int faulted;
474 struct ftrace_graph_ent trace;
475 unsigned long return_hooker = (unsigned long)
476 &return_to_handler;
477
478 /* Nmi's are currently unsupported */
479 if (unlikely(atomic_read(&in_nmi)))
480 return;
481
482 if (unlikely(atomic_read(&current->tracing_graph_pause)))
483 return;
484
485 /*
486 * Protect against fault, even if it shouldn't
487 * happen. This tool is too much intrusive to
488 * ignore such a protection.
489 */
490 asm volatile(
491 "1: " _ASM_MOV " (%[parent_old]), %[old]\n"
492 "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n"
493 " movl $0, %[faulted]\n"
494
495 ".section .fixup, \"ax\"\n"
496 "3: movl $1, %[faulted]\n"
497 ".previous\n"
498
499 _ASM_EXTABLE(1b, 3b)
500 _ASM_EXTABLE(2b, 3b)
501
502 : [parent_replaced] "=r" (parent), [old] "=r" (old),
503 [faulted] "=r" (faulted)
504 : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
505 : "memory"
506 );
507
508 if (unlikely(faulted)) {
509 ftrace_graph_stop();
510 WARN_ON(1);
511 return;
512 }
513
514 if (unlikely(!__kernel_text_address(old))) {
515 ftrace_graph_stop();
516 *parent = old;
517 WARN_ON(1);
518 return;
519 }
520
521 calltime = cpu_clock(raw_smp_processor_id());
522
523 if (push_return_trace(old, calltime,
524 self_addr, &trace.depth) == -EBUSY) {
525 *parent = old;
526 return;
527 }
528
529 trace.func = self_addr;
530
531 /* Only trace if the calling function expects to */
532 if (!ftrace_graph_entry(&trace)) {
533 current->curr_ret_stack--;
534 *parent = old;
535 }
536}
537#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 6c9bfc9e1e95..2bced78b0b8e 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -21,6 +21,7 @@
21#include <asm/smp.h> 21#include <asm/smp.h>
22#include <asm/ipi.h> 22#include <asm/ipi.h>
23#include <asm/genapic.h> 23#include <asm/genapic.h>
24#include <asm/setup.h>
24 25
25extern struct genapic apic_flat; 26extern struct genapic apic_flat;
26extern struct genapic apic_physflat; 27extern struct genapic apic_physflat;
@@ -53,6 +54,9 @@ void __init setup_apic_routing(void)
53 genapic = &apic_physflat; 54 genapic = &apic_physflat;
54 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); 55 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
55 } 56 }
57
58 if (x86_quirks->update_genapic)
59 x86_quirks->update_genapic();
56} 60}
57 61
58/* Same for both flat and physical. */ 62/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index c0262791bda4..34185488e4fb 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -30,12 +30,12 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
30 return 1; 30 return 1;
31} 31}
32 32
33static cpumask_t flat_target_cpus(void) 33static const struct cpumask *flat_target_cpus(void)
34{ 34{
35 return cpu_online_map; 35 return cpu_online_mask;
36} 36}
37 37
38static cpumask_t flat_vector_allocation_domain(int cpu) 38static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
39{ 39{
40 /* Careful. Some cpus do not strictly honor the set of cpus 40 /* Careful. Some cpus do not strictly honor the set of cpus
41 * specified in the interrupt destination when using lowest 41 * specified in the interrupt destination when using lowest
@@ -45,8 +45,8 @@ static cpumask_t flat_vector_allocation_domain(int cpu)
45 * deliver interrupts to the wrong hyperthread when only one 45 * deliver interrupts to the wrong hyperthread when only one
46 * hyperthread was specified in the interrupt desitination. 46 * hyperthread was specified in the interrupt desitination.
47 */ 47 */
48 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; 48 cpumask_clear(retmask);
49 return domain; 49 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
50} 50}
51 51
52/* 52/*
@@ -69,9 +69,8 @@ static void flat_init_apic_ldr(void)
69 apic_write(APIC_LDR, val); 69 apic_write(APIC_LDR, val);
70} 70}
71 71
72static void flat_send_IPI_mask(cpumask_t cpumask, int vector) 72static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
73{ 73{
74 unsigned long mask = cpus_addr(cpumask)[0];
75 unsigned long flags; 74 unsigned long flags;
76 75
77 local_irq_save(flags); 76 local_irq_save(flags);
@@ -79,20 +78,41 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
79 local_irq_restore(flags); 78 local_irq_restore(flags);
80} 79}
81 80
81static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
82{
83 unsigned long mask = cpumask_bits(cpumask)[0];
84
85 _flat_send_IPI_mask(mask, vector);
86}
87
88static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
89 int vector)
90{
91 unsigned long mask = cpumask_bits(cpumask)[0];
92 int cpu = smp_processor_id();
93
94 if (cpu < BITS_PER_LONG)
95 clear_bit(cpu, &mask);
96 _flat_send_IPI_mask(mask, vector);
97}
98
82static void flat_send_IPI_allbutself(int vector) 99static void flat_send_IPI_allbutself(int vector)
83{ 100{
101 int cpu = smp_processor_id();
84#ifdef CONFIG_HOTPLUG_CPU 102#ifdef CONFIG_HOTPLUG_CPU
85 int hotplug = 1; 103 int hotplug = 1;
86#else 104#else
87 int hotplug = 0; 105 int hotplug = 0;
88#endif 106#endif
89 if (hotplug || vector == NMI_VECTOR) { 107 if (hotplug || vector == NMI_VECTOR) {
90 cpumask_t allbutme = cpu_online_map; 108 if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
109 unsigned long mask = cpumask_bits(cpu_online_mask)[0];
91 110
92 cpu_clear(smp_processor_id(), allbutme); 111 if (cpu < BITS_PER_LONG)
112 clear_bit(cpu, &mask);
93 113
94 if (!cpus_empty(allbutme)) 114 _flat_send_IPI_mask(mask, vector);
95 flat_send_IPI_mask(allbutme, vector); 115 }
96 } else if (num_online_cpus() > 1) { 116 } else if (num_online_cpus() > 1) {
97 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); 117 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
98 } 118 }
@@ -101,7 +121,7 @@ static void flat_send_IPI_allbutself(int vector)
101static void flat_send_IPI_all(int vector) 121static void flat_send_IPI_all(int vector)
102{ 122{
103 if (vector == NMI_VECTOR) 123 if (vector == NMI_VECTOR)
104 flat_send_IPI_mask(cpu_online_map, vector); 124 flat_send_IPI_mask(cpu_online_mask, vector);
105 else 125 else
106 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); 126 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
107} 127}
@@ -135,9 +155,18 @@ static int flat_apic_id_registered(void)
135 return physid_isset(read_xapic_id(), phys_cpu_present_map); 155 return physid_isset(read_xapic_id(), phys_cpu_present_map);
136} 156}
137 157
138static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) 158static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
159{
160 return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
161}
162
163static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
164 const struct cpumask *andmask)
139{ 165{
140 return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; 166 unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
167 unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
168
169 return mask1 & mask2;
141} 170}
142 171
143static unsigned int phys_pkg_id(int index_msb) 172static unsigned int phys_pkg_id(int index_msb)
@@ -157,8 +186,10 @@ struct genapic apic_flat = {
157 .send_IPI_all = flat_send_IPI_all, 186 .send_IPI_all = flat_send_IPI_all,
158 .send_IPI_allbutself = flat_send_IPI_allbutself, 187 .send_IPI_allbutself = flat_send_IPI_allbutself,
159 .send_IPI_mask = flat_send_IPI_mask, 188 .send_IPI_mask = flat_send_IPI_mask,
189 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
160 .send_IPI_self = apic_send_IPI_self, 190 .send_IPI_self = apic_send_IPI_self,
161 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, 191 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
192 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
162 .phys_pkg_id = phys_pkg_id, 193 .phys_pkg_id = phys_pkg_id,
163 .get_apic_id = get_apic_id, 194 .get_apic_id = get_apic_id,
164 .set_apic_id = set_apic_id, 195 .set_apic_id = set_apic_id,
@@ -188,35 +219,39 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
188 return 0; 219 return 0;
189} 220}
190 221
191static cpumask_t physflat_target_cpus(void) 222static const struct cpumask *physflat_target_cpus(void)
192{ 223{
193 return cpu_online_map; 224 return cpu_online_mask;
194} 225}
195 226
196static cpumask_t physflat_vector_allocation_domain(int cpu) 227static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
197{ 228{
198 return cpumask_of_cpu(cpu); 229 cpumask_clear(retmask);
230 cpumask_set_cpu(cpu, retmask);
199} 231}
200 232
201static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) 233static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
202{ 234{
203 send_IPI_mask_sequence(cpumask, vector); 235 send_IPI_mask_sequence(cpumask, vector);
204} 236}
205 237
206static void physflat_send_IPI_allbutself(int vector) 238static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
239 int vector)
207{ 240{
208 cpumask_t allbutme = cpu_online_map; 241 send_IPI_mask_allbutself(cpumask, vector);
242}
209 243
210 cpu_clear(smp_processor_id(), allbutme); 244static void physflat_send_IPI_allbutself(int vector)
211 physflat_send_IPI_mask(allbutme, vector); 245{
246 send_IPI_mask_allbutself(cpu_online_mask, vector);
212} 247}
213 248
214static void physflat_send_IPI_all(int vector) 249static void physflat_send_IPI_all(int vector)
215{ 250{
216 physflat_send_IPI_mask(cpu_online_map, vector); 251 physflat_send_IPI_mask(cpu_online_mask, vector);
217} 252}
218 253
219static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) 254static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
220{ 255{
221 int cpu; 256 int cpu;
222 257
@@ -224,13 +259,31 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
224 * We're using fixed IRQ delivery, can only return one phys APIC ID. 259 * We're using fixed IRQ delivery, can only return one phys APIC ID.
225 * May as well be the first. 260 * May as well be the first.
226 */ 261 */
227 cpu = first_cpu(cpumask); 262 cpu = cpumask_first(cpumask);
228 if ((unsigned)cpu < nr_cpu_ids) 263 if ((unsigned)cpu < nr_cpu_ids)
229 return per_cpu(x86_cpu_to_apicid, cpu); 264 return per_cpu(x86_cpu_to_apicid, cpu);
230 else 265 else
231 return BAD_APICID; 266 return BAD_APICID;
232} 267}
233 268
269static unsigned int
270physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
271 const struct cpumask *andmask)
272{
273 int cpu;
274
275 /*
276 * We're using fixed IRQ delivery, can only return one phys APIC ID.
277 * May as well be the first.
278 */
279 for_each_cpu_and(cpu, cpumask, andmask)
280 if (cpumask_test_cpu(cpu, cpu_online_mask))
281 break;
282 if (cpu < nr_cpu_ids)
283 return per_cpu(x86_cpu_to_apicid, cpu);
284 return BAD_APICID;
285}
286
234struct genapic apic_physflat = { 287struct genapic apic_physflat = {
235 .name = "physical flat", 288 .name = "physical flat",
236 .acpi_madt_oem_check = physflat_acpi_madt_oem_check, 289 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
@@ -243,8 +296,10 @@ struct genapic apic_physflat = {
243 .send_IPI_all = physflat_send_IPI_all, 296 .send_IPI_all = physflat_send_IPI_all,
244 .send_IPI_allbutself = physflat_send_IPI_allbutself, 297 .send_IPI_allbutself = physflat_send_IPI_allbutself,
245 .send_IPI_mask = physflat_send_IPI_mask, 298 .send_IPI_mask = physflat_send_IPI_mask,
299 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
246 .send_IPI_self = apic_send_IPI_self, 300 .send_IPI_self = apic_send_IPI_self,
247 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 301 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
302 .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
248 .phys_pkg_id = phys_pkg_id, 303 .phys_pkg_id = phys_pkg_id,
249 .get_apic_id = get_apic_id, 304 .get_apic_id = get_apic_id,
250 .set_apic_id = set_apic_id, 305 .set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
index f6a2c8eb48a6..6ce497cc372d 100644
--- a/arch/x86/kernel/genx2apic_cluster.c
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -22,19 +22,18 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
22 22
23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
24 24
25static cpumask_t x2apic_target_cpus(void) 25static const struct cpumask *x2apic_target_cpus(void)
26{ 26{
27 return cpumask_of_cpu(0); 27 return cpumask_of(0);
28} 28}
29 29
30/* 30/*
31 * for now each logical cpu is in its own vector allocation domain. 31 * for now each logical cpu is in its own vector allocation domain.
32 */ 32 */
33static cpumask_t x2apic_vector_allocation_domain(int cpu) 33static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
34{ 34{
35 cpumask_t domain = CPU_MASK_NONE; 35 cpumask_clear(retmask);
36 cpu_set(cpu, domain); 36 cpumask_set_cpu(cpu, retmask);
37 return domain;
38} 37}
39 38
40static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, 39static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -56,32 +55,53 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
56 * at once. We have 16 cpu's in a cluster. This will minimize IPI register 55 * at once. We have 16 cpu's in a cluster. This will minimize IPI register
57 * writes. 56 * writes.
58 */ 57 */
59static void x2apic_send_IPI_mask(cpumask_t mask, int vector) 58static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
60{ 59{
61 unsigned long flags; 60 unsigned long flags;
62 unsigned long query_cpu; 61 unsigned long query_cpu;
63 62
64 local_irq_save(flags); 63 local_irq_save(flags);
65 for_each_cpu_mask(query_cpu, mask) { 64 for_each_cpu(query_cpu, mask)
66 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu), 65 __x2apic_send_IPI_dest(
67 vector, APIC_DEST_LOGICAL); 66 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
68 } 67 vector, APIC_DEST_LOGICAL);
69 local_irq_restore(flags); 68 local_irq_restore(flags);
70} 69}
71 70
72static void x2apic_send_IPI_allbutself(int vector) 71static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
72 int vector)
73{ 73{
74 cpumask_t mask = cpu_online_map; 74 unsigned long flags;
75 unsigned long query_cpu;
76 unsigned long this_cpu = smp_processor_id();
75 77
76 cpu_clear(smp_processor_id(), mask); 78 local_irq_save(flags);
79 for_each_cpu(query_cpu, mask)
80 if (query_cpu != this_cpu)
81 __x2apic_send_IPI_dest(
82 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
83 vector, APIC_DEST_LOGICAL);
84 local_irq_restore(flags);
85}
86
87static void x2apic_send_IPI_allbutself(int vector)
88{
89 unsigned long flags;
90 unsigned long query_cpu;
91 unsigned long this_cpu = smp_processor_id();
77 92
78 if (!cpus_empty(mask)) 93 local_irq_save(flags);
79 x2apic_send_IPI_mask(mask, vector); 94 for_each_online_cpu(query_cpu)
95 if (query_cpu != this_cpu)
96 __x2apic_send_IPI_dest(
97 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
98 vector, APIC_DEST_LOGICAL);
99 local_irq_restore(flags);
80} 100}
81 101
82static void x2apic_send_IPI_all(int vector) 102static void x2apic_send_IPI_all(int vector)
83{ 103{
84 x2apic_send_IPI_mask(cpu_online_map, vector); 104 x2apic_send_IPI_mask(cpu_online_mask, vector);
85} 105}
86 106
87static int x2apic_apic_id_registered(void) 107static int x2apic_apic_id_registered(void)
@@ -89,21 +109,38 @@ static int x2apic_apic_id_registered(void)
89 return 1; 109 return 1;
90} 110}
91 111
92static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) 112static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
93{ 113{
94 int cpu; 114 int cpu;
95 115
96 /* 116 /*
97 * We're using fixed IRQ delivery, can only return one phys APIC ID. 117 * We're using fixed IRQ delivery, can only return one logical APIC ID.
98 * May as well be the first. 118 * May as well be the first.
99 */ 119 */
100 cpu = first_cpu(cpumask); 120 cpu = cpumask_first(cpumask);
101 if ((unsigned)cpu < NR_CPUS) 121 if ((unsigned)cpu < nr_cpu_ids)
102 return per_cpu(x86_cpu_to_logical_apicid, cpu); 122 return per_cpu(x86_cpu_to_logical_apicid, cpu);
103 else 123 else
104 return BAD_APICID; 124 return BAD_APICID;
105} 125}
106 126
127static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
128 const struct cpumask *andmask)
129{
130 int cpu;
131
132 /*
133 * We're using fixed IRQ delivery, can only return one logical APIC ID.
134 * May as well be the first.
135 */
136 for_each_cpu_and(cpu, cpumask, andmask)
137 if (cpumask_test_cpu(cpu, cpu_online_mask))
138 break;
139 if (cpu < nr_cpu_ids)
140 return per_cpu(x86_cpu_to_logical_apicid, cpu);
141 return BAD_APICID;
142}
143
107static unsigned int get_apic_id(unsigned long x) 144static unsigned int get_apic_id(unsigned long x)
108{ 145{
109 unsigned int id; 146 unsigned int id;
@@ -150,8 +187,10 @@ struct genapic apic_x2apic_cluster = {
150 .send_IPI_all = x2apic_send_IPI_all, 187 .send_IPI_all = x2apic_send_IPI_all,
151 .send_IPI_allbutself = x2apic_send_IPI_allbutself, 188 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
152 .send_IPI_mask = x2apic_send_IPI_mask, 189 .send_IPI_mask = x2apic_send_IPI_mask,
190 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
153 .send_IPI_self = x2apic_send_IPI_self, 191 .send_IPI_self = x2apic_send_IPI_self,
154 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 192 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
193 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
155 .phys_pkg_id = phys_pkg_id, 194 .phys_pkg_id = phys_pkg_id,
156 .get_apic_id = get_apic_id, 195 .get_apic_id = get_apic_id,
157 .set_apic_id = set_apic_id, 196 .set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
index d042211768b7..21bcc0e098ba 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -29,16 +29,15 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
29 29
30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
31 31
32static cpumask_t x2apic_target_cpus(void) 32static const struct cpumask *x2apic_target_cpus(void)
33{ 33{
34 return cpumask_of_cpu(0); 34 return cpumask_of(0);
35} 35}
36 36
37static cpumask_t x2apic_vector_allocation_domain(int cpu) 37static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
38{ 38{
39 cpumask_t domain = CPU_MASK_NONE; 39 cpumask_clear(retmask);
40 cpu_set(cpu, domain); 40 cpumask_set_cpu(cpu, retmask);
41 return domain;
42} 41}
43 42
44static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, 43static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -54,32 +53,54 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
54 x2apic_icr_write(cfg, apicid); 53 x2apic_icr_write(cfg, apicid);
55} 54}
56 55
57static void x2apic_send_IPI_mask(cpumask_t mask, int vector) 56static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
58{ 57{
59 unsigned long flags; 58 unsigned long flags;
60 unsigned long query_cpu; 59 unsigned long query_cpu;
61 60
62 local_irq_save(flags); 61 local_irq_save(flags);
63 for_each_cpu_mask(query_cpu, mask) { 62 for_each_cpu(query_cpu, mask) {
64 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), 63 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
65 vector, APIC_DEST_PHYSICAL); 64 vector, APIC_DEST_PHYSICAL);
66 } 65 }
67 local_irq_restore(flags); 66 local_irq_restore(flags);
68} 67}
69 68
70static void x2apic_send_IPI_allbutself(int vector) 69static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
70 int vector)
71{ 71{
72 cpumask_t mask = cpu_online_map; 72 unsigned long flags;
73 unsigned long query_cpu;
74 unsigned long this_cpu = smp_processor_id();
75
76 local_irq_save(flags);
77 for_each_cpu(query_cpu, mask) {
78 if (query_cpu != this_cpu)
79 __x2apic_send_IPI_dest(
80 per_cpu(x86_cpu_to_apicid, query_cpu),
81 vector, APIC_DEST_PHYSICAL);
82 }
83 local_irq_restore(flags);
84}
73 85
74 cpu_clear(smp_processor_id(), mask); 86static void x2apic_send_IPI_allbutself(int vector)
87{
88 unsigned long flags;
89 unsigned long query_cpu;
90 unsigned long this_cpu = smp_processor_id();
75 91
76 if (!cpus_empty(mask)) 92 local_irq_save(flags);
77 x2apic_send_IPI_mask(mask, vector); 93 for_each_online_cpu(query_cpu)
94 if (query_cpu != this_cpu)
95 __x2apic_send_IPI_dest(
96 per_cpu(x86_cpu_to_apicid, query_cpu),
97 vector, APIC_DEST_PHYSICAL);
98 local_irq_restore(flags);
78} 99}
79 100
80static void x2apic_send_IPI_all(int vector) 101static void x2apic_send_IPI_all(int vector)
81{ 102{
82 x2apic_send_IPI_mask(cpu_online_map, vector); 103 x2apic_send_IPI_mask(cpu_online_mask, vector);
83} 104}
84 105
85static int x2apic_apic_id_registered(void) 106static int x2apic_apic_id_registered(void)
@@ -87,7 +108,7 @@ static int x2apic_apic_id_registered(void)
87 return 1; 108 return 1;
88} 109}
89 110
90static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) 111static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
91{ 112{
92 int cpu; 113 int cpu;
93 114
@@ -95,13 +116,30 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
95 * We're using fixed IRQ delivery, can only return one phys APIC ID. 116 * We're using fixed IRQ delivery, can only return one phys APIC ID.
96 * May as well be the first. 117 * May as well be the first.
97 */ 118 */
98 cpu = first_cpu(cpumask); 119 cpu = cpumask_first(cpumask);
99 if ((unsigned)cpu < NR_CPUS) 120 if ((unsigned)cpu < nr_cpu_ids)
100 return per_cpu(x86_cpu_to_apicid, cpu); 121 return per_cpu(x86_cpu_to_apicid, cpu);
101 else 122 else
102 return BAD_APICID; 123 return BAD_APICID;
103} 124}
104 125
126static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
127 const struct cpumask *andmask)
128{
129 int cpu;
130
131 /*
132 * We're using fixed IRQ delivery, can only return one phys APIC ID.
133 * May as well be the first.
134 */
135 for_each_cpu_and(cpu, cpumask, andmask)
136 if (cpumask_test_cpu(cpu, cpu_online_mask))
137 break;
138 if (cpu < nr_cpu_ids)
139 return per_cpu(x86_cpu_to_apicid, cpu);
140 return BAD_APICID;
141}
142
105static unsigned int get_apic_id(unsigned long x) 143static unsigned int get_apic_id(unsigned long x)
106{ 144{
107 unsigned int id; 145 unsigned int id;
@@ -123,12 +161,12 @@ static unsigned int phys_pkg_id(int index_msb)
123 return current_cpu_data.initial_apicid >> index_msb; 161 return current_cpu_data.initial_apicid >> index_msb;
124} 162}
125 163
126void x2apic_send_IPI_self(int vector) 164static void x2apic_send_IPI_self(int vector)
127{ 165{
128 apic_write(APIC_SELF_IPI, vector); 166 apic_write(APIC_SELF_IPI, vector);
129} 167}
130 168
131void init_x2apic_ldr(void) 169static void init_x2apic_ldr(void)
132{ 170{
133 return; 171 return;
134} 172}
@@ -145,8 +183,10 @@ struct genapic apic_x2apic_phys = {
145 .send_IPI_all = x2apic_send_IPI_all, 183 .send_IPI_all = x2apic_send_IPI_all,
146 .send_IPI_allbutself = x2apic_send_IPI_allbutself, 184 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
147 .send_IPI_mask = x2apic_send_IPI_mask, 185 .send_IPI_mask = x2apic_send_IPI_mask,
186 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
148 .send_IPI_self = x2apic_send_IPI_self, 187 .send_IPI_self = x2apic_send_IPI_self,
149 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 188 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
189 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
150 .phys_pkg_id = phys_pkg_id, 190 .phys_pkg_id = phys_pkg_id,
151 .get_apic_id = get_apic_id, 191 .get_apic_id = get_apic_id,
152 .set_apic_id = set_apic_id, 192 .set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 2c7dbdb98278..b193e082f6ce 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/threads.h> 12#include <linux/threads.h>
13#include <linux/cpu.h>
13#include <linux/cpumask.h> 14#include <linux/cpumask.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/ctype.h> 16#include <linux/ctype.h>
@@ -17,6 +18,9 @@
17#include <linux/sched.h> 18#include <linux/sched.h>
18#include <linux/module.h> 19#include <linux/module.h>
19#include <linux/hardirq.h> 20#include <linux/hardirq.h>
21#include <linux/timer.h>
22#include <linux/proc_fs.h>
23#include <asm/current.h>
20#include <asm/smp.h> 24#include <asm/smp.h>
21#include <asm/ipi.h> 25#include <asm/ipi.h>
22#include <asm/genapic.h> 26#include <asm/genapic.h>
@@ -75,16 +79,15 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second);
75 79
76/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 80/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
77 81
78static cpumask_t uv_target_cpus(void) 82static const struct cpumask *uv_target_cpus(void)
79{ 83{
80 return cpumask_of_cpu(0); 84 return cpumask_of(0);
81} 85}
82 86
83static cpumask_t uv_vector_allocation_domain(int cpu) 87static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
84{ 88{
85 cpumask_t domain = CPU_MASK_NONE; 89 cpumask_clear(retmask);
86 cpu_set(cpu, domain); 90 cpumask_set_cpu(cpu, retmask);
87 return domain;
88} 91}
89 92
90int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) 93int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
@@ -123,28 +126,37 @@ static void uv_send_IPI_one(int cpu, int vector)
123 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 126 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
124} 127}
125 128
126static void uv_send_IPI_mask(cpumask_t mask, int vector) 129static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
127{ 130{
128 unsigned int cpu; 131 unsigned int cpu;
129 132
130 for_each_possible_cpu(cpu) 133 for_each_cpu(cpu, mask)
131 if (cpu_isset(cpu, mask)) 134 uv_send_IPI_one(cpu, vector);
135}
136
137static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
138{
139 unsigned int cpu;
140 unsigned int this_cpu = smp_processor_id();
141
142 for_each_cpu(cpu, mask)
143 if (cpu != this_cpu)
132 uv_send_IPI_one(cpu, vector); 144 uv_send_IPI_one(cpu, vector);
133} 145}
134 146
135static void uv_send_IPI_allbutself(int vector) 147static void uv_send_IPI_allbutself(int vector)
136{ 148{
137 cpumask_t mask = cpu_online_map; 149 unsigned int cpu;
138 150 unsigned int this_cpu = smp_processor_id();
139 cpu_clear(smp_processor_id(), mask);
140 151
141 if (!cpus_empty(mask)) 152 for_each_online_cpu(cpu)
142 uv_send_IPI_mask(mask, vector); 153 if (cpu != this_cpu)
154 uv_send_IPI_one(cpu, vector);
143} 155}
144 156
145static void uv_send_IPI_all(int vector) 157static void uv_send_IPI_all(int vector)
146{ 158{
147 uv_send_IPI_mask(cpu_online_map, vector); 159 uv_send_IPI_mask(cpu_online_mask, vector);
148} 160}
149 161
150static int uv_apic_id_registered(void) 162static int uv_apic_id_registered(void)
@@ -156,7 +168,7 @@ static void uv_init_apic_ldr(void)
156{ 168{
157} 169}
158 170
159static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) 171static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
160{ 172{
161 int cpu; 173 int cpu;
162 174
@@ -164,13 +176,30 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
164 * We're using fixed IRQ delivery, can only return one phys APIC ID. 176 * We're using fixed IRQ delivery, can only return one phys APIC ID.
165 * May as well be the first. 177 * May as well be the first.
166 */ 178 */
167 cpu = first_cpu(cpumask); 179 cpu = cpumask_first(cpumask);
168 if ((unsigned)cpu < nr_cpu_ids) 180 if ((unsigned)cpu < nr_cpu_ids)
169 return per_cpu(x86_cpu_to_apicid, cpu); 181 return per_cpu(x86_cpu_to_apicid, cpu);
170 else 182 else
171 return BAD_APICID; 183 return BAD_APICID;
172} 184}
173 185
186static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
187 const struct cpumask *andmask)
188{
189 int cpu;
190
191 /*
192 * We're using fixed IRQ delivery, can only return one phys APIC ID.
193 * May as well be the first.
194 */
195 for_each_cpu_and(cpu, cpumask, andmask)
196 if (cpumask_test_cpu(cpu, cpu_online_mask))
197 break;
198 if (cpu < nr_cpu_ids)
199 return per_cpu(x86_cpu_to_apicid, cpu);
200 return BAD_APICID;
201}
202
174static unsigned int get_apic_id(unsigned long x) 203static unsigned int get_apic_id(unsigned long x)
175{ 204{
176 unsigned int id; 205 unsigned int id;
@@ -218,8 +247,10 @@ struct genapic apic_x2apic_uv_x = {
218 .send_IPI_all = uv_send_IPI_all, 247 .send_IPI_all = uv_send_IPI_all,
219 .send_IPI_allbutself = uv_send_IPI_allbutself, 248 .send_IPI_allbutself = uv_send_IPI_allbutself,
220 .send_IPI_mask = uv_send_IPI_mask, 249 .send_IPI_mask = uv_send_IPI_mask,
250 .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
221 .send_IPI_self = uv_send_IPI_self, 251 .send_IPI_self = uv_send_IPI_self,
222 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, 252 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
253 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
223 .phys_pkg_id = phys_pkg_id, 254 .phys_pkg_id = phys_pkg_id,
224 .get_apic_id = get_apic_id, 255 .get_apic_id = get_apic_id,
225 .set_apic_id = set_apic_id, 256 .set_apic_id = set_apic_id,
@@ -356,6 +387,103 @@ static __init void uv_rtc_init(void)
356} 387}
357 388
358/* 389/*
390 * percpu heartbeat timer
391 */
392static void uv_heartbeat(unsigned long ignored)
393{
394 struct timer_list *timer = &uv_hub_info->scir.timer;
395 unsigned char bits = uv_hub_info->scir.state;
396
397 /* flip heartbeat bit */
398 bits ^= SCIR_CPU_HEARTBEAT;
399
400 /* is this cpu idle? */
401 if (idle_cpu(raw_smp_processor_id()))
402 bits &= ~SCIR_CPU_ACTIVITY;
403 else
404 bits |= SCIR_CPU_ACTIVITY;
405
406 /* update system controller interface reg */
407 uv_set_scir_bits(bits);
408
409 /* enable next timer period */
410 mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
411}
412
413static void __cpuinit uv_heartbeat_enable(int cpu)
414{
415 if (!uv_cpu_hub_info(cpu)->scir.enabled) {
416 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
417
418 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
419 setup_timer(timer, uv_heartbeat, cpu);
420 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
421 add_timer_on(timer, cpu);
422 uv_cpu_hub_info(cpu)->scir.enabled = 1;
423 }
424
425 /* check boot cpu */
426 if (!uv_cpu_hub_info(0)->scir.enabled)
427 uv_heartbeat_enable(0);
428}
429
430#ifdef CONFIG_HOTPLUG_CPU
431static void __cpuinit uv_heartbeat_disable(int cpu)
432{
433 if (uv_cpu_hub_info(cpu)->scir.enabled) {
434 uv_cpu_hub_info(cpu)->scir.enabled = 0;
435 del_timer(&uv_cpu_hub_info(cpu)->scir.timer);
436 }
437 uv_set_cpu_scir_bits(cpu, 0xff);
438}
439
440/*
441 * cpu hotplug notifier
442 */
443static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self,
444 unsigned long action, void *hcpu)
445{
446 long cpu = (long)hcpu;
447
448 switch (action) {
449 case CPU_ONLINE:
450 uv_heartbeat_enable(cpu);
451 break;
452 case CPU_DOWN_PREPARE:
453 uv_heartbeat_disable(cpu);
454 break;
455 default:
456 break;
457 }
458 return NOTIFY_OK;
459}
460
461static __init void uv_scir_register_cpu_notifier(void)
462{
463 hotcpu_notifier(uv_scir_cpu_notify, 0);
464}
465
466#else /* !CONFIG_HOTPLUG_CPU */
467
468static __init void uv_scir_register_cpu_notifier(void)
469{
470}
471
472static __init int uv_init_heartbeat(void)
473{
474 int cpu;
475
476 if (is_uv_system())
477 for_each_online_cpu(cpu)
478 uv_heartbeat_enable(cpu);
479 return 0;
480}
481
482late_initcall(uv_init_heartbeat);
483
484#endif /* !CONFIG_HOTPLUG_CPU */
485
486/*
359 * Called on each cpu to initialize the per_cpu UV data area. 487 * Called on each cpu to initialize the per_cpu UV data area.
360 * ZZZ hotplug not supported yet 488 * ZZZ hotplug not supported yet
361 */ 489 */
@@ -428,7 +556,7 @@ void __init uv_system_init(void)
428 556
429 uv_bios_init(); 557 uv_bios_init();
430 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 558 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
431 &uv_coherency_id, &uv_region_size); 559 &sn_coherency_id, &sn_region_size);
432 uv_rtc_init(); 560 uv_rtc_init();
433 561
434 for_each_present_cpu(cpu) { 562 for_each_present_cpu(cpu) {
@@ -439,8 +567,7 @@ void __init uv_system_init(void)
439 uv_blade_info[blade].nr_possible_cpus++; 567 uv_blade_info[blade].nr_possible_cpus++;
440 568
441 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; 569 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
442 uv_cpu_hub_info(cpu)->lowmem_remap_top = 570 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
443 lowmem_redir_base + lowmem_redir_size;
444 uv_cpu_hub_info(cpu)->m_val = m_val; 571 uv_cpu_hub_info(cpu)->m_val = m_val;
445 uv_cpu_hub_info(cpu)->n_val = m_val; 572 uv_cpu_hub_info(cpu)->n_val = m_val;
446 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 573 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
@@ -450,7 +577,8 @@ void __init uv_system_init(void)
450 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 577 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
451 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 578 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
452 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 579 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
453 uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; 580 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
581 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
454 uv_node_to_blade[nid] = blade; 582 uv_node_to_blade[nid] = blade;
455 uv_cpu_to_blade[cpu] = blade; 583 uv_cpu_to_blade[cpu] = blade;
456 max_pnode = max(pnode, max_pnode); 584 max_pnode = max(pnode, max_pnode);
@@ -467,4 +595,6 @@ void __init uv_system_init(void)
467 map_mmioh_high(max_pnode); 595 map_mmioh_high(max_pnode);
468 596
469 uv_cpu_init(); 597 uv_cpu_init();
598 uv_scir_register_cpu_notifier();
599 proc_mkdir("sgi_uv", NULL);
470} 600}
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 1dcb0f13897e..3e66bd364a9d 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -35,7 +35,6 @@ void __init reserve_ebda_region(void)
35 35
36 /* start of EBDA area */ 36 /* start of EBDA area */
37 ebda_addr = get_bios_ebda(); 37 ebda_addr = get_bios_ebda();
38 printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
39 38
40 /* Fixup: bios puts an EBDA in the top 64K segment */ 39 /* Fixup: bios puts an EBDA in the top 64K segment */
41 /* of conventional memory, but does not adjust lowmem. */ 40 /* of conventional memory, but does not adjust lowmem. */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index fa1d25dd83e3..ac108d1fe182 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -12,9 +12,12 @@
12#include <asm/sections.h> 12#include <asm/sections.h>
13#include <asm/e820.h> 13#include <asm/e820.h>
14#include <asm/bios_ebda.h> 14#include <asm/bios_ebda.h>
15#include <asm/trampoline.h>
15 16
16void __init i386_start_kernel(void) 17void __init i386_start_kernel(void)
17{ 18{
19 reserve_trampoline_memory();
20
18 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 21 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
19 22
20#ifdef CONFIG_BLK_DEV_INITRD 23#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index d16084f90649..b9a4d8c4b935 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -24,9 +24,10 @@
24#include <asm/kdebug.h> 24#include <asm/kdebug.h>
25#include <asm/e820.h> 25#include <asm/e820.h>
26#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
27#include <asm/trampoline.h>
27 28
28/* boot cpu pda */ 29/* boot cpu pda */
29static struct x8664_pda _boot_cpu_pda __read_mostly; 30static struct x8664_pda _boot_cpu_pda;
30 31
31#ifdef CONFIG_SMP 32#ifdef CONFIG_SMP
32/* 33/*
@@ -120,6 +121,8 @@ void __init x86_64_start_reservations(char *real_mode_data)
120{ 121{
121 copy_bootdata(__va(real_mode_data)); 122 copy_bootdata(__va(real_mode_data));
122 123
124 reserve_trampoline_memory();
125
123 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 126 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
124 127
125#ifdef CONFIG_BLK_DEV_INITRD 128#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 067d8de913f6..cd759ad90690 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -33,7 +33,9 @@
33 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
34 */ 34 */
35unsigned long hpet_address; 35unsigned long hpet_address;
36unsigned long hpet_num_timers; 36#ifdef CONFIG_PCI_MSI
37static unsigned long hpet_num_timers;
38#endif
37static void __iomem *hpet_virt_address; 39static void __iomem *hpet_virt_address;
38 40
39struct hpet_dev { 41struct hpet_dev {
@@ -246,7 +248,7 @@ static void hpet_legacy_clockevent_register(void)
246 * Start hpet with the boot cpu mask and make it 248 * Start hpet with the boot cpu mask and make it
247 * global after the IO_APIC has been initialized. 249 * global after the IO_APIC has been initialized.
248 */ 250 */
249 hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); 251 hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
250 clockevents_register_device(&hpet_clockevent); 252 clockevents_register_device(&hpet_clockevent);
251 global_clock_event = &hpet_clockevent; 253 global_clock_event = &hpet_clockevent;
252 printk(KERN_DEBUG "hpet clockevent registered\n"); 254 printk(KERN_DEBUG "hpet clockevent registered\n");
@@ -301,7 +303,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
301 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); 303 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
302 hpet_setup_msi_irq(hdev->irq); 304 hpet_setup_msi_irq(hdev->irq);
303 disable_irq(hdev->irq); 305 disable_irq(hdev->irq);
304 irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu)); 306 irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
305 enable_irq(hdev->irq); 307 enable_irq(hdev->irq);
306 } 308 }
307 break; 309 break;
@@ -449,7 +451,7 @@ static int hpet_setup_irq(struct hpet_dev *dev)
449 return -1; 451 return -1;
450 452
451 disable_irq(dev->irq); 453 disable_irq(dev->irq);
452 irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); 454 irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
453 enable_irq(dev->irq); 455 enable_irq(dev->irq);
454 456
455 printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", 457 printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
@@ -500,7 +502,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
500 /* 5 usec minimum reprogramming delta. */ 502 /* 5 usec minimum reprogramming delta. */
501 evt->min_delta_ns = 5000; 503 evt->min_delta_ns = 5000;
502 504
503 evt->cpumask = cpumask_of_cpu(hdev->cpu); 505 evt->cpumask = cpumask_of(hdev->cpu);
504 clockevents_register_device(evt); 506 clockevents_register_device(evt);
505} 507}
506 508
@@ -811,7 +813,7 @@ int __init hpet_enable(void)
811 813
812out_nohpet: 814out_nohpet:
813 hpet_clear_mapping(); 815 hpet_clear_mapping();
814 boot_hpet_disable = 1; 816 hpet_address = 0;
815 return 0; 817 return 0;
816} 818}
817 819
@@ -834,10 +836,11 @@ static __init int hpet_late_init(void)
834 836
835 hpet_address = force_hpet_address; 837 hpet_address = force_hpet_address;
836 hpet_enable(); 838 hpet_enable();
837 if (!hpet_virt_address)
838 return -ENODEV;
839 } 839 }
840 840
841 if (!hpet_virt_address)
842 return -ENODEV;
843
841 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 844 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
842 845
843 for_each_online_cpu(cpu) { 846 for_each_online_cpu(cpu) {
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c1b5e3ece1f2..10f92fb532f3 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -114,7 +114,7 @@ void __init setup_pit_timer(void)
114 * Start pit with the boot cpu mask and make it global after the 114 * Start pit with the boot cpu mask and make it global after the
115 * IO_APIC has been initialized. 115 * IO_APIC has been initialized.
116 */ 116 */
117 pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); 117 pit_clockevent.cpumask = cpumask_of(smp_processor_id());
118 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 118 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
119 pit_clockevent.shift); 119 pit_clockevent.shift);
120 pit_clockevent.max_delta_ns = 120 pit_clockevent.max_delta_ns =
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index a4f93b4120c1..df3bf269beab 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -10,11 +10,9 @@
10#include <asm/pgtable.h> 10#include <asm/pgtable.h>
11#include <asm/desc.h> 11#include <asm/desc.h>
12 12
13static struct fs_struct init_fs = INIT_FS;
14static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 13static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
15static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
16struct mm_struct init_mm = INIT_MM(init_mm); 15struct mm_struct init_mm = INIT_MM(init_mm);
17EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
18 16
19/* 17/*
20 * Initial thread structure. 18 * Initial thread structure.
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 9043251210fb..3639442aa7a4 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -108,93 +108,276 @@ static int __init parse_noapic(char *str)
108early_param("noapic", parse_noapic); 108early_param("noapic", parse_noapic);
109 109
110struct irq_pin_list; 110struct irq_pin_list;
111
112/*
113 * This is performance-critical, we want to do it O(1)
114 *
115 * the indexing order of this array favors 1:1 mappings
116 * between pins and IRQs.
117 */
118
119struct irq_pin_list {
120 int apic, pin;
121 struct irq_pin_list *next;
122};
123
124static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
125{
126 struct irq_pin_list *pin;
127 int node;
128
129 node = cpu_to_node(cpu);
130
131 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
132 printk(KERN_DEBUG " alloc irq_2_pin on cpu %d node %d\n", cpu, node);
133
134 return pin;
135}
136
111struct irq_cfg { 137struct irq_cfg {
112 unsigned int irq;
113 struct irq_pin_list *irq_2_pin; 138 struct irq_pin_list *irq_2_pin;
114 cpumask_t domain; 139 cpumask_var_t domain;
115 cpumask_t old_domain; 140 cpumask_var_t old_domain;
116 unsigned move_cleanup_count; 141 unsigned move_cleanup_count;
117 u8 vector; 142 u8 vector;
118 u8 move_in_progress : 1; 143 u8 move_in_progress : 1;
144#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
145 u8 move_desc_pending : 1;
146#endif
119}; 147};
120 148
121/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 149/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
150#ifdef CONFIG_SPARSE_IRQ
151static struct irq_cfg irq_cfgx[] = {
152#else
122static struct irq_cfg irq_cfgx[NR_IRQS] = { 153static struct irq_cfg irq_cfgx[NR_IRQS] = {
123 [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, 154#endif
124 [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, 155 [0] = { .vector = IRQ0_VECTOR, },
125 [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, 156 [1] = { .vector = IRQ1_VECTOR, },
126 [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, 157 [2] = { .vector = IRQ2_VECTOR, },
127 [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, 158 [3] = { .vector = IRQ3_VECTOR, },
128 [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, 159 [4] = { .vector = IRQ4_VECTOR, },
129 [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, 160 [5] = { .vector = IRQ5_VECTOR, },
130 [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, 161 [6] = { .vector = IRQ6_VECTOR, },
131 [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, 162 [7] = { .vector = IRQ7_VECTOR, },
132 [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, 163 [8] = { .vector = IRQ8_VECTOR, },
133 [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, 164 [9] = { .vector = IRQ9_VECTOR, },
134 [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, 165 [10] = { .vector = IRQ10_VECTOR, },
135 [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, 166 [11] = { .vector = IRQ11_VECTOR, },
136 [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, 167 [12] = { .vector = IRQ12_VECTOR, },
137 [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, 168 [13] = { .vector = IRQ13_VECTOR, },
138 [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, 169 [14] = { .vector = IRQ14_VECTOR, },
170 [15] = { .vector = IRQ15_VECTOR, },
139}; 171};
140 172
141#define for_each_irq_cfg(irq, cfg) \ 173int __init arch_early_irq_init(void)
142 for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) 174{
175 struct irq_cfg *cfg;
176 struct irq_desc *desc;
177 int count;
178 int i;
179
180 cfg = irq_cfgx;
181 count = ARRAY_SIZE(irq_cfgx);
143 182
183 for (i = 0; i < count; i++) {
184 desc = irq_to_desc(i);
185 desc->chip_data = &cfg[i];
186 alloc_bootmem_cpumask_var(&cfg[i].domain);
187 alloc_bootmem_cpumask_var(&cfg[i].old_domain);
188 if (i < NR_IRQS_LEGACY)
189 cpumask_setall(cfg[i].domain);
190 }
191
192 return 0;
193}
194
195#ifdef CONFIG_SPARSE_IRQ
144static struct irq_cfg *irq_cfg(unsigned int irq) 196static struct irq_cfg *irq_cfg(unsigned int irq)
145{ 197{
146 return irq < nr_irqs ? irq_cfgx + irq : NULL; 198 struct irq_cfg *cfg = NULL;
199 struct irq_desc *desc;
200
201 desc = irq_to_desc(irq);
202 if (desc)
203 cfg = desc->chip_data;
204
205 return cfg;
147} 206}
148 207
149static struct irq_cfg *irq_cfg_alloc(unsigned int irq) 208static struct irq_cfg *get_one_free_irq_cfg(int cpu)
150{ 209{
151 return irq_cfg(irq); 210 struct irq_cfg *cfg;
211 int node;
212
213 node = cpu_to_node(cpu);
214
215 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
216 if (cfg) {
217 if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
218 kfree(cfg);
219 cfg = NULL;
220 } else if (!alloc_cpumask_var_node(&cfg->old_domain,
221 GFP_ATOMIC, node)) {
222 free_cpumask_var(cfg->domain);
223 kfree(cfg);
224 cfg = NULL;
225 } else {
226 cpumask_clear(cfg->domain);
227 cpumask_clear(cfg->old_domain);
228 }
229 }
230 printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node);
231
232 return cfg;
152} 233}
153 234
154/* 235int arch_init_chip_data(struct irq_desc *desc, int cpu)
155 * Rough estimation of how many shared IRQs there are, can be changed 236{
156 * anytime. 237 struct irq_cfg *cfg;
157 */
158#define MAX_PLUS_SHARED_IRQS NR_IRQS
159#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
160 238
161/* 239 cfg = desc->chip_data;
162 * This is performance-critical, we want to do it O(1) 240 if (!cfg) {
163 * 241 desc->chip_data = get_one_free_irq_cfg(cpu);
164 * the indexing order of this array favors 1:1 mappings 242 if (!desc->chip_data) {
165 * between pins and IRQs. 243 printk(KERN_ERR "can not alloc irq_cfg\n");
166 */ 244 BUG_ON(1);
245 }
246 }
167 247
168struct irq_pin_list { 248 return 0;
169 int apic, pin; 249}
170 struct irq_pin_list *next;
171};
172 250
173static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; 251#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
174static struct irq_pin_list *irq_2_pin_ptr;
175 252
176static void __init irq_2_pin_init(void) 253static void
254init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
177{ 255{
178 struct irq_pin_list *pin = irq_2_pin_head; 256 struct irq_pin_list *old_entry, *head, *tail, *entry;
179 int i; 257
258 cfg->irq_2_pin = NULL;
259 old_entry = old_cfg->irq_2_pin;
260 if (!old_entry)
261 return;
262
263 entry = get_one_free_irq_2_pin(cpu);
264 if (!entry)
265 return;
266
267 entry->apic = old_entry->apic;
268 entry->pin = old_entry->pin;
269 head = entry;
270 tail = entry;
271 old_entry = old_entry->next;
272 while (old_entry) {
273 entry = get_one_free_irq_2_pin(cpu);
274 if (!entry) {
275 entry = head;
276 while (entry) {
277 head = entry->next;
278 kfree(entry);
279 entry = head;
280 }
281 /* still use the old one */
282 return;
283 }
284 entry->apic = old_entry->apic;
285 entry->pin = old_entry->pin;
286 tail->next = entry;
287 tail = entry;
288 old_entry = old_entry->next;
289 }
180 290
181 for (i = 1; i < PIN_MAP_SIZE; i++) 291 tail->next = NULL;
182 pin[i-1].next = &pin[i]; 292 cfg->irq_2_pin = head;
293}
294
295static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
296{
297 struct irq_pin_list *entry, *next;
298
299 if (old_cfg->irq_2_pin == cfg->irq_2_pin)
300 return;
183 301
184 irq_2_pin_ptr = &pin[0]; 302 entry = old_cfg->irq_2_pin;
303
304 while (entry) {
305 next = entry->next;
306 kfree(entry);
307 entry = next;
308 }
309 old_cfg->irq_2_pin = NULL;
185} 310}
186 311
187static struct irq_pin_list *get_one_free_irq_2_pin(void) 312void arch_init_copy_chip_data(struct irq_desc *old_desc,
313 struct irq_desc *desc, int cpu)
188{ 314{
189 struct irq_pin_list *pin = irq_2_pin_ptr; 315 struct irq_cfg *cfg;
316 struct irq_cfg *old_cfg;
190 317
191 if (!pin) 318 cfg = get_one_free_irq_cfg(cpu);
192 panic("can not get more irq_2_pin\n");
193 319
194 irq_2_pin_ptr = pin->next; 320 if (!cfg)
195 pin->next = NULL; 321 return;
196 return pin; 322
323 desc->chip_data = cfg;
324
325 old_cfg = old_desc->chip_data;
326
327 memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
328
329 init_copy_irq_2_pin(old_cfg, cfg, cpu);
330}
331
332static void free_irq_cfg(struct irq_cfg *old_cfg)
333{
334 kfree(old_cfg);
335}
336
337void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
338{
339 struct irq_cfg *old_cfg, *cfg;
340
341 old_cfg = old_desc->chip_data;
342 cfg = desc->chip_data;
343
344 if (old_cfg == cfg)
345 return;
346
347 if (old_cfg) {
348 free_irq_2_pin(old_cfg, cfg);
349 free_irq_cfg(old_cfg);
350 old_desc->chip_data = NULL;
351 }
352}
353
354static void
355set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
356{
357 struct irq_cfg *cfg = desc->chip_data;
358
359 if (!cfg->move_in_progress) {
360 /* it means that domain is not changed */
361 if (!cpumask_intersects(&desc->affinity, mask))
362 cfg->move_desc_pending = 1;
363 }
197} 364}
365#endif
366
367#else
368static struct irq_cfg *irq_cfg(unsigned int irq)
369{
370 return irq < nr_irqs ? irq_cfgx + irq : NULL;
371}
372
373#endif
374
375#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
376static inline void
377set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
378{
379}
380#endif
198 381
199struct io_apic { 382struct io_apic {
200 unsigned int index; 383 unsigned int index;
@@ -237,11 +420,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
237 writel(value, &io_apic->data); 420 writel(value, &io_apic->data);
238} 421}
239 422
240static bool io_apic_level_ack_pending(unsigned int irq) 423static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
241{ 424{
242 struct irq_pin_list *entry; 425 struct irq_pin_list *entry;
243 unsigned long flags; 426 unsigned long flags;
244 struct irq_cfg *cfg = irq_cfg(irq);
245 427
246 spin_lock_irqsave(&ioapic_lock, flags); 428 spin_lock_irqsave(&ioapic_lock, flags);
247 entry = cfg->irq_2_pin; 429 entry = cfg->irq_2_pin;
@@ -323,13 +505,32 @@ static void ioapic_mask_entry(int apic, int pin)
323} 505}
324 506
325#ifdef CONFIG_SMP 507#ifdef CONFIG_SMP
326static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) 508static void send_cleanup_vector(struct irq_cfg *cfg)
509{
510 cpumask_var_t cleanup_mask;
511
512 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
513 unsigned int i;
514 cfg->move_cleanup_count = 0;
515 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
516 cfg->move_cleanup_count++;
517 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
518 send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
519 } else {
520 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
521 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
522 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
523 free_cpumask_var(cleanup_mask);
524 }
525 cfg->move_in_progress = 0;
526}
527
528static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
327{ 529{
328 int apic, pin; 530 int apic, pin;
329 struct irq_cfg *cfg;
330 struct irq_pin_list *entry; 531 struct irq_pin_list *entry;
532 u8 vector = cfg->vector;
331 533
332 cfg = irq_cfg(irq);
333 entry = cfg->irq_2_pin; 534 entry = cfg->irq_2_pin;
334 for (;;) { 535 for (;;) {
335 unsigned int reg; 536 unsigned int reg;
@@ -359,36 +560,61 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
359 } 560 }
360} 561}
361 562
362static int assign_irq_vector(int irq, cpumask_t mask); 563static int
564assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
363 565
364static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) 566/*
567 * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
568 * of that, or returns BAD_APICID and leaves desc->affinity untouched.
569 */
570static unsigned int
571set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
572{
573 struct irq_cfg *cfg;
574 unsigned int irq;
575
576 if (!cpumask_intersects(mask, cpu_online_mask))
577 return BAD_APICID;
578
579 irq = desc->irq;
580 cfg = desc->chip_data;
581 if (assign_irq_vector(irq, cfg, mask))
582 return BAD_APICID;
583
584 cpumask_and(&desc->affinity, cfg->domain, mask);
585 set_extra_move_desc(desc, mask);
586 return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
587}
588
589static void
590set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
365{ 591{
366 struct irq_cfg *cfg; 592 struct irq_cfg *cfg;
367 unsigned long flags; 593 unsigned long flags;
368 unsigned int dest; 594 unsigned int dest;
369 cpumask_t tmp; 595 unsigned int irq;
370 struct irq_desc *desc;
371 596
372 cpus_and(tmp, mask, cpu_online_map); 597 irq = desc->irq;
373 if (cpus_empty(tmp)) 598 cfg = desc->chip_data;
374 return;
375 599
376 cfg = irq_cfg(irq); 600 spin_lock_irqsave(&ioapic_lock, flags);
377 if (assign_irq_vector(irq, mask)) 601 dest = set_desc_affinity(desc, mask);
378 return; 602 if (dest != BAD_APICID) {
603 /* Only the high 8 bits are valid. */
604 dest = SET_APIC_LOGICAL_ID(dest);
605 __target_IO_APIC_irq(irq, dest, cfg);
606 }
607 spin_unlock_irqrestore(&ioapic_lock, flags);
608}
379 609
380 cpus_and(tmp, cfg->domain, mask); 610static void
381 dest = cpu_mask_to_apicid(tmp); 611set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
382 /* 612{
383 * Only the high 8 bits are valid. 613 struct irq_desc *desc;
384 */
385 dest = SET_APIC_LOGICAL_ID(dest);
386 614
387 desc = irq_to_desc(irq); 615 desc = irq_to_desc(irq);
388 spin_lock_irqsave(&ioapic_lock, flags); 616
389 __target_IO_APIC_irq(irq, dest, cfg->vector); 617 set_ioapic_affinity_irq_desc(desc, mask);
390 desc->affinity = mask;
391 spin_unlock_irqrestore(&ioapic_lock, flags);
392} 618}
393#endif /* CONFIG_SMP */ 619#endif /* CONFIG_SMP */
394 620
@@ -397,16 +623,18 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
397 * shared ISA-space IRQs, so we have to support them. We are super 623 * shared ISA-space IRQs, so we have to support them. We are super
398 * fast in the common case, and fast for shared ISA-space IRQs. 624 * fast in the common case, and fast for shared ISA-space IRQs.
399 */ 625 */
400static void add_pin_to_irq(unsigned int irq, int apic, int pin) 626static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
401{ 627{
402 struct irq_cfg *cfg;
403 struct irq_pin_list *entry; 628 struct irq_pin_list *entry;
404 629
405 /* first time to refer irq_cfg, so with new */
406 cfg = irq_cfg_alloc(irq);
407 entry = cfg->irq_2_pin; 630 entry = cfg->irq_2_pin;
408 if (!entry) { 631 if (!entry) {
409 entry = get_one_free_irq_2_pin(); 632 entry = get_one_free_irq_2_pin(cpu);
633 if (!entry) {
634 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
635 apic, pin);
636 return;
637 }
410 cfg->irq_2_pin = entry; 638 cfg->irq_2_pin = entry;
411 entry->apic = apic; 639 entry->apic = apic;
412 entry->pin = pin; 640 entry->pin = pin;
@@ -421,7 +649,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
421 entry = entry->next; 649 entry = entry->next;
422 } 650 }
423 651
424 entry->next = get_one_free_irq_2_pin(); 652 entry->next = get_one_free_irq_2_pin(cpu);
425 entry = entry->next; 653 entry = entry->next;
426 entry->apic = apic; 654 entry->apic = apic;
427 entry->pin = pin; 655 entry->pin = pin;
@@ -430,11 +658,10 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
430/* 658/*
431 * Reroute an IRQ to a different pin. 659 * Reroute an IRQ to a different pin.
432 */ 660 */
433static void __init replace_pin_at_irq(unsigned int irq, 661static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
434 int oldapic, int oldpin, 662 int oldapic, int oldpin,
435 int newapic, int newpin) 663 int newapic, int newpin)
436{ 664{
437 struct irq_cfg *cfg = irq_cfg(irq);
438 struct irq_pin_list *entry = cfg->irq_2_pin; 665 struct irq_pin_list *entry = cfg->irq_2_pin;
439 int replaced = 0; 666 int replaced = 0;
440 667
@@ -451,18 +678,16 @@ static void __init replace_pin_at_irq(unsigned int irq,
451 678
452 /* why? call replace before add? */ 679 /* why? call replace before add? */
453 if (!replaced) 680 if (!replaced)
454 add_pin_to_irq(irq, newapic, newpin); 681 add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
455} 682}
456 683
457static inline void io_apic_modify_irq(unsigned int irq, 684static inline void io_apic_modify_irq(struct irq_cfg *cfg,
458 int mask_and, int mask_or, 685 int mask_and, int mask_or,
459 void (*final)(struct irq_pin_list *entry)) 686 void (*final)(struct irq_pin_list *entry))
460{ 687{
461 int pin; 688 int pin;
462 struct irq_cfg *cfg;
463 struct irq_pin_list *entry; 689 struct irq_pin_list *entry;
464 690
465 cfg = irq_cfg(irq);
466 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { 691 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
467 unsigned int reg; 692 unsigned int reg;
468 pin = entry->pin; 693 pin = entry->pin;
@@ -475,13 +700,13 @@ static inline void io_apic_modify_irq(unsigned int irq,
475 } 700 }
476} 701}
477 702
478static void __unmask_IO_APIC_irq(unsigned int irq) 703static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
479{ 704{
480 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); 705 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
481} 706}
482 707
483#ifdef CONFIG_X86_64 708#ifdef CONFIG_X86_64
484void io_apic_sync(struct irq_pin_list *entry) 709static void io_apic_sync(struct irq_pin_list *entry)
485{ 710{
486 /* 711 /*
487 * Synchronize the IO-APIC and the CPU by doing 712 * Synchronize the IO-APIC and the CPU by doing
@@ -492,47 +717,64 @@ void io_apic_sync(struct irq_pin_list *entry)
492 readl(&io_apic->data); 717 readl(&io_apic->data);
493} 718}
494 719
495static void __mask_IO_APIC_irq(unsigned int irq) 720static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
496{ 721{
497 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 722 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
498} 723}
499#else /* CONFIG_X86_32 */ 724#else /* CONFIG_X86_32 */
500static void __mask_IO_APIC_irq(unsigned int irq) 725static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
501{ 726{
502 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); 727 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
503} 728}
504 729
505static void __mask_and_edge_IO_APIC_irq(unsigned int irq) 730static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
506{ 731{
507 io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, 732 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
508 IO_APIC_REDIR_MASKED, NULL); 733 IO_APIC_REDIR_MASKED, NULL);
509} 734}
510 735
511static void __unmask_and_level_IO_APIC_irq(unsigned int irq) 736static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
512{ 737{
513 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 738 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
514 IO_APIC_REDIR_LEVEL_TRIGGER, NULL); 739 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
515} 740}
516#endif /* CONFIG_X86_32 */ 741#endif /* CONFIG_X86_32 */
517 742
518static void mask_IO_APIC_irq (unsigned int irq) 743static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
519{ 744{
745 struct irq_cfg *cfg = desc->chip_data;
520 unsigned long flags; 746 unsigned long flags;
521 747
748 BUG_ON(!cfg);
749
522 spin_lock_irqsave(&ioapic_lock, flags); 750 spin_lock_irqsave(&ioapic_lock, flags);
523 __mask_IO_APIC_irq(irq); 751 __mask_IO_APIC_irq(cfg);
524 spin_unlock_irqrestore(&ioapic_lock, flags); 752 spin_unlock_irqrestore(&ioapic_lock, flags);
525} 753}
526 754
527static void unmask_IO_APIC_irq (unsigned int irq) 755static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
528{ 756{
757 struct irq_cfg *cfg = desc->chip_data;
529 unsigned long flags; 758 unsigned long flags;
530 759
531 spin_lock_irqsave(&ioapic_lock, flags); 760 spin_lock_irqsave(&ioapic_lock, flags);
532 __unmask_IO_APIC_irq(irq); 761 __unmask_IO_APIC_irq(cfg);
533 spin_unlock_irqrestore(&ioapic_lock, flags); 762 spin_unlock_irqrestore(&ioapic_lock, flags);
534} 763}
535 764
765static void mask_IO_APIC_irq(unsigned int irq)
766{
767 struct irq_desc *desc = irq_to_desc(irq);
768
769 mask_IO_APIC_irq_desc(desc);
770}
771static void unmask_IO_APIC_irq(unsigned int irq)
772{
773 struct irq_desc *desc = irq_to_desc(irq);
774
775 unmask_IO_APIC_irq_desc(desc);
776}
777
536static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 778static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
537{ 779{
538 struct IO_APIC_route_entry entry; 780 struct IO_APIC_route_entry entry;
@@ -809,7 +1051,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
809 */ 1051 */
810static int EISA_ELCR(unsigned int irq) 1052static int EISA_ELCR(unsigned int irq)
811{ 1053{
812 if (irq < 16) { 1054 if (irq < NR_IRQS_LEGACY) {
813 unsigned int port = 0x4d0 + (irq >> 3); 1055 unsigned int port = 0x4d0 + (irq >> 3);
814 return (inb(port) >> (irq & 7)) & 1; 1056 return (inb(port) >> (irq & 7)) & 1;
815 } 1057 }
@@ -1034,7 +1276,8 @@ void unlock_vector_lock(void)
1034 spin_unlock(&vector_lock); 1276 spin_unlock(&vector_lock);
1035} 1277}
1036 1278
1037static int __assign_irq_vector(int irq, cpumask_t mask) 1279static int
1280__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1038{ 1281{
1039 /* 1282 /*
1040 * NOTE! The local APIC isn't very good at handling 1283 * NOTE! The local APIC isn't very good at handling
@@ -1049,52 +1292,49 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
1049 */ 1292 */
1050 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; 1293 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
1051 unsigned int old_vector; 1294 unsigned int old_vector;
1052 int cpu; 1295 int cpu, err;
1053 struct irq_cfg *cfg; 1296 cpumask_var_t tmp_mask;
1054
1055 cfg = irq_cfg(irq);
1056
1057 /* Only try and allocate irqs on cpus that are present */
1058 cpus_and(mask, mask, cpu_online_map);
1059 1297
1060 if ((cfg->move_in_progress) || cfg->move_cleanup_count) 1298 if ((cfg->move_in_progress) || cfg->move_cleanup_count)
1061 return -EBUSY; 1299 return -EBUSY;
1062 1300
1301 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
1302 return -ENOMEM;
1303
1063 old_vector = cfg->vector; 1304 old_vector = cfg->vector;
1064 if (old_vector) { 1305 if (old_vector) {
1065 cpumask_t tmp; 1306 cpumask_and(tmp_mask, mask, cpu_online_mask);
1066 cpus_and(tmp, cfg->domain, mask); 1307 cpumask_and(tmp_mask, cfg->domain, tmp_mask);
1067 if (!cpus_empty(tmp)) 1308 if (!cpumask_empty(tmp_mask)) {
1309 free_cpumask_var(tmp_mask);
1068 return 0; 1310 return 0;
1311 }
1069 } 1312 }
1070 1313
1071 for_each_cpu_mask_nr(cpu, mask) { 1314 /* Only try and allocate irqs on cpus that are present */
1072 cpumask_t domain, new_mask; 1315 err = -ENOSPC;
1316 for_each_cpu_and(cpu, mask, cpu_online_mask) {
1073 int new_cpu; 1317 int new_cpu;
1074 int vector, offset; 1318 int vector, offset;
1075 1319
1076 domain = vector_allocation_domain(cpu); 1320 vector_allocation_domain(cpu, tmp_mask);
1077 cpus_and(new_mask, domain, cpu_online_map);
1078 1321
1079 vector = current_vector; 1322 vector = current_vector;
1080 offset = current_offset; 1323 offset = current_offset;
1081next: 1324next:
1082 vector += 8; 1325 vector += 8;
1083 if (vector >= first_system_vector) { 1326 if (vector >= first_system_vector) {
1084 /* If we run out of vectors on large boxen, must share them. */ 1327 /* If out of vectors on large boxen, must share them. */
1085 offset = (offset + 1) % 8; 1328 offset = (offset + 1) % 8;
1086 vector = FIRST_DEVICE_VECTOR + offset; 1329 vector = FIRST_DEVICE_VECTOR + offset;
1087 } 1330 }
1088 if (unlikely(current_vector == vector)) 1331 if (unlikely(current_vector == vector))
1089 continue; 1332 continue;
1090#ifdef CONFIG_X86_64 1333
1091 if (vector == IA32_SYSCALL_VECTOR) 1334 if (test_bit(vector, used_vectors))
1092 goto next;
1093#else
1094 if (vector == SYSCALL_VECTOR)
1095 goto next; 1335 goto next;
1096#endif 1336
1097 for_each_cpu_mask_nr(new_cpu, new_mask) 1337 for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
1098 if (per_cpu(vector_irq, new_cpu)[vector] != -1) 1338 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
1099 goto next; 1339 goto next;
1100 /* Found one! */ 1340 /* Found one! */
@@ -1102,49 +1342,47 @@ next:
1102 current_offset = offset; 1342 current_offset = offset;
1103 if (old_vector) { 1343 if (old_vector) {
1104 cfg->move_in_progress = 1; 1344 cfg->move_in_progress = 1;
1105 cfg->old_domain = cfg->domain; 1345 cpumask_copy(cfg->old_domain, cfg->domain);
1106 } 1346 }
1107 for_each_cpu_mask_nr(new_cpu, new_mask) 1347 for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
1108 per_cpu(vector_irq, new_cpu)[vector] = irq; 1348 per_cpu(vector_irq, new_cpu)[vector] = irq;
1109 cfg->vector = vector; 1349 cfg->vector = vector;
1110 cfg->domain = domain; 1350 cpumask_copy(cfg->domain, tmp_mask);
1111 return 0; 1351 err = 0;
1352 break;
1112 } 1353 }
1113 return -ENOSPC; 1354 free_cpumask_var(tmp_mask);
1355 return err;
1114} 1356}
1115 1357
1116static int assign_irq_vector(int irq, cpumask_t mask) 1358static int
1359assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1117{ 1360{
1118 int err; 1361 int err;
1119 unsigned long flags; 1362 unsigned long flags;
1120 1363
1121 spin_lock_irqsave(&vector_lock, flags); 1364 spin_lock_irqsave(&vector_lock, flags);
1122 err = __assign_irq_vector(irq, mask); 1365 err = __assign_irq_vector(irq, cfg, mask);
1123 spin_unlock_irqrestore(&vector_lock, flags); 1366 spin_unlock_irqrestore(&vector_lock, flags);
1124 return err; 1367 return err;
1125} 1368}
1126 1369
1127static void __clear_irq_vector(int irq) 1370static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1128{ 1371{
1129 struct irq_cfg *cfg;
1130 cpumask_t mask;
1131 int cpu, vector; 1372 int cpu, vector;
1132 1373
1133 cfg = irq_cfg(irq);
1134 BUG_ON(!cfg->vector); 1374 BUG_ON(!cfg->vector);
1135 1375
1136 vector = cfg->vector; 1376 vector = cfg->vector;
1137 cpus_and(mask, cfg->domain, cpu_online_map); 1377 for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
1138 for_each_cpu_mask_nr(cpu, mask)
1139 per_cpu(vector_irq, cpu)[vector] = -1; 1378 per_cpu(vector_irq, cpu)[vector] = -1;
1140 1379
1141 cfg->vector = 0; 1380 cfg->vector = 0;
1142 cpus_clear(cfg->domain); 1381 cpumask_clear(cfg->domain);
1143 1382
1144 if (likely(!cfg->move_in_progress)) 1383 if (likely(!cfg->move_in_progress))
1145 return; 1384 return;
1146 cpus_and(mask, cfg->old_domain, cpu_online_map); 1385 for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
1147 for_each_cpu_mask_nr(cpu, mask) {
1148 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; 1386 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
1149 vector++) { 1387 vector++) {
1150 if (per_cpu(vector_irq, cpu)[vector] != irq) 1388 if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -1162,10 +1400,12 @@ void __setup_vector_irq(int cpu)
1162 /* This function must be called with vector_lock held */ 1400 /* This function must be called with vector_lock held */
1163 int irq, vector; 1401 int irq, vector;
1164 struct irq_cfg *cfg; 1402 struct irq_cfg *cfg;
1403 struct irq_desc *desc;
1165 1404
1166 /* Mark the inuse vectors */ 1405 /* Mark the inuse vectors */
1167 for_each_irq_cfg(irq, cfg) { 1406 for_each_irq_desc(irq, desc) {
1168 if (!cpu_isset(cpu, cfg->domain)) 1407 cfg = desc->chip_data;
1408 if (!cpumask_test_cpu(cpu, cfg->domain))
1169 continue; 1409 continue;
1170 vector = cfg->vector; 1410 vector = cfg->vector;
1171 per_cpu(vector_irq, cpu)[vector] = irq; 1411 per_cpu(vector_irq, cpu)[vector] = irq;
@@ -1177,7 +1417,7 @@ void __setup_vector_irq(int cpu)
1177 continue; 1417 continue;
1178 1418
1179 cfg = irq_cfg(irq); 1419 cfg = irq_cfg(irq);
1180 if (!cpu_isset(cpu, cfg->domain)) 1420 if (!cpumask_test_cpu(cpu, cfg->domain))
1181 per_cpu(vector_irq, cpu)[vector] = -1; 1421 per_cpu(vector_irq, cpu)[vector] = -1;
1182 } 1422 }
1183} 1423}
@@ -1215,11 +1455,8 @@ static inline int IO_APIC_irq_trigger(int irq)
1215} 1455}
1216#endif 1456#endif
1217 1457
1218static void ioapic_register_intr(int irq, unsigned long trigger) 1458static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
1219{ 1459{
1220 struct irq_desc *desc;
1221
1222 desc = irq_to_desc(irq);
1223 1460
1224 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1461 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1225 trigger == IOAPIC_LEVEL) 1462 trigger == IOAPIC_LEVEL)
@@ -1311,23 +1548,22 @@ static int setup_ioapic_entry(int apic, int irq,
1311 return 0; 1548 return 0;
1312} 1549}
1313 1550
1314static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, 1551static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
1315 int trigger, int polarity) 1552 int trigger, int polarity)
1316{ 1553{
1317 struct irq_cfg *cfg; 1554 struct irq_cfg *cfg;
1318 struct IO_APIC_route_entry entry; 1555 struct IO_APIC_route_entry entry;
1319 cpumask_t mask; 1556 unsigned int dest;
1320 1557
1321 if (!IO_APIC_IRQ(irq)) 1558 if (!IO_APIC_IRQ(irq))
1322 return; 1559 return;
1323 1560
1324 cfg = irq_cfg(irq); 1561 cfg = desc->chip_data;
1325 1562
1326 mask = TARGET_CPUS; 1563 if (assign_irq_vector(irq, cfg, TARGET_CPUS))
1327 if (assign_irq_vector(irq, mask))
1328 return; 1564 return;
1329 1565
1330 cpus_and(mask, cfg->domain, mask); 1566 dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
1331 1567
1332 apic_printk(APIC_VERBOSE,KERN_DEBUG 1568 apic_printk(APIC_VERBOSE,KERN_DEBUG
1333 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1569 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@ -1337,16 +1573,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
1337 1573
1338 1574
1339 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, 1575 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
1340 cpu_mask_to_apicid(mask), trigger, polarity, 1576 dest, trigger, polarity, cfg->vector)) {
1341 cfg->vector)) {
1342 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1577 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1343 mp_ioapics[apic].mp_apicid, pin); 1578 mp_ioapics[apic].mp_apicid, pin);
1344 __clear_irq_vector(irq); 1579 __clear_irq_vector(irq, cfg);
1345 return; 1580 return;
1346 } 1581 }
1347 1582
1348 ioapic_register_intr(irq, trigger); 1583 ioapic_register_intr(irq, desc, trigger);
1349 if (irq < 16) 1584 if (irq < NR_IRQS_LEGACY)
1350 disable_8259A_irq(irq); 1585 disable_8259A_irq(irq);
1351 1586
1352 ioapic_write_entry(apic, pin, entry); 1587 ioapic_write_entry(apic, pin, entry);
@@ -1356,6 +1591,9 @@ static void __init setup_IO_APIC_irqs(void)
1356{ 1591{
1357 int apic, pin, idx, irq; 1592 int apic, pin, idx, irq;
1358 int notcon = 0; 1593 int notcon = 0;
1594 struct irq_desc *desc;
1595 struct irq_cfg *cfg;
1596 int cpu = boot_cpu_id;
1359 1597
1360 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1598 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1361 1599
@@ -1387,9 +1625,15 @@ static void __init setup_IO_APIC_irqs(void)
1387 if (multi_timer_check(apic, irq)) 1625 if (multi_timer_check(apic, irq))
1388 continue; 1626 continue;
1389#endif 1627#endif
1390 add_pin_to_irq(irq, apic, pin); 1628 desc = irq_to_desc_alloc_cpu(irq, cpu);
1629 if (!desc) {
1630 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1631 continue;
1632 }
1633 cfg = desc->chip_data;
1634 add_pin_to_irq_cpu(cfg, cpu, apic, pin);
1391 1635
1392 setup_IO_APIC_irq(apic, pin, irq, 1636 setup_IO_APIC_irq(apic, pin, irq, desc,
1393 irq_trigger(idx), irq_polarity(idx)); 1637 irq_trigger(idx), irq_polarity(idx));
1394 } 1638 }
1395 } 1639 }
@@ -1448,6 +1692,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1448 union IO_APIC_reg_03 reg_03; 1692 union IO_APIC_reg_03 reg_03;
1449 unsigned long flags; 1693 unsigned long flags;
1450 struct irq_cfg *cfg; 1694 struct irq_cfg *cfg;
1695 struct irq_desc *desc;
1451 unsigned int irq; 1696 unsigned int irq;
1452 1697
1453 if (apic_verbosity == APIC_QUIET) 1698 if (apic_verbosity == APIC_QUIET)
@@ -1537,8 +1782,11 @@ __apicdebuginit(void) print_IO_APIC(void)
1537 } 1782 }
1538 } 1783 }
1539 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1784 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1540 for_each_irq_cfg(irq, cfg) { 1785 for_each_irq_desc(irq, desc) {
1541 struct irq_pin_list *entry = cfg->irq_2_pin; 1786 struct irq_pin_list *entry;
1787
1788 cfg = desc->chip_data;
1789 entry = cfg->irq_2_pin;
1542 if (!entry) 1790 if (!entry)
1543 continue; 1791 continue;
1544 printk(KERN_DEBUG "IRQ%d ", irq); 1792 printk(KERN_DEBUG "IRQ%d ", irq);
@@ -2022,14 +2270,16 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2022{ 2270{
2023 int was_pending = 0; 2271 int was_pending = 0;
2024 unsigned long flags; 2272 unsigned long flags;
2273 struct irq_cfg *cfg;
2025 2274
2026 spin_lock_irqsave(&ioapic_lock, flags); 2275 spin_lock_irqsave(&ioapic_lock, flags);
2027 if (irq < 16) { 2276 if (irq < NR_IRQS_LEGACY) {
2028 disable_8259A_irq(irq); 2277 disable_8259A_irq(irq);
2029 if (i8259A_irq_pending(irq)) 2278 if (i8259A_irq_pending(irq))
2030 was_pending = 1; 2279 was_pending = 1;
2031 } 2280 }
2032 __unmask_IO_APIC_irq(irq); 2281 cfg = irq_cfg(irq);
2282 __unmask_IO_APIC_irq(cfg);
2033 spin_unlock_irqrestore(&ioapic_lock, flags); 2283 spin_unlock_irqrestore(&ioapic_lock, flags);
2034 2284
2035 return was_pending; 2285 return was_pending;
@@ -2043,7 +2293,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
2043 unsigned long flags; 2293 unsigned long flags;
2044 2294
2045 spin_lock_irqsave(&vector_lock, flags); 2295 spin_lock_irqsave(&vector_lock, flags);
2046 send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); 2296 send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
2047 spin_unlock_irqrestore(&vector_lock, flags); 2297 spin_unlock_irqrestore(&vector_lock, flags);
2048 2298
2049 return 1; 2299 return 1;
@@ -2092,35 +2342,35 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
2092 * as simple as edge triggered migration and we can do the irq migration 2342 * as simple as edge triggered migration and we can do the irq migration
2093 * with a simple atomic update to IO-APIC RTE. 2343 * with a simple atomic update to IO-APIC RTE.
2094 */ 2344 */
2095static void migrate_ioapic_irq(int irq, cpumask_t mask) 2345static void
2346migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2096{ 2347{
2097 struct irq_cfg *cfg; 2348 struct irq_cfg *cfg;
2098 struct irq_desc *desc;
2099 cpumask_t tmp, cleanup_mask;
2100 struct irte irte; 2349 struct irte irte;
2101 int modify_ioapic_rte; 2350 int modify_ioapic_rte;
2102 unsigned int dest; 2351 unsigned int dest;
2103 unsigned long flags; 2352 unsigned long flags;
2353 unsigned int irq;
2104 2354
2105 cpus_and(tmp, mask, cpu_online_map); 2355 if (!cpumask_intersects(mask, cpu_online_mask))
2106 if (cpus_empty(tmp))
2107 return; 2356 return;
2108 2357
2358 irq = desc->irq;
2109 if (get_irte(irq, &irte)) 2359 if (get_irte(irq, &irte))
2110 return; 2360 return;
2111 2361
2112 if (assign_irq_vector(irq, mask)) 2362 cfg = desc->chip_data;
2363 if (assign_irq_vector(irq, cfg, mask))
2113 return; 2364 return;
2114 2365
2115 cfg = irq_cfg(irq); 2366 set_extra_move_desc(desc, mask);
2116 cpus_and(tmp, cfg->domain, mask); 2367
2117 dest = cpu_mask_to_apicid(tmp); 2368 dest = cpu_mask_to_apicid_and(cfg->domain, mask);
2118 2369
2119 desc = irq_to_desc(irq);
2120 modify_ioapic_rte = desc->status & IRQ_LEVEL; 2370 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2121 if (modify_ioapic_rte) { 2371 if (modify_ioapic_rte) {
2122 spin_lock_irqsave(&ioapic_lock, flags); 2372 spin_lock_irqsave(&ioapic_lock, flags);
2123 __target_IO_APIC_irq(irq, dest, cfg->vector); 2373 __target_IO_APIC_irq(irq, dest, cfg);
2124 spin_unlock_irqrestore(&ioapic_lock, flags); 2374 spin_unlock_irqrestore(&ioapic_lock, flags);
2125 } 2375 }
2126 2376
@@ -2132,24 +2382,20 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
2132 */ 2382 */
2133 modify_irte(irq, &irte); 2383 modify_irte(irq, &irte);
2134 2384
2135 if (cfg->move_in_progress) { 2385 if (cfg->move_in_progress)
2136 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); 2386 send_cleanup_vector(cfg);
2137 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
2138 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2139 cfg->move_in_progress = 0;
2140 }
2141 2387
2142 desc->affinity = mask; 2388 cpumask_copy(&desc->affinity, mask);
2143} 2389}
2144 2390
2145static int migrate_irq_remapped_level(int irq) 2391static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2146{ 2392{
2147 int ret = -1; 2393 int ret = -1;
2148 struct irq_desc *desc = irq_to_desc(irq); 2394 struct irq_cfg *cfg = desc->chip_data;
2149 2395
2150 mask_IO_APIC_irq(irq); 2396 mask_IO_APIC_irq_desc(desc);
2151 2397
2152 if (io_apic_level_ack_pending(irq)) { 2398 if (io_apic_level_ack_pending(cfg)) {
2153 /* 2399 /*
2154 * Interrupt in progress. Migrating irq now will change the 2400 * Interrupt in progress. Migrating irq now will change the
2155 * vector information in the IO-APIC RTE and that will confuse 2401 * vector information in the IO-APIC RTE and that will confuse
@@ -2161,14 +2407,15 @@ static int migrate_irq_remapped_level(int irq)
2161 } 2407 }
2162 2408
2163 /* everthing is clear. we have right of way */ 2409 /* everthing is clear. we have right of way */
2164 migrate_ioapic_irq(irq, desc->pending_mask); 2410 migrate_ioapic_irq_desc(desc, &desc->pending_mask);
2165 2411
2166 ret = 0; 2412 ret = 0;
2167 desc->status &= ~IRQ_MOVE_PENDING; 2413 desc->status &= ~IRQ_MOVE_PENDING;
2168 cpus_clear(desc->pending_mask); 2414 cpumask_clear(&desc->pending_mask);
2169 2415
2170unmask: 2416unmask:
2171 unmask_IO_APIC_irq(irq); 2417 unmask_IO_APIC_irq_desc(desc);
2418
2172 return ret; 2419 return ret;
2173} 2420}
2174 2421
@@ -2189,7 +2436,7 @@ static void ir_irq_migration(struct work_struct *work)
2189 continue; 2436 continue;
2190 } 2437 }
2191 2438
2192 desc->chip->set_affinity(irq, desc->pending_mask); 2439 desc->chip->set_affinity(irq, &desc->pending_mask);
2193 spin_unlock_irqrestore(&desc->lock, flags); 2440 spin_unlock_irqrestore(&desc->lock, flags);
2194 } 2441 }
2195 } 2442 }
@@ -2198,28 +2445,33 @@ static void ir_irq_migration(struct work_struct *work)
2198/* 2445/*
2199 * Migrates the IRQ destination in the process context. 2446 * Migrates the IRQ destination in the process context.
2200 */ 2447 */
2201static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) 2448static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2449 const struct cpumask *mask)
2202{ 2450{
2203 struct irq_desc *desc = irq_to_desc(irq);
2204
2205 if (desc->status & IRQ_LEVEL) { 2451 if (desc->status & IRQ_LEVEL) {
2206 desc->status |= IRQ_MOVE_PENDING; 2452 desc->status |= IRQ_MOVE_PENDING;
2207 desc->pending_mask = mask; 2453 cpumask_copy(&desc->pending_mask, mask);
2208 migrate_irq_remapped_level(irq); 2454 migrate_irq_remapped_level_desc(desc);
2209 return; 2455 return;
2210 } 2456 }
2211 2457
2212 migrate_ioapic_irq(irq, mask); 2458 migrate_ioapic_irq_desc(desc, mask);
2459}
2460static void set_ir_ioapic_affinity_irq(unsigned int irq,
2461 const struct cpumask *mask)
2462{
2463 struct irq_desc *desc = irq_to_desc(irq);
2464
2465 set_ir_ioapic_affinity_irq_desc(desc, mask);
2213} 2466}
2214#endif 2467#endif
2215 2468
2216asmlinkage void smp_irq_move_cleanup_interrupt(void) 2469asmlinkage void smp_irq_move_cleanup_interrupt(void)
2217{ 2470{
2218 unsigned vector, me; 2471 unsigned vector, me;
2472
2219 ack_APIC_irq(); 2473 ack_APIC_irq();
2220#ifdef CONFIG_X86_64
2221 exit_idle(); 2474 exit_idle();
2222#endif
2223 irq_enter(); 2475 irq_enter();
2224 2476
2225 me = smp_processor_id(); 2477 me = smp_processor_id();
@@ -2229,6 +2481,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2229 struct irq_cfg *cfg; 2481 struct irq_cfg *cfg;
2230 irq = __get_cpu_var(vector_irq)[vector]; 2482 irq = __get_cpu_var(vector_irq)[vector];
2231 2483
2484 if (irq == -1)
2485 continue;
2486
2232 desc = irq_to_desc(irq); 2487 desc = irq_to_desc(irq);
2233 if (!desc) 2488 if (!desc)
2234 continue; 2489 continue;
@@ -2238,7 +2493,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2238 if (!cfg->move_cleanup_count) 2493 if (!cfg->move_cleanup_count)
2239 goto unlock; 2494 goto unlock;
2240 2495
2241 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) 2496 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2242 goto unlock; 2497 goto unlock;
2243 2498
2244 __get_cpu_var(vector_irq)[vector] = -1; 2499 __get_cpu_var(vector_irq)[vector] = -1;
@@ -2250,28 +2505,44 @@ unlock:
2250 irq_exit(); 2505 irq_exit();
2251} 2506}
2252 2507
2253static void irq_complete_move(unsigned int irq) 2508static void irq_complete_move(struct irq_desc **descp)
2254{ 2509{
2255 struct irq_cfg *cfg = irq_cfg(irq); 2510 struct irq_desc *desc = *descp;
2511 struct irq_cfg *cfg = desc->chip_data;
2256 unsigned vector, me; 2512 unsigned vector, me;
2257 2513
2258 if (likely(!cfg->move_in_progress)) 2514 if (likely(!cfg->move_in_progress)) {
2515#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2516 if (likely(!cfg->move_desc_pending))
2517 return;
2518
2519 /* domain has not changed, but affinity did */
2520 me = smp_processor_id();
2521 if (cpu_isset(me, desc->affinity)) {
2522 *descp = desc = move_irq_desc(desc, me);
2523 /* get the new one */
2524 cfg = desc->chip_data;
2525 cfg->move_desc_pending = 0;
2526 }
2527#endif
2259 return; 2528 return;
2529 }
2260 2530
2261 vector = ~get_irq_regs()->orig_ax; 2531 vector = ~get_irq_regs()->orig_ax;
2262 me = smp_processor_id(); 2532 me = smp_processor_id();
2263 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { 2533#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2264 cpumask_t cleanup_mask; 2534 *descp = desc = move_irq_desc(desc, me);
2535 /* get the new one */
2536 cfg = desc->chip_data;
2537#endif
2265 2538
2266 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); 2539 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2267 cfg->move_cleanup_count = cpus_weight(cleanup_mask); 2540 send_cleanup_vector(cfg);
2268 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2269 cfg->move_in_progress = 0;
2270 }
2271} 2541}
2272#else 2542#else
2273static inline void irq_complete_move(unsigned int irq) {} 2543static inline void irq_complete_move(struct irq_desc **descp) {}
2274#endif 2544#endif
2545
2275#ifdef CONFIG_INTR_REMAP 2546#ifdef CONFIG_INTR_REMAP
2276static void ack_x2apic_level(unsigned int irq) 2547static void ack_x2apic_level(unsigned int irq)
2277{ 2548{
@@ -2282,11 +2553,14 @@ static void ack_x2apic_edge(unsigned int irq)
2282{ 2553{
2283 ack_x2APIC_irq(); 2554 ack_x2APIC_irq();
2284} 2555}
2556
2285#endif 2557#endif
2286 2558
2287static void ack_apic_edge(unsigned int irq) 2559static void ack_apic_edge(unsigned int irq)
2288{ 2560{
2289 irq_complete_move(irq); 2561 struct irq_desc *desc = irq_to_desc(irq);
2562
2563 irq_complete_move(&desc);
2290 move_native_irq(irq); 2564 move_native_irq(irq);
2291 ack_APIC_irq(); 2565 ack_APIC_irq();
2292} 2566}
@@ -2295,18 +2569,21 @@ atomic_t irq_mis_count;
2295 2569
2296static void ack_apic_level(unsigned int irq) 2570static void ack_apic_level(unsigned int irq)
2297{ 2571{
2572 struct irq_desc *desc = irq_to_desc(irq);
2573
2298#ifdef CONFIG_X86_32 2574#ifdef CONFIG_X86_32
2299 unsigned long v; 2575 unsigned long v;
2300 int i; 2576 int i;
2301#endif 2577#endif
2578 struct irq_cfg *cfg;
2302 int do_unmask_irq = 0; 2579 int do_unmask_irq = 0;
2303 2580
2304 irq_complete_move(irq); 2581 irq_complete_move(&desc);
2305#ifdef CONFIG_GENERIC_PENDING_IRQ 2582#ifdef CONFIG_GENERIC_PENDING_IRQ
2306 /* If we are moving the irq we need to mask it */ 2583 /* If we are moving the irq we need to mask it */
2307 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { 2584 if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
2308 do_unmask_irq = 1; 2585 do_unmask_irq = 1;
2309 mask_IO_APIC_irq(irq); 2586 mask_IO_APIC_irq_desc(desc);
2310 } 2587 }
2311#endif 2588#endif
2312 2589
@@ -2330,7 +2607,8 @@ static void ack_apic_level(unsigned int irq)
2330 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2607 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2331 * The idea is from Manfred Spraul. --macro 2608 * The idea is from Manfred Spraul. --macro
2332 */ 2609 */
2333 i = irq_cfg(irq)->vector; 2610 cfg = desc->chip_data;
2611 i = cfg->vector;
2334 2612
2335 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); 2613 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2336#endif 2614#endif
@@ -2369,17 +2647,18 @@ static void ack_apic_level(unsigned int irq)
2369 * accurate and is causing problems then it is a hardware bug 2647 * accurate and is causing problems then it is a hardware bug
2370 * and you can go talk to the chipset vendor about it. 2648 * and you can go talk to the chipset vendor about it.
2371 */ 2649 */
2372 if (!io_apic_level_ack_pending(irq)) 2650 cfg = desc->chip_data;
2651 if (!io_apic_level_ack_pending(cfg))
2373 move_masked_irq(irq); 2652 move_masked_irq(irq);
2374 unmask_IO_APIC_irq(irq); 2653 unmask_IO_APIC_irq_desc(desc);
2375 } 2654 }
2376 2655
2377#ifdef CONFIG_X86_32 2656#ifdef CONFIG_X86_32
2378 if (!(v & (1 << (i & 0x1f)))) { 2657 if (!(v & (1 << (i & 0x1f)))) {
2379 atomic_inc(&irq_mis_count); 2658 atomic_inc(&irq_mis_count);
2380 spin_lock(&ioapic_lock); 2659 spin_lock(&ioapic_lock);
2381 __mask_and_edge_IO_APIC_irq(irq); 2660 __mask_and_edge_IO_APIC_irq(cfg);
2382 __unmask_and_level_IO_APIC_irq(irq); 2661 __unmask_and_level_IO_APIC_irq(cfg);
2383 spin_unlock(&ioapic_lock); 2662 spin_unlock(&ioapic_lock);
2384 } 2663 }
2385#endif 2664#endif
@@ -2430,20 +2709,19 @@ static inline void init_IO_APIC_traps(void)
2430 * Also, we've got to be careful not to trash gate 2709 * Also, we've got to be careful not to trash gate
2431 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2710 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2432 */ 2711 */
2433 for_each_irq_cfg(irq, cfg) { 2712 for_each_irq_desc(irq, desc) {
2434 if (IO_APIC_IRQ(irq) && !cfg->vector) { 2713 cfg = desc->chip_data;
2714 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
2435 /* 2715 /*
2436 * Hmm.. We don't have an entry for this, 2716 * Hmm.. We don't have an entry for this,
2437 * so default to an old-fashioned 8259 2717 * so default to an old-fashioned 8259
2438 * interrupt if we can.. 2718 * interrupt if we can..
2439 */ 2719 */
2440 if (irq < 16) 2720 if (irq < NR_IRQS_LEGACY)
2441 make_8259A_irq(irq); 2721 make_8259A_irq(irq);
2442 else { 2722 else
2443 desc = irq_to_desc(irq);
2444 /* Strange. Oh, well.. */ 2723 /* Strange. Oh, well.. */
2445 desc->chip = &no_irq_chip; 2724 desc->chip = &no_irq_chip;
2446 }
2447 } 2725 }
2448 } 2726 }
2449} 2727}
@@ -2468,7 +2746,7 @@ static void unmask_lapic_irq(unsigned int irq)
2468 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 2746 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2469} 2747}
2470 2748
2471static void ack_lapic_irq (unsigned int irq) 2749static void ack_lapic_irq(unsigned int irq)
2472{ 2750{
2473 ack_APIC_irq(); 2751 ack_APIC_irq();
2474} 2752}
@@ -2480,11 +2758,8 @@ static struct irq_chip lapic_chip __read_mostly = {
2480 .ack = ack_lapic_irq, 2758 .ack = ack_lapic_irq,
2481}; 2759};
2482 2760
2483static void lapic_register_intr(int irq) 2761static void lapic_register_intr(int irq, struct irq_desc *desc)
2484{ 2762{
2485 struct irq_desc *desc;
2486
2487 desc = irq_to_desc(irq);
2488 desc->status &= ~IRQ_LEVEL; 2763 desc->status &= ~IRQ_LEVEL;
2489 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2764 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2490 "edge"); 2765 "edge");
@@ -2588,7 +2863,9 @@ int timer_through_8259 __initdata;
2588 */ 2863 */
2589static inline void __init check_timer(void) 2864static inline void __init check_timer(void)
2590{ 2865{
2591 struct irq_cfg *cfg = irq_cfg(0); 2866 struct irq_desc *desc = irq_to_desc(0);
2867 struct irq_cfg *cfg = desc->chip_data;
2868 int cpu = boot_cpu_id;
2592 int apic1, pin1, apic2, pin2; 2869 int apic1, pin1, apic2, pin2;
2593 unsigned long flags; 2870 unsigned long flags;
2594 unsigned int ver; 2871 unsigned int ver;
@@ -2603,7 +2880,7 @@ static inline void __init check_timer(void)
2603 * get/set the timer IRQ vector: 2880 * get/set the timer IRQ vector:
2604 */ 2881 */
2605 disable_8259A_irq(0); 2882 disable_8259A_irq(0);
2606 assign_irq_vector(0, TARGET_CPUS); 2883 assign_irq_vector(0, cfg, TARGET_CPUS);
2607 2884
2608 /* 2885 /*
2609 * As IRQ0 is to be enabled in the 8259A, the virtual 2886 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2654,10 +2931,10 @@ static inline void __init check_timer(void)
2654 * Ok, does IRQ0 through the IOAPIC work? 2931 * Ok, does IRQ0 through the IOAPIC work?
2655 */ 2932 */
2656 if (no_pin1) { 2933 if (no_pin1) {
2657 add_pin_to_irq(0, apic1, pin1); 2934 add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
2658 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2935 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
2659 } 2936 }
2660 unmask_IO_APIC_irq(0); 2937 unmask_IO_APIC_irq_desc(desc);
2661 if (timer_irq_works()) { 2938 if (timer_irq_works()) {
2662 if (nmi_watchdog == NMI_IO_APIC) { 2939 if (nmi_watchdog == NMI_IO_APIC) {
2663 setup_nmi(); 2940 setup_nmi();
@@ -2683,9 +2960,9 @@ static inline void __init check_timer(void)
2683 /* 2960 /*
2684 * legacy devices should be connected to IO APIC #0 2961 * legacy devices should be connected to IO APIC #0
2685 */ 2962 */
2686 replace_pin_at_irq(0, apic1, pin1, apic2, pin2); 2963 replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
2687 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2964 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
2688 unmask_IO_APIC_irq(0); 2965 unmask_IO_APIC_irq_desc(desc);
2689 enable_8259A_irq(0); 2966 enable_8259A_irq(0);
2690 if (timer_irq_works()) { 2967 if (timer_irq_works()) {
2691 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 2968 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2717,7 +2994,7 @@ static inline void __init check_timer(void)
2717 apic_printk(APIC_QUIET, KERN_INFO 2994 apic_printk(APIC_QUIET, KERN_INFO
2718 "...trying to set up timer as Virtual Wire IRQ...\n"); 2995 "...trying to set up timer as Virtual Wire IRQ...\n");
2719 2996
2720 lapic_register_intr(0); 2997 lapic_register_intr(0, desc);
2721 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 2998 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
2722 enable_8259A_irq(0); 2999 enable_8259A_irq(0);
2723 3000
@@ -2902,22 +3179,26 @@ unsigned int create_irq_nr(unsigned int irq_want)
2902 unsigned int irq; 3179 unsigned int irq;
2903 unsigned int new; 3180 unsigned int new;
2904 unsigned long flags; 3181 unsigned long flags;
2905 struct irq_cfg *cfg_new; 3182 struct irq_cfg *cfg_new = NULL;
2906 3183 int cpu = boot_cpu_id;
2907 irq_want = nr_irqs - 1; 3184 struct irq_desc *desc_new = NULL;
2908 3185
2909 irq = 0; 3186 irq = 0;
2910 spin_lock_irqsave(&vector_lock, flags); 3187 spin_lock_irqsave(&vector_lock, flags);
2911 for (new = irq_want; new > 0; new--) { 3188 for (new = irq_want; new < NR_IRQS; new++) {
2912 if (platform_legacy_irq(new)) 3189 if (platform_legacy_irq(new))
2913 continue; 3190 continue;
2914 cfg_new = irq_cfg(new); 3191
2915 if (cfg_new && cfg_new->vector != 0) 3192 desc_new = irq_to_desc_alloc_cpu(new, cpu);
3193 if (!desc_new) {
3194 printk(KERN_INFO "can not get irq_desc for %d\n", new);
2916 continue; 3195 continue;
2917 /* check if need to create one */ 3196 }
2918 if (!cfg_new) 3197 cfg_new = desc_new->chip_data;
2919 cfg_new = irq_cfg_alloc(new); 3198
2920 if (__assign_irq_vector(new, TARGET_CPUS) == 0) 3199 if (cfg_new->vector != 0)
3200 continue;
3201 if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
2921 irq = new; 3202 irq = new;
2922 break; 3203 break;
2923 } 3204 }
@@ -2925,15 +3206,21 @@ unsigned int create_irq_nr(unsigned int irq_want)
2925 3206
2926 if (irq > 0) { 3207 if (irq > 0) {
2927 dynamic_irq_init(irq); 3208 dynamic_irq_init(irq);
3209 /* restore it, in case dynamic_irq_init clear it */
3210 if (desc_new)
3211 desc_new->chip_data = cfg_new;
2928 } 3212 }
2929 return irq; 3213 return irq;
2930} 3214}
2931 3215
3216static int nr_irqs_gsi = NR_IRQS_LEGACY;
2932int create_irq(void) 3217int create_irq(void)
2933{ 3218{
3219 unsigned int irq_want;
2934 int irq; 3220 int irq;
2935 3221
2936 irq = create_irq_nr(nr_irqs - 1); 3222 irq_want = nr_irqs_gsi;
3223 irq = create_irq_nr(irq_want);
2937 3224
2938 if (irq == 0) 3225 if (irq == 0)
2939 irq = -1; 3226 irq = -1;
@@ -2944,14 +3231,22 @@ int create_irq(void)
2944void destroy_irq(unsigned int irq) 3231void destroy_irq(unsigned int irq)
2945{ 3232{
2946 unsigned long flags; 3233 unsigned long flags;
3234 struct irq_cfg *cfg;
3235 struct irq_desc *desc;
2947 3236
3237 /* store it, in case dynamic_irq_cleanup clear it */
3238 desc = irq_to_desc(irq);
3239 cfg = desc->chip_data;
2948 dynamic_irq_cleanup(irq); 3240 dynamic_irq_cleanup(irq);
3241 /* connect back irq_cfg */
3242 if (desc)
3243 desc->chip_data = cfg;
2949 3244
2950#ifdef CONFIG_INTR_REMAP 3245#ifdef CONFIG_INTR_REMAP
2951 free_irte(irq); 3246 free_irte(irq);
2952#endif 3247#endif
2953 spin_lock_irqsave(&vector_lock, flags); 3248 spin_lock_irqsave(&vector_lock, flags);
2954 __clear_irq_vector(irq); 3249 __clear_irq_vector(irq, cfg);
2955 spin_unlock_irqrestore(&vector_lock, flags); 3250 spin_unlock_irqrestore(&vector_lock, flags);
2956} 3251}
2957 3252
@@ -2964,16 +3259,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2964 struct irq_cfg *cfg; 3259 struct irq_cfg *cfg;
2965 int err; 3260 int err;
2966 unsigned dest; 3261 unsigned dest;
2967 cpumask_t tmp;
2968 3262
2969 tmp = TARGET_CPUS; 3263 cfg = irq_cfg(irq);
2970 err = assign_irq_vector(irq, tmp); 3264 err = assign_irq_vector(irq, cfg, TARGET_CPUS);
2971 if (err) 3265 if (err)
2972 return err; 3266 return err;
2973 3267
2974 cfg = irq_cfg(irq); 3268 dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
2975 cpus_and(tmp, cfg->domain, tmp);
2976 dest = cpu_mask_to_apicid(tmp);
2977 3269
2978#ifdef CONFIG_INTR_REMAP 3270#ifdef CONFIG_INTR_REMAP
2979 if (irq_remapped(irq)) { 3271 if (irq_remapped(irq)) {
@@ -3027,64 +3319,48 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3027} 3319}
3028 3320
3029#ifdef CONFIG_SMP 3321#ifdef CONFIG_SMP
3030static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) 3322static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3031{ 3323{
3324 struct irq_desc *desc = irq_to_desc(irq);
3032 struct irq_cfg *cfg; 3325 struct irq_cfg *cfg;
3033 struct msi_msg msg; 3326 struct msi_msg msg;
3034 unsigned int dest; 3327 unsigned int dest;
3035 cpumask_t tmp;
3036 struct irq_desc *desc;
3037 3328
3038 cpus_and(tmp, mask, cpu_online_map); 3329 dest = set_desc_affinity(desc, mask);
3039 if (cpus_empty(tmp)) 3330 if (dest == BAD_APICID)
3040 return; 3331 return;
3041 3332
3042 if (assign_irq_vector(irq, mask)) 3333 cfg = desc->chip_data;
3043 return;
3044 3334
3045 cfg = irq_cfg(irq); 3335 read_msi_msg_desc(desc, &msg);
3046 cpus_and(tmp, cfg->domain, mask);
3047 dest = cpu_mask_to_apicid(tmp);
3048
3049 read_msi_msg(irq, &msg);
3050 3336
3051 msg.data &= ~MSI_DATA_VECTOR_MASK; 3337 msg.data &= ~MSI_DATA_VECTOR_MASK;
3052 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3338 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3053 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3339 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3054 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3340 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3055 3341
3056 write_msi_msg(irq, &msg); 3342 write_msi_msg_desc(desc, &msg);
3057 desc = irq_to_desc(irq);
3058 desc->affinity = mask;
3059} 3343}
3060
3061#ifdef CONFIG_INTR_REMAP 3344#ifdef CONFIG_INTR_REMAP
3062/* 3345/*
3063 * Migrate the MSI irq to another cpumask. This migration is 3346 * Migrate the MSI irq to another cpumask. This migration is
3064 * done in the process context using interrupt-remapping hardware. 3347 * done in the process context using interrupt-remapping hardware.
3065 */ 3348 */
3066static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) 3349static void
3350ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3067{ 3351{
3068 struct irq_cfg *cfg; 3352 struct irq_desc *desc = irq_to_desc(irq);
3353 struct irq_cfg *cfg = desc->chip_data;
3069 unsigned int dest; 3354 unsigned int dest;
3070 cpumask_t tmp, cleanup_mask;
3071 struct irte irte; 3355 struct irte irte;
3072 struct irq_desc *desc;
3073
3074 cpus_and(tmp, mask, cpu_online_map);
3075 if (cpus_empty(tmp))
3076 return;
3077 3356
3078 if (get_irte(irq, &irte)) 3357 if (get_irte(irq, &irte))
3079 return; 3358 return;
3080 3359
3081 if (assign_irq_vector(irq, mask)) 3360 dest = set_desc_affinity(desc, mask);
3361 if (dest == BAD_APICID)
3082 return; 3362 return;
3083 3363
3084 cfg = irq_cfg(irq);
3085 cpus_and(tmp, cfg->domain, mask);
3086 dest = cpu_mask_to_apicid(tmp);
3087
3088 irte.vector = cfg->vector; 3364 irte.vector = cfg->vector;
3089 irte.dest_id = IRTE_DEST(dest); 3365 irte.dest_id = IRTE_DEST(dest);
3090 3366
@@ -3098,16 +3374,10 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
3098 * at the new destination. So, time to cleanup the previous 3374 * at the new destination. So, time to cleanup the previous
3099 * vector allocation. 3375 * vector allocation.
3100 */ 3376 */
3101 if (cfg->move_in_progress) { 3377 if (cfg->move_in_progress)
3102 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); 3378 send_cleanup_vector(cfg);
3103 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
3104 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
3105 cfg->move_in_progress = 0;
3106 }
3107
3108 desc = irq_to_desc(irq);
3109 desc->affinity = mask;
3110} 3379}
3380
3111#endif 3381#endif
3112#endif /* CONFIG_SMP */ 3382#endif /* CONFIG_SMP */
3113 3383
@@ -3166,7 +3436,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3166} 3436}
3167#endif 3437#endif
3168 3438
3169static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) 3439static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3170{ 3440{
3171 int ret; 3441 int ret;
3172 struct msi_msg msg; 3442 struct msi_msg msg;
@@ -3175,7 +3445,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
3175 if (ret < 0) 3445 if (ret < 0)
3176 return ret; 3446 return ret;
3177 3447
3178 set_irq_msi(irq, desc); 3448 set_irq_msi(irq, msidesc);
3179 write_msi_msg(irq, &msg); 3449 write_msi_msg(irq, &msg);
3180 3450
3181#ifdef CONFIG_INTR_REMAP 3451#ifdef CONFIG_INTR_REMAP
@@ -3195,26 +3465,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
3195 return 0; 3465 return 0;
3196} 3466}
3197 3467
3198static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) 3468int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
3199{
3200 unsigned int irq;
3201
3202 irq = dev->bus->number;
3203 irq <<= 8;
3204 irq |= dev->devfn;
3205 irq <<= 12;
3206
3207 return irq;
3208}
3209
3210int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
3211{ 3469{
3212 unsigned int irq; 3470 unsigned int irq;
3213 int ret; 3471 int ret;
3214 unsigned int irq_want; 3472 unsigned int irq_want;
3215 3473
3216 irq_want = build_irq_for_pci_dev(dev) + 0x100; 3474 irq_want = nr_irqs_gsi;
3217
3218 irq = create_irq_nr(irq_want); 3475 irq = create_irq_nr(irq_want);
3219 if (irq == 0) 3476 if (irq == 0)
3220 return -1; 3477 return -1;
@@ -3228,7 +3485,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
3228 goto error; 3485 goto error;
3229no_ir: 3486no_ir:
3230#endif 3487#endif
3231 ret = setup_msi_irq(dev, desc, irq); 3488 ret = setup_msi_irq(dev, msidesc, irq);
3232 if (ret < 0) { 3489 if (ret < 0) {
3233 destroy_irq(irq); 3490 destroy_irq(irq);
3234 return ret; 3491 return ret;
@@ -3246,7 +3503,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3246{ 3503{
3247 unsigned int irq; 3504 unsigned int irq;
3248 int ret, sub_handle; 3505 int ret, sub_handle;
3249 struct msi_desc *desc; 3506 struct msi_desc *msidesc;
3250 unsigned int irq_want; 3507 unsigned int irq_want;
3251 3508
3252#ifdef CONFIG_INTR_REMAP 3509#ifdef CONFIG_INTR_REMAP
@@ -3254,10 +3511,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3254 int index = 0; 3511 int index = 0;
3255#endif 3512#endif
3256 3513
3257 irq_want = build_irq_for_pci_dev(dev) + 0x100; 3514 irq_want = nr_irqs_gsi;
3258 sub_handle = 0; 3515 sub_handle = 0;
3259 list_for_each_entry(desc, &dev->msi_list, list) { 3516 list_for_each_entry(msidesc, &dev->msi_list, list) {
3260 irq = create_irq_nr(irq_want--); 3517 irq = create_irq_nr(irq_want);
3518 irq_want++;
3261 if (irq == 0) 3519 if (irq == 0)
3262 return -1; 3520 return -1;
3263#ifdef CONFIG_INTR_REMAP 3521#ifdef CONFIG_INTR_REMAP
@@ -3289,7 +3547,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3289 } 3547 }
3290no_ir: 3548no_ir:
3291#endif 3549#endif
3292 ret = setup_msi_irq(dev, desc, irq); 3550 ret = setup_msi_irq(dev, msidesc, irq);
3293 if (ret < 0) 3551 if (ret < 0)
3294 goto error; 3552 goto error;
3295 sub_handle++; 3553 sub_handle++;
@@ -3308,24 +3566,18 @@ void arch_teardown_msi_irq(unsigned int irq)
3308 3566
3309#ifdef CONFIG_DMAR 3567#ifdef CONFIG_DMAR
3310#ifdef CONFIG_SMP 3568#ifdef CONFIG_SMP
3311static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) 3569static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3312{ 3570{
3571 struct irq_desc *desc = irq_to_desc(irq);
3313 struct irq_cfg *cfg; 3572 struct irq_cfg *cfg;
3314 struct msi_msg msg; 3573 struct msi_msg msg;
3315 unsigned int dest; 3574 unsigned int dest;
3316 cpumask_t tmp;
3317 struct irq_desc *desc;
3318 3575
3319 cpus_and(tmp, mask, cpu_online_map); 3576 dest = set_desc_affinity(desc, mask);
3320 if (cpus_empty(tmp)) 3577 if (dest == BAD_APICID)
3321 return; 3578 return;
3322 3579
3323 if (assign_irq_vector(irq, mask)) 3580 cfg = desc->chip_data;
3324 return;
3325
3326 cfg = irq_cfg(irq);
3327 cpus_and(tmp, cfg->domain, mask);
3328 dest = cpu_mask_to_apicid(tmp);
3329 3581
3330 dmar_msi_read(irq, &msg); 3582 dmar_msi_read(irq, &msg);
3331 3583
@@ -3335,9 +3587,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
3335 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3587 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3336 3588
3337 dmar_msi_write(irq, &msg); 3589 dmar_msi_write(irq, &msg);
3338 desc = irq_to_desc(irq);
3339 desc->affinity = mask;
3340} 3590}
3591
3341#endif /* CONFIG_SMP */ 3592#endif /* CONFIG_SMP */
3342 3593
3343struct irq_chip dmar_msi_type = { 3594struct irq_chip dmar_msi_type = {
@@ -3369,24 +3620,18 @@ int arch_setup_dmar_msi(unsigned int irq)
3369#ifdef CONFIG_HPET_TIMER 3620#ifdef CONFIG_HPET_TIMER
3370 3621
3371#ifdef CONFIG_SMP 3622#ifdef CONFIG_SMP
3372static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) 3623static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3373{ 3624{
3625 struct irq_desc *desc = irq_to_desc(irq);
3374 struct irq_cfg *cfg; 3626 struct irq_cfg *cfg;
3375 struct irq_desc *desc;
3376 struct msi_msg msg; 3627 struct msi_msg msg;
3377 unsigned int dest; 3628 unsigned int dest;
3378 cpumask_t tmp;
3379 3629
3380 cpus_and(tmp, mask, cpu_online_map); 3630 dest = set_desc_affinity(desc, mask);
3381 if (cpus_empty(tmp)) 3631 if (dest == BAD_APICID)
3382 return; 3632 return;
3383 3633
3384 if (assign_irq_vector(irq, mask)) 3634 cfg = desc->chip_data;
3385 return;
3386
3387 cfg = irq_cfg(irq);
3388 cpus_and(tmp, cfg->domain, mask);
3389 dest = cpu_mask_to_apicid(tmp);
3390 3635
3391 hpet_msi_read(irq, &msg); 3636 hpet_msi_read(irq, &msg);
3392 3637
@@ -3396,9 +3641,8 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
3396 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3641 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3397 3642
3398 hpet_msi_write(irq, &msg); 3643 hpet_msi_write(irq, &msg);
3399 desc = irq_to_desc(irq);
3400 desc->affinity = mask;
3401} 3644}
3645
3402#endif /* CONFIG_SMP */ 3646#endif /* CONFIG_SMP */
3403 3647
3404struct irq_chip hpet_msi_type = { 3648struct irq_chip hpet_msi_type = {
@@ -3451,28 +3695,21 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3451 write_ht_irq_msg(irq, &msg); 3695 write_ht_irq_msg(irq, &msg);
3452} 3696}
3453 3697
3454static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) 3698static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3455{ 3699{
3700 struct irq_desc *desc = irq_to_desc(irq);
3456 struct irq_cfg *cfg; 3701 struct irq_cfg *cfg;
3457 unsigned int dest; 3702 unsigned int dest;
3458 cpumask_t tmp;
3459 struct irq_desc *desc;
3460 3703
3461 cpus_and(tmp, mask, cpu_online_map); 3704 dest = set_desc_affinity(desc, mask);
3462 if (cpus_empty(tmp)) 3705 if (dest == BAD_APICID)
3463 return; 3706 return;
3464 3707
3465 if (assign_irq_vector(irq, mask)) 3708 cfg = desc->chip_data;
3466 return;
3467
3468 cfg = irq_cfg(irq);
3469 cpus_and(tmp, cfg->domain, mask);
3470 dest = cpu_mask_to_apicid(tmp);
3471 3709
3472 target_ht_irq(irq, dest, cfg->vector); 3710 target_ht_irq(irq, dest, cfg->vector);
3473 desc = irq_to_desc(irq);
3474 desc->affinity = mask;
3475} 3711}
3712
3476#endif 3713#endif
3477 3714
3478static struct irq_chip ht_irq_chip = { 3715static struct irq_chip ht_irq_chip = {
@@ -3490,17 +3727,14 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3490{ 3727{
3491 struct irq_cfg *cfg; 3728 struct irq_cfg *cfg;
3492 int err; 3729 int err;
3493 cpumask_t tmp;
3494 3730
3495 tmp = TARGET_CPUS; 3731 cfg = irq_cfg(irq);
3496 err = assign_irq_vector(irq, tmp); 3732 err = assign_irq_vector(irq, cfg, TARGET_CPUS);
3497 if (!err) { 3733 if (!err) {
3498 struct ht_irq_msg msg; 3734 struct ht_irq_msg msg;
3499 unsigned dest; 3735 unsigned dest;
3500 3736
3501 cfg = irq_cfg(irq); 3737 dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
3502 cpus_and(tmp, cfg->domain, tmp);
3503 dest = cpu_mask_to_apicid(tmp);
3504 3738
3505 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); 3739 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
3506 3740
@@ -3536,7 +3770,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3536int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, 3770int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3537 unsigned long mmr_offset) 3771 unsigned long mmr_offset)
3538{ 3772{
3539 const cpumask_t *eligible_cpu = get_cpu_mask(cpu); 3773 const struct cpumask *eligible_cpu = cpumask_of(cpu);
3540 struct irq_cfg *cfg; 3774 struct irq_cfg *cfg;
3541 int mmr_pnode; 3775 int mmr_pnode;
3542 unsigned long mmr_value; 3776 unsigned long mmr_value;
@@ -3544,7 +3778,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3544 unsigned long flags; 3778 unsigned long flags;
3545 int err; 3779 int err;
3546 3780
3547 err = assign_irq_vector(irq, *eligible_cpu); 3781 cfg = irq_cfg(irq);
3782
3783 err = assign_irq_vector(irq, cfg, eligible_cpu);
3548 if (err != 0) 3784 if (err != 0)
3549 return err; 3785 return err;
3550 3786
@@ -3553,8 +3789,6 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3553 irq_name); 3789 irq_name);
3554 spin_unlock_irqrestore(&vector_lock, flags); 3790 spin_unlock_irqrestore(&vector_lock, flags);
3555 3791
3556 cfg = irq_cfg(irq);
3557
3558 mmr_value = 0; 3792 mmr_value = 0;
3559 entry = (struct uv_IO_APIC_route_entry *)&mmr_value; 3793 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3560 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); 3794 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3565,7 +3799,7 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3565 entry->polarity = 0; 3799 entry->polarity = 0;
3566 entry->trigger = 0; 3800 entry->trigger = 0;
3567 entry->mask = 0; 3801 entry->mask = 0;
3568 entry->dest = cpu_mask_to_apicid(*eligible_cpu); 3802 entry->dest = cpu_mask_to_apicid(eligible_cpu);
3569 3803
3570 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3804 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3571 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 3805 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3606,9 +3840,16 @@ int __init io_apic_get_redir_entries (int ioapic)
3606 return reg_01.bits.entries; 3840 return reg_01.bits.entries;
3607} 3841}
3608 3842
3609int __init probe_nr_irqs(void) 3843void __init probe_nr_irqs_gsi(void)
3610{ 3844{
3611 return NR_IRQS; 3845 int idx;
3846 int nr = 0;
3847
3848 for (idx = 0; idx < nr_ioapics; idx++)
3849 nr += io_apic_get_redir_entries(idx) + 1;
3850
3851 if (nr > nr_irqs_gsi)
3852 nr_irqs_gsi = nr;
3612} 3853}
3613 3854
3614/* -------------------------------------------------------------------------- 3855/* --------------------------------------------------------------------------
@@ -3707,19 +3948,31 @@ int __init io_apic_get_version(int ioapic)
3707 3948
3708int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) 3949int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
3709{ 3950{
3951 struct irq_desc *desc;
3952 struct irq_cfg *cfg;
3953 int cpu = boot_cpu_id;
3954
3710 if (!IO_APIC_IRQ(irq)) { 3955 if (!IO_APIC_IRQ(irq)) {
3711 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", 3956 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3712 ioapic); 3957 ioapic);
3713 return -EINVAL; 3958 return -EINVAL;
3714 } 3959 }
3715 3960
3961 desc = irq_to_desc_alloc_cpu(irq, cpu);
3962 if (!desc) {
3963 printk(KERN_INFO "can not get irq_desc %d\n", irq);
3964 return 0;
3965 }
3966
3716 /* 3967 /*
3717 * IRQs < 16 are already in the irq_2_pin[] map 3968 * IRQs < 16 are already in the irq_2_pin[] map
3718 */ 3969 */
3719 if (irq >= 16) 3970 if (irq >= NR_IRQS_LEGACY) {
3720 add_pin_to_irq(irq, ioapic, pin); 3971 cfg = desc->chip_data;
3972 add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
3973 }
3721 3974
3722 setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); 3975 setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
3723 3976
3724 return 0; 3977 return 0;
3725} 3978}
@@ -3757,7 +4010,7 @@ void __init setup_ioapic_dest(void)
3757 int pin, ioapic, irq, irq_entry; 4010 int pin, ioapic, irq, irq_entry;
3758 struct irq_desc *desc; 4011 struct irq_desc *desc;
3759 struct irq_cfg *cfg; 4012 struct irq_cfg *cfg;
3760 cpumask_t mask; 4013 const struct cpumask *mask;
3761 4014
3762 if (skip_ioapic_setup == 1) 4015 if (skip_ioapic_setup == 1)
3763 return; 4016 return;
@@ -3773,9 +4026,10 @@ void __init setup_ioapic_dest(void)
3773 * when you have too many devices, because at that time only boot 4026 * when you have too many devices, because at that time only boot
3774 * cpu is online. 4027 * cpu is online.
3775 */ 4028 */
3776 cfg = irq_cfg(irq); 4029 desc = irq_to_desc(irq);
4030 cfg = desc->chip_data;
3777 if (!cfg->vector) { 4031 if (!cfg->vector) {
3778 setup_IO_APIC_irq(ioapic, pin, irq, 4032 setup_IO_APIC_irq(ioapic, pin, irq, desc,
3779 irq_trigger(irq_entry), 4033 irq_trigger(irq_entry),
3780 irq_polarity(irq_entry)); 4034 irq_polarity(irq_entry));
3781 continue; 4035 continue;
@@ -3785,19 +4039,18 @@ void __init setup_ioapic_dest(void)
3785 /* 4039 /*
3786 * Honour affinities which have been set in early boot 4040 * Honour affinities which have been set in early boot
3787 */ 4041 */
3788 desc = irq_to_desc(irq);
3789 if (desc->status & 4042 if (desc->status &
3790 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 4043 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
3791 mask = desc->affinity; 4044 mask = &desc->affinity;
3792 else 4045 else
3793 mask = TARGET_CPUS; 4046 mask = TARGET_CPUS;
3794 4047
3795#ifdef CONFIG_INTR_REMAP 4048#ifdef CONFIG_INTR_REMAP
3796 if (intr_remapping_enabled) 4049 if (intr_remapping_enabled)
3797 set_ir_ioapic_affinity_irq(irq, mask); 4050 set_ir_ioapic_affinity_irq_desc(desc, mask);
3798 else 4051 else
3799#endif 4052#endif
3800 set_ioapic_affinity_irq(irq, mask); 4053 set_ioapic_affinity_irq_desc(desc, mask);
3801 } 4054 }
3802 4055
3803 } 4056 }
@@ -3846,7 +4099,6 @@ void __init ioapic_init_mappings(void)
3846 struct resource *ioapic_res; 4099 struct resource *ioapic_res;
3847 int i; 4100 int i;
3848 4101
3849 irq_2_pin_init();
3850 ioapic_res = ioapic_setup_resources(); 4102 ioapic_res = ioapic_setup_resources();
3851 for (i = 0; i < nr_ioapics; i++) { 4103 for (i = 0; i < nr_ioapics; i++) {
3852 if (smp_found_config) { 4104 if (smp_found_config) {
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index f1c688e46f35..285bbf8831fa 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -116,18 +116,18 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
116/* 116/*
117 * This is only used on smaller machines. 117 * This is only used on smaller machines.
118 */ 118 */
119void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) 119void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
120{ 120{
121 unsigned long mask = cpus_addr(cpumask)[0]; 121 unsigned long mask = cpumask_bits(cpumask)[0];
122 unsigned long flags; 122 unsigned long flags;
123 123
124 local_irq_save(flags); 124 local_irq_save(flags);
125 WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); 125 WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
126 __send_IPI_dest_field(mask, vector); 126 __send_IPI_dest_field(mask, vector);
127 local_irq_restore(flags); 127 local_irq_restore(flags);
128} 128}
129 129
130void send_IPI_mask_sequence(cpumask_t mask, int vector) 130void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
131{ 131{
132 unsigned long flags; 132 unsigned long flags;
133 unsigned int query_cpu; 133 unsigned int query_cpu;
@@ -139,12 +139,24 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
139 */ 139 */
140 140
141 local_irq_save(flags); 141 local_irq_save(flags);
142 for_each_possible_cpu(query_cpu) { 142 for_each_cpu(query_cpu, mask)
143 if (cpu_isset(query_cpu, mask)) { 143 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector);
144 local_irq_restore(flags);
145}
146
147void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
148{
149 unsigned long flags;
150 unsigned int query_cpu;
151 unsigned int this_cpu = smp_processor_id();
152
153 /* See Hack comment above */
154
155 local_irq_save(flags);
156 for_each_cpu(query_cpu, mask)
157 if (query_cpu != this_cpu)
144 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), 158 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
145 vector); 159 vector);
146 }
147 }
148 local_irq_restore(flags); 160 local_irq_restore(flags);
149} 161}
150 162
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc52f649..bce53e1352a0 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -9,6 +9,7 @@
9#include <asm/apic.h> 9#include <asm/apic.h>
10#include <asm/io_apic.h> 10#include <asm/io_apic.h>
11#include <asm/smp.h> 11#include <asm/smp.h>
12#include <asm/irq.h>
12 13
13atomic_t irq_err_count; 14atomic_t irq_err_count;
14 15
@@ -118,6 +119,9 @@ int show_interrupts(struct seq_file *p, void *v)
118 } 119 }
119 120
120 desc = irq_to_desc(i); 121 desc = irq_to_desc(i);
122 if (!desc)
123 return 0;
124
121 spin_lock_irqsave(&desc->lock, flags); 125 spin_lock_irqsave(&desc->lock, flags);
122#ifndef CONFIG_SMP 126#ifndef CONFIG_SMP
123 any_count = kstat_irqs(i); 127 any_count = kstat_irqs(i);
@@ -187,3 +191,5 @@ u64 arch_irq_stat(void)
187#endif 191#endif
188 return sum; 192 return sum;
189} 193}
194
195EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index a51382672de0..9dc5588f336a 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -233,25 +233,28 @@ unsigned int do_IRQ(struct pt_regs *regs)
233#ifdef CONFIG_HOTPLUG_CPU 233#ifdef CONFIG_HOTPLUG_CPU
234#include <mach_apic.h> 234#include <mach_apic.h>
235 235
236void fixup_irqs(cpumask_t map) 236/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
237void fixup_irqs(void)
237{ 238{
238 unsigned int irq; 239 unsigned int irq;
239 static int warned; 240 static int warned;
240 struct irq_desc *desc; 241 struct irq_desc *desc;
241 242
242 for_each_irq_desc(irq, desc) { 243 for_each_irq_desc(irq, desc) {
243 cpumask_t mask; 244 const struct cpumask *affinity;
244 245
246 if (!desc)
247 continue;
245 if (irq == 2) 248 if (irq == 2)
246 continue; 249 continue;
247 250
248 cpus_and(mask, desc->affinity, map); 251 affinity = &desc->affinity;
249 if (any_online_cpu(mask) == NR_CPUS) { 252 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
250 printk("Breaking affinity for irq %i\n", irq); 253 printk("Breaking affinity for irq %i\n", irq);
251 mask = map; 254 affinity = cpu_all_mask;
252 } 255 }
253 if (desc->chip->set_affinity) 256 if (desc->chip->set_affinity)
254 desc->chip->set_affinity(irq, mask); 257 desc->chip->set_affinity(irq, affinity);
255 else if (desc->action && !(warned++)) 258 else if (desc->action && !(warned++))
256 printk("Cannot set affinity for irq %i\n", irq); 259 printk("Cannot set affinity for irq %i\n", irq);
257 } 260 }
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 60eb84eb77a0..6383d50f82ea 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -13,12 +13,12 @@
13#include <linux/seq_file.h> 13#include <linux/seq_file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/delay.h> 15#include <linux/delay.h>
16#include <linux/ftrace.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/io_apic.h> 18#include <asm/io_apic.h>
18#include <asm/idle.h> 19#include <asm/idle.h>
19#include <asm/smp.h> 20#include <asm/smp.h>
20 21
21#ifdef CONFIG_DEBUG_STACKOVERFLOW
22/* 22/*
23 * Probabilistic stack overflow check: 23 * Probabilistic stack overflow check:
24 * 24 *
@@ -28,26 +28,25 @@
28 */ 28 */
29static inline void stack_overflow_check(struct pt_regs *regs) 29static inline void stack_overflow_check(struct pt_regs *regs)
30{ 30{
31#ifdef CONFIG_DEBUG_STACKOVERFLOW
31 u64 curbase = (u64)task_stack_page(current); 32 u64 curbase = (u64)task_stack_page(current);
32 static unsigned long warned = -60*HZ; 33
33 34 WARN_ONCE(regs->sp >= curbase &&
34 if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && 35 regs->sp <= curbase + THREAD_SIZE &&
35 regs->sp < curbase + sizeof(struct thread_info) + 128 && 36 regs->sp < curbase + sizeof(struct thread_info) +
36 time_after(jiffies, warned + 60*HZ)) { 37 sizeof(struct pt_regs) + 128,
37 printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", 38
38 current->comm, curbase, regs->sp); 39 "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
39 show_stack(NULL,NULL); 40 current->comm, curbase, regs->sp);
40 warned = jiffies;
41 }
42}
43#endif 41#endif
42}
44 43
45/* 44/*
46 * do_IRQ handles all normal device IRQ's (the special 45 * do_IRQ handles all normal device IRQ's (the special
47 * SMP cross-CPU interrupts have their own specific 46 * SMP cross-CPU interrupts have their own specific
48 * handlers). 47 * handlers).
49 */ 48 */
50asmlinkage unsigned int do_IRQ(struct pt_regs *regs) 49asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
51{ 50{
52 struct pt_regs *old_regs = set_irq_regs(regs); 51 struct pt_regs *old_regs = set_irq_regs(regs);
53 struct irq_desc *desc; 52 struct irq_desc *desc;
@@ -60,9 +59,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
60 irq_enter(); 59 irq_enter();
61 irq = __get_cpu_var(vector_irq)[vector]; 60 irq = __get_cpu_var(vector_irq)[vector];
62 61
63#ifdef CONFIG_DEBUG_STACKOVERFLOW
64 stack_overflow_check(regs); 62 stack_overflow_check(regs);
65#endif
66 63
67 desc = irq_to_desc(irq); 64 desc = irq_to_desc(irq);
68 if (likely(desc)) 65 if (likely(desc))
@@ -83,40 +80,43 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
83} 80}
84 81
85#ifdef CONFIG_HOTPLUG_CPU 82#ifdef CONFIG_HOTPLUG_CPU
86void fixup_irqs(cpumask_t map) 83/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
84void fixup_irqs(void)
87{ 85{
88 unsigned int irq; 86 unsigned int irq;
89 static int warned; 87 static int warned;
90 struct irq_desc *desc; 88 struct irq_desc *desc;
91 89
92 for_each_irq_desc(irq, desc) { 90 for_each_irq_desc(irq, desc) {
93 cpumask_t mask;
94 int break_affinity = 0; 91 int break_affinity = 0;
95 int set_affinity = 1; 92 int set_affinity = 1;
93 const struct cpumask *affinity;
96 94
95 if (!desc)
96 continue;
97 if (irq == 2) 97 if (irq == 2)
98 continue; 98 continue;
99 99
100 /* interrupt's are disabled at this point */ 100 /* interrupt's are disabled at this point */
101 spin_lock(&desc->lock); 101 spin_lock(&desc->lock);
102 102
103 affinity = &desc->affinity;
103 if (!irq_has_action(irq) || 104 if (!irq_has_action(irq) ||
104 cpus_equal(desc->affinity, map)) { 105 cpumask_equal(affinity, cpu_online_mask)) {
105 spin_unlock(&desc->lock); 106 spin_unlock(&desc->lock);
106 continue; 107 continue;
107 } 108 }
108 109
109 cpus_and(mask, desc->affinity, map); 110 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
110 if (cpus_empty(mask)) {
111 break_affinity = 1; 111 break_affinity = 1;
112 mask = map; 112 affinity = cpu_all_mask;
113 } 113 }
114 114
115 if (desc->chip->mask) 115 if (desc->chip->mask)
116 desc->chip->mask(irq); 116 desc->chip->mask(irq);
117 117
118 if (desc->chip->set_affinity) 118 if (desc->chip->set_affinity)
119 desc->chip->set_affinity(irq, mask); 119 desc->chip->set_affinity(irq, affinity);
120 else if (!(warned++)) 120 else if (!(warned++))
121 set_affinity = 0; 121 set_affinity = 0;
122 122
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 845aa9803e80..84723295f88a 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)
68 /* 68 /*
69 * 16 old-style INTA-cycle interrupts: 69 * 16 old-style INTA-cycle interrupts:
70 */ 70 */
71 for (i = 0; i < 16; i++) { 71 for (i = 0; i < NR_IRQS_LEGACY; i++) {
72 /* first time call this irq_desc */
73 struct irq_desc *desc = irq_to_desc(i); 72 struct irq_desc *desc = irq_to_desc(i);
74 73
75 desc->status = IRQ_DISABLED; 74 desc->status = IRQ_DISABLED;
@@ -111,6 +110,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
111 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 110 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
112}; 111};
113 112
113int vector_used_by_percpu_irq(unsigned int vector)
114{
115 int cpu;
116
117 for_each_online_cpu(cpu) {
118 if (per_cpu(vector_irq, cpu)[vector] != -1)
119 return 1;
120 }
121
122 return 0;
123}
124
114/* Overridden in paravirt.c */ 125/* Overridden in paravirt.c */
115void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 126void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
116 127
@@ -129,7 +140,7 @@ void __init native_init_IRQ(void)
129 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 140 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
130 /* SYSCALL_VECTOR was reserved in trap_init. */ 141 /* SYSCALL_VECTOR was reserved in trap_init. */
131 if (i != SYSCALL_VECTOR) 142 if (i != SYSCALL_VECTOR)
132 set_intr_gate(i, interrupt[i]); 143 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
133 } 144 }
134 145
135 146
@@ -147,10 +158,12 @@ void __init native_init_IRQ(void)
147 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 158 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
148 159
149 /* IPI for single call function */ 160 /* IPI for single call function */
150 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); 161 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
162 call_function_single_interrupt);
151 163
152 /* Low priority IPI to cleanup after moving an irq */ 164 /* Low priority IPI to cleanup after moving an irq */
153 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 165 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
166 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
154#endif 167#endif
155 168
156#ifdef CONFIG_X86_LOCAL_APIC 169#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ff0235391285..31ebfe38e96c 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -24,41 +24,6 @@
24#include <asm/i8259.h> 24#include <asm/i8259.h>
25 25
26/* 26/*
27 * Common place to define all x86 IRQ vectors
28 *
29 * This builds up the IRQ handler stubs using some ugly macros in irq.h
30 *
31 * These macros create the low-level assembly IRQ routines that save
32 * register context and call do_IRQ(). do_IRQ() then does all the
33 * operations that are needed to keep the AT (or SMP IOAPIC)
34 * interrupt-controller happy.
35 */
36
37#define IRQ_NAME2(nr) nr##_interrupt(void)
38#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
39
40/*
41 * SMP has a few special interrupts for IPI messages
42 */
43
44#define BUILD_IRQ(nr) \
45 asmlinkage void IRQ_NAME(nr); \
46 asm("\n.text\n.p2align\n" \
47 "IRQ" #nr "_interrupt:\n\t" \
48 "push $~(" #nr ") ; " \
49 "jmp common_interrupt\n" \
50 ".previous");
51
52#define BI(x,y) \
53 BUILD_IRQ(x##y)
54
55#define BUILD_16_IRQS(x) \
56 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
57 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
58 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
59 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
60
61/*
62 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: 27 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
63 * (these are usually mapped to vectors 0x30-0x3f) 28 * (these are usually mapped to vectors 0x30-0x3f)
64 */ 29 */
@@ -73,37 +38,6 @@
73 * 38 *
74 * (these are usually mapped into the 0x30-0xff vector range) 39 * (these are usually mapped into the 0x30-0xff vector range)
75 */ 40 */
76 BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
77BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
78BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
79BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
80
81#undef BUILD_16_IRQS
82#undef BI
83
84
85#define IRQ(x,y) \
86 IRQ##x##y##_interrupt
87
88#define IRQLIST_16(x) \
89 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
90 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
91 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
92 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
93
94/* for the irq vectors */
95static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
96 IRQLIST_16(0x2), IRQLIST_16(0x3),
97 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
98 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
99 IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
100};
101
102#undef IRQ
103#undef IRQLIST_16
104
105
106
107 41
108/* 42/*
109 * IRQ2 is cascade interrupt to second interrupt controller 43 * IRQ2 is cascade interrupt to second interrupt controller
@@ -135,6 +69,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
135 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 69 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
136}; 70};
137 71
72int vector_used_by_percpu_irq(unsigned int vector)
73{
74 int cpu;
75
76 for_each_online_cpu(cpu) {
77 if (per_cpu(vector_irq, cpu)[vector] != -1)
78 return 1;
79 }
80
81 return 0;
82}
83
138void __init init_ISA_irqs(void) 84void __init init_ISA_irqs(void)
139{ 85{
140 int i; 86 int i;
@@ -142,8 +88,7 @@ void __init init_ISA_irqs(void)
142 init_bsp_APIC(); 88 init_bsp_APIC();
143 init_8259A(0); 89 init_8259A(0);
144 90
145 for (i = 0; i < 16; i++) { 91 for (i = 0; i < NR_IRQS_LEGACY; i++) {
146 /* first time call this irq_desc */
147 struct irq_desc *desc = irq_to_desc(i); 92 struct irq_desc *desc = irq_to_desc(i);
148 93
149 desc->status = IRQ_DISABLED; 94 desc->status = IRQ_DISABLED;
@@ -188,6 +133,7 @@ static void __init smp_intr_init(void)
188 133
189 /* Low priority IPI to cleanup after moving an irq */ 134 /* Low priority IPI to cleanup after moving an irq */
190 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 135 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
136 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
191#endif 137#endif
192} 138}
193 139
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e169ae9b6a62..652fce6d2cce 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void)
89 */ 89 */
90static unsigned long kvm_get_tsc_khz(void) 90static unsigned long kvm_get_tsc_khz(void)
91{ 91{
92 return preset_lpj; 92 struct pvclock_vcpu_time_info *src;
93 src = &per_cpu(hv_clock, 0);
94 return pvclock_tsc_khz(src);
93} 95}
94 96
95static void kvm_get_preset_lpj(void) 97static void kvm_get_preset_lpj(void)
96{ 98{
97 struct pvclock_vcpu_time_info *src;
98 unsigned long khz; 99 unsigned long khz;
99 u64 lpj; 100 u64 lpj;
100 101
101 src = &per_cpu(hv_clock, 0); 102 khz = kvm_get_tsc_khz();
102 khz = pvclock_tsc_khz(src);
103 103
104 lpj = ((u64)khz * 1000); 104 lpj = ((u64)khz * 1000);
105 do_div(lpj, HZ); 105 do_div(lpj, HZ);
@@ -194,5 +194,7 @@ void __init kvmclock_init(void)
194#endif 194#endif
195 kvm_get_preset_lpj(); 195 kvm_get_preset_lpj();
196 clocksource_register(&kvm_clock); 196 clocksource_register(&kvm_clock);
197 pv_info.paravirt_enabled = 1;
198 pv_info.name = "KVM";
197 } 199 }
198} 200}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index eee32b43fee3..71f1d99a635d 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -12,8 +12,8 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/uaccess.h>
15 16
16#include <asm/uaccess.h>
17#include <asm/system.h> 17#include <asm/system.h>
18#include <asm/ldt.h> 18#include <asm/ldt.h>
19#include <asm/desc.h> 19#include <asm/desc.h>
@@ -93,7 +93,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
93 if (err < 0) 93 if (err < 0)
94 return err; 94 return err;
95 95
96 for(i = 0; i < old->size; i++) 96 for (i = 0; i < old->size; i++)
97 write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); 97 write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
98 return 0; 98 return 0;
99} 99}
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 7a385746509a..37f420018a41 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -13,6 +13,7 @@
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/suspend.h> 15#include <linux/suspend.h>
16#include <linux/gfp.h>
16 17
17#include <asm/pgtable.h> 18#include <asm/pgtable.h>
18#include <asm/pgalloc.h> 19#include <asm/pgalloc.h>
@@ -25,15 +26,6 @@
25#include <asm/system.h> 26#include <asm/system.h>
26#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
27 28
28#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
29static u32 kexec_pgd[1024] PAGE_ALIGNED;
30#ifdef CONFIG_X86_PAE
31static u32 kexec_pmd0[1024] PAGE_ALIGNED;
32static u32 kexec_pmd1[1024] PAGE_ALIGNED;
33#endif
34static u32 kexec_pte0[1024] PAGE_ALIGNED;
35static u32 kexec_pte1[1024] PAGE_ALIGNED;
36
37static void set_idt(void *newidt, __u16 limit) 29static void set_idt(void *newidt, __u16 limit)
38{ 30{
39 struct desc_ptr curidt; 31 struct desc_ptr curidt;
@@ -76,6 +68,76 @@ static void load_segments(void)
76#undef __STR 68#undef __STR
77} 69}
78 70
71static void machine_kexec_free_page_tables(struct kimage *image)
72{
73 free_page((unsigned long)image->arch.pgd);
74#ifdef CONFIG_X86_PAE
75 free_page((unsigned long)image->arch.pmd0);
76 free_page((unsigned long)image->arch.pmd1);
77#endif
78 free_page((unsigned long)image->arch.pte0);
79 free_page((unsigned long)image->arch.pte1);
80}
81
82static int machine_kexec_alloc_page_tables(struct kimage *image)
83{
84 image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
85#ifdef CONFIG_X86_PAE
86 image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
87 image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
88#endif
89 image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
90 image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
91 if (!image->arch.pgd ||
92#ifdef CONFIG_X86_PAE
93 !image->arch.pmd0 || !image->arch.pmd1 ||
94#endif
95 !image->arch.pte0 || !image->arch.pte1) {
96 machine_kexec_free_page_tables(image);
97 return -ENOMEM;
98 }
99 return 0;
100}
101
102static void machine_kexec_page_table_set_one(
103 pgd_t *pgd, pmd_t *pmd, pte_t *pte,
104 unsigned long vaddr, unsigned long paddr)
105{
106 pud_t *pud;
107
108 pgd += pgd_index(vaddr);
109#ifdef CONFIG_X86_PAE
110 if (!(pgd_val(*pgd) & _PAGE_PRESENT))
111 set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
112#endif
113 pud = pud_offset(pgd, vaddr);
114 pmd = pmd_offset(pud, vaddr);
115 if (!(pmd_val(*pmd) & _PAGE_PRESENT))
116 set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
117 pte = pte_offset_kernel(pmd, vaddr);
118 set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
119}
120
121static void machine_kexec_prepare_page_tables(struct kimage *image)
122{
123 void *control_page;
124 pmd_t *pmd = 0;
125
126 control_page = page_address(image->control_code_page);
127#ifdef CONFIG_X86_PAE
128 pmd = image->arch.pmd0;
129#endif
130 machine_kexec_page_table_set_one(
131 image->arch.pgd, pmd, image->arch.pte0,
132 (unsigned long)control_page, __pa(control_page));
133#ifdef CONFIG_X86_PAE
134 pmd = image->arch.pmd1;
135#endif
136 machine_kexec_page_table_set_one(
137 image->arch.pgd, pmd, image->arch.pte1,
138 __pa(control_page), __pa(control_page));
139}
140
79/* 141/*
80 * A architecture hook called to validate the 142 * A architecture hook called to validate the
81 * proposed image and prepare the control pages 143 * proposed image and prepare the control pages
@@ -87,12 +149,20 @@ static void load_segments(void)
87 * reboot code buffer to allow us to avoid allocations 149 * reboot code buffer to allow us to avoid allocations
88 * later. 150 * later.
89 * 151 *
90 * Make control page executable. 152 * - Make control page executable.
153 * - Allocate page tables
154 * - Setup page tables
91 */ 155 */
92int machine_kexec_prepare(struct kimage *image) 156int machine_kexec_prepare(struct kimage *image)
93{ 157{
158 int error;
159
94 if (nx_enabled) 160 if (nx_enabled)
95 set_pages_x(image->control_code_page, 1); 161 set_pages_x(image->control_code_page, 1);
162 error = machine_kexec_alloc_page_tables(image);
163 if (error)
164 return error;
165 machine_kexec_prepare_page_tables(image);
96 return 0; 166 return 0;
97} 167}
98 168
@@ -104,6 +174,7 @@ void machine_kexec_cleanup(struct kimage *image)
104{ 174{
105 if (nx_enabled) 175 if (nx_enabled)
106 set_pages_nx(image->control_code_page, 1); 176 set_pages_nx(image->control_code_page, 1);
177 machine_kexec_free_page_tables(image);
107} 178}
108 179
109/* 180/*
@@ -150,18 +221,7 @@ void machine_kexec(struct kimage *image)
150 relocate_kernel_ptr = control_page; 221 relocate_kernel_ptr = control_page;
151 page_list[PA_CONTROL_PAGE] = __pa(control_page); 222 page_list[PA_CONTROL_PAGE] = __pa(control_page);
152 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; 223 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
153 page_list[PA_PGD] = __pa(kexec_pgd); 224 page_list[PA_PGD] = __pa(image->arch.pgd);
154 page_list[VA_PGD] = (unsigned long)kexec_pgd;
155#ifdef CONFIG_X86_PAE
156 page_list[PA_PMD_0] = __pa(kexec_pmd0);
157 page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
158 page_list[PA_PMD_1] = __pa(kexec_pmd1);
159 page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
160#endif
161 page_list[PA_PTE_0] = __pa(kexec_pte0);
162 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
163 page_list[PA_PTE_1] = __pa(kexec_pte1);
164 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
165 225
166 if (image->type == KEXEC_TYPE_DEFAULT) 226 if (image->type == KEXEC_TYPE_DEFAULT)
167 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) 227 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 3b599518c322..c12314c9e86f 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -287,7 +287,7 @@ static struct clock_event_device mfgpt_clockevent = {
287 .set_mode = mfgpt_set_mode, 287 .set_mode = mfgpt_set_mode,
288 .set_next_event = mfgpt_next_event, 288 .set_next_event = mfgpt_next_event,
289 .rating = 250, 289 .rating = 250,
290 .cpumask = CPU_MASK_ALL, 290 .cpumask = cpu_all_mask,
291 .shift = 32 291 .shift = 32
292}; 292};
293 293
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 5f8e5d75a254..c25fdb382292 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -10,7 +10,7 @@
10 * This driver allows to upgrade microcode on AMD 10 * This driver allows to upgrade microcode on AMD
11 * family 0x10 and 0x11 processors. 11 * family 0x10 and 0x11 processors.
12 * 12 *
13 * Licensed unter the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15*/ 15*/
16 16
@@ -32,9 +32,9 @@
32#include <linux/platform_device.h> 32#include <linux/platform_device.h>
33#include <linux/pci.h> 33#include <linux/pci.h>
34#include <linux/pci_ids.h> 34#include <linux/pci_ids.h>
35#include <linux/uaccess.h>
35 36
36#include <asm/msr.h> 37#include <asm/msr.h>
37#include <asm/uaccess.h>
38#include <asm/processor.h> 38#include <asm/processor.h>
39#include <asm/microcode.h> 39#include <asm/microcode.h>
40 40
@@ -47,43 +47,38 @@ MODULE_LICENSE("GPL v2");
47#define UCODE_UCODE_TYPE 0x00000001 47#define UCODE_UCODE_TYPE 0x00000001
48 48
49struct equiv_cpu_entry { 49struct equiv_cpu_entry {
50 unsigned int installed_cpu; 50 u32 installed_cpu;
51 unsigned int fixed_errata_mask; 51 u32 fixed_errata_mask;
52 unsigned int fixed_errata_compare; 52 u32 fixed_errata_compare;
53 unsigned int equiv_cpu; 53 u16 equiv_cpu;
54}; 54 u16 res;
55} __attribute__((packed));
55 56
56struct microcode_header_amd { 57struct microcode_header_amd {
57 unsigned int data_code; 58 u32 data_code;
58 unsigned int patch_id; 59 u32 patch_id;
59 unsigned char mc_patch_data_id[2]; 60 u16 mc_patch_data_id;
60 unsigned char mc_patch_data_len; 61 u8 mc_patch_data_len;
61 unsigned char init_flag; 62 u8 init_flag;
62 unsigned int mc_patch_data_checksum; 63 u32 mc_patch_data_checksum;
63 unsigned int nb_dev_id; 64 u32 nb_dev_id;
64 unsigned int sb_dev_id; 65 u32 sb_dev_id;
65 unsigned char processor_rev_id[2]; 66 u16 processor_rev_id;
66 unsigned char nb_rev_id; 67 u8 nb_rev_id;
67 unsigned char sb_rev_id; 68 u8 sb_rev_id;
68 unsigned char bios_api_rev; 69 u8 bios_api_rev;
69 unsigned char reserved1[3]; 70 u8 reserved1[3];
70 unsigned int match_reg[8]; 71 u32 match_reg[8];
71}; 72} __attribute__((packed));
72 73
73struct microcode_amd { 74struct microcode_amd {
74 struct microcode_header_amd hdr; 75 struct microcode_header_amd hdr;
75 unsigned int mpb[0]; 76 unsigned int mpb[0];
76}; 77};
77 78
78#define UCODE_MAX_SIZE (2048) 79#define UCODE_MAX_SIZE 2048
79#define DEFAULT_UCODE_DATASIZE (896) 80#define UCODE_CONTAINER_SECTION_HDR 8
80#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) 81#define UCODE_CONTAINER_HEADER_SIZE 12
81#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
82#define DWSIZE (sizeof(u32))
83/* For now we support a fixed ucode total size only */
84#define get_totalsize(mc) \
85 ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
86 + MC_HEADER_SIZE)
87 82
88/* serialize access to the physical write */ 83/* serialize access to the physical write */
89static DEFINE_SPINLOCK(microcode_update_lock); 84static DEFINE_SPINLOCK(microcode_update_lock);
@@ -93,31 +88,24 @@ static struct equiv_cpu_entry *equiv_cpu_table;
93static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 88static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
94{ 89{
95 struct cpuinfo_x86 *c = &cpu_data(cpu); 90 struct cpuinfo_x86 *c = &cpu_data(cpu);
91 u32 dummy;
96 92
97 memset(csig, 0, sizeof(*csig)); 93 memset(csig, 0, sizeof(*csig));
98
99 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 94 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
100 printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", 95 printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
101 cpu); 96 "supported\n", cpu, c->x86);
102 return -1; 97 return -1;
103 } 98 }
104 99 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
105 asm volatile("movl %1, %%ecx; rdmsr" 100 printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev);
106 : "=a" (csig->rev)
107 : "i" (0x0000008B) : "ecx");
108
109 printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
110 csig->rev);
111
112 return 0; 101 return 0;
113} 102}
114 103
115static int get_matching_microcode(int cpu, void *mc, int rev) 104static int get_matching_microcode(int cpu, void *mc, int rev)
116{ 105{
117 struct microcode_header_amd *mc_header = mc; 106 struct microcode_header_amd *mc_header = mc;
118 struct pci_dev *nb_pci_dev, *sb_pci_dev;
119 unsigned int current_cpu_id; 107 unsigned int current_cpu_id;
120 unsigned int equiv_cpu_id = 0x00; 108 u16 equiv_cpu_id = 0;
121 unsigned int i = 0; 109 unsigned int i = 0;
122 110
123 BUG_ON(equiv_cpu_table == NULL); 111 BUG_ON(equiv_cpu_table == NULL);
@@ -132,57 +120,25 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
132 } 120 }
133 121
134 if (!equiv_cpu_id) { 122 if (!equiv_cpu_id) {
135 printk(KERN_ERR "microcode: CPU%d cpu_id " 123 printk(KERN_WARNING "microcode: CPU%d: cpu revision "
136 "not found in equivalent cpu table \n", cpu); 124 "not listed in equivalent cpu table\n", cpu);
137 return 0; 125 return 0;
138 } 126 }
139 127
140 if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) { 128 if (mc_header->processor_rev_id != equiv_cpu_id) {
141 printk(KERN_ERR 129 printk(KERN_ERR "microcode: CPU%d: patch mismatch "
142 "microcode: CPU%d patch does not match " 130 "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
143 "(patch is %x, cpu extended is %x) \n", 131 cpu, mc_header->processor_rev_id, equiv_cpu_id);
144 cpu, mc_header->processor_rev_id[0],
145 (equiv_cpu_id & 0xff));
146 return 0; 132 return 0;
147 } 133 }
148 134
149 if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) { 135 /* ucode might be chipset specific -- currently we don't support this */
150 printk(KERN_ERR "microcode: CPU%d patch does not match " 136 if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
151 "(patch is %x, cpu base id is %x) \n", 137 printk(KERN_ERR "microcode: CPU%d: loading of chipset "
152 cpu, mc_header->processor_rev_id[1], 138 "specific code not yet supported\n", cpu);
153 ((equiv_cpu_id >> 16) & 0xff));
154
155 return 0; 139 return 0;
156 } 140 }
157 141
158 /* ucode may be northbridge specific */
159 if (mc_header->nb_dev_id) {
160 nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
161 (mc_header->nb_dev_id & 0xff),
162 NULL);
163 if ((!nb_pci_dev) ||
164 (mc_header->nb_rev_id != nb_pci_dev->revision)) {
165 printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
166 pci_dev_put(nb_pci_dev);
167 return 0;
168 }
169 pci_dev_put(nb_pci_dev);
170 }
171
172 /* ucode may be southbridge specific */
173 if (mc_header->sb_dev_id) {
174 sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
175 (mc_header->sb_dev_id & 0xff),
176 NULL);
177 if ((!sb_pci_dev) ||
178 (mc_header->sb_rev_id != sb_pci_dev->revision)) {
179 printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
180 pci_dev_put(sb_pci_dev);
181 return 0;
182 }
183 pci_dev_put(sb_pci_dev);
184 }
185
186 if (mc_header->patch_id <= rev) 142 if (mc_header->patch_id <= rev)
187 return 0; 143 return 0;
188 144
@@ -192,12 +148,10 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
192static void apply_microcode_amd(int cpu) 148static void apply_microcode_amd(int cpu)
193{ 149{
194 unsigned long flags; 150 unsigned long flags;
195 unsigned int eax, edx; 151 u32 rev, dummy;
196 unsigned int rev;
197 int cpu_num = raw_smp_processor_id(); 152 int cpu_num = raw_smp_processor_id();
198 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; 153 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
199 struct microcode_amd *mc_amd = uci->mc; 154 struct microcode_amd *mc_amd = uci->mc;
200 unsigned long addr;
201 155
202 /* We should bind the task to the CPU */ 156 /* We should bind the task to the CPU */
203 BUG_ON(cpu_num != cpu); 157 BUG_ON(cpu_num != cpu);
@@ -206,42 +160,34 @@ static void apply_microcode_amd(int cpu)
206 return; 160 return;
207 161
208 spin_lock_irqsave(&microcode_update_lock, flags); 162 spin_lock_irqsave(&microcode_update_lock, flags);
209 163 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
210 addr = (unsigned long)&mc_amd->hdr.data_code;
211 edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
212 eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
213
214 asm volatile("movl %0, %%ecx; wrmsr" :
215 : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
216
217 /* get patch id after patching */ 164 /* get patch id after patching */
218 asm volatile("movl %1, %%ecx; rdmsr" 165 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
219 : "=a" (rev)
220 : "i" (0x0000008B) : "ecx");
221
222 spin_unlock_irqrestore(&microcode_update_lock, flags); 166 spin_unlock_irqrestore(&microcode_update_lock, flags);
223 167
224 /* check current patch id and patch's id for match */ 168 /* check current patch id and patch's id for match */
225 if (rev != mc_amd->hdr.patch_id) { 169 if (rev != mc_amd->hdr.patch_id) {
226 printk(KERN_ERR "microcode: CPU%d update from revision " 170 printk(KERN_ERR "microcode: CPU%d: update failed "
227 "0x%x to 0x%x failed\n", cpu_num, 171 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
228 mc_amd->hdr.patch_id, rev);
229 return; 172 return;
230 } 173 }
231 174
232 printk(KERN_INFO "microcode: CPU%d updated from revision " 175 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
233 "0x%x to 0x%x \n", 176 cpu, rev);
234 cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
235 177
236 uci->cpu_sig.rev = rev; 178 uci->cpu_sig.rev = rev;
237} 179}
238 180
239static void * get_next_ucode(u8 *buf, unsigned int size, 181static int get_ucode_data(void *to, const u8 *from, size_t n)
240 int (*get_ucode_data)(void *, const void *, size_t), 182{
241 unsigned int *mc_size) 183 memcpy(to, from, n);
184 return 0;
185}
186
187static void *get_next_ucode(const u8 *buf, unsigned int size,
188 unsigned int *mc_size)
242{ 189{
243 unsigned int total_size; 190 unsigned int total_size;
244#define UCODE_CONTAINER_SECTION_HDR 8
245 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 191 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
246 void *mc; 192 void *mc;
247 193
@@ -249,39 +195,37 @@ static void * get_next_ucode(u8 *buf, unsigned int size,
249 return NULL; 195 return NULL;
250 196
251 if (section_hdr[0] != UCODE_UCODE_TYPE) { 197 if (section_hdr[0] != UCODE_UCODE_TYPE) {
252 printk(KERN_ERR "microcode: error! " 198 printk(KERN_ERR "microcode: error: invalid type field in "
253 "Wrong microcode payload type field\n"); 199 "container file section header\n");
254 return NULL; 200 return NULL;
255 } 201 }
256 202
257 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 203 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
258 204
259 printk(KERN_INFO "microcode: size %u, total_size %u\n", 205 printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
260 size, total_size); 206 size, total_size);
261 207
262 if (total_size > size || total_size > UCODE_MAX_SIZE) { 208 if (total_size > size || total_size > UCODE_MAX_SIZE) {
263 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); 209 printk(KERN_ERR "microcode: error: size mismatch\n");
264 return NULL; 210 return NULL;
265 } 211 }
266 212
267 mc = vmalloc(UCODE_MAX_SIZE); 213 mc = vmalloc(UCODE_MAX_SIZE);
268 if (mc) { 214 if (mc) {
269 memset(mc, 0, UCODE_MAX_SIZE); 215 memset(mc, 0, UCODE_MAX_SIZE);
270 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { 216 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR,
217 total_size)) {
271 vfree(mc); 218 vfree(mc);
272 mc = NULL; 219 mc = NULL;
273 } else 220 } else
274 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; 221 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
275 } 222 }
276#undef UCODE_CONTAINER_SECTION_HDR
277 return mc; 223 return mc;
278} 224}
279 225
280 226
281static int install_equiv_cpu_table(u8 *buf, 227static int install_equiv_cpu_table(const u8 *buf)
282 int (*get_ucode_data)(void *, const void *, size_t))
283{ 228{
284#define UCODE_CONTAINER_HEADER_SIZE 12
285 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; 229 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
286 unsigned int *buf_pos = (unsigned int *)container_hdr; 230 unsigned int *buf_pos = (unsigned int *)container_hdr;
287 unsigned long size; 231 unsigned long size;
@@ -292,14 +236,15 @@ static int install_equiv_cpu_table(u8 *buf,
292 size = buf_pos[2]; 236 size = buf_pos[2];
293 237
294 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { 238 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
295 printk(KERN_ERR "microcode: error! " 239 printk(KERN_ERR "microcode: error: invalid type field in "
296 "Wrong microcode equivalnet cpu table\n"); 240 "container file section header\n");
297 return 0; 241 return 0;
298 } 242 }
299 243
300 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 244 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
301 if (!equiv_cpu_table) { 245 if (!equiv_cpu_table) {
302 printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n"); 246 printk(KERN_ERR "microcode: failed to allocate "
247 "equivalent CPU table\n");
303 return 0; 248 return 0;
304 } 249 }
305 250
@@ -310,7 +255,6 @@ static int install_equiv_cpu_table(u8 *buf,
310 } 255 }
311 256
312 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 257 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
313#undef UCODE_CONTAINER_HEADER_SIZE
314} 258}
315 259
316static void free_equiv_cpu_table(void) 260static void free_equiv_cpu_table(void)
@@ -321,18 +265,20 @@ static void free_equiv_cpu_table(void)
321 } 265 }
322} 266}
323 267
324static int generic_load_microcode(int cpu, void *data, size_t size, 268static int generic_load_microcode(int cpu, const u8 *data, size_t size)
325 int (*get_ucode_data)(void *, const void *, size_t))
326{ 269{
327 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 270 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
328 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 271 const u8 *ucode_ptr = data;
272 void *new_mc = NULL;
273 void *mc;
329 int new_rev = uci->cpu_sig.rev; 274 int new_rev = uci->cpu_sig.rev;
330 unsigned int leftover; 275 unsigned int leftover;
331 unsigned long offset; 276 unsigned long offset;
332 277
333 offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data); 278 offset = install_equiv_cpu_table(ucode_ptr);
334 if (!offset) { 279 if (!offset) {
335 printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); 280 printk(KERN_ERR "microcode: failed to create "
281 "equivalent cpu table\n");
336 return -EINVAL; 282 return -EINVAL;
337 } 283 }
338 284
@@ -343,7 +289,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
343 unsigned int uninitialized_var(mc_size); 289 unsigned int uninitialized_var(mc_size);
344 struct microcode_header_amd *mc_header; 290 struct microcode_header_amd *mc_header;
345 291
346 mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); 292 mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
347 if (!mc) 293 if (!mc)
348 break; 294 break;
349 295
@@ -353,7 +299,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
353 vfree(new_mc); 299 vfree(new_mc);
354 new_rev = mc_header->patch_id; 300 new_rev = mc_header->patch_id;
355 new_mc = mc; 301 new_mc = mc;
356 } else 302 } else
357 vfree(mc); 303 vfree(mc);
358 304
359 ucode_ptr += mc_size; 305 ucode_ptr += mc_size;
@@ -365,9 +311,9 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
365 if (uci->mc) 311 if (uci->mc)
366 vfree(uci->mc); 312 vfree(uci->mc);
367 uci->mc = new_mc; 313 uci->mc = new_mc;
368 pr_debug("microcode: CPU%d found a matching microcode update with" 314 pr_debug("microcode: CPU%d found a matching microcode "
369 " version 0x%x (current=0x%x)\n", 315 "update with version 0x%x (current=0x%x)\n",
370 cpu, new_rev, uci->cpu_sig.rev); 316 cpu, new_rev, uci->cpu_sig.rev);
371 } else 317 } else
372 vfree(new_mc); 318 vfree(new_mc);
373 } 319 }
@@ -377,12 +323,6 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
377 return (int)leftover; 323 return (int)leftover;
378} 324}
379 325
380static int get_ucode_fw(void *to, const void *from, size_t n)
381{
382 memcpy(to, from, n);
383 return 0;
384}
385
386static int request_microcode_fw(int cpu, struct device *device) 326static int request_microcode_fw(int cpu, struct device *device)
387{ 327{
388 const char *fw_name = "amd-ucode/microcode_amd.bin"; 328 const char *fw_name = "amd-ucode/microcode_amd.bin";
@@ -394,12 +334,11 @@ static int request_microcode_fw(int cpu, struct device *device)
394 334
395 ret = request_firmware(&firmware, fw_name, device); 335 ret = request_firmware(&firmware, fw_name, device);
396 if (ret) { 336 if (ret) {
397 printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name); 337 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
398 return ret; 338 return ret;
399 } 339 }
400 340
401 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, 341 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
402 &get_ucode_fw);
403 342
404 release_firmware(firmware); 343 release_firmware(firmware);
405 344
@@ -408,8 +347,8 @@ static int request_microcode_fw(int cpu, struct device *device)
408 347
409static int request_microcode_user(int cpu, const void __user *buf, size_t size) 348static int request_microcode_user(int cpu, const void __user *buf, size_t size)
410{ 349{
411 printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode" 350 printk(KERN_INFO "microcode: AMD microcode update via "
412 "is not supported\n"); 351 "/dev/cpu/microcode not supported\n");
413 return -1; 352 return -1;
414} 353}
415 354
@@ -433,3 +372,4 @@ struct microcode_ops * __init init_amd_microcode(void)
433{ 372{
434 return &microcode_amd_ops; 373 return &microcode_amd_ops;
435} 374}
375
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 82fb2809ce32..c9b721ba968c 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -99,7 +99,7 @@ MODULE_LICENSE("GPL");
99 99
100#define MICROCODE_VERSION "2.00" 100#define MICROCODE_VERSION "2.00"
101 101
102struct microcode_ops *microcode_ops; 102static struct microcode_ops *microcode_ops;
103 103
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ 104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
105static DEFINE_MUTEX(microcode_mutex); 105static DEFINE_MUTEX(microcode_mutex);
@@ -203,7 +203,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
203#endif 203#endif
204 204
205/* fake device for request_firmware */ 205/* fake device for request_firmware */
206struct platform_device *microcode_pdev; 206static struct platform_device *microcode_pdev;
207 207
208static ssize_t reload_store(struct sys_device *dev, 208static ssize_t reload_store(struct sys_device *dev,
209 struct sysdev_attribute *attr, 209 struct sysdev_attribute *attr,
@@ -272,13 +272,18 @@ static struct attribute_group mc_attr_group = {
272 .name = "microcode", 272 .name = "microcode",
273}; 273};
274 274
275static void microcode_fini_cpu(int cpu) 275static void __microcode_fini_cpu(int cpu)
276{ 276{
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278 278
279 mutex_lock(&microcode_mutex);
280 microcode_ops->microcode_fini_cpu(cpu); 279 microcode_ops->microcode_fini_cpu(cpu);
281 uci->valid = 0; 280 uci->valid = 0;
281}
282
283static void microcode_fini_cpu(int cpu)
284{
285 mutex_lock(&microcode_mutex);
286 __microcode_fini_cpu(cpu);
282 mutex_unlock(&microcode_mutex); 287 mutex_unlock(&microcode_mutex);
283} 288}
284 289
@@ -306,12 +311,16 @@ static int microcode_resume_cpu(int cpu)
306 * to this cpu (a bit of paranoia): 311 * to this cpu (a bit of paranoia):
307 */ 312 */
308 if (microcode_ops->collect_cpu_info(cpu, &nsig)) { 313 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
309 microcode_fini_cpu(cpu); 314 __microcode_fini_cpu(cpu);
315 printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
316 cpu);
310 return -1; 317 return -1;
311 } 318 }
312 319
313 if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { 320 if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) {
314 microcode_fini_cpu(cpu); 321 __microcode_fini_cpu(cpu);
322 printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n",
323 cpu);
315 /* Should we look for a new ucode here? */ 324 /* Should we look for a new ucode here? */
316 return 1; 325 return 1;
317 } 326 }
@@ -319,7 +328,7 @@ static int microcode_resume_cpu(int cpu)
319 return 0; 328 return 0;
320} 329}
321 330
322void microcode_update_cpu(int cpu) 331static void microcode_update_cpu(int cpu)
323{ 332{
324 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 333 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
325 int err = 0; 334 int err = 0;
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 622dc4a21784..b7f4c929e615 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -155,6 +155,7 @@ static DEFINE_SPINLOCK(microcode_update_lock);
155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) 155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
156{ 156{
157 struct cpuinfo_x86 *c = &cpu_data(cpu_num); 157 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
158 unsigned long flags;
158 unsigned int val[2]; 159 unsigned int val[2];
159 160
160 memset(csig, 0, sizeof(*csig)); 161 memset(csig, 0, sizeof(*csig));
@@ -174,11 +175,16 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
174 csig->pf = 1 << ((val[1] >> 18) & 7); 175 csig->pf = 1 << ((val[1] >> 18) & 7);
175 } 176 }
176 177
178 /* serialize access to the physical write to MSR 0x79 */
179 spin_lock_irqsave(&microcode_update_lock, flags);
180
177 wrmsr(MSR_IA32_UCODE_REV, 0, 0); 181 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
178 /* see notes above for revision 1.07. Apparent chip bug */ 182 /* see notes above for revision 1.07. Apparent chip bug */
179 sync_core(); 183 sync_core();
180 /* get the current revision from MSR 0x8B */ 184 /* get the current revision from MSR 0x8B */
181 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 185 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
186 spin_unlock_irqrestore(&microcode_update_lock, flags);
187
182 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", 188 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
183 csig->sig, csig->pf, csig->rev); 189 csig->sig, csig->pf, csig->rev);
184 190
@@ -465,7 +471,7 @@ static void microcode_fini_cpu(int cpu)
465 uci->mc = NULL; 471 uci->mc = NULL;
466} 472}
467 473
468struct microcode_ops microcode_intel_ops = { 474static struct microcode_ops microcode_intel_ops = {
469 .request_microcode_user = request_microcode_user, 475 .request_microcode_user = request_microcode_user,
470 .request_microcode_fw = request_microcode_fw, 476 .request_microcode_fw = request_microcode_fw,
471 .collect_cpu_info = collect_cpu_info, 477 .collect_cpu_info = collect_cpu_info,
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index efc2f361fe85..666e43df51f9 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -13,8 +13,7 @@
13#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/acpi.h> 14#include <asm/acpi.h>
15#include <asm/mmconfig.h> 15#include <asm/mmconfig.h>
16 16#include <asm/pci_x86.h>
17#include "../pci/pci.h"
18 17
19struct pci_hostbridge_probe { 18struct pci_hostbridge_probe {
20 u32 bus; 19 u32 bus;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 0f4c1fd5a1f4..c5c5b8df1dbc 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -16,14 +16,14 @@
16#include <linux/bitops.h> 16#include <linux/bitops.h>
17#include <linux/acpi.h> 17#include <linux/acpi.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp.h>
20#include <linux/acpi.h>
19 21
20#include <asm/smp.h>
21#include <asm/mtrr.h> 22#include <asm/mtrr.h>
22#include <asm/mpspec.h> 23#include <asm/mpspec.h>
23#include <asm/pgalloc.h> 24#include <asm/pgalloc.h>
24#include <asm/io_apic.h> 25#include <asm/io_apic.h>
25#include <asm/proto.h> 26#include <asm/proto.h>
26#include <asm/acpi.h>
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/e820.h> 28#include <asm/e820.h>
29#include <asm/trampoline.h> 29#include <asm/trampoline.h>
@@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
95#endif 95#endif
96 96
97 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { 97 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
98 set_bit(m->mpc_busid, mp_bus_not_pci); 98 set_bit(m->mpc_busid, mp_bus_not_pci);
99#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 99#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
100 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
101#endif 101#endif
102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
@@ -104,7 +104,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
104 x86_quirks->mpc_oem_pci_bus(m); 104 x86_quirks->mpc_oem_pci_bus(m);
105 105
106 clear_bit(m->mpc_busid, mp_bus_not_pci); 106 clear_bit(m->mpc_busid, mp_bus_not_pci);
107#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 107#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
108 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 108 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
109 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { 109 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
110 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; 110 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
@@ -586,26 +586,23 @@ static void __init __get_smp_config(unsigned int early)
586{ 586{
587 struct intel_mp_floating *mpf = mpf_found; 587 struct intel_mp_floating *mpf = mpf_found;
588 588
589 if (x86_quirks->mach_get_smp_config) { 589 if (!mpf)
590 if (x86_quirks->mach_get_smp_config(early)) 590 return;
591 return; 591
592 }
593 if (acpi_lapic && early) 592 if (acpi_lapic && early)
594 return; 593 return;
594
595 /* 595 /*
596 * ACPI supports both logical (e.g. Hyper-Threading) and physical 596 * MPS doesn't support hyperthreading, aka only have
597 * processors, where MPS only supports physical. 597 * thread 0 apic id in MPS table
598 */ 598 */
599 if (acpi_lapic && acpi_ioapic) { 599 if (acpi_lapic && acpi_ioapic)
600 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
601 "information\n");
602 return; 600 return;
603 } else if (acpi_lapic)
604 printk(KERN_INFO "Using ACPI for processor (LAPIC) "
605 "configuration information\n");
606 601
607 if (!mpf) 602 if (x86_quirks->mach_get_smp_config) {
608 return; 603 if (x86_quirks->mach_get_smp_config(early))
604 return;
605 }
609 606
610 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 607 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
611 mpf->mpf_specification); 608 mpf->mpf_specification);
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 82a7c7ed6d45..726266695b2c 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -136,7 +136,7 @@ static int msr_open(struct inode *inode, struct file *file)
136 lock_kernel(); 136 lock_kernel();
137 cpu = iminor(file->f_path.dentry->d_inode); 137 cpu = iminor(file->f_path.dentry->d_inode);
138 138
139 if (cpu >= NR_CPUS || !cpu_online(cpu)) { 139 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
140 ret = -ENXIO; /* No such CPU */ 140 ret = -ENXIO; /* No such CPU */
141 goto out; 141 goto out;
142 } 142 }
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 2c97f07f1c2c..45a09ccdc214 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -26,11 +26,10 @@
26#include <linux/kernel_stat.h> 26#include <linux/kernel_stat.h>
27#include <linux/kdebug.h> 27#include <linux/kdebug.h>
28#include <linux/smp.h> 28#include <linux/smp.h>
29#include <linux/nmi.h>
29 30
30#include <asm/i8259.h> 31#include <asm/i8259.h>
31#include <asm/io_apic.h> 32#include <asm/io_apic.h>
32#include <asm/smp.h>
33#include <asm/nmi.h>
34#include <asm/proto.h> 33#include <asm/proto.h>
35#include <asm/timer.h> 34#include <asm/timer.h>
36 35
@@ -131,6 +130,11 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count)
131 atomic_dec(&nmi_active); 130 atomic_dec(&nmi_active);
132} 131}
133 132
133static void __acpi_nmi_disable(void *__unused)
134{
135 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
136}
137
134int __init check_nmi_watchdog(void) 138int __init check_nmi_watchdog(void)
135{ 139{
136 unsigned int *prev_nmi_count; 140 unsigned int *prev_nmi_count;
@@ -179,8 +183,12 @@ int __init check_nmi_watchdog(void)
179 kfree(prev_nmi_count); 183 kfree(prev_nmi_count);
180 return 0; 184 return 0;
181error: 185error:
182 if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259) 186 if (nmi_watchdog == NMI_IO_APIC) {
183 disable_8259A_irq(0); 187 if (!timer_through_8259)
188 disable_8259A_irq(0);
189 on_each_cpu(__acpi_nmi_disable, NULL, 1);
190 }
191
184#ifdef CONFIG_X86_32 192#ifdef CONFIG_X86_32
185 timer_ack = 0; 193 timer_ack = 0;
186#endif 194#endif
@@ -199,12 +207,17 @@ static int __init setup_nmi_watchdog(char *str)
199 ++str; 207 ++str;
200 } 208 }
201 209
202 get_option(&str, &nmi); 210 if (!strncmp(str, "lapic", 5))
203 211 nmi_watchdog = NMI_LOCAL_APIC;
204 if (nmi >= NMI_INVALID) 212 else if (!strncmp(str, "ioapic", 6))
205 return 0; 213 nmi_watchdog = NMI_IO_APIC;
214 else {
215 get_option(&str, &nmi);
216 if (nmi >= NMI_INVALID)
217 return 0;
218 nmi_watchdog = nmi;
219 }
206 220
207 nmi_watchdog = nmi;
208 return 1; 221 return 1;
209} 222}
210__setup("nmi_watchdog=", setup_nmi_watchdog); 223__setup("nmi_watchdog=", setup_nmi_watchdog);
@@ -285,11 +298,6 @@ void acpi_nmi_enable(void)
285 on_each_cpu(__acpi_nmi_enable, NULL, 1); 298 on_each_cpu(__acpi_nmi_enable, NULL, 1);
286} 299}
287 300
288static void __acpi_nmi_disable(void *__unused)
289{
290 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
291}
292
293/* 301/*
294 * Disable timer based NMIs on all CPUs: 302 * Disable timer based NMIs on all CPUs:
295 */ 303 */
@@ -340,6 +348,8 @@ void stop_apic_nmi_watchdog(void *unused)
340 return; 348 return;
341 if (nmi_watchdog == NMI_LOCAL_APIC) 349 if (nmi_watchdog == NMI_LOCAL_APIC)
342 lapic_watchdog_stop(); 350 lapic_watchdog_stop();
351 else
352 __acpi_nmi_disable(NULL);
343 __get_cpu_var(wd_enabled) = 0; 353 __get_cpu_var(wd_enabled) = 0;
344 atomic_dec(&nmi_active); 354 atomic_dec(&nmi_active);
345} 355}
@@ -465,6 +475,24 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
465 475
466#ifdef CONFIG_SYSCTL 476#ifdef CONFIG_SYSCTL
467 477
478static void enable_ioapic_nmi_watchdog_single(void *unused)
479{
480 __get_cpu_var(wd_enabled) = 1;
481 atomic_inc(&nmi_active);
482 __acpi_nmi_enable(NULL);
483}
484
485static void enable_ioapic_nmi_watchdog(void)
486{
487 on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
488 touch_nmi_watchdog();
489}
490
491static void disable_ioapic_nmi_watchdog(void)
492{
493 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
494}
495
468static int __init setup_unknown_nmi_panic(char *str) 496static int __init setup_unknown_nmi_panic(char *str)
469{ 497{
470 unknown_nmi_panic = 1; 498 unknown_nmi_panic = 1;
@@ -507,6 +535,11 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
507 enable_lapic_nmi_watchdog(); 535 enable_lapic_nmi_watchdog();
508 else 536 else
509 disable_lapic_nmi_watchdog(); 537 disable_lapic_nmi_watchdog();
538 } else if (nmi_watchdog == NMI_IO_APIC) {
539 if (nmi_watchdog_enabled)
540 enable_ioapic_nmi_watchdog();
541 else
542 disable_ioapic_nmi_watchdog();
510 } else { 543 } else {
511 printk(KERN_WARNING 544 printk(KERN_WARNING
512 "NMI watchdog doesn't know what hardware to touch\n"); 545 "NMI watchdog doesn't know what hardware to touch\n");
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 4caff39078e0..0deea37a53cf 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,7 +31,7 @@
31#include <asm/numaq.h> 31#include <asm/numaq.h>
32#include <asm/topology.h> 32#include <asm/topology.h>
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/mpspec.h> 34#include <asm/genapic.h>
35#include <asm/e820.h> 35#include <asm/e820.h>
36#include <asm/setup.h> 36#include <asm/setup.h>
37 37
@@ -235,6 +235,13 @@ static int __init numaq_setup_ioapic_ids(void)
235 return 1; 235 return 1;
236} 236}
237 237
238static int __init numaq_update_genapic(void)
239{
240 genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
241
242 return 0;
243}
244
238static struct x86_quirks numaq_x86_quirks __initdata = { 245static struct x86_quirks numaq_x86_quirks __initdata = {
239 .arch_pre_time_init = numaq_pre_time_init, 246 .arch_pre_time_init = numaq_pre_time_init,
240 .arch_time_init = NULL, 247 .arch_time_init = NULL,
@@ -250,6 +257,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
250 .mpc_oem_pci_bus = mpc_oem_pci_bus, 257 .mpc_oem_pci_bus = mpc_oem_pci_bus,
251 .smp_read_mpc_oem = smp_read_mpc_oem, 258 .smp_read_mpc_oem = smp_read_mpc_oem,
252 .setup_ioapic_ids = numaq_setup_ioapic_ids, 259 .setup_ioapic_ids = numaq_setup_ioapic_ids,
260 .update_genapic = numaq_update_genapic,
253}; 261};
254 262
255void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, 263void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 0e9f1982b1dd..95777b0faa73 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -7,7 +7,8 @@
7 7
8#include <asm/paravirt.h> 8#include <asm/paravirt.h>
9 9
10static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) 10static inline void
11default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
11{ 12{
12 __raw_spin_lock(lock); 13 __raw_spin_lock(lock);
13} 14}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 192624820217..19a1044a0cd9 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -6,6 +6,7 @@
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/dma.h> 7#include <asm/dma.h>
8#include <asm/iommu.h> 8#include <asm/iommu.h>
9#include <asm/gart.h>
9#include <asm/calgary.h> 10#include <asm/calgary.h>
10#include <asm/amd_iommu.h> 11#include <asm/amd_iommu.h>
11 12
@@ -30,11 +31,6 @@ int no_iommu __read_mostly;
30/* Set this to 1 if there is a HW IOMMU in the system */ 31/* Set this to 1 if there is a HW IOMMU in the system */
31int iommu_detected __read_mostly = 0; 32int iommu_detected __read_mostly = 0;
32 33
33/* This tells the BIO block layer to assume merging. Default to off
34 because we cannot guarantee merging later. */
35int iommu_bio_merge __read_mostly = 0;
36EXPORT_SYMBOL(iommu_bio_merge);
37
38dma_addr_t bad_dma_address __read_mostly = 0; 34dma_addr_t bad_dma_address __read_mostly = 0;
39EXPORT_SYMBOL(bad_dma_address); 35EXPORT_SYMBOL(bad_dma_address);
40 36
@@ -105,11 +101,15 @@ static void __init dma32_free_bootmem(void)
105 dma32_bootmem_ptr = NULL; 101 dma32_bootmem_ptr = NULL;
106 dma32_bootmem_size = 0; 102 dma32_bootmem_size = 0;
107} 103}
104#endif
108 105
109void __init pci_iommu_alloc(void) 106void __init pci_iommu_alloc(void)
110{ 107{
108#ifdef CONFIG_X86_64
111 /* free the range so iommu could get some range less than 4G */ 109 /* free the range so iommu could get some range less than 4G */
112 dma32_free_bootmem(); 110 dma32_free_bootmem();
111#endif
112
113 /* 113 /*
114 * The order of these functions is important for 114 * The order of these functions is important for
115 * fall-back/fail-over reasons 115 * fall-back/fail-over reasons
@@ -125,15 +125,6 @@ void __init pci_iommu_alloc(void)
125 pci_swiotlb_init(); 125 pci_swiotlb_init();
126} 126}
127 127
128unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
129{
130 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
131
132 return size >> PAGE_SHIFT;
133}
134EXPORT_SYMBOL(iommu_nr_pages);
135#endif
136
137void *dma_generic_alloc_coherent(struct device *dev, size_t size, 128void *dma_generic_alloc_coherent(struct device *dev, size_t size,
138 dma_addr_t *dma_addr, gfp_t flag) 129 dma_addr_t *dma_addr, gfp_t flag)
139{ 130{
@@ -188,7 +179,6 @@ static __init int iommu_setup(char *p)
188 } 179 }
189 180
190 if (!strncmp(p, "biomerge", 8)) { 181 if (!strncmp(p, "biomerge", 8)) {
191 iommu_bio_merge = 4096;
192 iommu_merge = 1; 182 iommu_merge = 1;
193 force_iommu = 1; 183 force_iommu = 1;
194 } 184 }
@@ -300,8 +290,8 @@ fs_initcall(pci_iommu_init);
300static __devinit void via_no_dac(struct pci_dev *dev) 290static __devinit void via_no_dac(struct pci_dev *dev)
301{ 291{
302 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { 292 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
303 printk(KERN_INFO "PCI: VIA PCI bridge detected." 293 printk(KERN_INFO
304 "Disabling DAC.\n"); 294 "PCI: VIA PCI bridge detected. Disabling DAC.\n");
305 forbid_dac = 1; 295 forbid_dac = 1;
306 } 296 }
307} 297}
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index ba7ad83e20a8..00c2bcd41463 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -52,7 +52,7 @@ static u32 *iommu_gatt_base; /* Remapping table */
52 * to trigger bugs with some popular PCI cards, in particular 3ware (but 52 * to trigger bugs with some popular PCI cards, in particular 3ware (but
53 * has been also also seen with Qlogic at least). 53 * has been also also seen with Qlogic at least).
54 */ 54 */
55int iommu_fullflush = 1; 55static int iommu_fullflush = 1;
56 56
57/* Allocation bitmap for the remapping area: */ 57/* Allocation bitmap for the remapping area: */
58static DEFINE_SPINLOCK(iommu_bitmap_lock); 58static DEFINE_SPINLOCK(iommu_bitmap_lock);
@@ -745,10 +745,8 @@ void __init gart_iommu_init(void)
745 unsigned long scratch; 745 unsigned long scratch;
746 long i; 746 long i;
747 747
748 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { 748 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
749 printk(KERN_INFO "PCI-GART: No AMD GART found.\n");
750 return; 749 return;
751 }
752 750
753#ifndef CONFIG_AGP_AMD64 751#ifndef CONFIG_AGP_AMD64
754 no_agp = 1; 752 no_agp = 1;
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 3c539d111abb..242c3440687f 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -3,6 +3,8 @@
3#include <linux/pci.h> 3#include <linux/pci.h>
4#include <linux/cache.h> 4#include <linux/cache.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/swiotlb.h>
7#include <linux/bootmem.h>
6#include <linux/dma-mapping.h> 8#include <linux/dma-mapping.h>
7 9
8#include <asm/iommu.h> 10#include <asm/iommu.h>
@@ -11,6 +13,31 @@
11 13
12int swiotlb __read_mostly; 14int swiotlb __read_mostly;
13 15
16void *swiotlb_alloc_boot(size_t size, unsigned long nslabs)
17{
18 return alloc_bootmem_low_pages(size);
19}
20
21void *swiotlb_alloc(unsigned order, unsigned long nslabs)
22{
23 return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
24}
25
26dma_addr_t swiotlb_phys_to_bus(phys_addr_t paddr)
27{
28 return paddr;
29}
30
31phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
32{
33 return baddr;
34}
35
36int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
37{
38 return 0;
39}
40
14static dma_addr_t 41static dma_addr_t
15swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, 42swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
16 int direction) 43 int direction)
@@ -50,8 +77,10 @@ struct dma_mapping_ops swiotlb_dma_ops = {
50void __init pci_swiotlb_init(void) 77void __init pci_swiotlb_init(void)
51{ 78{
52 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 79 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
80#ifdef CONFIG_X86_64
53 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) 81 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
54 swiotlb = 1; 82 swiotlb = 1;
83#endif
55 if (swiotlb_force) 84 if (swiotlb_force)
56 swiotlb = 1; 85 swiotlb = 1;
57 if (swiotlb) { 86 if (swiotlb) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c622772744d8..e68bb9e30864 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,13 +1,16 @@
1#include <linux/errno.h> 1#include <linux/errno.h>
2#include <linux/kernel.h> 2#include <linux/kernel.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <asm/idle.h>
4#include <linux/smp.h> 5#include <linux/smp.h>
5#include <linux/slab.h> 6#include <linux/slab.h>
6#include <linux/sched.h> 7#include <linux/sched.h>
7#include <linux/module.h> 8#include <linux/module.h>
8#include <linux/pm.h> 9#include <linux/pm.h>
9#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/ftrace.h>
10#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/apic.h>
11 14
12unsigned long idle_halt; 15unsigned long idle_halt;
13EXPORT_SYMBOL(idle_halt); 16EXPORT_SYMBOL(idle_halt);
@@ -100,6 +103,9 @@ static inline int hlt_use_halt(void)
100void default_idle(void) 103void default_idle(void)
101{ 104{
102 if (hlt_use_halt()) { 105 if (hlt_use_halt()) {
106 struct power_trace it;
107
108 trace_power_start(&it, POWER_CSTATE, 1);
103 current_thread_info()->status &= ~TS_POLLING; 109 current_thread_info()->status &= ~TS_POLLING;
104 /* 110 /*
105 * TS_POLLING-cleared state must be visible before we 111 * TS_POLLING-cleared state must be visible before we
@@ -112,6 +118,7 @@ void default_idle(void)
112 else 118 else
113 local_irq_enable(); 119 local_irq_enable();
114 current_thread_info()->status |= TS_POLLING; 120 current_thread_info()->status |= TS_POLLING;
121 trace_power_end(&it);
115 } else { 122 } else {
116 local_irq_enable(); 123 local_irq_enable();
117 /* loop is done by the caller */ 124 /* loop is done by the caller */
@@ -122,6 +129,21 @@ void default_idle(void)
122EXPORT_SYMBOL(default_idle); 129EXPORT_SYMBOL(default_idle);
123#endif 130#endif
124 131
132void stop_this_cpu(void *dummy)
133{
134 local_irq_disable();
135 /*
136 * Remove this CPU:
137 */
138 cpu_clear(smp_processor_id(), cpu_online_map);
139 disable_local_APIC();
140
141 for (;;) {
142 if (hlt_works(smp_processor_id()))
143 halt();
144 }
145}
146
125static void do_nothing(void *unused) 147static void do_nothing(void *unused)
126{ 148{
127} 149}
@@ -154,24 +176,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
154 */ 176 */
155void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 177void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
156{ 178{
179 struct power_trace it;
180
181 trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
157 if (!need_resched()) { 182 if (!need_resched()) {
158 __monitor((void *)&current_thread_info()->flags, 0, 0); 183 __monitor((void *)&current_thread_info()->flags, 0, 0);
159 smp_mb(); 184 smp_mb();
160 if (!need_resched()) 185 if (!need_resched())
161 __mwait(ax, cx); 186 __mwait(ax, cx);
162 } 187 }
188 trace_power_end(&it);
163} 189}
164 190
165/* Default MONITOR/MWAIT with no hints, used for default C1 state */ 191/* Default MONITOR/MWAIT with no hints, used for default C1 state */
166static void mwait_idle(void) 192static void mwait_idle(void)
167{ 193{
194 struct power_trace it;
168 if (!need_resched()) { 195 if (!need_resched()) {
196 trace_power_start(&it, POWER_CSTATE, 1);
169 __monitor((void *)&current_thread_info()->flags, 0, 0); 197 __monitor((void *)&current_thread_info()->flags, 0, 0);
170 smp_mb(); 198 smp_mb();
171 if (!need_resched()) 199 if (!need_resched())
172 __sti_mwait(0, 0); 200 __sti_mwait(0, 0);
173 else 201 else
174 local_irq_enable(); 202 local_irq_enable();
203 trace_power_end(&it);
175 } else 204 } else
176 local_irq_enable(); 205 local_irq_enable();
177} 206}
@@ -183,9 +212,13 @@ static void mwait_idle(void)
183 */ 212 */
184static void poll_idle(void) 213static void poll_idle(void)
185{ 214{
215 struct power_trace it;
216
217 trace_power_start(&it, POWER_CSTATE, 0);
186 local_irq_enable(); 218 local_irq_enable();
187 while (!need_resched()) 219 while (!need_resched())
188 cpu_relax(); 220 cpu_relax();
221 trace_power_end(&it);
189} 222}
190 223
191/* 224/*
@@ -270,7 +303,7 @@ static void c1e_idle(void)
270 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 303 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
271 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 304 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
272 c1e_detected = 1; 305 c1e_detected = 1;
273 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 306 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
274 mark_tsc_unstable("TSC halt in AMD C1E"); 307 mark_tsc_unstable("TSC halt in AMD C1E");
275 printk(KERN_INFO "System has AMD C1E enabled\n"); 308 printk(KERN_INFO "System has AMD C1E enabled\n");
276 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); 309 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d45..3ba155d24884 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h> 40#include <linux/dmi.h>
41#include <linux/ftrace.h>
41 42
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
@@ -59,6 +60,7 @@
59#include <asm/idle.h> 60#include <asm/idle.h>
60#include <asm/syscalls.h> 61#include <asm/syscalls.h>
61#include <asm/smp.h> 62#include <asm/smp.h>
63#include <asm/ds.h>
62 64
63asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 65asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
64 66
@@ -250,14 +252,8 @@ void exit_thread(void)
250 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 252 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
251 put_cpu(); 253 put_cpu();
252 } 254 }
253#ifdef CONFIG_X86_DS 255
254 /* Free any DS contexts that have not been properly released. */ 256 ds_exit_thread(current);
255 if (unlikely(current->thread.ds_ctx)) {
256 /* we clear debugctl to make sure DS is not used. */
257 update_debugctlmsr(0);
258 ds_free(current->thread.ds_ctx);
259 }
260#endif /* CONFIG_X86_DS */
261} 257}
262 258
263void flush_thread(void) 259void flush_thread(void)
@@ -339,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
339 kfree(p->thread.io_bitmap_ptr); 335 kfree(p->thread.io_bitmap_ptr);
340 p->thread.io_bitmap_max = 0; 336 p->thread.io_bitmap_max = 0;
341 } 337 }
338
339 ds_copy_thread(p, current);
340
341 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
342 p->thread.debugctlmsr = 0;
343
342 return err; 344 return err;
343} 345}
344 346
@@ -419,48 +421,19 @@ int set_tsc_mode(unsigned int val)
419 return 0; 421 return 0;
420} 422}
421 423
422#ifdef CONFIG_X86_DS
423static int update_debugctl(struct thread_struct *prev,
424 struct thread_struct *next, unsigned long debugctl)
425{
426 unsigned long ds_prev = 0;
427 unsigned long ds_next = 0;
428
429 if (prev->ds_ctx)
430 ds_prev = (unsigned long)prev->ds_ctx->ds;
431 if (next->ds_ctx)
432 ds_next = (unsigned long)next->ds_ctx->ds;
433
434 if (ds_next != ds_prev) {
435 /* we clear debugctl to make sure DS
436 * is not in use when we change it */
437 debugctl = 0;
438 update_debugctlmsr(0);
439 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
440 }
441 return debugctl;
442}
443#else
444static int update_debugctl(struct thread_struct *prev,
445 struct thread_struct *next, unsigned long debugctl)
446{
447 return debugctl;
448}
449#endif /* CONFIG_X86_DS */
450
451static noinline void 424static noinline void
452__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 425__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
453 struct tss_struct *tss) 426 struct tss_struct *tss)
454{ 427{
455 struct thread_struct *prev, *next; 428 struct thread_struct *prev, *next;
456 unsigned long debugctl;
457 429
458 prev = &prev_p->thread; 430 prev = &prev_p->thread;
459 next = &next_p->thread; 431 next = &next_p->thread;
460 432
461 debugctl = update_debugctl(prev, next, prev->debugctlmsr); 433 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
462 434 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
463 if (next->debugctlmsr != debugctl) 435 ds_switch_to(prev_p, next_p);
436 else if (next->debugctlmsr != prev->debugctlmsr)
464 update_debugctlmsr(next->debugctlmsr); 437 update_debugctlmsr(next->debugctlmsr);
465 438
466 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 439 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -482,15 +455,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
482 hard_enable_TSC(); 455 hard_enable_TSC();
483 } 456 }
484 457
485#ifdef CONFIG_X86_PTRACE_BTS
486 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
487 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
488
489 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
490 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
491#endif /* CONFIG_X86_PTRACE_BTS */
492
493
494 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 458 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
495 /* 459 /*
496 * Disable the bitmap via an invalid offset. We still cache 460 * Disable the bitmap via an invalid offset. We still cache
@@ -548,7 +512,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
548 * the task-switch, and shows up in ret_from_fork in entry.S, 512 * the task-switch, and shows up in ret_from_fork in entry.S,
549 * for example. 513 * for example.
550 */ 514 */
551struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 515__notrace_funcgraph struct task_struct *
516__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
552{ 517{
553 struct thread_struct *prev = &prev_p->thread, 518 struct thread_struct *prev = &prev_p->thread,
554 *next = &next_p->thread; 519 *next = &next_p->thread;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c958120fb1b6..416fb9282f4f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -39,6 +39,7 @@
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/uaccess.h> 40#include <linux/uaccess.h>
41#include <linux/io.h> 41#include <linux/io.h>
42#include <linux/ftrace.h>
42 43
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
44#include <asm/system.h> 45#include <asm/system.h>
@@ -52,6 +53,7 @@
52#include <asm/ia32.h> 53#include <asm/ia32.h>
53#include <asm/idle.h> 54#include <asm/idle.h>
54#include <asm/syscalls.h> 55#include <asm/syscalls.h>
56#include <asm/ds.h>
55 57
56asmlinkage extern void ret_from_fork(void); 58asmlinkage extern void ret_from_fork(void);
57 59
@@ -235,14 +237,8 @@ void exit_thread(void)
235 t->io_bitmap_max = 0; 237 t->io_bitmap_max = 0;
236 put_cpu(); 238 put_cpu();
237 } 239 }
238#ifdef CONFIG_X86_DS 240
239 /* Free any DS contexts that have not been properly released. */ 241 ds_exit_thread(current);
240 if (unlikely(t->ds_ctx)) {
241 /* we clear debugctl to make sure DS is not used. */
242 update_debugctlmsr(0);
243 ds_free(t->ds_ctx);
244 }
245#endif /* CONFIG_X86_DS */
246} 242}
247 243
248void flush_thread(void) 244void flush_thread(void)
@@ -372,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
372 if (err) 368 if (err)
373 goto out; 369 goto out;
374 } 370 }
371
372 ds_copy_thread(p, me);
373
374 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
375 p->thread.debugctlmsr = 0;
376
375 err = 0; 377 err = 0;
376out: 378out:
377 if (err && p->thread.io_bitmap_ptr) { 379 if (err && p->thread.io_bitmap_ptr) {
@@ -470,35 +472,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
470 struct tss_struct *tss) 472 struct tss_struct *tss)
471{ 473{
472 struct thread_struct *prev, *next; 474 struct thread_struct *prev, *next;
473 unsigned long debugctl;
474 475
475 prev = &prev_p->thread, 476 prev = &prev_p->thread,
476 next = &next_p->thread; 477 next = &next_p->thread;
477 478
478 debugctl = prev->debugctlmsr; 479 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
479 480 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
480#ifdef CONFIG_X86_DS 481 ds_switch_to(prev_p, next_p);
481 { 482 else if (next->debugctlmsr != prev->debugctlmsr)
482 unsigned long ds_prev = 0, ds_next = 0;
483
484 if (prev->ds_ctx)
485 ds_prev = (unsigned long)prev->ds_ctx->ds;
486 if (next->ds_ctx)
487 ds_next = (unsigned long)next->ds_ctx->ds;
488
489 if (ds_next != ds_prev) {
490 /*
491 * We clear debugctl to make sure DS
492 * is not in use when we change it:
493 */
494 debugctl = 0;
495 update_debugctlmsr(0);
496 wrmsrl(MSR_IA32_DS_AREA, ds_next);
497 }
498 }
499#endif /* CONFIG_X86_DS */
500
501 if (next->debugctlmsr != debugctl)
502 update_debugctlmsr(next->debugctlmsr); 483 update_debugctlmsr(next->debugctlmsr);
503 484
504 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 485 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -533,14 +514,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
533 */ 514 */
534 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 515 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
535 } 516 }
536
537#ifdef CONFIG_X86_PTRACE_BTS
538 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
539 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
540
541 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
542 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
543#endif /* CONFIG_X86_PTRACE_BTS */
544} 517}
545 518
546/* 519/*
@@ -551,8 +524,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
551 * - could test fs/gs bitsliced 524 * - could test fs/gs bitsliced
552 * 525 *
553 * Kprobes not supported here. Set the probe on schedule instead. 526 * Kprobes not supported here. Set the probe on schedule instead.
527 * Function graph tracer not supported too.
554 */ 528 */
555struct task_struct * 529__notrace_funcgraph struct task_struct *
556__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 530__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
557{ 531{
558 struct thread_struct *prev = &prev_p->thread; 532 struct thread_struct *prev = &prev_p->thread;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a6d8c12e10d..0a5df5f82fb9 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -581,158 +581,91 @@ static int ioperm_get(struct task_struct *target,
581} 581}
582 582
583#ifdef CONFIG_X86_PTRACE_BTS 583#ifdef CONFIG_X86_PTRACE_BTS
584/*
585 * The configuration for a particular BTS hardware implementation.
586 */
587struct bts_configuration {
588 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
589 unsigned char sizeof_bts;
590 /* the size of a field in the BTS record in bytes */
591 unsigned char sizeof_field;
592 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
593 unsigned long debugctl_mask;
594};
595static struct bts_configuration bts_cfg;
596
597#define BTS_MAX_RECORD_SIZE (8 * 3)
598
599
600/*
601 * Branch Trace Store (BTS) uses the following format. Different
602 * architectures vary in the size of those fields.
603 * - source linear address
604 * - destination linear address
605 * - flags
606 *
607 * Later architectures use 64bit pointers throughout, whereas earlier
608 * architectures use 32bit pointers in 32bit mode.
609 *
610 * We compute the base address for the first 8 fields based on:
611 * - the field size stored in the DS configuration
612 * - the relative field position
613 *
614 * In order to store additional information in the BTS buffer, we use
615 * a special source address to indicate that the record requires
616 * special interpretation.
617 *
618 * Netburst indicated via a bit in the flags field whether the branch
619 * was predicted; this is ignored.
620 */
621
622enum bts_field {
623 bts_from = 0,
624 bts_to,
625 bts_flags,
626
627 bts_escape = (unsigned long)-1,
628 bts_qual = bts_to,
629 bts_jiffies = bts_flags
630};
631
632static inline unsigned long bts_get(const char *base, enum bts_field field)
633{
634 base += (bts_cfg.sizeof_field * field);
635 return *(unsigned long *)base;
636}
637
638static inline void bts_set(char *base, enum bts_field field, unsigned long val)
639{
640 base += (bts_cfg.sizeof_field * field);;
641 (*(unsigned long *)base) = val;
642}
643
644/*
645 * Translate a BTS record from the raw format into the bts_struct format
646 *
647 * out (out): bts_struct interpretation
648 * raw: raw BTS record
649 */
650static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
651{
652 memset(out, 0, sizeof(*out));
653 if (bts_get(raw, bts_from) == bts_escape) {
654 out->qualifier = bts_get(raw, bts_qual);
655 out->variant.jiffies = bts_get(raw, bts_jiffies);
656 } else {
657 out->qualifier = BTS_BRANCH;
658 out->variant.lbr.from_ip = bts_get(raw, bts_from);
659 out->variant.lbr.to_ip = bts_get(raw, bts_to);
660 }
661}
662
663static int ptrace_bts_read_record(struct task_struct *child, size_t index, 584static int ptrace_bts_read_record(struct task_struct *child, size_t index,
664 struct bts_struct __user *out) 585 struct bts_struct __user *out)
665{ 586{
666 struct bts_struct ret; 587 const struct bts_trace *trace;
667 const void *bts_record; 588 struct bts_struct bts;
668 size_t bts_index, bts_end; 589 const unsigned char *at;
669 int error; 590 int error;
670 591
671 error = ds_get_bts_end(child, &bts_end); 592 trace = ds_read_bts(child->bts);
672 if (error < 0) 593 if (!trace)
673 return error; 594 return -EPERM;
674
675 if (bts_end <= index)
676 return -EINVAL;
677 595
678 error = ds_get_bts_index(child, &bts_index); 596 at = trace->ds.top - ((index + 1) * trace->ds.size);
679 if (error < 0) 597 if ((void *)at < trace->ds.begin)
680 return error; 598 at += (trace->ds.n * trace->ds.size);
681 599
682 /* translate the ptrace bts index into the ds bts index */ 600 if (!trace->read)
683 bts_index += bts_end - (index + 1); 601 return -EOPNOTSUPP;
684 if (bts_end <= bts_index)
685 bts_index -= bts_end;
686 602
687 error = ds_access_bts(child, bts_index, &bts_record); 603 error = trace->read(child->bts, at, &bts);
688 if (error < 0) 604 if (error < 0)
689 return error; 605 return error;
690 606
691 ptrace_bts_translate_record(&ret, bts_record); 607 if (copy_to_user(out, &bts, sizeof(bts)))
692
693 if (copy_to_user(out, &ret, sizeof(ret)))
694 return -EFAULT; 608 return -EFAULT;
695 609
696 return sizeof(ret); 610 return sizeof(bts);
697} 611}
698 612
699static int ptrace_bts_drain(struct task_struct *child, 613static int ptrace_bts_drain(struct task_struct *child,
700 long size, 614 long size,
701 struct bts_struct __user *out) 615 struct bts_struct __user *out)
702{ 616{
703 struct bts_struct ret; 617 const struct bts_trace *trace;
704 const unsigned char *raw; 618 const unsigned char *at;
705 size_t end, i; 619 int error, drained = 0;
706 int error;
707 620
708 error = ds_get_bts_index(child, &end); 621 trace = ds_read_bts(child->bts);
709 if (error < 0) 622 if (!trace)
710 return error; 623 return -EPERM;
711 624
712 if (size < (end * sizeof(struct bts_struct))) 625 if (!trace->read)
626 return -EOPNOTSUPP;
627
628 if (size < (trace->ds.top - trace->ds.begin))
713 return -EIO; 629 return -EIO;
714 630
715 error = ds_access_bts(child, 0, (const void **)&raw); 631 for (at = trace->ds.begin; (void *)at < trace->ds.top;
716 if (error < 0) 632 out++, drained++, at += trace->ds.size) {
717 return error; 633 struct bts_struct bts;
634 int error;
718 635
719 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { 636 error = trace->read(child->bts, at, &bts);
720 ptrace_bts_translate_record(&ret, raw); 637 if (error < 0)
638 return error;
721 639
722 if (copy_to_user(out, &ret, sizeof(ret))) 640 if (copy_to_user(out, &bts, sizeof(bts)))
723 return -EFAULT; 641 return -EFAULT;
724 } 642 }
725 643
726 error = ds_clear_bts(child); 644 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
645
646 error = ds_reset_bts(child->bts);
727 if (error < 0) 647 if (error < 0)
728 return error; 648 return error;
729 649
730 return end; 650 return drained;
731} 651}
732 652
733static void ptrace_bts_ovfl(struct task_struct *child) 653static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
734{ 654{
735 send_sig(child->thread.bts_ovfl_signal, child, 0); 655 child->bts_buffer = alloc_locked_buffer(size);
656 if (!child->bts_buffer)
657 return -ENOMEM;
658
659 child->bts_size = size;
660
661 return 0;
662}
663
664static void ptrace_bts_free_buffer(struct task_struct *child)
665{
666 free_locked_buffer(child->bts_buffer, child->bts_size);
667 child->bts_buffer = NULL;
668 child->bts_size = 0;
736} 669}
737 670
738static int ptrace_bts_config(struct task_struct *child, 671static int ptrace_bts_config(struct task_struct *child,
@@ -740,114 +673,86 @@ static int ptrace_bts_config(struct task_struct *child,
740 const struct ptrace_bts_config __user *ucfg) 673 const struct ptrace_bts_config __user *ucfg)
741{ 674{
742 struct ptrace_bts_config cfg; 675 struct ptrace_bts_config cfg;
743 int error = 0; 676 unsigned int flags = 0;
744
745 error = -EOPNOTSUPP;
746 if (!bts_cfg.sizeof_bts)
747 goto errout;
748 677
749 error = -EIO;
750 if (cfg_size < sizeof(cfg)) 678 if (cfg_size < sizeof(cfg))
751 goto errout; 679 return -EIO;
752 680
753 error = -EFAULT;
754 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 681 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
755 goto errout; 682 return -EFAULT;
756 683
757 error = -EINVAL; 684 if (child->bts) {
758 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && 685 ds_release_bts(child->bts);
759 !(cfg.flags & PTRACE_BTS_O_ALLOC)) 686 child->bts = NULL;
760 goto errout; 687 }
761 688
762 if (cfg.flags & PTRACE_BTS_O_ALLOC) { 689 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
763 ds_ovfl_callback_t ovfl = NULL; 690 if (!cfg.signal)
764 unsigned int sig = 0; 691 return -EINVAL;
765 692
766 /* we ignore the error in case we were not tracing child */ 693 return -EOPNOTSUPP;
767 (void)ds_release_bts(child);
768 694
769 if (cfg.flags & PTRACE_BTS_O_SIGNAL) { 695 child->thread.bts_ovfl_signal = cfg.signal;
770 if (!cfg.signal) 696 }
771 goto errout;
772 697
773 sig = cfg.signal; 698 if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
774 ovfl = ptrace_bts_ovfl; 699 (cfg.size != child->bts_size)) {
775 } 700 int error;
776 701
777 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); 702 ptrace_bts_free_buffer(child);
778 if (error < 0)
779 goto errout;
780 703
781 child->thread.bts_ovfl_signal = sig; 704 error = ptrace_bts_allocate_buffer(child, cfg.size);
705 if (error < 0)
706 return error;
782 } 707 }
783 708
784 error = -EINVAL;
785 if (!child->thread.ds_ctx && cfg.flags)
786 goto errout;
787
788 if (cfg.flags & PTRACE_BTS_O_TRACE) 709 if (cfg.flags & PTRACE_BTS_O_TRACE)
789 child->thread.debugctlmsr |= bts_cfg.debugctl_mask; 710 flags |= BTS_USER;
790 else
791 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
792 711
793 if (cfg.flags & PTRACE_BTS_O_SCHED) 712 if (cfg.flags & PTRACE_BTS_O_SCHED)
794 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 713 flags |= BTS_TIMESTAMPS;
795 else
796 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
797 714
798 error = sizeof(cfg); 715 child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
716 /* ovfl = */ NULL, /* th = */ (size_t)-1,
717 flags);
718 if (IS_ERR(child->bts)) {
719 int error = PTR_ERR(child->bts);
799 720
800out: 721 ptrace_bts_free_buffer(child);
801 if (child->thread.debugctlmsr) 722 child->bts = NULL;
802 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
803 else
804 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
805 723
806 return error; 724 return error;
725 }
807 726
808errout: 727 return sizeof(cfg);
809 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
810 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
811 goto out;
812} 728}
813 729
814static int ptrace_bts_status(struct task_struct *child, 730static int ptrace_bts_status(struct task_struct *child,
815 long cfg_size, 731 long cfg_size,
816 struct ptrace_bts_config __user *ucfg) 732 struct ptrace_bts_config __user *ucfg)
817{ 733{
734 const struct bts_trace *trace;
818 struct ptrace_bts_config cfg; 735 struct ptrace_bts_config cfg;
819 size_t end;
820 const void *base, *max;
821 int error;
822 736
823 if (cfg_size < sizeof(cfg)) 737 if (cfg_size < sizeof(cfg))
824 return -EIO; 738 return -EIO;
825 739
826 error = ds_get_bts_end(child, &end); 740 trace = ds_read_bts(child->bts);
827 if (error < 0) 741 if (!trace)
828 return error; 742 return -EPERM;
829
830 error = ds_access_bts(child, /* index = */ 0, &base);
831 if (error < 0)
832 return error;
833
834 error = ds_access_bts(child, /* index = */ end, &max);
835 if (error < 0)
836 return error;
837 743
838 memset(&cfg, 0, sizeof(cfg)); 744 memset(&cfg, 0, sizeof(cfg));
839 cfg.size = (max - base); 745 cfg.size = trace->ds.end - trace->ds.begin;
840 cfg.signal = child->thread.bts_ovfl_signal; 746 cfg.signal = child->thread.bts_ovfl_signal;
841 cfg.bts_size = sizeof(struct bts_struct); 747 cfg.bts_size = sizeof(struct bts_struct);
842 748
843 if (cfg.signal) 749 if (cfg.signal)
844 cfg.flags |= PTRACE_BTS_O_SIGNAL; 750 cfg.flags |= PTRACE_BTS_O_SIGNAL;
845 751
846 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 752 if (trace->ds.flags & BTS_USER)
847 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
848 cfg.flags |= PTRACE_BTS_O_TRACE; 753 cfg.flags |= PTRACE_BTS_O_TRACE;
849 754
850 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 755 if (trace->ds.flags & BTS_TIMESTAMPS)
851 cfg.flags |= PTRACE_BTS_O_SCHED; 756 cfg.flags |= PTRACE_BTS_O_SCHED;
852 757
853 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 758 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
@@ -856,110 +761,77 @@ static int ptrace_bts_status(struct task_struct *child,
856 return sizeof(cfg); 761 return sizeof(cfg);
857} 762}
858 763
859static int ptrace_bts_write_record(struct task_struct *child, 764static int ptrace_bts_clear(struct task_struct *child)
860 const struct bts_struct *in)
861{ 765{
862 unsigned char bts_record[BTS_MAX_RECORD_SIZE]; 766 const struct bts_trace *trace;
863 767
864 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); 768 trace = ds_read_bts(child->bts);
769 if (!trace)
770 return -EPERM;
865 771
866 memset(bts_record, 0, bts_cfg.sizeof_bts); 772 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
867 switch (in->qualifier) {
868 case BTS_INVALID:
869 break;
870 773
871 case BTS_BRANCH: 774 return ds_reset_bts(child->bts);
872 bts_set(bts_record, bts_from, in->variant.lbr.from_ip); 775}
873 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
874 break;
875 776
876 case BTS_TASK_ARRIVES: 777static int ptrace_bts_size(struct task_struct *child)
877 case BTS_TASK_DEPARTS: 778{
878 bts_set(bts_record, bts_from, bts_escape); 779 const struct bts_trace *trace;
879 bts_set(bts_record, bts_qual, in->qualifier);
880 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
881 break;
882 780
883 default: 781 trace = ds_read_bts(child->bts);
884 return -EINVAL; 782 if (!trace)
885 } 783 return -EPERM;
886 784
887 /* The writing task will be the switched-to task on a context 785 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
888 * switch. It needs to write into the switched-from task's BTS
889 * buffer. */
890 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
891} 786}
892 787
893void ptrace_bts_take_timestamp(struct task_struct *tsk, 788static void ptrace_bts_fork(struct task_struct *tsk)
894 enum bts_qualifier qualifier)
895{ 789{
896 struct bts_struct rec = { 790 tsk->bts = NULL;
897 .qualifier = qualifier, 791 tsk->bts_buffer = NULL;
898 .variant.jiffies = jiffies_64 792 tsk->bts_size = 0;
899 }; 793 tsk->thread.bts_ovfl_signal = 0;
900
901 ptrace_bts_write_record(tsk, &rec);
902} 794}
903 795
904static const struct bts_configuration bts_cfg_netburst = { 796static void ptrace_bts_untrace(struct task_struct *child)
905 .sizeof_bts = sizeof(long) * 3, 797{
906 .sizeof_field = sizeof(long), 798 if (unlikely(child->bts)) {
907 .debugctl_mask = (1<<2)|(1<<3)|(1<<5) 799 ds_release_bts(child->bts);
908}; 800 child->bts = NULL;
801
802 /* We cannot update total_vm and locked_vm since
803 child's mm is already gone. But we can reclaim the
804 memory. */
805 kfree(child->bts_buffer);
806 child->bts_buffer = NULL;
807 child->bts_size = 0;
808 }
809}
909 810
910static const struct bts_configuration bts_cfg_pentium_m = { 811static void ptrace_bts_detach(struct task_struct *child)
911 .sizeof_bts = sizeof(long) * 3, 812{
912 .sizeof_field = sizeof(long), 813 if (unlikely(child->bts)) {
913 .debugctl_mask = (1<<6)|(1<<7) 814 ds_release_bts(child->bts);
914}; 815 child->bts = NULL;
915 816
916static const struct bts_configuration bts_cfg_core2 = { 817 ptrace_bts_free_buffer(child);
917 .sizeof_bts = 8 * 3, 818 }
918 .sizeof_field = 8, 819}
919 .debugctl_mask = (1<<6)|(1<<7)|(1<<9) 820#else
920}; 821static inline void ptrace_bts_fork(struct task_struct *tsk) {}
822static inline void ptrace_bts_detach(struct task_struct *child) {}
823static inline void ptrace_bts_untrace(struct task_struct *child) {}
824#endif /* CONFIG_X86_PTRACE_BTS */
921 825
922static inline void bts_configure(const struct bts_configuration *cfg) 826void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
923{ 827{
924 bts_cfg = *cfg; 828 ptrace_bts_fork(child);
925} 829}
926 830
927void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) 831void x86_ptrace_untrace(struct task_struct *child)
928{ 832{
929 switch (c->x86) { 833 ptrace_bts_untrace(child);
930 case 0x6:
931 switch (c->x86_model) {
932 case 0xD:
933 case 0xE: /* Pentium M */
934 bts_configure(&bts_cfg_pentium_m);
935 break;
936 case 0xF: /* Core2 */
937 case 0x1C: /* Atom */
938 bts_configure(&bts_cfg_core2);
939 break;
940 default:
941 /* sorry, don't know about them */
942 break;
943 }
944 break;
945 case 0xF:
946 switch (c->x86_model) {
947 case 0x0:
948 case 0x1:
949 case 0x2: /* Netburst */
950 bts_configure(&bts_cfg_netburst);
951 break;
952 default:
953 /* sorry, don't know about them */
954 break;
955 }
956 break;
957 default:
958 /* sorry, don't know about them */
959 break;
960 }
961} 834}
962#endif /* CONFIG_X86_PTRACE_BTS */
963 835
964/* 836/*
965 * Called by kernel/ptrace.c when detaching.. 837 * Called by kernel/ptrace.c when detaching..
@@ -972,15 +844,7 @@ void ptrace_disable(struct task_struct *child)
972#ifdef TIF_SYSCALL_EMU 844#ifdef TIF_SYSCALL_EMU
973 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 845 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
974#endif 846#endif
975#ifdef CONFIG_X86_PTRACE_BTS 847 ptrace_bts_detach(child);
976 (void)ds_release_bts(child);
977
978 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
979 if (!child->thread.debugctlmsr)
980 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
981
982 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
983#endif /* CONFIG_X86_PTRACE_BTS */
984} 848}
985 849
986#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 850#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1112,7 +976,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1112 break; 976 break;
1113 977
1114 case PTRACE_BTS_SIZE: 978 case PTRACE_BTS_SIZE:
1115 ret = ds_get_bts_index(child, /* pos = */ NULL); 979 ret = ptrace_bts_size(child);
1116 break; 980 break;
1117 981
1118 case PTRACE_BTS_GET: 982 case PTRACE_BTS_GET:
@@ -1121,7 +985,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1121 break; 985 break;
1122 986
1123 case PTRACE_BTS_CLEAR: 987 case PTRACE_BTS_CLEAR:
1124 ret = ds_clear_bts(child); 988 ret = ptrace_bts_clear(child);
1125 break; 989 break;
1126 990
1127 case PTRACE_BTS_DRAIN: 991 case PTRACE_BTS_DRAIN:
@@ -1384,6 +1248,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1384 1248
1385 case PTRACE_GET_THREAD_AREA: 1249 case PTRACE_GET_THREAD_AREA:
1386 case PTRACE_SET_THREAD_AREA: 1250 case PTRACE_SET_THREAD_AREA:
1251#ifdef CONFIG_X86_PTRACE_BTS
1252 case PTRACE_BTS_CONFIG:
1253 case PTRACE_BTS_STATUS:
1254 case PTRACE_BTS_SIZE:
1255 case PTRACE_BTS_GET:
1256 case PTRACE_BTS_CLEAR:
1257 case PTRACE_BTS_DRAIN:
1258#endif /* CONFIG_X86_PTRACE_BTS */
1387 return arch_ptrace(child, request, addr, data); 1259 return arch_ptrace(child, request, addr, data);
1388 1260
1389 default: 1261 default:
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 67465ed89310..309949e9e1c1 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -168,6 +168,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
168 ich_force_enable_hpet); 168 ich_force_enable_hpet);
169DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, 169DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
170 ich_force_enable_hpet); 170 ich_force_enable_hpet);
171DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
172 ich_force_enable_hpet);
171DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, 173DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
172 ich_force_enable_hpet); 174 ich_force_enable_hpet);
173 175
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index cc5a2545dd41..2b46eb41643b 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,8 @@
12#include <asm/proto.h> 12#include <asm/proto.h>
13#include <asm/reboot_fixups.h> 13#include <asm/reboot_fixups.h>
14#include <asm/reboot.h> 14#include <asm/reboot.h>
15#include <asm/pci_x86.h>
16#include <asm/virtext.h>
15 17
16#ifdef CONFIG_X86_32 18#ifdef CONFIG_X86_32
17# include <linux/dmi.h> 19# include <linux/dmi.h>
@@ -21,6 +23,8 @@
21# include <asm/iommu.h> 23# include <asm/iommu.h>
22#endif 24#endif
23 25
26#include <mach_ipi.h>
27
24/* 28/*
25 * Power off function, if any 29 * Power off function, if any
26 */ 30 */
@@ -36,7 +40,16 @@ int reboot_force;
36static int reboot_cpu = -1; 40static int reboot_cpu = -1;
37#endif 41#endif
38 42
39/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] 43/* This is set if we need to go through the 'emergency' path.
44 * When machine_emergency_restart() is called, we may be on
45 * an inconsistent state and won't be able to do a clean cleanup
46 */
47static int reboot_emergency;
48
49/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
50bool port_cf9_safe = false;
51
52/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
40 warm Don't set the cold reboot flag 53 warm Don't set the cold reboot flag
41 cold Set the cold reboot flag 54 cold Set the cold reboot flag
42 bios Reboot by jumping through the BIOS (only for X86_32) 55 bios Reboot by jumping through the BIOS (only for X86_32)
@@ -45,6 +58,7 @@ static int reboot_cpu = -1;
45 kbd Use the keyboard controller. cold reset (default) 58 kbd Use the keyboard controller. cold reset (default)
46 acpi Use the RESET_REG in the FADT 59 acpi Use the RESET_REG in the FADT
47 efi Use efi reset_system runtime service 60 efi Use efi reset_system runtime service
61 pci Use the so-called "PCI reset register", CF9
48 force Avoid anything that could hang. 62 force Avoid anything that could hang.
49 */ 63 */
50static int __init reboot_setup(char *str) 64static int __init reboot_setup(char *str)
@@ -79,6 +93,7 @@ static int __init reboot_setup(char *str)
79 case 'k': 93 case 'k':
80 case 't': 94 case 't':
81 case 'e': 95 case 'e':
96 case 'p':
82 reboot_type = *str; 97 reboot_type = *str;
83 break; 98 break;
84 99
@@ -360,6 +375,48 @@ static inline void kb_wait(void)
360 } 375 }
361} 376}
362 377
378static void vmxoff_nmi(int cpu, struct die_args *args)
379{
380 cpu_emergency_vmxoff();
381}
382
383/* Use NMIs as IPIs to tell all CPUs to disable virtualization
384 */
385static void emergency_vmx_disable_all(void)
386{
387 /* Just make sure we won't change CPUs while doing this */
388 local_irq_disable();
389
390 /* We need to disable VMX on all CPUs before rebooting, otherwise
391 * we risk hanging up the machine, because the CPU ignore INIT
392 * signals when VMX is enabled.
393 *
394 * We can't take any locks and we may be on an inconsistent
395 * state, so we use NMIs as IPIs to tell the other CPUs to disable
396 * VMX and halt.
397 *
398 * For safety, we will avoid running the nmi_shootdown_cpus()
399 * stuff unnecessarily, but we don't have a way to check
400 * if other CPUs have VMX enabled. So we will call it only if the
401 * CPU we are running on has VMX enabled.
402 *
403 * We will miss cases where VMX is not enabled on all CPUs. This
404 * shouldn't do much harm because KVM always enable VMX on all
405 * CPUs anyway. But we can miss it on the small window where KVM
406 * is still enabling VMX.
407 */
408 if (cpu_has_vmx() && cpu_vmx_enabled()) {
409 /* Disable VMX on this CPU.
410 */
411 cpu_vmxoff();
412
413 /* Halt and disable VMX on the other CPUs */
414 nmi_shootdown_cpus(vmxoff_nmi);
415
416 }
417}
418
419
363void __attribute__((weak)) mach_reboot_fixups(void) 420void __attribute__((weak)) mach_reboot_fixups(void)
364{ 421{
365} 422}
@@ -368,6 +425,9 @@ static void native_machine_emergency_restart(void)
368{ 425{
369 int i; 426 int i;
370 427
428 if (reboot_emergency)
429 emergency_vmx_disable_all();
430
371 /* Tell the BIOS if we want cold or warm reboot */ 431 /* Tell the BIOS if we want cold or warm reboot */
372 *((unsigned short *)__va(0x472)) = reboot_mode; 432 *((unsigned short *)__va(0x472)) = reboot_mode;
373 433
@@ -404,12 +464,27 @@ static void native_machine_emergency_restart(void)
404 reboot_type = BOOT_KBD; 464 reboot_type = BOOT_KBD;
405 break; 465 break;
406 466
407
408 case BOOT_EFI: 467 case BOOT_EFI:
409 if (efi_enabled) 468 if (efi_enabled)
410 efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, 469 efi.reset_system(reboot_mode ?
470 EFI_RESET_WARM :
471 EFI_RESET_COLD,
411 EFI_SUCCESS, 0, NULL); 472 EFI_SUCCESS, 0, NULL);
473 reboot_type = BOOT_KBD;
474 break;
475
476 case BOOT_CF9:
477 port_cf9_safe = true;
478 /* fall through */
412 479
480 case BOOT_CF9_COND:
481 if (port_cf9_safe) {
482 u8 cf9 = inb(0xcf9) & ~6;
483 outb(cf9|2, 0xcf9); /* Request hard reset */
484 udelay(50);
485 outb(cf9|6, 0xcf9); /* Actually do the reset */
486 udelay(50);
487 }
413 reboot_type = BOOT_KBD; 488 reboot_type = BOOT_KBD;
414 break; 489 break;
415 } 490 }
@@ -426,7 +501,7 @@ void native_machine_shutdown(void)
426 501
427#ifdef CONFIG_X86_32 502#ifdef CONFIG_X86_32
428 /* See if there has been given a command line override */ 503 /* See if there has been given a command line override */
429 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && 504 if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
430 cpu_online(reboot_cpu)) 505 cpu_online(reboot_cpu))
431 reboot_cpu_id = reboot_cpu; 506 reboot_cpu_id = reboot_cpu;
432#endif 507#endif
@@ -436,7 +511,7 @@ void native_machine_shutdown(void)
436 reboot_cpu_id = smp_processor_id(); 511 reboot_cpu_id = smp_processor_id();
437 512
438 /* Make certain I only run on the appropriate processor */ 513 /* Make certain I only run on the appropriate processor */
439 set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id)); 514 set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
440 515
441 /* O.K Now that I'm on the appropriate processor, 516 /* O.K Now that I'm on the appropriate processor,
442 * stop all of the others. 517 * stop all of the others.
@@ -459,17 +534,28 @@ void native_machine_shutdown(void)
459#endif 534#endif
460} 535}
461 536
537static void __machine_emergency_restart(int emergency)
538{
539 reboot_emergency = emergency;
540 machine_ops.emergency_restart();
541}
542
462static void native_machine_restart(char *__unused) 543static void native_machine_restart(char *__unused)
463{ 544{
464 printk("machine restart\n"); 545 printk("machine restart\n");
465 546
466 if (!reboot_force) 547 if (!reboot_force)
467 machine_shutdown(); 548 machine_shutdown();
468 machine_emergency_restart(); 549 __machine_emergency_restart(0);
469} 550}
470 551
471static void native_machine_halt(void) 552static void native_machine_halt(void)
472{ 553{
554 /* stop other cpus and apics */
555 machine_shutdown();
556
557 /* stop this cpu */
558 stop_this_cpu(NULL);
473} 559}
474 560
475static void native_machine_power_off(void) 561static void native_machine_power_off(void)
@@ -504,7 +590,7 @@ void machine_shutdown(void)
504 590
505void machine_emergency_restart(void) 591void machine_emergency_restart(void)
506{ 592{
507 machine_ops.emergency_restart(); 593 __machine_emergency_restart(1);
508} 594}
509 595
510void machine_restart(char *cmd) 596void machine_restart(char *cmd)
@@ -523,3 +609,92 @@ void machine_crash_shutdown(struct pt_regs *regs)
523 machine_ops.crash_shutdown(regs); 609 machine_ops.crash_shutdown(regs);
524} 610}
525#endif 611#endif
612
613
614#if defined(CONFIG_SMP)
615
616/* This keeps a track of which one is crashing cpu. */
617static int crashing_cpu;
618static nmi_shootdown_cb shootdown_callback;
619
620static atomic_t waiting_for_crash_ipi;
621
622static int crash_nmi_callback(struct notifier_block *self,
623 unsigned long val, void *data)
624{
625 int cpu;
626
627 if (val != DIE_NMI_IPI)
628 return NOTIFY_OK;
629
630 cpu = raw_smp_processor_id();
631
632 /* Don't do anything if this handler is invoked on crashing cpu.
633 * Otherwise, system will completely hang. Crashing cpu can get
634 * an NMI if system was initially booted with nmi_watchdog parameter.
635 */
636 if (cpu == crashing_cpu)
637 return NOTIFY_STOP;
638 local_irq_disable();
639
640 shootdown_callback(cpu, (struct die_args *)data);
641
642 atomic_dec(&waiting_for_crash_ipi);
643 /* Assume hlt works */
644 halt();
645 for (;;)
646 cpu_relax();
647
648 return 1;
649}
650
651static void smp_send_nmi_allbutself(void)
652{
653 send_IPI_allbutself(NMI_VECTOR);
654}
655
656static struct notifier_block crash_nmi_nb = {
657 .notifier_call = crash_nmi_callback,
658};
659
660/* Halt all other CPUs, calling the specified function on each of them
661 *
662 * This function can be used to halt all other CPUs on crash
663 * or emergency reboot time. The function passed as parameter
664 * will be called inside a NMI handler on all CPUs.
665 */
666void nmi_shootdown_cpus(nmi_shootdown_cb callback)
667{
668 unsigned long msecs;
669 local_irq_disable();
670
671 /* Make a note of crashing cpu. Will be used in NMI callback.*/
672 crashing_cpu = safe_smp_processor_id();
673
674 shootdown_callback = callback;
675
676 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
677 /* Would it be better to replace the trap vector here? */
678 if (register_die_notifier(&crash_nmi_nb))
679 return; /* return what? */
680 /* Ensure the new callback function is set before sending
681 * out the NMI
682 */
683 wmb();
684
685 smp_send_nmi_allbutself();
686
687 msecs = 1000; /* Wait at most a second for the other cpus to stop */
688 while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
689 mdelay(1);
690 msecs--;
691 }
692
693 /* Leave the nmi callback set */
694}
695#else /* !CONFIG_SMP */
696void nmi_shootdown_cpus(nmi_shootdown_cb callback)
697{
698 /* No other CPUs to shoot down */
699}
700#endif
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 6f50664b2ba5..a160f3119725 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -10,15 +10,12 @@
10#include <asm/page.h> 10#include <asm/page.h>
11#include <asm/kexec.h> 11#include <asm/kexec.h>
12#include <asm/processor-flags.h> 12#include <asm/processor-flags.h>
13#include <asm/pgtable.h>
14 13
15/* 14/*
16 * Must be relocatable PIC code callable as a C function 15 * Must be relocatable PIC code callable as a C function
17 */ 16 */
18 17
19#define PTR(x) (x << 2) 18#define PTR(x) (x << 2)
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21#define PAE_PGD_ATTR (_PAGE_PRESENT)
22 19
23/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE 20/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
24 * ~ control_page + PAGE_SIZE are used as data storage and stack for 21 * ~ control_page + PAGE_SIZE are used as data storage and stack for
@@ -39,7 +36,6 @@
39#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) 36#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
40 37
41 .text 38 .text
42 .align PAGE_SIZE
43 .globl relocate_kernel 39 .globl relocate_kernel
44relocate_kernel: 40relocate_kernel:
45 /* Save the CPU context, used for jumping back */ 41 /* Save the CPU context, used for jumping back */
@@ -60,117 +56,6 @@ relocate_kernel:
60 movl %cr4, %eax 56 movl %cr4, %eax
61 movl %eax, CR4(%edi) 57 movl %eax, CR4(%edi)
62 58
63#ifdef CONFIG_X86_PAE
64 /* map the control page at its virtual address */
65
66 movl PTR(VA_PGD)(%ebp), %edi
67 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
68 andl $0xc0000000, %eax
69 shrl $27, %eax
70 addl %edi, %eax
71
72 movl PTR(PA_PMD_0)(%ebp), %edx
73 orl $PAE_PGD_ATTR, %edx
74 movl %edx, (%eax)
75
76 movl PTR(VA_PMD_0)(%ebp), %edi
77 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
78 andl $0x3fe00000, %eax
79 shrl $18, %eax
80 addl %edi, %eax
81
82 movl PTR(PA_PTE_0)(%ebp), %edx
83 orl $PAGE_ATTR, %edx
84 movl %edx, (%eax)
85
86 movl PTR(VA_PTE_0)(%ebp), %edi
87 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
88 andl $0x001ff000, %eax
89 shrl $9, %eax
90 addl %edi, %eax
91
92 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
93 orl $PAGE_ATTR, %edx
94 movl %edx, (%eax)
95
96 /* identity map the control page at its physical address */
97
98 movl PTR(VA_PGD)(%ebp), %edi
99 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
100 andl $0xc0000000, %eax
101 shrl $27, %eax
102 addl %edi, %eax
103
104 movl PTR(PA_PMD_1)(%ebp), %edx
105 orl $PAE_PGD_ATTR, %edx
106 movl %edx, (%eax)
107
108 movl PTR(VA_PMD_1)(%ebp), %edi
109 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
110 andl $0x3fe00000, %eax
111 shrl $18, %eax
112 addl %edi, %eax
113
114 movl PTR(PA_PTE_1)(%ebp), %edx
115 orl $PAGE_ATTR, %edx
116 movl %edx, (%eax)
117
118 movl PTR(VA_PTE_1)(%ebp), %edi
119 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
120 andl $0x001ff000, %eax
121 shrl $9, %eax
122 addl %edi, %eax
123
124 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
125 orl $PAGE_ATTR, %edx
126 movl %edx, (%eax)
127#else
128 /* map the control page at its virtual address */
129
130 movl PTR(VA_PGD)(%ebp), %edi
131 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
132 andl $0xffc00000, %eax
133 shrl $20, %eax
134 addl %edi, %eax
135
136 movl PTR(PA_PTE_0)(%ebp), %edx
137 orl $PAGE_ATTR, %edx
138 movl %edx, (%eax)
139
140 movl PTR(VA_PTE_0)(%ebp), %edi
141 movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
142 andl $0x003ff000, %eax
143 shrl $10, %eax
144 addl %edi, %eax
145
146 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
147 orl $PAGE_ATTR, %edx
148 movl %edx, (%eax)
149
150 /* identity map the control page at its physical address */
151
152 movl PTR(VA_PGD)(%ebp), %edi
153 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
154 andl $0xffc00000, %eax
155 shrl $20, %eax
156 addl %edi, %eax
157
158 movl PTR(PA_PTE_1)(%ebp), %edx
159 orl $PAGE_ATTR, %edx
160 movl %edx, (%eax)
161
162 movl PTR(VA_PTE_1)(%ebp), %edi
163 movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
164 andl $0x003ff000, %eax
165 shrl $10, %eax
166 addl %edi, %eax
167
168 movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
169 orl $PAGE_ATTR, %edx
170 movl %edx, (%eax)
171#endif
172
173relocate_new_kernel:
174 /* read the arguments and say goodbye to the stack */ 59 /* read the arguments and say goodbye to the stack */
175 movl 20+4(%esp), %ebx /* page_list */ 60 movl 20+4(%esp), %ebx /* page_list */
176 movl 20+8(%esp), %ebp /* list of pages */ 61 movl 20+8(%esp), %ebp /* list of pages */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9d5674f7b6cc..ae0d8042cf69 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -93,11 +93,13 @@
93#include <asm/desc.h> 93#include <asm/desc.h>
94#include <asm/dma.h> 94#include <asm/dma.h>
95#include <asm/iommu.h> 95#include <asm/iommu.h>
96#include <asm/gart.h>
96#include <asm/mmu_context.h> 97#include <asm/mmu_context.h>
97#include <asm/proto.h> 98#include <asm/proto.h>
98 99
99#include <mach_apic.h> 100#include <mach_apic.h>
100#include <asm/paravirt.h> 101#include <asm/paravirt.h>
102#include <asm/hypervisor.h>
101 103
102#include <asm/percpu.h> 104#include <asm/percpu.h>
103#include <asm/topology.h> 105#include <asm/topology.h>
@@ -448,6 +450,7 @@ static void __init reserve_early_setup_data(void)
448 * @size: Size of the crashkernel memory to reserve. 450 * @size: Size of the crashkernel memory to reserve.
449 * Returns the base address on success, and -1ULL on failure. 451 * Returns the base address on success, and -1ULL on failure.
450 */ 452 */
453static
451unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) 454unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
452{ 455{
453 const unsigned long long alignment = 16<<20; /* 16M */ 456 const unsigned long long alignment = 16<<20; /* 16M */
@@ -583,161 +586,24 @@ static int __init setup_elfcorehdr(char *arg)
583early_param("elfcorehdr", setup_elfcorehdr); 586early_param("elfcorehdr", setup_elfcorehdr);
584#endif 587#endif
585 588
586static struct x86_quirks default_x86_quirks __initdata; 589static int __init default_update_genapic(void)
587
588struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
589
590/*
591 * Some BIOSes seem to corrupt the low 64k of memory during events
592 * like suspend/resume and unplugging an HDMI cable. Reserve all
593 * remaining free memory in that area and fill it with a distinct
594 * pattern.
595 */
596#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
597#define MAX_SCAN_AREAS 8
598
599static int __read_mostly memory_corruption_check = -1;
600
601static unsigned __read_mostly corruption_check_size = 64*1024;
602static unsigned __read_mostly corruption_check_period = 60; /* seconds */
603
604static struct e820entry scan_areas[MAX_SCAN_AREAS];
605static int num_scan_areas;
606
607
608static int set_corruption_check(char *arg)
609{ 590{
610 char *end; 591#ifdef CONFIG_X86_SMP
611 592# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
612 memory_corruption_check = simple_strtol(arg, &end, 10); 593 genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
613 594# endif
614 return (*end == 0) ? 0 : -EINVAL;
615}
616early_param("memory_corruption_check", set_corruption_check);
617
618static int set_corruption_check_period(char *arg)
619{
620 char *end;
621
622 corruption_check_period = simple_strtoul(arg, &end, 10);
623
624 return (*end == 0) ? 0 : -EINVAL;
625}
626early_param("memory_corruption_check_period", set_corruption_check_period);
627
628static int set_corruption_check_size(char *arg)
629{
630 char *end;
631 unsigned size;
632
633 size = memparse(arg, &end);
634
635 if (*end == '\0')
636 corruption_check_size = size;
637
638 return (size == corruption_check_size) ? 0 : -EINVAL;
639}
640early_param("memory_corruption_check_size", set_corruption_check_size);
641
642
643static void __init setup_bios_corruption_check(void)
644{
645 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
646
647 if (memory_corruption_check == -1) {
648 memory_corruption_check =
649#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
650 1
651#else
652 0
653#endif 595#endif
654 ;
655 }
656
657 if (corruption_check_size == 0)
658 memory_corruption_check = 0;
659
660 if (!memory_corruption_check)
661 return;
662
663 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
664
665 while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
666 u64 size;
667 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
668
669 if (addr == 0)
670 break;
671 596
672 if ((addr + size) > corruption_check_size) 597 return 0;
673 size = corruption_check_size - addr;
674
675 if (size == 0)
676 break;
677
678 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
679 scan_areas[num_scan_areas].addr = addr;
680 scan_areas[num_scan_areas].size = size;
681 num_scan_areas++;
682
683 /* Assume we've already mapped this early memory */
684 memset(__va(addr), 0, size);
685
686 addr += size;
687 }
688
689 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
690 num_scan_areas);
691 update_e820();
692}
693
694static struct timer_list periodic_check_timer;
695
696void check_for_bios_corruption(void)
697{
698 int i;
699 int corruption = 0;
700
701 if (!memory_corruption_check)
702 return;
703
704 for(i = 0; i < num_scan_areas; i++) {
705 unsigned long *addr = __va(scan_areas[i].addr);
706 unsigned long size = scan_areas[i].size;
707
708 for(; size; addr++, size -= sizeof(unsigned long)) {
709 if (!*addr)
710 continue;
711 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
712 addr, __pa(addr), *addr);
713 corruption = 1;
714 *addr = 0;
715 }
716 }
717
718 WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
719}
720
721static void periodic_check_for_corruption(unsigned long data)
722{
723 check_for_bios_corruption();
724 mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
725} 598}
726 599
727void start_periodic_check_for_corruption(void) 600static struct x86_quirks default_x86_quirks __initdata = {
728{ 601 .update_genapic = default_update_genapic,
729 if (!memory_corruption_check || corruption_check_period == 0) 602};
730 return;
731
732 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
733 corruption_check_period);
734 603
735 init_timer(&periodic_check_timer); 604struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
736 periodic_check_timer.function = &periodic_check_for_corruption;
737 periodic_check_for_corruption(0);
738}
739#endif
740 605
606#ifdef CONFIG_X86_RESERVE_LOW_64K
741static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) 607static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
742{ 608{
743 printk(KERN_NOTICE 609 printk(KERN_NOTICE
@@ -749,6 +615,7 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
749 615
750 return 0; 616 return 0;
751} 617}
618#endif
752 619
753/* List of systems that have known low memory corruption BIOS problems */ 620/* List of systems that have known low memory corruption BIOS problems */
754static struct dmi_system_id __initdata bad_bios_dmi_table[] = { 621static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
@@ -794,6 +661,9 @@ void __init setup_arch(char **cmdline_p)
794 printk(KERN_INFO "Command line: %s\n", boot_command_line); 661 printk(KERN_INFO "Command line: %s\n", boot_command_line);
795#endif 662#endif
796 663
664 /* VMI may relocate the fixmap; do this before touching ioremap area */
665 vmi_init();
666
797 early_cpu_init(); 667 early_cpu_init();
798 early_ioremap_init(); 668 early_ioremap_init();
799 669
@@ -880,13 +750,8 @@ void __init setup_arch(char **cmdline_p)
880 check_efer(); 750 check_efer();
881#endif 751#endif
882 752
883#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) 753 /* Must be before kernel pagetables are setup */
884 /* 754 vmi_activate();
885 * Must be before kernel pagetables are setup
886 * or fixmap area is touched.
887 */
888 vmi_init();
889#endif
890 755
891 /* after early param, so could get panic from serial */ 756 /* after early param, so could get panic from serial */
892 reserve_early_setup_data(); 757 reserve_early_setup_data();
@@ -909,6 +774,12 @@ void __init setup_arch(char **cmdline_p)
909 774
910 dmi_check_system(bad_bios_dmi_table); 775 dmi_check_system(bad_bios_dmi_table);
911 776
777 /*
778 * VMware detection requires dmi to be available, so this
779 * needs to be done after dmi_scan_machine, for the BP.
780 */
781 init_hypervisor(&boot_cpu_data);
782
912#ifdef CONFIG_X86_32 783#ifdef CONFIG_X86_32
913 probe_roms(); 784 probe_roms();
914#endif 785#endif
@@ -1082,7 +953,7 @@ void __init setup_arch(char **cmdline_p)
1082 ioapic_init_mappings(); 953 ioapic_init_mappings();
1083 954
1084 /* need to wait for io_apic is mapped */ 955 /* need to wait for io_apic is mapped */
1085 nr_irqs = probe_nr_irqs(); 956 probe_nr_irqs_gsi();
1086 957
1087 kvm_guest_init(); 958 kvm_guest_init();
1088 959
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ae0c0d3bb770..a4b619c33106 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -152,8 +152,11 @@ void __init setup_per_cpu_areas(void)
152 old_size = PERCPU_ENOUGH_ROOM; 152 old_size = PERCPU_ENOUGH_ROOM;
153 align = max_t(unsigned long, PAGE_SIZE, align); 153 align = max_t(unsigned long, PAGE_SIZE, align);
154 size = roundup(old_size, align); 154 size = roundup(old_size, align);
155 printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", 155
156 size); 156 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
157 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
158
159 pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
157 160
158 for_each_possible_cpu(cpu) { 161 for_each_possible_cpu(cpu) {
159#ifndef CONFIG_NEED_MULTIPLE_NODES 162#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -164,28 +167,21 @@ void __init setup_per_cpu_areas(void)
164 if (!node_online(node) || !NODE_DATA(node)) { 167 if (!node_online(node) || !NODE_DATA(node)) {
165 ptr = __alloc_bootmem(size, align, 168 ptr = __alloc_bootmem(size, align,
166 __pa(MAX_DMA_ADDRESS)); 169 __pa(MAX_DMA_ADDRESS));
167 printk(KERN_INFO 170 pr_info("cpu %d has no node %d or node-local memory\n",
168 "cpu %d has no node %d or node-local memory\n",
169 cpu, node); 171 cpu, node);
170 if (ptr) 172 pr_debug("per cpu data for cpu%d at %016lx\n",
171 printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n", 173 cpu, __pa(ptr));
172 cpu, __pa(ptr)); 174 } else {
173 }
174 else {
175 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, 175 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
176 __pa(MAX_DMA_ADDRESS)); 176 __pa(MAX_DMA_ADDRESS));
177 if (ptr) 177 pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
178 printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", 178 cpu, node, __pa(ptr));
179 cpu, node, __pa(ptr));
180 } 179 }
181#endif 180#endif
182 per_cpu_offset(cpu) = ptr - __per_cpu_start; 181 per_cpu_offset(cpu) = ptr - __per_cpu_start;
183 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 182 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
184 } 183 }
185 184
186 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
187 NR_CPUS, nr_cpu_ids, nr_node_ids);
188
189 /* Setup percpu data maps */ 185 /* Setup percpu data maps */
190 setup_per_cpu_maps(); 186 setup_per_cpu_maps();
191 187
@@ -282,7 +278,7 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
282 else 278 else
283 cpu_clear(cpu, *mask); 279 cpu_clear(cpu, *mask);
284 280
285 cpulist_scnprintf(buf, sizeof(buf), *mask); 281 cpulist_scnprintf(buf, sizeof(buf), mask);
286 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 282 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
287 enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf); 283 enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
288 } 284 }
@@ -334,25 +330,25 @@ static const cpumask_t cpu_mask_none;
334/* 330/*
335 * Returns a pointer to the bitmask of CPUs on Node 'node'. 331 * Returns a pointer to the bitmask of CPUs on Node 'node'.
336 */ 332 */
337const cpumask_t *_node_to_cpumask_ptr(int node) 333const cpumask_t *cpumask_of_node(int node)
338{ 334{
339 if (node_to_cpumask_map == NULL) { 335 if (node_to_cpumask_map == NULL) {
340 printk(KERN_WARNING 336 printk(KERN_WARNING
341 "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", 337 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
342 node); 338 node);
343 dump_stack(); 339 dump_stack();
344 return (const cpumask_t *)&cpu_online_map; 340 return (const cpumask_t *)&cpu_online_map;
345 } 341 }
346 if (node >= nr_node_ids) { 342 if (node >= nr_node_ids) {
347 printk(KERN_WARNING 343 printk(KERN_WARNING
348 "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n", 344 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
349 node, nr_node_ids); 345 node, nr_node_ids);
350 dump_stack(); 346 dump_stack();
351 return &cpu_mask_none; 347 return &cpu_mask_none;
352 } 348 }
353 return &node_to_cpumask_map[node]; 349 return &node_to_cpumask_map[node];
354} 350}
355EXPORT_SYMBOL(_node_to_cpumask_ptr); 351EXPORT_SYMBOL(cpumask_of_node);
356 352
357/* 353/*
358 * Returns a bitmask of CPUs on Node 'node'. 354 * Returns a bitmask of CPUs on Node 'node'.
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
deleted file mode 100644
index cc673aa55ce4..000000000000
--- a/arch/x86/kernel/sigframe.h
+++ /dev/null
@@ -1,42 +0,0 @@
1#ifdef CONFIG_X86_32
2struct sigframe {
3 char __user *pretcode;
4 int sig;
5 struct sigcontext sc;
6 /*
7 * fpstate is unused. fpstate is moved/allocated after
8 * retcode[] below. This movement allows to have the FP state and the
9 * future state extensions (xsave) stay together.
10 * And at the same time retaining the unused fpstate, prevents changing
11 * the offset of extramask[] in the sigframe and thus prevent any
12 * legacy application accessing/modifying it.
13 */
14 struct _fpstate fpstate_unused;
15 unsigned long extramask[_NSIG_WORDS-1];
16 char retcode[8];
17 /* fp state follows here */
18};
19
20struct rt_sigframe {
21 char __user *pretcode;
22 int sig;
23 struct siginfo __user *pinfo;
24 void __user *puc;
25 struct siginfo info;
26 struct ucontext uc;
27 char retcode[8];
28 /* fp state follows here */
29};
30#else
31struct rt_sigframe {
32 char __user *pretcode;
33 struct ucontext uc;
34 struct siginfo info;
35 /* fp state follows here */
36};
37
38int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
39 sigset_t *set, struct pt_regs *regs);
40int ia32_setup_frame(int sig, struct k_sigaction *ka,
41 sigset_t *set, struct pt_regs *regs);
42#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal.c
index d6dd057d0f22..89bb7668041d 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal.c
@@ -1,36 +1,41 @@
1/* 1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
3 * 4 *
4 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson 5 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
5 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen
6 */ 8 */
7#include <linux/list.h>
8 9
9#include <linux/personality.h> 10#include <linux/sched.h>
10#include <linux/binfmts.h> 11#include <linux/mm.h>
11#include <linux/suspend.h> 12#include <linux/smp.h>
12#include <linux/kernel.h> 13#include <linux/kernel.h>
13#include <linux/ptrace.h>
14#include <linux/signal.h> 14#include <linux/signal.h>
15#include <linux/stddef.h>
16#include <linux/unistd.h>
17#include <linux/errno.h> 15#include <linux/errno.h>
18#include <linux/sched.h>
19#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/ptrace.h>
20#include <linux/tracehook.h> 18#include <linux/tracehook.h>
21#include <linux/elf.h> 19#include <linux/unistd.h>
22#include <linux/smp.h> 20#include <linux/stddef.h>
23#include <linux/mm.h> 21#include <linux/personality.h>
22#include <linux/uaccess.h>
24 23
25#include <asm/processor.h> 24#include <asm/processor.h>
26#include <asm/ucontext.h> 25#include <asm/ucontext.h>
27#include <asm/uaccess.h>
28#include <asm/i387.h> 26#include <asm/i387.h>
29#include <asm/vdso.h> 27#include <asm/vdso.h>
28
29#ifdef CONFIG_X86_64
30#include <asm/proto.h>
31#include <asm/ia32_unistd.h>
32#include <asm/mce.h>
33#endif /* CONFIG_X86_64 */
34
30#include <asm/syscall.h> 35#include <asm/syscall.h>
31#include <asm/syscalls.h> 36#include <asm/syscalls.h>
32 37
33#include "sigframe.h" 38#include <asm/sigframe.h>
34 39
35#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 40#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
36 41
@@ -45,74 +50,6 @@
45# define FIX_EFLAGS __FIX_EFLAGS 50# define FIX_EFLAGS __FIX_EFLAGS
46#endif 51#endif
47 52
48/*
49 * Atomically swap in the new signal mask, and wait for a signal.
50 */
51asmlinkage int
52sys_sigsuspend(int history0, int history1, old_sigset_t mask)
53{
54 mask &= _BLOCKABLE;
55 spin_lock_irq(&current->sighand->siglock);
56 current->saved_sigmask = current->blocked;
57 siginitset(&current->blocked, mask);
58 recalc_sigpending();
59 spin_unlock_irq(&current->sighand->siglock);
60
61 current->state = TASK_INTERRUPTIBLE;
62 schedule();
63 set_restore_sigmask();
64
65 return -ERESTARTNOHAND;
66}
67
68asmlinkage int
69sys_sigaction(int sig, const struct old_sigaction __user *act,
70 struct old_sigaction __user *oact)
71{
72 struct k_sigaction new_ka, old_ka;
73 int ret;
74
75 if (act) {
76 old_sigset_t mask;
77
78 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
79 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
80 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
81 return -EFAULT;
82
83 __get_user(new_ka.sa.sa_flags, &act->sa_flags);
84 __get_user(mask, &act->sa_mask);
85 siginitset(&new_ka.sa.sa_mask, mask);
86 }
87
88 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
89
90 if (!ret && oact) {
91 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
92 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
93 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
94 return -EFAULT;
95
96 __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
97 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
98 }
99
100 return ret;
101}
102
103asmlinkage int sys_sigaltstack(unsigned long bx)
104{
105 /*
106 * This is needed to make gcc realize it doesn't own the
107 * "struct pt_regs"
108 */
109 struct pt_regs *regs = (struct pt_regs *)&bx;
110 const stack_t __user *uss = (const stack_t __user *)bx;
111 stack_t __user *uoss = (stack_t __user *)regs->cx;
112
113 return do_sigaltstack(uss, uoss, regs->sp);
114}
115
116#define COPY(x) { \ 53#define COPY(x) { \
117 err |= __get_user(regs->x, &sc->x); \ 54 err |= __get_user(regs->x, &sc->x); \
118} 55}
@@ -123,7 +60,7 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
123 regs->seg = tmp; \ 60 regs->seg = tmp; \
124} 61}
125 62
126#define COPY_SEG_STRICT(seg) { \ 63#define COPY_SEG_CPL3(seg) { \
127 unsigned short tmp; \ 64 unsigned short tmp; \
128 err |= __get_user(tmp, &sc->seg); \ 65 err |= __get_user(tmp, &sc->seg); \
129 regs->seg = tmp | 3; \ 66 regs->seg = tmp | 3; \
@@ -135,9 +72,6 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
135 loadsegment(seg, tmp); \ 72 loadsegment(seg, tmp); \
136} 73}
137 74
138/*
139 * Do a signal return; undo the signal stack.
140 */
141static int 75static int
142restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 76restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
143 unsigned long *pax) 77 unsigned long *pax)
@@ -149,14 +83,36 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
149 /* Always make any pending restarted system calls return -EINTR */ 83 /* Always make any pending restarted system calls return -EINTR */
150 current_thread_info()->restart_block.fn = do_no_restart_syscall; 84 current_thread_info()->restart_block.fn = do_no_restart_syscall;
151 85
86#ifdef CONFIG_X86_32
152 GET_SEG(gs); 87 GET_SEG(gs);
153 COPY_SEG(fs); 88 COPY_SEG(fs);
154 COPY_SEG(es); 89 COPY_SEG(es);
155 COPY_SEG(ds); 90 COPY_SEG(ds);
91#endif /* CONFIG_X86_32 */
92
156 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 93 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
157 COPY(dx); COPY(cx); COPY(ip); 94 COPY(dx); COPY(cx); COPY(ip);
158 COPY_SEG_STRICT(cs); 95
159 COPY_SEG_STRICT(ss); 96#ifdef CONFIG_X86_64
97 COPY(r8);
98 COPY(r9);
99 COPY(r10);
100 COPY(r11);
101 COPY(r12);
102 COPY(r13);
103 COPY(r14);
104 COPY(r15);
105#endif /* CONFIG_X86_64 */
106
107#ifdef CONFIG_X86_32
108 COPY_SEG_CPL3(cs);
109 COPY_SEG_CPL3(ss);
110#else /* !CONFIG_X86_32 */
111 /* Kernel saves and restores only the CS segment register on signals,
112 * which is the bare minimum needed to allow mixed 32/64-bit code.
113 * App's signal handler can save/restore other segments if needed. */
114 COPY_SEG_CPL3(cs);
115#endif /* CONFIG_X86_32 */
160 116
161 err |= __get_user(tmpflags, &sc->flags); 117 err |= __get_user(tmpflags, &sc->flags);
162 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 118 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -169,102 +125,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
169 return err; 125 return err;
170} 126}
171 127
172asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
173{
174 struct sigframe __user *frame;
175 struct pt_regs *regs;
176 unsigned long ax;
177 sigset_t set;
178
179 regs = (struct pt_regs *) &__unused;
180 frame = (struct sigframe __user *)(regs->sp - 8);
181
182 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
183 goto badframe;
184 if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
185 && __copy_from_user(&set.sig[1], &frame->extramask,
186 sizeof(frame->extramask))))
187 goto badframe;
188
189 sigdelsetmask(&set, ~_BLOCKABLE);
190 spin_lock_irq(&current->sighand->siglock);
191 current->blocked = set;
192 recalc_sigpending();
193 spin_unlock_irq(&current->sighand->siglock);
194
195 if (restore_sigcontext(regs, &frame->sc, &ax))
196 goto badframe;
197 return ax;
198
199badframe:
200 if (show_unhandled_signals && printk_ratelimit()) {
201 printk("%s%s[%d] bad frame in sigreturn frame:"
202 "%p ip:%lx sp:%lx oeax:%lx",
203 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
204 current->comm, task_pid_nr(current), frame, regs->ip,
205 regs->sp, regs->orig_ax);
206 print_vma_addr(" in ", regs->ip);
207 printk(KERN_CONT "\n");
208 }
209
210 force_sig(SIGSEGV, current);
211
212 return 0;
213}
214
215static long do_rt_sigreturn(struct pt_regs *regs)
216{
217 struct rt_sigframe __user *frame;
218 unsigned long ax;
219 sigset_t set;
220
221 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
222 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
223 goto badframe;
224 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
225 goto badframe;
226
227 sigdelsetmask(&set, ~_BLOCKABLE);
228 spin_lock_irq(&current->sighand->siglock);
229 current->blocked = set;
230 recalc_sigpending();
231 spin_unlock_irq(&current->sighand->siglock);
232
233 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
234 goto badframe;
235
236 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
237 goto badframe;
238
239 return ax;
240
241badframe:
242 signal_fault(regs, frame, "rt_sigreturn");
243 return 0;
244}
245
246asmlinkage int sys_rt_sigreturn(unsigned long __unused)
247{
248 struct pt_regs *regs = (struct pt_regs *)&__unused;
249
250 return do_rt_sigreturn(regs);
251}
252
253/*
254 * Set up a signal frame.
255 */
256static int 128static int
257setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, 129setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
258 struct pt_regs *regs, unsigned long mask) 130 struct pt_regs *regs, unsigned long mask)
259{ 131{
260 int tmp, err = 0; 132 int err = 0;
261 133
262 err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); 134#ifdef CONFIG_X86_32
263 savesegment(gs, tmp); 135 {
264 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 136 unsigned int tmp;
265 137
138 savesegment(gs, tmp);
139 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
140 }
141 err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs);
266 err |= __put_user(regs->es, (unsigned int __user *)&sc->es); 142 err |= __put_user(regs->es, (unsigned int __user *)&sc->es);
267 err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); 143 err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds);
144#endif /* CONFIG_X86_32 */
145
268 err |= __put_user(regs->di, &sc->di); 146 err |= __put_user(regs->di, &sc->di);
269 err |= __put_user(regs->si, &sc->si); 147 err |= __put_user(regs->si, &sc->si);
270 err |= __put_user(regs->bp, &sc->bp); 148 err |= __put_user(regs->bp, &sc->bp);
@@ -273,19 +151,33 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
273 err |= __put_user(regs->dx, &sc->dx); 151 err |= __put_user(regs->dx, &sc->dx);
274 err |= __put_user(regs->cx, &sc->cx); 152 err |= __put_user(regs->cx, &sc->cx);
275 err |= __put_user(regs->ax, &sc->ax); 153 err |= __put_user(regs->ax, &sc->ax);
154#ifdef CONFIG_X86_64
155 err |= __put_user(regs->r8, &sc->r8);
156 err |= __put_user(regs->r9, &sc->r9);
157 err |= __put_user(regs->r10, &sc->r10);
158 err |= __put_user(regs->r11, &sc->r11);
159 err |= __put_user(regs->r12, &sc->r12);
160 err |= __put_user(regs->r13, &sc->r13);
161 err |= __put_user(regs->r14, &sc->r14);
162 err |= __put_user(regs->r15, &sc->r15);
163#endif /* CONFIG_X86_64 */
164
276 err |= __put_user(current->thread.trap_no, &sc->trapno); 165 err |= __put_user(current->thread.trap_no, &sc->trapno);
277 err |= __put_user(current->thread.error_code, &sc->err); 166 err |= __put_user(current->thread.error_code, &sc->err);
278 err |= __put_user(regs->ip, &sc->ip); 167 err |= __put_user(regs->ip, &sc->ip);
168#ifdef CONFIG_X86_32
279 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); 169 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
280 err |= __put_user(regs->flags, &sc->flags); 170 err |= __put_user(regs->flags, &sc->flags);
281 err |= __put_user(regs->sp, &sc->sp_at_signal); 171 err |= __put_user(regs->sp, &sc->sp_at_signal);
282 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); 172 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
173#else /* !CONFIG_X86_32 */
174 err |= __put_user(regs->flags, &sc->flags);
175 err |= __put_user(regs->cs, &sc->cs);
176 err |= __put_user(0, &sc->gs);
177 err |= __put_user(0, &sc->fs);
178#endif /* CONFIG_X86_32 */
283 179
284 tmp = save_i387_xstate(fpstate); 180 err |= __put_user(fpstate, &sc->fpstate);
285 if (tmp < 0)
286 err = 1;
287 else
288 err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
289 181
290 /* non-iBCS2 extensions.. */ 182 /* non-iBCS2 extensions.. */
291 err |= __put_user(mask, &sc->oldmask); 183 err |= __put_user(mask, &sc->oldmask);
@@ -295,6 +187,32 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
295} 187}
296 188
297/* 189/*
190 * Set up a signal frame.
191 */
192#ifdef CONFIG_X86_32
193static const struct {
194 u16 poplmovl;
195 u32 val;
196 u16 int80;
197} __attribute__((packed)) retcode = {
198 0xb858, /* popl %eax; movl $..., %eax */
199 __NR_sigreturn,
200 0x80cd, /* int $0x80 */
201};
202
203static const struct {
204 u8 movl;
205 u32 val;
206 u16 int80;
207 u8 pad;
208} __attribute__((packed)) rt_retcode = {
209 0xb8, /* movl $..., %eax */
210 __NR_rt_sigreturn,
211 0x80cd, /* int $0x80 */
212 0
213};
214
215/*
298 * Determine which stack to use.. 216 * Determine which stack to use..
299 */ 217 */
300static inline void __user * 218static inline void __user *
@@ -328,6 +246,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
328 if (used_math()) { 246 if (used_math()) {
329 sp = sp - sig_xstate_size; 247 sp = sp - sig_xstate_size;
330 *fpstate = (struct _fpstate *) sp; 248 *fpstate = (struct _fpstate *) sp;
249 if (save_i387_xstate(*fpstate) < 0)
250 return (void __user *)-1L;
331 } 251 }
332 252
333 sp -= frame_size; 253 sp -= frame_size;
@@ -383,9 +303,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
383 * reasons and because gdb uses it as a signature to notice 303 * reasons and because gdb uses it as a signature to notice
384 * signal handler stack frames. 304 * signal handler stack frames.
385 */ 305 */
386 err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); 306 err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode);
387 err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
388 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
389 307
390 if (err) 308 if (err)
391 return -EFAULT; 309 return -EFAULT;
@@ -454,9 +372,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
454 * reasons and because gdb uses it as a signature to notice 372 * reasons and because gdb uses it as a signature to notice
455 * signal handler stack frames. 373 * signal handler stack frames.
456 */ 374 */
457 err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); 375 err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
458 err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
459 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
460 376
461 if (err) 377 if (err)
462 return -EFAULT; 378 return -EFAULT;
@@ -475,23 +391,293 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
475 391
476 return 0; 392 return 0;
477} 393}
394#else /* !CONFIG_X86_32 */
395/*
396 * Determine which stack to use..
397 */
398static void __user *
399get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
400{
401 /* Default to using normal stack - redzone*/
402 sp -= 128;
403
404 /* This is the X/Open sanctioned signal stack switching. */
405 if (ka->sa.sa_flags & SA_ONSTACK) {
406 if (sas_ss_flags(sp) == 0)
407 sp = current->sas_ss_sp + current->sas_ss_size;
408 }
409
410 return (void __user *)round_down(sp - size, 64);
411}
412
413static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
414 sigset_t *set, struct pt_regs *regs)
415{
416 struct rt_sigframe __user *frame;
417 void __user *fp = NULL;
418 int err = 0;
419 struct task_struct *me = current;
420
421 if (used_math()) {
422 fp = get_stack(ka, regs->sp, sig_xstate_size);
423 frame = (void __user *)round_down(
424 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
425
426 if (save_i387_xstate(fp) < 0)
427 return -EFAULT;
428 } else
429 frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
430
431 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
432 return -EFAULT;
433
434 if (ka->sa.sa_flags & SA_SIGINFO) {
435 if (copy_siginfo_to_user(&frame->info, info))
436 return -EFAULT;
437 }
438
439 /* Create the ucontext. */
440 if (cpu_has_xsave)
441 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
442 else
443 err |= __put_user(0, &frame->uc.uc_flags);
444 err |= __put_user(0, &frame->uc.uc_link);
445 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
446 err |= __put_user(sas_ss_flags(regs->sp),
447 &frame->uc.uc_stack.ss_flags);
448 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
449 err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
450 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
451
452 /* Set up to return from userspace. If provided, use a stub
453 already in userspace. */
454 /* x86-64 should always use SA_RESTORER. */
455 if (ka->sa.sa_flags & SA_RESTORER) {
456 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
457 } else {
458 /* could use a vstub here */
459 return -EFAULT;
460 }
461
462 if (err)
463 return -EFAULT;
464
465 /* Set up registers for signal handler */
466 regs->di = sig;
467 /* In case the signal handler was declared without prototypes */
468 regs->ax = 0;
469
470 /* This also works for non SA_SIGINFO handlers because they expect the
471 next argument after the signal number on the stack. */
472 regs->si = (unsigned long)&frame->info;
473 regs->dx = (unsigned long)&frame->uc;
474 regs->ip = (unsigned long) ka->sa.sa_handler;
475
476 regs->sp = (unsigned long)frame;
477
478 /* Set up the CS register to run signal handlers in 64-bit mode,
479 even if the handler happens to be interrupting 32-bit code. */
480 regs->cs = __USER_CS;
481
482 return 0;
483}
484#endif /* CONFIG_X86_32 */
485
486#ifdef CONFIG_X86_32
487/*
488 * Atomically swap in the new signal mask, and wait for a signal.
489 */
490asmlinkage int
491sys_sigsuspend(int history0, int history1, old_sigset_t mask)
492{
493 mask &= _BLOCKABLE;
494 spin_lock_irq(&current->sighand->siglock);
495 current->saved_sigmask = current->blocked;
496 siginitset(&current->blocked, mask);
497 recalc_sigpending();
498 spin_unlock_irq(&current->sighand->siglock);
499
500 current->state = TASK_INTERRUPTIBLE;
501 schedule();
502 set_restore_sigmask();
503
504 return -ERESTARTNOHAND;
505}
506
507asmlinkage int
508sys_sigaction(int sig, const struct old_sigaction __user *act,
509 struct old_sigaction __user *oact)
510{
511 struct k_sigaction new_ka, old_ka;
512 int ret;
513
514 if (act) {
515 old_sigset_t mask;
516
517 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
518 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
519 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
520 return -EFAULT;
521
522 __get_user(new_ka.sa.sa_flags, &act->sa_flags);
523 __get_user(mask, &act->sa_mask);
524 siginitset(&new_ka.sa.sa_mask, mask);
525 }
526
527 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
528
529 if (!ret && oact) {
530 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
531 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
532 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
533 return -EFAULT;
534
535 __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
536 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
537 }
538
539 return ret;
540}
541#endif /* CONFIG_X86_32 */
542
543#ifdef CONFIG_X86_32
544asmlinkage int sys_sigaltstack(unsigned long bx)
545{
546 /*
547 * This is needed to make gcc realize it doesn't own the
548 * "struct pt_regs"
549 */
550 struct pt_regs *regs = (struct pt_regs *)&bx;
551 const stack_t __user *uss = (const stack_t __user *)bx;
552 stack_t __user *uoss = (stack_t __user *)regs->cx;
553
554 return do_sigaltstack(uss, uoss, regs->sp);
555}
556#else /* !CONFIG_X86_32 */
557asmlinkage long
558sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
559 struct pt_regs *regs)
560{
561 return do_sigaltstack(uss, uoss, regs->sp);
562}
563#endif /* CONFIG_X86_32 */
564
565/*
566 * Do a signal return; undo the signal stack.
567 */
568#ifdef CONFIG_X86_32
569asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
570{
571 struct sigframe __user *frame;
572 struct pt_regs *regs;
573 unsigned long ax;
574 sigset_t set;
575
576 regs = (struct pt_regs *) &__unused;
577 frame = (struct sigframe __user *)(regs->sp - 8);
578
579 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
580 goto badframe;
581 if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
582 && __copy_from_user(&set.sig[1], &frame->extramask,
583 sizeof(frame->extramask))))
584 goto badframe;
585
586 sigdelsetmask(&set, ~_BLOCKABLE);
587 spin_lock_irq(&current->sighand->siglock);
588 current->blocked = set;
589 recalc_sigpending();
590 spin_unlock_irq(&current->sighand->siglock);
591
592 if (restore_sigcontext(regs, &frame->sc, &ax))
593 goto badframe;
594 return ax;
595
596badframe:
597 signal_fault(regs, frame, "sigreturn");
598
599 return 0;
600}
601#endif /* CONFIG_X86_32 */
602
603static long do_rt_sigreturn(struct pt_regs *regs)
604{
605 struct rt_sigframe __user *frame;
606 unsigned long ax;
607 sigset_t set;
608
609 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
610 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
611 goto badframe;
612 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
613 goto badframe;
614
615 sigdelsetmask(&set, ~_BLOCKABLE);
616 spin_lock_irq(&current->sighand->siglock);
617 current->blocked = set;
618 recalc_sigpending();
619 spin_unlock_irq(&current->sighand->siglock);
620
621 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
622 goto badframe;
623
624 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
625 goto badframe;
626
627 return ax;
628
629badframe:
630 signal_fault(regs, frame, "rt_sigreturn");
631 return 0;
632}
633
634#ifdef CONFIG_X86_32
635asmlinkage int sys_rt_sigreturn(struct pt_regs regs)
636{
637 return do_rt_sigreturn(&regs);
638}
639#else /* !CONFIG_X86_32 */
640asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
641{
642 return do_rt_sigreturn(regs);
643}
644#endif /* CONFIG_X86_32 */
478 645
479/* 646/*
480 * OK, we're invoking a handler: 647 * OK, we're invoking a handler:
481 */ 648 */
482static int signr_convert(int sig) 649static int signr_convert(int sig)
483{ 650{
651#ifdef CONFIG_X86_32
484 struct thread_info *info = current_thread_info(); 652 struct thread_info *info = current_thread_info();
485 653
486 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) 654 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
487 return info->exec_domain->signal_invmap[sig]; 655 return info->exec_domain->signal_invmap[sig];
656#endif /* CONFIG_X86_32 */
488 return sig; 657 return sig;
489} 658}
490 659
660#ifdef CONFIG_X86_32
661
491#define is_ia32 1 662#define is_ia32 1
492#define ia32_setup_frame __setup_frame 663#define ia32_setup_frame __setup_frame
493#define ia32_setup_rt_frame __setup_rt_frame 664#define ia32_setup_rt_frame __setup_rt_frame
494 665
666#else /* !CONFIG_X86_32 */
667
668#ifdef CONFIG_IA32_EMULATION
669#define is_ia32 test_thread_flag(TIF_IA32)
670#else /* !CONFIG_IA32_EMULATION */
671#define is_ia32 0
672#endif /* CONFIG_IA32_EMULATION */
673
674int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
675 sigset_t *set, struct pt_regs *regs);
676int ia32_setup_frame(int sig, struct k_sigaction *ka,
677 sigset_t *set, struct pt_regs *regs);
678
679#endif /* CONFIG_X86_32 */
680
495static int 681static int
496setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 682setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
497 sigset_t *set, struct pt_regs *regs) 683 sigset_t *set, struct pt_regs *regs)
@@ -592,7 +778,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
592 return 0; 778 return 0;
593} 779}
594 780
781#ifdef CONFIG_X86_32
595#define NR_restart_syscall __NR_restart_syscall 782#define NR_restart_syscall __NR_restart_syscall
783#else /* !CONFIG_X86_32 */
784#define NR_restart_syscall \
785 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
786#endif /* CONFIG_X86_32 */
787
596/* 788/*
597 * Note that 'init' is a special process: it doesn't get signals it doesn't 789 * Note that 'init' is a special process: it doesn't get signals it doesn't
598 * want to handle. Thus you cannot kill init even with a SIGKILL even by 790 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -704,8 +896,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
704 struct task_struct *me = current; 896 struct task_struct *me = current;
705 897
706 if (show_unhandled_signals && printk_ratelimit()) { 898 if (show_unhandled_signals && printk_ratelimit()) {
707 printk(KERN_INFO 899 printk("%s"
708 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", 900 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
901 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
709 me->comm, me->pid, where, frame, 902 me->comm, me->pid, where, frame,
710 regs->ip, regs->sp, regs->orig_ax); 903 regs->ip, regs->sp, regs->orig_ax);
711 print_vma_addr(" in ", regs->ip); 904 print_vma_addr(" in ", regs->ip);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
deleted file mode 100644
index a5c9627f4db9..000000000000
--- a/arch/x86/kernel/signal_64.c
+++ /dev/null
@@ -1,516 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
4 *
5 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen
8 */
9
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/kernel.h>
14#include <linux/signal.h>
15#include <linux/errno.h>
16#include <linux/wait.h>
17#include <linux/ptrace.h>
18#include <linux/tracehook.h>
19#include <linux/unistd.h>
20#include <linux/stddef.h>
21#include <linux/personality.h>
22#include <linux/compiler.h>
23#include <linux/uaccess.h>
24
25#include <asm/processor.h>
26#include <asm/ucontext.h>
27#include <asm/i387.h>
28#include <asm/proto.h>
29#include <asm/ia32_unistd.h>
30#include <asm/mce.h>
31#include <asm/syscall.h>
32#include <asm/syscalls.h>
33#include "sigframe.h"
34
35#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
36
37#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
38 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
39 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
40 X86_EFLAGS_CF)
41
42#ifdef CONFIG_X86_32
43# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF)
44#else
45# define FIX_EFLAGS __FIX_EFLAGS
46#endif
47
48asmlinkage long
49sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
50 struct pt_regs *regs)
51{
52 return do_sigaltstack(uss, uoss, regs->sp);
53}
54
55#define COPY(x) { \
56 err |= __get_user(regs->x, &sc->x); \
57}
58
59#define COPY_SEG_STRICT(seg) { \
60 unsigned short tmp; \
61 err |= __get_user(tmp, &sc->seg); \
62 regs->seg = tmp | 3; \
63}
64
65/*
66 * Do a signal return; undo the signal stack.
67 */
68static int
69restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
70 unsigned long *pax)
71{
72 void __user *buf;
73 unsigned int tmpflags;
74 unsigned int err = 0;
75
76 /* Always make any pending restarted system calls return -EINTR */
77 current_thread_info()->restart_block.fn = do_no_restart_syscall;
78
79 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
80 COPY(dx); COPY(cx); COPY(ip);
81 COPY(r8);
82 COPY(r9);
83 COPY(r10);
84 COPY(r11);
85 COPY(r12);
86 COPY(r13);
87 COPY(r14);
88 COPY(r15);
89
90 /* Kernel saves and restores only the CS segment register on signals,
91 * which is the bare minimum needed to allow mixed 32/64-bit code.
92 * App's signal handler can save/restore other segments if needed. */
93 COPY_SEG_STRICT(cs);
94
95 err |= __get_user(tmpflags, &sc->flags);
96 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
97 regs->orig_ax = -1; /* disable syscall checks */
98
99 err |= __get_user(buf, &sc->fpstate);
100 err |= restore_i387_xstate(buf);
101
102 err |= __get_user(*pax, &sc->ax);
103 return err;
104}
105
106static long do_rt_sigreturn(struct pt_regs *regs)
107{
108 struct rt_sigframe __user *frame;
109 unsigned long ax;
110 sigset_t set;
111
112 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
113 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
114 goto badframe;
115 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
116 goto badframe;
117
118 sigdelsetmask(&set, ~_BLOCKABLE);
119 spin_lock_irq(&current->sighand->siglock);
120 current->blocked = set;
121 recalc_sigpending();
122 spin_unlock_irq(&current->sighand->siglock);
123
124 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
125 goto badframe;
126
127 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
128 goto badframe;
129
130 return ax;
131
132badframe:
133 signal_fault(regs, frame, "rt_sigreturn");
134 return 0;
135}
136
137asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
138{
139 return do_rt_sigreturn(regs);
140}
141
142/*
143 * Set up a signal frame.
144 */
145
146static inline int
147setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
148 unsigned long mask, struct task_struct *me)
149{
150 int err = 0;
151
152 err |= __put_user(regs->cs, &sc->cs);
153 err |= __put_user(0, &sc->gs);
154 err |= __put_user(0, &sc->fs);
155
156 err |= __put_user(regs->di, &sc->di);
157 err |= __put_user(regs->si, &sc->si);
158 err |= __put_user(regs->bp, &sc->bp);
159 err |= __put_user(regs->sp, &sc->sp);
160 err |= __put_user(regs->bx, &sc->bx);
161 err |= __put_user(regs->dx, &sc->dx);
162 err |= __put_user(regs->cx, &sc->cx);
163 err |= __put_user(regs->ax, &sc->ax);
164 err |= __put_user(regs->r8, &sc->r8);
165 err |= __put_user(regs->r9, &sc->r9);
166 err |= __put_user(regs->r10, &sc->r10);
167 err |= __put_user(regs->r11, &sc->r11);
168 err |= __put_user(regs->r12, &sc->r12);
169 err |= __put_user(regs->r13, &sc->r13);
170 err |= __put_user(regs->r14, &sc->r14);
171 err |= __put_user(regs->r15, &sc->r15);
172 err |= __put_user(me->thread.trap_no, &sc->trapno);
173 err |= __put_user(me->thread.error_code, &sc->err);
174 err |= __put_user(regs->ip, &sc->ip);
175 err |= __put_user(regs->flags, &sc->flags);
176 err |= __put_user(mask, &sc->oldmask);
177 err |= __put_user(me->thread.cr2, &sc->cr2);
178
179 return err;
180}
181
182/*
183 * Determine which stack to use..
184 */
185
186static void __user *
187get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
188{
189 unsigned long sp;
190
191 /* Default to using normal stack - redzone*/
192 sp = regs->sp - 128;
193
194 /* This is the X/Open sanctioned signal stack switching. */
195 if (ka->sa.sa_flags & SA_ONSTACK) {
196 if (sas_ss_flags(sp) == 0)
197 sp = current->sas_ss_sp + current->sas_ss_size;
198 }
199
200 return (void __user *)round_down(sp - size, 64);
201}
202
203static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
204 sigset_t *set, struct pt_regs *regs)
205{
206 struct rt_sigframe __user *frame;
207 void __user *fp = NULL;
208 int err = 0;
209 struct task_struct *me = current;
210
211 if (used_math()) {
212 fp = get_stack(ka, regs, sig_xstate_size);
213 frame = (void __user *)round_down(
214 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
215
216 if (save_i387_xstate(fp) < 0)
217 return -EFAULT;
218 } else
219 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
220
221 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
222 return -EFAULT;
223
224 if (ka->sa.sa_flags & SA_SIGINFO) {
225 if (copy_siginfo_to_user(&frame->info, info))
226 return -EFAULT;
227 }
228
229 /* Create the ucontext. */
230 if (cpu_has_xsave)
231 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
232 else
233 err |= __put_user(0, &frame->uc.uc_flags);
234 err |= __put_user(0, &frame->uc.uc_link);
235 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
236 err |= __put_user(sas_ss_flags(regs->sp),
237 &frame->uc.uc_stack.ss_flags);
238 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
239 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
240 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
241 if (sizeof(*set) == 16) {
242 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
243 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
244 } else
245 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
246
247 /* Set up to return from userspace. If provided, use a stub
248 already in userspace. */
249 /* x86-64 should always use SA_RESTORER. */
250 if (ka->sa.sa_flags & SA_RESTORER) {
251 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
252 } else {
253 /* could use a vstub here */
254 return -EFAULT;
255 }
256
257 if (err)
258 return -EFAULT;
259
260 /* Set up registers for signal handler */
261 regs->di = sig;
262 /* In case the signal handler was declared without prototypes */
263 regs->ax = 0;
264
265 /* This also works for non SA_SIGINFO handlers because they expect the
266 next argument after the signal number on the stack. */
267 regs->si = (unsigned long)&frame->info;
268 regs->dx = (unsigned long)&frame->uc;
269 regs->ip = (unsigned long) ka->sa.sa_handler;
270
271 regs->sp = (unsigned long)frame;
272
273 /* Set up the CS register to run signal handlers in 64-bit mode,
274 even if the handler happens to be interrupting 32-bit code. */
275 regs->cs = __USER_CS;
276
277 return 0;
278}
279
280/*
281 * OK, we're invoking a handler
282 */
283static int signr_convert(int sig)
284{
285 return sig;
286}
287
288#ifdef CONFIG_IA32_EMULATION
289#define is_ia32 test_thread_flag(TIF_IA32)
290#else
291#define is_ia32 0
292#endif
293
294static int
295setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
296 sigset_t *set, struct pt_regs *regs)
297{
298 int usig = signr_convert(sig);
299 int ret;
300
301 /* Set up the stack frame */
302 if (is_ia32) {
303 if (ka->sa.sa_flags & SA_SIGINFO)
304 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
305 else
306 ret = ia32_setup_frame(usig, ka, set, regs);
307 } else
308 ret = __setup_rt_frame(sig, ka, info, set, regs);
309
310 if (ret) {
311 force_sigsegv(sig, current);
312 return -EFAULT;
313 }
314
315 return ret;
316}
317
318static int
319handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
320 sigset_t *oldset, struct pt_regs *regs)
321{
322 int ret;
323
324 /* Are we from a system call? */
325 if (syscall_get_nr(current, regs) >= 0) {
326 /* If so, check system call restarting.. */
327 switch (syscall_get_error(current, regs)) {
328 case -ERESTART_RESTARTBLOCK:
329 case -ERESTARTNOHAND:
330 regs->ax = -EINTR;
331 break;
332
333 case -ERESTARTSYS:
334 if (!(ka->sa.sa_flags & SA_RESTART)) {
335 regs->ax = -EINTR;
336 break;
337 }
338 /* fallthrough */
339 case -ERESTARTNOINTR:
340 regs->ax = regs->orig_ax;
341 regs->ip -= 2;
342 break;
343 }
344 }
345
346 /*
347 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
348 * flag so that register information in the sigcontext is correct.
349 */
350 if (unlikely(regs->flags & X86_EFLAGS_TF) &&
351 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
352 regs->flags &= ~X86_EFLAGS_TF;
353
354 ret = setup_rt_frame(sig, ka, info, oldset, regs);
355
356 if (ret)
357 return ret;
358
359#ifdef CONFIG_X86_64
360 /*
361 * This has nothing to do with segment registers,
362 * despite the name. This magic affects uaccess.h
363 * macros' behavior. Reset it to the normal setting.
364 */
365 set_fs(USER_DS);
366#endif
367
368 /*
369 * Clear the direction flag as per the ABI for function entry.
370 */
371 regs->flags &= ~X86_EFLAGS_DF;
372
373 /*
374 * Clear TF when entering the signal handler, but
375 * notify any tracer that was single-stepping it.
376 * The tracer may want to single-step inside the
377 * handler too.
378 */
379 regs->flags &= ~X86_EFLAGS_TF;
380
381 spin_lock_irq(&current->sighand->siglock);
382 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
383 if (!(ka->sa.sa_flags & SA_NODEFER))
384 sigaddset(&current->blocked, sig);
385 recalc_sigpending();
386 spin_unlock_irq(&current->sighand->siglock);
387
388 tracehook_signal_handler(sig, info, ka, regs,
389 test_thread_flag(TIF_SINGLESTEP));
390
391 return 0;
392}
393
394#define NR_restart_syscall \
395 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
396/*
397 * Note that 'init' is a special process: it doesn't get signals it doesn't
398 * want to handle. Thus you cannot kill init even with a SIGKILL even by
399 * mistake.
400 */
401static void do_signal(struct pt_regs *regs)
402{
403 struct k_sigaction ka;
404 siginfo_t info;
405 int signr;
406 sigset_t *oldset;
407
408 /*
409 * We want the common case to go fast, which is why we may in certain
410 * cases get here from kernel mode. Just return without doing anything
411 * if so.
412 * X86_32: vm86 regs switched out by assembly code before reaching
413 * here, so testing against kernel CS suffices.
414 */
415 if (!user_mode(regs))
416 return;
417
418 if (current_thread_info()->status & TS_RESTORE_SIGMASK)
419 oldset = &current->saved_sigmask;
420 else
421 oldset = &current->blocked;
422
423 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
424 if (signr > 0) {
425 /*
426 * Re-enable any watchpoints before delivering the
427 * signal to user space. The processor register will
428 * have been cleared if the watchpoint triggered
429 * inside the kernel.
430 */
431 if (current->thread.debugreg7)
432 set_debugreg(current->thread.debugreg7, 7);
433
434 /* Whee! Actually deliver the signal. */
435 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
436 /*
437 * A signal was successfully delivered; the saved
438 * sigmask will have been stored in the signal frame,
439 * and will be restored by sigreturn, so we can simply
440 * clear the TS_RESTORE_SIGMASK flag.
441 */
442 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
443 }
444 return;
445 }
446
447 /* Did we come from a system call? */
448 if (syscall_get_nr(current, regs) >= 0) {
449 /* Restart the system call - no handlers present */
450 switch (syscall_get_error(current, regs)) {
451 case -ERESTARTNOHAND:
452 case -ERESTARTSYS:
453 case -ERESTARTNOINTR:
454 regs->ax = regs->orig_ax;
455 regs->ip -= 2;
456 break;
457
458 case -ERESTART_RESTARTBLOCK:
459 regs->ax = NR_restart_syscall;
460 regs->ip -= 2;
461 break;
462 }
463 }
464
465 /*
466 * If there's no signal to deliver, we just put the saved sigmask
467 * back.
468 */
469 if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
470 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
471 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
472 }
473}
474
475/*
476 * notification of userspace execution resumption
477 * - triggered by the TIF_WORK_MASK flags
478 */
479void
480do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
481{
482#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
483 /* notify userspace of pending MCEs */
484 if (thread_info_flags & _TIF_MCE_NOTIFY)
485 mce_notify_user();
486#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
487
488 /* deal with pending signal delivery */
489 if (thread_info_flags & _TIF_SIGPENDING)
490 do_signal(regs);
491
492 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
493 clear_thread_flag(TIF_NOTIFY_RESUME);
494 tracehook_notify_resume(regs);
495 }
496
497#ifdef CONFIG_X86_32
498 clear_thread_flag(TIF_IRET);
499#endif /* CONFIG_X86_32 */
500}
501
502void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
503{
504 struct task_struct *me = current;
505
506 if (show_unhandled_signals && printk_ratelimit()) {
507 printk(KERN_INFO
508 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
509 me->comm, me->pid, where, frame,
510 regs->ip, regs->sp, regs->orig_ax);
511 print_vma_addr(" in ", regs->ip);
512 printk(KERN_CONT "\n");
513 }
514
515 force_sig(SIGSEGV, me);
516}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 18f9b19f5f8f..beea2649a240 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -118,41 +118,28 @@ static void native_smp_send_reschedule(int cpu)
118 WARN_ON(1); 118 WARN_ON(1);
119 return; 119 return;
120 } 120 }
121 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); 121 send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
122} 122}
123 123
124void native_send_call_func_single_ipi(int cpu) 124void native_send_call_func_single_ipi(int cpu)
125{ 125{
126 send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); 126 send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
127} 127}
128 128
129void native_send_call_func_ipi(cpumask_t mask) 129void native_send_call_func_ipi(const struct cpumask *mask)
130{ 130{
131 cpumask_t allbutself; 131 cpumask_t allbutself;
132 132
133 allbutself = cpu_online_map; 133 allbutself = cpu_online_map;
134 cpu_clear(smp_processor_id(), allbutself); 134 cpu_clear(smp_processor_id(), allbutself);
135 135
136 if (cpus_equal(mask, allbutself) && 136 if (cpus_equal(*mask, allbutself) &&
137 cpus_equal(cpu_online_map, cpu_callout_map)) 137 cpus_equal(cpu_online_map, cpu_callout_map))
138 send_IPI_allbutself(CALL_FUNCTION_VECTOR); 138 send_IPI_allbutself(CALL_FUNCTION_VECTOR);
139 else 139 else
140 send_IPI_mask(mask, CALL_FUNCTION_VECTOR); 140 send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
141} 141}
142 142
143static void stop_this_cpu(void *dummy)
144{
145 local_irq_disable();
146 /*
147 * Remove this CPU:
148 */
149 cpu_clear(smp_processor_id(), cpu_online_map);
150 disable_local_APIC();
151 if (hlt_works(smp_processor_id()))
152 for (;;) halt();
153 for (;;);
154}
155
156/* 143/*
157 * this function calls the 'stop' function on all other CPUs in the system. 144 * this function calls the 'stop' function on all other CPUs in the system.
158 */ 145 */
@@ -178,11 +165,7 @@ static void native_smp_send_stop(void)
178void smp_reschedule_interrupt(struct pt_regs *regs) 165void smp_reschedule_interrupt(struct pt_regs *regs)
179{ 166{
180 ack_APIC_irq(); 167 ack_APIC_irq();
181#ifdef CONFIG_X86_32 168 inc_irq_stat(irq_resched_count);
182 __get_cpu_var(irq_stat).irq_resched_count++;
183#else
184 add_pda(irq_resched_count, 1);
185#endif
186} 169}
187 170
188void smp_call_function_interrupt(struct pt_regs *regs) 171void smp_call_function_interrupt(struct pt_regs *regs)
@@ -190,11 +173,7 @@ void smp_call_function_interrupt(struct pt_regs *regs)
190 ack_APIC_irq(); 173 ack_APIC_irq();
191 irq_enter(); 174 irq_enter();
192 generic_smp_call_function_interrupt(); 175 generic_smp_call_function_interrupt();
193#ifdef CONFIG_X86_32 176 inc_irq_stat(irq_call_count);
194 __get_cpu_var(irq_stat).irq_call_count++;
195#else
196 add_pda(irq_call_count, 1);
197#endif
198 irq_exit(); 177 irq_exit();
199} 178}
200 179
@@ -203,11 +182,7 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
203 ack_APIC_irq(); 182 ack_APIC_irq();
204 irq_enter(); 183 irq_enter();
205 generic_smp_call_function_single_interrupt(); 184 generic_smp_call_function_single_interrupt();
206#ifdef CONFIG_X86_32 185 inc_irq_stat(irq_call_count);
207 __get_cpu_var(irq_stat).irq_call_count++;
208#else
209 add_pda(irq_call_count, 1);
210#endif
211 irq_exit(); 186 irq_exit();
212} 187}
213 188
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7b1093397319..6bd4d9b73870 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,6 +62,7 @@
62#include <asm/mtrr.h> 62#include <asm/mtrr.h>
63#include <asm/vmi.h> 63#include <asm/vmi.h>
64#include <asm/genapic.h> 64#include <asm/genapic.h>
65#include <asm/setup.h>
65#include <linux/mc146818rtc.h> 66#include <linux/mc146818rtc.h>
66 67
67#include <mach_apic.h> 68#include <mach_apic.h>
@@ -101,14 +102,8 @@ EXPORT_SYMBOL(smp_num_siblings);
101/* Last level cache ID of each logical CPU */ 102/* Last level cache ID of each logical CPU */
102DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; 103DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
103 104
104/* bitmap of online cpus */
105cpumask_t cpu_online_map __read_mostly;
106EXPORT_SYMBOL(cpu_online_map);
107
108cpumask_t cpu_callin_map; 105cpumask_t cpu_callin_map;
109cpumask_t cpu_callout_map; 106cpumask_t cpu_callout_map;
110cpumask_t cpu_possible_map;
111EXPORT_SYMBOL(cpu_possible_map);
112 107
113/* representing HT siblings of each logical CPU */ 108/* representing HT siblings of each logical CPU */
114DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); 109DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
@@ -287,16 +282,14 @@ static int __cpuinitdata unsafe_smp;
287/* 282/*
288 * Activate a secondary processor. 283 * Activate a secondary processor.
289 */ 284 */
290static void __cpuinit start_secondary(void *unused) 285notrace static void __cpuinit start_secondary(void *unused)
291{ 286{
292 /* 287 /*
293 * Don't put *anything* before cpu_init(), SMP booting is too 288 * Don't put *anything* before cpu_init(), SMP booting is too
294 * fragile that we want to limit the things done here to the 289 * fragile that we want to limit the things done here to the
295 * most necessary things. 290 * most necessary things.
296 */ 291 */
297#ifdef CONFIG_VMI
298 vmi_bringup(); 292 vmi_bringup();
299#endif
300 cpu_init(); 293 cpu_init();
301 preempt_disable(); 294 preempt_disable();
302 smp_callin(); 295 smp_callin();
@@ -503,7 +496,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
503} 496}
504 497
505/* maps the cpu to the sched domain representing multi-core */ 498/* maps the cpu to the sched domain representing multi-core */
506cpumask_t cpu_coregroup_map(int cpu) 499const struct cpumask *cpu_coregroup_mask(int cpu)
507{ 500{
508 struct cpuinfo_x86 *c = &cpu_data(cpu); 501 struct cpuinfo_x86 *c = &cpu_data(cpu);
509 /* 502 /*
@@ -511,9 +504,14 @@ cpumask_t cpu_coregroup_map(int cpu)
511 * And for power savings, we return cpu_core_map 504 * And for power savings, we return cpu_core_map
512 */ 505 */
513 if (sched_mc_power_savings || sched_smt_power_savings) 506 if (sched_mc_power_savings || sched_smt_power_savings)
514 return per_cpu(cpu_core_map, cpu); 507 return &per_cpu(cpu_core_map, cpu);
515 else 508 else
516 return c->llc_shared_map; 509 return &c->llc_shared_map;
510}
511
512cpumask_t cpu_coregroup_map(int cpu)
513{
514 return *cpu_coregroup_mask(cpu);
517} 515}
518 516
519static void impress_friends(void) 517static void impress_friends(void)
@@ -536,7 +534,7 @@ static void impress_friends(void)
536 pr_debug("Before bogocount - setting activated=1.\n"); 534 pr_debug("Before bogocount - setting activated=1.\n");
537} 535}
538 536
539static inline void __inquire_remote_apic(int apicid) 537void __inquire_remote_apic(int apicid)
540{ 538{
541 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 539 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
542 char *names[] = { "ID", "VERSION", "SPIV" }; 540 char *names[] = { "ID", "VERSION", "SPIV" };
@@ -575,14 +573,13 @@ static inline void __inquire_remote_apic(int apicid)
575 } 573 }
576} 574}
577 575
578#ifdef WAKE_SECONDARY_VIA_NMI
579/* 576/*
580 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal 577 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
581 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this 578 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
582 * won't ... remember to clear down the APIC, etc later. 579 * won't ... remember to clear down the APIC, etc later.
583 */ 580 */
584static int __devinit 581int __devinit
585wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) 582wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
586{ 583{
587 unsigned long send_status, accept_status = 0; 584 unsigned long send_status, accept_status = 0;
588 int maxlvt; 585 int maxlvt;
@@ -599,7 +596,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
599 * Give the other CPU some time to accept the IPI. 596 * Give the other CPU some time to accept the IPI.
600 */ 597 */
601 udelay(200); 598 udelay(200);
602 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 599 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
603 maxlvt = lapic_get_maxlvt(); 600 maxlvt = lapic_get_maxlvt();
604 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 601 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
605 apic_write(APIC_ESR, 0); 602 apic_write(APIC_ESR, 0);
@@ -614,11 +611,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
614 611
615 return (send_status | accept_status); 612 return (send_status | accept_status);
616} 613}
617#endif /* WAKE_SECONDARY_VIA_NMI */
618 614
619#ifdef WAKE_SECONDARY_VIA_INIT 615int __devinit
620static int __devinit 616wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
621wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
622{ 617{
623 unsigned long send_status, accept_status = 0; 618 unsigned long send_status, accept_status = 0;
624 int maxlvt, num_starts, j; 619 int maxlvt, num_starts, j;
@@ -737,7 +732,6 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
737 732
738 return (send_status | accept_status); 733 return (send_status | accept_status);
739} 734}
740#endif /* WAKE_SECONDARY_VIA_INIT */
741 735
742struct create_idle { 736struct create_idle {
743 struct work_struct work; 737 struct work_struct work;
@@ -1086,8 +1080,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
1086#endif 1080#endif
1087 1081
1088 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 1082 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1089 printk(KERN_WARNING "weird, boot CPU (#%d) not listed" 1083 printk(KERN_WARNING
1090 "by the BIOS.\n", hard_smp_processor_id()); 1084 "weird, boot CPU (#%d) not listed by the BIOS.\n",
1085 hard_smp_processor_id());
1086
1091 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 1087 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1092 } 1088 }
1093 1089
@@ -1158,7 +1154,7 @@ static void __init smp_cpu_index_default(void)
1158 for_each_possible_cpu(i) { 1154 for_each_possible_cpu(i) {
1159 c = &cpu_data(i); 1155 c = &cpu_data(i);
1160 /* mark all to hotplug */ 1156 /* mark all to hotplug */
1161 c->cpu_index = NR_CPUS; 1157 c->cpu_index = nr_cpu_ids;
1162 } 1158 }
1163} 1159}
1164 1160
@@ -1263,6 +1259,15 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1263 check_nmi_watchdog(); 1259 check_nmi_watchdog();
1264} 1260}
1265 1261
1262static int __initdata setup_possible_cpus = -1;
1263static int __init _setup_possible_cpus(char *str)
1264{
1265 get_option(&str, &setup_possible_cpus);
1266 return 0;
1267}
1268early_param("possible_cpus", _setup_possible_cpus);
1269
1270
1266/* 1271/*
1267 * cpu_possible_map should be static, it cannot change as cpu's 1272 * cpu_possible_map should be static, it cannot change as cpu's
1268 * are onlined, or offlined. The reason is per-cpu data-structures 1273 * are onlined, or offlined. The reason is per-cpu data-structures
@@ -1275,7 +1280,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1275 * 1280 *
1276 * Three ways to find out the number of additional hotplug CPUs: 1281 * Three ways to find out the number of additional hotplug CPUs:
1277 * - If the BIOS specified disabled CPUs in ACPI/mptables use that. 1282 * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
1278 * - The user can overwrite it with additional_cpus=NUM 1283 * - The user can overwrite it with possible_cpus=NUM
1279 * - Otherwise don't reserve additional CPUs. 1284 * - Otherwise don't reserve additional CPUs.
1280 * We do this because additional CPUs waste a lot of memory. 1285 * We do this because additional CPUs waste a lot of memory.
1281 * -AK 1286 * -AK
@@ -1288,9 +1293,19 @@ __init void prefill_possible_map(void)
1288 if (!num_processors) 1293 if (!num_processors)
1289 num_processors = 1; 1294 num_processors = 1;
1290 1295
1291 possible = num_processors + disabled_cpus; 1296 if (setup_possible_cpus == -1)
1292 if (possible > NR_CPUS) 1297 possible = num_processors + disabled_cpus;
1293 possible = NR_CPUS; 1298 else
1299 possible = setup_possible_cpus;
1300
1301 total_cpus = max_t(int, possible, num_processors + disabled_cpus);
1302
1303 if (possible > CONFIG_NR_CPUS) {
1304 printk(KERN_WARNING
1305 "%d Processors exceeds NR_CPUS limit of %d\n",
1306 possible, CONFIG_NR_CPUS);
1307 possible = CONFIG_NR_CPUS;
1308 }
1294 1309
1295 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1310 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
1296 possible, max_t(int, possible - num_processors, 0)); 1311 possible, max_t(int, possible - num_processors, 0));
@@ -1355,7 +1370,7 @@ void cpu_disable_common(void)
1355 lock_vector_lock(); 1370 lock_vector_lock();
1356 remove_cpu_from_maps(cpu); 1371 remove_cpu_from_maps(cpu);
1357 unlock_vector_lock(); 1372 unlock_vector_lock();
1358 fixup_irqs(cpu_online_map); 1373 fixup_irqs();
1359} 1374}
1360 1375
1361int native_cpu_disable(void) 1376int native_cpu_disable(void)
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index a03e7f6d90c3..10786af95545 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -6,6 +6,7 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/stacktrace.h> 7#include <linux/stacktrace.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/uaccess.h>
9#include <asm/stacktrace.h> 10#include <asm/stacktrace.h>
10 11
11static void save_stack_warning(void *data, char *msg) 12static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
83 trace->entries[trace->nr_entries++] = ULONG_MAX; 84 trace->entries[trace->nr_entries++] = ULONG_MAX;
84} 85}
85EXPORT_SYMBOL_GPL(save_stack_trace_tsk); 86EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
87
88/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
89
90struct stack_frame {
91 const void __user *next_fp;
92 unsigned long ret_addr;
93};
94
95static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
96{
97 int ret;
98
99 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
100 return 0;
101
102 ret = 1;
103 pagefault_disable();
104 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
105 ret = 0;
106 pagefault_enable();
107
108 return ret;
109}
110
111static inline void __save_stack_trace_user(struct stack_trace *trace)
112{
113 const struct pt_regs *regs = task_pt_regs(current);
114 const void __user *fp = (const void __user *)regs->bp;
115
116 if (trace->nr_entries < trace->max_entries)
117 trace->entries[trace->nr_entries++] = regs->ip;
118
119 while (trace->nr_entries < trace->max_entries) {
120 struct stack_frame frame;
121
122 frame.next_fp = NULL;
123 frame.ret_addr = 0;
124 if (!copy_stack_frame(fp, &frame))
125 break;
126 if ((unsigned long)fp < regs->sp)
127 break;
128 if (frame.ret_addr) {
129 trace->entries[trace->nr_entries++] =
130 frame.ret_addr;
131 }
132 if (fp == frame.next_fp)
133 break;
134 fp = frame.next_fp;
135 }
136}
137
138void save_stack_trace_user(struct stack_trace *trace)
139{
140 /*
141 * Trace user stack if we are not a kernel thread
142 */
143 if (current->mm) {
144 __save_stack_trace_user(trace);
145 }
146 if (trace->nr_entries < trace->max_entries)
147 trace->entries[trace->nr_entries++] = ULONG_MAX;
148}
149
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 77b400f06ea2..65309e4cb1c0 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(profile_pc);
75irqreturn_t timer_interrupt(int irq, void *dev_id) 75irqreturn_t timer_interrupt(int irq, void *dev_id)
76{ 76{
77 /* Keep nmi watchdog up to date */ 77 /* Keep nmi watchdog up to date */
78 per_cpu(irq_stat, smp_processor_id()).irq0_irqs++; 78 inc_irq_stat(irq0_irqs);
79 79
80#ifdef CONFIG_X86_IO_APIC 80#ifdef CONFIG_X86_IO_APIC
81 if (timer_ack) { 81 if (timer_ack) {
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index cb19d650c216..891e7a7c4334 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -49,9 +49,9 @@ unsigned long profile_pc(struct pt_regs *regs)
49} 49}
50EXPORT_SYMBOL(profile_pc); 50EXPORT_SYMBOL(profile_pc);
51 51
52irqreturn_t timer_interrupt(int irq, void *dev_id) 52static irqreturn_t timer_interrupt(int irq, void *dev_id)
53{ 53{
54 add_pda(irq0_irqs, 1); 54 inc_irq_stat(irq0_irqs);
55 55
56 global_clock_event->event_handler(global_clock_event); 56 global_clock_event->event_handler(global_clock_event);
57 57
@@ -80,6 +80,8 @@ unsigned long __init calibrate_cpu(void)
80 break; 80 break;
81 no_ctr_free = (i == 4); 81 no_ctr_free = (i == 4);
82 if (no_ctr_free) { 82 if (no_ctr_free) {
83 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
84 "cpu_khz value may be incorrect.\n");
83 i = 3; 85 i = 3;
84 rdmsrl(MSR_K7_EVNTSEL3, evntsel3); 86 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
85 wrmsrl(MSR_K7_EVNTSEL3, 0); 87 wrmsrl(MSR_K7_EVNTSEL3, 0);
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index f4049f3513b6..ce5054642247 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -34,9 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock);
34 */ 34 */
35void leave_mm(int cpu) 35void leave_mm(int cpu)
36{ 36{
37 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) 37 BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
38 BUG(); 38 cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
39 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
40 load_cr3(swapper_pg_dir); 39 load_cr3(swapper_pg_dir);
41} 40}
42EXPORT_SYMBOL_GPL(leave_mm); 41EXPORT_SYMBOL_GPL(leave_mm);
@@ -104,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
104 * BUG(); 103 * BUG();
105 */ 104 */
106 105
107 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { 106 if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
108 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { 107 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
109 if (flush_va == TLB_FLUSH_ALL) 108 if (flush_va == TLB_FLUSH_ALL)
110 local_flush_tlb(); 109 local_flush_tlb();
111 else 110 else
@@ -119,7 +118,7 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
119 smp_mb__after_clear_bit(); 118 smp_mb__after_clear_bit();
120out: 119out:
121 put_cpu_no_resched(); 120 put_cpu_no_resched();
122 __get_cpu_var(irq_stat).irq_tlb_count++; 121 inc_irq_stat(irq_tlb_count);
123} 122}
124 123
125void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, 124void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
@@ -164,7 +163,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
164 * We have to send the IPI only to 163 * We have to send the IPI only to
165 * CPUs affected. 164 * CPUs affected.
166 */ 165 */
167 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); 166 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
168 167
169 while (!cpus_empty(flush_cpumask)) 168 while (!cpus_empty(flush_cpumask))
170 /* nothing. lockup detection does not belong here */ 169 /* nothing. lockup detection does not belong here */
@@ -238,7 +237,7 @@ static void do_flush_tlb_all(void *info)
238 unsigned long cpu = smp_processor_id(); 237 unsigned long cpu = smp_processor_id();
239 238
240 __flush_tlb_all(); 239 __flush_tlb_all();
241 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) 240 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
242 leave_mm(cpu); 241 leave_mm(cpu);
243} 242}
244 243
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index 8f919ca69494..f8be6f1d2e48 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -154,7 +154,7 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
154out: 154out:
155 ack_APIC_irq(); 155 ack_APIC_irq();
156 cpu_clear(cpu, f->flush_cpumask); 156 cpu_clear(cpu, f->flush_cpumask);
157 add_pda(irq_tlb_count, 1); 157 inc_irq_stat(irq_tlb_count);
158} 158}
159 159
160void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, 160void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
@@ -191,7 +191,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
191 * We have to send the IPI only to 191 * We have to send the IPI only to
192 * CPUs affected. 192 * CPUs affected.
193 */ 193 */
194 send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); 194 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
195 195
196 while (!cpus_empty(f->flush_cpumask)) 196 while (!cpus_empty(f->flush_cpumask))
197 cpu_relax(); 197 cpu_relax();
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 04431f34fd16..f885023167e0 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -566,14 +566,10 @@ static int __init uv_ptc_init(void)
566 if (!is_uv_system()) 566 if (!is_uv_system())
567 return 0; 567 return 0;
568 568
569 if (!proc_mkdir("sgi_uv", NULL))
570 return -EINVAL;
571
572 proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); 569 proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
573 if (!proc_uv_ptc) { 570 if (!proc_uv_ptc) {
574 printk(KERN_ERR "unable to create %s proc entry\n", 571 printk(KERN_ERR "unable to create %s proc entry\n",
575 UV_PTC_BASENAME); 572 UV_PTC_BASENAME);
576 remove_proc_entry("sgi_uv", NULL);
577 return -EINVAL; 573 return -EINVAL;
578 } 574 }
579 proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; 575 proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
@@ -586,7 +582,6 @@ static int __init uv_ptc_init(void)
586static struct bau_control * __init uv_table_bases_init(int blade, int node) 582static struct bau_control * __init uv_table_bases_init(int blade, int node)
587{ 583{
588 int i; 584 int i;
589 int *ip;
590 struct bau_msg_status *msp; 585 struct bau_msg_status *msp;
591 struct bau_control *bau_tabp; 586 struct bau_control *bau_tabp;
592 587
@@ -603,13 +598,6 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
603 bau_cpubits_clear(&msp->seen_by, (int) 598 bau_cpubits_clear(&msp->seen_by, (int)
604 uv_blade_nr_possible_cpus(blade)); 599 uv_blade_nr_possible_cpus(blade));
605 600
606 bau_tabp->watching =
607 kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node);
608 BUG_ON(!bau_tabp->watching);
609
610 for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++)
611 *ip = 0;
612
613 uv_bau_table_bases[blade] = bau_tabp; 601 uv_bau_table_bases[blade] = bau_tabp;
614 602
615 return bau_tabp; 603 return bau_tabp;
@@ -632,7 +620,6 @@ uv_table_bases_finish(int blade, int node, int cur_cpu,
632 bcp->bau_msg_head = bau_tablesp->va_queue_first; 620 bcp->bau_msg_head = bau_tablesp->va_queue_first;
633 bcp->va_queue_first = bau_tablesp->va_queue_first; 621 bcp->va_queue_first = bau_tablesp->va_queue_first;
634 bcp->va_queue_last = bau_tablesp->va_queue_last; 622 bcp->va_queue_last = bau_tablesp->va_queue_last;
635 bcp->watching = bau_tablesp->watching;
636 bcp->msg_statuses = bau_tablesp->msg_statuses; 623 bcp->msg_statuses = bau_tablesp->msg_statuses;
637 bcp->descriptor_base = adp; 624 bcp->descriptor_base = adp;
638 } 625 }
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 1106fac6024d..808031a5ba19 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,10 +1,26 @@
1#include <linux/io.h> 1#include <linux/io.h>
2 2
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4#include <asm/e820.h>
4 5
5/* ready for x86_64 and x86 */ 6/* ready for x86_64 and x86 */
6unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); 7unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
7 8
9void __init reserve_trampoline_memory(void)
10{
11#ifdef CONFIG_X86_32
12 /*
13 * But first pinch a few for the stack/trampoline stuff
14 * FIXME: Don't need the extra page at 4K, but need to fix
15 * trampoline before removing it. (see the GDT stuff)
16 */
17 reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
18#endif
19 /* Has to be in very low memory so we can execute real-mode AP code. */
20 reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE,
21 "TRAMPOLINE");
22}
23
8/* 24/*
9 * Currently trivial. Write the real->protected mode 25 * Currently trivial. Write the real->protected mode
10 * bootstrap into the page concerned. The caller 26 * bootstrap into the page concerned. The caller
@@ -12,7 +28,6 @@ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
12 */ 28 */
13unsigned long setup_trampoline(void) 29unsigned long setup_trampoline(void)
14{ 30{
15 memcpy(trampoline_base, trampoline_data, 31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
16 trampoline_end - trampoline_data);
17 return virt_to_phys(trampoline_base); 32 return virt_to_phys(trampoline_base);
18} 33}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 04d242ab0161..ce6650eb64e9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -72,9 +72,6 @@
72 72
73#include "cpu/mcheck/mce.h" 73#include "cpu/mcheck/mce.h"
74 74
75DECLARE_BITMAP(used_vectors, NR_VECTORS);
76EXPORT_SYMBOL_GPL(used_vectors);
77
78asmlinkage int system_call(void); 75asmlinkage int system_call(void);
79 76
80/* Do we ignore FPU interrupts ? */ 77/* Do we ignore FPU interrupts ? */
@@ -89,6 +86,9 @@ gate_desc idt_table[256]
89 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 86 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
90#endif 87#endif
91 88
89DECLARE_BITMAP(used_vectors, NR_VECTORS);
90EXPORT_SYMBOL_GPL(used_vectors);
91
92static int ignore_nmis; 92static int ignore_nmis;
93 93
94static inline void conditional_sti(struct pt_regs *regs) 94static inline void conditional_sti(struct pt_regs *regs)
@@ -292,8 +292,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
292 tsk->thread.error_code = error_code; 292 tsk->thread.error_code = error_code;
293 tsk->thread.trap_no = 8; 293 tsk->thread.trap_no = 8;
294 294
295 /* This is always a kernel trap and never fixable (and thus must 295 /*
296 never return). */ 296 * This is always a kernel trap and never fixable (and thus must
297 * never return).
298 */
297 for (;;) 299 for (;;)
298 die(str, regs, error_code); 300 die(str, regs, error_code);
299} 301}
@@ -481,11 +483,7 @@ do_nmi(struct pt_regs *regs, long error_code)
481{ 483{
482 nmi_enter(); 484 nmi_enter();
483 485
484#ifdef CONFIG_X86_32 486 inc_irq_stat(__nmi_count);
485 { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
486#else
487 add_pda(__nmi_count, 1);
488#endif
489 487
490 if (!ignore_nmis) 488 if (!ignore_nmis)
491 default_do_nmi(regs); 489 default_do_nmi(regs);
@@ -524,9 +522,11 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
524} 522}
525 523
526#ifdef CONFIG_X86_64 524#ifdef CONFIG_X86_64
527/* Help handler running on IST stack to switch back to user stack 525/*
528 for scheduling or signal handling. The actual stack switch is done in 526 * Help handler running on IST stack to switch back to user stack
529 entry.S */ 527 * for scheduling or signal handling. The actual stack switch is done in
528 * entry.S
529 */
530asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) 530asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
531{ 531{
532 struct pt_regs *regs = eregs; 532 struct pt_regs *regs = eregs;
@@ -536,8 +536,10 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
536 /* Exception from user space */ 536 /* Exception from user space */
537 else if (user_mode(eregs)) 537 else if (user_mode(eregs))
538 regs = task_pt_regs(current); 538 regs = task_pt_regs(current);
539 /* Exception from kernel and interrupts are enabled. Move to 539 /*
540 kernel process stack. */ 540 * Exception from kernel and interrupts are enabled. Move to
541 * kernel process stack.
542 */
541 else if (eregs->flags & X86_EFLAGS_IF) 543 else if (eregs->flags & X86_EFLAGS_IF)
542 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); 544 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
543 if (eregs != regs) 545 if (eregs != regs)
@@ -664,7 +666,7 @@ void math_error(void __user *ip)
664{ 666{
665 struct task_struct *task; 667 struct task_struct *task;
666 siginfo_t info; 668 siginfo_t info;
667 unsigned short cwd, swd; 669 unsigned short cwd, swd, err;
668 670
669 /* 671 /*
670 * Save the info for the exception handler and clear the error. 672 * Save the info for the exception handler and clear the error.
@@ -675,7 +677,6 @@ void math_error(void __user *ip)
675 task->thread.error_code = 0; 677 task->thread.error_code = 0;
676 info.si_signo = SIGFPE; 678 info.si_signo = SIGFPE;
677 info.si_errno = 0; 679 info.si_errno = 0;
678 info.si_code = __SI_FAULT;
679 info.si_addr = ip; 680 info.si_addr = ip;
680 /* 681 /*
681 * (~cwd & swd) will mask out exceptions that are not set to unmasked 682 * (~cwd & swd) will mask out exceptions that are not set to unmasked
@@ -689,34 +690,30 @@ void math_error(void __user *ip)
689 */ 690 */
690 cwd = get_fpu_cwd(task); 691 cwd = get_fpu_cwd(task);
691 swd = get_fpu_swd(task); 692 swd = get_fpu_swd(task);
692 switch (swd & ~cwd & 0x3f) { 693
693 case 0x000: /* No unmasked exception */ 694 err = swd & ~cwd;
694#ifdef CONFIG_X86_32 695
695 return; 696 if (err & 0x001) { /* Invalid op */
696#endif
697 default: /* Multiple exceptions */
698 break;
699 case 0x001: /* Invalid Op */
700 /* 697 /*
701 * swd & 0x240 == 0x040: Stack Underflow 698 * swd & 0x240 == 0x040: Stack Underflow
702 * swd & 0x240 == 0x240: Stack Overflow 699 * swd & 0x240 == 0x240: Stack Overflow
703 * User must clear the SF bit (0x40) if set 700 * User must clear the SF bit (0x40) if set
704 */ 701 */
705 info.si_code = FPE_FLTINV; 702 info.si_code = FPE_FLTINV;
706 break; 703 } else if (err & 0x004) { /* Divide by Zero */
707 case 0x002: /* Denormalize */
708 case 0x010: /* Underflow */
709 info.si_code = FPE_FLTUND;
710 break;
711 case 0x004: /* Zero Divide */
712 info.si_code = FPE_FLTDIV; 704 info.si_code = FPE_FLTDIV;
713 break; 705 } else if (err & 0x008) { /* Overflow */
714 case 0x008: /* Overflow */
715 info.si_code = FPE_FLTOVF; 706 info.si_code = FPE_FLTOVF;
716 break; 707 } else if (err & 0x012) { /* Denormal, Underflow */
717 case 0x020: /* Precision */ 708 info.si_code = FPE_FLTUND;
709 } else if (err & 0x020) { /* Precision */
718 info.si_code = FPE_FLTRES; 710 info.si_code = FPE_FLTRES;
719 break; 711 } else {
712 /*
713 * If we're using IRQ 13, or supposedly even some trap 16
714 * implementations, it's possible we get a spurious trap...
715 */
716 return; /* Spurious trap, no error */
720 } 717 }
721 force_sig_info(SIGFPE, &info, task); 718 force_sig_info(SIGFPE, &info, task);
722} 719}
@@ -949,9 +946,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
949 946
950void __init trap_init(void) 947void __init trap_init(void)
951{ 948{
952#ifdef CONFIG_X86_32
953 int i; 949 int i;
954#endif
955 950
956#ifdef CONFIG_EISA 951#ifdef CONFIG_EISA
957 void __iomem *p = early_ioremap(0x0FFFD9, 4); 952 void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1008,11 +1003,15 @@ void __init trap_init(void)
1008 } 1003 }
1009 1004
1010 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 1005 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
1006#endif
1011 1007
1012 /* Reserve all the builtin and the syscall vector: */ 1008 /* Reserve all the builtin and the syscall vector: */
1013 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) 1009 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
1014 set_bit(i, used_vectors); 1010 set_bit(i, used_vectors);
1015 1011
1012#ifdef CONFIG_X86_64
1013 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
1014#else
1016 set_bit(SYSCALL_VECTOR, used_vectors); 1015 set_bit(SYSCALL_VECTOR, used_vectors);
1017#endif 1016#endif
1018 /* 1017 /*
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 424093b157d3..599e58168631 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -15,6 +15,7 @@
15#include <asm/vgtod.h> 15#include <asm/vgtod.h>
16#include <asm/time.h> 16#include <asm/time.h>
17#include <asm/delay.h> 17#include <asm/delay.h>
18#include <asm/hypervisor.h>
18 19
19unsigned int cpu_khz; /* TSC clocks / usec, not used here */ 20unsigned int cpu_khz; /* TSC clocks / usec, not used here */
20EXPORT_SYMBOL(cpu_khz); 21EXPORT_SYMBOL(cpu_khz);
@@ -31,6 +32,7 @@ static int tsc_unstable;
31 erroneous rdtsc usage on !cpu_has_tsc processors */ 32 erroneous rdtsc usage on !cpu_has_tsc processors */
32static int tsc_disabled = -1; 33static int tsc_disabled = -1;
33 34
35static int tsc_clocksource_reliable;
34/* 36/*
35 * Scheduler clock - returns current time in nanosec units. 37 * Scheduler clock - returns current time in nanosec units.
36 */ 38 */
@@ -98,6 +100,15 @@ int __init notsc_setup(char *str)
98 100
99__setup("notsc", notsc_setup); 101__setup("notsc", notsc_setup);
100 102
103static int __init tsc_setup(char *str)
104{
105 if (!strcmp(str, "reliable"))
106 tsc_clocksource_reliable = 1;
107 return 1;
108}
109
110__setup("tsc=", tsc_setup);
111
101#define MAX_RETRIES 5 112#define MAX_RETRIES 5
102#define SMI_TRESHOLD 50000 113#define SMI_TRESHOLD 50000
103 114
@@ -352,9 +363,15 @@ unsigned long native_calibrate_tsc(void)
352{ 363{
353 u64 tsc1, tsc2, delta, ref1, ref2; 364 u64 tsc1, tsc2, delta, ref1, ref2;
354 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 365 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
355 unsigned long flags, latch, ms, fast_calibrate; 366 unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
356 int hpet = is_hpet_enabled(), i, loopmin; 367 int hpet = is_hpet_enabled(), i, loopmin;
357 368
369 tsc_khz = get_hypervisor_tsc_freq();
370 if (tsc_khz) {
371 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
372 return tsc_khz;
373 }
374
358 local_irq_save(flags); 375 local_irq_save(flags);
359 fast_calibrate = quick_pit_calibrate(); 376 fast_calibrate = quick_pit_calibrate();
360 local_irq_restore(flags); 377 local_irq_restore(flags);
@@ -731,24 +748,21 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
731 {} 748 {}
732}; 749};
733 750
734/* 751static void __init check_system_tsc_reliable(void)
735 * Geode_LX - the OLPC CPU has a possibly a very reliable TSC 752{
736 */
737#ifdef CONFIG_MGEODE_LX 753#ifdef CONFIG_MGEODE_LX
738/* RTSC counts during suspend */ 754 /* RTSC counts during suspend */
739#define RTSC_SUSP 0x100 755#define RTSC_SUSP 0x100
740
741static void __init check_geode_tsc_reliable(void)
742{
743 unsigned long res_low, res_high; 756 unsigned long res_low, res_high;
744 757
745 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); 758 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
759 /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */
746 if (res_low & RTSC_SUSP) 760 if (res_low & RTSC_SUSP)
747 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 761 tsc_clocksource_reliable = 1;
748}
749#else
750static inline void check_geode_tsc_reliable(void) { }
751#endif 762#endif
763 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
764 tsc_clocksource_reliable = 1;
765}
752 766
753/* 767/*
754 * Make an educated guess if the TSC is trustworthy and synchronized 768 * Make an educated guess if the TSC is trustworthy and synchronized
@@ -783,6 +797,8 @@ static void __init init_tsc_clocksource(void)
783{ 797{
784 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, 798 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
785 clocksource_tsc.shift); 799 clocksource_tsc.shift);
800 if (tsc_clocksource_reliable)
801 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
786 /* lower the rating if we already know its unstable: */ 802 /* lower the rating if we already know its unstable: */
787 if (check_tsc_unstable()) { 803 if (check_tsc_unstable()) {
788 clocksource_tsc.rating = 0; 804 clocksource_tsc.rating = 0;
@@ -843,7 +859,7 @@ void __init tsc_init(void)
843 if (unsynchronized_tsc()) 859 if (unsynchronized_tsc())
844 mark_tsc_unstable("TSCs unsynchronized"); 860 mark_tsc_unstable("TSCs unsynchronized");
845 861
846 check_geode_tsc_reliable(); 862 check_system_tsc_reliable();
847 init_tsc_clocksource(); 863 init_tsc_clocksource();
848} 864}
849 865
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 1c0dfbca87c1..bf36328f6ef9 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -112,6 +112,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
112 if (unsynchronized_tsc()) 112 if (unsynchronized_tsc())
113 return; 113 return;
114 114
115 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
116 printk(KERN_INFO
117 "Skipping synchronization checks as TSC is reliable.\n");
118 return;
119 }
120
115 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", 121 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
116 smp_processor_id(), cpu); 122 smp_processor_id(), cpu);
117 123
@@ -165,7 +171,7 @@ void __cpuinit check_tsc_sync_target(void)
165{ 171{
166 int cpus = 2; 172 int cpus = 2;
167 173
168 if (unsynchronized_tsc()) 174 if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
169 return; 175 return;
170 176
171 /* 177 /*
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 8b6c393ab9fd..23206ba16874 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -266,109 +266,6 @@ static void vmi_nop(void)
266{ 266{
267} 267}
268 268
269#ifdef CONFIG_DEBUG_PAGE_TYPE
270
271#ifdef CONFIG_X86_PAE
272#define MAX_BOOT_PTS (2048+4+1)
273#else
274#define MAX_BOOT_PTS (1024+1)
275#endif
276
277/*
278 * During boot, mem_map is not yet available in paging_init, so stash
279 * all the boot page allocations here.
280 */
281static struct {
282 u32 pfn;
283 int type;
284} boot_page_allocations[MAX_BOOT_PTS];
285static int num_boot_page_allocations;
286static int boot_allocations_applied;
287
288void vmi_apply_boot_page_allocations(void)
289{
290 int i;
291 BUG_ON(!mem_map);
292 for (i = 0; i < num_boot_page_allocations; i++) {
293 struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
294 page->type = boot_page_allocations[i].type;
295 page->type = boot_page_allocations[i].type &
296 ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
297 }
298 boot_allocations_applied = 1;
299}
300
301static void record_page_type(u32 pfn, int type)
302{
303 BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
304 boot_page_allocations[num_boot_page_allocations].pfn = pfn;
305 boot_page_allocations[num_boot_page_allocations].type = type;
306 num_boot_page_allocations++;
307}
308
309static void check_zeroed_page(u32 pfn, int type, struct page *page)
310{
311 u32 *ptr;
312 int i;
313 int limit = PAGE_SIZE / sizeof(int);
314
315 if (page_address(page))
316 ptr = (u32 *)page_address(page);
317 else
318 ptr = (u32 *)__va(pfn << PAGE_SHIFT);
319 /*
320 * When cloning the root in non-PAE mode, only the userspace
321 * pdes need to be zeroed.
322 */
323 if (type & VMI_PAGE_CLONE)
324 limit = KERNEL_PGD_BOUNDARY;
325 for (i = 0; i < limit; i++)
326 BUG_ON(ptr[i]);
327}
328
329/*
330 * We stash the page type into struct page so we can verify the page
331 * types are used properly.
332 */
333static void vmi_set_page_type(u32 pfn, int type)
334{
335 /* PAE can have multiple roots per page - don't track */
336 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
337 return;
338
339 if (boot_allocations_applied) {
340 struct page *page = pfn_to_page(pfn);
341 if (type != VMI_PAGE_NORMAL)
342 BUG_ON(page->type);
343 else
344 BUG_ON(page->type == VMI_PAGE_NORMAL);
345 page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
346 if (type & VMI_PAGE_ZEROED)
347 check_zeroed_page(pfn, type, page);
348 } else {
349 record_page_type(pfn, type);
350 }
351}
352
353static void vmi_check_page_type(u32 pfn, int type)
354{
355 /* PAE can have multiple roots per page - skip checks */
356 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
357 return;
358
359 type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
360 if (boot_allocations_applied) {
361 struct page *page = pfn_to_page(pfn);
362 BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
363 BUG_ON(type == VMI_PAGE_NORMAL && page->type);
364 BUG_ON((type & page->type) == 0);
365 }
366}
367#else
368#define vmi_set_page_type(p,t) do { } while (0)
369#define vmi_check_page_type(p,t) do { } while (0)
370#endif
371
372#ifdef CONFIG_HIGHPTE 269#ifdef CONFIG_HIGHPTE
373static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) 270static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
374{ 271{
@@ -395,7 +292,6 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
395 292
396static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) 293static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
397{ 294{
398 vmi_set_page_type(pfn, VMI_PAGE_L1);
399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 295 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
400} 296}
401 297
@@ -406,27 +302,22 @@ static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
406 * It is called only for swapper_pg_dir, which already has 302 * It is called only for swapper_pg_dir, which already has
407 * data on it. 303 * data on it.
408 */ 304 */
409 vmi_set_page_type(pfn, VMI_PAGE_L2);
410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 305 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
411} 306}
412 307
413static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) 308static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
414{ 309{
415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
416 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 310 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
418} 311}
419 312
420static void vmi_release_pte(unsigned long pfn) 313static void vmi_release_pte(unsigned long pfn)
421{ 314{
422 vmi_ops.release_page(pfn, VMI_PAGE_L1); 315 vmi_ops.release_page(pfn, VMI_PAGE_L1);
423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
424} 316}
425 317
426static void vmi_release_pmd(unsigned long pfn) 318static void vmi_release_pmd(unsigned long pfn)
427{ 319{
428 vmi_ops.release_page(pfn, VMI_PAGE_L2); 320 vmi_ops.release_page(pfn, VMI_PAGE_L2);
429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
430} 321}
431 322
432/* 323/*
@@ -450,26 +341,22 @@ static void vmi_release_pmd(unsigned long pfn)
450 341
451static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 342static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
452{ 343{
453 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
454 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 344 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
455} 345}
456 346
457static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 347static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
458{ 348{
459 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
460 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); 349 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
461} 350}
462 351
463static void vmi_set_pte(pte_t *ptep, pte_t pte) 352static void vmi_set_pte(pte_t *ptep, pte_t pte)
464{ 353{
465 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ 354 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
466 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
467 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); 355 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
468} 356}
469 357
470static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) 358static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
471{ 359{
472 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
473 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 360 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
474} 361}
475 362
@@ -477,10 +364,8 @@ static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
477{ 364{
478#ifdef CONFIG_X86_PAE 365#ifdef CONFIG_X86_PAE
479 const pte_t pte = { .pte = pmdval.pmd }; 366 const pte_t pte = { .pte = pmdval.pmd };
480 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
481#else 367#else
482 const pte_t pte = { pmdval.pud.pgd.pgd }; 368 const pte_t pte = { pmdval.pud.pgd.pgd };
483 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
484#endif 369#endif
485 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); 370 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
486} 371}
@@ -502,7 +387,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
502 387
503static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) 388static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
504{ 389{
505 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
506 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); 390 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
507} 391}
508 392
@@ -510,21 +394,18 @@ static void vmi_set_pud(pud_t *pudp, pud_t pudval)
510{ 394{
511 /* Um, eww */ 395 /* Um, eww */
512 const pte_t pte = { .pte = pudval.pgd.pgd }; 396 const pte_t pte = { .pte = pudval.pgd.pgd };
513 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
514 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); 397 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
515} 398}
516 399
517static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 400static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
518{ 401{
519 const pte_t pte = { .pte = 0 }; 402 const pte_t pte = { .pte = 0 };
520 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
521 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 403 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
522} 404}
523 405
524static void vmi_pmd_clear(pmd_t *pmd) 406static void vmi_pmd_clear(pmd_t *pmd)
525{ 407{
526 const pte_t pte = { .pte = 0 }; 408 const pte_t pte = { .pte = 0 };
527 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
528 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); 409 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
529} 410}
530#endif 411#endif
@@ -960,8 +841,6 @@ static inline int __init activate_vmi(void)
960 841
961void __init vmi_init(void) 842void __init vmi_init(void)
962{ 843{
963 unsigned long flags;
964
965 if (!vmi_rom) 844 if (!vmi_rom)
966 probe_vmi_rom(); 845 probe_vmi_rom();
967 else 846 else
@@ -973,13 +852,21 @@ void __init vmi_init(void)
973 852
974 reserve_top_address(-vmi_rom->virtual_top); 853 reserve_top_address(-vmi_rom->virtual_top);
975 854
976 local_irq_save(flags);
977 activate_vmi();
978
979#ifdef CONFIG_X86_IO_APIC 855#ifdef CONFIG_X86_IO_APIC
980 /* This is virtual hardware; timer routing is wired correctly */ 856 /* This is virtual hardware; timer routing is wired correctly */
981 no_timer_check = 1; 857 no_timer_check = 1;
982#endif 858#endif
859}
860
861void vmi_activate(void)
862{
863 unsigned long flags;
864
865 if (!vmi_rom)
866 return;
867
868 local_irq_save(flags);
869 activate_vmi();
983 local_irq_restore(flags & X86_EFLAGS_IF); 870 local_irq_restore(flags & X86_EFLAGS_IF);
984} 871}
985 872
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 254ee07f8635..c4c1f9e09402 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
226 /* Upper bound is clockevent's use of ulong for cycle deltas. */ 226 /* Upper bound is clockevent's use of ulong for cycle deltas. */
227 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); 227 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
228 evt->min_delta_ns = clockevent_delta2ns(1, evt); 228 evt->min_delta_ns = clockevent_delta2ns(1, evt);
229 evt->cpumask = cpumask_of_cpu(cpu); 229 evt->cpumask = cpumask_of(cpu);
230 230
231 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", 231 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
232 evt->name, evt->mult, evt->shift); 232 evt->name, evt->mult, evt->shift);
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index a9b8560adbc2..82c67559dde7 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -44,6 +44,7 @@ SECTIONS
44 SCHED_TEXT 44 SCHED_TEXT
45 LOCK_TEXT 45 LOCK_TEXT
46 KPROBES_TEXT 46 KPROBES_TEXT
47 IRQENTRY_TEXT
47 *(.fixup) 48 *(.fixup)
48 *(.gnu.warning) 49 *(.gnu.warning)
49 _etext = .; /* End of text section */ 50 _etext = .; /* End of text section */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 46e05447405b..1a614c0e6bef 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -35,6 +35,7 @@ SECTIONS
35 SCHED_TEXT 35 SCHED_TEXT
36 LOCK_TEXT 36 LOCK_TEXT
37 KPROBES_TEXT 37 KPROBES_TEXT
38 IRQENTRY_TEXT
38 *(.fixup) 39 *(.fixup)
39 *(.gnu.warning) 40 *(.gnu.warning)
40 _etext = .; /* End of text section */ 41 _etext = .; /* End of text section */
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 0b8b6690a86d..44153afc9067 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,9 @@
17 * want per guest time just set the kernel.vsyscall64 sysctl to 0. 17 * want per guest time just set the kernel.vsyscall64 sysctl to 0.
18 */ 18 */
19 19
20/* Disable profiling for userspace code: */
21#define DISABLE_BRANCH_PROFILING
22
20#include <linux/time.h> 23#include <linux/time.h>
21#include <linux/init.h> 24#include <linux/init.h>
22#include <linux/kernel.h> 25#include <linux/kernel.h>
@@ -128,7 +131,16 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
128 gettimeofday(tv,NULL); 131 gettimeofday(tv,NULL);
129 return; 132 return;
130 } 133 }
134
135 /*
136 * Surround the RDTSC by barriers, to make sure it's not
137 * speculated to outside the seqlock critical section and
138 * does not cause time warps:
139 */
140 rdtsc_barrier();
131 now = vread(); 141 now = vread();
142 rdtsc_barrier();
143
132 base = __vsyscall_gtod_data.clock.cycle_last; 144 base = __vsyscall_gtod_data.clock.cycle_last;
133 mask = __vsyscall_gtod_data.clock.mask; 145 mask = __vsyscall_gtod_data.clock.mask;
134 mult = __vsyscall_gtod_data.clock.mult; 146 mult = __vsyscall_gtod_data.clock.mult;
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 15c3e6999182..2b54fe002e94 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -159,7 +159,7 @@ int save_i387_xstate(void __user *buf)
159 * Restore the extended state if present. Otherwise, restore the FP/SSE 159 * Restore the extended state if present. Otherwise, restore the FP/SSE
160 * state. 160 * state.
161 */ 161 */
162int restore_user_xstate(void __user *buf) 162static int restore_user_xstate(void __user *buf)
163{ 163{
164 struct _fpx_sw_bytes fx_sw_user; 164 struct _fpx_sw_bytes fx_sw_user;
165 u64 mask; 165 u64 mask;
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c02343594b4d..d3ec292f00f2 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,8 +7,8 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
7ifeq ($(CONFIG_KVM_TRACE),y) 7ifeq ($(CONFIG_KVM_TRACE),y)
8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) 8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
9endif 9endif
10ifeq ($(CONFIG_DMAR),y) 10ifeq ($(CONFIG_IOMMU_API),y)
11common-objs += $(addprefix ../../../virt/kvm/, vtd.o) 11common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
12endif 12endif
13 13
14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 59ebd37ad79e..e665d1c623ca 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -603,10 +603,29 @@ void kvm_free_pit(struct kvm *kvm)
603 603
604static void __inject_pit_timer_intr(struct kvm *kvm) 604static void __inject_pit_timer_intr(struct kvm *kvm)
605{ 605{
606 struct kvm_vcpu *vcpu;
607 int i;
608
606 mutex_lock(&kvm->lock); 609 mutex_lock(&kvm->lock);
607 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 610 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
608 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 611 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
609 mutex_unlock(&kvm->lock); 612 mutex_unlock(&kvm->lock);
613
614 /*
615 * Provides NMI watchdog support via Virtual Wire mode.
616 * The route is: PIT -> PIC -> LVT0 in NMI mode.
617 *
618 * Note: Our Virtual Wire implementation is simplified, only
619 * propagating PIT interrupts to all VCPUs when they have set
620 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
621 * VCPU0, and only if its LVT0 is in EXTINT mode.
622 */
623 if (kvm->arch.vapics_in_nmi_mode > 0)
624 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
625 vcpu = kvm->vcpus[i];
626 if (vcpu)
627 kvm_apic_nmi_wd_deliver(vcpu);
628 }
610} 629}
611 630
612void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) 631void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 17e41e165f1a..179dcb0103fd 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,10 +26,40 @@
26 * Port from Qemu. 26 * Port from Qemu.
27 */ 27 */
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/bitops.h>
29#include "irq.h" 30#include "irq.h"
30 31
31#include <linux/kvm_host.h> 32#include <linux/kvm_host.h>
32 33
34static void pic_lock(struct kvm_pic *s)
35{
36 spin_lock(&s->lock);
37}
38
39static void pic_unlock(struct kvm_pic *s)
40{
41 struct kvm *kvm = s->kvm;
42 unsigned acks = s->pending_acks;
43 bool wakeup = s->wakeup_needed;
44 struct kvm_vcpu *vcpu;
45
46 s->pending_acks = 0;
47 s->wakeup_needed = false;
48
49 spin_unlock(&s->lock);
50
51 while (acks) {
52 kvm_notify_acked_irq(kvm, __ffs(acks));
53 acks &= acks - 1;
54 }
55
56 if (wakeup) {
57 vcpu = s->kvm->vcpus[0];
58 if (vcpu)
59 kvm_vcpu_kick(vcpu);
60 }
61}
62
33static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 63static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
34{ 64{
35 s->isr &= ~(1 << irq); 65 s->isr &= ~(1 << irq);
@@ -136,17 +166,21 @@ static void pic_update_irq(struct kvm_pic *s)
136 166
137void kvm_pic_update_irq(struct kvm_pic *s) 167void kvm_pic_update_irq(struct kvm_pic *s)
138{ 168{
169 pic_lock(s);
139 pic_update_irq(s); 170 pic_update_irq(s);
171 pic_unlock(s);
140} 172}
141 173
142void kvm_pic_set_irq(void *opaque, int irq, int level) 174void kvm_pic_set_irq(void *opaque, int irq, int level)
143{ 175{
144 struct kvm_pic *s = opaque; 176 struct kvm_pic *s = opaque;
145 177
178 pic_lock(s);
146 if (irq >= 0 && irq < PIC_NUM_PINS) { 179 if (irq >= 0 && irq < PIC_NUM_PINS) {
147 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 180 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
148 pic_update_irq(s); 181 pic_update_irq(s);
149 } 182 }
183 pic_unlock(s);
150} 184}
151 185
152/* 186/*
@@ -172,6 +206,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
172 int irq, irq2, intno; 206 int irq, irq2, intno;
173 struct kvm_pic *s = pic_irqchip(kvm); 207 struct kvm_pic *s = pic_irqchip(kvm);
174 208
209 pic_lock(s);
175 irq = pic_get_irq(&s->pics[0]); 210 irq = pic_get_irq(&s->pics[0]);
176 if (irq >= 0) { 211 if (irq >= 0) {
177 pic_intack(&s->pics[0], irq); 212 pic_intack(&s->pics[0], irq);
@@ -196,6 +231,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
196 intno = s->pics[0].irq_base + irq; 231 intno = s->pics[0].irq_base + irq;
197 } 232 }
198 pic_update_irq(s); 233 pic_update_irq(s);
234 pic_unlock(s);
199 kvm_notify_acked_irq(kvm, irq); 235 kvm_notify_acked_irq(kvm, irq);
200 236
201 return intno; 237 return intno;
@@ -203,7 +239,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
203 239
204void kvm_pic_reset(struct kvm_kpic_state *s) 240void kvm_pic_reset(struct kvm_kpic_state *s)
205{ 241{
206 int irq, irqbase; 242 int irq, irqbase, n;
207 struct kvm *kvm = s->pics_state->irq_request_opaque; 243 struct kvm *kvm = s->pics_state->irq_request_opaque;
208 struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; 244 struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
209 245
@@ -214,8 +250,10 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
214 250
215 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { 251 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
216 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) 252 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
217 if (s->irr & (1 << irq) || s->isr & (1 << irq)) 253 if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
218 kvm_notify_acked_irq(kvm, irq+irqbase); 254 n = irq + irqbase;
255 s->pics_state->pending_acks |= 1 << n;
256 }
219 } 257 }
220 s->last_irr = 0; 258 s->last_irr = 0;
221 s->irr = 0; 259 s->irr = 0;
@@ -406,6 +444,7 @@ static void picdev_write(struct kvm_io_device *this,
406 printk(KERN_ERR "PIC: non byte write\n"); 444 printk(KERN_ERR "PIC: non byte write\n");
407 return; 445 return;
408 } 446 }
447 pic_lock(s);
409 switch (addr) { 448 switch (addr) {
410 case 0x20: 449 case 0x20:
411 case 0x21: 450 case 0x21:
@@ -418,6 +457,7 @@ static void picdev_write(struct kvm_io_device *this,
418 elcr_ioport_write(&s->pics[addr & 1], addr, data); 457 elcr_ioport_write(&s->pics[addr & 1], addr, data);
419 break; 458 break;
420 } 459 }
460 pic_unlock(s);
421} 461}
422 462
423static void picdev_read(struct kvm_io_device *this, 463static void picdev_read(struct kvm_io_device *this,
@@ -431,6 +471,7 @@ static void picdev_read(struct kvm_io_device *this,
431 printk(KERN_ERR "PIC: non byte read\n"); 471 printk(KERN_ERR "PIC: non byte read\n");
432 return; 472 return;
433 } 473 }
474 pic_lock(s);
434 switch (addr) { 475 switch (addr) {
435 case 0x20: 476 case 0x20:
436 case 0x21: 477 case 0x21:
@@ -444,6 +485,7 @@ static void picdev_read(struct kvm_io_device *this,
444 break; 485 break;
445 } 486 }
446 *(unsigned char *)val = data; 487 *(unsigned char *)val = data;
488 pic_unlock(s);
447} 489}
448 490
449/* 491/*
@@ -459,7 +501,7 @@ static void pic_irq_request(void *opaque, int level)
459 s->output = level; 501 s->output = level;
460 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { 502 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
461 s->pics[0].isr_ack &= ~(1 << irq); 503 s->pics[0].isr_ack &= ~(1 << irq);
462 kvm_vcpu_kick(vcpu); 504 s->wakeup_needed = true;
463 } 505 }
464} 506}
465 507
@@ -469,6 +511,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
469 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 511 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
470 if (!s) 512 if (!s)
471 return NULL; 513 return NULL;
514 spin_lock_init(&s->lock);
515 s->kvm = kvm;
472 s->pics[0].elcr_mask = 0xf8; 516 s->pics[0].elcr_mask = 0xf8;
473 s->pics[1].elcr_mask = 0xde; 517 s->pics[1].elcr_mask = 0xde;
474 s->irq_request = pic_irq_request; 518 s->irq_request = pic_irq_request;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index f17c8f5bbf31..2bf32a03ceec 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -25,6 +25,7 @@
25#include <linux/mm_types.h> 25#include <linux/mm_types.h>
26#include <linux/hrtimer.h> 26#include <linux/hrtimer.h>
27#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
28#include <linux/spinlock.h>
28 29
29#include "iodev.h" 30#include "iodev.h"
30#include "ioapic.h" 31#include "ioapic.h"
@@ -59,6 +60,10 @@ struct kvm_kpic_state {
59}; 60};
60 61
61struct kvm_pic { 62struct kvm_pic {
63 spinlock_t lock;
64 bool wakeup_needed;
65 unsigned pending_acks;
66 struct kvm *kvm;
62 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 67 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
63 irq_request_func *irq_request; 68 irq_request_func *irq_request;
64 void *irq_request_opaque; 69 void *irq_request_opaque;
@@ -87,6 +92,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
87void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); 92void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
88void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); 93void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
89void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); 94void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
95void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
90void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); 96void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
91void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); 97void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
92void __kvm_migrate_timers(struct kvm_vcpu *vcpu); 98void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index 65ef0fc2c036..8e5ee99551f6 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -7,7 +7,7 @@
7#include <linux/kvm_host.h> 7#include <linux/kvm_host.h>
8#include <asm/msr.h> 8#include <asm/msr.h>
9 9
10#include "svm.h" 10#include <asm/svm.h>
11 11
12static const u32 host_save_user_msrs[] = { 12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64 13#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0fc3cab48943..afac68c0815c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -130,6 +130,11 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic)
130 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; 130 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
131} 131}
132 132
133static inline int apic_lvt_nmi_mode(u32 lvt_val)
134{
135 return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
136}
137
133static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { 138static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
134 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ 139 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
135 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ 140 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
@@ -354,6 +359,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
354 359
355 case APIC_DM_NMI: 360 case APIC_DM_NMI:
356 kvm_inject_nmi(vcpu); 361 kvm_inject_nmi(vcpu);
362 kvm_vcpu_kick(vcpu);
357 break; 363 break;
358 364
359 case APIC_DM_INIT: 365 case APIC_DM_INIT:
@@ -380,6 +386,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
380 } 386 }
381 break; 387 break;
382 388
389 case APIC_DM_EXTINT:
390 /*
391 * Should only be called by kvm_apic_local_deliver() with LVT0,
392 * before NMI watchdog was enabled. Already handled by
393 * kvm_apic_accept_pic_intr().
394 */
395 break;
396
383 default: 397 default:
384 printk(KERN_ERR "TODO: unsupported delivery mode %x\n", 398 printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
385 delivery_mode); 399 delivery_mode);
@@ -663,6 +677,20 @@ static void start_apic_timer(struct kvm_lapic *apic)
663 apic->timer.period))); 677 apic->timer.period)));
664} 678}
665 679
680static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
681{
682 int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
683
684 if (apic_lvt_nmi_mode(lvt0_val)) {
685 if (!nmi_wd_enabled) {
686 apic_debug("Receive NMI setting on APIC_LVT0 "
687 "for cpu %d\n", apic->vcpu->vcpu_id);
688 apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
689 }
690 } else if (nmi_wd_enabled)
691 apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
692}
693
666static void apic_mmio_write(struct kvm_io_device *this, 694static void apic_mmio_write(struct kvm_io_device *this,
667 gpa_t address, int len, const void *data) 695 gpa_t address, int len, const void *data)
668{ 696{
@@ -743,10 +771,11 @@ static void apic_mmio_write(struct kvm_io_device *this,
743 apic_set_reg(apic, APIC_ICR2, val & 0xff000000); 771 apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
744 break; 772 break;
745 773
774 case APIC_LVT0:
775 apic_manage_nmi_watchdog(apic, val);
746 case APIC_LVTT: 776 case APIC_LVTT:
747 case APIC_LVTTHMR: 777 case APIC_LVTTHMR:
748 case APIC_LVTPC: 778 case APIC_LVTPC:
749 case APIC_LVT0:
750 case APIC_LVT1: 779 case APIC_LVT1:
751 case APIC_LVTERR: 780 case APIC_LVTERR:
752 /* TODO: Check vector */ 781 /* TODO: Check vector */
@@ -961,12 +990,26 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
961 return 0; 990 return 0;
962} 991}
963 992
964static int __inject_apic_timer_irq(struct kvm_lapic *apic) 993static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
994{
995 u32 reg = apic_get_reg(apic, lvt_type);
996 int vector, mode, trig_mode;
997
998 if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
999 vector = reg & APIC_VECTOR_MASK;
1000 mode = reg & APIC_MODE_MASK;
1001 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
1002 return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
1003 }
1004 return 0;
1005}
1006
1007void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
965{ 1008{
966 int vector; 1009 struct kvm_lapic *apic = vcpu->arch.apic;
967 1010
968 vector = apic_lvt_vector(apic, APIC_LVTT); 1011 if (apic)
969 return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); 1012 kvm_apic_local_deliver(apic, APIC_LVT0);
970} 1013}
971 1014
972static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) 1015static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
@@ -1061,9 +1104,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1061{ 1104{
1062 struct kvm_lapic *apic = vcpu->arch.apic; 1105 struct kvm_lapic *apic = vcpu->arch.apic;
1063 1106
1064 if (apic && apic_lvt_enabled(apic, APIC_LVTT) && 1107 if (apic && atomic_read(&apic->timer.pending) > 0) {
1065 atomic_read(&apic->timer.pending) > 0) { 1108 if (kvm_apic_local_deliver(apic, APIC_LVTT))
1066 if (__inject_apic_timer_irq(apic))
1067 atomic_dec(&apic->timer.pending); 1109 atomic_dec(&apic->timer.pending);
1068 } 1110 }
1069} 1111}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 410ddbc1aa2e..83f11c7474a1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -17,7 +17,6 @@
17 * 17 *
18 */ 18 */
19 19
20#include "vmx.h"
21#include "mmu.h" 20#include "mmu.h"
22 21
23#include <linux/kvm_host.h> 22#include <linux/kvm_host.h>
@@ -33,6 +32,7 @@
33#include <asm/page.h> 32#include <asm/page.h>
34#include <asm/cmpxchg.h> 33#include <asm/cmpxchg.h>
35#include <asm/io.h> 34#include <asm/io.h>
35#include <asm/vmx.h>
36 36
37/* 37/*
38 * When setting this variable to true it enables Two-Dimensional-Paging 38 * When setting this variable to true it enables Two-Dimensional-Paging
@@ -168,6 +168,7 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
168static u64 __read_mostly shadow_user_mask; 168static u64 __read_mostly shadow_user_mask;
169static u64 __read_mostly shadow_accessed_mask; 169static u64 __read_mostly shadow_accessed_mask;
170static u64 __read_mostly shadow_dirty_mask; 170static u64 __read_mostly shadow_dirty_mask;
171static u64 __read_mostly shadow_mt_mask;
171 172
172void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) 173void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
173{ 174{
@@ -183,13 +184,14 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
183EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); 184EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
184 185
185void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 186void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
186 u64 dirty_mask, u64 nx_mask, u64 x_mask) 187 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask)
187{ 188{
188 shadow_user_mask = user_mask; 189 shadow_user_mask = user_mask;
189 shadow_accessed_mask = accessed_mask; 190 shadow_accessed_mask = accessed_mask;
190 shadow_dirty_mask = dirty_mask; 191 shadow_dirty_mask = dirty_mask;
191 shadow_nx_mask = nx_mask; 192 shadow_nx_mask = nx_mask;
192 shadow_x_mask = x_mask; 193 shadow_x_mask = x_mask;
194 shadow_mt_mask = mt_mask;
193} 195}
194EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 196EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
195 197
@@ -384,7 +386,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
384{ 386{
385 int *write_count; 387 int *write_count;
386 388
387 write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); 389 gfn = unalias_gfn(kvm, gfn);
390 write_count = slot_largepage_idx(gfn,
391 gfn_to_memslot_unaliased(kvm, gfn));
388 *write_count += 1; 392 *write_count += 1;
389} 393}
390 394
@@ -392,16 +396,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
392{ 396{
393 int *write_count; 397 int *write_count;
394 398
395 write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); 399 gfn = unalias_gfn(kvm, gfn);
400 write_count = slot_largepage_idx(gfn,
401 gfn_to_memslot_unaliased(kvm, gfn));
396 *write_count -= 1; 402 *write_count -= 1;
397 WARN_ON(*write_count < 0); 403 WARN_ON(*write_count < 0);
398} 404}
399 405
400static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) 406static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
401{ 407{
402 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 408 struct kvm_memory_slot *slot;
403 int *largepage_idx; 409 int *largepage_idx;
404 410
411 gfn = unalias_gfn(kvm, gfn);
412 slot = gfn_to_memslot_unaliased(kvm, gfn);
405 if (slot) { 413 if (slot) {
406 largepage_idx = slot_largepage_idx(gfn, slot); 414 largepage_idx = slot_largepage_idx(gfn, slot);
407 return *largepage_idx; 415 return *largepage_idx;
@@ -613,7 +621,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
613 return NULL; 621 return NULL;
614} 622}
615 623
616static void rmap_write_protect(struct kvm *kvm, u64 gfn) 624static int rmap_write_protect(struct kvm *kvm, u64 gfn)
617{ 625{
618 unsigned long *rmapp; 626 unsigned long *rmapp;
619 u64 *spte; 627 u64 *spte;
@@ -659,8 +667,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
659 spte = rmap_next(kvm, rmapp, spte); 667 spte = rmap_next(kvm, rmapp, spte);
660 } 668 }
661 669
662 if (write_protected) 670 return write_protected;
663 kvm_flush_remote_tlbs(kvm);
664} 671}
665 672
666static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) 673static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -786,9 +793,11 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
786 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 793 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
787 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 794 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
788 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 795 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
796 INIT_LIST_HEAD(&sp->oos_link);
789 ASSERT(is_empty_shadow_page(sp->spt)); 797 ASSERT(is_empty_shadow_page(sp->spt));
790 sp->slot_bitmap = 0; 798 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
791 sp->multimapped = 0; 799 sp->multimapped = 0;
800 sp->global = 1;
792 sp->parent_pte = parent_pte; 801 sp->parent_pte = parent_pte;
793 --vcpu->kvm->arch.n_free_mmu_pages; 802 --vcpu->kvm->arch.n_free_mmu_pages;
794 return sp; 803 return sp;
@@ -900,8 +909,9 @@ static void kvm_mmu_update_unsync_bitmap(u64 *spte)
900 struct kvm_mmu_page *sp = page_header(__pa(spte)); 909 struct kvm_mmu_page *sp = page_header(__pa(spte));
901 910
902 index = spte - sp->spt; 911 index = spte - sp->spt;
903 __set_bit(index, sp->unsync_child_bitmap); 912 if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
904 sp->unsync_children = 1; 913 sp->unsync_children++;
914 WARN_ON(!sp->unsync_children);
905} 915}
906 916
907static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) 917static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
@@ -928,7 +938,6 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
928 938
929static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 939static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
930{ 940{
931 sp->unsync_children = 1;
932 kvm_mmu_update_parents_unsync(sp); 941 kvm_mmu_update_parents_unsync(sp);
933 return 1; 942 return 1;
934} 943}
@@ -959,38 +968,66 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
959{ 968{
960} 969}
961 970
971#define KVM_PAGE_ARRAY_NR 16
972
973struct kvm_mmu_pages {
974 struct mmu_page_and_offset {
975 struct kvm_mmu_page *sp;
976 unsigned int idx;
977 } page[KVM_PAGE_ARRAY_NR];
978 unsigned int nr;
979};
980
962#define for_each_unsync_children(bitmap, idx) \ 981#define for_each_unsync_children(bitmap, idx) \
963 for (idx = find_first_bit(bitmap, 512); \ 982 for (idx = find_first_bit(bitmap, 512); \
964 idx < 512; \ 983 idx < 512; \
965 idx = find_next_bit(bitmap, 512, idx+1)) 984 idx = find_next_bit(bitmap, 512, idx+1))
966 985
967static int mmu_unsync_walk(struct kvm_mmu_page *sp, 986int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
968 struct kvm_unsync_walk *walker) 987 int idx)
969{ 988{
970 int i, ret; 989 int i;
971 990
972 if (!sp->unsync_children) 991 if (sp->unsync)
973 return 0; 992 for (i=0; i < pvec->nr; i++)
993 if (pvec->page[i].sp == sp)
994 return 0;
995
996 pvec->page[pvec->nr].sp = sp;
997 pvec->page[pvec->nr].idx = idx;
998 pvec->nr++;
999 return (pvec->nr == KVM_PAGE_ARRAY_NR);
1000}
1001
1002static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1003 struct kvm_mmu_pages *pvec)
1004{
1005 int i, ret, nr_unsync_leaf = 0;
974 1006
975 for_each_unsync_children(sp->unsync_child_bitmap, i) { 1007 for_each_unsync_children(sp->unsync_child_bitmap, i) {
976 u64 ent = sp->spt[i]; 1008 u64 ent = sp->spt[i];
977 1009
978 if (is_shadow_present_pte(ent)) { 1010 if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
979 struct kvm_mmu_page *child; 1011 struct kvm_mmu_page *child;
980 child = page_header(ent & PT64_BASE_ADDR_MASK); 1012 child = page_header(ent & PT64_BASE_ADDR_MASK);
981 1013
982 if (child->unsync_children) { 1014 if (child->unsync_children) {
983 ret = mmu_unsync_walk(child, walker); 1015 if (mmu_pages_add(pvec, child, i))
984 if (ret) 1016 return -ENOSPC;
1017
1018 ret = __mmu_unsync_walk(child, pvec);
1019 if (!ret)
1020 __clear_bit(i, sp->unsync_child_bitmap);
1021 else if (ret > 0)
1022 nr_unsync_leaf += ret;
1023 else
985 return ret; 1024 return ret;
986 __clear_bit(i, sp->unsync_child_bitmap);
987 } 1025 }
988 1026
989 if (child->unsync) { 1027 if (child->unsync) {
990 ret = walker->entry(child, walker); 1028 nr_unsync_leaf++;
991 __clear_bit(i, sp->unsync_child_bitmap); 1029 if (mmu_pages_add(pvec, child, i))
992 if (ret) 1030 return -ENOSPC;
993 return ret;
994 } 1031 }
995 } 1032 }
996 } 1033 }
@@ -998,7 +1035,17 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
998 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) 1035 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
999 sp->unsync_children = 0; 1036 sp->unsync_children = 0;
1000 1037
1001 return 0; 1038 return nr_unsync_leaf;
1039}
1040
1041static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1042 struct kvm_mmu_pages *pvec)
1043{
1044 if (!sp->unsync_children)
1045 return 0;
1046
1047 mmu_pages_add(pvec, sp, 0);
1048 return __mmu_unsync_walk(sp, pvec);
1002} 1049}
1003 1050
1004static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) 1051static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
@@ -1021,10 +1068,18 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1021 return NULL; 1068 return NULL;
1022} 1069}
1023 1070
1071static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
1072{
1073 list_del(&sp->oos_link);
1074 --kvm->stat.mmu_unsync_global;
1075}
1076
1024static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1077static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1025{ 1078{
1026 WARN_ON(!sp->unsync); 1079 WARN_ON(!sp->unsync);
1027 sp->unsync = 0; 1080 sp->unsync = 0;
1081 if (sp->global)
1082 kvm_unlink_unsync_global(kvm, sp);
1028 --kvm->stat.mmu_unsync; 1083 --kvm->stat.mmu_unsync;
1029} 1084}
1030 1085
@@ -1037,7 +1092,8 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1037 return 1; 1092 return 1;
1038 } 1093 }
1039 1094
1040 rmap_write_protect(vcpu->kvm, sp->gfn); 1095 if (rmap_write_protect(vcpu->kvm, sp->gfn))
1096 kvm_flush_remote_tlbs(vcpu->kvm);
1041 kvm_unlink_unsync_page(vcpu->kvm, sp); 1097 kvm_unlink_unsync_page(vcpu->kvm, sp);
1042 if (vcpu->arch.mmu.sync_page(vcpu, sp)) { 1098 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1043 kvm_mmu_zap_page(vcpu->kvm, sp); 1099 kvm_mmu_zap_page(vcpu->kvm, sp);
@@ -1048,30 +1104,89 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1048 return 0; 1104 return 0;
1049} 1105}
1050 1106
1051struct sync_walker { 1107struct mmu_page_path {
1052 struct kvm_vcpu *vcpu; 1108 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1053 struct kvm_unsync_walk walker; 1109 unsigned int idx[PT64_ROOT_LEVEL-1];
1054}; 1110};
1055 1111
1056static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) 1112#define for_each_sp(pvec, sp, parents, i) \
1113 for (i = mmu_pages_next(&pvec, &parents, -1), \
1114 sp = pvec.page[i].sp; \
1115 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
1116 i = mmu_pages_next(&pvec, &parents, i))
1117
1118int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
1119 int i)
1057{ 1120{
1058 struct sync_walker *sync_walk = container_of(walk, struct sync_walker, 1121 int n;
1059 walker);
1060 struct kvm_vcpu *vcpu = sync_walk->vcpu;
1061 1122
1062 kvm_sync_page(vcpu, sp); 1123 for (n = i+1; n < pvec->nr; n++) {
1063 return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); 1124 struct kvm_mmu_page *sp = pvec->page[n].sp;
1125
1126 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1127 parents->idx[0] = pvec->page[n].idx;
1128 return n;
1129 }
1130
1131 parents->parent[sp->role.level-2] = sp;
1132 parents->idx[sp->role.level-1] = pvec->page[n].idx;
1133 }
1134
1135 return n;
1064} 1136}
1065 1137
1066static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1138void mmu_pages_clear_parents(struct mmu_page_path *parents)
1067{ 1139{
1068 struct sync_walker walker = { 1140 struct kvm_mmu_page *sp;
1069 .walker = { .entry = mmu_sync_fn, }, 1141 unsigned int level = 0;
1070 .vcpu = vcpu, 1142
1071 }; 1143 do {
1144 unsigned int idx = parents->idx[level];
1145
1146 sp = parents->parent[level];
1147 if (!sp)
1148 return;
1149
1150 --sp->unsync_children;
1151 WARN_ON((int)sp->unsync_children < 0);
1152 __clear_bit(idx, sp->unsync_child_bitmap);
1153 level++;
1154 } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1155}
1156
1157static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
1158 struct mmu_page_path *parents,
1159 struct kvm_mmu_pages *pvec)
1160{
1161 parents->parent[parent->role.level-1] = NULL;
1162 pvec->nr = 0;
1163}
1164
1165static void mmu_sync_children(struct kvm_vcpu *vcpu,
1166 struct kvm_mmu_page *parent)
1167{
1168 int i;
1169 struct kvm_mmu_page *sp;
1170 struct mmu_page_path parents;
1171 struct kvm_mmu_pages pages;
1172
1173 kvm_mmu_pages_init(parent, &parents, &pages);
1174 while (mmu_unsync_walk(parent, &pages)) {
1175 int protected = 0;
1072 1176
1073 while (mmu_unsync_walk(sp, &walker.walker)) 1177 for_each_sp(pages, sp, parents, i)
1178 protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
1179
1180 if (protected)
1181 kvm_flush_remote_tlbs(vcpu->kvm);
1182
1183 for_each_sp(pages, sp, parents, i) {
1184 kvm_sync_page(vcpu, sp);
1185 mmu_pages_clear_parents(&parents);
1186 }
1074 cond_resched_lock(&vcpu->kvm->mmu_lock); 1187 cond_resched_lock(&vcpu->kvm->mmu_lock);
1188 kvm_mmu_pages_init(parent, &parents, &pages);
1189 }
1075} 1190}
1076 1191
1077static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1192static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
@@ -1129,7 +1244,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1129 sp->role = role; 1244 sp->role = role;
1130 hlist_add_head(&sp->hash_link, bucket); 1245 hlist_add_head(&sp->hash_link, bucket);
1131 if (!metaphysical) { 1246 if (!metaphysical) {
1132 rmap_write_protect(vcpu->kvm, gfn); 1247 if (rmap_write_protect(vcpu->kvm, gfn))
1248 kvm_flush_remote_tlbs(vcpu->kvm);
1133 account_shadowed(vcpu->kvm, gfn); 1249 account_shadowed(vcpu->kvm, gfn);
1134 } 1250 }
1135 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1251 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1153,6 +1269,8 @@ static int walk_shadow(struct kvm_shadow_walk *walker,
1153 if (level == PT32E_ROOT_LEVEL) { 1269 if (level == PT32E_ROOT_LEVEL) {
1154 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 1270 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1155 shadow_addr &= PT64_BASE_ADDR_MASK; 1271 shadow_addr &= PT64_BASE_ADDR_MASK;
1272 if (!shadow_addr)
1273 return 1;
1156 --level; 1274 --level;
1157 } 1275 }
1158 1276
@@ -1237,33 +1355,29 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1237 } 1355 }
1238} 1356}
1239 1357
1240struct zap_walker { 1358static int mmu_zap_unsync_children(struct kvm *kvm,
1241 struct kvm_unsync_walk walker; 1359 struct kvm_mmu_page *parent)
1242 struct kvm *kvm;
1243 int zapped;
1244};
1245
1246static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1247{ 1360{
1248 struct zap_walker *zap_walk = container_of(walk, struct zap_walker, 1361 int i, zapped = 0;
1249 walker); 1362 struct mmu_page_path parents;
1250 kvm_mmu_zap_page(zap_walk->kvm, sp); 1363 struct kvm_mmu_pages pages;
1251 zap_walk->zapped = 1;
1252 return 0;
1253}
1254 1364
1255static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) 1365 if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1256{
1257 struct zap_walker walker = {
1258 .walker = { .entry = mmu_zap_fn, },
1259 .kvm = kvm,
1260 .zapped = 0,
1261 };
1262
1263 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1264 return 0; 1366 return 0;
1265 mmu_unsync_walk(sp, &walker.walker); 1367
1266 return walker.zapped; 1368 kvm_mmu_pages_init(parent, &parents, &pages);
1369 while (mmu_unsync_walk(parent, &pages)) {
1370 struct kvm_mmu_page *sp;
1371
1372 for_each_sp(pages, sp, parents, i) {
1373 kvm_mmu_zap_page(kvm, sp);
1374 mmu_pages_clear_parents(&parents);
1375 }
1376 zapped += pages.nr;
1377 kvm_mmu_pages_init(parent, &parents, &pages);
1378 }
1379
1380 return zapped;
1267} 1381}
1268 1382
1269static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1383static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1362,7 +1476,7 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1362 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); 1476 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
1363 struct kvm_mmu_page *sp = page_header(__pa(pte)); 1477 struct kvm_mmu_page *sp = page_header(__pa(pte));
1364 1478
1365 __set_bit(slot, &sp->slot_bitmap); 1479 __set_bit(slot, sp->slot_bitmap);
1366} 1480}
1367 1481
1368static void mmu_convert_notrap(struct kvm_mmu_page *sp) 1482static void mmu_convert_notrap(struct kvm_mmu_page *sp)
@@ -1393,6 +1507,110 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1393 return page; 1507 return page;
1394} 1508}
1395 1509
1510/*
1511 * The function is based on mtrr_type_lookup() in
1512 * arch/x86/kernel/cpu/mtrr/generic.c
1513 */
1514static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1515 u64 start, u64 end)
1516{
1517 int i;
1518 u64 base, mask;
1519 u8 prev_match, curr_match;
1520 int num_var_ranges = KVM_NR_VAR_MTRR;
1521
1522 if (!mtrr_state->enabled)
1523 return 0xFF;
1524
1525 /* Make end inclusive end, instead of exclusive */
1526 end--;
1527
1528 /* Look in fixed ranges. Just return the type as per start */
1529 if (mtrr_state->have_fixed && (start < 0x100000)) {
1530 int idx;
1531
1532 if (start < 0x80000) {
1533 idx = 0;
1534 idx += (start >> 16);
1535 return mtrr_state->fixed_ranges[idx];
1536 } else if (start < 0xC0000) {
1537 idx = 1 * 8;
1538 idx += ((start - 0x80000) >> 14);
1539 return mtrr_state->fixed_ranges[idx];
1540 } else if (start < 0x1000000) {
1541 idx = 3 * 8;
1542 idx += ((start - 0xC0000) >> 12);
1543 return mtrr_state->fixed_ranges[idx];
1544 }
1545 }
1546
1547 /*
1548 * Look in variable ranges
1549 * Look of multiple ranges matching this address and pick type
1550 * as per MTRR precedence
1551 */
1552 if (!(mtrr_state->enabled & 2))
1553 return mtrr_state->def_type;
1554
1555 prev_match = 0xFF;
1556 for (i = 0; i < num_var_ranges; ++i) {
1557 unsigned short start_state, end_state;
1558
1559 if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
1560 continue;
1561
1562 base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
1563 (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
1564 mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
1565 (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
1566
1567 start_state = ((start & mask) == (base & mask));
1568 end_state = ((end & mask) == (base & mask));
1569 if (start_state != end_state)
1570 return 0xFE;
1571
1572 if ((start & mask) != (base & mask))
1573 continue;
1574
1575 curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
1576 if (prev_match == 0xFF) {
1577 prev_match = curr_match;
1578 continue;
1579 }
1580
1581 if (prev_match == MTRR_TYPE_UNCACHABLE ||
1582 curr_match == MTRR_TYPE_UNCACHABLE)
1583 return MTRR_TYPE_UNCACHABLE;
1584
1585 if ((prev_match == MTRR_TYPE_WRBACK &&
1586 curr_match == MTRR_TYPE_WRTHROUGH) ||
1587 (prev_match == MTRR_TYPE_WRTHROUGH &&
1588 curr_match == MTRR_TYPE_WRBACK)) {
1589 prev_match = MTRR_TYPE_WRTHROUGH;
1590 curr_match = MTRR_TYPE_WRTHROUGH;
1591 }
1592
1593 if (prev_match != curr_match)
1594 return MTRR_TYPE_UNCACHABLE;
1595 }
1596
1597 if (prev_match != 0xFF)
1598 return prev_match;
1599
1600 return mtrr_state->def_type;
1601}
1602
1603static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1604{
1605 u8 mtrr;
1606
1607 mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
1608 (gfn << PAGE_SHIFT) + PAGE_SIZE);
1609 if (mtrr == 0xfe || mtrr == 0xff)
1610 mtrr = MTRR_TYPE_WRBACK;
1611 return mtrr;
1612}
1613
1396static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1614static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1397{ 1615{
1398 unsigned index; 1616 unsigned index;
@@ -1409,9 +1627,15 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1409 if (s->role.word != sp->role.word) 1627 if (s->role.word != sp->role.word)
1410 return 1; 1628 return 1;
1411 } 1629 }
1412 kvm_mmu_mark_parents_unsync(vcpu, sp);
1413 ++vcpu->kvm->stat.mmu_unsync; 1630 ++vcpu->kvm->stat.mmu_unsync;
1414 sp->unsync = 1; 1631 sp->unsync = 1;
1632
1633 if (sp->global) {
1634 list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
1635 ++vcpu->kvm->stat.mmu_unsync_global;
1636 } else
1637 kvm_mmu_mark_parents_unsync(vcpu, sp);
1638
1415 mmu_convert_notrap(sp); 1639 mmu_convert_notrap(sp);
1416 return 0; 1640 return 0;
1417} 1641}
@@ -1437,11 +1661,24 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1437static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1661static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1438 unsigned pte_access, int user_fault, 1662 unsigned pte_access, int user_fault,
1439 int write_fault, int dirty, int largepage, 1663 int write_fault, int dirty, int largepage,
1440 gfn_t gfn, pfn_t pfn, bool speculative, 1664 int global, gfn_t gfn, pfn_t pfn, bool speculative,
1441 bool can_unsync) 1665 bool can_unsync)
1442{ 1666{
1443 u64 spte; 1667 u64 spte;
1444 int ret = 0; 1668 int ret = 0;
1669 u64 mt_mask = shadow_mt_mask;
1670 struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
1671
1672 if (!(vcpu->arch.cr4 & X86_CR4_PGE))
1673 global = 0;
1674 if (!global && sp->global) {
1675 sp->global = 0;
1676 if (sp->unsync) {
1677 kvm_unlink_unsync_global(vcpu->kvm, sp);
1678 kvm_mmu_mark_parents_unsync(vcpu, sp);
1679 }
1680 }
1681
1445 /* 1682 /*
1446 * We don't set the accessed bit, since we sometimes want to see 1683 * We don't set the accessed bit, since we sometimes want to see
1447 * whether the guest actually used the pte (in order to detect 1684 * whether the guest actually used the pte (in order to detect
@@ -1460,6 +1697,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1460 spte |= shadow_user_mask; 1697 spte |= shadow_user_mask;
1461 if (largepage) 1698 if (largepage)
1462 spte |= PT_PAGE_SIZE_MASK; 1699 spte |= PT_PAGE_SIZE_MASK;
1700 if (mt_mask) {
1701 mt_mask = get_memory_type(vcpu, gfn) <<
1702 kvm_x86_ops->get_mt_mask_shift();
1703 spte |= mt_mask;
1704 }
1463 1705
1464 spte |= (u64)pfn << PAGE_SHIFT; 1706 spte |= (u64)pfn << PAGE_SHIFT;
1465 1707
@@ -1474,6 +1716,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1474 1716
1475 spte |= PT_WRITABLE_MASK; 1717 spte |= PT_WRITABLE_MASK;
1476 1718
1719 /*
1720 * Optimization: for pte sync, if spte was writable the hash
1721 * lookup is unnecessary (and expensive). Write protection
1722 * is responsibility of mmu_get_page / kvm_sync_page.
1723 * Same reasoning can be applied to dirty page accounting.
1724 */
1725 if (!can_unsync && is_writeble_pte(*shadow_pte))
1726 goto set_pte;
1727
1477 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 1728 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1478 pgprintk("%s: found shadow page for %lx, marking ro\n", 1729 pgprintk("%s: found shadow page for %lx, marking ro\n",
1479 __func__, gfn); 1730 __func__, gfn);
@@ -1495,8 +1746,8 @@ set_pte:
1495static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1746static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1496 unsigned pt_access, unsigned pte_access, 1747 unsigned pt_access, unsigned pte_access,
1497 int user_fault, int write_fault, int dirty, 1748 int user_fault, int write_fault, int dirty,
1498 int *ptwrite, int largepage, gfn_t gfn, 1749 int *ptwrite, int largepage, int global,
1499 pfn_t pfn, bool speculative) 1750 gfn_t gfn, pfn_t pfn, bool speculative)
1500{ 1751{
1501 int was_rmapped = 0; 1752 int was_rmapped = 0;
1502 int was_writeble = is_writeble_pte(*shadow_pte); 1753 int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1529,7 +1780,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1529 } 1780 }
1530 } 1781 }
1531 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, 1782 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1532 dirty, largepage, gfn, pfn, speculative, true)) { 1783 dirty, largepage, global, gfn, pfn, speculative, true)) {
1533 if (write_fault) 1784 if (write_fault)
1534 *ptwrite = 1; 1785 *ptwrite = 1;
1535 kvm_x86_ops->tlb_flush(vcpu); 1786 kvm_x86_ops->tlb_flush(vcpu);
@@ -1586,7 +1837,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk,
1586 || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { 1837 || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
1587 mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, 1838 mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
1588 0, walk->write, 1, &walk->pt_write, 1839 0, walk->write, 1, &walk->pt_write,
1589 walk->largepage, gfn, walk->pfn, false); 1840 walk->largepage, 0, gfn, walk->pfn, false);
1590 ++vcpu->stat.pf_fixed; 1841 ++vcpu->stat.pf_fixed;
1591 return 1; 1842 return 1;
1592 } 1843 }
@@ -1773,6 +2024,15 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
1773 } 2024 }
1774} 2025}
1775 2026
2027static void mmu_sync_global(struct kvm_vcpu *vcpu)
2028{
2029 struct kvm *kvm = vcpu->kvm;
2030 struct kvm_mmu_page *sp, *n;
2031
2032 list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
2033 kvm_sync_page(vcpu, sp);
2034}
2035
1776void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2036void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
1777{ 2037{
1778 spin_lock(&vcpu->kvm->mmu_lock); 2038 spin_lock(&vcpu->kvm->mmu_lock);
@@ -1780,6 +2040,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
1780 spin_unlock(&vcpu->kvm->mmu_lock); 2040 spin_unlock(&vcpu->kvm->mmu_lock);
1781} 2041}
1782 2042
2043void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
2044{
2045 spin_lock(&vcpu->kvm->mmu_lock);
2046 mmu_sync_global(vcpu);
2047 spin_unlock(&vcpu->kvm->mmu_lock);
2048}
2049
1783static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 2050static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1784{ 2051{
1785 return vaddr; 2052 return vaddr;
@@ -2178,7 +2445,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2178} 2445}
2179 2446
2180void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 2447void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2181 const u8 *new, int bytes) 2448 const u8 *new, int bytes,
2449 bool guest_initiated)
2182{ 2450{
2183 gfn_t gfn = gpa >> PAGE_SHIFT; 2451 gfn_t gfn = gpa >> PAGE_SHIFT;
2184 struct kvm_mmu_page *sp; 2452 struct kvm_mmu_page *sp;
@@ -2204,15 +2472,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2204 kvm_mmu_free_some_pages(vcpu); 2472 kvm_mmu_free_some_pages(vcpu);
2205 ++vcpu->kvm->stat.mmu_pte_write; 2473 ++vcpu->kvm->stat.mmu_pte_write;
2206 kvm_mmu_audit(vcpu, "pre pte write"); 2474 kvm_mmu_audit(vcpu, "pre pte write");
2207 if (gfn == vcpu->arch.last_pt_write_gfn 2475 if (guest_initiated) {
2208 && !last_updated_pte_accessed(vcpu)) { 2476 if (gfn == vcpu->arch.last_pt_write_gfn
2209 ++vcpu->arch.last_pt_write_count; 2477 && !last_updated_pte_accessed(vcpu)) {
2210 if (vcpu->arch.last_pt_write_count >= 3) 2478 ++vcpu->arch.last_pt_write_count;
2211 flooded = 1; 2479 if (vcpu->arch.last_pt_write_count >= 3)
2212 } else { 2480 flooded = 1;
2213 vcpu->arch.last_pt_write_gfn = gfn; 2481 } else {
2214 vcpu->arch.last_pt_write_count = 1; 2482 vcpu->arch.last_pt_write_gfn = gfn;
2215 vcpu->arch.last_pte_updated = NULL; 2483 vcpu->arch.last_pt_write_count = 1;
2484 vcpu->arch.last_pte_updated = NULL;
2485 }
2216 } 2486 }
2217 index = kvm_page_table_hashfn(gfn); 2487 index = kvm_page_table_hashfn(gfn);
2218 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2488 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
@@ -2352,9 +2622,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
2352 2622
2353void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 2623void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2354{ 2624{
2355 spin_lock(&vcpu->kvm->mmu_lock);
2356 vcpu->arch.mmu.invlpg(vcpu, gva); 2625 vcpu->arch.mmu.invlpg(vcpu, gva);
2357 spin_unlock(&vcpu->kvm->mmu_lock);
2358 kvm_mmu_flush_tlb(vcpu); 2626 kvm_mmu_flush_tlb(vcpu);
2359 ++vcpu->stat.invlpg; 2627 ++vcpu->stat.invlpg;
2360} 2628}
@@ -2451,7 +2719,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2451 int i; 2719 int i;
2452 u64 *pt; 2720 u64 *pt;
2453 2721
2454 if (!test_bit(slot, &sp->slot_bitmap)) 2722 if (!test_bit(slot, sp->slot_bitmap))
2455 continue; 2723 continue;
2456 2724
2457 pt = sp->spt; 2725 pt = sp->spt;
@@ -2860,8 +3128,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
2860 if (sp->role.metaphysical) 3128 if (sp->role.metaphysical)
2861 continue; 3129 continue;
2862 3130
2863 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
2864 gfn = unalias_gfn(vcpu->kvm, sp->gfn); 3131 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
3132 slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
2865 rmapp = &slot->rmap[gfn - slot->base_gfn]; 3133 rmapp = &slot->rmap[gfn - slot->base_gfn];
2866 if (*rmapp) 3134 if (*rmapp)
2867 printk(KERN_ERR "%s: (%s) shadow page has writable" 3135 printk(KERN_ERR "%s: (%s) shadow page has writable"
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 84eee43bbe74..9fd78b6e17ad 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -82,6 +82,7 @@ struct shadow_walker {
82 int *ptwrite; 82 int *ptwrite;
83 pfn_t pfn; 83 pfn_t pfn;
84 u64 *sptep; 84 u64 *sptep;
85 gpa_t pte_gpa;
85}; 86};
86 87
87static gfn_t gpte_to_gfn(pt_element_t gpte) 88static gfn_t gpte_to_gfn(pt_element_t gpte)
@@ -222,7 +223,7 @@ walk:
222 if (ret) 223 if (ret)
223 goto walk; 224 goto walk;
224 pte |= PT_DIRTY_MASK; 225 pte |= PT_DIRTY_MASK;
225 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); 226 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
226 walker->ptes[walker->level - 1] = pte; 227 walker->ptes[walker->level - 1] = pte;
227 } 228 }
228 229
@@ -274,7 +275,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
274 return; 275 return;
275 kvm_get_pfn(pfn); 276 kvm_get_pfn(pfn);
276 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 277 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
277 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), 278 gpte & PT_DIRTY_MASK, NULL, largepage,
279 gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
278 pfn, true); 280 pfn, true);
279} 281}
280 282
@@ -301,8 +303,9 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
301 mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, 303 mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
302 sw->user_fault, sw->write_fault, 304 sw->user_fault, sw->write_fault,
303 gw->ptes[gw->level-1] & PT_DIRTY_MASK, 305 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
304 sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, 306 sw->ptwrite, sw->largepage,
305 false); 307 gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
308 gw->gfn, sw->pfn, false);
306 sw->sptep = sptep; 309 sw->sptep = sptep;
307 return 1; 310 return 1;
308 } 311 }
@@ -466,10 +469,22 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
466 struct kvm_vcpu *vcpu, u64 addr, 469 struct kvm_vcpu *vcpu, u64 addr,
467 u64 *sptep, int level) 470 u64 *sptep, int level)
468{ 471{
472 struct shadow_walker *sw =
473 container_of(_sw, struct shadow_walker, walker);
469 474
470 if (level == PT_PAGE_TABLE_LEVEL) { 475 /* FIXME: properly handle invlpg on large guest pages */
471 if (is_shadow_present_pte(*sptep)) 476 if (level == PT_PAGE_TABLE_LEVEL ||
477 ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
478 struct kvm_mmu_page *sp = page_header(__pa(sptep));
479
480 sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
481 sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
482
483 if (is_shadow_present_pte(*sptep)) {
472 rmap_remove(vcpu->kvm, sptep); 484 rmap_remove(vcpu->kvm, sptep);
485 if (is_large_pte(*sptep))
486 --vcpu->kvm->stat.lpages;
487 }
473 set_shadow_pte(sptep, shadow_trap_nonpresent_pte); 488 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
474 return 1; 489 return 1;
475 } 490 }
@@ -480,11 +495,26 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
480 495
481static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 496static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
482{ 497{
498 pt_element_t gpte;
483 struct shadow_walker walker = { 499 struct shadow_walker walker = {
484 .walker = { .entry = FNAME(shadow_invlpg_entry), }, 500 .walker = { .entry = FNAME(shadow_invlpg_entry), },
501 .pte_gpa = -1,
485 }; 502 };
486 503
504 spin_lock(&vcpu->kvm->mmu_lock);
487 walk_shadow(&walker.walker, vcpu, gva); 505 walk_shadow(&walker.walker, vcpu, gva);
506 spin_unlock(&vcpu->kvm->mmu_lock);
507 if (walker.pte_gpa == -1)
508 return;
509 if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte,
510 sizeof(pt_element_t)))
511 return;
512 if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
513 if (mmu_topup_memory_caches(vcpu))
514 return;
515 kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte,
516 sizeof(pt_element_t), 0);
517 }
488} 518}
489 519
490static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 520static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -580,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
580 nr_present++; 610 nr_present++;
581 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 611 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
582 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 612 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
583 is_dirty_pte(gpte), 0, gfn, 613 is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
584 spte_to_pfn(sp->spt[i]), true, false); 614 spte_to_pfn(sp->spt[i]), true, false);
585 } 615 }
586 616
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9c4ce657d963..1452851ae258 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -28,6 +28,8 @@
28 28
29#include <asm/desc.h> 29#include <asm/desc.h>
30 30
31#include <asm/virtext.h>
32
31#define __ex(x) __kvm_handle_fault_on_reboot(x) 33#define __ex(x) __kvm_handle_fault_on_reboot(x)
32 34
33MODULE_AUTHOR("Qumranet"); 35MODULE_AUTHOR("Qumranet");
@@ -245,34 +247,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
245 247
246static int has_svm(void) 248static int has_svm(void)
247{ 249{
248 uint32_t eax, ebx, ecx, edx; 250 const char *msg;
249
250 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
251 printk(KERN_INFO "has_svm: not amd\n");
252 return 0;
253 }
254 251
255 cpuid(0x80000000, &eax, &ebx, &ecx, &edx); 252 if (!cpu_has_svm(&msg)) {
256 if (eax < SVM_CPUID_FUNC) { 253 printk(KERN_INFO "has_svn: %s\n", msg);
257 printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
258 return 0; 254 return 0;
259 } 255 }
260 256
261 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
262 if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
263 printk(KERN_DEBUG "has_svm: svm not available\n");
264 return 0;
265 }
266 return 1; 257 return 1;
267} 258}
268 259
269static void svm_hardware_disable(void *garbage) 260static void svm_hardware_disable(void *garbage)
270{ 261{
271 uint64_t efer; 262 cpu_svm_disable();
272
273 wrmsrl(MSR_VM_HSAVE_PA, 0);
274 rdmsrl(MSR_EFER, efer);
275 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
276} 263}
277 264
278static void svm_hardware_enable(void *garbage) 265static void svm_hardware_enable(void *garbage)
@@ -772,6 +759,22 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
772 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; 759 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
773 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 760 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
774 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; 761 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
762
763 /*
764 * SVM always stores 0 for the 'G' bit in the CS selector in
765 * the VMCB on a VMEXIT. This hurts cross-vendor migration:
766 * Intel's VMENTRY has a check on the 'G' bit.
767 */
768 if (seg == VCPU_SREG_CS)
769 var->g = s->limit > 0xfffff;
770
771 /*
772 * Work around a bug where the busy flag in the tr selector
773 * isn't exposed
774 */
775 if (seg == VCPU_SREG_TR)
776 var->type |= 0x2;
777
775 var->unusable = !var->present; 778 var->unusable = !var->present;
776} 779}
777 780
@@ -1099,6 +1102,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1099 rep = (io_info & SVM_IOIO_REP_MASK) != 0; 1102 rep = (io_info & SVM_IOIO_REP_MASK) != 0;
1100 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; 1103 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
1101 1104
1105 skip_emulated_instruction(&svm->vcpu);
1102 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1106 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1103} 1107}
1104 1108
@@ -1912,6 +1916,11 @@ static int get_npt_level(void)
1912#endif 1916#endif
1913} 1917}
1914 1918
1919static int svm_get_mt_mask_shift(void)
1920{
1921 return 0;
1922}
1923
1915static struct kvm_x86_ops svm_x86_ops = { 1924static struct kvm_x86_ops svm_x86_ops = {
1916 .cpu_has_kvm_support = has_svm, 1925 .cpu_has_kvm_support = has_svm,
1917 .disabled_by_bios = is_disabled, 1926 .disabled_by_bios = is_disabled,
@@ -1967,6 +1976,7 @@ static struct kvm_x86_ops svm_x86_ops = {
1967 1976
1968 .set_tss_addr = svm_set_tss_addr, 1977 .set_tss_addr = svm_set_tss_addr,
1969 .get_tdp_level = get_npt_level, 1978 .get_tdp_level = get_npt_level,
1979 .get_mt_mask_shift = svm_get_mt_mask_shift,
1970}; 1980};
1971 1981
1972static int __init svm_init(void) 1982static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a4018b01e1f9..6259d7467648 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -16,7 +16,6 @@
16 */ 16 */
17 17
18#include "irq.h" 18#include "irq.h"
19#include "vmx.h"
20#include "mmu.h" 19#include "mmu.h"
21 20
22#include <linux/kvm_host.h> 21#include <linux/kvm_host.h>
@@ -31,6 +30,8 @@
31 30
32#include <asm/io.h> 31#include <asm/io.h>
33#include <asm/desc.h> 32#include <asm/desc.h>
33#include <asm/vmx.h>
34#include <asm/virtext.h>
34 35
35#define __ex(x) __kvm_handle_fault_on_reboot(x) 36#define __ex(x) __kvm_handle_fault_on_reboot(x)
36 37
@@ -90,6 +91,11 @@ struct vcpu_vmx {
90 } rmode; 91 } rmode;
91 int vpid; 92 int vpid;
92 bool emulation_required; 93 bool emulation_required;
94
95 /* Support for vnmi-less CPUs */
96 int soft_vnmi_blocked;
97 ktime_t entry_time;
98 s64 vnmi_blocked_time;
93}; 99};
94 100
95static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 101static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -122,7 +128,7 @@ static struct vmcs_config {
122 u32 vmentry_ctrl; 128 u32 vmentry_ctrl;
123} vmcs_config; 129} vmcs_config;
124 130
125struct vmx_capability { 131static struct vmx_capability {
126 u32 ept; 132 u32 ept;
127 u32 vpid; 133 u32 vpid;
128} vmx_capability; 134} vmx_capability;
@@ -957,6 +963,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
957 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); 963 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
958 964
959 break; 965 break;
966 case MSR_IA32_CR_PAT:
967 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
968 vmcs_write64(GUEST_IA32_PAT, data);
969 vcpu->arch.pat = data;
970 break;
971 }
972 /* Otherwise falls through to kvm_set_msr_common */
960 default: 973 default:
961 vmx_load_host_state(vmx); 974 vmx_load_host_state(vmx);
962 msr = find_msr_entry(vmx, msr_index); 975 msr = find_msr_entry(vmx, msr_index);
@@ -1032,8 +1045,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu)
1032 1045
1033static __init int cpu_has_kvm_support(void) 1046static __init int cpu_has_kvm_support(void)
1034{ 1047{
1035 unsigned long ecx = cpuid_ecx(1); 1048 return cpu_has_vmx();
1036 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
1037} 1049}
1038 1050
1039static __init int vmx_disabled_by_bios(void) 1051static __init int vmx_disabled_by_bios(void)
@@ -1079,13 +1091,22 @@ static void vmclear_local_vcpus(void)
1079 __vcpu_clear(vmx); 1091 __vcpu_clear(vmx);
1080} 1092}
1081 1093
1082static void hardware_disable(void *garbage) 1094
1095/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
1096 * tricks.
1097 */
1098static void kvm_cpu_vmxoff(void)
1083{ 1099{
1084 vmclear_local_vcpus();
1085 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 1100 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1086 write_cr4(read_cr4() & ~X86_CR4_VMXE); 1101 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1087} 1102}
1088 1103
1104static void hardware_disable(void *garbage)
1105{
1106 vmclear_local_vcpus();
1107 kvm_cpu_vmxoff();
1108}
1109
1089static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 1110static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1090 u32 msr, u32 *result) 1111 u32 msr, u32 *result)
1091{ 1112{
@@ -1176,12 +1197,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1176#ifdef CONFIG_X86_64 1197#ifdef CONFIG_X86_64
1177 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 1198 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
1178#endif 1199#endif
1179 opt = 0; 1200 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
1180 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, 1201 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
1181 &_vmexit_control) < 0) 1202 &_vmexit_control) < 0)
1182 return -EIO; 1203 return -EIO;
1183 1204
1184 min = opt = 0; 1205 min = 0;
1206 opt = VM_ENTRY_LOAD_IA32_PAT;
1185 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, 1207 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
1186 &_vmentry_control) < 0) 1208 &_vmentry_control) < 0)
1187 return -EIO; 1209 return -EIO;
@@ -2087,8 +2109,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
2087 */ 2109 */
2088static int vmx_vcpu_setup(struct vcpu_vmx *vmx) 2110static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2089{ 2111{
2090 u32 host_sysenter_cs; 2112 u32 host_sysenter_cs, msr_low, msr_high;
2091 u32 junk; 2113 u32 junk;
2114 u64 host_pat;
2092 unsigned long a; 2115 unsigned long a;
2093 struct descriptor_table dt; 2116 struct descriptor_table dt;
2094 int i; 2117 int i;
@@ -2176,6 +2199,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2176 rdmsrl(MSR_IA32_SYSENTER_EIP, a); 2199 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
2177 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ 2200 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
2178 2201
2202 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
2203 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2204 host_pat = msr_low | ((u64) msr_high << 32);
2205 vmcs_write64(HOST_IA32_PAT, host_pat);
2206 }
2207 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2208 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2209 host_pat = msr_low | ((u64) msr_high << 32);
2210 /* Write the default value follow host pat */
2211 vmcs_write64(GUEST_IA32_PAT, host_pat);
2212 /* Keep arch.pat sync with GUEST_IA32_PAT */
2213 vmx->vcpu.arch.pat = host_pat;
2214 }
2215
2179 for (i = 0; i < NR_VMX_MSR; ++i) { 2216 for (i = 0; i < NR_VMX_MSR; ++i) {
2180 u32 index = vmx_msr_index[i]; 2217 u32 index = vmx_msr_index[i];
2181 u32 data_low, data_high; 2218 u32 data_low, data_high;
@@ -2230,6 +2267,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2230 2267
2231 vmx->vcpu.arch.rmode.active = 0; 2268 vmx->vcpu.arch.rmode.active = 0;
2232 2269
2270 vmx->soft_vnmi_blocked = 0;
2271
2233 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 2272 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2234 kvm_set_cr8(&vmx->vcpu, 0); 2273 kvm_set_cr8(&vmx->vcpu, 0);
2235 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 2274 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
@@ -2335,6 +2374,29 @@ out:
2335 return ret; 2374 return ret;
2336} 2375}
2337 2376
2377static void enable_irq_window(struct kvm_vcpu *vcpu)
2378{
2379 u32 cpu_based_vm_exec_control;
2380
2381 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2382 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2383 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2384}
2385
2386static void enable_nmi_window(struct kvm_vcpu *vcpu)
2387{
2388 u32 cpu_based_vm_exec_control;
2389
2390 if (!cpu_has_virtual_nmis()) {
2391 enable_irq_window(vcpu);
2392 return;
2393 }
2394
2395 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2396 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2397 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2398}
2399
2338static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) 2400static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2339{ 2401{
2340 struct vcpu_vmx *vmx = to_vmx(vcpu); 2402 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2358,10 +2420,54 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2358 2420
2359static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 2421static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2360{ 2422{
2423 struct vcpu_vmx *vmx = to_vmx(vcpu);
2424
2425 if (!cpu_has_virtual_nmis()) {
2426 /*
2427 * Tracking the NMI-blocked state in software is built upon
2428 * finding the next open IRQ window. This, in turn, depends on
2429 * well-behaving guests: They have to keep IRQs disabled at
2430 * least as long as the NMI handler runs. Otherwise we may
2431 * cause NMI nesting, maybe breaking the guest. But as this is
2432 * highly unlikely, we can live with the residual risk.
2433 */
2434 vmx->soft_vnmi_blocked = 1;
2435 vmx->vnmi_blocked_time = 0;
2436 }
2437
2438 ++vcpu->stat.nmi_injections;
2439 if (vcpu->arch.rmode.active) {
2440 vmx->rmode.irq.pending = true;
2441 vmx->rmode.irq.vector = NMI_VECTOR;
2442 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2443 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2444 NMI_VECTOR | INTR_TYPE_SOFT_INTR |
2445 INTR_INFO_VALID_MASK);
2446 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2447 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2448 return;
2449 }
2361 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2450 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2362 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2451 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2363} 2452}
2364 2453
2454static void vmx_update_window_states(struct kvm_vcpu *vcpu)
2455{
2456 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2457
2458 vcpu->arch.nmi_window_open =
2459 !(guest_intr & (GUEST_INTR_STATE_STI |
2460 GUEST_INTR_STATE_MOV_SS |
2461 GUEST_INTR_STATE_NMI));
2462 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
2463 vcpu->arch.nmi_window_open = 0;
2464
2465 vcpu->arch.interrupt_window_open =
2466 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2467 !(guest_intr & (GUEST_INTR_STATE_STI |
2468 GUEST_INTR_STATE_MOV_SS)));
2469}
2470
2365static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 2471static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2366{ 2472{
2367 int word_index = __ffs(vcpu->arch.irq_summary); 2473 int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2374,40 +2480,49 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2374 kvm_queue_interrupt(vcpu, irq); 2480 kvm_queue_interrupt(vcpu, irq);
2375} 2481}
2376 2482
2377
2378static void do_interrupt_requests(struct kvm_vcpu *vcpu, 2483static void do_interrupt_requests(struct kvm_vcpu *vcpu,
2379 struct kvm_run *kvm_run) 2484 struct kvm_run *kvm_run)
2380{ 2485{
2381 u32 cpu_based_vm_exec_control; 2486 vmx_update_window_states(vcpu);
2382
2383 vcpu->arch.interrupt_window_open =
2384 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2385 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2386 2487
2387 if (vcpu->arch.interrupt_window_open && 2488 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
2388 vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) 2489 if (vcpu->arch.interrupt.pending) {
2389 kvm_do_inject_irq(vcpu); 2490 enable_nmi_window(vcpu);
2491 } else if (vcpu->arch.nmi_window_open) {
2492 vcpu->arch.nmi_pending = false;
2493 vcpu->arch.nmi_injected = true;
2494 } else {
2495 enable_nmi_window(vcpu);
2496 return;
2497 }
2498 }
2499 if (vcpu->arch.nmi_injected) {
2500 vmx_inject_nmi(vcpu);
2501 if (vcpu->arch.nmi_pending)
2502 enable_nmi_window(vcpu);
2503 else if (vcpu->arch.irq_summary
2504 || kvm_run->request_interrupt_window)
2505 enable_irq_window(vcpu);
2506 return;
2507 }
2390 2508
2391 if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) 2509 if (vcpu->arch.interrupt_window_open) {
2392 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); 2510 if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
2511 kvm_do_inject_irq(vcpu);
2393 2512
2394 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 2513 if (vcpu->arch.interrupt.pending)
2514 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2515 }
2395 if (!vcpu->arch.interrupt_window_open && 2516 if (!vcpu->arch.interrupt_window_open &&
2396 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) 2517 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
2397 /* 2518 enable_irq_window(vcpu);
2398 * Interrupts blocked. Wait for unblock.
2399 */
2400 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2401 else
2402 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2403 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2404} 2519}
2405 2520
2406static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 2521static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2407{ 2522{
2408 int ret; 2523 int ret;
2409 struct kvm_userspace_memory_region tss_mem = { 2524 struct kvm_userspace_memory_region tss_mem = {
2410 .slot = 8, 2525 .slot = TSS_PRIVATE_MEMSLOT,
2411 .guest_phys_addr = addr, 2526 .guest_phys_addr = addr,
2412 .memory_size = PAGE_SIZE * 3, 2527 .memory_size = PAGE_SIZE * 3,
2413 .flags = 0, 2528 .flags = 0,
@@ -2492,7 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2492 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); 2607 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
2493 } 2608 }
2494 2609
2495 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ 2610 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2496 return 1; /* already handled by vmx_vcpu_run() */ 2611 return 1; /* already handled by vmx_vcpu_run() */
2497 2612
2498 if (is_no_device(intr_info)) { 2613 if (is_no_device(intr_info)) {
@@ -2581,6 +2696,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2581 rep = (exit_qualification & 32) != 0; 2696 rep = (exit_qualification & 32) != 0;
2582 port = exit_qualification >> 16; 2697 port = exit_qualification >> 16;
2583 2698
2699 skip_emulated_instruction(vcpu);
2584 return kvm_emulate_pio(vcpu, kvm_run, in, size, port); 2700 return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
2585} 2701}
2586 2702
@@ -2767,6 +2883,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2767 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2883 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2768 2884
2769 KVMTRACE_0D(PEND_INTR, vcpu, handler); 2885 KVMTRACE_0D(PEND_INTR, vcpu, handler);
2886 ++vcpu->stat.irq_window_exits;
2770 2887
2771 /* 2888 /*
2772 * If the user space waits to inject interrupts, exit as soon as 2889 * If the user space waits to inject interrupts, exit as soon as
@@ -2775,7 +2892,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2775 if (kvm_run->request_interrupt_window && 2892 if (kvm_run->request_interrupt_window &&
2776 !vcpu->arch.irq_summary) { 2893 !vcpu->arch.irq_summary) {
2777 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2894 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2778 ++vcpu->stat.irq_window_exits;
2779 return 0; 2895 return 0;
2780 } 2896 }
2781 return 1; 2897 return 1;
@@ -2832,6 +2948,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2832 2948
2833static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2949static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2834{ 2950{
2951 struct vcpu_vmx *vmx = to_vmx(vcpu);
2835 unsigned long exit_qualification; 2952 unsigned long exit_qualification;
2836 u16 tss_selector; 2953 u16 tss_selector;
2837 int reason; 2954 int reason;
@@ -2839,6 +2956,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2839 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 2956 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2840 2957
2841 reason = (u32)exit_qualification >> 30; 2958 reason = (u32)exit_qualification >> 30;
2959 if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected &&
2960 (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
2961 (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK)
2962 == INTR_TYPE_NMI_INTR) {
2963 vcpu->arch.nmi_injected = false;
2964 if (cpu_has_virtual_nmis())
2965 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2966 GUEST_INTR_STATE_NMI);
2967 }
2842 tss_selector = exit_qualification; 2968 tss_selector = exit_qualification;
2843 2969
2844 return kvm_task_switch(vcpu, tss_selector, reason); 2970 return kvm_task_switch(vcpu, tss_selector, reason);
@@ -2927,16 +3053,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
2927 while (!guest_state_valid(vcpu)) { 3053 while (!guest_state_valid(vcpu)) {
2928 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3054 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2929 3055
2930 switch (err) { 3056 if (err == EMULATE_DO_MMIO)
2931 case EMULATE_DONE: 3057 break;
2932 break; 3058
2933 case EMULATE_DO_MMIO: 3059 if (err != EMULATE_DONE) {
2934 kvm_report_emulation_failure(vcpu, "mmio"); 3060 kvm_report_emulation_failure(vcpu, "emulation failure");
2935 /* TODO: Handle MMIO */ 3061 return;
2936 return;
2937 default:
2938 kvm_report_emulation_failure(vcpu, "emulation failure");
2939 return;
2940 } 3062 }
2941 3063
2942 if (signal_pending(current)) 3064 if (signal_pending(current))
@@ -2948,8 +3070,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
2948 local_irq_disable(); 3070 local_irq_disable();
2949 preempt_disable(); 3071 preempt_disable();
2950 3072
2951 /* Guest state should be valid now, no more emulation should be needed */ 3073 /* Guest state should be valid now except if we need to
2952 vmx->emulation_required = 0; 3074 * emulate an MMIO */
3075 if (guest_state_valid(vcpu))
3076 vmx->emulation_required = 0;
2953} 3077}
2954 3078
2955/* 3079/*
@@ -2996,6 +3120,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2996 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), 3120 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
2997 (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); 3121 (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
2998 3122
3123 /* If we need to emulate an MMIO from handle_invalid_guest_state
3124 * we just return 0 */
3125 if (vmx->emulation_required && emulate_invalid_guest_state)
3126 return 0;
3127
2999 /* Access CR3 don't cause VMExit in paging mode, so we need 3128 /* Access CR3 don't cause VMExit in paging mode, so we need
3000 * to sync with guest real CR3. */ 3129 * to sync with guest real CR3. */
3001 if (vm_need_ept() && is_paging(vcpu)) { 3130 if (vm_need_ept() && is_paging(vcpu)) {
@@ -3012,9 +3141,32 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3012 3141
3013 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 3142 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
3014 (exit_reason != EXIT_REASON_EXCEPTION_NMI && 3143 (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
3015 exit_reason != EXIT_REASON_EPT_VIOLATION)) 3144 exit_reason != EXIT_REASON_EPT_VIOLATION &&
3016 printk(KERN_WARNING "%s: unexpected, valid vectoring info and " 3145 exit_reason != EXIT_REASON_TASK_SWITCH))
3017 "exit reason is 0x%x\n", __func__, exit_reason); 3146 printk(KERN_WARNING "%s: unexpected, valid vectoring info "
3147 "(0x%x) and exit reason is 0x%x\n",
3148 __func__, vectoring_info, exit_reason);
3149
3150 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
3151 if (vcpu->arch.interrupt_window_open) {
3152 vmx->soft_vnmi_blocked = 0;
3153 vcpu->arch.nmi_window_open = 1;
3154 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
3155 vcpu->arch.nmi_pending) {
3156 /*
3157 * This CPU don't support us in finding the end of an
3158 * NMI-blocked window if the guest runs with IRQs
3159 * disabled. So we pull the trigger after 1 s of
3160 * futile waiting, but inform the user about this.
3161 */
3162 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
3163 "state on VCPU %d after 1 s timeout\n",
3164 __func__, vcpu->vcpu_id);
3165 vmx->soft_vnmi_blocked = 0;
3166 vmx->vcpu.arch.nmi_window_open = 1;
3167 }
3168 }
3169
3018 if (exit_reason < kvm_vmx_max_exit_handlers 3170 if (exit_reason < kvm_vmx_max_exit_handlers
3019 && kvm_vmx_exit_handlers[exit_reason]) 3171 && kvm_vmx_exit_handlers[exit_reason])
3020 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); 3172 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -3042,51 +3194,6 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu)
3042 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); 3194 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
3043} 3195}
3044 3196
3045static void enable_irq_window(struct kvm_vcpu *vcpu)
3046{
3047 u32 cpu_based_vm_exec_control;
3048
3049 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3050 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
3051 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3052}
3053
3054static void enable_nmi_window(struct kvm_vcpu *vcpu)
3055{
3056 u32 cpu_based_vm_exec_control;
3057
3058 if (!cpu_has_virtual_nmis())
3059 return;
3060
3061 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3062 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
3063 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3064}
3065
3066static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
3067{
3068 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3069 return !(guest_intr & (GUEST_INTR_STATE_NMI |
3070 GUEST_INTR_STATE_MOV_SS |
3071 GUEST_INTR_STATE_STI));
3072}
3073
3074static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
3075{
3076 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3077 return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
3078 GUEST_INTR_STATE_STI)) &&
3079 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
3080}
3081
3082static void enable_intr_window(struct kvm_vcpu *vcpu)
3083{
3084 if (vcpu->arch.nmi_pending)
3085 enable_nmi_window(vcpu);
3086 else if (kvm_cpu_has_interrupt(vcpu))
3087 enable_irq_window(vcpu);
3088}
3089
3090static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 3197static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3091{ 3198{
3092 u32 exit_intr_info; 3199 u32 exit_intr_info;
@@ -3109,7 +3216,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3109 if (unblock_nmi && vector != DF_VECTOR) 3216 if (unblock_nmi && vector != DF_VECTOR)
3110 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3217 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3111 GUEST_INTR_STATE_NMI); 3218 GUEST_INTR_STATE_NMI);
3112 } 3219 } else if (unlikely(vmx->soft_vnmi_blocked))
3220 vmx->vnmi_blocked_time +=
3221 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
3113 3222
3114 idt_vectoring_info = vmx->idt_vectoring_info; 3223 idt_vectoring_info = vmx->idt_vectoring_info;
3115 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3224 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
@@ -3147,26 +3256,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
3147{ 3256{
3148 update_tpr_threshold(vcpu); 3257 update_tpr_threshold(vcpu);
3149 3258
3150 if (cpu_has_virtual_nmis()) { 3259 vmx_update_window_states(vcpu);
3151 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { 3260
3152 if (vcpu->arch.interrupt.pending) { 3261 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
3153 enable_nmi_window(vcpu); 3262 if (vcpu->arch.interrupt.pending) {
3154 } else if (vmx_nmi_enabled(vcpu)) { 3263 enable_nmi_window(vcpu);
3155 vcpu->arch.nmi_pending = false; 3264 } else if (vcpu->arch.nmi_window_open) {
3156 vcpu->arch.nmi_injected = true; 3265 vcpu->arch.nmi_pending = false;
3157 } else { 3266 vcpu->arch.nmi_injected = true;
3158 enable_intr_window(vcpu); 3267 } else {
3159 return; 3268 enable_nmi_window(vcpu);
3160 }
3161 }
3162 if (vcpu->arch.nmi_injected) {
3163 vmx_inject_nmi(vcpu);
3164 enable_intr_window(vcpu);
3165 return; 3269 return;
3166 } 3270 }
3167 } 3271 }
3272 if (vcpu->arch.nmi_injected) {
3273 vmx_inject_nmi(vcpu);
3274 if (vcpu->arch.nmi_pending)
3275 enable_nmi_window(vcpu);
3276 else if (kvm_cpu_has_interrupt(vcpu))
3277 enable_irq_window(vcpu);
3278 return;
3279 }
3168 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { 3280 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
3169 if (vmx_irq_enabled(vcpu)) 3281 if (vcpu->arch.interrupt_window_open)
3170 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); 3282 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
3171 else 3283 else
3172 enable_irq_window(vcpu); 3284 enable_irq_window(vcpu);
@@ -3174,6 +3286,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
3174 if (vcpu->arch.interrupt.pending) { 3286 if (vcpu->arch.interrupt.pending) {
3175 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); 3287 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
3176 kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); 3288 kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
3289 if (kvm_cpu_has_interrupt(vcpu))
3290 enable_irq_window(vcpu);
3177 } 3291 }
3178} 3292}
3179 3293
@@ -3213,6 +3327,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3213 struct vcpu_vmx *vmx = to_vmx(vcpu); 3327 struct vcpu_vmx *vmx = to_vmx(vcpu);
3214 u32 intr_info; 3328 u32 intr_info;
3215 3329
3330 /* Record the guest's net vcpu time for enforced NMI injections. */
3331 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3332 vmx->entry_time = ktime_get();
3333
3216 /* Handle invalid guest state instead of entering VMX */ 3334 /* Handle invalid guest state instead of entering VMX */
3217 if (vmx->emulation_required && emulate_invalid_guest_state) { 3335 if (vmx->emulation_required && emulate_invalid_guest_state) {
3218 handle_invalid_guest_state(vcpu, kvm_run); 3336 handle_invalid_guest_state(vcpu, kvm_run);
@@ -3327,9 +3445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3327 if (vmx->rmode.irq.pending) 3445 if (vmx->rmode.irq.pending)
3328 fixup_rmode_irq(vmx); 3446 fixup_rmode_irq(vmx);
3329 3447
3330 vcpu->arch.interrupt_window_open = 3448 vmx_update_window_states(vcpu);
3331 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3332 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
3333 3449
3334 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 3450 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3335 vmx->launched = 1; 3451 vmx->launched = 1;
@@ -3337,7 +3453,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3337 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3453 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3338 3454
3339 /* We need to handle NMIs before interrupts are enabled */ 3455 /* We need to handle NMIs before interrupts are enabled */
3340 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 && 3456 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3341 (intr_info & INTR_INFO_VALID_MASK)) { 3457 (intr_info & INTR_INFO_VALID_MASK)) {
3342 KVMTRACE_0D(NMI, vcpu, handler); 3458 KVMTRACE_0D(NMI, vcpu, handler);
3343 asm("int $2"); 3459 asm("int $2");
@@ -3455,6 +3571,11 @@ static int get_ept_level(void)
3455 return VMX_EPT_DEFAULT_GAW + 1; 3571 return VMX_EPT_DEFAULT_GAW + 1;
3456} 3572}
3457 3573
3574static int vmx_get_mt_mask_shift(void)
3575{
3576 return VMX_EPT_MT_EPTE_SHIFT;
3577}
3578
3458static struct kvm_x86_ops vmx_x86_ops = { 3579static struct kvm_x86_ops vmx_x86_ops = {
3459 .cpu_has_kvm_support = cpu_has_kvm_support, 3580 .cpu_has_kvm_support = cpu_has_kvm_support,
3460 .disabled_by_bios = vmx_disabled_by_bios, 3581 .disabled_by_bios = vmx_disabled_by_bios,
@@ -3510,6 +3631,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3510 3631
3511 .set_tss_addr = vmx_set_tss_addr, 3632 .set_tss_addr = vmx_set_tss_addr,
3512 .get_tdp_level = get_ept_level, 3633 .get_tdp_level = get_ept_level,
3634 .get_mt_mask_shift = vmx_get_mt_mask_shift,
3513}; 3635};
3514 3636
3515static int __init vmx_init(void) 3637static int __init vmx_init(void)
@@ -3566,10 +3688,10 @@ static int __init vmx_init(void)
3566 bypass_guest_pf = 0; 3688 bypass_guest_pf = 0;
3567 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | 3689 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3568 VMX_EPT_WRITABLE_MASK | 3690 VMX_EPT_WRITABLE_MASK |
3569 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT |
3570 VMX_EPT_IGMT_BIT); 3691 VMX_EPT_IGMT_BIT);
3571 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 3692 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3572 VMX_EPT_EXECUTABLE_MASK); 3693 VMX_EPT_EXECUTABLE_MASK,
3694 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3573 kvm_enable_tdp(); 3695 kvm_enable_tdp();
3574 } else 3696 } else
3575 kvm_disable_tdp(); 3697 kvm_disable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1f8ff2f1fa2..cc17546a2406 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -34,11 +34,13 @@
34#include <linux/module.h> 34#include <linux/module.h>
35#include <linux/mman.h> 35#include <linux/mman.h>
36#include <linux/highmem.h> 36#include <linux/highmem.h>
37#include <linux/iommu.h>
37#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40#include <asm/msr.h> 41#include <asm/msr.h>
41#include <asm/desc.h> 42#include <asm/desc.h>
43#include <asm/mtrr.h>
42 44
43#define MAX_IO_MSRS 256 45#define MAX_IO_MSRS 256
44#define CR0_RESERVED_BITS \ 46#define CR0_RESERVED_BITS \
@@ -86,6 +88,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
86 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 88 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
87 { "hypercalls", VCPU_STAT(hypercalls) }, 89 { "hypercalls", VCPU_STAT(hypercalls) },
88 { "request_irq", VCPU_STAT(request_irq_exits) }, 90 { "request_irq", VCPU_STAT(request_irq_exits) },
91 { "request_nmi", VCPU_STAT(request_nmi_exits) },
89 { "irq_exits", VCPU_STAT(irq_exits) }, 92 { "irq_exits", VCPU_STAT(irq_exits) },
90 { "host_state_reload", VCPU_STAT(host_state_reload) }, 93 { "host_state_reload", VCPU_STAT(host_state_reload) },
91 { "efer_reload", VCPU_STAT(efer_reload) }, 94 { "efer_reload", VCPU_STAT(efer_reload) },
@@ -93,6 +96,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
93 { "insn_emulation", VCPU_STAT(insn_emulation) }, 96 { "insn_emulation", VCPU_STAT(insn_emulation) },
94 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 97 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
95 { "irq_injections", VCPU_STAT(irq_injections) }, 98 { "irq_injections", VCPU_STAT(irq_injections) },
99 { "nmi_injections", VCPU_STAT(nmi_injections) },
96 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 100 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
97 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 101 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
98 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 102 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -101,6 +105,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
101 { "mmu_recycled", VM_STAT(mmu_recycled) }, 105 { "mmu_recycled", VM_STAT(mmu_recycled) },
102 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 106 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
103 { "mmu_unsync", VM_STAT(mmu_unsync) }, 107 { "mmu_unsync", VM_STAT(mmu_unsync) },
108 { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
104 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 109 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
105 { "largepages", VM_STAT(lpages) }, 110 { "largepages", VM_STAT(lpages) },
106 { NULL } 111 { NULL }
@@ -312,6 +317,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
312 kvm_x86_ops->set_cr0(vcpu, cr0); 317 kvm_x86_ops->set_cr0(vcpu, cr0);
313 vcpu->arch.cr0 = cr0; 318 vcpu->arch.cr0 = cr0;
314 319
320 kvm_mmu_sync_global(vcpu);
315 kvm_mmu_reset_context(vcpu); 321 kvm_mmu_reset_context(vcpu);
316 return; 322 return;
317} 323}
@@ -355,6 +361,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
355 } 361 }
356 kvm_x86_ops->set_cr4(vcpu, cr4); 362 kvm_x86_ops->set_cr4(vcpu, cr4);
357 vcpu->arch.cr4 = cr4; 363 vcpu->arch.cr4 = cr4;
364 kvm_mmu_sync_global(vcpu);
358 kvm_mmu_reset_context(vcpu); 365 kvm_mmu_reset_context(vcpu);
359} 366}
360EXPORT_SYMBOL_GPL(kvm_set_cr4); 367EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -449,7 +456,7 @@ static u32 msrs_to_save[] = {
449 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 456 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
450#endif 457#endif
451 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 458 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
452 MSR_IA32_PERF_STATUS, 459 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT
453}; 460};
454 461
455static unsigned num_msrs_to_save; 462static unsigned num_msrs_to_save;
@@ -648,10 +655,38 @@ static bool msr_mtrr_valid(unsigned msr)
648 655
649static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 656static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
650{ 657{
658 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
659
651 if (!msr_mtrr_valid(msr)) 660 if (!msr_mtrr_valid(msr))
652 return 1; 661 return 1;
653 662
654 vcpu->arch.mtrr[msr - 0x200] = data; 663 if (msr == MSR_MTRRdefType) {
664 vcpu->arch.mtrr_state.def_type = data;
665 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
666 } else if (msr == MSR_MTRRfix64K_00000)
667 p[0] = data;
668 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
669 p[1 + msr - MSR_MTRRfix16K_80000] = data;
670 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
671 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
672 else if (msr == MSR_IA32_CR_PAT)
673 vcpu->arch.pat = data;
674 else { /* Variable MTRRs */
675 int idx, is_mtrr_mask;
676 u64 *pt;
677
678 idx = (msr - 0x200) / 2;
679 is_mtrr_mask = msr - 0x200 - 2 * idx;
680 if (!is_mtrr_mask)
681 pt =
682 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
683 else
684 pt =
685 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
686 *pt = data;
687 }
688
689 kvm_mmu_reset_context(vcpu);
655 return 0; 690 return 0;
656} 691}
657 692
@@ -747,10 +782,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
747 782
748static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 783static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
749{ 784{
785 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
786
750 if (!msr_mtrr_valid(msr)) 787 if (!msr_mtrr_valid(msr))
751 return 1; 788 return 1;
752 789
753 *pdata = vcpu->arch.mtrr[msr - 0x200]; 790 if (msr == MSR_MTRRdefType)
791 *pdata = vcpu->arch.mtrr_state.def_type +
792 (vcpu->arch.mtrr_state.enabled << 10);
793 else if (msr == MSR_MTRRfix64K_00000)
794 *pdata = p[0];
795 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
796 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
797 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
798 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
799 else if (msr == MSR_IA32_CR_PAT)
800 *pdata = vcpu->arch.pat;
801 else { /* Variable MTRRs */
802 int idx, is_mtrr_mask;
803 u64 *pt;
804
805 idx = (msr - 0x200) / 2;
806 is_mtrr_mask = msr - 0x200 - 2 * idx;
807 if (!is_mtrr_mask)
808 pt =
809 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
810 else
811 pt =
812 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
813 *pdata = *pt;
814 }
815
754 return 0; 816 return 0;
755} 817}
756 818
@@ -903,7 +965,6 @@ int kvm_dev_ioctl_check_extension(long ext)
903 case KVM_CAP_IRQCHIP: 965 case KVM_CAP_IRQCHIP:
904 case KVM_CAP_HLT: 966 case KVM_CAP_HLT:
905 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 967 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
906 case KVM_CAP_USER_MEMORY:
907 case KVM_CAP_SET_TSS_ADDR: 968 case KVM_CAP_SET_TSS_ADDR:
908 case KVM_CAP_EXT_CPUID: 969 case KVM_CAP_EXT_CPUID:
909 case KVM_CAP_CLOCKSOURCE: 970 case KVM_CAP_CLOCKSOURCE:
@@ -929,7 +990,7 @@ int kvm_dev_ioctl_check_extension(long ext)
929 r = !tdp_enabled; 990 r = !tdp_enabled;
930 break; 991 break;
931 case KVM_CAP_IOMMU: 992 case KVM_CAP_IOMMU:
932 r = intel_iommu_found(); 993 r = iommu_found();
933 break; 994 break;
934 default: 995 default:
935 r = 0; 996 r = 0;
@@ -1188,6 +1249,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1188 int t, times = entry->eax & 0xff; 1249 int t, times = entry->eax & 0xff;
1189 1250
1190 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1251 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1252 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1191 for (t = 1; t < times && *nent < maxnent; ++t) { 1253 for (t = 1; t < times && *nent < maxnent; ++t) {
1192 do_cpuid_1_ent(&entry[t], function, 0); 1254 do_cpuid_1_ent(&entry[t], function, 0);
1193 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1255 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
@@ -1218,7 +1280,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1218 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1280 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1219 /* read more entries until level_type is zero */ 1281 /* read more entries until level_type is zero */
1220 for (i = 1; *nent < maxnent; ++i) { 1282 for (i = 1; *nent < maxnent; ++i) {
1221 level_type = entry[i - 1].ecx & 0xff; 1283 level_type = entry[i - 1].ecx & 0xff00;
1222 if (!level_type) 1284 if (!level_type)
1223 break; 1285 break;
1224 do_cpuid_1_ent(&entry[i], function, i); 1286 do_cpuid_1_ent(&entry[i], function, i);
@@ -1318,6 +1380,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1318 return 0; 1380 return 0;
1319} 1381}
1320 1382
1383static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
1384{
1385 vcpu_load(vcpu);
1386 kvm_inject_nmi(vcpu);
1387 vcpu_put(vcpu);
1388
1389 return 0;
1390}
1391
1321static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1392static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1322 struct kvm_tpr_access_ctl *tac) 1393 struct kvm_tpr_access_ctl *tac)
1323{ 1394{
@@ -1377,6 +1448,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1377 r = 0; 1448 r = 0;
1378 break; 1449 break;
1379 } 1450 }
1451 case KVM_NMI: {
1452 r = kvm_vcpu_ioctl_nmi(vcpu);
1453 if (r)
1454 goto out;
1455 r = 0;
1456 break;
1457 }
1380 case KVM_SET_CPUID: { 1458 case KVM_SET_CPUID: {
1381 struct kvm_cpuid __user *cpuid_arg = argp; 1459 struct kvm_cpuid __user *cpuid_arg = argp;
1382 struct kvm_cpuid cpuid; 1460 struct kvm_cpuid cpuid;
@@ -1968,7 +2046,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1968 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 2046 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1969 if (ret < 0) 2047 if (ret < 0)
1970 return 0; 2048 return 0;
1971 kvm_mmu_pte_write(vcpu, gpa, val, bytes); 2049 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
1972 return 1; 2050 return 1;
1973} 2051}
1974 2052
@@ -2404,8 +2482,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2404 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2482 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2405 memcpy(vcpu->arch.pio_data, &val, 4); 2483 memcpy(vcpu->arch.pio_data, &val, 4);
2406 2484
2407 kvm_x86_ops->skip_emulated_instruction(vcpu);
2408
2409 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); 2485 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2410 if (pio_dev) { 2486 if (pio_dev) {
2411 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2487 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
@@ -2541,7 +2617,7 @@ int kvm_arch_init(void *opaque)
2541 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2617 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2542 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2618 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2543 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2619 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2544 PT_DIRTY_MASK, PT64_NX_MASK, 0); 2620 PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
2545 return 0; 2621 return 0;
2546 2622
2547out: 2623out:
@@ -2729,7 +2805,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2729 2805
2730 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 2806 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2731 /* when no next entry is found, the current entry[i] is reselected */ 2807 /* when no next entry is found, the current entry[i] is reselected */
2732 for (j = i + 1; j == i; j = (j + 1) % nent) { 2808 for (j = i + 1; ; j = (j + 1) % nent) {
2733 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 2809 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2734 if (ej->function == e->function) { 2810 if (ej->function == e->function) {
2735 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 2811 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
@@ -2973,7 +3049,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2973 pr_debug("vcpu %d received sipi with vector # %x\n", 3049 pr_debug("vcpu %d received sipi with vector # %x\n",
2974 vcpu->vcpu_id, vcpu->arch.sipi_vector); 3050 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2975 kvm_lapic_reset(vcpu); 3051 kvm_lapic_reset(vcpu);
2976 r = kvm_x86_ops->vcpu_reset(vcpu); 3052 r = kvm_arch_vcpu_reset(vcpu);
2977 if (r) 3053 if (r)
2978 return r; 3054 return r;
2979 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3055 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -3275,9 +3351,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3275 kvm_desct->padding = 0; 3351 kvm_desct->padding = 0;
3276} 3352}
3277 3353
3278static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, 3354static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
3279 u16 selector, 3355 u16 selector,
3280 struct descriptor_table *dtable) 3356 struct descriptor_table *dtable)
3281{ 3357{
3282 if (selector & 1 << 2) { 3358 if (selector & 1 << 2) {
3283 struct kvm_segment kvm_seg; 3359 struct kvm_segment kvm_seg;
@@ -3302,7 +3378,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3302 struct descriptor_table dtable; 3378 struct descriptor_table dtable;
3303 u16 index = selector >> 3; 3379 u16 index = selector >> 3;
3304 3380
3305 get_segment_descritptor_dtable(vcpu, selector, &dtable); 3381 get_segment_descriptor_dtable(vcpu, selector, &dtable);
3306 3382
3307 if (dtable.limit < index * 8 + 7) { 3383 if (dtable.limit < index * 8 + 7) {
3308 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3384 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
@@ -3321,7 +3397,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3321 struct descriptor_table dtable; 3397 struct descriptor_table dtable;
3322 u16 index = selector >> 3; 3398 u16 index = selector >> 3;
3323 3399
3324 get_segment_descritptor_dtable(vcpu, selector, &dtable); 3400 get_segment_descriptor_dtable(vcpu, selector, &dtable);
3325 3401
3326 if (dtable.limit < index * 8 + 7) 3402 if (dtable.limit < index * 8 + 7)
3327 return 1; 3403 return 1;
@@ -3900,6 +3976,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
3900 /* We do fxsave: this must be aligned. */ 3976 /* We do fxsave: this must be aligned. */
3901 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); 3977 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
3902 3978
3979 vcpu->arch.mtrr_state.have_fixed = 1;
3903 vcpu_load(vcpu); 3980 vcpu_load(vcpu);
3904 r = kvm_arch_vcpu_reset(vcpu); 3981 r = kvm_arch_vcpu_reset(vcpu);
3905 if (r == 0) 3982 if (r == 0)
@@ -3925,6 +4002,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
3925 4002
3926int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 4003int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
3927{ 4004{
4005 vcpu->arch.nmi_pending = false;
4006 vcpu->arch.nmi_injected = false;
4007
3928 return kvm_x86_ops->vcpu_reset(vcpu); 4008 return kvm_x86_ops->vcpu_reset(vcpu);
3929} 4009}
3930 4010
@@ -4012,6 +4092,7 @@ struct kvm *kvm_arch_create_vm(void)
4012 return ERR_PTR(-ENOMEM); 4092 return ERR_PTR(-ENOMEM);
4013 4093
4014 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4094 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4095 INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
4015 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 4096 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4016 4097
4017 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 4098 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4048,8 +4129,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
4048 4129
4049void kvm_arch_destroy_vm(struct kvm *kvm) 4130void kvm_arch_destroy_vm(struct kvm *kvm)
4050{ 4131{
4051 kvm_iommu_unmap_guest(kvm);
4052 kvm_free_all_assigned_devices(kvm); 4132 kvm_free_all_assigned_devices(kvm);
4133 kvm_iommu_unmap_guest(kvm);
4053 kvm_free_pit(kvm); 4134 kvm_free_pit(kvm);
4054 kfree(kvm->arch.vpic); 4135 kfree(kvm->arch.vpic);
4055 kfree(kvm->arch.vioapic); 4136 kfree(kvm->arch.vioapic);
@@ -4127,7 +4208,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
4127int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4208int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4128{ 4209{
4129 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 4210 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4130 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED; 4211 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4212 || vcpu->arch.nmi_pending;
4131} 4213}
4132 4214
4133static void vcpu_kick_intr(void *info) 4215static void vcpu_kick_intr(void *info)
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ea051173b0da..d174db7a3370 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -58,6 +58,7 @@
58#define SrcMem32 (4<<4) /* Memory operand (32-bit). */ 58#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
59#define SrcImm (5<<4) /* Immediate operand. */ 59#define SrcImm (5<<4) /* Immediate operand. */
60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ 60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
61#define SrcOne (7<<4) /* Implied '1' */
61#define SrcMask (7<<4) 62#define SrcMask (7<<4)
62/* Generic ModRM decode. */ 63/* Generic ModRM decode. */
63#define ModRM (1<<7) 64#define ModRM (1<<7)
@@ -70,17 +71,23 @@
70#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 71#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
71#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 72#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
72#define GroupMask 0xff /* Group number stored in bits 0:7 */ 73#define GroupMask 0xff /* Group number stored in bits 0:7 */
74/* Source 2 operand type */
75#define Src2None (0<<29)
76#define Src2CL (1<<29)
77#define Src2ImmByte (2<<29)
78#define Src2One (3<<29)
79#define Src2Mask (7<<29)
73 80
74enum { 81enum {
75 Group1_80, Group1_81, Group1_82, Group1_83, 82 Group1_80, Group1_81, Group1_82, Group1_83,
76 Group1A, Group3_Byte, Group3, Group4, Group5, Group7, 83 Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
77}; 84};
78 85
79static u16 opcode_table[256] = { 86static u32 opcode_table[256] = {
80 /* 0x00 - 0x07 */ 87 /* 0x00 - 0x07 */
81 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 88 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
82 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 89 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
83 0, 0, 0, 0, 90 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
84 /* 0x08 - 0x0F */ 91 /* 0x08 - 0x0F */
85 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 92 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
86 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 93 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +202,7 @@ static u16 opcode_table[256] = {
195 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, 202 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
196}; 203};
197 204
198static u16 twobyte_table[256] = { 205static u32 twobyte_table[256] = {
199 /* 0x00 - 0x0F */ 206 /* 0x00 - 0x0F */
200 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, 207 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
201 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 208 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
@@ -230,9 +237,14 @@ static u16 twobyte_table[256] = {
230 /* 0x90 - 0x9F */ 237 /* 0x90 - 0x9F */
231 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 238 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
232 /* 0xA0 - 0xA7 */ 239 /* 0xA0 - 0xA7 */
233 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 240 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
241 DstMem | SrcReg | Src2ImmByte | ModRM,
242 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
234 /* 0xA8 - 0xAF */ 243 /* 0xA8 - 0xAF */
235 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0, 244 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
245 DstMem | SrcReg | Src2ImmByte | ModRM,
246 DstMem | SrcReg | Src2CL | ModRM,
247 ModRM, 0,
236 /* 0xB0 - 0xB7 */ 248 /* 0xB0 - 0xB7 */
237 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 249 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
238 DstMem | SrcReg | ModRM | BitOp, 250 DstMem | SrcReg | ModRM | BitOp,
@@ -253,7 +265,7 @@ static u16 twobyte_table[256] = {
253 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 265 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
254}; 266};
255 267
256static u16 group_table[] = { 268static u32 group_table[] = {
257 [Group1_80*8] = 269 [Group1_80*8] =
258 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 270 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
259 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 271 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
@@ -297,9 +309,9 @@ static u16 group_table[] = {
297 SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, 309 SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
298}; 310};
299 311
300static u16 group2_table[] = { 312static u32 group2_table[] = {
301 [Group7*8] = 313 [Group7*8] =
302 SrcNone | ModRM, 0, 0, 0, 314 SrcNone | ModRM, 0, 0, SrcNone | ModRM,
303 SrcNone | ModRM | DstMem | Mov, 0, 315 SrcNone | ModRM | DstMem | Mov, 0,
304 SrcMem16 | ModRM | Mov, 0, 316 SrcMem16 | ModRM | Mov, 0,
305}; 317};
@@ -359,49 +371,48 @@ static u16 group2_table[] = {
359 "andl %"_msk",%"_LO32 _tmp"; " \ 371 "andl %"_msk",%"_LO32 _tmp"; " \
360 "orl %"_LO32 _tmp",%"_sav"; " 372 "orl %"_LO32 _tmp",%"_sav"; "
361 373
374#ifdef CONFIG_X86_64
375#define ON64(x) x
376#else
377#define ON64(x)
378#endif
379
380#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \
381 do { \
382 __asm__ __volatile__ ( \
383 _PRE_EFLAGS("0", "4", "2") \
384 _op _suffix " %"_x"3,%1; " \
385 _POST_EFLAGS("0", "4", "2") \
386 : "=m" (_eflags), "=m" ((_dst).val), \
387 "=&r" (_tmp) \
388 : _y ((_src).val), "i" (EFLAGS_MASK)); \
389 } while (0)
390
391
362/* Raw emulation: instruction has two explicit operands. */ 392/* Raw emulation: instruction has two explicit operands. */
363#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ 393#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
364 do { \ 394 do { \
365 unsigned long _tmp; \ 395 unsigned long _tmp; \
366 \ 396 \
367 switch ((_dst).bytes) { \ 397 switch ((_dst).bytes) { \
368 case 2: \ 398 case 2: \
369 __asm__ __volatile__ ( \ 399 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
370 _PRE_EFLAGS("0", "4", "2") \ 400 break; \
371 _op"w %"_wx"3,%1; " \ 401 case 4: \
372 _POST_EFLAGS("0", "4", "2") \ 402 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
373 : "=m" (_eflags), "=m" ((_dst).val), \ 403 break; \
374 "=&r" (_tmp) \ 404 case 8: \
375 : _wy ((_src).val), "i" (EFLAGS_MASK)); \ 405 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
376 break; \ 406 break; \
377 case 4: \ 407 } \
378 __asm__ __volatile__ ( \
379 _PRE_EFLAGS("0", "4", "2") \
380 _op"l %"_lx"3,%1; " \
381 _POST_EFLAGS("0", "4", "2") \
382 : "=m" (_eflags), "=m" ((_dst).val), \
383 "=&r" (_tmp) \
384 : _ly ((_src).val), "i" (EFLAGS_MASK)); \
385 break; \
386 case 8: \
387 __emulate_2op_8byte(_op, _src, _dst, \
388 _eflags, _qx, _qy); \
389 break; \
390 } \
391 } while (0) 408 } while (0)
392 409
393#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ 410#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
394 do { \ 411 do { \
395 unsigned long __tmp; \ 412 unsigned long _tmp; \
396 switch ((_dst).bytes) { \ 413 switch ((_dst).bytes) { \
397 case 1: \ 414 case 1: \
398 __asm__ __volatile__ ( \ 415 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \
399 _PRE_EFLAGS("0", "4", "2") \
400 _op"b %"_bx"3,%1; " \
401 _POST_EFLAGS("0", "4", "2") \
402 : "=m" (_eflags), "=m" ((_dst).val), \
403 "=&r" (__tmp) \
404 : _by ((_src).val), "i" (EFLAGS_MASK)); \
405 break; \ 416 break; \
406 default: \ 417 default: \
407 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 418 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
@@ -425,71 +436,68 @@ static u16 group2_table[] = {
425 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 436 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
426 "w", "r", _LO32, "r", "", "r") 437 "w", "r", _LO32, "r", "", "r")
427 438
428/* Instruction has only one explicit operand (no source operand). */ 439/* Instruction has three operands and one operand is stored in ECX register */
429#define emulate_1op(_op, _dst, _eflags) \ 440#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
430 do { \ 441 do { \
431 unsigned long _tmp; \ 442 unsigned long _tmp; \
432 \ 443 _type _clv = (_cl).val; \
433 switch ((_dst).bytes) { \ 444 _type _srcv = (_src).val; \
434 case 1: \ 445 _type _dstv = (_dst).val; \
435 __asm__ __volatile__ ( \ 446 \
436 _PRE_EFLAGS("0", "3", "2") \ 447 __asm__ __volatile__ ( \
437 _op"b %1; " \ 448 _PRE_EFLAGS("0", "5", "2") \
438 _POST_EFLAGS("0", "3", "2") \ 449 _op _suffix " %4,%1 \n" \
439 : "=m" (_eflags), "=m" ((_dst).val), \ 450 _POST_EFLAGS("0", "5", "2") \
440 "=&r" (_tmp) \ 451 : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
441 : "i" (EFLAGS_MASK)); \ 452 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
442 break; \ 453 ); \
443 case 2: \ 454 \
444 __asm__ __volatile__ ( \ 455 (_cl).val = (unsigned long) _clv; \
445 _PRE_EFLAGS("0", "3", "2") \ 456 (_src).val = (unsigned long) _srcv; \
446 _op"w %1; " \ 457 (_dst).val = (unsigned long) _dstv; \
447 _POST_EFLAGS("0", "3", "2") \
448 : "=m" (_eflags), "=m" ((_dst).val), \
449 "=&r" (_tmp) \
450 : "i" (EFLAGS_MASK)); \
451 break; \
452 case 4: \
453 __asm__ __volatile__ ( \
454 _PRE_EFLAGS("0", "3", "2") \
455 _op"l %1; " \
456 _POST_EFLAGS("0", "3", "2") \
457 : "=m" (_eflags), "=m" ((_dst).val), \
458 "=&r" (_tmp) \
459 : "i" (EFLAGS_MASK)); \
460 break; \
461 case 8: \
462 __emulate_1op_8byte(_op, _dst, _eflags); \
463 break; \
464 } \
465 } while (0) 458 } while (0)
466 459
467/* Emulate an instruction with quadword operands (x86/64 only). */ 460#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
468#if defined(CONFIG_X86_64) 461 do { \
469#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ 462 switch ((_dst).bytes) { \
470 do { \ 463 case 2: \
471 __asm__ __volatile__ ( \ 464 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
472 _PRE_EFLAGS("0", "4", "2") \ 465 "w", unsigned short); \
473 _op"q %"_qx"3,%1; " \ 466 break; \
474 _POST_EFLAGS("0", "4", "2") \ 467 case 4: \
475 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ 468 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
476 : _qy ((_src).val), "i" (EFLAGS_MASK)); \ 469 "l", unsigned int); \
470 break; \
471 case 8: \
472 ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
473 "q", unsigned long)); \
474 break; \
475 } \
477 } while (0) 476 } while (0)
478 477
479#define __emulate_1op_8byte(_op, _dst, _eflags) \ 478#define __emulate_1op(_op, _dst, _eflags, _suffix) \
480 do { \ 479 do { \
481 __asm__ __volatile__ ( \ 480 unsigned long _tmp; \
482 _PRE_EFLAGS("0", "3", "2") \ 481 \
483 _op"q %1; " \ 482 __asm__ __volatile__ ( \
484 _POST_EFLAGS("0", "3", "2") \ 483 _PRE_EFLAGS("0", "3", "2") \
485 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ 484 _op _suffix " %1; " \
486 : "i" (EFLAGS_MASK)); \ 485 _POST_EFLAGS("0", "3", "2") \
486 : "=m" (_eflags), "+m" ((_dst).val), \
487 "=&r" (_tmp) \
488 : "i" (EFLAGS_MASK)); \
487 } while (0) 489 } while (0)
488 490
489#elif defined(__i386__) 491/* Instruction has only one explicit operand (no source operand). */
490#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) 492#define emulate_1op(_op, _dst, _eflags) \
491#define __emulate_1op_8byte(_op, _dst, _eflags) 493 do { \
492#endif /* __i386__ */ 494 switch ((_dst).bytes) { \
495 case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \
496 case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \
497 case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \
498 case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
499 } \
500 } while (0)
493 501
494/* Fetch next part of the instruction being emulated. */ 502/* Fetch next part of the instruction being emulated. */
495#define insn_fetch(_type, _size, _eip) \ 503#define insn_fetch(_type, _size, _eip) \
@@ -1041,6 +1049,33 @@ done_prefixes:
1041 c->src.bytes = 1; 1049 c->src.bytes = 1;
1042 c->src.val = insn_fetch(s8, 1, c->eip); 1050 c->src.val = insn_fetch(s8, 1, c->eip);
1043 break; 1051 break;
1052 case SrcOne:
1053 c->src.bytes = 1;
1054 c->src.val = 1;
1055 break;
1056 }
1057
1058 /*
1059 * Decode and fetch the second source operand: register, memory
1060 * or immediate.
1061 */
1062 switch (c->d & Src2Mask) {
1063 case Src2None:
1064 break;
1065 case Src2CL:
1066 c->src2.bytes = 1;
1067 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
1068 break;
1069 case Src2ImmByte:
1070 c->src2.type = OP_IMM;
1071 c->src2.ptr = (unsigned long *)c->eip;
1072 c->src2.bytes = 1;
1073 c->src2.val = insn_fetch(u8, 1, c->eip);
1074 break;
1075 case Src2One:
1076 c->src2.bytes = 1;
1077 c->src2.val = 1;
1078 break;
1044 } 1079 }
1045 1080
1046 /* Decode and fetch the destination operand: register or memory. */ 1081 /* Decode and fetch the destination operand: register or memory. */
@@ -1100,20 +1135,33 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1100 c->regs[VCPU_REGS_RSP]); 1135 c->regs[VCPU_REGS_RSP]);
1101} 1136}
1102 1137
1103static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1138static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1104 struct x86_emulate_ops *ops) 1139 struct x86_emulate_ops *ops)
1105{ 1140{
1106 struct decode_cache *c = &ctxt->decode; 1141 struct decode_cache *c = &ctxt->decode;
1107 int rc; 1142 int rc;
1108 1143
1109 rc = ops->read_std(register_address(c, ss_base(ctxt), 1144 rc = ops->read_emulated(register_address(c, ss_base(ctxt),
1110 c->regs[VCPU_REGS_RSP]), 1145 c->regs[VCPU_REGS_RSP]),
1111 &c->dst.val, c->dst.bytes, ctxt->vcpu); 1146 &c->src.val, c->src.bytes, ctxt->vcpu);
1112 if (rc != 0) 1147 if (rc != 0)
1113 return rc; 1148 return rc;
1114 1149
1115 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes); 1150 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes);
1151 return rc;
1152}
1153
1154static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1155 struct x86_emulate_ops *ops)
1156{
1157 struct decode_cache *c = &ctxt->decode;
1158 int rc;
1116 1159
1160 c->src.bytes = c->dst.bytes;
1161 rc = emulate_pop(ctxt, ops);
1162 if (rc != 0)
1163 return rc;
1164 c->dst.val = c->src.val;
1117 return 0; 1165 return 0;
1118} 1166}
1119 1167
@@ -1415,24 +1463,15 @@ special_insn:
1415 emulate_1op("dec", c->dst, ctxt->eflags); 1463 emulate_1op("dec", c->dst, ctxt->eflags);
1416 break; 1464 break;
1417 case 0x50 ... 0x57: /* push reg */ 1465 case 0x50 ... 0x57: /* push reg */
1418 c->dst.type = OP_MEM; 1466 emulate_push(ctxt);
1419 c->dst.bytes = c->op_bytes;
1420 c->dst.val = c->src.val;
1421 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1422 -c->op_bytes);
1423 c->dst.ptr = (void *) register_address(
1424 c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
1425 break; 1467 break;
1426 case 0x58 ... 0x5f: /* pop reg */ 1468 case 0x58 ... 0x5f: /* pop reg */
1427 pop_instruction: 1469 pop_instruction:
1428 if ((rc = ops->read_std(register_address(c, ss_base(ctxt), 1470 c->src.bytes = c->op_bytes;
1429 c->regs[VCPU_REGS_RSP]), c->dst.ptr, 1471 rc = emulate_pop(ctxt, ops);
1430 c->op_bytes, ctxt->vcpu)) != 0) 1472 if (rc != 0)
1431 goto done; 1473 goto done;
1432 1474 c->dst.val = c->src.val;
1433 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1434 c->op_bytes);
1435 c->dst.type = OP_NONE; /* Disable writeback. */
1436 break; 1475 break;
1437 case 0x63: /* movsxd */ 1476 case 0x63: /* movsxd */
1438 if (ctxt->mode != X86EMUL_MODE_PROT64) 1477 if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -1591,7 +1630,9 @@ special_insn:
1591 emulate_push(ctxt); 1630 emulate_push(ctxt);
1592 break; 1631 break;
1593 case 0x9d: /* popf */ 1632 case 0x9d: /* popf */
1633 c->dst.type = OP_REG;
1594 c->dst.ptr = (unsigned long *) &ctxt->eflags; 1634 c->dst.ptr = (unsigned long *) &ctxt->eflags;
1635 c->dst.bytes = c->op_bytes;
1595 goto pop_instruction; 1636 goto pop_instruction;
1596 case 0xa0 ... 0xa1: /* mov */ 1637 case 0xa0 ... 0xa1: /* mov */
1597 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 1638 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
@@ -1689,7 +1730,9 @@ special_insn:
1689 emulate_grp2(ctxt); 1730 emulate_grp2(ctxt);
1690 break; 1731 break;
1691 case 0xc3: /* ret */ 1732 case 0xc3: /* ret */
1733 c->dst.type = OP_REG;
1692 c->dst.ptr = &c->eip; 1734 c->dst.ptr = &c->eip;
1735 c->dst.bytes = c->op_bytes;
1693 goto pop_instruction; 1736 goto pop_instruction;
1694 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ 1737 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1695 mov: 1738 mov:
@@ -1778,7 +1821,7 @@ special_insn:
1778 c->eip = saved_eip; 1821 c->eip = saved_eip;
1779 goto cannot_emulate; 1822 goto cannot_emulate;
1780 } 1823 }
1781 return 0; 1824 break;
1782 case 0xf4: /* hlt */ 1825 case 0xf4: /* hlt */
1783 ctxt->vcpu->arch.halt_request = 1; 1826 ctxt->vcpu->arch.halt_request = 1;
1784 break; 1827 break;
@@ -1999,12 +2042,20 @@ twobyte_insn:
1999 c->src.val &= (c->dst.bytes << 3) - 1; 2042 c->src.val &= (c->dst.bytes << 3) - 1;
2000 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); 2043 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
2001 break; 2044 break;
2045 case 0xa4: /* shld imm8, r, r/m */
2046 case 0xa5: /* shld cl, r, r/m */
2047 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
2048 break;
2002 case 0xab: 2049 case 0xab:
2003 bts: /* bts */ 2050 bts: /* bts */
2004 /* only subword offset */ 2051 /* only subword offset */
2005 c->src.val &= (c->dst.bytes << 3) - 1; 2052 c->src.val &= (c->dst.bytes << 3) - 1;
2006 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 2053 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
2007 break; 2054 break;
2055 case 0xac: /* shrd imm8, r, r/m */
2056 case 0xad: /* shrd cl, r, r/m */
2057 emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
2058 break;
2008 case 0xae: /* clflush */ 2059 case 0xae: /* clflush */
2009 break; 2060 break;
2010 case 0xb0 ... 0xb1: /* cmpxchg */ 2061 case 0xb0 ... 0xb1: /* cmpxchg */
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a5d8e1ace1cf..a7ed208f81e3 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -590,7 +590,8 @@ static void __init lguest_init_IRQ(void)
590 * a straightforward 1 to 1 mapping, so force that here. */ 590 * a straightforward 1 to 1 mapping, so force that here. */
591 __get_cpu_var(vector_irq)[vector] = i; 591 __get_cpu_var(vector_irq)[vector] = i;
592 if (vector != SYSCALL_VECTOR) { 592 if (vector != SYSCALL_VECTOR) {
593 set_intr_gate(vector, interrupt[vector]); 593 set_intr_gate(vector,
594 interrupt[vector-FIRST_EXTERNAL_VECTOR]);
594 set_irq_chip_and_handler_name(i, &lguest_irq_controller, 595 set_irq_chip_and_handler_name(i, &lguest_irq_controller,
595 handle_level_irq, 596 handle_level_irq,
596 "level"); 597 "level");
@@ -737,7 +738,7 @@ static void lguest_time_init(void)
737 738
738 /* We can't set cpumask in the initializer: damn C limitations! Set it 739 /* We can't set cpumask in the initializer: damn C limitations! Set it
739 * here and register our timer device. */ 740 * here and register our timer device. */
740 lguest_clockevent.cpumask = cpumask_of_cpu(0); 741 lguest_clockevent.cpumask = cpumask_of(0);
741 clockevents_register_device(&lguest_clockevent); 742 clockevents_register_device(&lguest_clockevent);
742 743
743 /* Finally, we unblock the timer interrupt. */ 744 /* Finally, we unblock the timer interrupt. */
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 5c7cef34c9e7..10b9bd35a8ff 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -30,21 +30,6 @@ ENTRY(lguest_entry)
30 movl $lguest_data - __PAGE_OFFSET, %edx 30 movl $lguest_data - __PAGE_OFFSET, %edx
31 int $LGUEST_TRAP_ENTRY 31 int $LGUEST_TRAP_ENTRY
32 32
33 /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
34 * instruction uses %esi implicitly as the source for the copy we're
35 * about to do. */
36 movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
37
38 /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
39 * This means the first 128M of kernel memory will be mapped at
40 * PAGE_OFFSET where the kernel expects to run. This will get it far
41 * enough through boot to switch to its own pagetables. */
42 movl $32, %ecx
43 movl %esi, %edi
44 addl $((__PAGE_OFFSET >> 22) * 4), %edi
45 rep
46 movsl
47
48 /* Set up the initial stack so we can run C code. */ 33 /* Set up the initial stack so we can run C code. */
49 movl $(init_thread_union+THREAD_SIZE),%esp 34 movl $(init_thread_union+THREAD_SIZE),%esp
50 35
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 9e68075544f6..4a20b2f9a381 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -39,7 +39,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
39#define __do_strncpy_from_user(dst, src, count, res) \ 39#define __do_strncpy_from_user(dst, src, count, res) \
40do { \ 40do { \
41 int __d0, __d1, __d2; \ 41 int __d0, __d1, __d2; \
42 might_sleep(); \ 42 might_fault(); \
43 __asm__ __volatile__( \ 43 __asm__ __volatile__( \
44 " testl %1,%1\n" \ 44 " testl %1,%1\n" \
45 " jz 2f\n" \ 45 " jz 2f\n" \
@@ -126,7 +126,7 @@ EXPORT_SYMBOL(strncpy_from_user);
126#define __do_clear_user(addr,size) \ 126#define __do_clear_user(addr,size) \
127do { \ 127do { \
128 int __d0; \ 128 int __d0; \
129 might_sleep(); \ 129 might_fault(); \
130 __asm__ __volatile__( \ 130 __asm__ __volatile__( \
131 "0: rep; stosl\n" \ 131 "0: rep; stosl\n" \
132 " movl %2,%0\n" \ 132 " movl %2,%0\n" \
@@ -155,7 +155,7 @@ do { \
155unsigned long 155unsigned long
156clear_user(void __user *to, unsigned long n) 156clear_user(void __user *to, unsigned long n)
157{ 157{
158 might_sleep(); 158 might_fault();
159 if (access_ok(VERIFY_WRITE, to, n)) 159 if (access_ok(VERIFY_WRITE, to, n))
160 __do_clear_user(to, n); 160 __do_clear_user(to, n);
161 return n; 161 return n;
@@ -197,7 +197,7 @@ long strnlen_user(const char __user *s, long n)
197 unsigned long mask = -__addr_ok(s); 197 unsigned long mask = -__addr_ok(s);
198 unsigned long res, tmp; 198 unsigned long res, tmp;
199 199
200 might_sleep(); 200 might_fault();
201 201
202 __asm__ __volatile__( 202 __asm__ __volatile__(
203 " testl %0, %0\n" 203 " testl %0, %0\n"
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index f4df6e7c718b..64d6c84e6353 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -15,7 +15,7 @@
15#define __do_strncpy_from_user(dst,src,count,res) \ 15#define __do_strncpy_from_user(dst,src,count,res) \
16do { \ 16do { \
17 long __d0, __d1, __d2; \ 17 long __d0, __d1, __d2; \
18 might_sleep(); \ 18 might_fault(); \
19 __asm__ __volatile__( \ 19 __asm__ __volatile__( \
20 " testq %1,%1\n" \ 20 " testq %1,%1\n" \
21 " jz 2f\n" \ 21 " jz 2f\n" \
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user);
64unsigned long __clear_user(void __user *addr, unsigned long size) 64unsigned long __clear_user(void __user *addr, unsigned long size)
65{ 65{
66 long __d0; 66 long __d0;
67 might_sleep(); 67 might_fault();
68 /* no memory constraint because it doesn't change any memory gcc knows 68 /* no memory constraint because it doesn't change any memory gcc knows
69 about */ 69 about */
70 asm volatile( 70 asm volatile(
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 37b9ae4d44c5..df167f265622 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -133,29 +133,28 @@ void __init time_init_hook(void)
133 **/ 133 **/
134void mca_nmi_hook(void) 134void mca_nmi_hook(void)
135{ 135{
136 /* If I recall correctly, there's a whole bunch of other things that 136 /*
137 * If I recall correctly, there's a whole bunch of other things that
137 * we can do to check for NMI problems, but that's all I know about 138 * we can do to check for NMI problems, but that's all I know about
138 * at the moment. 139 * at the moment.
139 */ 140 */
140 141 pr_warning("NMI generated from unknown source!\n");
141 printk("NMI generated from unknown source!\n");
142} 142}
143#endif 143#endif
144 144
145static __init int no_ipi_broadcast(char *str) 145static __init int no_ipi_broadcast(char *str)
146{ 146{
147 get_option(&str, &no_broadcast); 147 get_option(&str, &no_broadcast);
148 printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" : 148 pr_info("Using %s mode\n",
149 "IPI Broadcast"); 149 no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
150 return 1; 150 return 1;
151} 151}
152
153__setup("no_ipi_broadcast=", no_ipi_broadcast); 152__setup("no_ipi_broadcast=", no_ipi_broadcast);
154 153
155static int __init print_ipi_mode(void) 154static int __init print_ipi_mode(void)
156{ 155{
157 printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" : 156 pr_info("Using IPI %s mode\n",
158 "Shortcut"); 157 no_broadcast ? "No-Shortcut" : "Shortcut");
159 return 0; 158 return 0;
160} 159}
161 160
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 3c3b471ea496..bc4c7840b2a8 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -17,6 +17,7 @@
17#include <asm/bigsmp/apic.h> 17#include <asm/bigsmp/apic.h>
18#include <asm/bigsmp/ipi.h> 18#include <asm/bigsmp/ipi.h>
19#include <asm/mach-default/mach_mpparse.h> 19#include <asm/mach-default/mach_mpparse.h>
20#include <asm/mach-default/mach_wakecpu.h>
20 21
21static int dmi_bigsmp; /* can be set by dmi scanners */ 22static int dmi_bigsmp; /* can be set by dmi scanners */
22 23
@@ -41,9 +42,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
41 { } 42 { }
42}; 43};
43 44
44static cpumask_t vector_allocation_domain(int cpu) 45static void vector_allocation_domain(int cpu, cpumask_t *retmask)
45{ 46{
46 return cpumask_of_cpu(cpu); 47 cpus_clear(*retmask);
48 cpu_set(cpu, *retmask);
47} 49}
48 50
49static int probe_bigsmp(void) 51static int probe_bigsmp(void)
diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c
index 9e835a11a13a..e63a4a76d8cd 100644
--- a/arch/x86/mach-generic/default.c
+++ b/arch/x86/mach-generic/default.c
@@ -16,6 +16,7 @@
16#include <asm/mach-default/mach_apic.h> 16#include <asm/mach-default/mach_apic.h>
17#include <asm/mach-default/mach_ipi.h> 17#include <asm/mach-default/mach_ipi.h>
18#include <asm/mach-default/mach_mpparse.h> 18#include <asm/mach-default/mach_mpparse.h>
19#include <asm/mach-default/mach_wakecpu.h>
19 20
20/* should be called last. */ 21/* should be called last. */
21static int probe_default(void) 22static int probe_default(void)
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 28459cab3ddb..4ba5ccaa1584 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -16,7 +16,19 @@
16#include <asm/es7000/apic.h> 16#include <asm/es7000/apic.h>
17#include <asm/es7000/ipi.h> 17#include <asm/es7000/ipi.h>
18#include <asm/es7000/mpparse.h> 18#include <asm/es7000/mpparse.h>
19#include <asm/es7000/wakecpu.h> 19#include <asm/mach-default/mach_wakecpu.h>
20
21void __init es7000_update_genapic_to_cluster(void)
22{
23 genapic->target_cpus = target_cpus_cluster;
24 genapic->int_delivery_mode = INT_DELIVERY_MODE_CLUSTER;
25 genapic->int_dest_mode = INT_DEST_MODE_CLUSTER;
26 genapic->no_balance_irq = NO_BALANCE_IRQ_CLUSTER;
27
28 genapic->init_apic_ldr = init_apic_ldr_cluster;
29
30 genapic->cpu_mask_to_apicid = cpu_mask_to_apicid_cluster;
31}
20 32
21static int probe_es7000(void) 33static int probe_es7000(void)
22{ 34{
@@ -75,7 +87,7 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
75} 87}
76#endif 88#endif
77 89
78static cpumask_t vector_allocation_domain(int cpu) 90static void vector_allocation_domain(int cpu, cpumask_t *retmask)
79{ 91{
80 /* Careful. Some cpus do not strictly honor the set of cpus 92 /* Careful. Some cpus do not strictly honor the set of cpus
81 * specified in the interrupt destination when using lowest 93 * specified in the interrupt destination when using lowest
@@ -85,8 +97,7 @@ static cpumask_t vector_allocation_domain(int cpu)
85 * deliver interrupts to the wrong hyperthread when only one 97 * deliver interrupts to the wrong hyperthread when only one
86 * hyperthread was specified in the interrupt desitination. 98 * hyperthread was specified in the interrupt desitination.
87 */ 99 */
88 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; 100 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
89 return domain;
90} 101}
91 102
92struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); 103struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
index 71a309b122e6..511d7941364f 100644
--- a/arch/x86/mach-generic/numaq.c
+++ b/arch/x86/mach-generic/numaq.c
@@ -38,7 +38,7 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
38 return 0; 38 return 0;
39} 39}
40 40
41static cpumask_t vector_allocation_domain(int cpu) 41static void vector_allocation_domain(int cpu, cpumask_t *retmask)
42{ 42{
43 /* Careful. Some cpus do not strictly honor the set of cpus 43 /* Careful. Some cpus do not strictly honor the set of cpus
44 * specified in the interrupt destination when using lowest 44 * specified in the interrupt destination when using lowest
@@ -48,8 +48,7 @@ static cpumask_t vector_allocation_domain(int cpu)
48 * deliver interrupts to the wrong hyperthread when only one 48 * deliver interrupts to the wrong hyperthread when only one
49 * hyperthread was specified in the interrupt desitination. 49 * hyperthread was specified in the interrupt desitination.
50 */ 50 */
51 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; 51 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
52 return domain;
53} 52}
54 53
55struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); 54struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c
index 5a7e4619e1c4..c346d9d0226f 100644
--- a/arch/x86/mach-generic/probe.c
+++ b/arch/x86/mach-generic/probe.c
@@ -15,6 +15,7 @@
15#include <asm/mpspec.h> 15#include <asm/mpspec.h>
16#include <asm/apicdef.h> 16#include <asm/apicdef.h>
17#include <asm/genapic.h> 17#include <asm/genapic.h>
18#include <asm/setup.h>
18 19
19extern struct genapic apic_numaq; 20extern struct genapic apic_numaq;
20extern struct genapic apic_summit; 21extern struct genapic apic_summit;
@@ -57,6 +58,9 @@ static int __init parse_apic(char *arg)
57 } 58 }
58 } 59 }
59 60
61 if (x86_quirks->update_genapic)
62 x86_quirks->update_genapic();
63
60 /* Parsed again by __setup for debug/verbose */ 64 /* Parsed again by __setup for debug/verbose */
61 return 0; 65 return 0;
62} 66}
@@ -72,12 +76,15 @@ void __init generic_bigsmp_probe(void)
72 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support 76 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
73 */ 77 */
74 78
75 if (!cmdline_apic && genapic == &apic_default) 79 if (!cmdline_apic && genapic == &apic_default) {
76 if (apic_bigsmp.probe()) { 80 if (apic_bigsmp.probe()) {
77 genapic = &apic_bigsmp; 81 genapic = &apic_bigsmp;
82 if (x86_quirks->update_genapic)
83 x86_quirks->update_genapic();
78 printk(KERN_INFO "Overriding APIC driver with %s\n", 84 printk(KERN_INFO "Overriding APIC driver with %s\n",
79 genapic->name); 85 genapic->name);
80 } 86 }
87 }
81#endif 88#endif
82} 89}
83 90
@@ -94,6 +101,9 @@ void __init generic_apic_probe(void)
94 /* Not visible without early console */ 101 /* Not visible without early console */
95 if (!apic_probe[i]) 102 if (!apic_probe[i])
96 panic("Didn't find an APIC driver"); 103 panic("Didn't find an APIC driver");
104
105 if (x86_quirks->update_genapic)
106 x86_quirks->update_genapic();
97 } 107 }
98 printk(KERN_INFO "Using APIC driver %s\n", genapic->name); 108 printk(KERN_INFO "Using APIC driver %s\n", genapic->name);
99} 109}
@@ -108,6 +118,8 @@ int __init mps_oem_check(struct mp_config_table *mpc, char *oem,
108 if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) { 118 if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) {
109 if (!cmdline_apic) { 119 if (!cmdline_apic) {
110 genapic = apic_probe[i]; 120 genapic = apic_probe[i];
121 if (x86_quirks->update_genapic)
122 x86_quirks->update_genapic();
111 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 123 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
112 genapic->name); 124 genapic->name);
113 } 125 }
@@ -124,6 +136,8 @@ int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
124 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { 136 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
125 if (!cmdline_apic) { 137 if (!cmdline_apic) {
126 genapic = apic_probe[i]; 138 genapic = apic_probe[i];
139 if (x86_quirks->update_genapic)
140 x86_quirks->update_genapic();
127 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 141 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
128 genapic->name); 142 genapic->name);
129 } 143 }
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index 6272b5e69da6..2821ffc188b5 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -16,6 +16,7 @@
16#include <asm/summit/apic.h> 16#include <asm/summit/apic.h>
17#include <asm/summit/ipi.h> 17#include <asm/summit/ipi.h>
18#include <asm/summit/mpparse.h> 18#include <asm/summit/mpparse.h>
19#include <asm/mach-default/mach_wakecpu.h>
19 20
20static int probe_summit(void) 21static int probe_summit(void)
21{ 22{
@@ -23,7 +24,7 @@ static int probe_summit(void)
23 return 0; 24 return 0;
24} 25}
25 26
26static cpumask_t vector_allocation_domain(int cpu) 27static void vector_allocation_domain(int cpu, cpumask_t *retmask)
27{ 28{
28 /* Careful. Some cpus do not strictly honor the set of cpus 29 /* Careful. Some cpus do not strictly honor the set of cpus
29 * specified in the interrupt destination when using lowest 30 * specified in the interrupt destination when using lowest
@@ -33,8 +34,7 @@ static cpumask_t vector_allocation_domain(int cpu)
33 * deliver interrupts to the wrong hyperthread when only one 34 * deliver interrupts to the wrong hyperthread when only one
34 * hyperthread was specified in the interrupt desitination. 35 * hyperthread was specified in the interrupt desitination.
35 */ 36 */
36 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; 37 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
37 return domain;
38} 38}
39 39
40struct genapic apic_summit = APIC_INIT("summit", probe_summit); 40struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 52145007bd7e..9840b7ec749a 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -63,11 +63,6 @@ static int voyager_extended_cpus = 1;
63/* Used for the invalidate map that's also checked in the spinlock */ 63/* Used for the invalidate map that's also checked in the spinlock */
64static volatile unsigned long smp_invalidate_needed; 64static volatile unsigned long smp_invalidate_needed;
65 65
66/* Bitmask of currently online CPUs - used by setup.c for
67 /proc/cpuinfo, visible externally but still physical */
68cpumask_t cpu_online_map = CPU_MASK_NONE;
69EXPORT_SYMBOL(cpu_online_map);
70
71/* Bitmask of CPUs present in the system - exported by i386_syms.c, used 66/* Bitmask of CPUs present in the system - exported by i386_syms.c, used
72 * by scheduler but indexed physically */ 67 * by scheduler but indexed physically */
73cpumask_t phys_cpu_present_map = CPU_MASK_NONE; 68cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
@@ -218,8 +213,6 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
218/* This is for the new dynamic CPU boot code */ 213/* This is for the new dynamic CPU boot code */
219cpumask_t cpu_callin_map = CPU_MASK_NONE; 214cpumask_t cpu_callin_map = CPU_MASK_NONE;
220cpumask_t cpu_callout_map = CPU_MASK_NONE; 215cpumask_t cpu_callout_map = CPU_MASK_NONE;
221cpumask_t cpu_possible_map = CPU_MASK_NONE;
222EXPORT_SYMBOL(cpu_possible_map);
223 216
224/* The per processor IRQ masks (these are usually kept in sync) */ 217/* The per processor IRQ masks (these are usually kept in sync) */
225static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned; 218static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned;
@@ -364,9 +357,8 @@ void __init find_smp_config(void)
364 printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id); 357 printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id);
365 358
366 /* initialize the CPU structures (moved from smp_boot_cpus) */ 359 /* initialize the CPU structures (moved from smp_boot_cpus) */
367 for (i = 0; i < NR_CPUS; i++) { 360 for (i = 0; i < nr_cpu_ids; i++)
368 cpu_irq_affinity[i] = ~0; 361 cpu_irq_affinity[i] = ~0;
369 }
370 cpu_online_map = cpumask_of_cpu(boot_cpu_id); 362 cpu_online_map = cpumask_of_cpu(boot_cpu_id);
371 363
372 /* The boot CPU must be extended */ 364 /* The boot CPU must be extended */
@@ -679,7 +671,7 @@ void __init smp_boot_cpus(void)
679 671
680 /* loop over all the extended VIC CPUs and boot them. The 672 /* loop over all the extended VIC CPUs and boot them. The
681 * Quad CPUs must be bootstrapped by their extended VIC cpu */ 673 * Quad CPUs must be bootstrapped by their extended VIC cpu */
682 for (i = 0; i < NR_CPUS; i++) { 674 for (i = 0; i < nr_cpu_ids; i++) {
683 if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map)) 675 if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map))
684 continue; 676 continue;
685 do_boot_cpu(i); 677 do_boot_cpu(i);
@@ -1234,7 +1226,7 @@ int setup_profiling_timer(unsigned int multiplier)
1234 * new values until the next timer interrupt in which they do process 1226 * new values until the next timer interrupt in which they do process
1235 * accounting. 1227 * accounting.
1236 */ 1228 */
1237 for (i = 0; i < NR_CPUS; ++i) 1229 for (i = 0; i < nr_cpu_ids; ++i)
1238 per_cpu(prof_multiplier, i) = multiplier; 1230 per_cpu(prof_multiplier, i) = multiplier;
1239 1231
1240 return 0; 1232 return 0;
@@ -1264,7 +1256,7 @@ void __init voyager_smp_intr_init(void)
1264 int i; 1256 int i;
1265 1257
1266 /* initialize the per cpu irq mask to all disabled */ 1258 /* initialize the per cpu irq mask to all disabled */
1267 for (i = 0; i < NR_CPUS; i++) 1259 for (i = 0; i < nr_cpu_ids; i++)
1268 vic_irq_mask[i] = 0xFFFF; 1260 vic_irq_mask[i] = 0xFFFF;
1269 1261
1270 VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt); 1262 VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fea4565ff576..d8cc96a2738f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -8,9 +8,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
8 8
9obj-$(CONFIG_HIGHMEM) += highmem_32.o 9obj-$(CONFIG_HIGHMEM) += highmem_32.o
10 10
11obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o
12obj-$(CONFIG_MMIOTRACE) += mmiotrace.o 11obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
13mmiotrace-y := pf_in.o mmio-mod.o 12mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 13obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
15 14
16obj-$(CONFIG_NUMA) += numa_$(BITS).o 15obj-$(CONFIG_NUMA) += numa_$(BITS).o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 31e8730fa246..57ec8c86a877 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -53,7 +53,7 @@
53 53
54static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) 54static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
55{ 55{
56#ifdef CONFIG_MMIOTRACE_HOOKS 56#ifdef CONFIG_MMIOTRACE
57 if (unlikely(is_kmmio_active())) 57 if (unlikely(is_kmmio_active()))
58 if (kmmio_handler(regs, addr) == 1) 58 if (kmmio_handler(regs, addr) == 1)
59 return -1; 59 return -1;
@@ -393,7 +393,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
393 if (pte && pte_present(*pte) && !pte_exec(*pte)) 393 if (pte && pte_present(*pte) && !pte_exec(*pte))
394 printk(KERN_CRIT "kernel tried to execute " 394 printk(KERN_CRIT "kernel tried to execute "
395 "NX-protected page - exploit attempt? " 395 "NX-protected page - exploit attempt? "
396 "(uid: %d)\n", current->uid); 396 "(uid: %d)\n", current_uid());
397 } 397 }
398#endif 398#endif
399 399
@@ -413,6 +413,7 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
413 unsigned long error_code) 413 unsigned long error_code)
414{ 414{
415 unsigned long flags = oops_begin(); 415 unsigned long flags = oops_begin();
416 int sig = SIGKILL;
416 struct task_struct *tsk; 417 struct task_struct *tsk;
417 418
418 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 419 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
@@ -423,8 +424,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
423 tsk->thread.trap_no = 14; 424 tsk->thread.trap_no = 14;
424 tsk->thread.error_code = error_code; 425 tsk->thread.error_code = error_code;
425 if (__die("Bad pagetable", regs, error_code)) 426 if (__die("Bad pagetable", regs, error_code))
426 regs = NULL; 427 sig = 0;
427 oops_end(flags, regs, SIGKILL); 428 oops_end(flags, regs, sig);
428} 429}
429#endif 430#endif
430 431
@@ -590,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
590 int fault; 591 int fault;
591#ifdef CONFIG_X86_64 592#ifdef CONFIG_X86_64
592 unsigned long flags; 593 unsigned long flags;
594 int sig;
593#endif 595#endif
594 596
595 tsk = current; 597 tsk = current;
@@ -849,11 +851,12 @@ no_context:
849 bust_spinlocks(0); 851 bust_spinlocks(0);
850 do_exit(SIGKILL); 852 do_exit(SIGKILL);
851#else 853#else
854 sig = SIGKILL;
852 if (__die("Oops", regs, error_code)) 855 if (__die("Oops", regs, error_code))
853 regs = NULL; 856 sig = 0;
854 /* Executive summary in case the body of the oops scrolled away */ 857 /* Executive summary in case the body of the oops scrolled away */
855 printk(KERN_EMERG "CR2: %016lx\n", address); 858 printk(KERN_EMERG "CR2: %016lx\n", address);
856 oops_end(flags, regs, SIGKILL); 859 oops_end(flags, regs, sig);
857#endif 860#endif
858 861
859/* 862/*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c483f4242079..f99a6c6c432e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -21,6 +21,7 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/highmem.h> 22#include <linux/highmem.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/pci.h>
24#include <linux/pfn.h> 25#include <linux/pfn.h>
25#include <linux/poison.h> 26#include <linux/poison.h>
26#include <linux/bootmem.h> 27#include <linux/bootmem.h>
@@ -67,7 +68,7 @@ static unsigned long __meminitdata table_top;
67 68
68static int __initdata after_init_bootmem; 69static int __initdata after_init_bootmem;
69 70
70static __init void *alloc_low_page(unsigned long *phys) 71static __init void *alloc_low_page(void)
71{ 72{
72 unsigned long pfn = table_end++; 73 unsigned long pfn = table_end++;
73 void *adr; 74 void *adr;
@@ -77,7 +78,6 @@ static __init void *alloc_low_page(unsigned long *phys)
77 78
78 adr = __va(pfn * PAGE_SIZE); 79 adr = __va(pfn * PAGE_SIZE);
79 memset(adr, 0, PAGE_SIZE); 80 memset(adr, 0, PAGE_SIZE);
80 *phys = pfn * PAGE_SIZE;
81 return adr; 81 return adr;
82} 82}
83 83
@@ -92,16 +92,17 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
92 pmd_t *pmd_table; 92 pmd_t *pmd_table;
93 93
94#ifdef CONFIG_X86_PAE 94#ifdef CONFIG_X86_PAE
95 unsigned long phys;
96 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 95 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
97 if (after_init_bootmem) 96 if (after_init_bootmem)
98 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); 97 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
99 else 98 else
100 pmd_table = (pmd_t *)alloc_low_page(&phys); 99 pmd_table = (pmd_t *)alloc_low_page();
101 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 100 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
102 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 101 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
103 pud = pud_offset(pgd, 0); 102 pud = pud_offset(pgd, 0);
104 BUG_ON(pmd_table != pmd_offset(pud, 0)); 103 BUG_ON(pmd_table != pmd_offset(pud, 0));
104
105 return pmd_table;
105 } 106 }
106#endif 107#endif
107 pud = pud_offset(pgd, 0); 108 pud = pud_offset(pgd, 0);
@@ -126,10 +127,8 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
126 if (!page_table) 127 if (!page_table)
127 page_table = 128 page_table =
128 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 129 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
129 } else { 130 } else
130 unsigned long phys; 131 page_table = (pte_t *)alloc_low_page();
131 page_table = (pte_t *)alloc_low_page(&phys);
132 }
133 132
134 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); 133 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
135 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); 134 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -436,8 +435,12 @@ static void __init set_highmem_pages_init(void)
436#endif /* !CONFIG_NUMA */ 435#endif /* !CONFIG_NUMA */
437 436
438#else 437#else
439# define permanent_kmaps_init(pgd_base) do { } while (0) 438static inline void permanent_kmaps_init(pgd_t *pgd_base)
440# define set_highmem_pages_init() do { } while (0) 439{
440}
441static inline void set_highmem_pages_init(void)
442{
443}
441#endif /* CONFIG_HIGHMEM */ 444#endif /* CONFIG_HIGHMEM */
442 445
443void __init native_pagetable_setup_start(pgd_t *base) 446void __init native_pagetable_setup_start(pgd_t *base)
@@ -969,7 +972,7 @@ void __init mem_init(void)
969 int codesize, reservedpages, datasize, initsize; 972 int codesize, reservedpages, datasize, initsize;
970 int tmp; 973 int tmp;
971 974
972 start_periodic_check_for_corruption(); 975 pci_iommu_alloc();
973 976
974#ifdef CONFIG_FLATMEM 977#ifdef CONFIG_FLATMEM
975 BUG_ON(!mem_map); 978 BUG_ON(!mem_map);
@@ -1040,11 +1043,25 @@ void __init mem_init(void)
1040 (unsigned long)&_text, (unsigned long)&_etext, 1043 (unsigned long)&_text, (unsigned long)&_etext,
1041 ((unsigned long)&_etext - (unsigned long)&_text) >> 10); 1044 ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
1042 1045
1046 /*
1047 * Check boundaries twice: Some fundamental inconsistencies can
1048 * be detected at build time already.
1049 */
1050#define __FIXADDR_TOP (-PAGE_SIZE)
1051#ifdef CONFIG_HIGHMEM
1052 BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
1053 BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
1054#endif
1055#define high_memory (-128UL << 20)
1056 BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
1057#undef high_memory
1058#undef __FIXADDR_TOP
1059
1043#ifdef CONFIG_HIGHMEM 1060#ifdef CONFIG_HIGHMEM
1044 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); 1061 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
1045 BUG_ON(VMALLOC_END > PKMAP_BASE); 1062 BUG_ON(VMALLOC_END > PKMAP_BASE);
1046#endif 1063#endif
1047 BUG_ON(VMALLOC_START > VMALLOC_END); 1064 BUG_ON(VMALLOC_START >= VMALLOC_END);
1048 BUG_ON((unsigned long)high_memory > VMALLOC_START); 1065 BUG_ON((unsigned long)high_memory > VMALLOC_START);
1049 1066
1050 if (boot_cpu_data.wp_works_ok < 0) 1067 if (boot_cpu_data.wp_works_ok < 0)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9db01db6e3cd..9f7a0d24d42a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -902,8 +902,6 @@ void __init mem_init(void)
902 long codesize, reservedpages, datasize, initsize; 902 long codesize, reservedpages, datasize, initsize;
903 unsigned long absent_pages; 903 unsigned long absent_pages;
904 904
905 start_periodic_check_for_corruption();
906
907 pci_iommu_alloc(); 905 pci_iommu_alloc();
908 906
909 /* clear_bss() already clear the empty_zero_page */ 907 /* clear_bss() already clear the empty_zero_page */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index d4c4307ff3e0..bd85d42819e1 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -223,7 +223,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
223 * Check if the request spans more than any BAR in the iomem resource 223 * Check if the request spans more than any BAR in the iomem resource
224 * tree. 224 * tree.
225 */ 225 */
226 WARN_ON(iomem_map_sanity_check(phys_addr, size)); 226 WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
227 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
227 228
228 /* 229 /*
229 * Don't allow anybody to remap normal RAM that we're using.. 230 * Don't allow anybody to remap normal RAM that we're using..
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index cebcbf152d46..71a14f89f89e 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -278,7 +278,7 @@ void __init numa_init_array(void)
278 int rr, i; 278 int rr, i;
279 279
280 rr = first_node(node_online_map); 280 rr = first_node(node_online_map);
281 for (i = 0; i < NR_CPUS; i++) { 281 for (i = 0; i < nr_cpu_ids; i++) {
282 if (early_cpu_to_node(i) != NUMA_NO_NODE) 282 if (early_cpu_to_node(i) != NUMA_NO_NODE)
283 continue; 283 continue;
284 numa_set_node(i, rr); 284 numa_set_node(i, rr);
@@ -549,7 +549,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
549 memnodemap[0] = 0; 549 memnodemap[0] = 0;
550 node_set_online(0); 550 node_set_online(0);
551 node_set(0, node_possible_map); 551 node_set(0, node_possible_map);
552 for (i = 0; i < NR_CPUS; i++) 552 for (i = 0; i < nr_cpu_ids; i++)
553 numa_set_node(i, 0); 553 numa_set_node(i, 0);
554 e820_register_active_regions(0, start_pfn, last_pfn); 554 e820_register_active_regions(0, start_pfn, last_pfn);
555 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); 555 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index eb1bf000d12e..85cbd3cd3723 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -596,6 +596,242 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
596 free_memtype(addr, addr + size); 596 free_memtype(addr, addr + size);
597} 597}
598 598
599/*
600 * Internal interface to reserve a range of physical memory with prot.
601 * Reserves non-RAM regions only; after a successful reserve_memtype,
602 * this func also keeps the identity mapping (if any) in sync with the new prot.
603 */
604static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t vma_prot)
605{
606 int is_ram = 0;
607 int id_sz, ret;
608 unsigned long flags;
609 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
610
611 is_ram = pagerange_is_ram(paddr, paddr + size);
612
613 if (is_ram != 0) {
614 /*
615 * For mapping RAM pages, drivers need to call
616 * set_memory_[uc|wc|wb] directly, for reserve and free, before
617 * setting up the PTE.
618 */
619 WARN_ON_ONCE(1);
620 return 0;
621 }
622
623 ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
624 if (ret)
625 return ret;
626
627 if (flags != want_flags) {
628 free_memtype(paddr, paddr + size);
629 printk(KERN_ERR
630 "%s:%d map pfn expected mapping type %s for %Lx-%Lx, got %s\n",
631 current->comm, current->pid,
632 cattr_name(want_flags),
633 (unsigned long long)paddr,
634 (unsigned long long)(paddr + size),
635 cattr_name(flags));
636 return -EINVAL;
637 }
638
639 /* Need to keep identity mapping in sync */
640 if (paddr >= __pa(high_memory))
641 return 0;
642
643 id_sz = (__pa(high_memory) < paddr + size) ?
644 __pa(high_memory) - paddr :
645 size;
646
647 if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) {
648 free_memtype(paddr, paddr + size);
649 printk(KERN_ERR
650 "%s:%d reserve_pfn_range ioremap_change_attr failed %s "
651 "for %Lx-%Lx\n",
652 current->comm, current->pid,
653 cattr_name(flags),
654 (unsigned long long)paddr,
655 (unsigned long long)(paddr + size));
656 return -EINVAL;
657 }
658 return 0;
659}
660
661/*
662 * Internal interface to free a range of physical memory.
663 * Frees non RAM regions only.
664 */
665static void free_pfn_range(u64 paddr, unsigned long size)
666{
667 int is_ram;
668
669 is_ram = pagerange_is_ram(paddr, paddr + size);
670 if (is_ram == 0)
671 free_memtype(paddr, paddr + size);
672}
673
674/*
675 * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
676 * copied through copy_page_range().
677 *
678 * If the vma has a linear pfn mapping for the entire range, we get the prot
679 * from pte and reserve the entire vma range with single reserve_pfn_range call.
680 * Otherwise, we reserve the entire vma range, by going through the PTEs page
681 * by page to get physical address and protection.
682 */
683int track_pfn_vma_copy(struct vm_area_struct *vma)
684{
685 int retval = 0;
686 unsigned long i, j;
687 resource_size_t paddr;
688 unsigned long prot;
689 unsigned long vma_start = vma->vm_start;
690 unsigned long vma_end = vma->vm_end;
691 unsigned long vma_size = vma_end - vma_start;
692
693 if (!pat_enabled)
694 return 0;
695
696 if (is_linear_pfn_mapping(vma)) {
697 /*
698 * reserve the whole chunk covered by vma. We need the
699 * starting address and protection from pte.
700 */
701 if (follow_phys(vma, vma_start, 0, &prot, &paddr)) {
702 WARN_ON_ONCE(1);
703 return -EINVAL;
704 }
705 return reserve_pfn_range(paddr, vma_size, __pgprot(prot));
706 }
707
708 /* reserve entire vma page by page, using pfn and prot from pte */
709 for (i = 0; i < vma_size; i += PAGE_SIZE) {
710 if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
711 continue;
712
713 retval = reserve_pfn_range(paddr, PAGE_SIZE, __pgprot(prot));
714 if (retval)
715 goto cleanup_ret;
716 }
717 return 0;
718
719cleanup_ret:
720 /* Reserve error: Cleanup partial reservation and return error */
721 for (j = 0; j < i; j += PAGE_SIZE) {
722 if (follow_phys(vma, vma_start + j, 0, &prot, &paddr))
723 continue;
724
725 free_pfn_range(paddr, PAGE_SIZE);
726 }
727
728 return retval;
729}
730
731/*
732 * track_pfn_vma_new is called when a _new_ pfn mapping is being established
733 * for physical range indicated by pfn and size.
734 *
735 * prot is passed in as a parameter for the new mapping. If the vma has a
736 * linear pfn mapping for the entire range reserve the entire vma range with
737 * single reserve_pfn_range call.
738 * Otherwise, we look at the pfn and size and reserve only the specified range
739 * page by page.
740 *
741 * Note that this function can be called with caller trying to map only a
742 * subrange/page inside the vma.
743 */
744int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot,
745 unsigned long pfn, unsigned long size)
746{
747 int retval = 0;
748 unsigned long i, j;
749 resource_size_t base_paddr;
750 resource_size_t paddr;
751 unsigned long vma_start = vma->vm_start;
752 unsigned long vma_end = vma->vm_end;
753 unsigned long vma_size = vma_end - vma_start;
754
755 if (!pat_enabled)
756 return 0;
757
758 if (is_linear_pfn_mapping(vma)) {
759 /* reserve the whole chunk starting from vm_pgoff */
760 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
761 return reserve_pfn_range(paddr, vma_size, prot);
762 }
763
764 /* reserve page by page using pfn and size */
765 base_paddr = (resource_size_t)pfn << PAGE_SHIFT;
766 for (i = 0; i < size; i += PAGE_SIZE) {
767 paddr = base_paddr + i;
768 retval = reserve_pfn_range(paddr, PAGE_SIZE, prot);
769 if (retval)
770 goto cleanup_ret;
771 }
772 return 0;
773
774cleanup_ret:
775 /* Reserve error: Cleanup partial reservation and return error */
776 for (j = 0; j < i; j += PAGE_SIZE) {
777 paddr = base_paddr + j;
778 free_pfn_range(paddr, PAGE_SIZE);
779 }
780
781 return retval;
782}
783
784/*
785 * untrack_pfn_vma is called while unmapping a pfnmap for a region.
786 * untrack can be called for a specific region indicated by pfn and size or
787 * can be for the entire vma (in which case size can be zero).
788 */
789void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
790 unsigned long size)
791{
792 unsigned long i;
793 resource_size_t paddr;
794 unsigned long prot;
795 unsigned long vma_start = vma->vm_start;
796 unsigned long vma_end = vma->vm_end;
797 unsigned long vma_size = vma_end - vma_start;
798
799 if (!pat_enabled)
800 return;
801
802 if (is_linear_pfn_mapping(vma)) {
803 /* free the whole chunk starting from vm_pgoff */
804 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
805 free_pfn_range(paddr, vma_size);
806 return;
807 }
808
809 if (size != 0 && size != vma_size) {
810 /* free page by page, using pfn and size */
811 paddr = (resource_size_t)pfn << PAGE_SHIFT;
812 for (i = 0; i < size; i += PAGE_SIZE) {
813 paddr = paddr + i;
814 free_pfn_range(paddr, PAGE_SIZE);
815 }
816 } else {
817 /* free entire vma, page by page, using the pfn from pte */
818 for (i = 0; i < vma_size; i += PAGE_SIZE) {
819 if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
820 continue;
821
822 free_pfn_range(paddr, PAGE_SIZE);
823 }
824 }
825}
826
827pgprot_t pgprot_writecombine(pgprot_t prot)
828{
829 if (pat_enabled)
830 return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
831 else
832 return pgprot_noncached(prot);
833}
834
599#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) 835#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
600 836
601/* get Nth element of the linked list */ 837/* get Nth element of the linked list */
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 51c0a2fc14fe..09737c8af074 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -382,7 +382,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
382 if (!node_online(i)) 382 if (!node_online(i))
383 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 383 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
384 384
385 for (i = 0; i < NR_CPUS; i++) { 385 for (i = 0; i < nr_cpu_ids; i++) {
386 int node = early_cpu_to_node(i); 386 int node = early_cpu_to_node(i);
387 387
388 if (node == NUMA_NO_NODE) 388 if (node == NUMA_NO_NODE)
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 509513760a6e..98658f25f542 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -65,11 +65,13 @@ static unsigned long reset_value[NUM_COUNTERS];
65#define IBS_FETCH_BEGIN 3 65#define IBS_FETCH_BEGIN 3
66#define IBS_OP_BEGIN 4 66#define IBS_OP_BEGIN 4
67 67
68/* The function interface needs to be fixed, something like add 68/*
69 data. Should then be added to linux/oprofile.h. */ 69 * The function interface needs to be fixed, something like add
70 * data. Should then be added to linux/oprofile.h.
71 */
70extern void 72extern void
71oprofile_add_ibs_sample(struct pt_regs *const regs, 73oprofile_add_ibs_sample(struct pt_regs * const regs,
72 unsigned int *const ibs_sample, int ibs_code); 74 unsigned int * const ibs_sample, int ibs_code);
73 75
74struct ibs_fetch_sample { 76struct ibs_fetch_sample {
75 /* MSRC001_1031 IBS Fetch Linear Address Register */ 77 /* MSRC001_1031 IBS Fetch Linear Address Register */
@@ -104,11 +106,6 @@ struct ibs_op_sample {
104 unsigned int ibs_dc_phys_high; 106 unsigned int ibs_dc_phys_high;
105}; 107};
106 108
107/*
108 * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+
109*/
110static void clear_ibs_nmi(void);
111
112static int ibs_allowed; /* AMD Family10h and later */ 109static int ibs_allowed; /* AMD Family10h and later */
113 110
114struct op_ibs_config { 111struct op_ibs_config {
@@ -223,7 +220,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
223 (unsigned int *)&ibs_fetch, 220 (unsigned int *)&ibs_fetch,
224 IBS_FETCH_BEGIN); 221 IBS_FETCH_BEGIN);
225 222
226 /*reenable the IRQ */ 223 /* reenable the IRQ */
227 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); 224 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
228 high &= ~IBS_FETCH_HIGH_VALID_BIT; 225 high &= ~IBS_FETCH_HIGH_VALID_BIT;
229 high |= IBS_FETCH_HIGH_ENABLE; 226 high |= IBS_FETCH_HIGH_ENABLE;
@@ -331,8 +328,10 @@ static void op_amd_stop(struct op_msrs const * const msrs)
331 unsigned int low, high; 328 unsigned int low, high;
332 int i; 329 int i;
333 330
334 /* Subtle: stop on all counters to avoid race with 331 /*
335 * setting our pm callback */ 332 * Subtle: stop on all counters to avoid race with setting our
333 * pm callback
334 */
336 for (i = 0 ; i < NUM_COUNTERS ; ++i) { 335 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
337 if (!reset_value[i]) 336 if (!reset_value[i])
338 continue; 337 continue;
@@ -343,13 +342,15 @@ static void op_amd_stop(struct op_msrs const * const msrs)
343 342
344#ifdef CONFIG_OPROFILE_IBS 343#ifdef CONFIG_OPROFILE_IBS
345 if (ibs_allowed && ibs_config.fetch_enabled) { 344 if (ibs_allowed && ibs_config.fetch_enabled) {
346 low = 0; /* clear max count and enable */ 345 /* clear max count and enable */
346 low = 0;
347 high = 0; 347 high = 0;
348 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); 348 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
349 } 349 }
350 350
351 if (ibs_allowed && ibs_config.op_enabled) { 351 if (ibs_allowed && ibs_config.op_enabled) {
352 low = 0; /* clear max count and enable */ 352 /* clear max count and enable */
353 low = 0;
353 high = 0; 354 high = 0;
354 wrmsr(MSR_AMD64_IBSOPCTL, low, high); 355 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
355 } 356 }
@@ -370,18 +371,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
370 } 371 }
371} 372}
372 373
373#ifndef CONFIG_OPROFILE_IBS 374#ifdef CONFIG_OPROFILE_IBS
374
375/* no IBS support */
376
377static int op_amd_init(struct oprofile_operations *ops)
378{
379 return 0;
380}
381
382static void op_amd_exit(void) {}
383
384#else
385 375
386static u8 ibs_eilvt_off; 376static u8 ibs_eilvt_off;
387 377
@@ -395,7 +385,7 @@ static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
395 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); 385 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
396} 386}
397 387
398static int pfm_amd64_setup_eilvt(void) 388static int init_ibs_nmi(void)
399{ 389{
400#define IBSCTL_LVTOFFSETVAL (1 << 8) 390#define IBSCTL_LVTOFFSETVAL (1 << 8)
401#define IBSCTL 0x1cc 391#define IBSCTL 0x1cc
@@ -443,18 +433,22 @@ static int pfm_amd64_setup_eilvt(void)
443 return 0; 433 return 0;
444} 434}
445 435
446/* 436/* uninitialize the APIC for the IBS interrupts if needed */
447 * initialize the APIC for the IBS interrupts 437static void clear_ibs_nmi(void)
448 * if available (AMD Family10h rev B0 and later) 438{
449 */ 439 if (ibs_allowed)
450static void setup_ibs(void) 440 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
441}
442
443/* initialize the APIC for the IBS interrupts if available */
444static void ibs_init(void)
451{ 445{
452 ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); 446 ibs_allowed = boot_cpu_has(X86_FEATURE_IBS);
453 447
454 if (!ibs_allowed) 448 if (!ibs_allowed)
455 return; 449 return;
456 450
457 if (pfm_amd64_setup_eilvt()) { 451 if (init_ibs_nmi()) {
458 ibs_allowed = 0; 452 ibs_allowed = 0;
459 return; 453 return;
460 } 454 }
@@ -462,14 +456,12 @@ static void setup_ibs(void)
462 printk(KERN_INFO "oprofile: AMD IBS detected\n"); 456 printk(KERN_INFO "oprofile: AMD IBS detected\n");
463} 457}
464 458
465 459static void ibs_exit(void)
466/*
467 * unitialize the APIC for the IBS interrupts if needed on AMD Family10h
468 * rev B0 and later */
469static void clear_ibs_nmi(void)
470{ 460{
471 if (ibs_allowed) 461 if (!ibs_allowed)
472 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); 462 return;
463
464 clear_ibs_nmi();
473} 465}
474 466
475static int (*create_arch_files)(struct super_block *sb, struct dentry *root); 467static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
@@ -519,7 +511,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
519 511
520static int op_amd_init(struct oprofile_operations *ops) 512static int op_amd_init(struct oprofile_operations *ops)
521{ 513{
522 setup_ibs(); 514 ibs_init();
523 create_arch_files = ops->create_files; 515 create_arch_files = ops->create_files;
524 ops->create_files = setup_ibs_files; 516 ops->create_files = setup_ibs_files;
525 return 0; 517 return 0;
@@ -527,10 +519,21 @@ static int op_amd_init(struct oprofile_operations *ops)
527 519
528static void op_amd_exit(void) 520static void op_amd_exit(void)
529{ 521{
530 clear_ibs_nmi(); 522 ibs_exit();
531} 523}
532 524
533#endif 525#else
526
527/* no IBS support */
528
529static int op_amd_init(struct oprofile_operations *ops)
530{
531 return 0;
532}
533
534static void op_amd_exit(void) {}
535
536#endif /* CONFIG_OPROFILE_IBS */
534 537
535struct op_x86_model_spec const op_amd_spec = { 538struct op_x86_model_spec const op_amd_spec = {
536 .init = op_amd_init, 539 .init = op_amd_init,
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 1d88d2b39771..9e5752fe4d15 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -4,7 +4,7 @@
4#include <linux/irq.h> 4#include <linux/irq.h>
5#include <linux/dmi.h> 5#include <linux/dmi.h>
6#include <asm/numa.h> 6#include <asm/numa.h>
7#include "pci.h" 7#include <asm/pci_x86.h>
8 8
9struct pci_root_info { 9struct pci_root_info {
10 char *name; 10 char *name;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 22e057665e55..9bb09823b362 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -2,7 +2,7 @@
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/topology.h> 3#include <linux/topology.h>
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include "pci.h" 5#include <asm/pci_x86.h>
6 6
7#ifdef CONFIG_X86_64 7#ifdef CONFIG_X86_64
8#include <asm/pci-direct.h> 8#include <asm/pci-direct.h>
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index b67732bbb85a..62ddb73e09ed 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -14,8 +14,7 @@
14#include <asm/segment.h> 14#include <asm/segment.h>
15#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/smp.h> 16#include <asm/smp.h>
17 17#include <asm/pci_x86.h>
18#include "pci.h"
19 18
20unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | 19unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
21 PCI_PROBE_MMCONF; 20 PCI_PROBE_MMCONF;
@@ -23,6 +22,12 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
23unsigned int pci_early_dump_regs; 22unsigned int pci_early_dump_regs;
24static int pci_bf_sort; 23static int pci_bf_sort;
25int pci_routeirq; 24int pci_routeirq;
25int noioapicquirk;
26#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
27int noioapicreroute = 0;
28#else
29int noioapicreroute = 1;
30#endif
26int pcibios_last_bus = -1; 31int pcibios_last_bus = -1;
27unsigned long pirq_table_addr; 32unsigned long pirq_table_addr;
28struct pci_bus *pci_root_bus; 33struct pci_bus *pci_root_bus;
@@ -519,6 +524,17 @@ char * __devinit pcibios_setup(char *str)
519 } else if (!strcmp(str, "skip_isa_align")) { 524 } else if (!strcmp(str, "skip_isa_align")) {
520 pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; 525 pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
521 return NULL; 526 return NULL;
527 } else if (!strcmp(str, "noioapicquirk")) {
528 noioapicquirk = 1;
529 return NULL;
530 } else if (!strcmp(str, "ioapicreroute")) {
531 if (noioapicreroute != -1)
532 noioapicreroute = 0;
533 return NULL;
534 } else if (!strcmp(str, "noioapicreroute")) {
535 if (noioapicreroute != -1)
536 noioapicreroute = 1;
537 return NULL;
522 } 538 }
523 return str; 539 return str;
524} 540}
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 9915293500fb..bd13c3e4c6db 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -5,7 +5,7 @@
5#include <linux/pci.h> 5#include <linux/pci.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/dmi.h> 7#include <linux/dmi.h>
8#include "pci.h" 8#include <asm/pci_x86.h>
9 9
10/* 10/*
11 * Functions for accessing PCI base (first 256 bytes) and extended 11 * Functions for accessing PCI base (first 256 bytes) and extended
@@ -173,7 +173,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
173 173
174#undef PCI_CONF2_ADDRESS 174#undef PCI_CONF2_ADDRESS
175 175
176static struct pci_raw_ops pci_direct_conf2 = { 176struct pci_raw_ops pci_direct_conf2 = {
177 .read = pci_conf2_read, 177 .read = pci_conf2_read,
178 .write = pci_conf2_write, 178 .write = pci_conf2_write,
179}; 179};
@@ -289,6 +289,7 @@ int __init pci_direct_probe(void)
289 289
290 if (pci_check_type1()) { 290 if (pci_check_type1()) {
291 raw_pci_ops = &pci_direct_conf1; 291 raw_pci_ops = &pci_direct_conf1;
292 port_cf9_safe = true;
292 return 1; 293 return 1;
293 } 294 }
294 release_resource(region); 295 release_resource(region);
@@ -305,6 +306,7 @@ int __init pci_direct_probe(void)
305 306
306 if (pci_check_type2()) { 307 if (pci_check_type2()) {
307 raw_pci_ops = &pci_direct_conf2; 308 raw_pci_ops = &pci_direct_conf2;
309 port_cf9_safe = true;
308 return 2; 310 return 2;
309 } 311 }
310 312
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index 86631ccbc25a..f6adf2c6d751 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -2,7 +2,7 @@
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <asm/pci-direct.h> 3#include <asm/pci-direct.h>
4#include <asm/io.h> 4#include <asm/io.h>
5#include "pci.h" 5#include <asm/pci_x86.h>
6 6
7/* Direct PCI access. This is used for PCI accesses in early boot before 7/* Direct PCI access. This is used for PCI accesses in early boot before
8 the PCI subsystem works. */ 8 the PCI subsystem works. */
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 2051dc96b8e9..7d388d5cf548 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -6,8 +6,7 @@
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/pci.h> 7#include <linux/pci.h>
8#include <linux/init.h> 8#include <linux/init.h>
9#include "pci.h" 9#include <asm/pci_x86.h>
10
11 10
12static void __devinit pci_fixup_i450nx(struct pci_dev *d) 11static void __devinit pci_fixup_i450nx(struct pci_dev *d)
13{ 12{
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 844df0cbbd3e..e51bf2cda4b0 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -34,8 +34,8 @@
34 34
35#include <asm/pat.h> 35#include <asm/pat.h>
36#include <asm/e820.h> 36#include <asm/e820.h>
37#include <asm/pci_x86.h>
37 38
38#include "pci.h"
39 39
40static int 40static int
41skip_isa_ioresource_align(struct pci_dev *dev) { 41skip_isa_ioresource_align(struct pci_dev *dev) {
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index d6c950f81858..bec3b048e72b 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -1,6 +1,6 @@
1#include <linux/pci.h> 1#include <linux/pci.h>
2#include <linux/init.h> 2#include <linux/init.h>
3#include "pci.h" 3#include <asm/pci_x86.h>
4 4
5/* arch_initcall has too random ordering, so call the initializers 5/* arch_initcall has too random ordering, so call the initializers
6 in the right sequence from here. */ 6 in the right sequence from here. */
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index bf69dbe08bff..373b9afe6d44 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -16,8 +16,7 @@
16#include <asm/io_apic.h> 16#include <asm/io_apic.h>
17#include <linux/irq.h> 17#include <linux/irq.h>
18#include <linux/acpi.h> 18#include <linux/acpi.h>
19 19#include <asm/pci_x86.h>
20#include "pci.h"
21 20
22#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) 21#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
23#define PIRQ_VERSION 0x0100 22#define PIRQ_VERSION 0x0100
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index b722dd481b39..f1065b129e9c 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -3,7 +3,7 @@
3 */ 3 */
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/pci.h> 5#include <linux/pci.h>
6#include "pci.h" 6#include <asm/pci_x86.h>
7 7
8/* 8/*
9 * Discover remaining PCI buses in case there are peer host bridges. 9 * Discover remaining PCI buses in case there are peer host bridges.
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 654a2234f8f3..89bf9242c80a 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -15,8 +15,7 @@
15#include <linux/acpi.h> 15#include <linux/acpi.h>
16#include <linux/bitmap.h> 16#include <linux/bitmap.h>
17#include <asm/e820.h> 17#include <asm/e820.h>
18 18#include <asm/pci_x86.h>
19#include "pci.h"
20 19
21/* aperture is up to 256MB but BIOS may reserve less */ 20/* aperture is up to 256MB but BIOS may reserve less */
22#define MMCONFIG_APER_MIN (2 * 1024*1024) 21#define MMCONFIG_APER_MIN (2 * 1024*1024)
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index f3c761dce695..8b2d561046a3 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -13,7 +13,7 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/acpi.h> 14#include <linux/acpi.h>
15#include <asm/e820.h> 15#include <asm/e820.h>
16#include "pci.h" 16#include <asm/pci_x86.h>
17 17
18/* Assume systems with more busses have correct MCFG */ 18/* Assume systems with more busses have correct MCFG */
19#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) 19#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index a1994163c99d..30007ffc8e11 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -10,8 +10,7 @@
10#include <linux/acpi.h> 10#include <linux/acpi.h>
11#include <linux/bitmap.h> 11#include <linux/bitmap.h>
12#include <asm/e820.h> 12#include <asm/e820.h>
13 13#include <asm/pci_x86.h>
14#include "pci.h"
15 14
16/* Static virtual mapping of the MMCONFIG aperture */ 15/* Static virtual mapping of the MMCONFIG aperture */
17struct mmcfg_virt { 16struct mmcfg_virt {
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 1177845d3186..2089354968a2 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -7,7 +7,7 @@
7#include <linux/nodemask.h> 7#include <linux/nodemask.h>
8#include <mach_apic.h> 8#include <mach_apic.h>
9#include <asm/mpspec.h> 9#include <asm/mpspec.h>
10#include "pci.h" 10#include <asm/pci_x86.h>
11 11
12#define XQUAD_PORTIO_BASE 0xfe400000 12#define XQUAD_PORTIO_BASE 0xfe400000
13#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ 13#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index e11e9e803d5f..b889d824f7c6 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -29,7 +29,7 @@
29#include <linux/init.h> 29#include <linux/init.h>
30#include <asm/olpc.h> 30#include <asm/olpc.h>
31#include <asm/geode.h> 31#include <asm/geode.h>
32#include "pci.h" 32#include <asm/pci_x86.h>
33 33
34/* 34/*
35 * In the tables below, the first two line (8 longwords) are the 35 * In the tables below, the first two line (8 longwords) are the
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 37472fc6f729..b82cae970dfd 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -6,9 +6,8 @@
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/uaccess.h> 8#include <linux/uaccess.h>
9#include "pci.h" 9#include <asm/pci_x86.h>
10#include "pci-functions.h" 10#include <asm/mach-default/pci-functions.h>
11
12 11
13/* BIOS32 signature: "_32_" */ 12/* BIOS32 signature: "_32_" */
14#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) 13#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index 42f4cb19faca..16d0c0eb0d19 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -9,11 +9,10 @@
9#include <linux/init.h> 9#include <linux/init.h>
10 10
11#include <asm/setup.h> 11#include <asm/setup.h>
12#include <asm/pci_x86.h>
12#include <asm/visws/cobalt.h> 13#include <asm/visws/cobalt.h>
13#include <asm/visws/lithium.h> 14#include <asm/visws/lithium.h>
14 15
15#include "pci.h"
16
17static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; } 16static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; }
18static void pci_visws_disable_irq(struct pci_dev *dev) { } 17static void pci_visws_disable_irq(struct pci_dev *dev) { }
19 18
diff --git a/arch/x86/scripts/strip-symbols b/arch/x86/scripts/strip-symbols
new file mode 100644
index 000000000000..a2f1ccb827c7
--- /dev/null
+++ b/arch/x86/scripts/strip-symbols
@@ -0,0 +1 @@
__cpu_vendor_dev_X86_VENDOR_*
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 1ef0f90813d6..d9d35824c56f 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -9,6 +9,9 @@
9 * Also alternative() doesn't work. 9 * Also alternative() doesn't work.
10 */ 10 */
11 11
12/* Disable profiling for userspace code: */
13#define DISABLE_BRANCH_PROFILING
14
12#include <linux/kernel.h> 15#include <linux/kernel.h>
13#include <linux/posix-timers.h> 16#include <linux/posix-timers.h>
14#include <linux/time.h> 17#include <linux/time.h>
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 513f330c5832..1241f118ab56 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -310,7 +310,7 @@ int __init sysenter_setup(void)
310} 310}
311 311
312/* Setup a VMA at program startup for the vsyscall page */ 312/* Setup a VMA at program startup for the vsyscall page */
313int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) 313int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
314{ 314{
315 struct mm_struct *mm = current->mm; 315 struct mm_struct *mm = current->mm;
316 unsigned long addr; 316 unsigned long addr;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 257ba4a10abf..9c98cc6ba978 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -98,7 +98,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
98 98
99/* Setup a VMA at program startup for the vsyscall page. 99/* Setup a VMA at program startup for the vsyscall page.
100 Not called for compat tasks */ 100 Not called for compat tasks */
101int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) 101int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
102{ 102{
103 struct mm_struct *mm = current->mm; 103 struct mm_struct *mm = current->mm;
104 unsigned long addr; 104 unsigned long addr;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5e4686d70f62..bea215230b20 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -28,6 +28,7 @@
28#include <linux/console.h> 28#include <linux/console.h>
29 29
30#include <xen/interface/xen.h> 30#include <xen/interface/xen.h>
31#include <xen/interface/version.h>
31#include <xen/interface/physdev.h> 32#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h> 33#include <xen/interface/vcpu.h>
33#include <xen/features.h> 34#include <xen/features.h>
@@ -793,7 +794,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
793 794
794 ret = 0; 795 ret = 0;
795 796
796 switch(msr) { 797 switch (msr) {
797#ifdef CONFIG_X86_64 798#ifdef CONFIG_X86_64
798 unsigned which; 799 unsigned which;
799 u64 base; 800 u64 base;
@@ -1453,7 +1454,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1453 1454
1454 ident_pte = 0; 1455 ident_pte = 0;
1455 pfn = 0; 1456 pfn = 0;
1456 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { 1457 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1457 pte_t *pte_page; 1458 pte_t *pte_page;
1458 1459
1459 /* Reuse or allocate a page of ptes */ 1460 /* Reuse or allocate a page of ptes */
@@ -1471,7 +1472,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1471 } 1472 }
1472 1473
1473 /* Install mappings */ 1474 /* Install mappings */
1474 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { 1475 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1475 pte_t pte; 1476 pte_t pte;
1476 1477
1477 if (pfn > max_pfn_mapped) 1478 if (pfn > max_pfn_mapped)
@@ -1485,7 +1486,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1485 } 1486 }
1486 } 1487 }
1487 1488
1488 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) 1489 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1489 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); 1490 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1490 1491
1491 set_page_prot(pmd, PAGE_KERNEL_RO); 1492 set_page_prot(pmd, PAGE_KERNEL_RO);
@@ -1499,7 +1500,7 @@ static void convert_pfn_mfn(void *v)
1499 1500
1500 /* All levels are converted the same way, so just treat them 1501 /* All levels are converted the same way, so just treat them
1501 as ptes. */ 1502 as ptes. */
1502 for(i = 0; i < PTRS_PER_PTE; i++) 1503 for (i = 0; i < PTRS_PER_PTE; i++)
1503 pte[i] = xen_make_pte(pte[i].pte); 1504 pte[i] = xen_make_pte(pte[i].pte);
1504} 1505}
1505 1506
@@ -1514,7 +1515,8 @@ static void convert_pfn_mfn(void *v)
1514 * of the physical mapping once some sort of allocator has been set 1515 * of the physical mapping once some sort of allocator has been set
1515 * up. 1516 * up.
1516 */ 1517 */
1517static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1518static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1519 unsigned long max_pfn)
1518{ 1520{
1519 pud_t *l3; 1521 pud_t *l3;
1520 pmd_t *l2; 1522 pmd_t *l2;
@@ -1577,7 +1579,8 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf
1577#else /* !CONFIG_X86_64 */ 1579#else /* !CONFIG_X86_64 */
1578static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; 1580static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1579 1581
1580static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1582static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1583 unsigned long max_pfn)
1581{ 1584{
1582 pmd_t *kernel_pmd; 1585 pmd_t *kernel_pmd;
1583 1586
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 636ef4caa52d..503c240e26c7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -154,13 +154,13 @@ void xen_setup_mfn_list_list(void)
154{ 154{
155 unsigned pfn, idx; 155 unsigned pfn, idx;
156 156
157 for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { 157 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
158 unsigned topidx = p2m_top_index(pfn); 158 unsigned topidx = p2m_top_index(pfn);
159 159
160 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); 160 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
161 } 161 }
162 162
163 for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { 163 for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
164 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; 164 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
165 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); 165 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
166 } 166 }
@@ -179,7 +179,7 @@ void __init xen_build_dynamic_phys_to_machine(void)
179 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); 179 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
180 unsigned pfn; 180 unsigned pfn;
181 181
182 for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { 182 for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
183 unsigned topidx = p2m_top_index(pfn); 183 unsigned topidx = p2m_top_index(pfn);
184 184
185 p2m_top[topidx] = &mfn_list[pfn]; 185 p2m_top[topidx] = &mfn_list[pfn];
@@ -207,7 +207,7 @@ static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
207 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); 207 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
208 BUG_ON(p == NULL); 208 BUG_ON(p == NULL);
209 209
210 for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++) 210 for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
211 p[i] = INVALID_P2M_ENTRY; 211 p[i] = INVALID_P2M_ENTRY;
212 212
213 if (cmpxchg(pp, p2m_missing, p) != p2m_missing) 213 if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
@@ -407,7 +407,8 @@ out:
407 preempt_enable(); 407 preempt_enable();
408} 408}
409 409
410pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 410pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
411 unsigned long addr, pte_t *ptep)
411{ 412{
412 /* Just return the pte as-is. We preserve the bits on commit */ 413 /* Just return the pte as-is. We preserve the bits on commit */
413 return *ptep; 414 return *ptep;
@@ -878,7 +879,8 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
878 879
879 if (user_pgd) { 880 if (user_pgd) {
880 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); 881 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
881 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); 882 xen_do_pin(MMUEXT_PIN_L4_TABLE,
883 PFN_DOWN(__pa(user_pgd)));
882 } 884 }
883 } 885 }
884#else /* CONFIG_X86_32 */ 886#else /* CONFIG_X86_32 */
@@ -993,7 +995,8 @@ static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
993 pgd_t *user_pgd = xen_get_user_pgd(pgd); 995 pgd_t *user_pgd = xen_get_user_pgd(pgd);
994 996
995 if (user_pgd) { 997 if (user_pgd) {
996 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); 998 xen_do_pin(MMUEXT_UNPIN_TABLE,
999 PFN_DOWN(__pa(user_pgd)));
997 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); 1000 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
998 } 1001 }
999 } 1002 }
@@ -1079,7 +1082,7 @@ static void drop_other_mm_ref(void *info)
1079 1082
1080static void xen_drop_mm_ref(struct mm_struct *mm) 1083static void xen_drop_mm_ref(struct mm_struct *mm)
1081{ 1084{
1082 cpumask_t mask; 1085 cpumask_var_t mask;
1083 unsigned cpu; 1086 unsigned cpu;
1084 1087
1085 if (current->active_mm == mm) { 1088 if (current->active_mm == mm) {
@@ -1091,7 +1094,16 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1091 } 1094 }
1092 1095
1093 /* Get the "official" set of cpus referring to our pagetable. */ 1096 /* Get the "official" set of cpus referring to our pagetable. */
1094 mask = mm->cpu_vm_mask; 1097 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1098 for_each_online_cpu(cpu) {
1099 if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
1100 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1101 continue;
1102 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1103 }
1104 return;
1105 }
1106 cpumask_copy(mask, &mm->cpu_vm_mask);
1095 1107
1096 /* It's possible that a vcpu may have a stale reference to our 1108 /* It's possible that a vcpu may have a stale reference to our
1097 cr3, because its in lazy mode, and it hasn't yet flushed 1109 cr3, because its in lazy mode, and it hasn't yet flushed
@@ -1100,11 +1112,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1100 if needed. */ 1112 if needed. */
1101 for_each_online_cpu(cpu) { 1113 for_each_online_cpu(cpu) {
1102 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) 1114 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1103 cpu_set(cpu, mask); 1115 cpumask_set_cpu(cpu, mask);
1104 } 1116 }
1105 1117
1106 if (!cpus_empty(mask)) 1118 if (!cpumask_empty(mask))
1107 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 1119 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1120 free_cpumask_var(mask);
1108} 1121}
1109#else 1122#else
1110static void xen_drop_mm_ref(struct mm_struct *mm) 1123static void xen_drop_mm_ref(struct mm_struct *mm)
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 8ea8a0d0b0de..c738644b5435 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -154,7 +154,7 @@ void xen_mc_flush(void)
154 ret, smp_processor_id()); 154 ret, smp_processor_id());
155 dump_stack(); 155 dump_stack();
156 for (i = 0; i < b->mcidx; i++) { 156 for (i = 0; i < b->mcidx; i++) {
157 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 157 printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
158 i+1, b->mcidx, 158 i+1, b->mcidx,
159 b->debug[i].op, 159 b->debug[i].op,
160 b->debug[i].args[0], 160 b->debug[i].args[0],
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index d67901083888..15c6c68db6a2 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -28,6 +28,9 @@
28/* These are code, but not functions. Defined in entry.S */ 28/* These are code, but not functions. Defined in entry.S */
29extern const char xen_hypervisor_callback[]; 29extern const char xen_hypervisor_callback[];
30extern const char xen_failsafe_callback[]; 30extern const char xen_failsafe_callback[];
31extern void xen_sysenter_target(void);
32extern void xen_syscall_target(void);
33extern void xen_syscall32_target(void);
31 34
32 35
33/** 36/**
@@ -110,7 +113,6 @@ static __cpuinit int register_callback(unsigned type, const void *func)
110 113
111void __cpuinit xen_enable_sysenter(void) 114void __cpuinit xen_enable_sysenter(void)
112{ 115{
113 extern void xen_sysenter_target(void);
114 int ret; 116 int ret;
115 unsigned sysenter_feature; 117 unsigned sysenter_feature;
116 118
@@ -132,8 +134,6 @@ void __cpuinit xen_enable_syscall(void)
132{ 134{
133#ifdef CONFIG_X86_64 135#ifdef CONFIG_X86_64
134 int ret; 136 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137 137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); 138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) { 139 if (ret != 0) {
@@ -160,7 +160,8 @@ void __init xen_arch_setup(void)
160 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); 160 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
161 161
162 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable,
164 VMASST_TYPE_pae_extended_cr3);
164 165
165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || 166 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) 167 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index acd9b6705e02..c44e2069c7c7 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -33,7 +33,7 @@
33#include "xen-ops.h" 33#include "xen-ops.h"
34#include "mmu.h" 34#include "mmu.h"
35 35
36cpumask_t xen_cpu_initialized_map; 36cpumask_var_t xen_cpu_initialized_map;
37 37
38static DEFINE_PER_CPU(int, resched_irq); 38static DEFINE_PER_CPU(int, resched_irq);
39static DEFINE_PER_CPU(int, callfunc_irq); 39static DEFINE_PER_CPU(int, callfunc_irq);
@@ -158,7 +158,7 @@ static void __init xen_fill_possible_map(void)
158{ 158{
159 int i, rc; 159 int i, rc;
160 160
161 for (i = 0; i < NR_CPUS; i++) { 161 for (i = 0; i < nr_cpu_ids; i++) {
162 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 162 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
163 if (rc >= 0) { 163 if (rc >= 0) {
164 num_processors++; 164 num_processors++;
@@ -192,11 +192,14 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
192 if (xen_smp_intr_init(0)) 192 if (xen_smp_intr_init(0))
193 BUG(); 193 BUG();
194 194
195 xen_cpu_initialized_map = cpumask_of_cpu(0); 195 if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
196 panic("could not allocate xen_cpu_initialized_map\n");
197
198 cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
196 199
197 /* Restrict the possible_map according to max_cpus. */ 200 /* Restrict the possible_map according to max_cpus. */
198 while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { 201 while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
199 for (cpu = NR_CPUS - 1; !cpu_possible(cpu); cpu--) 202 for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
200 continue; 203 continue;
201 cpu_clear(cpu, cpu_possible_map); 204 cpu_clear(cpu, cpu_possible_map);
202 } 205 }
@@ -221,7 +224,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
221 struct vcpu_guest_context *ctxt; 224 struct vcpu_guest_context *ctxt;
222 struct desc_struct *gdt; 225 struct desc_struct *gdt;
223 226
224 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 227 if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
225 return 0; 228 return 0;
226 229
227 ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); 230 ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
@@ -408,24 +411,23 @@ static void xen_smp_send_reschedule(int cpu)
408 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 411 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
409} 412}
410 413
411static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) 414static void xen_send_IPI_mask(const struct cpumask *mask,
415 enum ipi_vector vector)
412{ 416{
413 unsigned cpu; 417 unsigned cpu;
414 418
415 cpus_and(mask, mask, cpu_online_map); 419 for_each_cpu_and(cpu, mask, cpu_online_mask)
416
417 for_each_cpu_mask_nr(cpu, mask)
418 xen_send_IPI_one(cpu, vector); 420 xen_send_IPI_one(cpu, vector);
419} 421}
420 422
421static void xen_smp_send_call_function_ipi(cpumask_t mask) 423static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
422{ 424{
423 int cpu; 425 int cpu;
424 426
425 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 427 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
426 428
427 /* Make sure other vcpus get a chance to run if they need to. */ 429 /* Make sure other vcpus get a chance to run if they need to. */
428 for_each_cpu_mask_nr(cpu, mask) { 430 for_each_cpu(cpu, mask) {
429 if (xen_vcpu_stolen(cpu)) { 431 if (xen_vcpu_stolen(cpu)) {
430 HYPERVISOR_sched_op(SCHEDOP_yield, 0); 432 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
431 break; 433 break;
@@ -435,7 +437,8 @@ static void xen_smp_send_call_function_ipi(cpumask_t mask)
435 437
436static void xen_smp_send_call_function_single_ipi(int cpu) 438static void xen_smp_send_call_function_single_ipi(int cpu)
437{ 439{
438 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); 440 xen_send_IPI_mask(cpumask_of(cpu),
441 XEN_CALL_FUNCTION_SINGLE_VECTOR);
439} 442}
440 443
441static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) 444static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 2a234db5949b..212ffe012b76 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -35,7 +35,8 @@ void xen_post_suspend(int suspend_cancelled)
35 pfn_to_mfn(xen_start_info->console.domU.mfn); 35 pfn_to_mfn(xen_start_info->console.domU.mfn);
36 } else { 36 } else {
37#ifdef CONFIG_SMP 37#ifdef CONFIG_SMP
38 xen_cpu_initialized_map = cpu_online_map; 38 BUG_ON(xen_cpu_initialized_map == NULL);
39 cpumask_copy(xen_cpu_initialized_map, cpu_online_mask);
39#endif 40#endif
40 xen_vcpu_restore(); 41 xen_vcpu_restore();
41 } 42 }
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index c9f7cda48ed7..14f240623497 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -132,8 +132,7 @@ static void do_stolen_accounting(void)
132 *snap = state; 132 *snap = state;
133 133
134 /* Add the appropriate number of ticks of stolen time, 134 /* Add the appropriate number of ticks of stolen time,
135 including any left-overs from last time. Passing NULL to 135 including any left-overs from last time. */
136 account_steal_time accounts the time as stolen. */
137 stolen = runnable + offline + __get_cpu_var(residual_stolen); 136 stolen = runnable + offline + __get_cpu_var(residual_stolen);
138 137
139 if (stolen < 0) 138 if (stolen < 0)
@@ -141,11 +140,10 @@ static void do_stolen_accounting(void)
141 140
142 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); 141 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
143 __get_cpu_var(residual_stolen) = stolen; 142 __get_cpu_var(residual_stolen) = stolen;
144 account_steal_time(NULL, ticks); 143 account_steal_ticks(ticks);
145 144
146 /* Add the appropriate number of ticks of blocked time, 145 /* Add the appropriate number of ticks of blocked time,
147 including any left-overs from last time. Passing idle to 146 including any left-overs from last time. */
148 account_steal_time accounts the time as idle/wait. */
149 blocked += __get_cpu_var(residual_blocked); 147 blocked += __get_cpu_var(residual_blocked);
150 148
151 if (blocked < 0) 149 if (blocked < 0)
@@ -153,7 +151,7 @@ static void do_stolen_accounting(void)
153 151
154 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); 152 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
155 __get_cpu_var(residual_blocked) = blocked; 153 __get_cpu_var(residual_blocked) = blocked;
156 account_steal_time(idle_task(smp_processor_id()), ticks); 154 account_idle_ticks(ticks);
157} 155}
158 156
159/* 157/*
@@ -437,7 +435,7 @@ void xen_setup_timer(int cpu)
437 evt = &per_cpu(xen_clock_events, cpu); 435 evt = &per_cpu(xen_clock_events, cpu);
438 memcpy(evt, xen_clockevent, sizeof(*evt)); 436 memcpy(evt, xen_clockevent, sizeof(*evt));
439 437
440 evt->cpumask = cpumask_of_cpu(cpu); 438 evt->cpumask = cpumask_of(cpu);
441 evt->irq = irq; 439 evt->irq = irq;
442 440
443 setup_runstate_info(cpu); 441 setup_runstate_info(cpu);
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9e1afae8461f..c1f8faf0a2c5 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -58,7 +58,7 @@ void __init xen_init_spinlocks(void);
58__cpuinit void xen_init_lock_cpu(int cpu); 58__cpuinit void xen_init_lock_cpu(int cpu);
59void xen_uninit_lock_cpu(int cpu); 59void xen_uninit_lock_cpu(int cpu);
60 60
61extern cpumask_t xen_cpu_initialized_map; 61extern cpumask_var_t xen_cpu_initialized_map;
62#else 62#else
63static inline void xen_smp_init(void) {} 63static inline void xen_smp_init(void) {}
64#endif 64#endif