aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild16
-rw-r--r--arch/x86/Kconfig102
-rw-r--r--arch/x86/Kconfig.debug20
-rw-r--r--arch/x86/Makefile24
-rw-r--r--arch/x86/boot/.gitignore2
-rw-r--r--arch/x86/boot/Makefile30
-rw-r--r--arch/x86/boot/a20.c9
-rw-r--r--arch/x86/boot/apm.c76
-rw-r--r--arch/x86/boot/bioscall.S82
-rw-r--r--arch/x86/boot/boot.h48
-rw-r--r--arch/x86/boot/compressed/.gitignore3
-rw-r--r--arch/x86/boot/compressed/Makefile55
-rw-r--r--arch/x86/boot/compressed/head_32.S194
-rw-r--r--arch/x86/boot/compressed/head_64.S169
-rw-r--r--arch/x86/boot/compressed/misc.c12
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c97
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S (renamed from arch/x86/boot/compressed/vmlinux_64.lds)29
-rw-r--r--arch/x86/boot/compressed/vmlinux.scr10
-rw-r--r--arch/x86/boot/compressed/vmlinux_32.lds43
-rw-r--r--arch/x86/boot/edd.c71
-rw-r--r--arch/x86/boot/header.S30
-rw-r--r--arch/x86/boot/main.c39
-rw-r--r--arch/x86/boot/mca.c27
-rw-r--r--arch/x86/boot/memory.c79
-rw-r--r--arch/x86/boot/regs.c29
-rw-r--r--arch/x86/boot/setup.ld6
-rw-r--r--arch/x86/boot/tty.c52
-rw-r--r--arch/x86/boot/video-bios.c27
-rw-r--r--arch/x86/boot/video-vesa.c137
-rw-r--r--arch/x86/boot/video-vga.c95
-rw-r--r--arch/x86/boot/video.c42
-rw-r--r--arch/x86/boot/video.h14
-rw-r--r--arch/x86/configs/i386_defconfig148
-rw-r--r--arch/x86/configs/x86_64_defconfig151
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S5
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c271
-rw-r--r--arch/x86/crypto/fpu.c166
-rw-r--r--arch/x86/ia32/ia32entry.S4
-rw-r--r--arch/x86/include/asm/acpi.h1
-rw-r--r--arch/x86/include/asm/alternative.h59
-rw-r--r--arch/x86/include/asm/amd_iommu.h4
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h55
-rw-r--r--arch/x86/include/asm/apic.h33
-rw-r--r--arch/x86/include/asm/apicdef.h8
-rw-r--r--arch/x86/include/asm/atomic_32.h237
-rw-r--r--arch/x86/include/asm/atomic_64.h2
-rw-r--r--arch/x86/include/asm/bitsperlong.h13
-rw-r--r--arch/x86/include/asm/boot.h15
-rw-r--r--arch/x86/include/asm/bootparam.h3
-rw-r--r--arch/x86/include/asm/cpu_debug.h101
-rw-r--r--arch/x86/include/asm/cpufeature.h9
-rw-r--r--arch/x86/include/asm/desc.h26
-rw-r--r--arch/x86/include/asm/dma-mapping.h168
-rw-r--r--arch/x86/include/asm/ds.h82
-rw-r--r--arch/x86/include/asm/entry_arch.h13
-rw-r--r--arch/x86/include/asm/hardirq.h4
-rw-r--r--arch/x86/include/asm/hw_irq.h29
-rw-r--r--arch/x86/include/asm/i387.h43
-rw-r--r--arch/x86/include/asm/i8259.h4
-rw-r--r--arch/x86/include/asm/intel_arch_perfmon.h31
-rw-r--r--arch/x86/include/asm/io_apic.h9
-rw-r--r--arch/x86/include/asm/iomap.h5
-rw-r--r--arch/x86/include/asm/iommu.h1
-rw-r--r--arch/x86/include/asm/irq_remapping.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h26
-rw-r--r--arch/x86/include/asm/k8.h13
-rw-r--r--arch/x86/include/asm/kmap_types.h23
-rw-r--r--arch/x86/include/asm/kmemcheck.h42
-rw-r--r--arch/x86/include/asm/kvm.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h45
-rw-r--r--arch/x86/include/asm/kvm_x86_emulate.h6
-rw-r--r--arch/x86/include/asm/lguest.h7
-rw-r--r--arch/x86/include/asm/lguest_hcall.h15
-rw-r--r--arch/x86/include/asm/mce.h143
-rw-r--r--arch/x86/include/asm/microcode.h25
-rw-r--r--arch/x86/include/asm/mman.h2
-rw-r--r--arch/x86/include/asm/mpspec.h15
-rw-r--r--arch/x86/include/asm/msr-index.h8
-rw-r--r--arch/x86/include/asm/msr.h30
-rw-r--r--arch/x86/include/asm/nmi.h2
-rw-r--r--arch/x86/include/asm/numa_64.h10
-rw-r--r--arch/x86/include/asm/page.h2
-rw-r--r--arch/x86/include/asm/page_32_types.h4
-rw-r--r--arch/x86/include/asm/page_64_types.h24
-rw-r--r--arch/x86/include/asm/page_types.h6
-rw-r--r--arch/x86/include/asm/paravirt.h22
-rw-r--r--arch/x86/include/asm/pci.h1
-rw-r--r--arch/x86/include/asm/pci_x86.h3
-rw-r--r--arch/x86/include/asm/perf_counter.h95
-rw-r--r--arch/x86/include/asm/pgtable.h9
-rw-r--r--arch/x86/include/asm/pgtable_32.h8
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h4
-rw-r--r--arch/x86/include/asm/pgtable_64.h11
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h8
-rw-r--r--arch/x86/include/asm/pgtable_types.h10
-rw-r--r--arch/x86/include/asm/processor.h47
-rw-r--r--arch/x86/include/asm/ptrace.h9
-rw-r--r--arch/x86/include/asm/required-features.h8
-rw-r--r--arch/x86/include/asm/setup.h1
-rw-r--r--arch/x86/include/asm/signal.h2
-rw-r--r--arch/x86/include/asm/smp.h2
-rw-r--r--arch/x86/include/asm/sparsemem.h2
-rw-r--r--arch/x86/include/asm/string_32.h8
-rw-r--r--arch/x86/include/asm/string_64.h8
-rw-r--r--arch/x86/include/asm/svm.h1
-rw-r--r--arch/x86/include/asm/syscalls.h45
-rw-r--r--arch/x86/include/asm/termios.h1
-rw-r--r--arch/x86/include/asm/therm_throt.h9
-rw-r--r--arch/x86/include/asm/thread_info.h8
-rw-r--r--arch/x86/include/asm/timer.h6
-rw-r--r--arch/x86/include/asm/timex.h4
-rw-r--r--arch/x86/include/asm/tlbflush.h10
-rw-r--r--arch/x86/include/asm/topology.h3
-rw-r--r--arch/x86/include/asm/traps.h5
-rw-r--r--arch/x86/include/asm/types.h6
-rw-r--r--arch/x86/include/asm/uaccess.h2
-rw-r--r--arch/x86/include/asm/unistd_32.h2
-rw-r--r--arch/x86/include/asm/unistd_64.h5
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h2
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h6
-rw-r--r--arch/x86/include/asm/vmx.h1
-rw-r--r--arch/x86/include/asm/xor.h5
-rw-r--r--arch/x86/kernel/Makefile7
-rw-r--r--arch/x86/kernel/acpi/boot.c236
-rw-r--r--arch/x86/kernel/acpi/cstate.c16
-rw-r--r--arch/x86/kernel/acpi/processor.c13
-rw-r--r--arch/x86/kernel/acpi/realmode/Makefile3
-rw-r--r--arch/x86/kernel/acpi/realmode/bioscall.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/regs.c1
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/amd_iommu.c516
-rw-r--r--arch/x86/kernel/amd_iommu_init.c283
-rw-r--r--arch/x86/kernel/apic/apic.c318
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/es7000_32.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c923
-rw-r--r--arch/x86/kernel/apic/nmi.c4
-rw-r--r--arch/x86/kernel/apic/probe_32.c12
-rw-r--r--arch/x86/kernel/apic/probe_64.c2
-rw-r--r--arch/x86/kernel/apic/summit_32.c8
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c22
-rw-r--r--arch/x86/kernel/apm_32.c14
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/amd.c12
-rw-r--r--arch/x86/kernel/cpu/common.c41
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c431
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c191
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h11
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c60
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c93
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c1
-rw-r--r--arch/x86/kernel/cpu/intel.c29
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c153
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile13
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c45
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c127
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c218
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c2049
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.h14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c76
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c1187
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c (renamed from arch/x86/kernel/cpu/mcheck/mce_amd_64.c)203
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c (renamed from arch/x86/kernel/cpu/mcheck/mce_intel_64.c)84
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c60
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c112
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c51
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c29
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c177
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c20
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c24
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h15
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c6
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c1721
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c16
-rw-r--r--arch/x86/kernel/cpuid.c6
-rw-r--r--arch/x86/kernel/crash.c6
-rw-r--r--arch/x86/kernel/ds.c921
-rw-r--r--arch/x86/kernel/ds_selftest.c408
-rw-r--r--arch/x86/kernel/ds_selftest.h15
-rw-r--r--arch/x86/kernel/dumpstack.h1
-rw-r--r--arch/x86/kernel/e820.c46
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/efi.c31
-rw-r--r--arch/x86/kernel/entry_32.S66
-rw-r--r--arch/x86/kernel/entry_64.S42
-rw-r--r--arch/x86/kernel/ftrace.c6
-rw-r--r--arch/x86/kernel/head_32.S8
-rw-r--r--arch/x86/kernel/head_64.S1
-rw-r--r--arch/x86/kernel/hpet.c3
-rw-r--r--arch/x86/kernel/i8253.c1
-rw-r--r--arch/x86/kernel/init_task.c1
-rw-r--r--arch/x86/kernel/irq.c47
-rw-r--r--arch/x86/kernel/irqinit.c (renamed from arch/x86/kernel/irqinit_32.c)155
-rw-r--r--arch/x86/kernel/irqinit_64.c177
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/microcode_amd.c70
-rw-r--r--arch/x86/kernel/microcode_core.c330
-rw-r--r--arch/x86/kernel/microcode_intel.c90
-rw-r--r--arch/x86/kernel/module.c (renamed from arch/x86/kernel/module_64.c)82
-rw-r--r--arch/x86/kernel/module_32.c152
-rw-r--r--arch/x86/kernel/mpparse.c34
-rw-r--r--arch/x86/kernel/msr.c6
-rw-r--r--arch/x86/kernel/paravirt.c56
-rw-r--r--arch/x86/kernel/pci-calgary_64.c54
-rw-r--r--arch/x86/kernel/pci-dma.c8
-rw-r--r--arch/x86/kernel/pci-gart_64.c55
-rw-r--r--arch/x86/kernel/pci-swiotlb.c5
-rw-r--r--arch/x86/kernel/process.c22
-rw-r--r--arch/x86/kernel/process_32.c20
-rw-r--r--arch/x86/kernel/process_64.c20
-rw-r--r--arch/x86/kernel/ptrace.c284
-rw-r--r--arch/x86/kernel/quirks.c37
-rw-r--r--arch/x86/kernel/reboot.c9
-rw-r--r--arch/x86/kernel/setup.c55
-rw-r--r--arch/x86/kernel/setup_percpu.c8
-rw-r--r--arch/x86/kernel/signal.c7
-rw-r--r--arch/x86/kernel/smp.c51
-rw-r--r--arch/x86/kernel/smpboot.c24
-rw-r--r--arch/x86/kernel/stacktrace.c9
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/tlb_uv.c15
-rw-r--r--arch/x86/kernel/traps.c34
-rw-r--r--arch/x86/kernel/tsc.c36
-rw-r--r--arch/x86/kernel/tsc_sync.c14
-rw-r--r--arch/x86/kernel/vm86_32.c13
-rw-r--r--arch/x86/kernel/vmi_32.c20
-rw-r--r--arch/x86/kernel/vmlinux.lds.S432
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S229
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S298
-rw-r--r--arch/x86/kernel/vsyscall_64.c8
-rw-r--r--arch/x86/kvm/Kconfig6
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/i8254.c109
-rw-r--r--arch/x86/kvm/i8254.h12
-rw-r--r--arch/x86/kvm/irq.c7
-rw-r--r--arch/x86/kvm/kvm_timer.h18
-rw-r--r--arch/x86/kvm/lapic.c251
-rw-r--r--arch/x86/kvm/lapic.h12
-rw-r--r--arch/x86/kvm/mmu.c194
-rw-r--r--arch/x86/kvm/mmu.h5
-rw-r--r--arch/x86/kvm/paging_tmpl.h16
-rw-r--r--arch/x86/kvm/svm.c415
-rw-r--r--arch/x86/kvm/timer.c46
-rw-r--r--arch/x86/kvm/vmx.c723
-rw-r--r--arch/x86/kvm/x86.c409
-rw-r--r--arch/x86/kvm/x86.h14
-rw-r--r--arch/x86/kvm/x86_emulate.c141
-rw-r--r--arch/x86/lguest/Kconfig1
-rw-r--r--arch/x86/lguest/boot.c176
-rw-r--r--arch/x86/lguest/i386_head.S60
-rw-r--r--arch/x86/lib/Makefile2
-rw-r--r--arch/x86/lib/msr-on-cpu.c97
-rw-r--r--arch/x86/lib/msr.c183
-rw-r--r--arch/x86/lib/usercopy_64.c2
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/dump_pagetables.c7
-rw-r--r--arch/x86/mm/fault.c92
-rw-r--r--arch/x86/mm/gup.c67
-rw-r--r--arch/x86/mm/highmem_32.c2
-rw-r--r--arch/x86/mm/init.c80
-rw-r--r--arch/x86/mm/init_32.c73
-rw-r--r--arch/x86/mm/init_64.c53
-rw-r--r--arch/x86/mm/iomap_32.c1
-rw-r--r--arch/x86/mm/kmemcheck/Makefile1
-rw-r--r--arch/x86/mm/kmemcheck/error.c228
-rw-r--r--arch/x86/mm/kmemcheck/error.h15
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c640
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c106
-rw-r--r--arch/x86/mm/kmemcheck/opcode.h9
-rw-r--r--arch/x86/mm/kmemcheck/pte.c22
-rw-r--r--arch/x86/mm/kmemcheck/pte.h10
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c69
-rw-r--r--arch/x86/mm/kmemcheck/selftest.h6
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c162
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h16
-rw-r--r--arch/x86/mm/kmmio.c104
-rw-r--r--arch/x86/mm/memtest.c17
-rw-r--r--arch/x86/mm/mmio-mod.c2
-rw-r--r--arch/x86/mm/numa_64.c33
-rw-r--r--arch/x86/mm/pageattr.c16
-rw-r--r--arch/x86/mm/pgtable.c12
-rw-r--r--arch/x86/mm/srat_64.c98
-rw-r--r--arch/x86/oprofile/nmi_int.c34
-rw-r--r--arch/x86/oprofile/op_model_ppro.c10
-rw-r--r--arch/x86/pci/acpi.c33
-rw-r--r--arch/x86/pci/i386.c17
-rw-r--r--arch/x86/pci/irq.c84
-rw-r--r--arch/x86/pci/mmconfig-shared.c65
-rw-r--r--arch/x86/power/Makefile2
-rw-r--r--arch/x86/power/cpu.c (renamed from arch/x86/power/cpu_64.c)165
-rw-r--r--arch/x86/power/cpu_32.c148
-rw-r--r--arch/x86/vdso/Makefile1
-rw-r--r--arch/x86/vdso/vdso32-setup.c6
-rw-r--r--arch/x86/vdso/vma.c8
-rw-r--r--arch/x86/xen/enlighten.c65
-rw-r--r--arch/x86/xen/mmu.c23
-rw-r--r--arch/x86/xen/setup.c6
-rw-r--r--arch/x86/xen/xen-ops.h1
309 files changed, 15340 insertions, 8624 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
new file mode 100644
index 000000000000..ad8ec356fb36
--- /dev/null
+++ b/arch/x86/Kbuild
@@ -0,0 +1,16 @@
1
2obj-$(CONFIG_KVM) += kvm/
3
4# Xen paravirtualization support
5obj-$(CONFIG_XEN) += xen/
6
7# lguest paravirtualization support
8obj-$(CONFIG_LGUEST_GUEST) += lguest/
9
10obj-y += kernel/
11obj-y += mm/
12
13obj-y += crypto/
14obj-y += vdso/
15obj-$(CONFIG_IA32_EMULATION) += ia32/
16
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a6efe0a2e9ae..d1430ef6b4f9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,11 +28,13 @@ config X86
28 select HAVE_KPROBES 28 select HAVE_KPROBES
29 select ARCH_WANT_OPTIONAL_GPIOLIB 29 select ARCH_WANT_OPTIONAL_GPIOLIB
30 select ARCH_WANT_FRAME_POINTERS 30 select ARCH_WANT_FRAME_POINTERS
31 select HAVE_DMA_ATTRS
31 select HAVE_KRETPROBES 32 select HAVE_KRETPROBES
32 select HAVE_FTRACE_MCOUNT_RECORD 33 select HAVE_FTRACE_MCOUNT_RECORD
33 select HAVE_DYNAMIC_FTRACE 34 select HAVE_DYNAMIC_FTRACE
34 select HAVE_FUNCTION_TRACER 35 select HAVE_FUNCTION_TRACER
35 select HAVE_FUNCTION_GRAPH_TRACER 36 select HAVE_FUNCTION_GRAPH_TRACER
37 select HAVE_FUNCTION_GRAPH_FP_TEST
36 select HAVE_FUNCTION_TRACE_MCOUNT_TEST 38 select HAVE_FUNCTION_TRACE_MCOUNT_TEST
37 select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE 39 select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
38 select HAVE_FTRACE_SYSCALLS 40 select HAVE_FTRACE_SYSCALLS
@@ -46,6 +48,12 @@ config X86
46 select HAVE_KERNEL_GZIP 48 select HAVE_KERNEL_GZIP
47 select HAVE_KERNEL_BZIP2 49 select HAVE_KERNEL_BZIP2
48 select HAVE_KERNEL_LZMA 50 select HAVE_KERNEL_LZMA
51 select HAVE_ARCH_KMEMCHECK
52
53config OUTPUT_FORMAT
54 string
55 default "elf32-i386" if X86_32
56 default "elf64-x86-64" if X86_64
49 57
50config ARCH_DEFCONFIG 58config ARCH_DEFCONFIG
51 string 59 string
@@ -274,15 +282,9 @@ config SPARSE_IRQ
274 282
275 If you don't know what to do here, say N. 283 If you don't know what to do here, say N.
276 284
277config NUMA_MIGRATE_IRQ_DESC 285config NUMA_IRQ_DESC
278 bool "Move irq desc when changing irq smp_affinity" 286 def_bool y
279 depends on SPARSE_IRQ && NUMA 287 depends on SPARSE_IRQ && NUMA
280 depends on BROKEN
281 default n
282 ---help---
283 This enables moving irq_desc to cpu/node that irq will use handled.
284
285 If you don't know what to do here, say N.
286 288
287config X86_MPPARSE 289config X86_MPPARSE
288 bool "Enable MPS table" if ACPI 290 bool "Enable MPS table" if ACPI
@@ -355,7 +357,7 @@ config X86_UV
355 depends on X86_64 357 depends on X86_64
356 depends on X86_EXTENDED_PLATFORM 358 depends on X86_EXTENDED_PLATFORM
357 depends on NUMA 359 depends on NUMA
358 select X86_X2APIC 360 depends on X86_X2APIC
359 ---help--- 361 ---help---
360 This option is needed in order to support SGI Ultraviolet systems. 362 This option is needed in order to support SGI Ultraviolet systems.
361 If you don't have one of these, you should say N here. 363 If you don't have one of these, you should say N here.
@@ -740,6 +742,7 @@ config X86_UP_IOAPIC
740config X86_LOCAL_APIC 742config X86_LOCAL_APIC
741 def_bool y 743 def_bool y
742 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC 744 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
745 select HAVE_PERF_COUNTERS if (!M386 && !M486)
743 746
744config X86_IO_APIC 747config X86_IO_APIC
745 def_bool y 748 def_bool y
@@ -789,10 +792,26 @@ config X86_MCE
789 to disable it. MCE support simply ignores non-MCE processors like 792 to disable it. MCE support simply ignores non-MCE processors like
790 the 386 and 486, so nearly everyone can say Y here. 793 the 386 and 486, so nearly everyone can say Y here.
791 794
795config X86_OLD_MCE
796 depends on X86_32 && X86_MCE
797 bool "Use legacy machine check code (will go away)"
798 default n
799 select X86_ANCIENT_MCE
800 ---help---
801 Use the old i386 machine check code. This is merely intended for
802 testing in a transition period. Try this if you run into any machine
803 check related software problems, but report the problem to
804 linux-kernel. When in doubt say no.
805
806config X86_NEW_MCE
807 depends on X86_MCE
808 bool
809 default y if (!X86_OLD_MCE && X86_32) || X86_64
810
792config X86_MCE_INTEL 811config X86_MCE_INTEL
793 def_bool y 812 def_bool y
794 prompt "Intel MCE features" 813 prompt "Intel MCE features"
795 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 814 depends on X86_NEW_MCE && X86_LOCAL_APIC
796 ---help--- 815 ---help---
797 Additional support for intel specific MCE features such as 816 Additional support for intel specific MCE features such as
798 the thermal monitor. 817 the thermal monitor.
@@ -800,19 +819,36 @@ config X86_MCE_INTEL
800config X86_MCE_AMD 819config X86_MCE_AMD
801 def_bool y 820 def_bool y
802 prompt "AMD MCE features" 821 prompt "AMD MCE features"
803 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 822 depends on X86_NEW_MCE && X86_LOCAL_APIC
804 ---help--- 823 ---help---
805 Additional support for AMD specific MCE features such as 824 Additional support for AMD specific MCE features such as
806 the DRAM Error Threshold. 825 the DRAM Error Threshold.
807 826
827config X86_ANCIENT_MCE
828 def_bool n
829 depends on X86_32
830 prompt "Support for old Pentium 5 / WinChip machine checks"
831 ---help---
832 Include support for machine check handling on old Pentium 5 or WinChip
833 systems. These typically need to be enabled explicitely on the command
834 line.
835
808config X86_MCE_THRESHOLD 836config X86_MCE_THRESHOLD
809 depends on X86_MCE_AMD || X86_MCE_INTEL 837 depends on X86_MCE_AMD || X86_MCE_INTEL
810 bool 838 bool
811 default y 839 default y
812 840
841config X86_MCE_INJECT
842 depends on X86_NEW_MCE
843 tristate "Machine check injector support"
844 ---help---
845 Provide support for injecting machine checks for testing purposes.
846 If you don't know what a machine check is and you don't do kernel
847 QA it is safe to say n.
848
813config X86_MCE_NONFATAL 849config X86_MCE_NONFATAL
814 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" 850 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
815 depends on X86_32 && X86_MCE 851 depends on X86_OLD_MCE
816 ---help--- 852 ---help---
817 Enabling this feature starts a timer that triggers every 5 seconds which 853 Enabling this feature starts a timer that triggers every 5 seconds which
818 will look at the machine check registers to see if anything happened. 854 will look at the machine check registers to see if anything happened.
@@ -825,11 +861,15 @@ config X86_MCE_NONFATAL
825 861
826config X86_MCE_P4THERMAL 862config X86_MCE_P4THERMAL
827 bool "check for P4 thermal throttling interrupt." 863 bool "check for P4 thermal throttling interrupt."
828 depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) 864 depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP)
829 ---help--- 865 ---help---
830 Enabling this feature will cause a message to be printed when the P4 866 Enabling this feature will cause a message to be printed when the P4
831 enters thermal throttling. 867 enters thermal throttling.
832 868
869config X86_THERMAL_VECTOR
870 def_bool y
871 depends on X86_MCE_P4THERMAL || X86_MCE_INTEL
872
833config VM86 873config VM86
834 bool "Enable VM86 support" if EMBEDDED 874 bool "Enable VM86 support" if EMBEDDED
835 default y 875 default y
@@ -1466,9 +1506,7 @@ config KEXEC_JUMP
1466 1506
1467config PHYSICAL_START 1507config PHYSICAL_START
1468 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 1508 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
1469 default "0x1000000" if X86_NUMAQ 1509 default "0x1000000"
1470 default "0x200000" if X86_64
1471 default "0x100000"
1472 ---help--- 1510 ---help---
1473 This gives the physical address where the kernel is loaded. 1511 This gives the physical address where the kernel is loaded.
1474 1512
@@ -1487,15 +1525,15 @@ config PHYSICAL_START
1487 to be specifically compiled to run from a specific memory area 1525 to be specifically compiled to run from a specific memory area
1488 (normally a reserved region) and this option comes handy. 1526 (normally a reserved region) and this option comes handy.
1489 1527
1490 So if you are using bzImage for capturing the crash dump, leave 1528 So if you are using bzImage for capturing the crash dump,
1491 the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y. 1529 leave the value here unchanged to 0x1000000 and set
1492 Otherwise if you plan to use vmlinux for capturing the crash dump 1530 CONFIG_RELOCATABLE=y. Otherwise if you plan to use vmlinux
1493 change this value to start of the reserved region (Typically 16MB 1531 for capturing the crash dump change this value to start of
1494 0x1000000). In other words, it can be set based on the "X" value as 1532 the reserved region. In other words, it can be set based on
1495 specified in the "crashkernel=YM@XM" command line boot parameter 1533 the "X" value as specified in the "crashkernel=YM@XM"
1496 passed to the panic-ed kernel. Typically this parameter is set as 1534 command line boot parameter passed to the panic-ed
1497 crashkernel=64M@16M. Please take a look at 1535 kernel. Please take a look at Documentation/kdump/kdump.txt
1498 Documentation/kdump/kdump.txt for more details about crash dumps. 1536 for more details about crash dumps.
1499 1537
1500 Usage of bzImage for capturing the crash dump is recommended as 1538 Usage of bzImage for capturing the crash dump is recommended as
1501 one does not have to build two kernels. Same kernel can be used 1539 one does not have to build two kernels. Same kernel can be used
@@ -1508,8 +1546,8 @@ config PHYSICAL_START
1508 Don't change this unless you know what you are doing. 1546 Don't change this unless you know what you are doing.
1509 1547
1510config RELOCATABLE 1548config RELOCATABLE
1511 bool "Build a relocatable kernel (EXPERIMENTAL)" 1549 bool "Build a relocatable kernel"
1512 depends on EXPERIMENTAL 1550 default y
1513 ---help--- 1551 ---help---
1514 This builds a kernel image that retains relocation information 1552 This builds a kernel image that retains relocation information
1515 so it can be loaded someplace besides the default 1MB. 1553 so it can be loaded someplace besides the default 1MB.
@@ -1524,12 +1562,16 @@ config RELOCATABLE
1524 it has been loaded at and the compile time physical address 1562 it has been loaded at and the compile time physical address
1525 (CONFIG_PHYSICAL_START) is ignored. 1563 (CONFIG_PHYSICAL_START) is ignored.
1526 1564
1565# Relocation on x86-32 needs some additional build support
1566config X86_NEED_RELOCS
1567 def_bool y
1568 depends on X86_32 && RELOCATABLE
1569
1527config PHYSICAL_ALIGN 1570config PHYSICAL_ALIGN
1528 hex 1571 hex
1529 prompt "Alignment value to which kernel should be aligned" if X86_32 1572 prompt "Alignment value to which kernel should be aligned" if X86_32
1530 default "0x100000" if X86_32 1573 default "0x1000000"
1531 default "0x200000" if X86_64 1574 range 0x2000 0x1000000
1532 range 0x2000 0x400000
1533 ---help--- 1575 ---help---
1534 This value puts the alignment restrictions on physical address 1576 This value puts the alignment restrictions on physical address
1535 where kernel is loaded and run from. Kernel is compiled for an 1577 where kernel is loaded and run from. Kernel is compiled for an
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d8359e73317f..d105f29bb6bb 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -159,14 +159,30 @@ config IOMMU_DEBUG
159 options. See Documentation/x86_64/boot-options.txt for more 159 options. See Documentation/x86_64/boot-options.txt for more
160 details. 160 details.
161 161
162config IOMMU_STRESS
163 bool "Enable IOMMU stress-test mode"
164 ---help---
165 This option disables various optimizations in IOMMU related
166 code to do real stress testing of the IOMMU code. This option
167 will cause a performance drop and should only be enabled for
168 testing.
169
162config IOMMU_LEAK 170config IOMMU_LEAK
163 bool "IOMMU leak tracing" 171 bool "IOMMU leak tracing"
164 depends on DEBUG_KERNEL 172 depends on IOMMU_DEBUG && DMA_API_DEBUG
165 depends on IOMMU_DEBUG
166 ---help--- 173 ---help---
167 Add a simple leak tracer to the IOMMU code. This is useful when you 174 Add a simple leak tracer to the IOMMU code. This is useful when you
168 are debugging a buggy device driver that leaks IOMMU mappings. 175 are debugging a buggy device driver that leaks IOMMU mappings.
169 176
177config X86_DS_SELFTEST
178 bool "DS selftest"
179 default y
180 depends on DEBUG_KERNEL
181 depends on X86_DS
182 ---help---
183 Perform Debug Store selftests at boot time.
184 If in doubt, say "N".
185
170config HAVE_MMIOTRACE_SUPPORT 186config HAVE_MMIOTRACE_SUPPORT
171 def_bool y 187 def_bool y
172 188
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 8c86b72afdc2..1b68659c41b4 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,8 +7,6 @@ else
7 KBUILD_DEFCONFIG := $(ARCH)_defconfig 7 KBUILD_DEFCONFIG := $(ARCH)_defconfig
8endif 8endif
9 9
10core-$(CONFIG_KVM) += arch/x86/kvm/
11
12# BITS is used as extension for files which are available in a 32 bit 10# BITS is used as extension for files which are available in a 32 bit
13# and a 64 bit version to simplify shared Makefiles. 11# and a 64 bit version to simplify shared Makefiles.
14# e.g.: obj-y += foo_$(BITS).o 12# e.g.: obj-y += foo_$(BITS).o
@@ -83,6 +81,11 @@ ifdef CONFIG_CC_STACKPROTECTOR
83 endif 81 endif
84endif 82endif
85 83
84# Don't unroll struct assignments with kmemcheck enabled
85ifeq ($(CONFIG_KMEMCHECK),y)
86 KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
87endif
88
86# Stackpointer is addressed different for 32 bit and 64 bit x86 89# Stackpointer is addressed different for 32 bit and 64 bit x86
87sp-$(CONFIG_X86_32) := esp 90sp-$(CONFIG_X86_32) := esp
88sp-$(CONFIG_X86_64) := rsp 91sp-$(CONFIG_X86_64) := rsp
@@ -118,21 +121,8 @@ head-y += arch/x86/kernel/init_task.o
118 121
119libs-y += arch/x86/lib/ 122libs-y += arch/x86/lib/
120 123
121# Sub architecture files that needs linking first 124# See arch/x86/Kbuild for content of core part of the kernel
122core-y += $(fcore-y) 125core-y += arch/x86/
123
124# Xen paravirtualization support
125core-$(CONFIG_XEN) += arch/x86/xen/
126
127# lguest paravirtualization support
128core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
129
130core-y += arch/x86/kernel/
131core-y += arch/x86/mm/
132
133core-y += arch/x86/crypto/
134core-y += arch/x86/vdso/
135core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
136 126
137# drivers-y are linked after core-y 127# drivers-y are linked after core-y
138drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ 128drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index 172cf8a98bdd..851fe936d242 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -3,6 +3,8 @@ bzImage
3cpustr.h 3cpustr.h
4mkcpustr 4mkcpustr
5offsets.h 5offsets.h
6voffset.h
7zoffset.h
6setup 8setup
7setup.bin 9setup.bin
8setup.elf 10setup.elf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 6633b6e7505a..ec749c2bfdd7 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,9 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage
26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf 26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
27subdir- := compressed 27subdir- := compressed
28 28
29setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o 29setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o
30setup-y += header.o main.o mca.o memory.o pm.o pmjump.o 30setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
31setup-y += printf.o string.o tty.o video.o video-mode.o version.o 31setup-y += printf.o regs.o string.o tty.o video.o video-mode.o
32setup-y += version.o
32setup-$(CONFIG_X86_APM_BOOT) += apm.o 33setup-$(CONFIG_X86_APM_BOOT) += apm.o
33 34
34# The link order of the video-*.o modules can matter. In particular, 35# The link order of the video-*.o modules can matter. In particular,
@@ -69,6 +70,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
69 $(call cc-option, -mpreferred-stack-boundary=2) 70 $(call cc-option, -mpreferred-stack-boundary=2)
70KBUILD_CFLAGS += $(call cc-option, -m32) 71KBUILD_CFLAGS += $(call cc-option, -m32)
71KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 72KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
73GCOV_PROFILE := n
72 74
73$(obj)/bzImage: asflags-y := $(SVGA_MODE) 75$(obj)/bzImage: asflags-y := $(SVGA_MODE)
74 76
@@ -86,19 +88,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
86 88
87SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) 89SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
88 90
89sed-offsets := -e 's/^00*/0/' \ 91sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
90 -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p'
91 92
92quiet_cmd_offsets = OFFSETS $@ 93quiet_cmd_voffset = VOFFSET $@
93 cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@ 94 cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
94 95
95$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE 96targets += voffset.h
96 $(call if_changed,offsets) 97$(obj)/voffset.h: vmlinux FORCE
98 $(call if_changed,voffset)
99
100sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
101
102quiet_cmd_zoffset = ZOFFSET $@
103 cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
104
105targets += zoffset.h
106$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
107 $(call if_changed,zoffset)
97 108
98targets += offsets.h
99 109
100AFLAGS_header.o += -I$(obj) 110AFLAGS_header.o += -I$(obj)
101$(obj)/header.o: $(obj)/offsets.h 111$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
102 112
103LDFLAGS_setup.elf := -T 113LDFLAGS_setup.elf := -T
104$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE 114$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 7c19ce8c2442..64a31a6d751a 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -2,7 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007-2008 rPath, Inc. - All Rights Reserved 4 * Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation 5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
6 * 6 *
7 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
8 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -90,8 +90,11 @@ static int a20_test_long(void)
90 90
91static void enable_a20_bios(void) 91static void enable_a20_bios(void)
92{ 92{
93 asm volatile("pushfl; int $0x15; popfl" 93 struct biosregs ireg;
94 : : "a" ((u16)0x2401)); 94
95 initregs(&ireg);
96 ireg.ax = 0x2401;
97 intcall(0x15, &ireg, NULL);
95} 98}
96 99
97static void enable_a20_kbc(void) 100static void enable_a20_kbc(void)
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
index 7aa6033001f9..ee274834ea8b 100644
--- a/arch/x86/boot/apm.c
+++ b/arch/x86/boot/apm.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * Original APM BIOS checking by Stephen Rothwell, May 1994 7 * Original APM BIOS checking by Stephen Rothwell, May 1994
7 * (sfr@canb.auug.org.au) 8 * (sfr@canb.auug.org.au)
@@ -19,75 +20,56 @@
19 20
20int query_apm_bios(void) 21int query_apm_bios(void)
21{ 22{
22 u16 ax, bx, cx, dx, di; 23 struct biosregs ireg, oreg;
23 u32 ebx, esi;
24 u8 err;
25 24
26 /* APM BIOS installation check */ 25 /* APM BIOS installation check */
27 ax = 0x5300; 26 initregs(&ireg);
28 bx = cx = 0; 27 ireg.ah = 0x53;
29 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" 28 intcall(0x15, &ireg, &oreg);
30 : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
31 : : "esi", "edi");
32 29
33 if (err) 30 if (oreg.flags & X86_EFLAGS_CF)
34 return -1; /* No APM BIOS */ 31 return -1; /* No APM BIOS */
35 32
36 if (bx != 0x504d) /* "PM" signature */ 33 if (oreg.bx != 0x504d) /* "PM" signature */
37 return -1; 34 return -1;
38 35
39 if (!(cx & 0x02)) /* 32 bits supported? */ 36 if (!(oreg.cx & 0x02)) /* 32 bits supported? */
40 return -1; 37 return -1;
41 38
42 /* Disconnect first, just in case */ 39 /* Disconnect first, just in case */
43 ax = 0x5304; 40 ireg.al = 0x04;
44 bx = 0; 41 intcall(0x15, &ireg, NULL);
45 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
46 : "+a" (ax), "+b" (bx)
47 : : "ecx", "edx", "esi", "edi");
48
49 /* Paranoia */
50 ebx = esi = 0;
51 cx = dx = di = 0;
52 42
53 /* 32-bit connect */ 43 /* 32-bit connect */
54 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6" 44 ireg.al = 0x03;
55 : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx), 45 intcall(0x15, &ireg, &oreg);
56 "+S" (esi), "+D" (di), "=m" (err) 46
57 : "a" (0x5303)); 47 boot_params.apm_bios_info.cseg = oreg.ax;
58 48 boot_params.apm_bios_info.offset = oreg.ebx;
59 boot_params.apm_bios_info.cseg = ax; 49 boot_params.apm_bios_info.cseg_16 = oreg.cx;
60 boot_params.apm_bios_info.offset = ebx; 50 boot_params.apm_bios_info.dseg = oreg.dx;
61 boot_params.apm_bios_info.cseg_16 = cx; 51 boot_params.apm_bios_info.cseg_len = oreg.si;
62 boot_params.apm_bios_info.dseg = dx; 52 boot_params.apm_bios_info.cseg_16_len = oreg.hsi;
63 boot_params.apm_bios_info.cseg_len = (u16)esi; 53 boot_params.apm_bios_info.dseg_len = oreg.di;
64 boot_params.apm_bios_info.cseg_16_len = esi >> 16; 54
65 boot_params.apm_bios_info.dseg_len = di; 55 if (oreg.flags & X86_EFLAGS_CF)
66
67 if (err)
68 return -1; 56 return -1;
69 57
70 /* Redo the installation check as the 32-bit connect; 58 /* Redo the installation check as the 32-bit connect;
71 some BIOSes return different flags this way... */ 59 some BIOSes return different flags this way... */
72 60
73 ax = 0x5300; 61 ireg.al = 0x00;
74 bx = cx = 0; 62 intcall(0x15, &ireg, &oreg);
75 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
76 : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
77 : : "esi", "edi");
78 63
79 if (err || bx != 0x504d) { 64 if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) {
80 /* Failure with 32-bit connect, try to disconect and ignore */ 65 /* Failure with 32-bit connect, try to disconect and ignore */
81 ax = 0x5304; 66 ireg.al = 0x04;
82 bx = 0; 67 intcall(0x15, &ireg, NULL);
83 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
84 : "+a" (ax), "+b" (bx)
85 : : "ecx", "edx", "esi", "edi");
86 return -1; 68 return -1;
87 } 69 }
88 70
89 boot_params.apm_bios_info.version = ax; 71 boot_params.apm_bios_info.version = oreg.ax;
90 boot_params.apm_bios_info.flags = cx; 72 boot_params.apm_bios_info.flags = oreg.cx;
91 return 0; 73 return 0;
92} 74}
93 75
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
new file mode 100644
index 000000000000..1dfbf64e52a2
--- /dev/null
+++ b/arch/x86/boot/bioscall.S
@@ -0,0 +1,82 @@
1/* -----------------------------------------------------------------------
2 *
3 * Copyright 2009 Intel Corporation; author H. Peter Anvin
4 *
5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2 or (at your
7 * option) any later version; incorporated herein by reference.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * "Glove box" for BIOS calls. Avoids the constant problems with BIOSes
13 * touching registers they shouldn't be.
14 */
15
16 .code16gcc
17 .text
18 .globl intcall
19 .type intcall, @function
20intcall:
21 /* Self-modify the INT instruction. Ugly, but works. */
22 cmpb %al, 3f
23 je 1f
24 movb %al, 3f
25 jmp 1f /* Synchronize pipeline */
261:
27 /* Save state */
28 pushfl
29 pushw %fs
30 pushw %gs
31 pushal
32
33 /* Copy input state to stack frame */
34 subw $44, %sp
35 movw %dx, %si
36 movw %sp, %di
37 movw $11, %cx
38 rep; movsd
39
40 /* Pop full state from the stack */
41 popal
42 popw %gs
43 popw %fs
44 popw %es
45 popw %ds
46 popfl
47
48 /* Actual INT */
49 .byte 0xcd /* INT opcode */
503: .byte 0
51
52 /* Push full state to the stack */
53 pushfl
54 pushw %ds
55 pushw %es
56 pushw %fs
57 pushw %gs
58 pushal
59
60 /* Re-establish C environment invariants */
61 cld
62 movzwl %sp, %esp
63 movw %cs, %ax
64 movw %ax, %ds
65 movw %ax, %es
66
67 /* Copy output state from stack frame */
68 movw 68(%esp), %di /* Original %cx == 3rd argument */
69 andw %di, %di
70 jz 4f
71 movw %sp, %si
72 movw $11, %cx
73 rep; movsd
744: addw $44, %sp
75
76 /* Restore state and return */
77 popal
78 popw %gs
79 popw %fs
80 popfl
81 retl
82 .size intcall, .-intcall
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 7b2692e897e5..98239d2658f2 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -26,6 +27,7 @@
26#include <asm/setup.h> 27#include <asm/setup.h>
27#include "bitops.h" 28#include "bitops.h"
28#include <asm/cpufeature.h> 29#include <asm/cpufeature.h>
30#include <asm/processor-flags.h>
29 31
30/* Useful macros */ 32/* Useful macros */
31#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) 33#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -241,6 +243,49 @@ int enable_a20(void);
241/* apm.c */ 243/* apm.c */
242int query_apm_bios(void); 244int query_apm_bios(void);
243 245
246/* bioscall.c */
247struct biosregs {
248 union {
249 struct {
250 u32 edi;
251 u32 esi;
252 u32 ebp;
253 u32 _esp;
254 u32 ebx;
255 u32 edx;
256 u32 ecx;
257 u32 eax;
258 u32 _fsgs;
259 u32 _dses;
260 u32 eflags;
261 };
262 struct {
263 u16 di, hdi;
264 u16 si, hsi;
265 u16 bp, hbp;
266 u16 _sp, _hsp;
267 u16 bx, hbx;
268 u16 dx, hdx;
269 u16 cx, hcx;
270 u16 ax, hax;
271 u16 gs, fs;
272 u16 es, ds;
273 u16 flags, hflags;
274 };
275 struct {
276 u8 dil, dih, edi2, edi3;
277 u8 sil, sih, esi2, esi3;
278 u8 bpl, bph, ebp2, ebp3;
279 u8 _spl, _sph, _esp2, _esp3;
280 u8 bl, bh, ebx2, ebx3;
281 u8 dl, dh, edx2, edx3;
282 u8 cl, ch, ecx2, ecx3;
283 u8 al, ah, eax2, eax3;
284 };
285 };
286};
287void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
288
244/* cmdline.c */ 289/* cmdline.c */
245int cmdline_find_option(const char *option, char *buffer, int bufsize); 290int cmdline_find_option(const char *option, char *buffer, int bufsize);
246int cmdline_find_option_bool(const char *option); 291int cmdline_find_option_bool(const char *option);
@@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...);
279int vsprintf(char *buf, const char *fmt, va_list args); 324int vsprintf(char *buf, const char *fmt, va_list args);
280int printf(const char *fmt, ...); 325int printf(const char *fmt, ...);
281 326
327/* regs.c */
328void initregs(struct biosregs *regs);
329
282/* string.c */ 330/* string.c */
283int strcmp(const char *str1, const char *str2); 331int strcmp(const char *str1, const char *str2);
284size_t strnlen(const char *s, size_t maxlen); 332size_t strnlen(const char *s, size_t maxlen);
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
index 63eff3b04d01..4a46fab7162e 100644
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1,3 +1,6 @@
1relocs 1relocs
2vmlinux.bin.all 2vmlinux.bin.all
3vmlinux.relocs 3vmlinux.relocs
4vmlinux.lds
5mkpiggy
6piggy.S
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 65551c9f8571..e2ff504b4ddc 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -15,11 +15,14 @@ KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
15KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) 15KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
16 16
17KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 17KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
18GCOV_PROFILE := n
18 19
19LDFLAGS := -m elf_$(UTS_MACHINE) 20LDFLAGS := -m elf_$(UTS_MACHINE)
20LDFLAGS_vmlinux := -T 21LDFLAGS_vmlinux := -T
21 22
22$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE 23hostprogs-y := mkpiggy
24
25$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
23 $(call if_changed,ld) 26 $(call if_changed,ld)
24 @: 27 @:
25 28
@@ -29,7 +32,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
29 32
30 33
31targets += vmlinux.bin.all vmlinux.relocs relocs 34targets += vmlinux.bin.all vmlinux.relocs relocs
32hostprogs-$(CONFIG_X86_32) += relocs 35hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs
33 36
34quiet_cmd_relocs = RELOCS $@ 37quiet_cmd_relocs = RELOCS $@
35 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< 38 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -37,46 +40,22 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
37 $(call if_changed,relocs) 40 $(call if_changed,relocs)
38 41
39vmlinux.bin.all-y := $(obj)/vmlinux.bin 42vmlinux.bin.all-y := $(obj)/vmlinux.bin
40vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs 43vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
41quiet_cmd_relocbin = BUILD $@
42 cmd_relocbin = cat $(filter-out FORCE,$^) > $@
43$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
44 $(call if_changed,relocbin)
45
46ifeq ($(CONFIG_X86_32),y)
47 44
48ifdef CONFIG_RELOCATABLE 45$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
49$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
50 $(call if_changed,gzip)
51$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE
52 $(call if_changed,bzip2)
53$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE
54 $(call if_changed,lzma)
55else
56$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
57 $(call if_changed,gzip) 46 $(call if_changed,gzip)
58$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE 47$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
59 $(call if_changed,bzip2) 48 $(call if_changed,bzip2)
60$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE 49$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
61 $(call if_changed,lzma) 50 $(call if_changed,lzma)
62endif
63LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
64 51
65else 52suffix-$(CONFIG_KERNEL_GZIP) := gz
53suffix-$(CONFIG_KERNEL_BZIP2) := bz2
54suffix-$(CONFIG_KERNEL_LZMA) := lzma
66 55
67$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE 56quiet_cmd_mkpiggy = MKPIGGY $@
68 $(call if_changed,gzip) 57 cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
69$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
70 $(call if_changed,bzip2)
71$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
72 $(call if_changed,lzma)
73
74LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
75endif
76 58
77suffix_$(CONFIG_KERNEL_GZIP) = gz 59targets += piggy.S
78suffix_$(CONFIG_KERNEL_BZIP2) = bz2 60$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
79suffix_$(CONFIG_KERNEL_LZMA) = lzma 61 $(call if_changed,mkpiggy)
80
81$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
82 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 3a8a866fb2e2..75e4f001e706 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -12,16 +12,16 @@
12 * the page directory. [According to comments etc elsewhere on a compressed 12 * the page directory. [According to comments etc elsewhere on a compressed
13 * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC] 13 * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
14 * 14 *
15 * Page 0 is deliberately kept safe, since System Management Mode code in 15 * Page 0 is deliberately kept safe, since System Management Mode code in
16 * laptops may need to access the BIOS data stored there. This is also 16 * laptops may need to access the BIOS data stored there. This is also
17 * useful for future device drivers that either access the BIOS via VM86 17 * useful for future device drivers that either access the BIOS via VM86
18 * mode. 18 * mode.
19 */ 19 */
20 20
21/* 21/*
22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
23 */ 23 */
24.text 24 .text
25 25
26#include <linux/linkage.h> 26#include <linux/linkage.h>
27#include <asm/segment.h> 27#include <asm/segment.h>
@@ -29,161 +29,151 @@
29#include <asm/boot.h> 29#include <asm/boot.h>
30#include <asm/asm-offsets.h> 30#include <asm/asm-offsets.h>
31 31
32.section ".text.head","ax",@progbits 32 .section ".text.head","ax",@progbits
33ENTRY(startup_32) 33ENTRY(startup_32)
34 cld 34 cld
35 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 35 /*
36 * us to not reload segments */ 36 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
37 testb $(1<<6), BP_loadflags(%esi) 37 * us to not reload segments
38 jnz 1f 38 */
39 testb $(1<<6), BP_loadflags(%esi)
40 jnz 1f
39 41
40 cli 42 cli
41 movl $(__BOOT_DS),%eax 43 movl $__BOOT_DS, %eax
42 movl %eax,%ds 44 movl %eax, %ds
43 movl %eax,%es 45 movl %eax, %es
44 movl %eax,%fs 46 movl %eax, %fs
45 movl %eax,%gs 47 movl %eax, %gs
46 movl %eax,%ss 48 movl %eax, %ss
471: 491:
48 50
49/* Calculate the delta between where we were compiled to run 51/*
52 * Calculate the delta between where we were compiled to run
50 * at and where we were actually loaded at. This can only be done 53 * at and where we were actually loaded at. This can only be done
51 * with a short local call on x86. Nothing else will tell us what 54 * with a short local call on x86. Nothing else will tell us what
52 * address we are running at. The reserved chunk of the real-mode 55 * address we are running at. The reserved chunk of the real-mode
53 * data at 0x1e4 (defined as a scratch field) are used as the stack 56 * data at 0x1e4 (defined as a scratch field) are used as the stack
54 * for this calculation. Only 4 bytes are needed. 57 * for this calculation. Only 4 bytes are needed.
55 */ 58 */
56 leal (0x1e4+4)(%esi), %esp 59 leal (BP_scratch+4)(%esi), %esp
57 call 1f 60 call 1f
581: popl %ebp 611: popl %ebp
59 subl $1b, %ebp 62 subl $1b, %ebp
60 63
61/* %ebp contains the address we are loaded at by the boot loader and %ebx 64/*
65 * %ebp contains the address we are loaded at by the boot loader and %ebx
62 * contains the address where we should move the kernel image temporarily 66 * contains the address where we should move the kernel image temporarily
63 * for safe in-place decompression. 67 * for safe in-place decompression.
64 */ 68 */
65 69
66#ifdef CONFIG_RELOCATABLE 70#ifdef CONFIG_RELOCATABLE
67 movl %ebp, %ebx 71 movl %ebp, %ebx
68 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx 72 movl BP_kernel_alignment(%esi), %eax
69 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx 73 decl %eax
74 addl %eax, %ebx
75 notl %eax
76 andl %eax, %ebx
70#else 77#else
71 movl $LOAD_PHYSICAL_ADDR, %ebx 78 movl $LOAD_PHYSICAL_ADDR, %ebx
72#endif 79#endif
73 80
74 /* Replace the compressed data size with the uncompressed size */ 81 /* Target address to relocate to for decompression */
75 subl input_len(%ebp), %ebx 82 addl $z_extract_offset, %ebx
76 movl output_len(%ebp), %eax 83
77 addl %eax, %ebx 84 /* Set up the stack */
78 /* Add 8 bytes for every 32K input block */ 85 leal boot_stack_end(%ebx), %esp
79 shrl $12, %eax 86
80 addl %eax, %ebx 87 /* Zero EFLAGS */
81 /* Add 32K + 18 bytes of extra slack */ 88 pushl $0
82 addl $(32768 + 18), %ebx 89 popfl
83 /* Align on a 4K boundary */ 90
84 addl $4095, %ebx 91/*
85 andl $~4095, %ebx 92 * Copy the compressed kernel to the end of our buffer
86
87/* Copy the compressed kernel to the end of our buffer
88 * where decompression in place becomes safe. 93 * where decompression in place becomes safe.
89 */ 94 */
90 pushl %esi 95 pushl %esi
91 leal _end(%ebp), %esi 96 leal (_bss-4)(%ebp), %esi
92 leal _end(%ebx), %edi 97 leal (_bss-4)(%ebx), %edi
93 movl $(_end - startup_32), %ecx 98 movl $(_bss - startup_32), %ecx
99 shrl $2, %ecx
94 std 100 std
95 rep 101 rep movsl
96 movsb
97 cld 102 cld
98 popl %esi 103 popl %esi
99
100/* Compute the kernel start address.
101 */
102#ifdef CONFIG_RELOCATABLE
103 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
104 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
105#else
106 movl $LOAD_PHYSICAL_ADDR, %ebp
107#endif
108 104
109/* 105/*
110 * Jump to the relocated address. 106 * Jump to the relocated address.
111 */ 107 */
112 leal relocated(%ebx), %eax 108 leal relocated(%ebx), %eax
113 jmp *%eax 109 jmp *%eax
114ENDPROC(startup_32) 110ENDPROC(startup_32)
115 111
116.section ".text" 112 .text
117relocated: 113relocated:
118 114
119/* 115/*
120 * Clear BSS 116 * Clear BSS (stack is currently empty)
121 */
122 xorl %eax,%eax
123 leal _edata(%ebx),%edi
124 leal _end(%ebx), %ecx
125 subl %edi,%ecx
126 cld
127 rep
128 stosb
129
130/*
131 * Setup the stack for the decompressor
132 */ 117 */
133 leal boot_stack_end(%ebx), %esp 118 xorl %eax, %eax
119 leal _bss(%ebx), %edi
120 leal _ebss(%ebx), %ecx
121 subl %edi, %ecx
122 shrl $2, %ecx
123 rep stosl
134 124
135/* 125/*
136 * Do the decompression, and jump to the new kernel.. 126 * Do the decompression, and jump to the new kernel..
137 */ 127 */
138 movl output_len(%ebx), %eax 128 leal z_extract_offset_negative(%ebx), %ebp
139 pushl %eax 129 /* push arguments for decompress_kernel: */
140 # push arguments for decompress_kernel: 130 pushl %ebp /* output address */
141 pushl %ebp # output address 131 pushl $z_input_len /* input_len */
142 movl input_len(%ebx), %eax 132 leal input_data(%ebx), %eax
143 pushl %eax # input_len 133 pushl %eax /* input_data */
144 leal input_data(%ebx), %eax 134 leal boot_heap(%ebx), %eax
145 pushl %eax # input_data 135 pushl %eax /* heap area */
146 leal boot_heap(%ebx), %eax 136 pushl %esi /* real mode pointer */
147 pushl %eax # heap area 137 call decompress_kernel
148 pushl %esi # real mode pointer 138 addl $20, %esp
149 call decompress_kernel
150 addl $20, %esp
151 popl %ecx
152 139
153#if CONFIG_RELOCATABLE 140#if CONFIG_RELOCATABLE
154/* Find the address of the relocations. 141/*
142 * Find the address of the relocations.
155 */ 143 */
156 movl %ebp, %edi 144 leal z_output_len(%ebp), %edi
157 addl %ecx, %edi
158 145
159/* Calculate the delta between where vmlinux was compiled to run 146/*
147 * Calculate the delta between where vmlinux was compiled to run
160 * and where it was actually loaded. 148 * and where it was actually loaded.
161 */ 149 */
162 movl %ebp, %ebx 150 movl %ebp, %ebx
163 subl $LOAD_PHYSICAL_ADDR, %ebx 151 subl $LOAD_PHYSICAL_ADDR, %ebx
164 jz 2f /* Nothing to be done if loaded at compiled addr. */ 152 jz 2f /* Nothing to be done if loaded at compiled addr. */
165/* 153/*
166 * Process relocations. 154 * Process relocations.
167 */ 155 */
168 156
1691: subl $4, %edi 1571: subl $4, %edi
170 movl 0(%edi), %ecx 158 movl (%edi), %ecx
171 testl %ecx, %ecx 159 testl %ecx, %ecx
172 jz 2f 160 jz 2f
173 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) 161 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
174 jmp 1b 162 jmp 1b
1752: 1632:
176#endif 164#endif
177 165
178/* 166/*
179 * Jump to the decompressed kernel. 167 * Jump to the decompressed kernel.
180 */ 168 */
181 xorl %ebx,%ebx 169 xorl %ebx, %ebx
182 jmp *%ebp 170 jmp *%ebp
183 171
184.bss 172/*
185/* Stack and heap for uncompression */ 173 * Stack and heap for uncompression
186.balign 4 174 */
175 .bss
176 .balign 4
187boot_heap: 177boot_heap:
188 .fill BOOT_HEAP_SIZE, 1, 0 178 .fill BOOT_HEAP_SIZE, 1, 0
189boot_stack: 179boot_stack:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index ed4a82948002..f62c284db9eb 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -21,8 +21,8 @@
21/* 21/*
22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
23 */ 23 */
24.code32 24 .code32
25.text 25 .text
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <asm/segment.h> 28#include <asm/segment.h>
@@ -33,12 +33,14 @@
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34#include <asm/asm-offsets.h> 34#include <asm/asm-offsets.h>
35 35
36.section ".text.head" 36 .section ".text.head"
37 .code32 37 .code32
38ENTRY(startup_32) 38ENTRY(startup_32)
39 cld 39 cld
40 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 40 /*
41 * us to not reload segments */ 41 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
42 * us to not reload segments
43 */
42 testb $(1<<6), BP_loadflags(%esi) 44 testb $(1<<6), BP_loadflags(%esi)
43 jnz 1f 45 jnz 1f
44 46
@@ -49,14 +51,15 @@ ENTRY(startup_32)
49 movl %eax, %ss 51 movl %eax, %ss
501: 521:
51 53
52/* Calculate the delta between where we were compiled to run 54/*
55 * Calculate the delta between where we were compiled to run
53 * at and where we were actually loaded at. This can only be done 56 * at and where we were actually loaded at. This can only be done
54 * with a short local call on x86. Nothing else will tell us what 57 * with a short local call on x86. Nothing else will tell us what
55 * address we are running at. The reserved chunk of the real-mode 58 * address we are running at. The reserved chunk of the real-mode
56 * data at 0x1e4 (defined as a scratch field) are used as the stack 59 * data at 0x1e4 (defined as a scratch field) are used as the stack
57 * for this calculation. Only 4 bytes are needed. 60 * for this calculation. Only 4 bytes are needed.
58 */ 61 */
59 leal (0x1e4+4)(%esi), %esp 62 leal (BP_scratch+4)(%esi), %esp
60 call 1f 63 call 1f
611: popl %ebp 641: popl %ebp
62 subl $1b, %ebp 65 subl $1b, %ebp
@@ -70,32 +73,28 @@ ENTRY(startup_32)
70 testl %eax, %eax 73 testl %eax, %eax
71 jnz no_longmode 74 jnz no_longmode
72 75
73/* Compute the delta between where we were compiled to run at 76/*
77 * Compute the delta between where we were compiled to run at
74 * and where the code will actually run at. 78 * and where the code will actually run at.
75 */ 79 *
76/* %ebp contains the address we are loaded at by the boot loader and %ebx 80 * %ebp contains the address we are loaded at by the boot loader and %ebx
77 * contains the address where we should move the kernel image temporarily 81 * contains the address where we should move the kernel image temporarily
78 * for safe in-place decompression. 82 * for safe in-place decompression.
79 */ 83 */
80 84
81#ifdef CONFIG_RELOCATABLE 85#ifdef CONFIG_RELOCATABLE
82 movl %ebp, %ebx 86 movl %ebp, %ebx
83 addl $(PMD_PAGE_SIZE -1), %ebx 87 movl BP_kernel_alignment(%esi), %eax
84 andl $PMD_PAGE_MASK, %ebx 88 decl %eax
89 addl %eax, %ebx
90 notl %eax
91 andl %eax, %ebx
85#else 92#else
86 movl $CONFIG_PHYSICAL_START, %ebx 93 movl $LOAD_PHYSICAL_ADDR, %ebx
87#endif 94#endif
88 95
89 /* Replace the compressed data size with the uncompressed size */ 96 /* Target address to relocate to for decompression */
90 subl input_len(%ebp), %ebx 97 addl $z_extract_offset, %ebx
91 movl output_len(%ebp), %eax
92 addl %eax, %ebx
93 /* Add 8 bytes for every 32K input block */
94 shrl $12, %eax
95 addl %eax, %ebx
96 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
97 addl $(32768 + 18 + 4095), %ebx
98 andl $~4095, %ebx
99 98
100/* 99/*
101 * Prepare for entering 64 bit mode 100 * Prepare for entering 64 bit mode
@@ -114,7 +113,7 @@ ENTRY(startup_32)
114 /* 113 /*
115 * Build early 4G boot pagetable 114 * Build early 4G boot pagetable
116 */ 115 */
117 /* Initialize Page tables to 0*/ 116 /* Initialize Page tables to 0 */
118 leal pgtable(%ebx), %edi 117 leal pgtable(%ebx), %edi
119 xorl %eax, %eax 118 xorl %eax, %eax
120 movl $((4096*6)/4), %ecx 119 movl $((4096*6)/4), %ecx
@@ -155,7 +154,8 @@ ENTRY(startup_32)
155 btsl $_EFER_LME, %eax 154 btsl $_EFER_LME, %eax
156 wrmsr 155 wrmsr
157 156
158 /* Setup for the jump to 64bit mode 157 /*
158 * Setup for the jump to 64bit mode
159 * 159 *
160 * When the jump is performend we will be in long mode but 160 * When the jump is performend we will be in long mode but
161 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 161 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
@@ -184,7 +184,8 @@ no_longmode:
184 184
185#include "../../kernel/verify_cpu_64.S" 185#include "../../kernel/verify_cpu_64.S"
186 186
187 /* Be careful here startup_64 needs to be at a predictable 187 /*
188 * Be careful here startup_64 needs to be at a predictable
188 * address so I can export it in an ELF header. Bootloaders 189 * address so I can export it in an ELF header. Bootloaders
189 * should look at the ELF header to find this address, as 190 * should look at the ELF header to find this address, as
190 * it may change in the future. 191 * it may change in the future.
@@ -192,7 +193,8 @@ no_longmode:
192 .code64 193 .code64
193 .org 0x200 194 .org 0x200
194ENTRY(startup_64) 195ENTRY(startup_64)
195 /* We come here either from startup_32 or directly from a 196 /*
197 * We come here either from startup_32 or directly from a
196 * 64bit bootloader. If we come here from a bootloader we depend on 198 * 64bit bootloader. If we come here from a bootloader we depend on
197 * an identity mapped page table being provied that maps our 199 * an identity mapped page table being provied that maps our
198 * entire text+data+bss and hopefully all of memory. 200 * entire text+data+bss and hopefully all of memory.
@@ -209,50 +211,54 @@ ENTRY(startup_64)
209 movl $0x20, %eax 211 movl $0x20, %eax
210 ltr %ax 212 ltr %ax
211 213
212 /* Compute the decompressed kernel start address. It is where 214 /*
215 * Compute the decompressed kernel start address. It is where
213 * we were loaded at aligned to a 2M boundary. %rbp contains the 216 * we were loaded at aligned to a 2M boundary. %rbp contains the
214 * decompressed kernel start address. 217 * decompressed kernel start address.
215 * 218 *
216 * If it is a relocatable kernel then decompress and run the kernel 219 * If it is a relocatable kernel then decompress and run the kernel
217 * from load address aligned to 2MB addr, otherwise decompress and 220 * from load address aligned to 2MB addr, otherwise decompress and
218 * run the kernel from CONFIG_PHYSICAL_START 221 * run the kernel from LOAD_PHYSICAL_ADDR
222 *
223 * We cannot rely on the calculation done in 32-bit mode, since we
224 * may have been invoked via the 64-bit entry point.
219 */ 225 */
220 226
221 /* Start with the delta to where the kernel will run at. */ 227 /* Start with the delta to where the kernel will run at. */
222#ifdef CONFIG_RELOCATABLE 228#ifdef CONFIG_RELOCATABLE
223 leaq startup_32(%rip) /* - $startup_32 */, %rbp 229 leaq startup_32(%rip) /* - $startup_32 */, %rbp
224 addq $(PMD_PAGE_SIZE - 1), %rbp 230 movl BP_kernel_alignment(%rsi), %eax
225 andq $PMD_PAGE_MASK, %rbp 231 decl %eax
226 movq %rbp, %rbx 232 addq %rax, %rbp
233 notq %rax
234 andq %rax, %rbp
227#else 235#else
228 movq $CONFIG_PHYSICAL_START, %rbp 236 movq $LOAD_PHYSICAL_ADDR, %rbp
229 movq %rbp, %rbx
230#endif 237#endif
231 238
232 /* Replace the compressed data size with the uncompressed size */ 239 /* Target address to relocate to for decompression */
233 movl input_len(%rip), %eax 240 leaq z_extract_offset(%rbp), %rbx
234 subq %rax, %rbx 241
235 movl output_len(%rip), %eax 242 /* Set up the stack */
236 addq %rax, %rbx 243 leaq boot_stack_end(%rbx), %rsp
237 /* Add 8 bytes for every 32K input block */ 244
238 shrq $12, %rax 245 /* Zero EFLAGS */
239 addq %rax, %rbx 246 pushq $0
240 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ 247 popfq
241 addq $(32768 + 18 + 4095), %rbx 248
242 andq $~4095, %rbx 249/*
243 250 * Copy the compressed kernel to the end of our buffer
244/* Copy the compressed kernel to the end of our buffer
245 * where decompression in place becomes safe. 251 * where decompression in place becomes safe.
246 */ 252 */
247 leaq _end_before_pgt(%rip), %r8 253 pushq %rsi
248 leaq _end_before_pgt(%rbx), %r9 254 leaq (_bss-8)(%rip), %rsi
249 movq $_end_before_pgt /* - $startup_32 */, %rcx 255 leaq (_bss-8)(%rbx), %rdi
2501: subq $8, %r8 256 movq $_bss /* - $startup_32 */, %rcx
251 subq $8, %r9 257 shrq $3, %rcx
252 movq 0(%r8), %rax 258 std
253 movq %rax, 0(%r9) 259 rep movsq
254 subq $8, %rcx 260 cld
255 jnz 1b 261 popq %rsi
256 262
257/* 263/*
258 * Jump to the relocated address. 264 * Jump to the relocated address.
@@ -260,37 +266,28 @@ ENTRY(startup_64)
260 leaq relocated(%rbx), %rax 266 leaq relocated(%rbx), %rax
261 jmp *%rax 267 jmp *%rax
262 268
263.section ".text" 269 .text
264relocated: 270relocated:
265 271
266/* 272/*
267 * Clear BSS 273 * Clear BSS (stack is currently empty)
268 */ 274 */
269 xorq %rax, %rax 275 xorl %eax, %eax
270 leaq _edata(%rbx), %rdi 276 leaq _bss(%rip), %rdi
271 leaq _end_before_pgt(%rbx), %rcx 277 leaq _ebss(%rip), %rcx
272 subq %rdi, %rcx 278 subq %rdi, %rcx
273 cld 279 shrq $3, %rcx
274 rep 280 rep stosq
275 stosb
276
277 /* Setup the stack */
278 leaq boot_stack_end(%rip), %rsp
279
280 /* zero EFLAGS after setting rsp */
281 pushq $0
282 popfq
283 281
284/* 282/*
285 * Do the decompression, and jump to the new kernel.. 283 * Do the decompression, and jump to the new kernel..
286 */ 284 */
287 pushq %rsi # Save the real mode argument 285 pushq %rsi /* Save the real mode argument */
288 movq %rsi, %rdi # real mode address 286 movq %rsi, %rdi /* real mode address */
289 leaq boot_heap(%rip), %rsi # malloc area for uncompression 287 leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
290 leaq input_data(%rip), %rdx # input_data 288 leaq input_data(%rip), %rdx /* input_data */
291 movl input_len(%rip), %eax 289 movl $z_input_len, %ecx /* input_len */
292 movq %rax, %rcx # input_len 290 movq %rbp, %r8 /* output target address */
293 movq %rbp, %r8 # output
294 call decompress_kernel 291 call decompress_kernel
295 popq %rsi 292 popq %rsi
296 293
@@ -311,11 +308,21 @@ gdt:
311 .quad 0x0000000000000000 /* TS continued */ 308 .quad 0x0000000000000000 /* TS continued */
312gdt_end: 309gdt_end:
313 310
314.bss 311/*
315/* Stack and heap for uncompression */ 312 * Stack and heap for uncompression
316.balign 4 313 */
314 .bss
315 .balign 4
317boot_heap: 316boot_heap:
318 .fill BOOT_HEAP_SIZE, 1, 0 317 .fill BOOT_HEAP_SIZE, 1, 0
319boot_stack: 318boot_stack:
320 .fill BOOT_STACK_SIZE, 1, 0 319 .fill BOOT_STACK_SIZE, 1, 0
321boot_stack_end: 320boot_stack_end:
321
322/*
323 * Space for page tables (not in .bss so not zeroed)
324 */
325 .section ".pgtable","a",@nobits
326 .balign 4096
327pgtable:
328 .fill 6*4096, 1, 0
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index e45be73684ff..842b2a36174a 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -325,21 +325,19 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
325 free_mem_ptr = heap; /* Heap */ 325 free_mem_ptr = heap; /* Heap */
326 free_mem_end_ptr = heap + BOOT_HEAP_SIZE; 326 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
327 327
328 if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
329 error("Destination address inappropriately aligned");
328#ifdef CONFIG_X86_64 330#ifdef CONFIG_X86_64
329 if ((unsigned long)output & (__KERNEL_ALIGN - 1)) 331 if (heap > 0x3fffffffffffUL)
330 error("Destination address not 2M aligned");
331 if ((unsigned long)output >= 0xffffffffffUL)
332 error("Destination address too large"); 332 error("Destination address too large");
333#else 333#else
334 if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1))
335 error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
336 if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff)) 334 if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
337 error("Destination address too large"); 335 error("Destination address too large");
336#endif
338#ifndef CONFIG_RELOCATABLE 337#ifndef CONFIG_RELOCATABLE
339 if ((u32)output != LOAD_PHYSICAL_ADDR) 338 if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
340 error("Wrong destination address"); 339 error("Wrong destination address");
341#endif 340#endif
342#endif
343 341
344 if (!quiet) 342 if (!quiet)
345 putstr("\nDecompressing Linux... "); 343 putstr("\nDecompressing Linux... ");
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
new file mode 100644
index 000000000000..bcbd36c41432
--- /dev/null
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -0,0 +1,97 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright (C) 2009 Intel Corporation. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License version
7 * 2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301, USA.
18 *
19 * H. Peter Anvin <hpa@linux.intel.com>
20 *
21 * ----------------------------------------------------------------------- */
22
23/*
24 * Compute the desired load offset from a compressed program; outputs
25 * a small assembly wrapper with the appropriate symbols defined.
26 */
27
28#include <stdlib.h>
29#include <stdio.h>
30#include <string.h>
31#include <inttypes.h>
32
33static uint32_t getle32(const void *p)
34{
35 const uint8_t *cp = p;
36
37 return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) +
38 ((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24);
39}
40
41int main(int argc, char *argv[])
42{
43 uint32_t olen;
44 long ilen;
45 unsigned long offs;
46 FILE *f;
47
48 if (argc < 2) {
49 fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
50 return 1;
51 }
52
53 /* Get the information for the compressed kernel image first */
54
55 f = fopen(argv[1], "r");
56 if (!f) {
57 perror(argv[1]);
58 return 1;
59 }
60
61
62 if (fseek(f, -4L, SEEK_END)) {
63 perror(argv[1]);
64 }
65 fread(&olen, sizeof olen, 1, f);
66 ilen = ftell(f);
67 olen = getle32(&olen);
68 fclose(f);
69
70 /*
71 * Now we have the input (compressed) and output (uncompressed)
72 * sizes, compute the necessary decompression offset...
73 */
74
75 offs = (olen > ilen) ? olen - ilen : 0;
76 offs += olen >> 12; /* Add 8 bytes for each 32K block */
77 offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */
78 offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
79
80 printf(".section \".rodata.compressed\",\"a\",@progbits\n");
81 printf(".globl z_input_len\n");
82 printf("z_input_len = %lu\n", ilen);
83 printf(".globl z_output_len\n");
84 printf("z_output_len = %lu\n", (unsigned long)olen);
85 printf(".globl z_extract_offset\n");
86 printf("z_extract_offset = 0x%lx\n", offs);
87 /* z_extract_offset_negative allows simplification of head_32.S */
88 printf(".globl z_extract_offset_negative\n");
89 printf("z_extract_offset_negative = -0x%lx\n", offs);
90
91 printf(".globl input_data, input_data_end\n");
92 printf("input_data:\n");
93 printf(".incbin \"%s\"\n", argv[1]);
94 printf("input_data_end:\n");
95
96 return 0;
97}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux.lds.S
index bef1ac891bce..cc353e1b3ffd 100644
--- a/arch/x86/boot/compressed/vmlinux_64.lds
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,6 +1,17 @@
1OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") 1OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
2
3#undef i386
4
5#include <asm/page_types.h>
6
7#ifdef CONFIG_X86_64
2OUTPUT_ARCH(i386:x86-64) 8OUTPUT_ARCH(i386:x86-64)
3ENTRY(startup_64) 9ENTRY(startup_64)
10#else
11OUTPUT_ARCH(i386)
12ENTRY(startup_32)
13#endif
14
4SECTIONS 15SECTIONS
5{ 16{
6 /* Be careful parts of head_64.S assume startup_32 is at 17 /* Be careful parts of head_64.S assume startup_32 is at
@@ -33,16 +44,22 @@ SECTIONS
33 *(.data.*) 44 *(.data.*)
34 _edata = . ; 45 _edata = . ;
35 } 46 }
47 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
36 .bss : { 48 .bss : {
37 _bss = . ; 49 _bss = . ;
38 *(.bss) 50 *(.bss)
39 *(.bss.*) 51 *(.bss.*)
40 *(COMMON) 52 *(COMMON)
41 . = ALIGN(8); 53 . = ALIGN(8); /* For convenience during zeroing */
42 _end_before_pgt = . ;
43 . = ALIGN(4096);
44 pgtable = . ;
45 . = . + 4096 * 6;
46 _ebss = .; 54 _ebss = .;
47 } 55 }
56#ifdef CONFIG_X86_64
57 . = ALIGN(PAGE_SIZE);
58 .pgtable : {
59 _pgtable = . ;
60 *(.pgtable)
61 _epgtable = . ;
62 }
63#endif
64 _end = .;
48} 65}
diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr
deleted file mode 100644
index f02382ae5c48..000000000000
--- a/arch/x86/boot/compressed/vmlinux.scr
+++ /dev/null
@@ -1,10 +0,0 @@
1SECTIONS
2{
3 .rodata.compressed : {
4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .;
6 *(.data)
7 output_len = . - 4;
8 input_data_end = .;
9 }
10}
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
deleted file mode 100644
index bb3c48379c40..000000000000
--- a/arch/x86/boot/compressed/vmlinux_32.lds
+++ /dev/null
@@ -1,43 +0,0 @@
1OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
2OUTPUT_ARCH(i386)
3ENTRY(startup_32)
4SECTIONS
5{
6 /* Be careful parts of head_32.S assume startup_32 is at
7 * address 0.
8 */
9 . = 0;
10 .text.head : {
11 _head = . ;
12 *(.text.head)
13 _ehead = . ;
14 }
15 .rodata.compressed : {
16 *(.rodata.compressed)
17 }
18 .text : {
19 _text = .; /* Text */
20 *(.text)
21 *(.text.*)
22 _etext = . ;
23 }
24 .rodata : {
25 _rodata = . ;
26 *(.rodata) /* read-only data */
27 *(.rodata.*)
28 _erodata = . ;
29 }
30 .data : {
31 _data = . ;
32 *(.data)
33 *(.data.*)
34 _edata = . ;
35 }
36 .bss : {
37 _bss = . ;
38 *(.bss)
39 *(.bss.*)
40 *(COMMON)
41 _end = . ;
42 }
43}
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 1aae8f3e5ca1..c501a5b466f8 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -22,17 +23,17 @@
22 */ 23 */
23static int read_mbr(u8 devno, void *buf) 24static int read_mbr(u8 devno, void *buf)
24{ 25{
25 u16 ax, bx, cx, dx; 26 struct biosregs ireg, oreg;
26 27
27 ax = 0x0201; /* Legacy Read, one sector */ 28 initregs(&ireg);
28 cx = 0x0001; /* Sector 0-0-1 */ 29 ireg.ax = 0x0201; /* Legacy Read, one sector */
29 dx = devno; 30 ireg.cx = 0x0001; /* Sector 0-0-1 */
30 bx = (size_t)buf; 31 ireg.dl = devno;
31 asm volatile("pushfl; stc; int $0x13; setc %%al; popfl" 32 ireg.bx = (size_t)buf;
32 : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx)
33 : : "esi", "edi", "memory");
34 33
35 return -(u8)ax; /* 0 or -1 */ 34 intcall(0x13, &ireg, &oreg);
35
36 return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
36} 37}
37 38
38static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) 39static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
@@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
72 73
73static int get_edd_info(u8 devno, struct edd_info *ei) 74static int get_edd_info(u8 devno, struct edd_info *ei)
74{ 75{
75 u16 ax, bx, cx, dx, di; 76 struct biosregs ireg, oreg;
76 77
77 memset(ei, 0, sizeof *ei); 78 memset(ei, 0, sizeof *ei);
78 79
79 /* Check Extensions Present */ 80 /* Check Extensions Present */
80 81
81 ax = 0x4100; 82 initregs(&ireg);
82 bx = EDDMAGIC1; 83 ireg.ah = 0x41;
83 dx = devno; 84 ireg.bx = EDDMAGIC1;
84 asm("pushfl; stc; int $0x13; setc %%al; popfl" 85 ireg.dl = devno;
85 : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx) 86 intcall(0x13, &ireg, &oreg);
86 : : "esi", "edi");
87 87
88 if ((u8)ax) 88 if (oreg.eflags & X86_EFLAGS_CF)
89 return -1; /* No extended information */ 89 return -1; /* No extended information */
90 90
91 if (bx != EDDMAGIC2) 91 if (oreg.bx != EDDMAGIC2)
92 return -1; 92 return -1;
93 93
94 ei->device = devno; 94 ei->device = devno;
95 ei->version = ax >> 8; /* EDD version number */ 95 ei->version = oreg.ah; /* EDD version number */
96 ei->interface_support = cx; /* EDD functionality subsets */ 96 ei->interface_support = oreg.cx; /* EDD functionality subsets */
97 97
98 /* Extended Get Device Parameters */ 98 /* Extended Get Device Parameters */
99 99
100 ei->params.length = sizeof(ei->params); 100 ei->params.length = sizeof(ei->params);
101 ax = 0x4800; 101 ireg.ah = 0x48;
102 dx = devno; 102 ireg.si = (size_t)&ei->params;
103 asm("pushfl; int $0x13; popfl" 103 intcall(0x13, &ireg, &oreg);
104 : "+a" (ax), "+d" (dx), "=m" (ei->params)
105 : "S" (&ei->params)
106 : "ebx", "ecx", "edi");
107 104
108 /* Get legacy CHS parameters */ 105 /* Get legacy CHS parameters */
109 106
110 /* Ralf Brown recommends setting ES:DI to 0:0 */ 107 /* Ralf Brown recommends setting ES:DI to 0:0 */
111 ax = 0x0800; 108 ireg.ah = 0x08;
112 dx = devno; 109 ireg.es = 0;
113 di = 0; 110 intcall(0x13, &ireg, &oreg);
114 asm("pushw %%es; " 111
115 "movw %%di,%%es; " 112 if (!(oreg.eflags & X86_EFLAGS_CF)) {
116 "pushfl; stc; int $0x13; setc %%al; popfl; " 113 ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2);
117 "popw %%es" 114 ei->legacy_max_head = oreg.dh;
118 : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di) 115 ei->legacy_sectors_per_track = oreg.cl & 0x3f;
119 : : "esi");
120
121 if ((u8)ax == 0) {
122 ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2);
123 ei->legacy_max_head = dx >> 8;
124 ei->legacy_sectors_per_track = cx & 0x3f;
125 } 116 }
126 117
127 return 0; 118 return 0;
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 5d84d1c74e4c..b31cc54b4641 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -22,7 +22,8 @@
22#include <asm/page_types.h> 22#include <asm/page_types.h>
23#include <asm/setup.h> 23#include <asm/setup.h>
24#include "boot.h" 24#include "boot.h"
25#include "offsets.h" 25#include "voffset.h"
26#include "zoffset.h"
26 27
27BOOTSEG = 0x07C0 /* original address of boot-sector */ 28BOOTSEG = 0x07C0 /* original address of boot-sector */
28SYSSEG = 0x1000 /* historical load address >> 4 */ 29SYSSEG = 0x1000 /* historical load address >> 4 */
@@ -115,7 +116,7 @@ _start:
115 # Part 2 of the header, from the old setup.S 116 # Part 2 of the header, from the old setup.S
116 117
117 .ascii "HdrS" # header signature 118 .ascii "HdrS" # header signature
118 .word 0x0209 # header version number (>= 0x0105) 119 .word 0x020a # header version number (>= 0x0105)
119 # or else old loadlin-1.5 will fail) 120 # or else old loadlin-1.5 will fail)
120 .globl realmode_swtch 121 .globl realmode_swtch
121realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 122realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -168,7 +169,11 @@ heap_end_ptr: .word _end+STACK_SIZE-512
168 # end of setup code can be used by setup 169 # end of setup code can be used by setup
169 # for local heap purposes. 170 # for local heap purposes.
170 171
171pad1: .word 0 172ext_loader_ver:
173 .byte 0 # Extended boot loader version
174ext_loader_type:
175 .byte 0 # Extended boot loader type
176
172cmd_line_ptr: .long 0 # (Header version 0x0202 or later) 177cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
173 # If nonzero, a 32-bit pointer 178 # If nonzero, a 32-bit pointer
174 # to the kernel command line. 179 # to the kernel command line.
@@ -200,7 +205,7 @@ relocatable_kernel: .byte 1
200#else 205#else
201relocatable_kernel: .byte 0 206relocatable_kernel: .byte 0
202#endif 207#endif
203pad2: .byte 0 208min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment
204pad3: .word 0 209pad3: .word 0
205 210
206cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, 211cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
@@ -212,16 +217,27 @@ hardware_subarch: .long 0 # subarchitecture, added with 2.07
212 217
213hardware_subarch_data: .quad 0 218hardware_subarch_data: .quad 0
214 219
215payload_offset: .long input_data 220payload_offset: .long ZO_input_data
216payload_length: .long input_data_end-input_data 221payload_length: .long ZO_z_input_len
217 222
218setup_data: .quad 0 # 64-bit physical pointer to 223setup_data: .quad 0 # 64-bit physical pointer to
219 # single linked list of 224 # single linked list of
220 # struct setup_data 225 # struct setup_data
221 226
227pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
228
229#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset)
230#define VO_INIT_SIZE (VO__end - VO__text)
231#if ZO_INIT_SIZE > VO_INIT_SIZE
232#define INIT_SIZE ZO_INIT_SIZE
233#else
234#define INIT_SIZE VO_INIT_SIZE
235#endif
236init_size: .long INIT_SIZE # kernel initialization size
237
222# End of setup header ##################################################### 238# End of setup header #####################################################
223 239
224 .section ".inittext", "ax" 240 .section ".entrytext", "ax"
225start_of_setup: 241start_of_setup:
226#ifdef SAFE_RESET_DISK_CONTROLLER 242#ifdef SAFE_RESET_DISK_CONTROLLER
227# Reset the disk controller. 243# Reset the disk controller.
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 58f0415d3ae0..140172b895bd 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -61,11 +62,10 @@ static void copy_boot_params(void)
61 */ 62 */
62static void keyboard_set_repeat(void) 63static void keyboard_set_repeat(void)
63{ 64{
64 u16 ax = 0x0305; 65 struct biosregs ireg;
65 u16 bx = 0; 66 initregs(&ireg);
66 asm volatile("int $0x16" 67 ireg.ax = 0x0305;
67 : "+a" (ax), "+b" (bx) 68 intcall(0x16, &ireg, NULL);
68 : : "ecx", "edx", "esi", "edi");
69} 69}
70 70
71/* 71/*
@@ -73,18 +73,22 @@ static void keyboard_set_repeat(void)
73 */ 73 */
74static void query_ist(void) 74static void query_ist(void)
75{ 75{
76 struct biosregs ireg, oreg;
77
76 /* Some older BIOSes apparently crash on this call, so filter 78 /* Some older BIOSes apparently crash on this call, so filter
77 it from machines too old to have SpeedStep at all. */ 79 it from machines too old to have SpeedStep at all. */
78 if (cpu.level < 6) 80 if (cpu.level < 6)
79 return; 81 return;
80 82
81 asm("int $0x15" 83 initregs(&ireg);
82 : "=a" (boot_params.ist_info.signature), 84 ireg.ax = 0xe980; /* IST Support */
83 "=b" (boot_params.ist_info.command), 85 ireg.edx = 0x47534943; /* Request value */
84 "=c" (boot_params.ist_info.event), 86 intcall(0x15, &ireg, &oreg);
85 "=d" (boot_params.ist_info.perf_level) 87
86 : "a" (0x0000e980), /* IST Support */ 88 boot_params.ist_info.signature = oreg.eax;
87 "d" (0x47534943)); /* Request value */ 89 boot_params.ist_info.command = oreg.ebx;
90 boot_params.ist_info.event = oreg.ecx;
91 boot_params.ist_info.perf_level = oreg.edx;
88} 92}
89 93
90/* 94/*
@@ -93,13 +97,12 @@ static void query_ist(void)
93static void set_bios_mode(void) 97static void set_bios_mode(void)
94{ 98{
95#ifdef CONFIG_X86_64 99#ifdef CONFIG_X86_64
96 u32 eax, ebx; 100 struct biosregs ireg;
97 101
98 eax = 0xec00; 102 initregs(&ireg);
99 ebx = 2; 103 ireg.ax = 0xec00;
100 asm volatile("int $0x15" 104 ireg.bx = 2;
101 : "+a" (eax), "+b" (ebx) 105 intcall(0x15, &ireg, NULL);
102 : : "ecx", "edx", "esi", "edi");
103#endif 106#endif
104} 107}
105 108
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
index 911eaae5d696..a95a531148ef 100644
--- a/arch/x86/boot/mca.c
+++ b/arch/x86/boot/mca.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -16,26 +17,22 @@
16 17
17int query_mca(void) 18int query_mca(void)
18{ 19{
19 u8 err; 20 struct biosregs ireg, oreg;
20 u16 es, bx, len; 21 u16 len;
21 22
22 asm("pushw %%es ; " 23 initregs(&ireg);
23 "int $0x15 ; " 24 ireg.ah = 0xc0;
24 "setc %0 ; " 25 intcall(0x15, &ireg, &oreg);
25 "movw %%es, %1 ; " 26
26 "popw %%es" 27 if (oreg.eflags & X86_EFLAGS_CF)
27 : "=acd" (err), "=acdSD" (es), "=b" (bx)
28 : "a" (0xc000));
29
30 if (err)
31 return -1; /* No MCA present */ 28 return -1; /* No MCA present */
32 29
33 set_fs(es); 30 set_fs(oreg.es);
34 len = rdfs16(bx); 31 len = rdfs16(oreg.bx);
35 32
36 if (len > sizeof(boot_params.sys_desc_table)) 33 if (len > sizeof(boot_params.sys_desc_table))
37 len = sizeof(boot_params.sys_desc_table); 34 len = sizeof(boot_params.sys_desc_table);
38 35
39 copy_from_fs(&boot_params.sys_desc_table, bx, len); 36 copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
40 return 0; 37 return 0;
41} 38}
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 74b3d2ba84e9..cae3feb1035e 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -20,12 +20,16 @@
20static int detect_memory_e820(void) 20static int detect_memory_e820(void)
21{ 21{
22 int count = 0; 22 int count = 0;
23 u32 next = 0; 23 struct biosregs ireg, oreg;
24 u32 size, id, edi;
25 u8 err;
26 struct e820entry *desc = boot_params.e820_map; 24 struct e820entry *desc = boot_params.e820_map;
27 static struct e820entry buf; /* static so it is zeroed */ 25 static struct e820entry buf; /* static so it is zeroed */
28 26
27 initregs(&ireg);
28 ireg.ax = 0xe820;
29 ireg.cx = sizeof buf;
30 ireg.edx = SMAP;
31 ireg.di = (size_t)&buf;
32
29 /* 33 /*
30 * Note: at least one BIOS is known which assumes that the 34 * Note: at least one BIOS is known which assumes that the
31 * buffer pointed to by one e820 call is the same one as 35 * buffer pointed to by one e820 call is the same one as
@@ -41,22 +45,13 @@ static int detect_memory_e820(void)
41 */ 45 */
42 46
43 do { 47 do {
44 size = sizeof buf; 48 intcall(0x15, &ireg, &oreg);
45 49 ireg.ebx = oreg.ebx; /* for next iteration... */
46 /* Important: %edx and %esi are clobbered by some BIOSes,
47 so they must be either used for the error output
48 or explicitly marked clobbered. Given that, assume there
49 is something out there clobbering %ebp and %edi, too. */
50 asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
51 : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
52 "=D" (edi), "+m" (buf)
53 : "D" (&buf), "d" (SMAP), "a" (0xe820)
54 : "esi");
55 50
56 /* BIOSes which terminate the chain with CF = 1 as opposed 51 /* BIOSes which terminate the chain with CF = 1 as opposed
57 to %ebx = 0 don't always report the SMAP signature on 52 to %ebx = 0 don't always report the SMAP signature on
58 the final, failing, probe. */ 53 the final, failing, probe. */
59 if (err) 54 if (oreg.eflags & X86_EFLAGS_CF)
60 break; 55 break;
61 56
62 /* Some BIOSes stop returning SMAP in the middle of 57 /* Some BIOSes stop returning SMAP in the middle of
@@ -64,60 +59,64 @@ static int detect_memory_e820(void)
64 screwed up the map at that point, we might have a 59 screwed up the map at that point, we might have a
65 partial map, the full map, or complete garbage, so 60 partial map, the full map, or complete garbage, so
66 just return failure. */ 61 just return failure. */
67 if (id != SMAP) { 62 if (oreg.eax != SMAP) {
68 count = 0; 63 count = 0;
69 break; 64 break;
70 } 65 }
71 66
72 *desc++ = buf; 67 *desc++ = buf;
73 count++; 68 count++;
74 } while (next && count < ARRAY_SIZE(boot_params.e820_map)); 69 } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));
75 70
76 return boot_params.e820_entries = count; 71 return boot_params.e820_entries = count;
77} 72}
78 73
79static int detect_memory_e801(void) 74static int detect_memory_e801(void)
80{ 75{
81 u16 ax, bx, cx, dx; 76 struct biosregs ireg, oreg;
82 u8 err;
83 77
84 bx = cx = dx = 0; 78 initregs(&ireg);
85 ax = 0xe801; 79 ireg.ax = 0xe801;
86 asm("stc; int $0x15; setc %0" 80 intcall(0x15, &ireg, &oreg);
87 : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx));
88 81
89 if (err) 82 if (oreg.eflags & X86_EFLAGS_CF)
90 return -1; 83 return -1;
91 84
92 /* Do we really need to do this? */ 85 /* Do we really need to do this? */
93 if (cx || dx) { 86 if (oreg.cx || oreg.dx) {
94 ax = cx; 87 oreg.ax = oreg.cx;
95 bx = dx; 88 oreg.bx = oreg.dx;
96 } 89 }
97 90
98 if (ax > 15*1024) 91 if (oreg.ax > 15*1024) {
99 return -1; /* Bogus! */ 92 return -1; /* Bogus! */
100 93 } else if (oreg.ax == 15*1024) {
101 /* This ignores memory above 16MB if we have a memory hole 94 boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
102 there. If someone actually finds a machine with a memory 95 } else {
103 hole at 16MB and no support for 0E820h they should probably 96 /*
104 generate a fake e820 map. */ 97 * This ignores memory above 16MB if we have a memory
105 boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax; 98 * hole there. If someone actually finds a machine
99 * with a memory hole at 16MB and no support for
100 * 0E820h they should probably generate a fake e820
101 * map.
102 */
103 boot_params.alt_mem_k = oreg.ax;
104 }
106 105
107 return 0; 106 return 0;
108} 107}
109 108
110static int detect_memory_88(void) 109static int detect_memory_88(void)
111{ 110{
112 u16 ax; 111 struct biosregs ireg, oreg;
113 u8 err;
114 112
115 ax = 0x8800; 113 initregs(&ireg);
116 asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax)); 114 ireg.ah = 0x88;
115 intcall(0x15, &ireg, &oreg);
117 116
118 boot_params.screen_info.ext_mem_k = ax; 117 boot_params.screen_info.ext_mem_k = oreg.ax;
119 118
120 return -err; 119 return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
121} 120}
122 121
123int detect_memory(void) 122int detect_memory(void)
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c
new file mode 100644
index 000000000000..958019b1cfa5
--- /dev/null
+++ b/arch/x86/boot/regs.c
@@ -0,0 +1,29 @@
1/* -----------------------------------------------------------------------
2 *
3 * Copyright 2009 Intel Corporation; author H. Peter Anvin
4 *
5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2 or (at your
7 * option) any later version; incorporated herein by reference.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * Simple helper function for initializing a register set.
13 *
14 * Note that this sets EFLAGS_CF in the input register set; this
15 * makes it easier to catch functions which do nothing but don't
16 * explicitly set CF.
17 */
18
19#include "boot.h"
20
21void initregs(struct biosregs *reg)
22{
23 memset(reg, 0, sizeof *reg);
24 reg->eflags |= X86_EFLAGS_CF;
25 reg->ds = ds();
26 reg->es = ds();
27 reg->fs = fs();
28 reg->gs = gs();
29}
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index bb8dc2de7969..0f6ec455a2b1 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -15,8 +15,11 @@ SECTIONS
15 15
16 . = 497; 16 . = 497;
17 .header : { *(.header) } 17 .header : { *(.header) }
18 .entrytext : { *(.entrytext) }
18 .inittext : { *(.inittext) } 19 .inittext : { *(.inittext) }
19 .initdata : { *(.initdata) } 20 .initdata : { *(.initdata) }
21 __end_init = .;
22
20 .text : { *(.text) } 23 .text : { *(.text) }
21 .text32 : { *(.text32) } 24 .text32 : { *(.text32) }
22 25
@@ -52,4 +55,7 @@ SECTIONS
52 55
53 . = ASSERT(_end <= 0x8000, "Setup too big!"); 56 . = ASSERT(_end <= 0x8000, "Setup too big!");
54 . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); 57 . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
58 /* Necessary for the very-old-loader check to work... */
59 . = ASSERT(__end_init <= 5*512, "init sections too big!");
60
55} 61}
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 7e8e8b25f5f6..01ec69c901c7 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -22,24 +23,23 @@
22 23
23void __attribute__((section(".inittext"))) putchar(int ch) 24void __attribute__((section(".inittext"))) putchar(int ch)
24{ 25{
25 unsigned char c = ch; 26 struct biosregs ireg;
26 27
27 if (c == '\n') 28 if (ch == '\n')
28 putchar('\r'); /* \n -> \r\n */ 29 putchar('\r'); /* \n -> \r\n */
29 30
30 /* int $0x10 is known to have bugs involving touching registers 31 initregs(&ireg);
31 it shouldn't. Be extra conservative... */ 32 ireg.bx = 0x0007;
32 asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal" 33 ireg.cx = 0x0001;
33 : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch)); 34 ireg.ah = 0x0e;
35 ireg.al = ch;
36 intcall(0x10, &ireg, NULL);
34} 37}
35 38
36void __attribute__((section(".inittext"))) puts(const char *str) 39void __attribute__((section(".inittext"))) puts(const char *str)
37{ 40{
38 int n = 0; 41 while (*str)
39 while (*str) {
40 putchar(*str++); 42 putchar(*str++);
41 n++;
42 }
43} 43}
44 44
45/* 45/*
@@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str)
49 49
50static u8 gettime(void) 50static u8 gettime(void)
51{ 51{
52 u16 ax = 0x0200; 52 struct biosregs ireg, oreg;
53 u16 cx, dx;
54 53
55 asm volatile("int $0x1a" 54 initregs(&ireg);
56 : "+a" (ax), "=c" (cx), "=d" (dx) 55 ireg.ah = 0x02;
57 : : "ebx", "esi", "edi"); 56 intcall(0x1a, &ireg, &oreg);
58 57
59 return dx >> 8; 58 return oreg.dh;
60} 59}
61 60
62/* 61/*
@@ -64,19 +63,24 @@ static u8 gettime(void)
64 */ 63 */
65int getchar(void) 64int getchar(void)
66{ 65{
67 u16 ax = 0; 66 struct biosregs ireg, oreg;
68 asm volatile("int $0x16" : "+a" (ax)); 67
68 initregs(&ireg);
69 /* ireg.ah = 0x00; */
70 intcall(0x16, &ireg, &oreg);
69 71
70 return ax & 0xff; 72 return oreg.al;
71} 73}
72 74
73static int kbd_pending(void) 75static int kbd_pending(void)
74{ 76{
75 u8 pending; 77 struct biosregs ireg, oreg;
76 asm volatile("int $0x16; setnz %0" 78
77 : "=qm" (pending) 79 initregs(&ireg);
78 : "a" (0x0100)); 80 ireg.ah = 0x01;
79 return pending; 81 intcall(0x16, &ireg, &oreg);
82
83 return !(oreg.eflags & X86_EFLAGS_ZF);
80} 84}
81 85
82void kbd_flush(void) 86void kbd_flush(void)
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index 3fa979c9c363..d660be492363 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -29,21 +30,21 @@ static int bios_set_mode(struct mode_info *mi)
29 30
30static int set_bios_mode(u8 mode) 31static int set_bios_mode(u8 mode)
31{ 32{
32 u16 ax; 33 struct biosregs ireg, oreg;
33 u8 new_mode; 34 u8 new_mode;
34 35
35 ax = mode; /* AH=0x00 Set Video Mode */ 36 initregs(&ireg);
36 asm volatile(INT10 37 ireg.al = mode; /* AH=0x00 Set Video Mode */
37 : "+a" (ax) 38 intcall(0x10, &ireg, NULL);
38 : : "ebx", "ecx", "edx", "esi", "edi");
39 39
40 ax = 0x0f00; /* Get Current Video Mode */ 40
41 asm volatile(INT10 41 ireg.ah = 0x0f; /* Get Current Video Mode */
42 : "+a" (ax) 42 intcall(0x10, &ireg, &oreg);
43 : : "ebx", "ecx", "edx", "esi", "edi");
44 43
45 do_restore = 1; /* Assume video contents were lost */ 44 do_restore = 1; /* Assume video contents were lost */
46 new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */ 45
46 /* Not all BIOSes are clean with the top bit */
47 new_mode = ireg.al & 0x7f;
47 48
48 if (new_mode == mode) 49 if (new_mode == mode)
49 return 0; /* Mode change OK */ 50 return 0; /* Mode change OK */
@@ -53,10 +54,8 @@ static int set_bios_mode(u8 mode)
53 /* Mode setting failed, but we didn't end up where we 54 /* Mode setting failed, but we didn't end up where we
54 started. That's bad. Try to revert to the original 55 started. That's bad. Try to revert to the original
55 video mode. */ 56 video mode. */
56 ax = boot_params.screen_info.orig_video_mode; 57 ireg.ax = boot_params.screen_info.orig_video_mode;
57 asm volatile(INT10 58 intcall(0x10, &ireg, NULL);
58 : "+a" (ax)
59 : : "ebx", "ecx", "edx", "esi", "edi");
60 } 59 }
61#endif 60#endif
62 return -1; 61 return -1;
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 4a58c8ce3f69..c700147d6ffb 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {}
31static int vesa_probe(void) 32static int vesa_probe(void)
32{ 33{
33#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) 34#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
34 u16 ax, cx, di; 35 struct biosregs ireg, oreg;
35 u16 mode; 36 u16 mode;
36 addr_t mode_ptr; 37 addr_t mode_ptr;
37 struct mode_info *mi; 38 struct mode_info *mi;
@@ -39,13 +40,12 @@ static int vesa_probe(void)
39 40
40 video_vesa.modes = GET_HEAP(struct mode_info, 0); 41 video_vesa.modes = GET_HEAP(struct mode_info, 0);
41 42
42 ax = 0x4f00; 43 initregs(&ireg);
43 di = (size_t)&vginfo; 44 ireg.ax = 0x4f00;
44 asm(INT10 45 ireg.di = (size_t)&vginfo;
45 : "+a" (ax), "+D" (di), "=m" (vginfo) 46 intcall(0x10, &ireg, &oreg);
46 : : "ebx", "ecx", "edx", "esi");
47 47
48 if (ax != 0x004f || 48 if (ireg.ax != 0x004f ||
49 vginfo.signature != VESA_MAGIC || 49 vginfo.signature != VESA_MAGIC ||
50 vginfo.version < 0x0102) 50 vginfo.version < 0x0102)
51 return 0; /* Not present */ 51 return 0; /* Not present */
@@ -65,14 +65,12 @@ static int vesa_probe(void)
65 65
66 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ 66 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
67 67
68 ax = 0x4f01; 68 ireg.ax = 0x4f01;
69 cx = mode; 69 ireg.cx = mode;
70 di = (size_t)&vminfo; 70 ireg.di = (size_t)&vminfo;
71 asm(INT10 71 intcall(0x10, &ireg, &oreg);
72 : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
73 : : "ebx", "edx", "esi");
74 72
75 if (ax != 0x004f) 73 if (ireg.ax != 0x004f)
76 continue; 74 continue;
77 75
78 if ((vminfo.mode_attr & 0x15) == 0x05) { 76 if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -111,20 +109,19 @@ static int vesa_probe(void)
111 109
112static int vesa_set_mode(struct mode_info *mode) 110static int vesa_set_mode(struct mode_info *mode)
113{ 111{
114 u16 ax, bx, cx, di; 112 struct biosregs ireg, oreg;
115 int is_graphic; 113 int is_graphic;
116 u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA; 114 u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
117 115
118 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ 116 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
119 117
120 ax = 0x4f01; 118 initregs(&ireg);
121 cx = vesa_mode; 119 ireg.ax = 0x4f01;
122 di = (size_t)&vminfo; 120 ireg.cx = vesa_mode;
123 asm(INT10 121 ireg.di = (size_t)&vminfo;
124 : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo) 122 intcall(0x10, &ireg, &oreg);
125 : : "ebx", "edx", "esi");
126 123
127 if (ax != 0x004f) 124 if (oreg.ax != 0x004f)
128 return -1; 125 return -1;
129 126
130 if ((vminfo.mode_attr & 0x15) == 0x05) { 127 if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode)
141 } 138 }
142 139
143 140
144 ax = 0x4f02; 141 initregs(&ireg);
145 bx = vesa_mode; 142 ireg.ax = 0x4f02;
146 di = 0; 143 ireg.bx = vesa_mode;
147 asm volatile(INT10 144 intcall(0x10, &ireg, &oreg);
148 : "+a" (ax), "+b" (bx), "+D" (di)
149 : : "ecx", "edx", "esi");
150 145
151 if (ax != 0x004f) 146 if (oreg.ax != 0x004f)
152 return -1; 147 return -1;
153 148
154 graphic_mode = is_graphic; 149 graphic_mode = is_graphic;
@@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode)
171/* Switch DAC to 8-bit mode */ 166/* Switch DAC to 8-bit mode */
172static void vesa_dac_set_8bits(void) 167static void vesa_dac_set_8bits(void)
173{ 168{
169 struct biosregs ireg, oreg;
174 u8 dac_size = 6; 170 u8 dac_size = 6;
175 171
176 /* If possible, switch the DAC to 8-bit mode */ 172 /* If possible, switch the DAC to 8-bit mode */
177 if (vginfo.capabilities & 1) { 173 if (vginfo.capabilities & 1) {
178 u16 ax, bx; 174 initregs(&ireg);
179 175 ireg.ax = 0x4f08;
180 ax = 0x4f08; 176 ireg.bh = 0x08;
181 bx = 0x0800; 177 intcall(0x10, &ireg, &oreg);
182 asm volatile(INT10 178 if (oreg.ax == 0x004f)
183 : "+a" (ax), "+b" (bx) 179 dac_size = oreg.bh;
184 : : "ecx", "edx", "esi", "edi");
185
186 if (ax == 0x004f)
187 dac_size = bx >> 8;
188 } 180 }
189 181
190 /* Set the color sizes to the DAC size, and offsets to 0 */ 182 /* Set the color sizes to the DAC size, and offsets to 0 */
191 boot_params.screen_info.red_size = dac_size; 183 boot_params.screen_info.red_size = dac_size;
192 boot_params.screen_info.green_size = dac_size; 184 boot_params.screen_info.green_size = dac_size;
193 boot_params.screen_info.blue_size = dac_size; 185 boot_params.screen_info.blue_size = dac_size;
194 boot_params.screen_info.rsvd_size = dac_size; 186 boot_params.screen_info.rsvd_size = dac_size;
195 187
196 boot_params.screen_info.red_pos = 0; 188 boot_params.screen_info.red_pos = 0;
197 boot_params.screen_info.green_pos = 0; 189 boot_params.screen_info.green_pos = 0;
198 boot_params.screen_info.blue_pos = 0; 190 boot_params.screen_info.blue_pos = 0;
199 boot_params.screen_info.rsvd_pos = 0; 191 boot_params.screen_info.rsvd_pos = 0;
200} 192}
201 193
202/* Save the VESA protected mode info */ 194/* Save the VESA protected mode info */
203static void vesa_store_pm_info(void) 195static void vesa_store_pm_info(void)
204{ 196{
205 u16 ax, bx, di, es; 197 struct biosregs ireg, oreg;
206 198
207 ax = 0x4f0a; 199 initregs(&ireg);
208 bx = di = 0; 200 ireg.ax = 0x4f0a;
209 asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es" 201 intcall(0x10, &ireg, &oreg);
210 : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di)
211 : : "ecx", "esi");
212 202
213 if (ax != 0x004f) 203 if (oreg.ax != 0x004f)
214 return; 204 return;
215 205
216 boot_params.screen_info.vesapm_seg = es; 206 boot_params.screen_info.vesapm_seg = oreg.es;
217 boot_params.screen_info.vesapm_off = di; 207 boot_params.screen_info.vesapm_off = oreg.di;
218} 208}
219 209
220/* 210/*
@@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void)
252void vesa_store_edid(void) 242void vesa_store_edid(void)
253{ 243{
254#ifdef CONFIG_FIRMWARE_EDID 244#ifdef CONFIG_FIRMWARE_EDID
255 u16 ax, bx, cx, dx, di; 245 struct biosregs ireg, oreg;
256 246
257 /* Apparently used as a nonsense token... */ 247 /* Apparently used as a nonsense token... */
258 memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info); 248 memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
@@ -260,33 +250,26 @@ void vesa_store_edid(void)
260 if (vginfo.version < 0x0200) 250 if (vginfo.version < 0x0200)
261 return; /* EDID requires VBE 2.0+ */ 251 return; /* EDID requires VBE 2.0+ */
262 252
263 ax = 0x4f15; /* VBE DDC */ 253 initregs(&ireg);
264 bx = 0x0000; /* Report DDC capabilities */ 254 ireg.ax = 0x4f15; /* VBE DDC */
265 cx = 0; /* Controller 0 */ 255 /* ireg.bx = 0x0000; */ /* Report DDC capabilities */
266 di = 0; /* ES:DI must be 0 by spec */ 256 /* ireg.cx = 0; */ /* Controller 0 */
267 257 ireg.es = 0; /* ES:DI must be 0 by spec */
268 /* Note: The VBE DDC spec is different from the main VESA spec; 258 intcall(0x10, &ireg, &oreg);
269 we genuinely have to assume all registers are destroyed here. */
270
271 asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
272 : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di)
273 : : "esi", "edx");
274 259
275 if (ax != 0x004f) 260 if (oreg.ax != 0x004f)
276 return; /* No EDID */ 261 return; /* No EDID */
277 262
278 /* BH = time in seconds to transfer EDD information */ 263 /* BH = time in seconds to transfer EDD information */
279 /* BL = DDC level supported */ 264 /* BL = DDC level supported */
280 265
281 ax = 0x4f15; /* VBE DDC */ 266 ireg.ax = 0x4f15; /* VBE DDC */
282 bx = 0x0001; /* Read EDID */ 267 ireg.bx = 0x0001; /* Read EDID */
283 cx = 0; /* Controller 0 */ 268 /* ireg.cx = 0; */ /* Controller 0 */
284 dx = 0; /* EDID block number */ 269 /* ireg.dx = 0; */ /* EDID block number */
285 di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */ 270 ireg.es = ds();
286 asm(INT10 271 ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */
287 : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info), 272 intcall(0x10, &ireg, &oreg);
288 "+c" (cx), "+D" (di)
289 : : "esi");
290#endif /* CONFIG_FIRMWARE_EDID */ 273#endif /* CONFIG_FIRMWARE_EDID */
291} 274}
292 275
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 9e0587a37768..8f8d827e254d 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -39,30 +40,30 @@ static __videocard video_vga;
39/* Set basic 80x25 mode */ 40/* Set basic 80x25 mode */
40static u8 vga_set_basic_mode(void) 41static u8 vga_set_basic_mode(void)
41{ 42{
43 struct biosregs ireg, oreg;
42 u16 ax; 44 u16 ax;
43 u8 rows; 45 u8 rows;
44 u8 mode; 46 u8 mode;
45 47
48 initregs(&ireg);
49
46#ifdef CONFIG_VIDEO_400_HACK 50#ifdef CONFIG_VIDEO_400_HACK
47 if (adapter >= ADAPTER_VGA) { 51 if (adapter >= ADAPTER_VGA) {
48 asm volatile(INT10 52 ireg.ax = 0x1202;
49 : : "a" (0x1202), "b" (0x0030) 53 ireg.bx = 0x0030;
50 : "ecx", "edx", "esi", "edi"); 54 intcall(0x10, &ireg, NULL);
51 } 55 }
52#endif 56#endif
53 57
54 ax = 0x0f00; 58 ax = 0x0f00;
55 asm volatile(INT10 59 intcall(0x10, &ireg, &oreg);
56 : "+a" (ax) 60 mode = oreg.al;
57 : : "ebx", "ecx", "edx", "esi", "edi");
58
59 mode = (u8)ax;
60 61
61 set_fs(0); 62 set_fs(0);
62 rows = rdfs8(0x484); /* rows minus one */ 63 rows = rdfs8(0x484); /* rows minus one */
63 64
64#ifndef CONFIG_VIDEO_400_HACK 65#ifndef CONFIG_VIDEO_400_HACK
65 if ((ax == 0x5003 || ax == 0x5007) && 66 if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
66 (rows == 0 || rows == 24)) 67 (rows == 0 || rows == 24))
67 return mode; 68 return mode;
68#endif 69#endif
@@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void)
71 mode = 3; 72 mode = 3;
72 73
73 /* Set the mode */ 74 /* Set the mode */
74 ax = mode; 75 ireg.ax = mode; /* AH=0: set mode */
75 asm volatile(INT10 76 intcall(0x10, &ireg, NULL);
76 : "+a" (ax)
77 : : "ebx", "ecx", "edx", "esi", "edi");
78 do_restore = 1; 77 do_restore = 1;
79 return mode; 78 return mode;
80} 79}
@@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void)
82static void vga_set_8font(void) 81static void vga_set_8font(void)
83{ 82{
84 /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */ 83 /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
84 struct biosregs ireg;
85
86 initregs(&ireg);
85 87
86 /* Set 8x8 font */ 88 /* Set 8x8 font */
87 asm volatile(INT10 : : "a" (0x1112), "b" (0)); 89 ireg.ax = 0x1112;
90 /* ireg.bl = 0; */
91 intcall(0x10, &ireg, NULL);
88 92
89 /* Use alternate print screen */ 93 /* Use alternate print screen */
90 asm volatile(INT10 : : "a" (0x1200), "b" (0x20)); 94 ireg.ax = 0x1200;
95 ireg.bl = 0x20;
96 intcall(0x10, &ireg, NULL);
91 97
92 /* Turn off cursor emulation */ 98 /* Turn off cursor emulation */
93 asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); 99 ireg.ax = 0x1201;
100 ireg.bl = 0x34;
101 intcall(0x10, &ireg, NULL);
94 102
95 /* Cursor is scan lines 6-7 */ 103 /* Cursor is scan lines 6-7 */
96 asm volatile(INT10 : : "a" (0x0100), "c" (0x0607)); 104 ireg.ax = 0x0100;
105 ireg.cx = 0x0607;
106 intcall(0x10, &ireg, NULL);
97} 107}
98 108
99static void vga_set_14font(void) 109static void vga_set_14font(void)
100{ 110{
101 /* Set 9x14 font - 80x28 on VGA */ 111 /* Set 9x14 font - 80x28 on VGA */
112 struct biosregs ireg;
113
114 initregs(&ireg);
102 115
103 /* Set 9x14 font */ 116 /* Set 9x14 font */
104 asm volatile(INT10 : : "a" (0x1111), "b" (0)); 117 ireg.ax = 0x1111;
118 /* ireg.bl = 0; */
119 intcall(0x10, &ireg, NULL);
105 120
106 /* Turn off cursor emulation */ 121 /* Turn off cursor emulation */
107 asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); 122 ireg.ax = 0x1201;
123 ireg.bl = 0x34;
124 intcall(0x10, &ireg, NULL);
108 125
109 /* Cursor is scan lines 11-12 */ 126 /* Cursor is scan lines 11-12 */
110 asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c)); 127 ireg.ax = 0x0100;
128 ireg.cx = 0x0b0c;
129 intcall(0x10, &ireg, NULL);
111} 130}
112 131
113static void vga_set_80x43(void) 132static void vga_set_80x43(void)
114{ 133{
115 /* Set 80x43 mode on VGA (not EGA) */ 134 /* Set 80x43 mode on VGA (not EGA) */
135 struct biosregs ireg;
136
137 initregs(&ireg);
116 138
117 /* Set 350 scans */ 139 /* Set 350 scans */
118 asm volatile(INT10 : : "a" (0x1201), "b" (0x30)); 140 ireg.ax = 0x1201;
141 ireg.bl = 0x30;
142 intcall(0x10, &ireg, NULL);
119 143
120 /* Reset video mode */ 144 /* Reset video mode */
121 asm volatile(INT10 : : "a" (0x0003)); 145 ireg.ax = 0x0003;
146 intcall(0x10, &ireg, NULL);
122 147
123 vga_set_8font(); 148 vga_set_8font();
124} 149}
@@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode)
225 */ 250 */
226static int vga_probe(void) 251static int vga_probe(void)
227{ 252{
228 u16 ega_bx;
229
230 static const char *card_name[] = { 253 static const char *card_name[] = {
231 "CGA/MDA/HGC", "EGA", "VGA" 254 "CGA/MDA/HGC", "EGA", "VGA"
232 }; 255 };
@@ -240,26 +263,26 @@ static int vga_probe(void)
240 sizeof(ega_modes)/sizeof(struct mode_info), 263 sizeof(ega_modes)/sizeof(struct mode_info),
241 sizeof(vga_modes)/sizeof(struct mode_info), 264 sizeof(vga_modes)/sizeof(struct mode_info),
242 }; 265 };
243 u8 vga_flag;
244 266
245 asm(INT10 267 struct biosregs ireg, oreg;
246 : "=b" (ega_bx) 268
247 : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */ 269 initregs(&ireg);
248 : "ecx", "edx", "esi", "edi"); 270
271 ireg.ax = 0x1200;
272 ireg.bl = 0x10; /* Check EGA/VGA */
273 intcall(0x10, &ireg, &oreg);
249 274
250#ifndef _WAKEUP 275#ifndef _WAKEUP
251 boot_params.screen_info.orig_video_ega_bx = ega_bx; 276 boot_params.screen_info.orig_video_ega_bx = oreg.bx;
252#endif 277#endif
253 278
254 /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */ 279 /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
255 if ((u8)ega_bx != 0x10) { 280 if (oreg.bl != 0x10) {
256 /* EGA/VGA */ 281 /* EGA/VGA */
257 asm(INT10 282 ireg.ax = 0x1a00;
258 : "=a" (vga_flag) 283 intcall(0x10, &ireg, &oreg);
259 : "a" (0x1a00)
260 : "ebx", "ecx", "edx", "esi", "edi");
261 284
262 if (vga_flag == 0x1a) { 285 if (oreg.al == 0x1a) {
263 adapter = ADAPTER_VGA; 286 adapter = ADAPTER_VGA;
264#ifndef _WAKEUP 287#ifndef _WAKEUP
265 boot_params.screen_info.orig_video_isVGA = 1; 288 boot_params.screen_info.orig_video_isVGA = 1;
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 3bef2c1febe9..bad728b76fc2 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -18,33 +19,29 @@
18 19
19static void store_cursor_position(void) 20static void store_cursor_position(void)
20{ 21{
21 u16 curpos; 22 struct biosregs ireg, oreg;
22 u16 ax, bx;
23 23
24 ax = 0x0300; 24 initregs(&ireg);
25 bx = 0; 25 ireg.ah = 0x03;
26 asm(INT10 26 intcall(0x10, &ireg, &oreg);
27 : "=d" (curpos), "+a" (ax), "+b" (bx)
28 : : "ecx", "esi", "edi");
29 27
30 boot_params.screen_info.orig_x = curpos; 28 boot_params.screen_info.orig_x = oreg.dl;
31 boot_params.screen_info.orig_y = curpos >> 8; 29 boot_params.screen_info.orig_y = oreg.dh;
32} 30}
33 31
34static void store_video_mode(void) 32static void store_video_mode(void)
35{ 33{
36 u16 ax, page; 34 struct biosregs ireg, oreg;
37 35
38 /* N.B.: the saving of the video page here is a bit silly, 36 /* N.B.: the saving of the video page here is a bit silly,
39 since we pretty much assume page 0 everywhere. */ 37 since we pretty much assume page 0 everywhere. */
40 ax = 0x0f00; 38 initregs(&ireg);
41 asm(INT10 39 ireg.ah = 0x0f;
42 : "+a" (ax), "=b" (page) 40 intcall(0x10, &ireg, &oreg);
43 : : "ecx", "edx", "esi", "edi");
44 41
45 /* Not all BIOSes are clean with respect to the top bit */ 42 /* Not all BIOSes are clean with respect to the top bit */
46 boot_params.screen_info.orig_video_mode = ax & 0x7f; 43 boot_params.screen_info.orig_video_mode = oreg.al & 0x7f;
47 boot_params.screen_info.orig_video_page = page >> 8; 44 boot_params.screen_info.orig_video_page = oreg.bh;
48} 45}
49 46
50/* 47/*
@@ -257,7 +254,7 @@ static void restore_screen(void)
257 int y; 254 int y;
258 addr_t dst = 0; 255 addr_t dst = 0;
259 u16 *src = saved.data; 256 u16 *src = saved.data;
260 u16 ax, bx, dx; 257 struct biosregs ireg;
261 258
262 if (graphic_mode) 259 if (graphic_mode)
263 return; /* Can't restore onto a graphic mode */ 260 return; /* Can't restore onto a graphic mode */
@@ -296,12 +293,11 @@ static void restore_screen(void)
296 } 293 }
297 294
298 /* Restore cursor position */ 295 /* Restore cursor position */
299 ax = 0x0200; /* Set cursor position */ 296 initregs(&ireg);
300 bx = 0; /* Page number (<< 8) */ 297 ireg.ah = 0x02; /* Set cursor position */
301 dx = (saved.cury << 8)+saved.curx; 298 ireg.dh = saved.cury;
302 asm volatile(INT10 299 ireg.dl = saved.curx;
303 : "+a" (ax), "+b" (bx), "+d" (dx) 300 intcall(0x10, &ireg, NULL);
304 : : "ecx", "esi", "edi");
305} 301}
306#else 302#else
307#define save_screen() ((void)0) 303#define save_screen() ((void)0)
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index ee63f5d14461..5bb174a997fc 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -112,20 +112,6 @@ extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
112extern int do_restore; /* Restore screen contents */ 112extern int do_restore; /* Restore screen contents */
113extern int graphic_mode; /* Graphics mode with linear frame buffer */ 113extern int graphic_mode; /* Graphics mode with linear frame buffer */
114 114
115/*
116 * int $0x10 is notorious for touching registers it shouldn't.
117 * gcc doesn't like %ebp being clobbered, so define it as a push/pop
118 * sequence here.
119 *
120 * A number of systems, including the original PC can clobber %bp in
121 * certain circumstances, like when scrolling. There exists at least
122 * one Trident video card which could clobber DS under a set of
123 * circumstances that we are unlikely to encounter (scrolling when
124 * using an extended graphics mode of more than 800x600 pixels), but
125 * it's cheap insurance to deal with that here.
126 */
127#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
128
129/* Accessing VGA indexed registers */ 115/* Accessing VGA indexed registers */
130static inline u8 in_idx(u16 port, u8 index) 116static inline u8 in_idx(u16 port, u8 index)
131{ 117{
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 235b81d0f6f2..edb992ebef92 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,12 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.29-rc4 3# Linux kernel version: 2.6.30-rc2
4# Tue Feb 24 15:50:58 2009 4# Mon May 11 16:21:55 2009
5# 5#
6# CONFIG_64BIT is not set 6# CONFIG_64BIT is not set
7CONFIG_X86_32=y 7CONFIG_X86_32=y
8# CONFIG_X86_64 is not set 8# CONFIG_X86_64 is not set
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_OUTPUT_FORMAT="elf32-i386"
10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig" 11CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
11CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
12CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
@@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
33CONFIG_ARCH_HAS_DEFAULT_IDLE=y 34CONFIG_ARCH_HAS_DEFAULT_IDLE=y
34CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y 35CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
35CONFIG_HAVE_SETUP_PER_CPU_AREA=y 36CONFIG_HAVE_SETUP_PER_CPU_AREA=y
37CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
36# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set 38# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
37CONFIG_ARCH_HIBERNATION_POSSIBLE=y 39CONFIG_ARCH_HIBERNATION_POSSIBLE=y
38CONFIG_ARCH_SUSPEND_POSSIBLE=y 40CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
40CONFIG_ARCH_POPULATES_NODE_MAP=y 42CONFIG_ARCH_POPULATES_NODE_MAP=y
41# CONFIG_AUDIT_ARCH is not set 43# CONFIG_AUDIT_ARCH is not set
42CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y 44CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
45CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
43CONFIG_GENERIC_HARDIRQS=y 46CONFIG_GENERIC_HARDIRQS=y
47CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
44CONFIG_GENERIC_IRQ_PROBE=y 48CONFIG_GENERIC_IRQ_PROBE=y
45CONFIG_GENERIC_PENDING_IRQ=y 49CONFIG_GENERIC_PENDING_IRQ=y
46CONFIG_X86_SMP=y
47CONFIG_USE_GENERIC_SMP_HELPERS=y 50CONFIG_USE_GENERIC_SMP_HELPERS=y
48CONFIG_X86_32_SMP=y 51CONFIG_X86_32_SMP=y
49CONFIG_X86_HT=y 52CONFIG_X86_HT=y
50CONFIG_X86_BIOS_REBOOT=y
51CONFIG_X86_TRAMPOLINE=y 53CONFIG_X86_TRAMPOLINE=y
54CONFIG_X86_32_LAZY_GS=y
52CONFIG_KTIME_SCALAR=y 55CONFIG_KTIME_SCALAR=y
53CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
54 57
@@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
60CONFIG_INIT_ENV_ARG_LIMIT=32 63CONFIG_INIT_ENV_ARG_LIMIT=32
61CONFIG_LOCALVERSION="" 64CONFIG_LOCALVERSION=""
62# CONFIG_LOCALVERSION_AUTO is not set 65# CONFIG_LOCALVERSION_AUTO is not set
66CONFIG_HAVE_KERNEL_GZIP=y
67CONFIG_HAVE_KERNEL_BZIP2=y
68CONFIG_HAVE_KERNEL_LZMA=y
69CONFIG_KERNEL_GZIP=y
70# CONFIG_KERNEL_BZIP2 is not set
71# CONFIG_KERNEL_LZMA is not set
63CONFIG_SWAP=y 72CONFIG_SWAP=y
64CONFIG_SYSVIPC=y 73CONFIG_SYSVIPC=y
65CONFIG_SYSVIPC_SYSCTL=y 74CONFIG_SYSVIPC_SYSCTL=y
66CONFIG_POSIX_MQUEUE=y 75CONFIG_POSIX_MQUEUE=y
76CONFIG_POSIX_MQUEUE_SYSCTL=y
67CONFIG_BSD_PROCESS_ACCT=y 77CONFIG_BSD_PROCESS_ACCT=y
68# CONFIG_BSD_PROCESS_ACCT_V3 is not set 78# CONFIG_BSD_PROCESS_ACCT_V3 is not set
69CONFIG_TASKSTATS=y 79CONFIG_TASKSTATS=y
@@ -113,23 +123,26 @@ CONFIG_PID_NS=y
113CONFIG_NET_NS=y 123CONFIG_NET_NS=y
114CONFIG_BLK_DEV_INITRD=y 124CONFIG_BLK_DEV_INITRD=y
115CONFIG_INITRAMFS_SOURCE="" 125CONFIG_INITRAMFS_SOURCE=""
126CONFIG_RD_GZIP=y
127CONFIG_RD_BZIP2=y
128CONFIG_RD_LZMA=y
116CONFIG_CC_OPTIMIZE_FOR_SIZE=y 129CONFIG_CC_OPTIMIZE_FOR_SIZE=y
117CONFIG_SYSCTL=y 130CONFIG_SYSCTL=y
131CONFIG_ANON_INODES=y
118# CONFIG_EMBEDDED is not set 132# CONFIG_EMBEDDED is not set
119CONFIG_UID16=y 133CONFIG_UID16=y
120CONFIG_SYSCTL_SYSCALL=y 134CONFIG_SYSCTL_SYSCALL=y
121CONFIG_KALLSYMS=y 135CONFIG_KALLSYMS=y
122CONFIG_KALLSYMS_ALL=y 136CONFIG_KALLSYMS_ALL=y
123CONFIG_KALLSYMS_EXTRA_PASS=y 137CONFIG_KALLSYMS_EXTRA_PASS=y
138# CONFIG_STRIP_ASM_SYMS is not set
124CONFIG_HOTPLUG=y 139CONFIG_HOTPLUG=y
125CONFIG_PRINTK=y 140CONFIG_PRINTK=y
126CONFIG_BUG=y 141CONFIG_BUG=y
127CONFIG_ELF_CORE=y 142CONFIG_ELF_CORE=y
128CONFIG_PCSPKR_PLATFORM=y 143CONFIG_PCSPKR_PLATFORM=y
129# CONFIG_COMPAT_BRK is not set
130CONFIG_BASE_FULL=y 144CONFIG_BASE_FULL=y
131CONFIG_FUTEX=y 145CONFIG_FUTEX=y
132CONFIG_ANON_INODES=y
133CONFIG_EPOLL=y 146CONFIG_EPOLL=y
134CONFIG_SIGNALFD=y 147CONFIG_SIGNALFD=y
135CONFIG_TIMERFD=y 148CONFIG_TIMERFD=y
@@ -139,6 +152,7 @@ CONFIG_AIO=y
139CONFIG_VM_EVENT_COUNTERS=y 152CONFIG_VM_EVENT_COUNTERS=y
140CONFIG_PCI_QUIRKS=y 153CONFIG_PCI_QUIRKS=y
141CONFIG_SLUB_DEBUG=y 154CONFIG_SLUB_DEBUG=y
155# CONFIG_COMPAT_BRK is not set
142# CONFIG_SLAB is not set 156# CONFIG_SLAB is not set
143CONFIG_SLUB=y 157CONFIG_SLUB=y
144# CONFIG_SLOB is not set 158# CONFIG_SLOB is not set
@@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
154CONFIG_HAVE_KPROBES=y 168CONFIG_HAVE_KPROBES=y
155CONFIG_HAVE_KRETPROBES=y 169CONFIG_HAVE_KRETPROBES=y
156CONFIG_HAVE_ARCH_TRACEHOOK=y 170CONFIG_HAVE_ARCH_TRACEHOOK=y
171CONFIG_HAVE_DMA_API_DEBUG=y
172# CONFIG_SLOW_WORK is not set
157CONFIG_HAVE_GENERIC_DMA_COHERENT=y 173CONFIG_HAVE_GENERIC_DMA_COHERENT=y
158CONFIG_SLABINFO=y 174CONFIG_SLABINFO=y
159CONFIG_RT_MUTEXES=y 175CONFIG_RT_MUTEXES=y
@@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
167CONFIG_STOP_MACHINE=y 183CONFIG_STOP_MACHINE=y
168CONFIG_BLOCK=y 184CONFIG_BLOCK=y
169# CONFIG_LBD is not set 185# CONFIG_LBD is not set
170CONFIG_BLK_DEV_IO_TRACE=y
171CONFIG_BLK_DEV_BSG=y 186CONFIG_BLK_DEV_BSG=y
172# CONFIG_BLK_DEV_INTEGRITY is not set 187# CONFIG_BLK_DEV_INTEGRITY is not set
173 188
@@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y
194CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 209CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
195CONFIG_SMP=y 210CONFIG_SMP=y
196CONFIG_SPARSE_IRQ=y 211CONFIG_SPARSE_IRQ=y
197CONFIG_X86_FIND_SMP_CONFIG=y
198CONFIG_X86_MPPARSE=y 212CONFIG_X86_MPPARSE=y
213# CONFIG_X86_BIGSMP is not set
214CONFIG_X86_EXTENDED_PLATFORM=y
199# CONFIG_X86_ELAN is not set 215# CONFIG_X86_ELAN is not set
200# CONFIG_X86_GENERICARCH is not set
201# CONFIG_X86_VSMP is not set
202# CONFIG_X86_RDC321X is not set 216# CONFIG_X86_RDC321X is not set
217# CONFIG_X86_32_NON_STANDARD is not set
203CONFIG_SCHED_OMIT_FRAME_POINTER=y 218CONFIG_SCHED_OMIT_FRAME_POINTER=y
204# CONFIG_PARAVIRT_GUEST is not set 219# CONFIG_PARAVIRT_GUEST is not set
205# CONFIG_MEMTEST is not set 220# CONFIG_MEMTEST is not set
@@ -230,8 +245,10 @@ CONFIG_M686=y
230# CONFIG_GENERIC_CPU is not set 245# CONFIG_GENERIC_CPU is not set
231CONFIG_X86_GENERIC=y 246CONFIG_X86_GENERIC=y
232CONFIG_X86_CPU=y 247CONFIG_X86_CPU=y
248CONFIG_X86_L1_CACHE_BYTES=64
249CONFIG_X86_INTERNODE_CACHE_BYTES=64
233CONFIG_X86_CMPXCHG=y 250CONFIG_X86_CMPXCHG=y
234CONFIG_X86_L1_CACHE_SHIFT=7 251CONFIG_X86_L1_CACHE_SHIFT=5
235CONFIG_X86_XADD=y 252CONFIG_X86_XADD=y
236# CONFIG_X86_PPRO_FENCE is not set 253# CONFIG_X86_PPRO_FENCE is not set
237CONFIG_X86_WP_WORKS_OK=y 254CONFIG_X86_WP_WORKS_OK=y
@@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y
247CONFIG_CPU_SUP_INTEL=y 264CONFIG_CPU_SUP_INTEL=y
248CONFIG_CPU_SUP_CYRIX_32=y 265CONFIG_CPU_SUP_CYRIX_32=y
249CONFIG_CPU_SUP_AMD=y 266CONFIG_CPU_SUP_AMD=y
250CONFIG_CPU_SUP_CENTAUR_32=y 267CONFIG_CPU_SUP_CENTAUR=y
251CONFIG_CPU_SUP_TRANSMETA_32=y 268CONFIG_CPU_SUP_TRANSMETA_32=y
252CONFIG_CPU_SUP_UMC_32=y 269CONFIG_CPU_SUP_UMC_32=y
253CONFIG_X86_DS=y 270CONFIG_X86_DS=y
@@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y
279CONFIG_MICROCODE_OLD_INTERFACE=y 296CONFIG_MICROCODE_OLD_INTERFACE=y
280CONFIG_X86_MSR=y 297CONFIG_X86_MSR=y
281CONFIG_X86_CPUID=y 298CONFIG_X86_CPUID=y
299# CONFIG_X86_CPU_DEBUG is not set
282# CONFIG_NOHIGHMEM is not set 300# CONFIG_NOHIGHMEM is not set
283CONFIG_HIGHMEM4G=y 301CONFIG_HIGHMEM4G=y
284# CONFIG_HIGHMEM64G is not set 302# CONFIG_HIGHMEM64G is not set
@@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1
302CONFIG_BOUNCE=y 320CONFIG_BOUNCE=y
303CONFIG_VIRT_TO_BUS=y 321CONFIG_VIRT_TO_BUS=y
304CONFIG_UNEVICTABLE_LRU=y 322CONFIG_UNEVICTABLE_LRU=y
323CONFIG_HAVE_MLOCK=y
324CONFIG_HAVE_MLOCKED_PAGE_BIT=y
305CONFIG_HIGHPTE=y 325CONFIG_HIGHPTE=y
306CONFIG_X86_CHECK_BIOS_CORRUPTION=y 326CONFIG_X86_CHECK_BIOS_CORRUPTION=y
307CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y 327CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
@@ -312,6 +332,7 @@ CONFIG_MTRR=y
312CONFIG_X86_PAT=y 332CONFIG_X86_PAT=y
313CONFIG_EFI=y 333CONFIG_EFI=y
314CONFIG_SECCOMP=y 334CONFIG_SECCOMP=y
335# CONFIG_CC_STACKPROTECTOR is not set
315# CONFIG_HZ_100 is not set 336# CONFIG_HZ_100 is not set
316# CONFIG_HZ_250 is not set 337# CONFIG_HZ_250 is not set
317# CONFIG_HZ_300 is not set 338# CONFIG_HZ_300 is not set
@@ -322,8 +343,9 @@ CONFIG_KEXEC=y
322CONFIG_CRASH_DUMP=y 343CONFIG_CRASH_DUMP=y
323# CONFIG_KEXEC_JUMP is not set 344# CONFIG_KEXEC_JUMP is not set
324CONFIG_PHYSICAL_START=0x1000000 345CONFIG_PHYSICAL_START=0x1000000
325# CONFIG_RELOCATABLE is not set 346CONFIG_RELOCATABLE=y
326CONFIG_PHYSICAL_ALIGN=0x200000 347CONFIG_X86_NEED_RELOCS=y
348CONFIG_PHYSICAL_ALIGN=0x1000000
327CONFIG_HOTPLUG_CPU=y 349CONFIG_HOTPLUG_CPU=y
328# CONFIG_COMPAT_VDSO is not set 350# CONFIG_COMPAT_VDSO is not set
329# CONFIG_CMDLINE_BOOL is not set 351# CONFIG_CMDLINE_BOOL is not set
@@ -363,7 +385,6 @@ CONFIG_ACPI_THERMAL=y
363CONFIG_ACPI_BLACKLIST_YEAR=0 385CONFIG_ACPI_BLACKLIST_YEAR=0
364# CONFIG_ACPI_DEBUG is not set 386# CONFIG_ACPI_DEBUG is not set
365# CONFIG_ACPI_PCI_SLOT is not set 387# CONFIG_ACPI_PCI_SLOT is not set
366CONFIG_ACPI_SYSTEM=y
367CONFIG_X86_PM_TIMER=y 388CONFIG_X86_PM_TIMER=y
368CONFIG_ACPI_CONTAINER=y 389CONFIG_ACPI_CONTAINER=y
369# CONFIG_ACPI_SBS is not set 390# CONFIG_ACPI_SBS is not set
@@ -425,6 +446,7 @@ CONFIG_PCI_BIOS=y
425CONFIG_PCI_DIRECT=y 446CONFIG_PCI_DIRECT=y
426CONFIG_PCI_MMCONFIG=y 447CONFIG_PCI_MMCONFIG=y
427CONFIG_PCI_DOMAINS=y 448CONFIG_PCI_DOMAINS=y
449# CONFIG_DMAR is not set
428CONFIG_PCIEPORTBUS=y 450CONFIG_PCIEPORTBUS=y
429# CONFIG_HOTPLUG_PCI_PCIE is not set 451# CONFIG_HOTPLUG_PCI_PCIE is not set
430CONFIG_PCIEAER=y 452CONFIG_PCIEAER=y
@@ -435,6 +457,7 @@ CONFIG_PCI_MSI=y
435# CONFIG_PCI_DEBUG is not set 457# CONFIG_PCI_DEBUG is not set
436# CONFIG_PCI_STUB is not set 458# CONFIG_PCI_STUB is not set
437CONFIG_HT_IRQ=y 459CONFIG_HT_IRQ=y
460# CONFIG_PCI_IOV is not set
438CONFIG_ISA_DMA_API=y 461CONFIG_ISA_DMA_API=y
439# CONFIG_ISA is not set 462# CONFIG_ISA is not set
440# CONFIG_MCA is not set 463# CONFIG_MCA is not set
@@ -481,7 +504,6 @@ CONFIG_NET=y
481# 504#
482# Networking options 505# Networking options
483# 506#
484CONFIG_COMPAT_NET_DEV_OPS=y
485CONFIG_PACKET=y 507CONFIG_PACKET=y
486CONFIG_PACKET_MMAP=y 508CONFIG_PACKET_MMAP=y
487CONFIG_UNIX=y 509CONFIG_UNIX=y
@@ -639,6 +661,7 @@ CONFIG_LLC=y
639# CONFIG_LAPB is not set 661# CONFIG_LAPB is not set
640# CONFIG_ECONET is not set 662# CONFIG_ECONET is not set
641# CONFIG_WAN_ROUTER is not set 663# CONFIG_WAN_ROUTER is not set
664# CONFIG_PHONET is not set
642CONFIG_NET_SCHED=y 665CONFIG_NET_SCHED=y
643 666
644# 667#
@@ -696,6 +719,7 @@ CONFIG_NET_SCH_FIFO=y
696# 719#
697# CONFIG_NET_PKTGEN is not set 720# CONFIG_NET_PKTGEN is not set
698# CONFIG_NET_TCPPROBE is not set 721# CONFIG_NET_TCPPROBE is not set
722# CONFIG_NET_DROP_MONITOR is not set
699CONFIG_HAMRADIO=y 723CONFIG_HAMRADIO=y
700 724
701# 725#
@@ -706,12 +730,10 @@ CONFIG_HAMRADIO=y
706# CONFIG_IRDA is not set 730# CONFIG_IRDA is not set
707# CONFIG_BT is not set 731# CONFIG_BT is not set
708# CONFIG_AF_RXRPC is not set 732# CONFIG_AF_RXRPC is not set
709# CONFIG_PHONET is not set
710CONFIG_FIB_RULES=y 733CONFIG_FIB_RULES=y
711CONFIG_WIRELESS=y 734CONFIG_WIRELESS=y
712CONFIG_CFG80211=y 735CONFIG_CFG80211=y
713# CONFIG_CFG80211_REG_DEBUG is not set 736# CONFIG_CFG80211_REG_DEBUG is not set
714CONFIG_NL80211=y
715CONFIG_WIRELESS_OLD_REGULATORY=y 737CONFIG_WIRELESS_OLD_REGULATORY=y
716CONFIG_WIRELESS_EXT=y 738CONFIG_WIRELESS_EXT=y
717CONFIG_WIRELESS_EXT_SYSFS=y 739CONFIG_WIRELESS_EXT_SYSFS=y
@@ -789,6 +811,7 @@ CONFIG_MISC_DEVICES=y
789# CONFIG_ICS932S401 is not set 811# CONFIG_ICS932S401 is not set
790# CONFIG_ENCLOSURE_SERVICES is not set 812# CONFIG_ENCLOSURE_SERVICES is not set
791# CONFIG_HP_ILO is not set 813# CONFIG_HP_ILO is not set
814# CONFIG_ISL29003 is not set
792# CONFIG_C2PORT is not set 815# CONFIG_C2PORT is not set
793 816
794# 817#
@@ -842,6 +865,7 @@ CONFIG_SCSI_SPI_ATTRS=y
842# CONFIG_SCSI_LOWLEVEL is not set 865# CONFIG_SCSI_LOWLEVEL is not set
843# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set 866# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
844# CONFIG_SCSI_DH is not set 867# CONFIG_SCSI_DH is not set
868# CONFIG_SCSI_OSD_INITIATOR is not set
845CONFIG_ATA=y 869CONFIG_ATA=y
846# CONFIG_ATA_NONSTANDARD is not set 870# CONFIG_ATA_NONSTANDARD is not set
847CONFIG_ATA_ACPI=y 871CONFIG_ATA_ACPI=y
@@ -940,6 +964,7 @@ CONFIG_DM_ZERO=y
940CONFIG_MACINTOSH_DRIVERS=y 964CONFIG_MACINTOSH_DRIVERS=y
941CONFIG_MAC_EMUMOUSEBTN=y 965CONFIG_MAC_EMUMOUSEBTN=y
942CONFIG_NETDEVICES=y 966CONFIG_NETDEVICES=y
967CONFIG_COMPAT_NET_DEV_OPS=y
943# CONFIG_IFB is not set 968# CONFIG_IFB is not set
944# CONFIG_DUMMY is not set 969# CONFIG_DUMMY is not set
945# CONFIG_BONDING is not set 970# CONFIG_BONDING is not set
@@ -977,6 +1002,8 @@ CONFIG_MII=y
977CONFIG_NET_VENDOR_3COM=y 1002CONFIG_NET_VENDOR_3COM=y
978# CONFIG_VORTEX is not set 1003# CONFIG_VORTEX is not set
979# CONFIG_TYPHOON is not set 1004# CONFIG_TYPHOON is not set
1005# CONFIG_ETHOC is not set
1006# CONFIG_DNET is not set
980CONFIG_NET_TULIP=y 1007CONFIG_NET_TULIP=y
981# CONFIG_DE2104X is not set 1008# CONFIG_DE2104X is not set
982# CONFIG_TULIP is not set 1009# CONFIG_TULIP is not set
@@ -1026,6 +1053,7 @@ CONFIG_E1000=y
1026CONFIG_E1000E=y 1053CONFIG_E1000E=y
1027# CONFIG_IP1000 is not set 1054# CONFIG_IP1000 is not set
1028# CONFIG_IGB is not set 1055# CONFIG_IGB is not set
1056# CONFIG_IGBVF is not set
1029# CONFIG_NS83820 is not set 1057# CONFIG_NS83820 is not set
1030# CONFIG_HAMACHI is not set 1058# CONFIG_HAMACHI is not set
1031# CONFIG_YELLOWFIN is not set 1059# CONFIG_YELLOWFIN is not set
@@ -1040,6 +1068,7 @@ CONFIG_BNX2=y
1040# CONFIG_QLA3XXX is not set 1068# CONFIG_QLA3XXX is not set
1041# CONFIG_ATL1 is not set 1069# CONFIG_ATL1 is not set
1042# CONFIG_ATL1E is not set 1070# CONFIG_ATL1E is not set
1071# CONFIG_ATL1C is not set
1043# CONFIG_JME is not set 1072# CONFIG_JME is not set
1044CONFIG_NETDEV_10000=y 1073CONFIG_NETDEV_10000=y
1045# CONFIG_CHELSIO_T1 is not set 1074# CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1078,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1049# CONFIG_IXGBE is not set 1078# CONFIG_IXGBE is not set
1050# CONFIG_IXGB is not set 1079# CONFIG_IXGB is not set
1051# CONFIG_S2IO is not set 1080# CONFIG_S2IO is not set
1081# CONFIG_VXGE is not set
1052# CONFIG_MYRI10GE is not set 1082# CONFIG_MYRI10GE is not set
1053# CONFIG_NETXEN_NIC is not set 1083# CONFIG_NETXEN_NIC is not set
1054# CONFIG_NIU is not set 1084# CONFIG_NIU is not set
@@ -1058,6 +1088,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1058# CONFIG_BNX2X is not set 1088# CONFIG_BNX2X is not set
1059# CONFIG_QLGE is not set 1089# CONFIG_QLGE is not set
1060# CONFIG_SFC is not set 1090# CONFIG_SFC is not set
1091# CONFIG_BE2NET is not set
1061CONFIG_TR=y 1092CONFIG_TR=y
1062# CONFIG_IBMOL is not set 1093# CONFIG_IBMOL is not set
1063# CONFIG_IBMLS is not set 1094# CONFIG_IBMLS is not set
@@ -1073,8 +1104,8 @@ CONFIG_WLAN_80211=y
1073# CONFIG_LIBERTAS is not set 1104# CONFIG_LIBERTAS is not set
1074# CONFIG_LIBERTAS_THINFIRM is not set 1105# CONFIG_LIBERTAS_THINFIRM is not set
1075# CONFIG_AIRO is not set 1106# CONFIG_AIRO is not set
1076# CONFIG_HERMES is not set
1077# CONFIG_ATMEL is not set 1107# CONFIG_ATMEL is not set
1108# CONFIG_AT76C50X_USB is not set
1078# CONFIG_AIRO_CS is not set 1109# CONFIG_AIRO_CS is not set
1079# CONFIG_PCMCIA_WL3501 is not set 1110# CONFIG_PCMCIA_WL3501 is not set
1080# CONFIG_PRISM54 is not set 1111# CONFIG_PRISM54 is not set
@@ -1084,21 +1115,21 @@ CONFIG_WLAN_80211=y
1084# CONFIG_RTL8187 is not set 1115# CONFIG_RTL8187 is not set
1085# CONFIG_ADM8211 is not set 1116# CONFIG_ADM8211 is not set
1086# CONFIG_MAC80211_HWSIM is not set 1117# CONFIG_MAC80211_HWSIM is not set
1118# CONFIG_MWL8K is not set
1087# CONFIG_P54_COMMON is not set 1119# CONFIG_P54_COMMON is not set
1088CONFIG_ATH5K=y 1120CONFIG_ATH5K=y
1089# CONFIG_ATH5K_DEBUG is not set 1121# CONFIG_ATH5K_DEBUG is not set
1090# CONFIG_ATH9K is not set 1122# CONFIG_ATH9K is not set
1123# CONFIG_AR9170_USB is not set
1091# CONFIG_IPW2100 is not set 1124# CONFIG_IPW2100 is not set
1092# CONFIG_IPW2200 is not set 1125# CONFIG_IPW2200 is not set
1093# CONFIG_IWLCORE is not set 1126# CONFIG_IWLWIFI is not set
1094# CONFIG_IWLWIFI_LEDS is not set
1095# CONFIG_IWLAGN is not set
1096# CONFIG_IWL3945 is not set
1097# CONFIG_HOSTAP is not set 1127# CONFIG_HOSTAP is not set
1098# CONFIG_B43 is not set 1128# CONFIG_B43 is not set
1099# CONFIG_B43LEGACY is not set 1129# CONFIG_B43LEGACY is not set
1100# CONFIG_ZD1211RW is not set 1130# CONFIG_ZD1211RW is not set
1101# CONFIG_RT2X00 is not set 1131# CONFIG_RT2X00 is not set
1132# CONFIG_HERMES is not set
1102 1133
1103# 1134#
1104# Enable WiMAX (Networking options) to see the WiMAX drivers 1135# Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1209,6 +1240,8 @@ CONFIG_INPUT_TABLET=y
1209# CONFIG_TABLET_USB_KBTAB is not set 1240# CONFIG_TABLET_USB_KBTAB is not set
1210# CONFIG_TABLET_USB_WACOM is not set 1241# CONFIG_TABLET_USB_WACOM is not set
1211CONFIG_INPUT_TOUCHSCREEN=y 1242CONFIG_INPUT_TOUCHSCREEN=y
1243# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
1244# CONFIG_TOUCHSCREEN_AD7879 is not set
1212# CONFIG_TOUCHSCREEN_FUJITSU is not set 1245# CONFIG_TOUCHSCREEN_FUJITSU is not set
1213# CONFIG_TOUCHSCREEN_GUNZE is not set 1246# CONFIG_TOUCHSCREEN_GUNZE is not set
1214# CONFIG_TOUCHSCREEN_ELO is not set 1247# CONFIG_TOUCHSCREEN_ELO is not set
@@ -1303,6 +1336,7 @@ CONFIG_UNIX98_PTYS=y
1303# CONFIG_LEGACY_PTYS is not set 1336# CONFIG_LEGACY_PTYS is not set
1304# CONFIG_IPMI_HANDLER is not set 1337# CONFIG_IPMI_HANDLER is not set
1305CONFIG_HW_RANDOM=y 1338CONFIG_HW_RANDOM=y
1339# CONFIG_HW_RANDOM_TIMERIOMEM is not set
1306CONFIG_HW_RANDOM_INTEL=y 1340CONFIG_HW_RANDOM_INTEL=y
1307CONFIG_HW_RANDOM_AMD=y 1341CONFIG_HW_RANDOM_AMD=y
1308CONFIG_HW_RANDOM_GEODE=y 1342CONFIG_HW_RANDOM_GEODE=y
@@ -1390,7 +1424,6 @@ CONFIG_I2C_I801=y
1390# CONFIG_SENSORS_PCF8574 is not set 1424# CONFIG_SENSORS_PCF8574 is not set
1391# CONFIG_PCF8575 is not set 1425# CONFIG_PCF8575 is not set
1392# CONFIG_SENSORS_PCA9539 is not set 1426# CONFIG_SENSORS_PCA9539 is not set
1393# CONFIG_SENSORS_PCF8591 is not set
1394# CONFIG_SENSORS_MAX6875 is not set 1427# CONFIG_SENSORS_MAX6875 is not set
1395# CONFIG_SENSORS_TSL2550 is not set 1428# CONFIG_SENSORS_TSL2550 is not set
1396# CONFIG_I2C_DEBUG_CORE is not set 1429# CONFIG_I2C_DEBUG_CORE is not set
@@ -1424,6 +1457,7 @@ CONFIG_HWMON=y
1424# CONFIG_SENSORS_ADT7475 is not set 1457# CONFIG_SENSORS_ADT7475 is not set
1425# CONFIG_SENSORS_K8TEMP is not set 1458# CONFIG_SENSORS_K8TEMP is not set
1426# CONFIG_SENSORS_ASB100 is not set 1459# CONFIG_SENSORS_ASB100 is not set
1460# CONFIG_SENSORS_ATK0110 is not set
1427# CONFIG_SENSORS_ATXP1 is not set 1461# CONFIG_SENSORS_ATXP1 is not set
1428# CONFIG_SENSORS_DS1621 is not set 1462# CONFIG_SENSORS_DS1621 is not set
1429# CONFIG_SENSORS_I5K_AMB is not set 1463# CONFIG_SENSORS_I5K_AMB is not set
@@ -1433,6 +1467,7 @@ CONFIG_HWMON=y
1433# CONFIG_SENSORS_FSCHER is not set 1467# CONFIG_SENSORS_FSCHER is not set
1434# CONFIG_SENSORS_FSCPOS is not set 1468# CONFIG_SENSORS_FSCPOS is not set
1435# CONFIG_SENSORS_FSCHMD is not set 1469# CONFIG_SENSORS_FSCHMD is not set
1470# CONFIG_SENSORS_G760A is not set
1436# CONFIG_SENSORS_GL518SM is not set 1471# CONFIG_SENSORS_GL518SM is not set
1437# CONFIG_SENSORS_GL520SM is not set 1472# CONFIG_SENSORS_GL520SM is not set
1438# CONFIG_SENSORS_CORETEMP is not set 1473# CONFIG_SENSORS_CORETEMP is not set
@@ -1448,11 +1483,14 @@ CONFIG_HWMON=y
1448# CONFIG_SENSORS_LM90 is not set 1483# CONFIG_SENSORS_LM90 is not set
1449# CONFIG_SENSORS_LM92 is not set 1484# CONFIG_SENSORS_LM92 is not set
1450# CONFIG_SENSORS_LM93 is not set 1485# CONFIG_SENSORS_LM93 is not set
1486# CONFIG_SENSORS_LTC4215 is not set
1451# CONFIG_SENSORS_LTC4245 is not set 1487# CONFIG_SENSORS_LTC4245 is not set
1488# CONFIG_SENSORS_LM95241 is not set
1452# CONFIG_SENSORS_MAX1619 is not set 1489# CONFIG_SENSORS_MAX1619 is not set
1453# CONFIG_SENSORS_MAX6650 is not set 1490# CONFIG_SENSORS_MAX6650 is not set
1454# CONFIG_SENSORS_PC87360 is not set 1491# CONFIG_SENSORS_PC87360 is not set
1455# CONFIG_SENSORS_PC87427 is not set 1492# CONFIG_SENSORS_PC87427 is not set
1493# CONFIG_SENSORS_PCF8591 is not set
1456# CONFIG_SENSORS_SIS5595 is not set 1494# CONFIG_SENSORS_SIS5595 is not set
1457# CONFIG_SENSORS_DME1737 is not set 1495# CONFIG_SENSORS_DME1737 is not set
1458# CONFIG_SENSORS_SMSC47M1 is not set 1496# CONFIG_SENSORS_SMSC47M1 is not set
@@ -1643,7 +1681,6 @@ CONFIG_FB_EFI=y
1643# CONFIG_FB_3DFX is not set 1681# CONFIG_FB_3DFX is not set
1644# CONFIG_FB_VOODOO1 is not set 1682# CONFIG_FB_VOODOO1 is not set
1645# CONFIG_FB_VT8623 is not set 1683# CONFIG_FB_VT8623 is not set
1646# CONFIG_FB_CYBLA is not set
1647# CONFIG_FB_TRIDENT is not set 1684# CONFIG_FB_TRIDENT is not set
1648# CONFIG_FB_ARK is not set 1685# CONFIG_FB_ARK is not set
1649# CONFIG_FB_PM3 is not set 1686# CONFIG_FB_PM3 is not set
@@ -1652,6 +1689,7 @@ CONFIG_FB_EFI=y
1652# CONFIG_FB_VIRTUAL is not set 1689# CONFIG_FB_VIRTUAL is not set
1653# CONFIG_FB_METRONOME is not set 1690# CONFIG_FB_METRONOME is not set
1654# CONFIG_FB_MB862XX is not set 1691# CONFIG_FB_MB862XX is not set
1692# CONFIG_FB_BROADSHEET is not set
1655CONFIG_BACKLIGHT_LCD_SUPPORT=y 1693CONFIG_BACKLIGHT_LCD_SUPPORT=y
1656# CONFIG_LCD_CLASS_DEVICE is not set 1694# CONFIG_LCD_CLASS_DEVICE is not set
1657CONFIG_BACKLIGHT_CLASS_DEVICE=y 1695CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1738,6 +1776,8 @@ CONFIG_SND_PCI=y
1738# CONFIG_SND_INDIGO is not set 1776# CONFIG_SND_INDIGO is not set
1739# CONFIG_SND_INDIGOIO is not set 1777# CONFIG_SND_INDIGOIO is not set
1740# CONFIG_SND_INDIGODJ is not set 1778# CONFIG_SND_INDIGODJ is not set
1779# CONFIG_SND_INDIGOIOX is not set
1780# CONFIG_SND_INDIGODJX is not set
1741# CONFIG_SND_EMU10K1 is not set 1781# CONFIG_SND_EMU10K1 is not set
1742# CONFIG_SND_EMU10K1X is not set 1782# CONFIG_SND_EMU10K1X is not set
1743# CONFIG_SND_ENS1370 is not set 1783# CONFIG_SND_ENS1370 is not set
@@ -1811,15 +1851,17 @@ CONFIG_USB_HIDDEV=y
1811# 1851#
1812# Special HID drivers 1852# Special HID drivers
1813# 1853#
1814CONFIG_HID_COMPAT=y
1815CONFIG_HID_A4TECH=y 1854CONFIG_HID_A4TECH=y
1816CONFIG_HID_APPLE=y 1855CONFIG_HID_APPLE=y
1817CONFIG_HID_BELKIN=y 1856CONFIG_HID_BELKIN=y
1818CONFIG_HID_CHERRY=y 1857CONFIG_HID_CHERRY=y
1819CONFIG_HID_CHICONY=y 1858CONFIG_HID_CHICONY=y
1820CONFIG_HID_CYPRESS=y 1859CONFIG_HID_CYPRESS=y
1860# CONFIG_DRAGONRISE_FF is not set
1821CONFIG_HID_EZKEY=y 1861CONFIG_HID_EZKEY=y
1862CONFIG_HID_KYE=y
1822CONFIG_HID_GYRATION=y 1863CONFIG_HID_GYRATION=y
1864CONFIG_HID_KENSINGTON=y
1823CONFIG_HID_LOGITECH=y 1865CONFIG_HID_LOGITECH=y
1824CONFIG_LOGITECH_FF=y 1866CONFIG_LOGITECH_FF=y
1825# CONFIG_LOGIRUMBLEPAD2_FF is not set 1867# CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1885,11 +1927,11 @@ CONFIG_USB_PRINTER=y
1885# CONFIG_USB_TMC is not set 1927# CONFIG_USB_TMC is not set
1886 1928
1887# 1929#
1888# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; 1930# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
1889# 1931#
1890 1932
1891# 1933#
1892# see USB_STORAGE Help for more information 1934# also be needed; see USB_STORAGE Help for more info
1893# 1935#
1894CONFIG_USB_STORAGE=y 1936CONFIG_USB_STORAGE=y
1895# CONFIG_USB_STORAGE_DEBUG is not set 1937# CONFIG_USB_STORAGE_DEBUG is not set
@@ -1931,7 +1973,6 @@ CONFIG_USB_LIBUSUAL=y
1931# CONFIG_USB_LED is not set 1973# CONFIG_USB_LED is not set
1932# CONFIG_USB_CYPRESS_CY7C63 is not set 1974# CONFIG_USB_CYPRESS_CY7C63 is not set
1933# CONFIG_USB_CYTHERM is not set 1975# CONFIG_USB_CYTHERM is not set
1934# CONFIG_USB_PHIDGET is not set
1935# CONFIG_USB_IDMOUSE is not set 1976# CONFIG_USB_IDMOUSE is not set
1936# CONFIG_USB_FTDI_ELAN is not set 1977# CONFIG_USB_FTDI_ELAN is not set
1937# CONFIG_USB_APPLEDISPLAY is not set 1978# CONFIG_USB_APPLEDISPLAY is not set
@@ -1947,6 +1988,7 @@ CONFIG_USB_LIBUSUAL=y
1947# 1988#
1948# OTG and related infrastructure 1989# OTG and related infrastructure
1949# 1990#
1991# CONFIG_NOP_USB_XCEIV is not set
1950# CONFIG_UWB is not set 1992# CONFIG_UWB is not set
1951# CONFIG_MMC is not set 1993# CONFIG_MMC is not set
1952# CONFIG_MEMSTICK is not set 1994# CONFIG_MEMSTICK is not set
@@ -1958,8 +2000,10 @@ CONFIG_LEDS_CLASS=y
1958# 2000#
1959# CONFIG_LEDS_ALIX2 is not set 2001# CONFIG_LEDS_ALIX2 is not set
1960# CONFIG_LEDS_PCA9532 is not set 2002# CONFIG_LEDS_PCA9532 is not set
2003# CONFIG_LEDS_LP5521 is not set
1961# CONFIG_LEDS_CLEVO_MAIL is not set 2004# CONFIG_LEDS_CLEVO_MAIL is not set
1962# CONFIG_LEDS_PCA955X is not set 2005# CONFIG_LEDS_PCA955X is not set
2006# CONFIG_LEDS_BD2802 is not set
1963 2007
1964# 2008#
1965# LED Triggers 2009# LED Triggers
@@ -1969,6 +2013,10 @@ CONFIG_LEDS_TRIGGERS=y
1969# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set 2013# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1970# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set 2014# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
1971# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set 2015# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
2016
2017#
2018# iptables trigger is under Netfilter config (LED target)
2019#
1972# CONFIG_ACCESSIBILITY is not set 2020# CONFIG_ACCESSIBILITY is not set
1973# CONFIG_INFINIBAND is not set 2021# CONFIG_INFINIBAND is not set
1974CONFIG_EDAC=y 2022CONFIG_EDAC=y
@@ -2037,6 +2085,7 @@ CONFIG_DMADEVICES=y
2037# DMA Devices 2085# DMA Devices
2038# 2086#
2039# CONFIG_INTEL_IOATDMA is not set 2087# CONFIG_INTEL_IOATDMA is not set
2088# CONFIG_AUXDISPLAY is not set
2040# CONFIG_UIO is not set 2089# CONFIG_UIO is not set
2041# CONFIG_STAGING is not set 2090# CONFIG_STAGING is not set
2042CONFIG_X86_PLATFORM_DEVICES=y 2091CONFIG_X86_PLATFORM_DEVICES=y
@@ -2071,6 +2120,7 @@ CONFIG_DMIID=y
2071# 2120#
2072# CONFIG_EXT2_FS is not set 2121# CONFIG_EXT2_FS is not set
2073CONFIG_EXT3_FS=y 2122CONFIG_EXT3_FS=y
2123# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
2074CONFIG_EXT3_FS_XATTR=y 2124CONFIG_EXT3_FS_XATTR=y
2075CONFIG_EXT3_FS_POSIX_ACL=y 2125CONFIG_EXT3_FS_POSIX_ACL=y
2076CONFIG_EXT3_FS_SECURITY=y 2126CONFIG_EXT3_FS_SECURITY=y
@@ -2101,6 +2151,11 @@ CONFIG_AUTOFS4_FS=y
2101CONFIG_GENERIC_ACL=y 2151CONFIG_GENERIC_ACL=y
2102 2152
2103# 2153#
2154# Caches
2155#
2156# CONFIG_FSCACHE is not set
2157
2158#
2104# CD-ROM/DVD Filesystems 2159# CD-ROM/DVD Filesystems
2105# 2160#
2106CONFIG_ISO9660_FS=y 2161CONFIG_ISO9660_FS=y
@@ -2151,6 +2206,7 @@ CONFIG_MISC_FILESYSTEMS=y
2151# CONFIG_ROMFS_FS is not set 2206# CONFIG_ROMFS_FS is not set
2152# CONFIG_SYSV_FS is not set 2207# CONFIG_SYSV_FS is not set
2153# CONFIG_UFS_FS is not set 2208# CONFIG_UFS_FS is not set
2209# CONFIG_NILFS2_FS is not set
2154CONFIG_NETWORK_FILESYSTEMS=y 2210CONFIG_NETWORK_FILESYSTEMS=y
2155CONFIG_NFS_FS=y 2211CONFIG_NFS_FS=y
2156CONFIG_NFS_V3=y 2212CONFIG_NFS_V3=y
@@ -2164,7 +2220,6 @@ CONFIG_NFS_ACL_SUPPORT=y
2164CONFIG_NFS_COMMON=y 2220CONFIG_NFS_COMMON=y
2165CONFIG_SUNRPC=y 2221CONFIG_SUNRPC=y
2166CONFIG_SUNRPC_GSS=y 2222CONFIG_SUNRPC_GSS=y
2167# CONFIG_SUNRPC_REGISTER_V4 is not set
2168CONFIG_RPCSEC_GSS_KRB5=y 2223CONFIG_RPCSEC_GSS_KRB5=y
2169# CONFIG_RPCSEC_GSS_SPKM3 is not set 2224# CONFIG_RPCSEC_GSS_SPKM3 is not set
2170# CONFIG_SMB_FS is not set 2225# CONFIG_SMB_FS is not set
@@ -2251,6 +2306,7 @@ CONFIG_DEBUG_FS=y
2251CONFIG_DEBUG_KERNEL=y 2306CONFIG_DEBUG_KERNEL=y
2252# CONFIG_DEBUG_SHIRQ is not set 2307# CONFIG_DEBUG_SHIRQ is not set
2253# CONFIG_DETECT_SOFTLOCKUP is not set 2308# CONFIG_DETECT_SOFTLOCKUP is not set
2309# CONFIG_DETECT_HUNG_TASK is not set
2254# CONFIG_SCHED_DEBUG is not set 2310# CONFIG_SCHED_DEBUG is not set
2255CONFIG_SCHEDSTATS=y 2311CONFIG_SCHEDSTATS=y
2256CONFIG_TIMER_STATS=y 2312CONFIG_TIMER_STATS=y
@@ -2266,6 +2322,7 @@ CONFIG_TIMER_STATS=y
2266# CONFIG_LOCK_STAT is not set 2322# CONFIG_LOCK_STAT is not set
2267# CONFIG_DEBUG_SPINLOCK_SLEEP is not set 2323# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
2268# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set 2324# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
2325CONFIG_STACKTRACE=y
2269# CONFIG_DEBUG_KOBJECT is not set 2326# CONFIG_DEBUG_KOBJECT is not set
2270# CONFIG_DEBUG_HIGHMEM is not set 2327# CONFIG_DEBUG_HIGHMEM is not set
2271CONFIG_DEBUG_BUGVERBOSE=y 2328CONFIG_DEBUG_BUGVERBOSE=y
@@ -2289,13 +2346,19 @@ CONFIG_FRAME_POINTER=y
2289# CONFIG_FAULT_INJECTION is not set 2346# CONFIG_FAULT_INJECTION is not set
2290# CONFIG_LATENCYTOP is not set 2347# CONFIG_LATENCYTOP is not set
2291CONFIG_SYSCTL_SYSCALL_CHECK=y 2348CONFIG_SYSCTL_SYSCALL_CHECK=y
2349# CONFIG_DEBUG_PAGEALLOC is not set
2292CONFIG_USER_STACKTRACE_SUPPORT=y 2350CONFIG_USER_STACKTRACE_SUPPORT=y
2351CONFIG_NOP_TRACER=y
2293CONFIG_HAVE_FUNCTION_TRACER=y 2352CONFIG_HAVE_FUNCTION_TRACER=y
2294CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y 2353CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
2295CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y 2354CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2296CONFIG_HAVE_DYNAMIC_FTRACE=y 2355CONFIG_HAVE_DYNAMIC_FTRACE=y
2297CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2356CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2298CONFIG_HAVE_HW_BRANCH_TRACER=y 2357CONFIG_HAVE_HW_BRANCH_TRACER=y
2358CONFIG_HAVE_FTRACE_SYSCALLS=y
2359CONFIG_RING_BUFFER=y
2360CONFIG_TRACING=y
2361CONFIG_TRACING_SUPPORT=y
2299 2362
2300# 2363#
2301# Tracers 2364# Tracers
@@ -2305,13 +2368,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
2305# CONFIG_SYSPROF_TRACER is not set 2368# CONFIG_SYSPROF_TRACER is not set
2306# CONFIG_SCHED_TRACER is not set 2369# CONFIG_SCHED_TRACER is not set
2307# CONFIG_CONTEXT_SWITCH_TRACER is not set 2370# CONFIG_CONTEXT_SWITCH_TRACER is not set
2371# CONFIG_EVENT_TRACER is not set
2372# CONFIG_FTRACE_SYSCALLS is not set
2308# CONFIG_BOOT_TRACER is not set 2373# CONFIG_BOOT_TRACER is not set
2309# CONFIG_TRACE_BRANCH_PROFILING is not set 2374# CONFIG_TRACE_BRANCH_PROFILING is not set
2310# CONFIG_POWER_TRACER is not set 2375# CONFIG_POWER_TRACER is not set
2311# CONFIG_STACK_TRACER is not set 2376# CONFIG_STACK_TRACER is not set
2312# CONFIG_HW_BRANCH_TRACER is not set 2377# CONFIG_HW_BRANCH_TRACER is not set
2378# CONFIG_KMEMTRACE is not set
2379# CONFIG_WORKQUEUE_TRACER is not set
2380CONFIG_BLK_DEV_IO_TRACE=y
2381# CONFIG_FTRACE_STARTUP_TEST is not set
2382# CONFIG_MMIOTRACE is not set
2313CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2383CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2314# CONFIG_DYNAMIC_PRINTK_DEBUG is not set 2384# CONFIG_DYNAMIC_DEBUG is not set
2385# CONFIG_DMA_API_DEBUG is not set
2315# CONFIG_SAMPLES is not set 2386# CONFIG_SAMPLES is not set
2316CONFIG_HAVE_ARCH_KGDB=y 2387CONFIG_HAVE_ARCH_KGDB=y
2317# CONFIG_KGDB is not set 2388# CONFIG_KGDB is not set
@@ -2321,7 +2392,6 @@ CONFIG_EARLY_PRINTK=y
2321CONFIG_EARLY_PRINTK_DBGP=y 2392CONFIG_EARLY_PRINTK_DBGP=y
2322CONFIG_DEBUG_STACKOVERFLOW=y 2393CONFIG_DEBUG_STACKOVERFLOW=y
2323CONFIG_DEBUG_STACK_USAGE=y 2394CONFIG_DEBUG_STACK_USAGE=y
2324# CONFIG_DEBUG_PAGEALLOC is not set
2325# CONFIG_DEBUG_PER_CPU_MAPS is not set 2395# CONFIG_DEBUG_PER_CPU_MAPS is not set
2326# CONFIG_X86_PTDUMP is not set 2396# CONFIG_X86_PTDUMP is not set
2327CONFIG_DEBUG_RODATA=y 2397CONFIG_DEBUG_RODATA=y
@@ -2329,7 +2399,7 @@ CONFIG_DEBUG_RODATA=y
2329CONFIG_DEBUG_NX_TEST=m 2399CONFIG_DEBUG_NX_TEST=m
2330# CONFIG_4KSTACKS is not set 2400# CONFIG_4KSTACKS is not set
2331CONFIG_DOUBLEFAULT=y 2401CONFIG_DOUBLEFAULT=y
2332# CONFIG_MMIOTRACE is not set 2402CONFIG_HAVE_MMIOTRACE_SUPPORT=y
2333CONFIG_IO_DELAY_TYPE_0X80=0 2403CONFIG_IO_DELAY_TYPE_0X80=0
2334CONFIG_IO_DELAY_TYPE_0XED=1 2404CONFIG_IO_DELAY_TYPE_0XED=1
2335CONFIG_IO_DELAY_TYPE_UDELAY=2 2405CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2365,6 +2435,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
2365CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 2435CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2366# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set 2436# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2367# CONFIG_SECURITY_SMACK is not set 2437# CONFIG_SECURITY_SMACK is not set
2438# CONFIG_SECURITY_TOMOYO is not set
2439# CONFIG_IMA is not set
2368CONFIG_CRYPTO=y 2440CONFIG_CRYPTO=y
2369 2441
2370# 2442#
@@ -2380,10 +2452,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
2380CONFIG_CRYPTO_HASH=y 2452CONFIG_CRYPTO_HASH=y
2381CONFIG_CRYPTO_HASH2=y 2453CONFIG_CRYPTO_HASH2=y
2382CONFIG_CRYPTO_RNG2=y 2454CONFIG_CRYPTO_RNG2=y
2455CONFIG_CRYPTO_PCOMP=y
2383CONFIG_CRYPTO_MANAGER=y 2456CONFIG_CRYPTO_MANAGER=y
2384CONFIG_CRYPTO_MANAGER2=y 2457CONFIG_CRYPTO_MANAGER2=y
2385# CONFIG_CRYPTO_GF128MUL is not set 2458# CONFIG_CRYPTO_GF128MUL is not set
2386# CONFIG_CRYPTO_NULL is not set 2459# CONFIG_CRYPTO_NULL is not set
2460CONFIG_CRYPTO_WORKQUEUE=y
2387# CONFIG_CRYPTO_CRYPTD is not set 2461# CONFIG_CRYPTO_CRYPTD is not set
2388CONFIG_CRYPTO_AUTHENC=y 2462CONFIG_CRYPTO_AUTHENC=y
2389# CONFIG_CRYPTO_TEST is not set 2463# CONFIG_CRYPTO_TEST is not set
@@ -2456,6 +2530,7 @@ CONFIG_CRYPTO_DES=y
2456# Compression 2530# Compression
2457# 2531#
2458# CONFIG_CRYPTO_DEFLATE is not set 2532# CONFIG_CRYPTO_DEFLATE is not set
2533# CONFIG_CRYPTO_ZLIB is not set
2459# CONFIG_CRYPTO_LZO is not set 2534# CONFIG_CRYPTO_LZO is not set
2460 2535
2461# 2536#
@@ -2467,11 +2542,13 @@ CONFIG_CRYPTO_HW=y
2467# CONFIG_CRYPTO_DEV_GEODE is not set 2542# CONFIG_CRYPTO_DEV_GEODE is not set
2468# CONFIG_CRYPTO_DEV_HIFN_795X is not set 2543# CONFIG_CRYPTO_DEV_HIFN_795X is not set
2469CONFIG_HAVE_KVM=y 2544CONFIG_HAVE_KVM=y
2545CONFIG_HAVE_KVM_IRQCHIP=y
2470CONFIG_VIRTUALIZATION=y 2546CONFIG_VIRTUALIZATION=y
2471# CONFIG_KVM is not set 2547# CONFIG_KVM is not set
2472# CONFIG_LGUEST is not set 2548# CONFIG_LGUEST is not set
2473# CONFIG_VIRTIO_PCI is not set 2549# CONFIG_VIRTIO_PCI is not set
2474# CONFIG_VIRTIO_BALLOON is not set 2550# CONFIG_VIRTIO_BALLOON is not set
2551CONFIG_BINARY_PRINTF=y
2475 2552
2476# 2553#
2477# Library routines 2554# Library routines
@@ -2489,7 +2566,10 @@ CONFIG_CRC32=y
2489# CONFIG_LIBCRC32C is not set 2566# CONFIG_LIBCRC32C is not set
2490CONFIG_AUDIT_GENERIC=y 2567CONFIG_AUDIT_GENERIC=y
2491CONFIG_ZLIB_INFLATE=y 2568CONFIG_ZLIB_INFLATE=y
2492CONFIG_PLIST=y 2569CONFIG_DECOMPRESS_GZIP=y
2570CONFIG_DECOMPRESS_BZIP2=y
2571CONFIG_DECOMPRESS_LZMA=y
2493CONFIG_HAS_IOMEM=y 2572CONFIG_HAS_IOMEM=y
2494CONFIG_HAS_IOPORT=y 2573CONFIG_HAS_IOPORT=y
2495CONFIG_HAS_DMA=y 2574CONFIG_HAS_DMA=y
2575CONFIG_NLATTR=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 9fe5d212ab4c..cee1dd2e69b2 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,12 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.29-rc4 3# Linux kernel version: 2.6.30-rc2
4# Tue Feb 24 15:44:16 2009 4# Mon May 11 16:22:00 2009
5# 5#
6CONFIG_64BIT=y 6CONFIG_64BIT=y
7# CONFIG_X86_32 is not set 7# CONFIG_X86_32 is not set
8CONFIG_X86_64=y 8CONFIG_X86_64=y
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_OUTPUT_FORMAT="elf64-x86-64"
10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" 11CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
11CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
12CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
@@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
34CONFIG_ARCH_HAS_DEFAULT_IDLE=y 35CONFIG_ARCH_HAS_DEFAULT_IDLE=y
35CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y 36CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
36CONFIG_HAVE_SETUP_PER_CPU_AREA=y 37CONFIG_HAVE_SETUP_PER_CPU_AREA=y
38CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
37CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y 39CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
38CONFIG_ARCH_HIBERNATION_POSSIBLE=y 40CONFIG_ARCH_HIBERNATION_POSSIBLE=y
39CONFIG_ARCH_SUSPEND_POSSIBLE=y 41CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -41,14 +43,14 @@ CONFIG_ZONE_DMA32=y
41CONFIG_ARCH_POPULATES_NODE_MAP=y 43CONFIG_ARCH_POPULATES_NODE_MAP=y
42CONFIG_AUDIT_ARCH=y 44CONFIG_AUDIT_ARCH=y
43CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y 45CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
46CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
44CONFIG_GENERIC_HARDIRQS=y 47CONFIG_GENERIC_HARDIRQS=y
48CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
45CONFIG_GENERIC_IRQ_PROBE=y 49CONFIG_GENERIC_IRQ_PROBE=y
46CONFIG_GENERIC_PENDING_IRQ=y 50CONFIG_GENERIC_PENDING_IRQ=y
47CONFIG_X86_SMP=y
48CONFIG_USE_GENERIC_SMP_HELPERS=y 51CONFIG_USE_GENERIC_SMP_HELPERS=y
49CONFIG_X86_64_SMP=y 52CONFIG_X86_64_SMP=y
50CONFIG_X86_HT=y 53CONFIG_X86_HT=y
51CONFIG_X86_BIOS_REBOOT=y
52CONFIG_X86_TRAMPOLINE=y 54CONFIG_X86_TRAMPOLINE=y
53# CONFIG_KTIME_SCALAR is not set 55# CONFIG_KTIME_SCALAR is not set
54CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
@@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
61CONFIG_INIT_ENV_ARG_LIMIT=32 63CONFIG_INIT_ENV_ARG_LIMIT=32
62CONFIG_LOCALVERSION="" 64CONFIG_LOCALVERSION=""
63# CONFIG_LOCALVERSION_AUTO is not set 65# CONFIG_LOCALVERSION_AUTO is not set
66CONFIG_HAVE_KERNEL_GZIP=y
67CONFIG_HAVE_KERNEL_BZIP2=y
68CONFIG_HAVE_KERNEL_LZMA=y
69CONFIG_KERNEL_GZIP=y
70# CONFIG_KERNEL_BZIP2 is not set
71# CONFIG_KERNEL_LZMA is not set
64CONFIG_SWAP=y 72CONFIG_SWAP=y
65CONFIG_SYSVIPC=y 73CONFIG_SYSVIPC=y
66CONFIG_SYSVIPC_SYSCTL=y 74CONFIG_SYSVIPC_SYSCTL=y
67CONFIG_POSIX_MQUEUE=y 75CONFIG_POSIX_MQUEUE=y
76CONFIG_POSIX_MQUEUE_SYSCTL=y
68CONFIG_BSD_PROCESS_ACCT=y 77CONFIG_BSD_PROCESS_ACCT=y
69# CONFIG_BSD_PROCESS_ACCT_V3 is not set 78# CONFIG_BSD_PROCESS_ACCT_V3 is not set
70CONFIG_TASKSTATS=y 79CONFIG_TASKSTATS=y
@@ -114,23 +123,26 @@ CONFIG_PID_NS=y
114CONFIG_NET_NS=y 123CONFIG_NET_NS=y
115CONFIG_BLK_DEV_INITRD=y 124CONFIG_BLK_DEV_INITRD=y
116CONFIG_INITRAMFS_SOURCE="" 125CONFIG_INITRAMFS_SOURCE=""
126CONFIG_RD_GZIP=y
127CONFIG_RD_BZIP2=y
128CONFIG_RD_LZMA=y
117CONFIG_CC_OPTIMIZE_FOR_SIZE=y 129CONFIG_CC_OPTIMIZE_FOR_SIZE=y
118CONFIG_SYSCTL=y 130CONFIG_SYSCTL=y
131CONFIG_ANON_INODES=y
119# CONFIG_EMBEDDED is not set 132# CONFIG_EMBEDDED is not set
120CONFIG_UID16=y 133CONFIG_UID16=y
121CONFIG_SYSCTL_SYSCALL=y 134CONFIG_SYSCTL_SYSCALL=y
122CONFIG_KALLSYMS=y 135CONFIG_KALLSYMS=y
123CONFIG_KALLSYMS_ALL=y 136CONFIG_KALLSYMS_ALL=y
124CONFIG_KALLSYMS_EXTRA_PASS=y 137CONFIG_KALLSYMS_EXTRA_PASS=y
138# CONFIG_STRIP_ASM_SYMS is not set
125CONFIG_HOTPLUG=y 139CONFIG_HOTPLUG=y
126CONFIG_PRINTK=y 140CONFIG_PRINTK=y
127CONFIG_BUG=y 141CONFIG_BUG=y
128CONFIG_ELF_CORE=y 142CONFIG_ELF_CORE=y
129CONFIG_PCSPKR_PLATFORM=y 143CONFIG_PCSPKR_PLATFORM=y
130# CONFIG_COMPAT_BRK is not set
131CONFIG_BASE_FULL=y 144CONFIG_BASE_FULL=y
132CONFIG_FUTEX=y 145CONFIG_FUTEX=y
133CONFIG_ANON_INODES=y
134CONFIG_EPOLL=y 146CONFIG_EPOLL=y
135CONFIG_SIGNALFD=y 147CONFIG_SIGNALFD=y
136CONFIG_TIMERFD=y 148CONFIG_TIMERFD=y
@@ -140,6 +152,7 @@ CONFIG_AIO=y
140CONFIG_VM_EVENT_COUNTERS=y 152CONFIG_VM_EVENT_COUNTERS=y
141CONFIG_PCI_QUIRKS=y 153CONFIG_PCI_QUIRKS=y
142CONFIG_SLUB_DEBUG=y 154CONFIG_SLUB_DEBUG=y
155# CONFIG_COMPAT_BRK is not set
143# CONFIG_SLAB is not set 156# CONFIG_SLAB is not set
144CONFIG_SLUB=y 157CONFIG_SLUB=y
145# CONFIG_SLOB is not set 158# CONFIG_SLOB is not set
@@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
155CONFIG_HAVE_KPROBES=y 168CONFIG_HAVE_KPROBES=y
156CONFIG_HAVE_KRETPROBES=y 169CONFIG_HAVE_KRETPROBES=y
157CONFIG_HAVE_ARCH_TRACEHOOK=y 170CONFIG_HAVE_ARCH_TRACEHOOK=y
171CONFIG_HAVE_DMA_API_DEBUG=y
172# CONFIG_SLOW_WORK is not set
158# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set 173# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
159CONFIG_SLABINFO=y 174CONFIG_SLABINFO=y
160CONFIG_RT_MUTEXES=y 175CONFIG_RT_MUTEXES=y
@@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
167# CONFIG_MODULE_SRCVERSION_ALL is not set 182# CONFIG_MODULE_SRCVERSION_ALL is not set
168CONFIG_STOP_MACHINE=y 183CONFIG_STOP_MACHINE=y
169CONFIG_BLOCK=y 184CONFIG_BLOCK=y
170CONFIG_BLK_DEV_IO_TRACE=y
171CONFIG_BLK_DEV_BSG=y 185CONFIG_BLK_DEV_BSG=y
172# CONFIG_BLK_DEV_INTEGRITY is not set 186# CONFIG_BLK_DEV_INTEGRITY is not set
173CONFIG_BLOCK_COMPAT=y 187CONFIG_BLOCK_COMPAT=y
@@ -195,12 +209,10 @@ CONFIG_HIGH_RES_TIMERS=y
195CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 209CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
196CONFIG_SMP=y 210CONFIG_SMP=y
197CONFIG_SPARSE_IRQ=y 211CONFIG_SPARSE_IRQ=y
198# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set
199CONFIG_X86_FIND_SMP_CONFIG=y
200CONFIG_X86_MPPARSE=y 212CONFIG_X86_MPPARSE=y
201# CONFIG_X86_ELAN is not set 213CONFIG_X86_EXTENDED_PLATFORM=y
202# CONFIG_X86_GENERICARCH is not set
203# CONFIG_X86_VSMP is not set 214# CONFIG_X86_VSMP is not set
215# CONFIG_X86_UV is not set
204CONFIG_SCHED_OMIT_FRAME_POINTER=y 216CONFIG_SCHED_OMIT_FRAME_POINTER=y
205# CONFIG_PARAVIRT_GUEST is not set 217# CONFIG_PARAVIRT_GUEST is not set
206# CONFIG_MEMTEST is not set 218# CONFIG_MEMTEST is not set
@@ -230,10 +242,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y
230# CONFIG_MCORE2 is not set 242# CONFIG_MCORE2 is not set
231CONFIG_GENERIC_CPU=y 243CONFIG_GENERIC_CPU=y
232CONFIG_X86_CPU=y 244CONFIG_X86_CPU=y
233CONFIG_X86_L1_CACHE_BYTES=128 245CONFIG_X86_L1_CACHE_BYTES=64
234CONFIG_X86_INTERNODE_CACHE_BYTES=128 246CONFIG_X86_INTERNODE_CACHE_BYTES=64
235CONFIG_X86_CMPXCHG=y 247CONFIG_X86_CMPXCHG=y
236CONFIG_X86_L1_CACHE_SHIFT=7 248CONFIG_X86_L1_CACHE_SHIFT=6
237CONFIG_X86_WP_WORKS_OK=y 249CONFIG_X86_WP_WORKS_OK=y
238CONFIG_X86_TSC=y 250CONFIG_X86_TSC=y
239CONFIG_X86_CMPXCHG64=y 251CONFIG_X86_CMPXCHG64=y
@@ -242,7 +254,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64
242CONFIG_X86_DEBUGCTLMSR=y 254CONFIG_X86_DEBUGCTLMSR=y
243CONFIG_CPU_SUP_INTEL=y 255CONFIG_CPU_SUP_INTEL=y
244CONFIG_CPU_SUP_AMD=y 256CONFIG_CPU_SUP_AMD=y
245CONFIG_CPU_SUP_CENTAUR_64=y 257CONFIG_CPU_SUP_CENTAUR=y
246CONFIG_X86_DS=y 258CONFIG_X86_DS=y
247CONFIG_X86_PTRACE_BTS=y 259CONFIG_X86_PTRACE_BTS=y
248CONFIG_HPET_TIMER=y 260CONFIG_HPET_TIMER=y
@@ -269,6 +281,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
269CONFIG_X86_MCE=y 281CONFIG_X86_MCE=y
270CONFIG_X86_MCE_INTEL=y 282CONFIG_X86_MCE_INTEL=y
271CONFIG_X86_MCE_AMD=y 283CONFIG_X86_MCE_AMD=y
284CONFIG_X86_MCE_THRESHOLD=y
272# CONFIG_I8K is not set 285# CONFIG_I8K is not set
273CONFIG_MICROCODE=y 286CONFIG_MICROCODE=y
274CONFIG_MICROCODE_INTEL=y 287CONFIG_MICROCODE_INTEL=y
@@ -276,6 +289,7 @@ CONFIG_MICROCODE_AMD=y
276CONFIG_MICROCODE_OLD_INTERFACE=y 289CONFIG_MICROCODE_OLD_INTERFACE=y
277CONFIG_X86_MSR=y 290CONFIG_X86_MSR=y
278CONFIG_X86_CPUID=y 291CONFIG_X86_CPUID=y
292# CONFIG_X86_CPU_DEBUG is not set
279CONFIG_ARCH_PHYS_ADDR_T_64BIT=y 293CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
280CONFIG_DIRECT_GBPAGES=y 294CONFIG_DIRECT_GBPAGES=y
281CONFIG_NUMA=y 295CONFIG_NUMA=y
@@ -309,6 +323,8 @@ CONFIG_ZONE_DMA_FLAG=1
309CONFIG_BOUNCE=y 323CONFIG_BOUNCE=y
310CONFIG_VIRT_TO_BUS=y 324CONFIG_VIRT_TO_BUS=y
311CONFIG_UNEVICTABLE_LRU=y 325CONFIG_UNEVICTABLE_LRU=y
326CONFIG_HAVE_MLOCK=y
327CONFIG_HAVE_MLOCKED_PAGE_BIT=y
312CONFIG_X86_CHECK_BIOS_CORRUPTION=y 328CONFIG_X86_CHECK_BIOS_CORRUPTION=y
313CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y 329CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
314CONFIG_X86_RESERVE_LOW_64K=y 330CONFIG_X86_RESERVE_LOW_64K=y
@@ -317,6 +333,7 @@ CONFIG_MTRR=y
317CONFIG_X86_PAT=y 333CONFIG_X86_PAT=y
318CONFIG_EFI=y 334CONFIG_EFI=y
319CONFIG_SECCOMP=y 335CONFIG_SECCOMP=y
336# CONFIG_CC_STACKPROTECTOR is not set
320# CONFIG_HZ_100 is not set 337# CONFIG_HZ_100 is not set
321# CONFIG_HZ_250 is not set 338# CONFIG_HZ_250 is not set
322# CONFIG_HZ_300 is not set 339# CONFIG_HZ_300 is not set
@@ -325,9 +342,10 @@ CONFIG_HZ=1000
325CONFIG_SCHED_HRTICK=y 342CONFIG_SCHED_HRTICK=y
326CONFIG_KEXEC=y 343CONFIG_KEXEC=y
327CONFIG_CRASH_DUMP=y 344CONFIG_CRASH_DUMP=y
345# CONFIG_KEXEC_JUMP is not set
328CONFIG_PHYSICAL_START=0x1000000 346CONFIG_PHYSICAL_START=0x1000000
329# CONFIG_RELOCATABLE is not set 347CONFIG_RELOCATABLE=y
330CONFIG_PHYSICAL_ALIGN=0x200000 348CONFIG_PHYSICAL_ALIGN=0x1000000
331CONFIG_HOTPLUG_CPU=y 349CONFIG_HOTPLUG_CPU=y
332# CONFIG_COMPAT_VDSO is not set 350# CONFIG_COMPAT_VDSO is not set
333# CONFIG_CMDLINE_BOOL is not set 351# CONFIG_CMDLINE_BOOL is not set
@@ -370,7 +388,6 @@ CONFIG_ACPI_NUMA=y
370CONFIG_ACPI_BLACKLIST_YEAR=0 388CONFIG_ACPI_BLACKLIST_YEAR=0
371# CONFIG_ACPI_DEBUG is not set 389# CONFIG_ACPI_DEBUG is not set
372# CONFIG_ACPI_PCI_SLOT is not set 390# CONFIG_ACPI_PCI_SLOT is not set
373CONFIG_ACPI_SYSTEM=y
374CONFIG_X86_PM_TIMER=y 391CONFIG_X86_PM_TIMER=y
375CONFIG_ACPI_CONTAINER=y 392CONFIG_ACPI_CONTAINER=y
376# CONFIG_ACPI_SBS is not set 393# CONFIG_ACPI_SBS is not set
@@ -436,6 +453,7 @@ CONFIG_PCI_MSI=y
436# CONFIG_PCI_DEBUG is not set 453# CONFIG_PCI_DEBUG is not set
437# CONFIG_PCI_STUB is not set 454# CONFIG_PCI_STUB is not set
438CONFIG_HT_IRQ=y 455CONFIG_HT_IRQ=y
456# CONFIG_PCI_IOV is not set
439CONFIG_ISA_DMA_API=y 457CONFIG_ISA_DMA_API=y
440CONFIG_K8_NB=y 458CONFIG_K8_NB=y
441CONFIG_PCCARD=y 459CONFIG_PCCARD=y
@@ -481,7 +499,6 @@ CONFIG_NET=y
481# 499#
482# Networking options 500# Networking options
483# 501#
484CONFIG_COMPAT_NET_DEV_OPS=y
485CONFIG_PACKET=y 502CONFIG_PACKET=y
486CONFIG_PACKET_MMAP=y 503CONFIG_PACKET_MMAP=y
487CONFIG_UNIX=y 504CONFIG_UNIX=y
@@ -639,6 +656,7 @@ CONFIG_LLC=y
639# CONFIG_LAPB is not set 656# CONFIG_LAPB is not set
640# CONFIG_ECONET is not set 657# CONFIG_ECONET is not set
641# CONFIG_WAN_ROUTER is not set 658# CONFIG_WAN_ROUTER is not set
659# CONFIG_PHONET is not set
642CONFIG_NET_SCHED=y 660CONFIG_NET_SCHED=y
643 661
644# 662#
@@ -696,6 +714,7 @@ CONFIG_NET_SCH_FIFO=y
696# 714#
697# CONFIG_NET_PKTGEN is not set 715# CONFIG_NET_PKTGEN is not set
698# CONFIG_NET_TCPPROBE is not set 716# CONFIG_NET_TCPPROBE is not set
717# CONFIG_NET_DROP_MONITOR is not set
699CONFIG_HAMRADIO=y 718CONFIG_HAMRADIO=y
700 719
701# 720#
@@ -706,12 +725,10 @@ CONFIG_HAMRADIO=y
706# CONFIG_IRDA is not set 725# CONFIG_IRDA is not set
707# CONFIG_BT is not set 726# CONFIG_BT is not set
708# CONFIG_AF_RXRPC is not set 727# CONFIG_AF_RXRPC is not set
709# CONFIG_PHONET is not set
710CONFIG_FIB_RULES=y 728CONFIG_FIB_RULES=y
711CONFIG_WIRELESS=y 729CONFIG_WIRELESS=y
712CONFIG_CFG80211=y 730CONFIG_CFG80211=y
713# CONFIG_CFG80211_REG_DEBUG is not set 731# CONFIG_CFG80211_REG_DEBUG is not set
714CONFIG_NL80211=y
715CONFIG_WIRELESS_OLD_REGULATORY=y 732CONFIG_WIRELESS_OLD_REGULATORY=y
716CONFIG_WIRELESS_EXT=y 733CONFIG_WIRELESS_EXT=y
717CONFIG_WIRELESS_EXT_SYSFS=y 734CONFIG_WIRELESS_EXT_SYSFS=y
@@ -788,9 +805,8 @@ CONFIG_MISC_DEVICES=y
788# CONFIG_TIFM_CORE is not set 805# CONFIG_TIFM_CORE is not set
789# CONFIG_ICS932S401 is not set 806# CONFIG_ICS932S401 is not set
790# CONFIG_ENCLOSURE_SERVICES is not set 807# CONFIG_ENCLOSURE_SERVICES is not set
791# CONFIG_SGI_XP is not set
792# CONFIG_HP_ILO is not set 808# CONFIG_HP_ILO is not set
793# CONFIG_SGI_GRU is not set 809# CONFIG_ISL29003 is not set
794# CONFIG_C2PORT is not set 810# CONFIG_C2PORT is not set
795 811
796# 812#
@@ -844,6 +860,7 @@ CONFIG_SCSI_SPI_ATTRS=y
844# CONFIG_SCSI_LOWLEVEL is not set 860# CONFIG_SCSI_LOWLEVEL is not set
845# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set 861# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
846# CONFIG_SCSI_DH is not set 862# CONFIG_SCSI_DH is not set
863# CONFIG_SCSI_OSD_INITIATOR is not set
847CONFIG_ATA=y 864CONFIG_ATA=y
848# CONFIG_ATA_NONSTANDARD is not set 865# CONFIG_ATA_NONSTANDARD is not set
849CONFIG_ATA_ACPI=y 866CONFIG_ATA_ACPI=y
@@ -940,6 +957,7 @@ CONFIG_DM_ZERO=y
940CONFIG_MACINTOSH_DRIVERS=y 957CONFIG_MACINTOSH_DRIVERS=y
941CONFIG_MAC_EMUMOUSEBTN=y 958CONFIG_MAC_EMUMOUSEBTN=y
942CONFIG_NETDEVICES=y 959CONFIG_NETDEVICES=y
960CONFIG_COMPAT_NET_DEV_OPS=y
943# CONFIG_IFB is not set 961# CONFIG_IFB is not set
944# CONFIG_DUMMY is not set 962# CONFIG_DUMMY is not set
945# CONFIG_BONDING is not set 963# CONFIG_BONDING is not set
@@ -977,6 +995,8 @@ CONFIG_MII=y
977CONFIG_NET_VENDOR_3COM=y 995CONFIG_NET_VENDOR_3COM=y
978# CONFIG_VORTEX is not set 996# CONFIG_VORTEX is not set
979# CONFIG_TYPHOON is not set 997# CONFIG_TYPHOON is not set
998# CONFIG_ETHOC is not set
999# CONFIG_DNET is not set
980CONFIG_NET_TULIP=y 1000CONFIG_NET_TULIP=y
981# CONFIG_DE2104X is not set 1001# CONFIG_DE2104X is not set
982# CONFIG_TULIP is not set 1002# CONFIG_TULIP is not set
@@ -1026,6 +1046,7 @@ CONFIG_E1000=y
1026# CONFIG_E1000E is not set 1046# CONFIG_E1000E is not set
1027# CONFIG_IP1000 is not set 1047# CONFIG_IP1000 is not set
1028# CONFIG_IGB is not set 1048# CONFIG_IGB is not set
1049# CONFIG_IGBVF is not set
1029# CONFIG_NS83820 is not set 1050# CONFIG_NS83820 is not set
1030# CONFIG_HAMACHI is not set 1051# CONFIG_HAMACHI is not set
1031# CONFIG_YELLOWFIN is not set 1052# CONFIG_YELLOWFIN is not set
@@ -1040,6 +1061,7 @@ CONFIG_TIGON3=y
1040# CONFIG_QLA3XXX is not set 1061# CONFIG_QLA3XXX is not set
1041# CONFIG_ATL1 is not set 1062# CONFIG_ATL1 is not set
1042# CONFIG_ATL1E is not set 1063# CONFIG_ATL1E is not set
1064# CONFIG_ATL1C is not set
1043# CONFIG_JME is not set 1065# CONFIG_JME is not set
1044CONFIG_NETDEV_10000=y 1066CONFIG_NETDEV_10000=y
1045# CONFIG_CHELSIO_T1 is not set 1067# CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1071,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1049# CONFIG_IXGBE is not set 1071# CONFIG_IXGBE is not set
1050# CONFIG_IXGB is not set 1072# CONFIG_IXGB is not set
1051# CONFIG_S2IO is not set 1073# CONFIG_S2IO is not set
1074# CONFIG_VXGE is not set
1052# CONFIG_MYRI10GE is not set 1075# CONFIG_MYRI10GE is not set
1053# CONFIG_NETXEN_NIC is not set 1076# CONFIG_NETXEN_NIC is not set
1054# CONFIG_NIU is not set 1077# CONFIG_NIU is not set
@@ -1058,6 +1081,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1058# CONFIG_BNX2X is not set 1081# CONFIG_BNX2X is not set
1059# CONFIG_QLGE is not set 1082# CONFIG_QLGE is not set
1060# CONFIG_SFC is not set 1083# CONFIG_SFC is not set
1084# CONFIG_BE2NET is not set
1061CONFIG_TR=y 1085CONFIG_TR=y
1062# CONFIG_IBMOL is not set 1086# CONFIG_IBMOL is not set
1063# CONFIG_3C359 is not set 1087# CONFIG_3C359 is not set
@@ -1072,8 +1096,8 @@ CONFIG_WLAN_80211=y
1072# CONFIG_LIBERTAS is not set 1096# CONFIG_LIBERTAS is not set
1073# CONFIG_LIBERTAS_THINFIRM is not set 1097# CONFIG_LIBERTAS_THINFIRM is not set
1074# CONFIG_AIRO is not set 1098# CONFIG_AIRO is not set
1075# CONFIG_HERMES is not set
1076# CONFIG_ATMEL is not set 1099# CONFIG_ATMEL is not set
1100# CONFIG_AT76C50X_USB is not set
1077# CONFIG_AIRO_CS is not set 1101# CONFIG_AIRO_CS is not set
1078# CONFIG_PCMCIA_WL3501 is not set 1102# CONFIG_PCMCIA_WL3501 is not set
1079# CONFIG_PRISM54 is not set 1103# CONFIG_PRISM54 is not set
@@ -1083,21 +1107,21 @@ CONFIG_WLAN_80211=y
1083# CONFIG_RTL8187 is not set 1107# CONFIG_RTL8187 is not set
1084# CONFIG_ADM8211 is not set 1108# CONFIG_ADM8211 is not set
1085# CONFIG_MAC80211_HWSIM is not set 1109# CONFIG_MAC80211_HWSIM is not set
1110# CONFIG_MWL8K is not set
1086# CONFIG_P54_COMMON is not set 1111# CONFIG_P54_COMMON is not set
1087CONFIG_ATH5K=y 1112CONFIG_ATH5K=y
1088# CONFIG_ATH5K_DEBUG is not set 1113# CONFIG_ATH5K_DEBUG is not set
1089# CONFIG_ATH9K is not set 1114# CONFIG_ATH9K is not set
1115# CONFIG_AR9170_USB is not set
1090# CONFIG_IPW2100 is not set 1116# CONFIG_IPW2100 is not set
1091# CONFIG_IPW2200 is not set 1117# CONFIG_IPW2200 is not set
1092# CONFIG_IWLCORE is not set 1118# CONFIG_IWLWIFI is not set
1093# CONFIG_IWLWIFI_LEDS is not set
1094# CONFIG_IWLAGN is not set
1095# CONFIG_IWL3945 is not set
1096# CONFIG_HOSTAP is not set 1119# CONFIG_HOSTAP is not set
1097# CONFIG_B43 is not set 1120# CONFIG_B43 is not set
1098# CONFIG_B43LEGACY is not set 1121# CONFIG_B43LEGACY is not set
1099# CONFIG_ZD1211RW is not set 1122# CONFIG_ZD1211RW is not set
1100# CONFIG_RT2X00 is not set 1123# CONFIG_RT2X00 is not set
1124# CONFIG_HERMES is not set
1101 1125
1102# 1126#
1103# Enable WiMAX (Networking options) to see the WiMAX drivers 1127# Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1208,6 +1232,8 @@ CONFIG_INPUT_TABLET=y
1208# CONFIG_TABLET_USB_KBTAB is not set 1232# CONFIG_TABLET_USB_KBTAB is not set
1209# CONFIG_TABLET_USB_WACOM is not set 1233# CONFIG_TABLET_USB_WACOM is not set
1210CONFIG_INPUT_TOUCHSCREEN=y 1234CONFIG_INPUT_TOUCHSCREEN=y
1235# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
1236# CONFIG_TOUCHSCREEN_AD7879 is not set
1211# CONFIG_TOUCHSCREEN_FUJITSU is not set 1237# CONFIG_TOUCHSCREEN_FUJITSU is not set
1212# CONFIG_TOUCHSCREEN_GUNZE is not set 1238# CONFIG_TOUCHSCREEN_GUNZE is not set
1213# CONFIG_TOUCHSCREEN_ELO is not set 1239# CONFIG_TOUCHSCREEN_ELO is not set
@@ -1301,6 +1327,7 @@ CONFIG_UNIX98_PTYS=y
1301# CONFIG_LEGACY_PTYS is not set 1327# CONFIG_LEGACY_PTYS is not set
1302# CONFIG_IPMI_HANDLER is not set 1328# CONFIG_IPMI_HANDLER is not set
1303CONFIG_HW_RANDOM=y 1329CONFIG_HW_RANDOM=y
1330# CONFIG_HW_RANDOM_TIMERIOMEM is not set
1304# CONFIG_HW_RANDOM_INTEL is not set 1331# CONFIG_HW_RANDOM_INTEL is not set
1305# CONFIG_HW_RANDOM_AMD is not set 1332# CONFIG_HW_RANDOM_AMD is not set
1306CONFIG_NVRAM=y 1333CONFIG_NVRAM=y
@@ -1382,7 +1409,6 @@ CONFIG_I2C_I801=y
1382# CONFIG_SENSORS_PCF8574 is not set 1409# CONFIG_SENSORS_PCF8574 is not set
1383# CONFIG_PCF8575 is not set 1410# CONFIG_PCF8575 is not set
1384# CONFIG_SENSORS_PCA9539 is not set 1411# CONFIG_SENSORS_PCA9539 is not set
1385# CONFIG_SENSORS_PCF8591 is not set
1386# CONFIG_SENSORS_MAX6875 is not set 1412# CONFIG_SENSORS_MAX6875 is not set
1387# CONFIG_SENSORS_TSL2550 is not set 1413# CONFIG_SENSORS_TSL2550 is not set
1388# CONFIG_I2C_DEBUG_CORE is not set 1414# CONFIG_I2C_DEBUG_CORE is not set
@@ -1416,6 +1442,7 @@ CONFIG_HWMON=y
1416# CONFIG_SENSORS_ADT7475 is not set 1442# CONFIG_SENSORS_ADT7475 is not set
1417# CONFIG_SENSORS_K8TEMP is not set 1443# CONFIG_SENSORS_K8TEMP is not set
1418# CONFIG_SENSORS_ASB100 is not set 1444# CONFIG_SENSORS_ASB100 is not set
1445# CONFIG_SENSORS_ATK0110 is not set
1419# CONFIG_SENSORS_ATXP1 is not set 1446# CONFIG_SENSORS_ATXP1 is not set
1420# CONFIG_SENSORS_DS1621 is not set 1447# CONFIG_SENSORS_DS1621 is not set
1421# CONFIG_SENSORS_I5K_AMB is not set 1448# CONFIG_SENSORS_I5K_AMB is not set
@@ -1425,6 +1452,7 @@ CONFIG_HWMON=y
1425# CONFIG_SENSORS_FSCHER is not set 1452# CONFIG_SENSORS_FSCHER is not set
1426# CONFIG_SENSORS_FSCPOS is not set 1453# CONFIG_SENSORS_FSCPOS is not set
1427# CONFIG_SENSORS_FSCHMD is not set 1454# CONFIG_SENSORS_FSCHMD is not set
1455# CONFIG_SENSORS_G760A is not set
1428# CONFIG_SENSORS_GL518SM is not set 1456# CONFIG_SENSORS_GL518SM is not set
1429# CONFIG_SENSORS_GL520SM is not set 1457# CONFIG_SENSORS_GL520SM is not set
1430# CONFIG_SENSORS_CORETEMP is not set 1458# CONFIG_SENSORS_CORETEMP is not set
@@ -1440,11 +1468,14 @@ CONFIG_HWMON=y
1440# CONFIG_SENSORS_LM90 is not set 1468# CONFIG_SENSORS_LM90 is not set
1441# CONFIG_SENSORS_LM92 is not set 1469# CONFIG_SENSORS_LM92 is not set
1442# CONFIG_SENSORS_LM93 is not set 1470# CONFIG_SENSORS_LM93 is not set
1471# CONFIG_SENSORS_LTC4215 is not set
1443# CONFIG_SENSORS_LTC4245 is not set 1472# CONFIG_SENSORS_LTC4245 is not set
1473# CONFIG_SENSORS_LM95241 is not set
1444# CONFIG_SENSORS_MAX1619 is not set 1474# CONFIG_SENSORS_MAX1619 is not set
1445# CONFIG_SENSORS_MAX6650 is not set 1475# CONFIG_SENSORS_MAX6650 is not set
1446# CONFIG_SENSORS_PC87360 is not set 1476# CONFIG_SENSORS_PC87360 is not set
1447# CONFIG_SENSORS_PC87427 is not set 1477# CONFIG_SENSORS_PC87427 is not set
1478# CONFIG_SENSORS_PCF8591 is not set
1448# CONFIG_SENSORS_SIS5595 is not set 1479# CONFIG_SENSORS_SIS5595 is not set
1449# CONFIG_SENSORS_DME1737 is not set 1480# CONFIG_SENSORS_DME1737 is not set
1450# CONFIG_SENSORS_SMSC47M1 is not set 1481# CONFIG_SENSORS_SMSC47M1 is not set
@@ -1635,6 +1666,7 @@ CONFIG_FB_EFI=y
1635# CONFIG_FB_VIRTUAL is not set 1666# CONFIG_FB_VIRTUAL is not set
1636# CONFIG_FB_METRONOME is not set 1667# CONFIG_FB_METRONOME is not set
1637# CONFIG_FB_MB862XX is not set 1668# CONFIG_FB_MB862XX is not set
1669# CONFIG_FB_BROADSHEET is not set
1638CONFIG_BACKLIGHT_LCD_SUPPORT=y 1670CONFIG_BACKLIGHT_LCD_SUPPORT=y
1639# CONFIG_LCD_CLASS_DEVICE is not set 1671# CONFIG_LCD_CLASS_DEVICE is not set
1640CONFIG_BACKLIGHT_CLASS_DEVICE=y 1672CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1720,6 +1752,8 @@ CONFIG_SND_PCI=y
1720# CONFIG_SND_INDIGO is not set 1752# CONFIG_SND_INDIGO is not set
1721# CONFIG_SND_INDIGOIO is not set 1753# CONFIG_SND_INDIGOIO is not set
1722# CONFIG_SND_INDIGODJ is not set 1754# CONFIG_SND_INDIGODJ is not set
1755# CONFIG_SND_INDIGOIOX is not set
1756# CONFIG_SND_INDIGODJX is not set
1723# CONFIG_SND_EMU10K1 is not set 1757# CONFIG_SND_EMU10K1 is not set
1724# CONFIG_SND_EMU10K1X is not set 1758# CONFIG_SND_EMU10K1X is not set
1725# CONFIG_SND_ENS1370 is not set 1759# CONFIG_SND_ENS1370 is not set
@@ -1792,15 +1826,17 @@ CONFIG_USB_HIDDEV=y
1792# 1826#
1793# Special HID drivers 1827# Special HID drivers
1794# 1828#
1795CONFIG_HID_COMPAT=y
1796CONFIG_HID_A4TECH=y 1829CONFIG_HID_A4TECH=y
1797CONFIG_HID_APPLE=y 1830CONFIG_HID_APPLE=y
1798CONFIG_HID_BELKIN=y 1831CONFIG_HID_BELKIN=y
1799CONFIG_HID_CHERRY=y 1832CONFIG_HID_CHERRY=y
1800CONFIG_HID_CHICONY=y 1833CONFIG_HID_CHICONY=y
1801CONFIG_HID_CYPRESS=y 1834CONFIG_HID_CYPRESS=y
1835# CONFIG_DRAGONRISE_FF is not set
1802CONFIG_HID_EZKEY=y 1836CONFIG_HID_EZKEY=y
1837CONFIG_HID_KYE=y
1803CONFIG_HID_GYRATION=y 1838CONFIG_HID_GYRATION=y
1839CONFIG_HID_KENSINGTON=y
1804CONFIG_HID_LOGITECH=y 1840CONFIG_HID_LOGITECH=y
1805CONFIG_LOGITECH_FF=y 1841CONFIG_LOGITECH_FF=y
1806# CONFIG_LOGIRUMBLEPAD2_FF is not set 1842# CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1866,11 +1902,11 @@ CONFIG_USB_PRINTER=y
1866# CONFIG_USB_TMC is not set 1902# CONFIG_USB_TMC is not set
1867 1903
1868# 1904#
1869# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; 1905# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
1870# 1906#
1871 1907
1872# 1908#
1873# see USB_STORAGE Help for more information 1909# also be needed; see USB_STORAGE Help for more info
1874# 1910#
1875CONFIG_USB_STORAGE=y 1911CONFIG_USB_STORAGE=y
1876# CONFIG_USB_STORAGE_DEBUG is not set 1912# CONFIG_USB_STORAGE_DEBUG is not set
@@ -1912,7 +1948,6 @@ CONFIG_USB_LIBUSUAL=y
1912# CONFIG_USB_LED is not set 1948# CONFIG_USB_LED is not set
1913# CONFIG_USB_CYPRESS_CY7C63 is not set 1949# CONFIG_USB_CYPRESS_CY7C63 is not set
1914# CONFIG_USB_CYTHERM is not set 1950# CONFIG_USB_CYTHERM is not set
1915# CONFIG_USB_PHIDGET is not set
1916# CONFIG_USB_IDMOUSE is not set 1951# CONFIG_USB_IDMOUSE is not set
1917# CONFIG_USB_FTDI_ELAN is not set 1952# CONFIG_USB_FTDI_ELAN is not set
1918# CONFIG_USB_APPLEDISPLAY is not set 1953# CONFIG_USB_APPLEDISPLAY is not set
@@ -1928,6 +1963,7 @@ CONFIG_USB_LIBUSUAL=y
1928# 1963#
1929# OTG and related infrastructure 1964# OTG and related infrastructure
1930# 1965#
1966# CONFIG_NOP_USB_XCEIV is not set
1931# CONFIG_UWB is not set 1967# CONFIG_UWB is not set
1932# CONFIG_MMC is not set 1968# CONFIG_MMC is not set
1933# CONFIG_MEMSTICK is not set 1969# CONFIG_MEMSTICK is not set
@@ -1939,8 +1975,10 @@ CONFIG_LEDS_CLASS=y
1939# 1975#
1940# CONFIG_LEDS_ALIX2 is not set 1976# CONFIG_LEDS_ALIX2 is not set
1941# CONFIG_LEDS_PCA9532 is not set 1977# CONFIG_LEDS_PCA9532 is not set
1978# CONFIG_LEDS_LP5521 is not set
1942# CONFIG_LEDS_CLEVO_MAIL is not set 1979# CONFIG_LEDS_CLEVO_MAIL is not set
1943# CONFIG_LEDS_PCA955X is not set 1980# CONFIG_LEDS_PCA955X is not set
1981# CONFIG_LEDS_BD2802 is not set
1944 1982
1945# 1983#
1946# LED Triggers 1984# LED Triggers
@@ -1950,6 +1988,10 @@ CONFIG_LEDS_TRIGGERS=y
1950# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set 1988# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1951# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set 1989# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
1952# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set 1990# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
1991
1992#
1993# iptables trigger is under Netfilter config (LED target)
1994#
1953# CONFIG_ACCESSIBILITY is not set 1995# CONFIG_ACCESSIBILITY is not set
1954# CONFIG_INFINIBAND is not set 1996# CONFIG_INFINIBAND is not set
1955CONFIG_EDAC=y 1997CONFIG_EDAC=y
@@ -2018,6 +2060,7 @@ CONFIG_DMADEVICES=y
2018# DMA Devices 2060# DMA Devices
2019# 2061#
2020# CONFIG_INTEL_IOATDMA is not set 2062# CONFIG_INTEL_IOATDMA is not set
2063# CONFIG_AUXDISPLAY is not set
2021# CONFIG_UIO is not set 2064# CONFIG_UIO is not set
2022# CONFIG_STAGING is not set 2065# CONFIG_STAGING is not set
2023CONFIG_X86_PLATFORM_DEVICES=y 2066CONFIG_X86_PLATFORM_DEVICES=y
@@ -2051,6 +2094,7 @@ CONFIG_DMIID=y
2051# 2094#
2052# CONFIG_EXT2_FS is not set 2095# CONFIG_EXT2_FS is not set
2053CONFIG_EXT3_FS=y 2096CONFIG_EXT3_FS=y
2097# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
2054CONFIG_EXT3_FS_XATTR=y 2098CONFIG_EXT3_FS_XATTR=y
2055CONFIG_EXT3_FS_POSIX_ACL=y 2099CONFIG_EXT3_FS_POSIX_ACL=y
2056CONFIG_EXT3_FS_SECURITY=y 2100CONFIG_EXT3_FS_SECURITY=y
@@ -2082,6 +2126,11 @@ CONFIG_AUTOFS4_FS=y
2082CONFIG_GENERIC_ACL=y 2126CONFIG_GENERIC_ACL=y
2083 2127
2084# 2128#
2129# Caches
2130#
2131# CONFIG_FSCACHE is not set
2132
2133#
2085# CD-ROM/DVD Filesystems 2134# CD-ROM/DVD Filesystems
2086# 2135#
2087CONFIG_ISO9660_FS=y 2136CONFIG_ISO9660_FS=y
@@ -2132,6 +2181,7 @@ CONFIG_MISC_FILESYSTEMS=y
2132# CONFIG_ROMFS_FS is not set 2181# CONFIG_ROMFS_FS is not set
2133# CONFIG_SYSV_FS is not set 2182# CONFIG_SYSV_FS is not set
2134# CONFIG_UFS_FS is not set 2183# CONFIG_UFS_FS is not set
2184# CONFIG_NILFS2_FS is not set
2135CONFIG_NETWORK_FILESYSTEMS=y 2185CONFIG_NETWORK_FILESYSTEMS=y
2136CONFIG_NFS_FS=y 2186CONFIG_NFS_FS=y
2137CONFIG_NFS_V3=y 2187CONFIG_NFS_V3=y
@@ -2145,7 +2195,6 @@ CONFIG_NFS_ACL_SUPPORT=y
2145CONFIG_NFS_COMMON=y 2195CONFIG_NFS_COMMON=y
2146CONFIG_SUNRPC=y 2196CONFIG_SUNRPC=y
2147CONFIG_SUNRPC_GSS=y 2197CONFIG_SUNRPC_GSS=y
2148# CONFIG_SUNRPC_REGISTER_V4 is not set
2149CONFIG_RPCSEC_GSS_KRB5=y 2198CONFIG_RPCSEC_GSS_KRB5=y
2150# CONFIG_RPCSEC_GSS_SPKM3 is not set 2199# CONFIG_RPCSEC_GSS_SPKM3 is not set
2151# CONFIG_SMB_FS is not set 2200# CONFIG_SMB_FS is not set
@@ -2232,6 +2281,7 @@ CONFIG_DEBUG_FS=y
2232CONFIG_DEBUG_KERNEL=y 2281CONFIG_DEBUG_KERNEL=y
2233# CONFIG_DEBUG_SHIRQ is not set 2282# CONFIG_DEBUG_SHIRQ is not set
2234# CONFIG_DETECT_SOFTLOCKUP is not set 2283# CONFIG_DETECT_SOFTLOCKUP is not set
2284# CONFIG_DETECT_HUNG_TASK is not set
2235# CONFIG_SCHED_DEBUG is not set 2285# CONFIG_SCHED_DEBUG is not set
2236CONFIG_SCHEDSTATS=y 2286CONFIG_SCHEDSTATS=y
2237CONFIG_TIMER_STATS=y 2287CONFIG_TIMER_STATS=y
@@ -2247,6 +2297,7 @@ CONFIG_TIMER_STATS=y
2247# CONFIG_LOCK_STAT is not set 2297# CONFIG_LOCK_STAT is not set
2248# CONFIG_DEBUG_SPINLOCK_SLEEP is not set 2298# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
2249# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set 2299# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
2300CONFIG_STACKTRACE=y
2250# CONFIG_DEBUG_KOBJECT is not set 2301# CONFIG_DEBUG_KOBJECT is not set
2251CONFIG_DEBUG_BUGVERBOSE=y 2302CONFIG_DEBUG_BUGVERBOSE=y
2252# CONFIG_DEBUG_INFO is not set 2303# CONFIG_DEBUG_INFO is not set
@@ -2269,13 +2320,19 @@ CONFIG_FRAME_POINTER=y
2269# CONFIG_FAULT_INJECTION is not set 2320# CONFIG_FAULT_INJECTION is not set
2270# CONFIG_LATENCYTOP is not set 2321# CONFIG_LATENCYTOP is not set
2271CONFIG_SYSCTL_SYSCALL_CHECK=y 2322CONFIG_SYSCTL_SYSCALL_CHECK=y
2323# CONFIG_DEBUG_PAGEALLOC is not set
2272CONFIG_USER_STACKTRACE_SUPPORT=y 2324CONFIG_USER_STACKTRACE_SUPPORT=y
2325CONFIG_NOP_TRACER=y
2273CONFIG_HAVE_FUNCTION_TRACER=y 2326CONFIG_HAVE_FUNCTION_TRACER=y
2274CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y 2327CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
2275CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y 2328CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2276CONFIG_HAVE_DYNAMIC_FTRACE=y 2329CONFIG_HAVE_DYNAMIC_FTRACE=y
2277CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2330CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2278CONFIG_HAVE_HW_BRANCH_TRACER=y 2331CONFIG_HAVE_HW_BRANCH_TRACER=y
2332CONFIG_HAVE_FTRACE_SYSCALLS=y
2333CONFIG_RING_BUFFER=y
2334CONFIG_TRACING=y
2335CONFIG_TRACING_SUPPORT=y
2279 2336
2280# 2337#
2281# Tracers 2338# Tracers
@@ -2285,13 +2342,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
2285# CONFIG_SYSPROF_TRACER is not set 2342# CONFIG_SYSPROF_TRACER is not set
2286# CONFIG_SCHED_TRACER is not set 2343# CONFIG_SCHED_TRACER is not set
2287# CONFIG_CONTEXT_SWITCH_TRACER is not set 2344# CONFIG_CONTEXT_SWITCH_TRACER is not set
2345# CONFIG_EVENT_TRACER is not set
2346# CONFIG_FTRACE_SYSCALLS is not set
2288# CONFIG_BOOT_TRACER is not set 2347# CONFIG_BOOT_TRACER is not set
2289# CONFIG_TRACE_BRANCH_PROFILING is not set 2348# CONFIG_TRACE_BRANCH_PROFILING is not set
2290# CONFIG_POWER_TRACER is not set 2349# CONFIG_POWER_TRACER is not set
2291# CONFIG_STACK_TRACER is not set 2350# CONFIG_STACK_TRACER is not set
2292# CONFIG_HW_BRANCH_TRACER is not set 2351# CONFIG_HW_BRANCH_TRACER is not set
2352# CONFIG_KMEMTRACE is not set
2353# CONFIG_WORKQUEUE_TRACER is not set
2354CONFIG_BLK_DEV_IO_TRACE=y
2355# CONFIG_FTRACE_STARTUP_TEST is not set
2356# CONFIG_MMIOTRACE is not set
2293CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2357CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2294# CONFIG_DYNAMIC_PRINTK_DEBUG is not set 2358# CONFIG_DYNAMIC_DEBUG is not set
2359# CONFIG_DMA_API_DEBUG is not set
2295# CONFIG_SAMPLES is not set 2360# CONFIG_SAMPLES is not set
2296CONFIG_HAVE_ARCH_KGDB=y 2361CONFIG_HAVE_ARCH_KGDB=y
2297# CONFIG_KGDB is not set 2362# CONFIG_KGDB is not set
@@ -2301,14 +2366,13 @@ CONFIG_EARLY_PRINTK=y
2301CONFIG_EARLY_PRINTK_DBGP=y 2366CONFIG_EARLY_PRINTK_DBGP=y
2302CONFIG_DEBUG_STACKOVERFLOW=y 2367CONFIG_DEBUG_STACKOVERFLOW=y
2303CONFIG_DEBUG_STACK_USAGE=y 2368CONFIG_DEBUG_STACK_USAGE=y
2304# CONFIG_DEBUG_PAGEALLOC is not set
2305# CONFIG_DEBUG_PER_CPU_MAPS is not set 2369# CONFIG_DEBUG_PER_CPU_MAPS is not set
2306# CONFIG_X86_PTDUMP is not set 2370# CONFIG_X86_PTDUMP is not set
2307CONFIG_DEBUG_RODATA=y 2371CONFIG_DEBUG_RODATA=y
2308# CONFIG_DEBUG_RODATA_TEST is not set 2372# CONFIG_DEBUG_RODATA_TEST is not set
2309CONFIG_DEBUG_NX_TEST=m 2373CONFIG_DEBUG_NX_TEST=m
2310# CONFIG_IOMMU_DEBUG is not set 2374# CONFIG_IOMMU_DEBUG is not set
2311# CONFIG_MMIOTRACE is not set 2375CONFIG_HAVE_MMIOTRACE_SUPPORT=y
2312CONFIG_IO_DELAY_TYPE_0X80=0 2376CONFIG_IO_DELAY_TYPE_0X80=0
2313CONFIG_IO_DELAY_TYPE_0XED=1 2377CONFIG_IO_DELAY_TYPE_0XED=1
2314CONFIG_IO_DELAY_TYPE_UDELAY=2 2378CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2344,6 +2408,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
2344CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 2408CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2345# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set 2409# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2346# CONFIG_SECURITY_SMACK is not set 2410# CONFIG_SECURITY_SMACK is not set
2411# CONFIG_SECURITY_TOMOYO is not set
2412# CONFIG_IMA is not set
2347CONFIG_CRYPTO=y 2413CONFIG_CRYPTO=y
2348 2414
2349# 2415#
@@ -2359,10 +2425,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
2359CONFIG_CRYPTO_HASH=y 2425CONFIG_CRYPTO_HASH=y
2360CONFIG_CRYPTO_HASH2=y 2426CONFIG_CRYPTO_HASH2=y
2361CONFIG_CRYPTO_RNG2=y 2427CONFIG_CRYPTO_RNG2=y
2428CONFIG_CRYPTO_PCOMP=y
2362CONFIG_CRYPTO_MANAGER=y 2429CONFIG_CRYPTO_MANAGER=y
2363CONFIG_CRYPTO_MANAGER2=y 2430CONFIG_CRYPTO_MANAGER2=y
2364# CONFIG_CRYPTO_GF128MUL is not set 2431# CONFIG_CRYPTO_GF128MUL is not set
2365# CONFIG_CRYPTO_NULL is not set 2432# CONFIG_CRYPTO_NULL is not set
2433CONFIG_CRYPTO_WORKQUEUE=y
2366# CONFIG_CRYPTO_CRYPTD is not set 2434# CONFIG_CRYPTO_CRYPTD is not set
2367CONFIG_CRYPTO_AUTHENC=y 2435CONFIG_CRYPTO_AUTHENC=y
2368# CONFIG_CRYPTO_TEST is not set 2436# CONFIG_CRYPTO_TEST is not set
@@ -2414,6 +2482,7 @@ CONFIG_CRYPTO_SHA1=y
2414# 2482#
2415CONFIG_CRYPTO_AES=y 2483CONFIG_CRYPTO_AES=y
2416# CONFIG_CRYPTO_AES_X86_64 is not set 2484# CONFIG_CRYPTO_AES_X86_64 is not set
2485# CONFIG_CRYPTO_AES_NI_INTEL is not set
2417# CONFIG_CRYPTO_ANUBIS is not set 2486# CONFIG_CRYPTO_ANUBIS is not set
2418CONFIG_CRYPTO_ARC4=y 2487CONFIG_CRYPTO_ARC4=y
2419# CONFIG_CRYPTO_BLOWFISH is not set 2488# CONFIG_CRYPTO_BLOWFISH is not set
@@ -2435,6 +2504,7 @@ CONFIG_CRYPTO_DES=y
2435# Compression 2504# Compression
2436# 2505#
2437# CONFIG_CRYPTO_DEFLATE is not set 2506# CONFIG_CRYPTO_DEFLATE is not set
2507# CONFIG_CRYPTO_ZLIB is not set
2438# CONFIG_CRYPTO_LZO is not set 2508# CONFIG_CRYPTO_LZO is not set
2439 2509
2440# 2510#
@@ -2444,10 +2514,12 @@ CONFIG_CRYPTO_DES=y
2444CONFIG_CRYPTO_HW=y 2514CONFIG_CRYPTO_HW=y
2445# CONFIG_CRYPTO_DEV_HIFN_795X is not set 2515# CONFIG_CRYPTO_DEV_HIFN_795X is not set
2446CONFIG_HAVE_KVM=y 2516CONFIG_HAVE_KVM=y
2517CONFIG_HAVE_KVM_IRQCHIP=y
2447CONFIG_VIRTUALIZATION=y 2518CONFIG_VIRTUALIZATION=y
2448# CONFIG_KVM is not set 2519# CONFIG_KVM is not set
2449# CONFIG_VIRTIO_PCI is not set 2520# CONFIG_VIRTIO_PCI is not set
2450# CONFIG_VIRTIO_BALLOON is not set 2521# CONFIG_VIRTIO_BALLOON is not set
2522CONFIG_BINARY_PRINTF=y
2451 2523
2452# 2524#
2453# Library routines 2525# Library routines
@@ -2464,7 +2536,10 @@ CONFIG_CRC32=y
2464# CONFIG_CRC7 is not set 2536# CONFIG_CRC7 is not set
2465# CONFIG_LIBCRC32C is not set 2537# CONFIG_LIBCRC32C is not set
2466CONFIG_ZLIB_INFLATE=y 2538CONFIG_ZLIB_INFLATE=y
2467CONFIG_PLIST=y 2539CONFIG_DECOMPRESS_GZIP=y
2540CONFIG_DECOMPRESS_BZIP2=y
2541CONFIG_DECOMPRESS_LZMA=y
2468CONFIG_HAS_IOMEM=y 2542CONFIG_HAS_IOMEM=y
2469CONFIG_HAS_IOPORT=y 2543CONFIG_HAS_IOPORT=y
2470CONFIG_HAS_DMA=y 2544CONFIG_HAS_DMA=y
2545CONFIG_NLATTR=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index ebe7deedd5b4..cfb0010fa940 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,8 @@
2# Arch-specific CryptoAPI modules. 2# Arch-specific CryptoAPI modules.
3# 3#
4 4
5obj-$(CONFIG_CRYPTO_FPU) += fpu.o
6
5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o 7obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o 8obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o 9obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index caba99601703..eb0566e83319 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -845,7 +845,7 @@ ENTRY(aesni_cbc_enc)
845 */ 845 */
846ENTRY(aesni_cbc_dec) 846ENTRY(aesni_cbc_dec)
847 cmp $16, LEN 847 cmp $16, LEN
848 jb .Lcbc_dec_ret 848 jb .Lcbc_dec_just_ret
849 mov 480(KEYP), KLEN 849 mov 480(KEYP), KLEN
850 add $240, KEYP 850 add $240, KEYP
851 movups (IVP), IV 851 movups (IVP), IV
@@ -891,6 +891,7 @@ ENTRY(aesni_cbc_dec)
891 add $16, OUTP 891 add $16, OUTP
892 cmp $16, LEN 892 cmp $16, LEN
893 jge .Lcbc_dec_loop1 893 jge .Lcbc_dec_loop1
894 movups IV, (IVP)
895.Lcbc_dec_ret: 894.Lcbc_dec_ret:
895 movups IV, (IVP)
896.Lcbc_dec_just_ret:
896 ret 897 ret
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 02af0af65497..c580c5ec1cad 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -21,6 +21,22 @@
21#include <asm/i387.h> 21#include <asm/i387.h>
22#include <asm/aes.h> 22#include <asm/aes.h>
23 23
24#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
25#define HAS_CTR
26#endif
27
28#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
29#define HAS_LRW
30#endif
31
32#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
33#define HAS_PCBC
34#endif
35
36#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
37#define HAS_XTS
38#endif
39
24struct async_aes_ctx { 40struct async_aes_ctx {
25 struct cryptd_ablkcipher *cryptd_tfm; 41 struct cryptd_ablkcipher *cryptd_tfm;
26}; 42};
@@ -137,6 +153,41 @@ static struct crypto_alg aesni_alg = {
137 } 153 }
138}; 154};
139 155
156static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
157{
158 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
159
160 aesni_enc(ctx, dst, src);
161}
162
163static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
164{
165 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
166
167 aesni_dec(ctx, dst, src);
168}
169
170static struct crypto_alg __aesni_alg = {
171 .cra_name = "__aes-aesni",
172 .cra_driver_name = "__driver-aes-aesni",
173 .cra_priority = 0,
174 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
175 .cra_blocksize = AES_BLOCK_SIZE,
176 .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
177 .cra_alignmask = 0,
178 .cra_module = THIS_MODULE,
179 .cra_list = LIST_HEAD_INIT(__aesni_alg.cra_list),
180 .cra_u = {
181 .cipher = {
182 .cia_min_keysize = AES_MIN_KEY_SIZE,
183 .cia_max_keysize = AES_MAX_KEY_SIZE,
184 .cia_setkey = aes_set_key,
185 .cia_encrypt = __aes_encrypt,
186 .cia_decrypt = __aes_decrypt
187 }
188 }
189};
190
140static int ecb_encrypt(struct blkcipher_desc *desc, 191static int ecb_encrypt(struct blkcipher_desc *desc,
141 struct scatterlist *dst, struct scatterlist *src, 192 struct scatterlist *dst, struct scatterlist *src,
142 unsigned int nbytes) 193 unsigned int nbytes)
@@ -147,6 +198,7 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
147 198
148 blkcipher_walk_init(&walk, dst, src, nbytes); 199 blkcipher_walk_init(&walk, dst, src, nbytes);
149 err = blkcipher_walk_virt(desc, &walk); 200 err = blkcipher_walk_virt(desc, &walk);
201 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
150 202
151 kernel_fpu_begin(); 203 kernel_fpu_begin();
152 while ((nbytes = walk.nbytes)) { 204 while ((nbytes = walk.nbytes)) {
@@ -170,6 +222,7 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
170 222
171 blkcipher_walk_init(&walk, dst, src, nbytes); 223 blkcipher_walk_init(&walk, dst, src, nbytes);
172 err = blkcipher_walk_virt(desc, &walk); 224 err = blkcipher_walk_virt(desc, &walk);
225 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
173 226
174 kernel_fpu_begin(); 227 kernel_fpu_begin();
175 while ((nbytes = walk.nbytes)) { 228 while ((nbytes = walk.nbytes)) {
@@ -215,6 +268,7 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
215 268
216 blkcipher_walk_init(&walk, dst, src, nbytes); 269 blkcipher_walk_init(&walk, dst, src, nbytes);
217 err = blkcipher_walk_virt(desc, &walk); 270 err = blkcipher_walk_virt(desc, &walk);
271 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
218 272
219 kernel_fpu_begin(); 273 kernel_fpu_begin();
220 while ((nbytes = walk.nbytes)) { 274 while ((nbytes = walk.nbytes)) {
@@ -238,6 +292,7 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
238 292
239 blkcipher_walk_init(&walk, dst, src, nbytes); 293 blkcipher_walk_init(&walk, dst, src, nbytes);
240 err = blkcipher_walk_virt(desc, &walk); 294 err = blkcipher_walk_virt(desc, &walk);
295 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
241 296
242 kernel_fpu_begin(); 297 kernel_fpu_begin();
243 while ((nbytes = walk.nbytes)) { 298 while ((nbytes = walk.nbytes)) {
@@ -277,8 +332,16 @@ static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
277 unsigned int key_len) 332 unsigned int key_len)
278{ 333{
279 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); 334 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
335 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
336 int err;
280 337
281 return crypto_ablkcipher_setkey(&ctx->cryptd_tfm->base, key, key_len); 338 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
339 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
340 & CRYPTO_TFM_REQ_MASK);
341 err = crypto_ablkcipher_setkey(child, key, key_len);
342 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
343 & CRYPTO_TFM_RES_MASK);
344 return err;
282} 345}
283 346
284static int ablk_encrypt(struct ablkcipher_request *req) 347static int ablk_encrypt(struct ablkcipher_request *req)
@@ -411,6 +474,163 @@ static struct crypto_alg ablk_cbc_alg = {
411 }, 474 },
412}; 475};
413 476
477#ifdef HAS_CTR
478static int ablk_ctr_init(struct crypto_tfm *tfm)
479{
480 struct cryptd_ablkcipher *cryptd_tfm;
481
482 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))",
483 0, 0);
484 if (IS_ERR(cryptd_tfm))
485 return PTR_ERR(cryptd_tfm);
486 ablk_init_common(tfm, cryptd_tfm);
487 return 0;
488}
489
490static struct crypto_alg ablk_ctr_alg = {
491 .cra_name = "ctr(aes)",
492 .cra_driver_name = "ctr-aes-aesni",
493 .cra_priority = 400,
494 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
495 .cra_blocksize = 1,
496 .cra_ctxsize = sizeof(struct async_aes_ctx),
497 .cra_alignmask = 0,
498 .cra_type = &crypto_ablkcipher_type,
499 .cra_module = THIS_MODULE,
500 .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
501 .cra_init = ablk_ctr_init,
502 .cra_exit = ablk_exit,
503 .cra_u = {
504 .ablkcipher = {
505 .min_keysize = AES_MIN_KEY_SIZE,
506 .max_keysize = AES_MAX_KEY_SIZE,
507 .ivsize = AES_BLOCK_SIZE,
508 .setkey = ablk_set_key,
509 .encrypt = ablk_encrypt,
510 .decrypt = ablk_decrypt,
511 .geniv = "chainiv",
512 },
513 },
514};
515#endif
516
517#ifdef HAS_LRW
518static int ablk_lrw_init(struct crypto_tfm *tfm)
519{
520 struct cryptd_ablkcipher *cryptd_tfm;
521
522 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))",
523 0, 0);
524 if (IS_ERR(cryptd_tfm))
525 return PTR_ERR(cryptd_tfm);
526 ablk_init_common(tfm, cryptd_tfm);
527 return 0;
528}
529
530static struct crypto_alg ablk_lrw_alg = {
531 .cra_name = "lrw(aes)",
532 .cra_driver_name = "lrw-aes-aesni",
533 .cra_priority = 400,
534 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
535 .cra_blocksize = AES_BLOCK_SIZE,
536 .cra_ctxsize = sizeof(struct async_aes_ctx),
537 .cra_alignmask = 0,
538 .cra_type = &crypto_ablkcipher_type,
539 .cra_module = THIS_MODULE,
540 .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
541 .cra_init = ablk_lrw_init,
542 .cra_exit = ablk_exit,
543 .cra_u = {
544 .ablkcipher = {
545 .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
546 .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
547 .ivsize = AES_BLOCK_SIZE,
548 .setkey = ablk_set_key,
549 .encrypt = ablk_encrypt,
550 .decrypt = ablk_decrypt,
551 },
552 },
553};
554#endif
555
556#ifdef HAS_PCBC
557static int ablk_pcbc_init(struct crypto_tfm *tfm)
558{
559 struct cryptd_ablkcipher *cryptd_tfm;
560
561 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))",
562 0, 0);
563 if (IS_ERR(cryptd_tfm))
564 return PTR_ERR(cryptd_tfm);
565 ablk_init_common(tfm, cryptd_tfm);
566 return 0;
567}
568
569static struct crypto_alg ablk_pcbc_alg = {
570 .cra_name = "pcbc(aes)",
571 .cra_driver_name = "pcbc-aes-aesni",
572 .cra_priority = 400,
573 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
574 .cra_blocksize = AES_BLOCK_SIZE,
575 .cra_ctxsize = sizeof(struct async_aes_ctx),
576 .cra_alignmask = 0,
577 .cra_type = &crypto_ablkcipher_type,
578 .cra_module = THIS_MODULE,
579 .cra_list = LIST_HEAD_INIT(ablk_pcbc_alg.cra_list),
580 .cra_init = ablk_pcbc_init,
581 .cra_exit = ablk_exit,
582 .cra_u = {
583 .ablkcipher = {
584 .min_keysize = AES_MIN_KEY_SIZE,
585 .max_keysize = AES_MAX_KEY_SIZE,
586 .ivsize = AES_BLOCK_SIZE,
587 .setkey = ablk_set_key,
588 .encrypt = ablk_encrypt,
589 .decrypt = ablk_decrypt,
590 },
591 },
592};
593#endif
594
595#ifdef HAS_XTS
596static int ablk_xts_init(struct crypto_tfm *tfm)
597{
598 struct cryptd_ablkcipher *cryptd_tfm;
599
600 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))",
601 0, 0);
602 if (IS_ERR(cryptd_tfm))
603 return PTR_ERR(cryptd_tfm);
604 ablk_init_common(tfm, cryptd_tfm);
605 return 0;
606}
607
608static struct crypto_alg ablk_xts_alg = {
609 .cra_name = "xts(aes)",
610 .cra_driver_name = "xts-aes-aesni",
611 .cra_priority = 400,
612 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
613 .cra_blocksize = AES_BLOCK_SIZE,
614 .cra_ctxsize = sizeof(struct async_aes_ctx),
615 .cra_alignmask = 0,
616 .cra_type = &crypto_ablkcipher_type,
617 .cra_module = THIS_MODULE,
618 .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list),
619 .cra_init = ablk_xts_init,
620 .cra_exit = ablk_exit,
621 .cra_u = {
622 .ablkcipher = {
623 .min_keysize = 2 * AES_MIN_KEY_SIZE,
624 .max_keysize = 2 * AES_MAX_KEY_SIZE,
625 .ivsize = AES_BLOCK_SIZE,
626 .setkey = ablk_set_key,
627 .encrypt = ablk_encrypt,
628 .decrypt = ablk_decrypt,
629 },
630 },
631};
632#endif
633
414static int __init aesni_init(void) 634static int __init aesni_init(void)
415{ 635{
416 int err; 636 int err;
@@ -421,6 +641,8 @@ static int __init aesni_init(void)
421 } 641 }
422 if ((err = crypto_register_alg(&aesni_alg))) 642 if ((err = crypto_register_alg(&aesni_alg)))
423 goto aes_err; 643 goto aes_err;
644 if ((err = crypto_register_alg(&__aesni_alg)))
645 goto __aes_err;
424 if ((err = crypto_register_alg(&blk_ecb_alg))) 646 if ((err = crypto_register_alg(&blk_ecb_alg)))
425 goto blk_ecb_err; 647 goto blk_ecb_err;
426 if ((err = crypto_register_alg(&blk_cbc_alg))) 648 if ((err = crypto_register_alg(&blk_cbc_alg)))
@@ -429,9 +651,41 @@ static int __init aesni_init(void)
429 goto ablk_ecb_err; 651 goto ablk_ecb_err;
430 if ((err = crypto_register_alg(&ablk_cbc_alg))) 652 if ((err = crypto_register_alg(&ablk_cbc_alg)))
431 goto ablk_cbc_err; 653 goto ablk_cbc_err;
654#ifdef HAS_CTR
655 if ((err = crypto_register_alg(&ablk_ctr_alg)))
656 goto ablk_ctr_err;
657#endif
658#ifdef HAS_LRW
659 if ((err = crypto_register_alg(&ablk_lrw_alg)))
660 goto ablk_lrw_err;
661#endif
662#ifdef HAS_PCBC
663 if ((err = crypto_register_alg(&ablk_pcbc_alg)))
664 goto ablk_pcbc_err;
665#endif
666#ifdef HAS_XTS
667 if ((err = crypto_register_alg(&ablk_xts_alg)))
668 goto ablk_xts_err;
669#endif
432 670
433 return err; 671 return err;
434 672
673#ifdef HAS_XTS
674ablk_xts_err:
675#endif
676#ifdef HAS_PCBC
677 crypto_unregister_alg(&ablk_pcbc_alg);
678ablk_pcbc_err:
679#endif
680#ifdef HAS_LRW
681 crypto_unregister_alg(&ablk_lrw_alg);
682ablk_lrw_err:
683#endif
684#ifdef HAS_CTR
685 crypto_unregister_alg(&ablk_ctr_alg);
686ablk_ctr_err:
687#endif
688 crypto_unregister_alg(&ablk_cbc_alg);
435ablk_cbc_err: 689ablk_cbc_err:
436 crypto_unregister_alg(&ablk_ecb_alg); 690 crypto_unregister_alg(&ablk_ecb_alg);
437ablk_ecb_err: 691ablk_ecb_err:
@@ -439,6 +693,8 @@ ablk_ecb_err:
439blk_cbc_err: 693blk_cbc_err:
440 crypto_unregister_alg(&blk_ecb_alg); 694 crypto_unregister_alg(&blk_ecb_alg);
441blk_ecb_err: 695blk_ecb_err:
696 crypto_unregister_alg(&__aesni_alg);
697__aes_err:
442 crypto_unregister_alg(&aesni_alg); 698 crypto_unregister_alg(&aesni_alg);
443aes_err: 699aes_err:
444 return err; 700 return err;
@@ -446,10 +702,23 @@ aes_err:
446 702
447static void __exit aesni_exit(void) 703static void __exit aesni_exit(void)
448{ 704{
705#ifdef HAS_XTS
706 crypto_unregister_alg(&ablk_xts_alg);
707#endif
708#ifdef HAS_PCBC
709 crypto_unregister_alg(&ablk_pcbc_alg);
710#endif
711#ifdef HAS_LRW
712 crypto_unregister_alg(&ablk_lrw_alg);
713#endif
714#ifdef HAS_CTR
715 crypto_unregister_alg(&ablk_ctr_alg);
716#endif
449 crypto_unregister_alg(&ablk_cbc_alg); 717 crypto_unregister_alg(&ablk_cbc_alg);
450 crypto_unregister_alg(&ablk_ecb_alg); 718 crypto_unregister_alg(&ablk_ecb_alg);
451 crypto_unregister_alg(&blk_cbc_alg); 719 crypto_unregister_alg(&blk_cbc_alg);
452 crypto_unregister_alg(&blk_ecb_alg); 720 crypto_unregister_alg(&blk_ecb_alg);
721 crypto_unregister_alg(&__aesni_alg);
453 crypto_unregister_alg(&aesni_alg); 722 crypto_unregister_alg(&aesni_alg);
454} 723}
455 724
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
new file mode 100644
index 000000000000..daef6cd2b45d
--- /dev/null
+++ b/arch/x86/crypto/fpu.c
@@ -0,0 +1,166 @@
1/*
2 * FPU: Wrapper for blkcipher touching fpu
3 *
4 * Copyright (c) Intel Corp.
5 * Author: Huang Ying <ying.huang@intel.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published by the Free
9 * Software Foundation; either version 2 of the License, or (at your option)
10 * any later version.
11 *
12 */
13
14#include <crypto/algapi.h>
15#include <linux/err.h>
16#include <linux/init.h>
17#include <linux/kernel.h>
18#include <linux/module.h>
19#include <asm/i387.h>
20
21struct crypto_fpu_ctx {
22 struct crypto_blkcipher *child;
23};
24
25static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key,
26 unsigned int keylen)
27{
28 struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent);
29 struct crypto_blkcipher *child = ctx->child;
30 int err;
31
32 crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
33 crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) &
34 CRYPTO_TFM_REQ_MASK);
35 err = crypto_blkcipher_setkey(child, key, keylen);
36 crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) &
37 CRYPTO_TFM_RES_MASK);
38 return err;
39}
40
41static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in,
42 struct scatterlist *dst, struct scatterlist *src,
43 unsigned int nbytes)
44{
45 int err;
46 struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
47 struct crypto_blkcipher *child = ctx->child;
48 struct blkcipher_desc desc = {
49 .tfm = child,
50 .info = desc_in->info,
51 .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP,
52 };
53
54 kernel_fpu_begin();
55 err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes);
56 kernel_fpu_end();
57 return err;
58}
59
60static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in,
61 struct scatterlist *dst, struct scatterlist *src,
62 unsigned int nbytes)
63{
64 int err;
65 struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
66 struct crypto_blkcipher *child = ctx->child;
67 struct blkcipher_desc desc = {
68 .tfm = child,
69 .info = desc_in->info,
70 .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP,
71 };
72
73 kernel_fpu_begin();
74 err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes);
75 kernel_fpu_end();
76 return err;
77}
78
79static int crypto_fpu_init_tfm(struct crypto_tfm *tfm)
80{
81 struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
82 struct crypto_spawn *spawn = crypto_instance_ctx(inst);
83 struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
84 struct crypto_blkcipher *cipher;
85
86 cipher = crypto_spawn_blkcipher(spawn);
87 if (IS_ERR(cipher))
88 return PTR_ERR(cipher);
89
90 ctx->child = cipher;
91 return 0;
92}
93
94static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm)
95{
96 struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
97 crypto_free_blkcipher(ctx->child);
98}
99
100static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb)
101{
102 struct crypto_instance *inst;
103 struct crypto_alg *alg;
104 int err;
105
106 err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER);
107 if (err)
108 return ERR_PTR(err);
109
110 alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER,
111 CRYPTO_ALG_TYPE_MASK);
112 if (IS_ERR(alg))
113 return ERR_CAST(alg);
114
115 inst = crypto_alloc_instance("fpu", alg);
116 if (IS_ERR(inst))
117 goto out_put_alg;
118
119 inst->alg.cra_flags = alg->cra_flags;
120 inst->alg.cra_priority = alg->cra_priority;
121 inst->alg.cra_blocksize = alg->cra_blocksize;
122 inst->alg.cra_alignmask = alg->cra_alignmask;
123 inst->alg.cra_type = alg->cra_type;
124 inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize;
125 inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
126 inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize;
127 inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx);
128 inst->alg.cra_init = crypto_fpu_init_tfm;
129 inst->alg.cra_exit = crypto_fpu_exit_tfm;
130 inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey;
131 inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt;
132 inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt;
133
134out_put_alg:
135 crypto_mod_put(alg);
136 return inst;
137}
138
139static void crypto_fpu_free(struct crypto_instance *inst)
140{
141 crypto_drop_spawn(crypto_instance_ctx(inst));
142 kfree(inst);
143}
144
145static struct crypto_template crypto_fpu_tmpl = {
146 .name = "fpu",
147 .alloc = crypto_fpu_alloc,
148 .free = crypto_fpu_free,
149 .module = THIS_MODULE,
150};
151
152static int __init crypto_fpu_module_init(void)
153{
154 return crypto_register_template(&crypto_fpu_tmpl);
155}
156
157static void __exit crypto_fpu_module_exit(void)
158{
159 crypto_unregister_template(&crypto_fpu_tmpl);
160}
161
162module_init(crypto_fpu_module_init);
163module_exit(crypto_fpu_module_exit);
164
165MODULE_LICENSE("GPL");
166MODULE_DESCRIPTION("FPU block cipher wrapper");
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a505202086e8..e590261ba059 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -825,9 +825,11 @@ ia32_sys_call_table:
825 .quad compat_sys_signalfd4 825 .quad compat_sys_signalfd4
826 .quad sys_eventfd2 826 .quad sys_eventfd2
827 .quad sys_epoll_create1 827 .quad sys_epoll_create1
828 .quad sys_dup3 /* 330 */ 828 .quad sys_dup3 /* 330 */
829 .quad sys_pipe2 829 .quad sys_pipe2
830 .quad sys_inotify_init1 830 .quad sys_inotify_init1
831 .quad compat_sys_preadv 831 .quad compat_sys_preadv
832 .quad compat_sys_pwritev 832 .quad compat_sys_pwritev
833 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
834 .quad sys_perf_counter_open
833ia32_syscall_end: 835ia32_syscall_end:
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4518dc500903..20d1465a2ab0 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -144,6 +144,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
144 144
145#else /* !CONFIG_ACPI */ 145#else /* !CONFIG_ACPI */
146 146
147#define acpi_disabled 1
147#define acpi_lapic 0 148#define acpi_lapic 0
148#define acpi_ioapic 0 149#define acpi_ioapic 0
149static inline void acpi_noirq_set(void) { } 150static inline void acpi_noirq_set(void) { }
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index f6aa18eadf71..1a37bcdc8606 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -3,6 +3,7 @@
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/stddef.h> 5#include <linux/stddef.h>
6#include <linux/stringify.h>
6#include <asm/asm.h> 7#include <asm/asm.h>
7 8
8/* 9/*
@@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {}
74 75
75const unsigned char *const *find_nop_table(void); 76const unsigned char *const *find_nop_table(void);
76 77
78/* alternative assembly primitive: */
79#define ALTERNATIVE(oldinstr, newinstr, feature) \
80 \
81 "661:\n\t" oldinstr "\n662:\n" \
82 ".section .altinstructions,\"a\"\n" \
83 _ASM_ALIGN "\n" \
84 _ASM_PTR "661b\n" /* label */ \
85 _ASM_PTR "663f\n" /* new instruction */ \
86 " .byte " __stringify(feature) "\n" /* feature bit */ \
87 " .byte 662b-661b\n" /* sourcelen */ \
88 " .byte 664f-663f\n" /* replacementlen */ \
89 ".previous\n" \
90 ".section .altinstr_replacement, \"ax\"\n" \
91 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
92 ".previous"
93
77/* 94/*
78 * Alternative instructions for different CPU types or capabilities. 95 * Alternative instructions for different CPU types or capabilities.
79 * 96 *
@@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void);
87 * without volatile and memory clobber. 104 * without volatile and memory clobber.
88 */ 105 */
89#define alternative(oldinstr, newinstr, feature) \ 106#define alternative(oldinstr, newinstr, feature) \
90 asm volatile ("661:\n\t" oldinstr "\n662:\n" \ 107 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
91 ".section .altinstructions,\"a\"\n" \
92 _ASM_ALIGN "\n" \
93 _ASM_PTR "661b\n" /* label */ \
94 _ASM_PTR "663f\n" /* new instruction */ \
95 " .byte %c0\n" /* feature bit */ \
96 " .byte 662b-661b\n" /* sourcelen */ \
97 " .byte 664f-663f\n" /* replacementlen */ \
98 ".previous\n" \
99 ".section .altinstr_replacement,\"ax\"\n" \
100 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
101 ".previous" :: "i" (feature) : "memory")
102 108
103/* 109/*
104 * Alternative inline assembly with input. 110 * Alternative inline assembly with input.
@@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void);
109 * Best is to use constraints that are fixed size (like (%1) ... "r") 115 * Best is to use constraints that are fixed size (like (%1) ... "r")
110 * If you use variable sized constraints like "m" or "g" in the 116 * If you use variable sized constraints like "m" or "g" in the
111 * replacement make sure to pad to the worst case length. 117 * replacement make sure to pad to the worst case length.
118 * Leaving an unused argument 0 to keep API compatibility.
112 */ 119 */
113#define alternative_input(oldinstr, newinstr, feature, input...) \ 120#define alternative_input(oldinstr, newinstr, feature, input...) \
114 asm volatile ("661:\n\t" oldinstr "\n662:\n" \ 121 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
115 ".section .altinstructions,\"a\"\n" \ 122 : : "i" (0), ## input)
116 _ASM_ALIGN "\n" \
117 _ASM_PTR "661b\n" /* label */ \
118 _ASM_PTR "663f\n" /* new instruction */ \
119 " .byte %c0\n" /* feature bit */ \
120 " .byte 662b-661b\n" /* sourcelen */ \
121 " .byte 664f-663f\n" /* replacementlen */ \
122 ".previous\n" \
123 ".section .altinstr_replacement,\"ax\"\n" \
124 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
125 ".previous" :: "i" (feature), ##input)
126 123
127/* Like alternative_input, but with a single output argument */ 124/* Like alternative_input, but with a single output argument */
128#define alternative_io(oldinstr, newinstr, feature, output, input...) \ 125#define alternative_io(oldinstr, newinstr, feature, output, input...) \
129 asm volatile ("661:\n\t" oldinstr "\n662:\n" \ 126 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
130 ".section .altinstructions,\"a\"\n" \ 127 : output : "i" (0), ## input)
131 _ASM_ALIGN "\n" \
132 _ASM_PTR "661b\n" /* label */ \
133 _ASM_PTR "663f\n" /* new instruction */ \
134 " .byte %c[feat]\n" /* feature bit */ \
135 " .byte 662b-661b\n" /* sourcelen */ \
136 " .byte 664f-663f\n" /* replacementlen */ \
137 ".previous\n" \
138 ".section .altinstr_replacement,\"ax\"\n" \
139 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
140 ".previous" : output : [feat] "i" (feature), ##input)
141 128
142/* 129/*
143 * use this macro(s) if you need more than one output parameter 130 * use this macro(s) if you need more than one output parameter
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index f712344329bc..bdf96f119f06 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -27,9 +27,13 @@ extern int amd_iommu_init(void);
27extern int amd_iommu_init_dma_ops(void); 27extern int amd_iommu_init_dma_ops(void);
28extern void amd_iommu_detect(void); 28extern void amd_iommu_detect(void);
29extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 29extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
30extern void amd_iommu_flush_all_domains(void);
31extern void amd_iommu_flush_all_devices(void);
32extern void amd_iommu_shutdown(void);
30#else 33#else
31static inline int amd_iommu_init(void) { return -ENODEV; } 34static inline int amd_iommu_init(void) { return -ENODEV; }
32static inline void amd_iommu_detect(void) { } 35static inline void amd_iommu_detect(void) { }
36static inline void amd_iommu_shutdown(void) { }
33#endif 37#endif
34 38
35#endif /* _ASM_X86_AMD_IOMMU_H */ 39#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 95c8cd9d22b5..0c878caaa0a2 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -194,6 +194,27 @@
194#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ 194#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
195#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops 195#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
196 domain for an IOMMU */ 196 domain for an IOMMU */
197extern bool amd_iommu_dump;
198#define DUMP_printk(format, arg...) \
199 do { \
200 if (amd_iommu_dump) \
201 printk(KERN_INFO "AMD IOMMU: " format, ## arg); \
202 } while(0);
203
204/*
205 * Make iterating over all IOMMUs easier
206 */
207#define for_each_iommu(iommu) \
208 list_for_each_entry((iommu), &amd_iommu_list, list)
209#define for_each_iommu_safe(iommu, next) \
210 list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
211
212#define APERTURE_RANGE_SHIFT 27 /* 128 MB */
213#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT)
214#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
215#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */
216#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
217#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
197 218
198/* 219/*
199 * This structure contains generic data for IOMMU protection domains 220 * This structure contains generic data for IOMMU protection domains
@@ -210,6 +231,26 @@ struct protection_domain {
210}; 231};
211 232
212/* 233/*
234 * For dynamic growth the aperture size is split into ranges of 128MB of
235 * DMA address space each. This struct represents one such range.
236 */
237struct aperture_range {
238
239 /* address allocation bitmap */
240 unsigned long *bitmap;
241
242 /*
243 * Array of PTE pages for the aperture. In this array we save all the
244 * leaf pages of the domain page table used for the aperture. This way
245 * we don't need to walk the page table to find a specific PTE. We can
246 * just calculate its address in constant time.
247 */
248 u64 *pte_pages[64];
249
250 unsigned long offset;
251};
252
253/*
213 * Data container for a dma_ops specific protection domain 254 * Data container for a dma_ops specific protection domain
214 */ 255 */
215struct dma_ops_domain { 256struct dma_ops_domain {
@@ -222,18 +263,10 @@ struct dma_ops_domain {
222 unsigned long aperture_size; 263 unsigned long aperture_size;
223 264
224 /* address we start to search for free addresses */ 265 /* address we start to search for free addresses */
225 unsigned long next_bit; 266 unsigned long next_address;
226
227 /* address allocation bitmap */
228 unsigned long *bitmap;
229 267
230 /* 268 /* address space relevant data */
231 * Array of PTE pages for the aperture. In this array we save all the 269 struct aperture_range *aperture[APERTURE_MAX_RANGES];
232 * leaf pages of the domain page table used for the aperture. This way
233 * we don't need to walk the page table to find a specific PTE. We can
234 * just calculate its address in constant time.
235 */
236 u64 **pte_pages;
237 270
238 /* This will be set to true when TLB needs to be flushed */ 271 /* This will be set to true when TLB needs to be flushed */
239 bool need_flush; 272 bool need_flush;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 42f2f8377422..bb7d47925847 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -107,8 +107,7 @@ extern u32 native_safe_apic_wait_icr_idle(void);
107extern void native_apic_icr_write(u32 low, u32 id); 107extern void native_apic_icr_write(u32 low, u32 id);
108extern u64 native_apic_icr_read(void); 108extern u64 native_apic_icr_read(void);
109 109
110#define EIM_8BIT_APIC_ID 0 110extern int x2apic_mode;
111#define EIM_32BIT_APIC_ID 1
112 111
113#ifdef CONFIG_X86_X2APIC 112#ifdef CONFIG_X86_X2APIC
114/* 113/*
@@ -166,10 +165,9 @@ static inline u64 native_x2apic_icr_read(void)
166 return val; 165 return val;
167} 166}
168 167
169extern int x2apic, x2apic_phys; 168extern int x2apic_phys;
170extern void check_x2apic(void); 169extern void check_x2apic(void);
171extern void enable_x2apic(void); 170extern void enable_x2apic(void);
172extern void enable_IR_x2apic(void);
173extern void x2apic_icr_write(u32 low, u32 id); 171extern void x2apic_icr_write(u32 low, u32 id);
174static inline int x2apic_enabled(void) 172static inline int x2apic_enabled(void)
175{ 173{
@@ -183,6 +181,8 @@ static inline int x2apic_enabled(void)
183 return 1; 181 return 1;
184 return 0; 182 return 0;
185} 183}
184
185#define x2apic_supported() (cpu_has_x2apic)
186#else 186#else
187static inline void check_x2apic(void) 187static inline void check_x2apic(void)
188{ 188{
@@ -190,28 +190,20 @@ static inline void check_x2apic(void)
190static inline void enable_x2apic(void) 190static inline void enable_x2apic(void)
191{ 191{
192} 192}
193static inline void enable_IR_x2apic(void)
194{
195}
196static inline int x2apic_enabled(void) 193static inline int x2apic_enabled(void)
197{ 194{
198 return 0; 195 return 0;
199} 196}
200 197
201#define x2apic 0 198#define x2apic_preenabled 0
202 199#define x2apic_supported() 0
203#endif 200#endif
204 201
205extern int get_physical_broadcast(void); 202extern void enable_IR_x2apic(void);
206 203
207#ifdef CONFIG_X86_X2APIC 204extern int get_physical_broadcast(void);
208static inline void ack_x2APIC_irq(void)
209{
210 /* Docs say use 0 for future compatibility */
211 native_apic_msr_write(APIC_EOI, 0);
212}
213#endif
214 205
206extern void apic_disable(void);
215extern int lapic_get_maxlvt(void); 207extern int lapic_get_maxlvt(void);
216extern void clear_local_APIC(void); 208extern void clear_local_APIC(void);
217extern void connect_bsp_APIC(void); 209extern void connect_bsp_APIC(void);
@@ -252,7 +244,7 @@ static inline void lapic_shutdown(void) { }
252#define local_apic_timer_c2_ok 1 244#define local_apic_timer_c2_ok 1
253static inline void init_apic_mappings(void) { } 245static inline void init_apic_mappings(void) { }
254static inline void disable_local_APIC(void) { } 246static inline void disable_local_APIC(void) { }
255 247static inline void apic_disable(void) { }
256#endif /* !CONFIG_X86_LOCAL_APIC */ 248#endif /* !CONFIG_X86_LOCAL_APIC */
257 249
258#ifdef CONFIG_X86_64 250#ifdef CONFIG_X86_64
@@ -410,7 +402,7 @@ static inline unsigned default_get_apic_id(unsigned long x)
410{ 402{
411 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); 403 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
412 404
413 if (APIC_XAPIC(ver)) 405 if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
414 return (x >> 24) & 0xFF; 406 return (x >> 24) & 0xFF;
415 else 407 else
416 return (x >> 24) & 0x0F; 408 return (x >> 24) & 0x0F;
@@ -478,6 +470,9 @@ static inline unsigned int read_apic_id(void)
478extern void default_setup_apic_routing(void); 470extern void default_setup_apic_routing(void);
479 471
480#ifdef CONFIG_X86_32 472#ifdef CONFIG_X86_32
473
474extern struct apic apic_default;
475
481/* 476/*
482 * Set up the logical destination ID. 477 * Set up the logical destination ID.
483 * 478 *
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index bc9514fb3b13..7ddb36ab933b 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -22,6 +22,7 @@
22# define APIC_INTEGRATED(x) (1) 22# define APIC_INTEGRATED(x) (1)
23#endif 23#endif
24#define APIC_XAPIC(x) ((x) >= 0x14) 24#define APIC_XAPIC(x) ((x) >= 0x14)
25#define APIC_EXT_SPACE(x) ((x) & 0x80000000)
25#define APIC_TASKPRI 0x80 26#define APIC_TASKPRI 0x80
26#define APIC_TPRI_MASK 0xFFu 27#define APIC_TPRI_MASK 0xFFu
27#define APIC_ARBPRI 0x90 28#define APIC_ARBPRI 0x90
@@ -116,7 +117,9 @@
116#define APIC_TDR_DIV_32 0x8 117#define APIC_TDR_DIV_32 0x8
117#define APIC_TDR_DIV_64 0x9 118#define APIC_TDR_DIV_64 0x9
118#define APIC_TDR_DIV_128 0xA 119#define APIC_TDR_DIV_128 0xA
119#define APIC_EILVT0 0x500 120#define APIC_EFEAT 0x400
121#define APIC_ECTRL 0x410
122#define APIC_EILVTn(n) (0x500 + 0x10 * n)
120#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ 123#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
121#define APIC_EILVT_NR_AMD_10H 4 124#define APIC_EILVT_NR_AMD_10H 4
122#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) 125#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
@@ -125,9 +128,6 @@
125#define APIC_EILVT_MSG_NMI 0x4 128#define APIC_EILVT_MSG_NMI 0x4
126#define APIC_EILVT_MSG_EXT 0x7 129#define APIC_EILVT_MSG_EXT 0x7
127#define APIC_EILVT_MASKED (1 << 16) 130#define APIC_EILVT_MASKED (1 << 16)
128#define APIC_EILVT1 0x510
129#define APIC_EILVT2 0x520
130#define APIC_EILVT3 0x530
131 131
132#define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) 132#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
133#define APIC_BASE_MSR 0x800 133#define APIC_BASE_MSR 0x800
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba4229..2503d4e64c2a 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,240 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
247#define smp_mb__before_atomic_inc() barrier() 247#define smp_mb__before_atomic_inc() barrier()
248#define smp_mb__after_atomic_inc() barrier() 248#define smp_mb__after_atomic_inc() barrier()
249 249
250#include <asm-generic/atomic.h> 250/* An 64bit atomic type */
251
252typedef struct {
253 unsigned long long counter;
254} atomic64_t;
255
256#define ATOMIC64_INIT(val) { (val) }
257
258/**
259 * atomic64_read - read atomic64 variable
260 * @ptr: pointer of type atomic64_t
261 *
262 * Atomically reads the value of @v.
263 * Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292
293/**
294 * atomic64_xchg - xchg atomic64 variable
295 * @ptr: pointer to type atomic64_t
296 * @new_val: value to assign
297 *
298 * Atomically xchgs the value of @ptr to @new_val and returns
299 * the old value.
300 */
301
302static inline unsigned long long
303atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
304{
305 unsigned long long old_val;
306
307 do {
308 old_val = atomic_read(ptr);
309 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
310
311 return old_val;
312}
313
314/**
315 * atomic64_set - set atomic64 variable
316 * @ptr: pointer to type atomic64_t
317 * @new_val: value to assign
318 *
319 * Atomically sets the value of @ptr to @new_val.
320 */
321static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
322{
323 atomic64_xchg(ptr, new_val);
324}
325
326/**
327 * atomic64_read - read atomic64 variable
328 * @ptr: pointer to type atomic64_t
329 *
330 * Atomically reads the value of @ptr and returns it.
331 */
332static inline unsigned long long atomic64_read(atomic64_t *ptr)
333{
334 unsigned long long curr_val;
335
336 do {
337 curr_val = __atomic64_read(ptr);
338 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
339
340 return curr_val;
341}
342
343/**
344 * atomic64_add_return - add and return
345 * @delta: integer value to add
346 * @ptr: pointer to type atomic64_t
347 *
348 * Atomically adds @delta to @ptr and returns @delta + *@ptr
349 */
350static inline unsigned long long
351atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
352{
353 unsigned long long old_val, new_val;
354
355 do {
356 old_val = atomic_read(ptr);
357 new_val = old_val + delta;
358
359 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
360
361 return new_val;
362}
363
364static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
365{
366 return atomic64_add_return(-delta, ptr);
367}
368
369static inline long atomic64_inc_return(atomic64_t *ptr)
370{
371 return atomic64_add_return(1, ptr);
372}
373
374static inline long atomic64_dec_return(atomic64_t *ptr)
375{
376 return atomic64_sub_return(1, ptr);
377}
378
379/**
380 * atomic64_add - add integer to atomic64 variable
381 * @delta: integer value to add
382 * @ptr: pointer to type atomic64_t
383 *
384 * Atomically adds @delta to @ptr.
385 */
386static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
387{
388 atomic64_add_return(delta, ptr);
389}
390
391/**
392 * atomic64_sub - subtract the atomic64 variable
393 * @delta: integer value to subtract
394 * @ptr: pointer to type atomic64_t
395 *
396 * Atomically subtracts @delta from @ptr.
397 */
398static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
399{
400 atomic64_add(-delta, ptr);
401}
402
403/**
404 * atomic64_sub_and_test - subtract value from variable and test result
405 * @delta: integer value to subtract
406 * @ptr: pointer to type atomic64_t
407 *
408 * Atomically subtracts @delta from @ptr and returns
409 * true if the result is zero, or false for all
410 * other cases.
411 */
412static inline int
413atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
414{
415 unsigned long long old_val = atomic64_sub_return(delta, ptr);
416
417 return old_val == 0;
418}
419
420/**
421 * atomic64_inc - increment atomic64 variable
422 * @ptr: pointer to type atomic64_t
423 *
424 * Atomically increments @ptr by 1.
425 */
426static inline void atomic64_inc(atomic64_t *ptr)
427{
428 atomic64_add(1, ptr);
429}
430
431/**
432 * atomic64_dec - decrement atomic64 variable
433 * @ptr: pointer to type atomic64_t
434 *
435 * Atomically decrements @ptr by 1.
436 */
437static inline void atomic64_dec(atomic64_t *ptr)
438{
439 atomic64_sub(1, ptr);
440}
441
442/**
443 * atomic64_dec_and_test - decrement and test
444 * @ptr: pointer to type atomic64_t
445 *
446 * Atomically decrements @ptr by 1 and
447 * returns true if the result is 0, or false for all other
448 * cases.
449 */
450static inline int atomic64_dec_and_test(atomic64_t *ptr)
451{
452 return atomic64_sub_and_test(1, ptr);
453}
454
455/**
456 * atomic64_inc_and_test - increment and test
457 * @ptr: pointer to type atomic64_t
458 *
459 * Atomically increments @ptr by 1
460 * and returns true if the result is zero, or false for all
461 * other cases.
462 */
463static inline int atomic64_inc_and_test(atomic64_t *ptr)
464{
465 return atomic64_sub_and_test(-1, ptr);
466}
467
468/**
469 * atomic64_add_negative - add and test if negative
470 * @delta: integer value to add
471 * @ptr: pointer to type atomic64_t
472 *
473 * Atomically adds @delta to @ptr and returns true
474 * if the result is negative, or false when
475 * result is greater than or equal to zero.
476 */
477static inline int
478atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
479{
480 long long old_val = atomic64_add_return(delta, ptr);
481
482 return old_val < 0;
483}
484
485#include <asm-generic/atomic-long.h>
251#endif /* _ASM_X86_ATOMIC_32_H */ 486#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
index 8c21731984da..0d6360220007 100644
--- a/arch/x86/include/asm/atomic_64.h
+++ b/arch/x86/include/asm/atomic_64.h
@@ -455,5 +455,5 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
455#define smp_mb__before_atomic_inc() barrier() 455#define smp_mb__before_atomic_inc() barrier()
456#define smp_mb__after_atomic_inc() barrier() 456#define smp_mb__after_atomic_inc() barrier()
457 457
458#include <asm-generic/atomic.h> 458#include <asm-generic/atomic-long.h>
459#endif /* _ASM_X86_ATOMIC_64_H */ 459#endif /* _ASM_X86_ATOMIC_64_H */
diff --git a/arch/x86/include/asm/bitsperlong.h b/arch/x86/include/asm/bitsperlong.h
new file mode 100644
index 000000000000..b0ae1c4dc791
--- /dev/null
+++ b/arch/x86/include/asm/bitsperlong.h
@@ -0,0 +1,13 @@
1#ifndef __ASM_X86_BITSPERLONG_H
2#define __ASM_X86_BITSPERLONG_H
3
4#ifdef __x86_64__
5# define __BITS_PER_LONG 64
6#else
7# define __BITS_PER_LONG 32
8#endif
9
10#include <asm-generic/bitsperlong.h>
11
12#endif /* __ASM_X86_BITSPERLONG_H */
13
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 6ba23dd9fc92..418e632d4a80 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -8,11 +8,26 @@
8 8
9#ifdef __KERNEL__ 9#ifdef __KERNEL__
10 10
11#include <asm/page_types.h>
12
11/* Physical address where kernel should be loaded. */ 13/* Physical address where kernel should be loaded. */
12#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ 14#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
13 + (CONFIG_PHYSICAL_ALIGN - 1)) \ 15 + (CONFIG_PHYSICAL_ALIGN - 1)) \
14 & ~(CONFIG_PHYSICAL_ALIGN - 1)) 16 & ~(CONFIG_PHYSICAL_ALIGN - 1))
15 17
18/* Minimum kernel alignment, as a power of two */
 19#ifdef CONFIG_X86_64
20#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
21#else
22#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1)
23#endif
24#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
25
26#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
27 (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
28#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
29#endif
30
16#ifdef CONFIG_KERNEL_BZIP2 31#ifdef CONFIG_KERNEL_BZIP2
17#define BOOT_HEAP_SIZE 0x400000 32#define BOOT_HEAP_SIZE 0x400000
18#else /* !CONFIG_KERNEL_BZIP2 */ 33#else /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 433adaebf9b6..1724e8de317c 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -50,7 +50,8 @@ struct setup_header {
50 __u32 ramdisk_size; 50 __u32 ramdisk_size;
51 __u32 bootsect_kludge; 51 __u32 bootsect_kludge;
52 __u16 heap_end_ptr; 52 __u16 heap_end_ptr;
53 __u16 _pad1; 53 __u8 ext_loader_ver;
54 __u8 ext_loader_type;
54 __u32 cmd_line_ptr; 55 __u32 cmd_line_ptr;
55 __u32 initrd_addr_max; 56 __u32 initrd_addr_max;
56 __u32 kernel_alignment; 57 __u32 kernel_alignment;
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
index 222802029fa6..d96c1ee3a95c 100644
--- a/arch/x86/include/asm/cpu_debug.h
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -86,105 +86,7 @@ enum cpu_file_bit {
86 CPU_VALUE_BIT, /* value */ 86 CPU_VALUE_BIT, /* value */
87}; 87};
88 88
89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) 89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
90
91/*
92 * DisplayFamily_DisplayModel Processor Families/Processor Number Series
93 * -------------------------- ------------------------------------------
94 * 05_01, 05_02, 05_04 Pentium, Pentium with MMX
95 *
96 * 06_01 Pentium Pro
97 * 06_03, 06_05 Pentium II Xeon, Pentium II
98 * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III
99 *
100 * 06_09, 060D Pentium M
101 *
102 * 06_0E Core Duo, Core Solo
103 *
104 * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series,
105 * Core 2 Quad, Core 2 Extreme, Core 2 Duo,
106 * Pentium dual-core
107 * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650
108 *
109 * 06_1C Atom
110 *
111 * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4
112 * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D
113 *
114 * 0F_06 Xeon 7100, 5000 Series, Xeon MP,
115 * Pentium 4, Pentium D
116 */
117
118/* Register processors bits */
119enum cpu_processor_bit {
120 CPU_NONE,
121/* Intel */
122 CPU_INTEL_PENTIUM_BIT,
123 CPU_INTEL_P6_BIT,
124 CPU_INTEL_PENTIUM_M_BIT,
125 CPU_INTEL_CORE_BIT,
126 CPU_INTEL_CORE2_BIT,
127 CPU_INTEL_ATOM_BIT,
128 CPU_INTEL_XEON_P4_BIT,
129 CPU_INTEL_XEON_MP_BIT,
130/* AMD */
131 CPU_AMD_K6_BIT,
132 CPU_AMD_K7_BIT,
133 CPU_AMD_K8_BIT,
134 CPU_AMD_0F_BIT,
135 CPU_AMD_10_BIT,
136 CPU_AMD_11_BIT,
137};
138
139#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT)
140#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT)
141#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT)
142#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT)
143#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT)
144#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT)
145#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT)
146#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT)
147
148#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
149#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2)
150#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
151#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM)
152#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
153#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM)
154#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON)
155#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON)
156#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
157#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON)
158#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON)
159#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT)
160#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX)
161#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE)
162#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
163#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
164#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
165#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT)
166#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE)
167
168/* Select all supported Intel CPUs */
169#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
170
171#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT)
172#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT)
173#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT)
174#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT)
175#define CPU_AMD_10 (1 << CPU_AMD_10_BIT)
176#define CPU_AMD_11 (1 << CPU_AMD_11_BIT)
177
178#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11)
179#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS)
180#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS)
181#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS)
182
183/* Select all supported AMD CPUs */
184#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS)
185
186/* Select all supported CPUs */
187#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL)
188 90
189#define MAX_CPU_FILES 512 91#define MAX_CPU_FILES 512
190 92
@@ -220,7 +122,6 @@ struct cpu_debug_range {
220 unsigned min; /* Register range min */ 122 unsigned min; /* Register range min */
221 unsigned max; /* Register range max */ 123 unsigned max; /* Register range max */
222 unsigned flag; /* Supported flags */ 124 unsigned flag; /* Supported flags */
223 unsigned model; /* Supported models */
224}; 125};
225 126
226#endif /* _ASM_X86_CPU_DEBUG_H */ 127#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb83b1c397aa..4a28d22d4793 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -22,7 +22,7 @@
22#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */ 22#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
23#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */ 23#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */
24#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */ 24#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
25#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */ 25#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */
26#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */ 26#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
27#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */ 27#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
28#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */ 28#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
@@ -94,6 +94,7 @@
94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ 94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ 95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ 96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
97#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
97 98
98/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ 99/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
99#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ 100#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
@@ -115,6 +116,8 @@
115#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ 116#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
116#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ 117#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
117#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ 118#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
119#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
120#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
118#define X86_FEATURE_AES (4*32+25) /* AES instructions */ 121#define X86_FEATURE_AES (4*32+25) /* AES instructions */
119#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ 122#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
120#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ 123#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
@@ -192,11 +195,11 @@ extern const char * const x86_power_flags[32];
192#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) 195#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
193#define setup_clear_cpu_cap(bit) do { \ 196#define setup_clear_cpu_cap(bit) do { \
194 clear_cpu_cap(&boot_cpu_data, bit); \ 197 clear_cpu_cap(&boot_cpu_data, bit); \
195 set_bit(bit, (unsigned long *)cleared_cpu_caps); \ 198 set_bit(bit, (unsigned long *)cpu_caps_cleared); \
196} while (0) 199} while (0)
197#define setup_force_cpu_cap(bit) do { \ 200#define setup_force_cpu_cap(bit) do { \
198 set_cpu_cap(&boot_cpu_data, bit); \ 201 set_cpu_cap(&boot_cpu_data, bit); \
199 clear_bit(bit, (unsigned long *)cleared_cpu_caps); \ 202 set_bit(bit, (unsigned long *)cpu_caps_set); \
200} while (0) 203} while (0)
201 204
202#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) 205#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index c45f415ce315..c993e9e0fed4 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -1,7 +1,6 @@
1#ifndef _ASM_X86_DESC_H 1#ifndef _ASM_X86_DESC_H
2#define _ASM_X86_DESC_H 2#define _ASM_X86_DESC_H
3 3
4#ifndef __ASSEMBLY__
5#include <asm/desc_defs.h> 4#include <asm/desc_defs.h>
6#include <asm/ldt.h> 5#include <asm/ldt.h>
7#include <asm/mmu.h> 6#include <asm/mmu.h>
@@ -380,29 +379,4 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
380 _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); 379 _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
381} 380}
382 381
383#else
384/*
385 * GET_DESC_BASE reads the descriptor base of the specified segment.
386 *
387 * Args:
388 * idx - descriptor index
389 * gdt - GDT pointer
390 * base - 32bit register to which the base will be written
391 * lo_w - lo word of the "base" register
392 * lo_b - lo byte of the "base" register
393 * hi_b - hi byte of the low word of the "base" register
394 *
395 * Example:
396 * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
397 * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
398 */
399#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
400 movb idx * 8 + 4(gdt), lo_b; \
401 movb idx * 8 + 7(gdt), hi_b; \
402 shll $16, base; \
403 movw idx * 8 + 2(gdt), lo_w;
404
405
406#endif /* __ASSEMBLY__ */
407
408#endif /* _ASM_X86_DESC_H */ 382#endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index f82fdc412c64..1c3f9435f1c9 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -6,6 +6,7 @@
6 * Documentation/DMA-API.txt for documentation. 6 * Documentation/DMA-API.txt for documentation.
7 */ 7 */
8 8
9#include <linux/kmemcheck.h>
9#include <linux/scatterlist.h> 10#include <linux/scatterlist.h>
10#include <linux/dma-debug.h> 11#include <linux/dma-debug.h>
11#include <linux/dma-attrs.h> 12#include <linux/dma-attrs.h>
@@ -32,6 +33,8 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
32#endif 33#endif
33} 34}
34 35
36#include <asm-generic/dma-mapping-common.h>
37
35/* Make sure we keep the same behaviour */ 38/* Make sure we keep the same behaviour */
36static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) 39static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
37{ 40{
@@ -52,171 +55,6 @@ extern int dma_set_mask(struct device *dev, u64 mask);
52extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, 55extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
53 dma_addr_t *dma_addr, gfp_t flag); 56 dma_addr_t *dma_addr, gfp_t flag);
54 57
55static inline dma_addr_t
56dma_map_single(struct device *hwdev, void *ptr, size_t size,
57 enum dma_data_direction dir)
58{
59 struct dma_map_ops *ops = get_dma_ops(hwdev);
60 dma_addr_t addr;
61
62 BUG_ON(!valid_dma_direction(dir));
63 addr = ops->map_page(hwdev, virt_to_page(ptr),
64 (unsigned long)ptr & ~PAGE_MASK, size,
65 dir, NULL);
66 debug_dma_map_page(hwdev, virt_to_page(ptr),
67 (unsigned long)ptr & ~PAGE_MASK, size,
68 dir, addr, true);
69 return addr;
70}
71
72static inline void
73dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
74 enum dma_data_direction dir)
75{
76 struct dma_map_ops *ops = get_dma_ops(dev);
77
78 BUG_ON(!valid_dma_direction(dir));
79 if (ops->unmap_page)
80 ops->unmap_page(dev, addr, size, dir, NULL);
81 debug_dma_unmap_page(dev, addr, size, dir, true);
82}
83
84static inline int
85dma_map_sg(struct device *hwdev, struct scatterlist *sg,
86 int nents, enum dma_data_direction dir)
87{
88 struct dma_map_ops *ops = get_dma_ops(hwdev);
89 int ents;
90
91 BUG_ON(!valid_dma_direction(dir));
92 ents = ops->map_sg(hwdev, sg, nents, dir, NULL);
93 debug_dma_map_sg(hwdev, sg, nents, ents, dir);
94
95 return ents;
96}
97
98static inline void
99dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
100 enum dma_data_direction dir)
101{
102 struct dma_map_ops *ops = get_dma_ops(hwdev);
103
104 BUG_ON(!valid_dma_direction(dir));
105 debug_dma_unmap_sg(hwdev, sg, nents, dir);
106 if (ops->unmap_sg)
107 ops->unmap_sg(hwdev, sg, nents, dir, NULL);
108}
109
110static inline void
111dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
112 size_t size, enum dma_data_direction dir)
113{
114 struct dma_map_ops *ops = get_dma_ops(hwdev);
115
116 BUG_ON(!valid_dma_direction(dir));
117 if (ops->sync_single_for_cpu)
118 ops->sync_single_for_cpu(hwdev, dma_handle, size, dir);
119 debug_dma_sync_single_for_cpu(hwdev, dma_handle, size, dir);
120 flush_write_buffers();
121}
122
123static inline void
124dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
125 size_t size, enum dma_data_direction dir)
126{
127 struct dma_map_ops *ops = get_dma_ops(hwdev);
128
129 BUG_ON(!valid_dma_direction(dir));
130 if (ops->sync_single_for_device)
131 ops->sync_single_for_device(hwdev, dma_handle, size, dir);
132 debug_dma_sync_single_for_device(hwdev, dma_handle, size, dir);
133 flush_write_buffers();
134}
135
136static inline void
137dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
138 unsigned long offset, size_t size,
139 enum dma_data_direction dir)
140{
141 struct dma_map_ops *ops = get_dma_ops(hwdev);
142
143 BUG_ON(!valid_dma_direction(dir));
144 if (ops->sync_single_range_for_cpu)
145 ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
146 size, dir);
147 debug_dma_sync_single_range_for_cpu(hwdev, dma_handle,
148 offset, size, dir);
149 flush_write_buffers();
150}
151
152static inline void
153dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
154 unsigned long offset, size_t size,
155 enum dma_data_direction dir)
156{
157 struct dma_map_ops *ops = get_dma_ops(hwdev);
158
159 BUG_ON(!valid_dma_direction(dir));
160 if (ops->sync_single_range_for_device)
161 ops->sync_single_range_for_device(hwdev, dma_handle,
162 offset, size, dir);
163 debug_dma_sync_single_range_for_device(hwdev, dma_handle,
164 offset, size, dir);
165 flush_write_buffers();
166}
167
168static inline void
169dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
170 int nelems, enum dma_data_direction dir)
171{
172 struct dma_map_ops *ops = get_dma_ops(hwdev);
173
174 BUG_ON(!valid_dma_direction(dir));
175 if (ops->sync_sg_for_cpu)
176 ops->sync_sg_for_cpu(hwdev, sg, nelems, dir);
177 debug_dma_sync_sg_for_cpu(hwdev, sg, nelems, dir);
178 flush_write_buffers();
179}
180
181static inline void
182dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
183 int nelems, enum dma_data_direction dir)
184{
185 struct dma_map_ops *ops = get_dma_ops(hwdev);
186
187 BUG_ON(!valid_dma_direction(dir));
188 if (ops->sync_sg_for_device)
189 ops->sync_sg_for_device(hwdev, sg, nelems, dir);
190 debug_dma_sync_sg_for_device(hwdev, sg, nelems, dir);
191
192 flush_write_buffers();
193}
194
195static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
196 size_t offset, size_t size,
197 enum dma_data_direction dir)
198{
199 struct dma_map_ops *ops = get_dma_ops(dev);
200 dma_addr_t addr;
201
202 BUG_ON(!valid_dma_direction(dir));
203 addr = ops->map_page(dev, page, offset, size, dir, NULL);
204 debug_dma_map_page(dev, page, offset, size, dir, addr, false);
205
206 return addr;
207}
208
209static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
210 size_t size, enum dma_data_direction dir)
211{
212 struct dma_map_ops *ops = get_dma_ops(dev);
213
214 BUG_ON(!valid_dma_direction(dir));
215 if (ops->unmap_page)
216 ops->unmap_page(dev, addr, size, dir, NULL);
217 debug_dma_unmap_page(dev, addr, size, dir, false);
218}
219
220static inline void 58static inline void
221dma_cache_sync(struct device *dev, void *vaddr, size_t size, 59dma_cache_sync(struct device *dev, void *vaddr, size_t size,
222 enum dma_data_direction dir) 60 enum dma_data_direction dir)
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index a8f672ba100c..70dac199b093 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -15,8 +15,8 @@
15 * - buffer allocation (memory accounting) 15 * - buffer allocation (memory accounting)
16 * 16 *
17 * 17 *
18 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2009 Intel Corporation.
19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008 19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
20 */ 20 */
21 21
22#ifndef _ASM_X86_DS_H 22#ifndef _ASM_X86_DS_H
@@ -83,8 +83,10 @@ enum ds_feature {
83 * The interrupt threshold is independent from the overflow callback 83 * The interrupt threshold is independent from the overflow callback
84 * to allow users to use their own overflow interrupt handling mechanism. 84 * to allow users to use their own overflow interrupt handling mechanism.
85 * 85 *
86 * task: the task to request recording for; 86 * The function might sleep.
87 * NULL for per-cpu recording on the current cpu 87 *
88 * task: the task to request recording for
89 * cpu: the cpu to request recording for
88 * base: the base pointer for the (non-pageable) buffer; 90 * base: the base pointer for the (non-pageable) buffer;
89 * size: the size of the provided buffer in bytes 91 * size: the size of the provided buffer in bytes
90 * ovfl: pointer to a function to be called on buffer overflow; 92 * ovfl: pointer to a function to be called on buffer overflow;
@@ -93,19 +95,28 @@ enum ds_feature {
93 * -1 if no interrupt threshold is requested. 95 * -1 if no interrupt threshold is requested.
94 * flags: a bit-mask of the above flags 96 * flags: a bit-mask of the above flags
95 */ 97 */
96extern struct bts_tracer *ds_request_bts(struct task_struct *task, 98extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
97 void *base, size_t size, 99 void *base, size_t size,
98 bts_ovfl_callback_t ovfl, 100 bts_ovfl_callback_t ovfl,
99 size_t th, unsigned int flags); 101 size_t th, unsigned int flags);
100extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, 102extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
101 void *base, size_t size, 103 bts_ovfl_callback_t ovfl,
102 pebs_ovfl_callback_t ovfl, 104 size_t th, unsigned int flags);
103 size_t th, unsigned int flags); 105extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
106 void *base, size_t size,
107 pebs_ovfl_callback_t ovfl,
108 size_t th, unsigned int flags);
109extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
110 void *base, size_t size,
111 pebs_ovfl_callback_t ovfl,
112 size_t th, unsigned int flags);
104 113
105/* 114/*
106 * Release BTS or PEBS resources 115 * Release BTS or PEBS resources
107 * Suspend and resume BTS or PEBS tracing 116 * Suspend and resume BTS or PEBS tracing
108 * 117 *
118 * Must be called with irq's enabled.
119 *
109 * tracer: the tracer handle returned from ds_request_~() 120 * tracer: the tracer handle returned from ds_request_~()
110 */ 121 */
111extern void ds_release_bts(struct bts_tracer *tracer); 122extern void ds_release_bts(struct bts_tracer *tracer);
@@ -115,6 +126,28 @@ extern void ds_release_pebs(struct pebs_tracer *tracer);
115extern void ds_suspend_pebs(struct pebs_tracer *tracer); 126extern void ds_suspend_pebs(struct pebs_tracer *tracer);
116extern void ds_resume_pebs(struct pebs_tracer *tracer); 127extern void ds_resume_pebs(struct pebs_tracer *tracer);
117 128
129/*
130 * Release BTS or PEBS resources
131 * Suspend and resume BTS or PEBS tracing
132 *
133 * Cpu tracers must call this on the traced cpu.
134 * Task tracers must call ds_release_~_noirq() for themselves.
135 *
136 * May be called with irq's disabled.
137 *
138 * Returns 0 if successful;
139 * -EPERM if the cpu tracer does not trace the current cpu.
140 * -EPERM if the task tracer does not trace itself.
141 *
142 * tracer: the tracer handle returned from ds_request_~()
143 */
144extern int ds_release_bts_noirq(struct bts_tracer *tracer);
145extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
146extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
147extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
148extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
149extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
150
118 151
119/* 152/*
120 * The raw DS buffer state as it is used for BTS and PEBS recording. 153 * The raw DS buffer state as it is used for BTS and PEBS recording.
@@ -170,9 +203,9 @@ struct bts_struct {
170 } lbr; 203 } lbr;
171 /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */ 204 /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
172 struct { 205 struct {
173 __u64 jiffies; 206 __u64 clock;
174 pid_t pid; 207 pid_t pid;
175 } timestamp; 208 } event;
176 } variant; 209 } variant;
177}; 210};
178 211
@@ -201,8 +234,12 @@ struct bts_trace {
201struct pebs_trace { 234struct pebs_trace {
202 struct ds_trace ds; 235 struct ds_trace ds;
203 236
204 /* the PEBS reset value */ 237 /* the number of valid counters in the below array */
205 unsigned long long reset_value; 238 unsigned int counters;
239
240#define MAX_PEBS_COUNTERS 4
241 /* the counter reset value */
242 unsigned long long counter_reset[MAX_PEBS_COUNTERS];
206}; 243};
207 244
208 245
@@ -237,9 +274,11 @@ extern int ds_reset_pebs(struct pebs_tracer *tracer);
237 * Returns 0 on success; -Eerrno on error 274 * Returns 0 on success; -Eerrno on error
238 * 275 *
239 * tracer: the tracer handle returned from ds_request_pebs() 276 * tracer: the tracer handle returned from ds_request_pebs()
277 * counter: the index of the counter
240 * value: the new counter reset value 278 * value: the new counter reset value
241 */ 279 */
242extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); 280extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
281 unsigned int counter, u64 value);
243 282
244/* 283/*
245 * Initialization 284 * Initialization
@@ -252,21 +291,12 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
252 */ 291 */
253extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); 292extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
254 293
255/*
256 * Task clone/init and cleanup work
257 */
258extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
259extern void ds_exit_thread(struct task_struct *tsk);
260
261#else /* CONFIG_X86_DS */ 294#else /* CONFIG_X86_DS */
262 295
263struct cpuinfo_x86; 296struct cpuinfo_x86;
264static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} 297static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
265static inline void ds_switch_to(struct task_struct *prev, 298static inline void ds_switch_to(struct task_struct *prev,
266 struct task_struct *next) {} 299 struct task_struct *next) {}
267static inline void ds_copy_thread(struct task_struct *tsk,
268 struct task_struct *father) {}
269static inline void ds_exit_thread(struct task_struct *tsk) {}
270 300
271#endif /* CONFIG_X86_DS */ 301#endif /* CONFIG_X86_DS */
272#endif /* _ASM_X86_DS_H */ 302#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bedaf258..ff8cbfa07851 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -14,6 +14,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
14BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) 14BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) 15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
17 18
18BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0, 19BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
19 smp_invalidate_interrupt) 20 smp_invalidate_interrupt)
@@ -49,11 +50,19 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
49BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 50BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
50 51
51#ifdef CONFIG_PERF_COUNTERS 52#ifdef CONFIG_PERF_COUNTERS
52BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) 53BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
53#endif 54#endif
54 55
55#ifdef CONFIG_X86_MCE_P4THERMAL 56#ifdef CONFIG_X86_THERMAL_VECTOR
56BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) 57BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
57#endif 58#endif
58 59
60#ifdef CONFIG_X86_MCE_THRESHOLD
61BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
62#endif
63
64#ifdef CONFIG_X86_NEW_MCE
65BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
66#endif
67
59#endif 68#endif
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 37555e52f980..82e3e8f01043 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */ 15 unsigned int generic_irqs; /* arch dependent */
16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs;
16#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
17 unsigned int irq_resched_count; 19 unsigned int irq_resched_count;
18 unsigned int irq_call_count; 20 unsigned int irq_call_count;
@@ -20,7 +22,7 @@ typedef struct {
20#endif 22#endif
21#ifdef CONFIG_X86_MCE 23#ifdef CONFIG_X86_MCE
22 unsigned int irq_thermal_count; 24 unsigned int irq_thermal_count;
23# ifdef CONFIG_X86_64 25# ifdef CONFIG_X86_MCE_THRESHOLD
24 unsigned int irq_threshold_count; 26 unsigned int irq_threshold_count;
25# endif 27# endif
26#endif 28#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b762ea49bd70..ba180d93b08c 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,9 +29,12 @@
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void); 30extern void generic_interrupt(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_pending_interrupt(void);
33
32extern void spurious_interrupt(void); 34extern void spurious_interrupt(void);
33extern void thermal_interrupt(void); 35extern void thermal_interrupt(void);
34extern void reschedule_interrupt(void); 36extern void reschedule_interrupt(void);
37extern void mce_self_interrupt(void);
35 38
36extern void invalidate_interrupt(void); 39extern void invalidate_interrupt(void);
37extern void invalidate_interrupt0(void); 40extern void invalidate_interrupt0(void);
@@ -44,6 +47,7 @@ extern void invalidate_interrupt6(void);
44extern void invalidate_interrupt7(void); 47extern void invalidate_interrupt7(void);
45 48
46extern void irq_move_cleanup_interrupt(void); 49extern void irq_move_cleanup_interrupt(void);
50extern void reboot_interrupt(void);
47extern void threshold_interrupt(void); 51extern void threshold_interrupt(void);
48 52
49extern void call_function_interrupt(void); 53extern void call_function_interrupt(void);
@@ -63,7 +67,26 @@ extern unsigned long io_apic_irqs;
63extern void init_VISWS_APIC_irqs(void); 67extern void init_VISWS_APIC_irqs(void);
64extern void setup_IO_APIC(void); 68extern void setup_IO_APIC(void);
65extern void disable_IO_APIC(void); 69extern void disable_IO_APIC(void);
66extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); 70
71struct io_apic_irq_attr {
72 int ioapic;
73 int ioapic_pin;
74 int trigger;
75 int polarity;
76};
77
78static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
79 int ioapic, int ioapic_pin,
80 int trigger, int polarity)
81{
82 irq_attr->ioapic = ioapic;
83 irq_attr->ioapic_pin = ioapic_pin;
84 irq_attr->trigger = trigger;
85 irq_attr->polarity = polarity;
86}
87
88extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
89 struct io_apic_irq_attr *irq_attr);
67extern void setup_ioapic_dest(void); 90extern void setup_ioapic_dest(void);
68 91
69extern void enable_IO_APIC(void); 92extern void enable_IO_APIC(void);
@@ -78,7 +101,11 @@ extern void eisa_set_level_irq(unsigned int irq);
78/* SMP */ 101/* SMP */
79extern void smp_apic_timer_interrupt(struct pt_regs *); 102extern void smp_apic_timer_interrupt(struct pt_regs *);
80extern void smp_spurious_interrupt(struct pt_regs *); 103extern void smp_spurious_interrupt(struct pt_regs *);
104extern void smp_generic_interrupt(struct pt_regs *);
81extern void smp_error_interrupt(struct pt_regs *); 105extern void smp_error_interrupt(struct pt_regs *);
106#ifdef CONFIG_X86_IO_APIC
107extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
108#endif
82#ifdef CONFIG_SMP 109#ifdef CONFIG_SMP
83extern void smp_reschedule_interrupt(struct pt_regs *); 110extern void smp_reschedule_interrupt(struct pt_regs *);
84extern void smp_call_function_interrupt(struct pt_regs *); 111extern void smp_call_function_interrupt(struct pt_regs *);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 71c9e5183982..175adf58dd4f 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
67 ".previous\n" 67 ".previous\n"
68 _ASM_EXTABLE(1b, 3b) 68 _ASM_EXTABLE(1b, 3b)
69 : [err] "=r" (err) 69 : [err] "=r" (err)
70#if 0 /* See comment in __save_init_fpu() below. */ 70#if 0 /* See comment in fxsave() below. */
71 : [fx] "r" (fx), "m" (*fx), "0" (0)); 71 : [fx] "r" (fx), "m" (*fx), "0" (0));
72#else 72#else
73 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); 73 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
@@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
75 return err; 75 return err;
76} 76}
77 77
78static inline int restore_fpu_checking(struct task_struct *tsk)
79{
80 if (task_thread_info(tsk)->status & TS_XSAVE)
81 return xrstor_checking(&tsk->thread.xstate->xsave);
82 else
83 return fxrstor_checking(&tsk->thread.xstate->fxsave);
84}
85
86/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception 78/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
87 is pending. Clear the x87 state here by setting it to fixed 79 is pending. Clear the x87 state here by setting it to fixed
88 values. The kernel data segment can be sometimes 0 and sometimes 80 values. The kernel data segment can be sometimes 0 and sometimes
@@ -120,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
120 ".previous\n" 112 ".previous\n"
121 _ASM_EXTABLE(1b, 3b) 113 _ASM_EXTABLE(1b, 3b)
122 : [err] "=r" (err), "=m" (*fx) 114 : [err] "=r" (err), "=m" (*fx)
123#if 0 /* See comment in __fxsave_clear() below. */ 115#if 0 /* See comment in fxsave() below. */
124 : [fx] "r" (fx), "0" (0)); 116 : [fx] "r" (fx), "0" (0));
125#else 117#else
126 : [fx] "cdaSDb" (fx), "0" (0)); 118 : [fx] "cdaSDb" (fx), "0" (0));
@@ -185,12 +177,9 @@ static inline void tolerant_fwait(void)
185 asm volatile("fnclex ; fwait"); 177 asm volatile("fnclex ; fwait");
186} 178}
187 179
188static inline void restore_fpu(struct task_struct *tsk) 180/* perform fxrstor iff the processor has extended states, otherwise frstor */
181static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
189{ 182{
190 if (task_thread_info(tsk)->status & TS_XSAVE) {
191 xrstor_checking(&tsk->thread.xstate->xsave);
192 return;
193 }
194 /* 183 /*
195 * The "nop" is needed to make the instructions the same 184 * The "nop" is needed to make the instructions the same
196 * length. 185 * length.
@@ -199,7 +188,9 @@ static inline void restore_fpu(struct task_struct *tsk)
199 "nop ; frstor %1", 188 "nop ; frstor %1",
200 "fxrstor %1", 189 "fxrstor %1",
201 X86_FEATURE_FXSR, 190 X86_FEATURE_FXSR,
202 "m" (tsk->thread.xstate->fxsave)); 191 "m" (*fx));
192
193 return 0;
203} 194}
204 195
205/* We need a safe address that is cheap to find and that is already 196/* We need a safe address that is cheap to find and that is already
@@ -262,6 +253,14 @@ end:
262 253
263#endif /* CONFIG_X86_64 */ 254#endif /* CONFIG_X86_64 */
264 255
256static inline int restore_fpu_checking(struct task_struct *tsk)
257{
258 if (task_thread_info(tsk)->status & TS_XSAVE)
259 return xrstor_checking(&tsk->thread.xstate->xsave);
260 else
261 return fxrstor_checking(&tsk->thread.xstate->fxsave);
262}
263
265/* 264/*
266 * Signal frame handlers... 265 * Signal frame handlers...
267 */ 266 */
@@ -305,18 +304,18 @@ static inline void kernel_fpu_end(void)
305/* 304/*
306 * Some instructions like VIA's padlock instructions generate a spurious 305 * Some instructions like VIA's padlock instructions generate a spurious
307 * DNA fault but don't modify SSE registers. And these instructions 306 * DNA fault but don't modify SSE registers. And these instructions
308 * get used from interrupt context aswell. To prevent these kernel instructions 307 * get used from interrupt context as well. To prevent these kernel instructions
309 * in interrupt context interact wrongly with other user/kernel fpu usage, we 308 * in interrupt context interacting wrongly with other user/kernel fpu usage, we
310 * should use them only in the context of irq_ts_save/restore() 309 * should use them only in the context of irq_ts_save/restore()
311 */ 310 */
312static inline int irq_ts_save(void) 311static inline int irq_ts_save(void)
313{ 312{
314 /* 313 /*
315 * If we are in process context, we are ok to take a spurious DNA fault. 314 * If in process context and not atomic, we can take a spurious DNA fault.
316 * Otherwise, doing clts() in process context require pre-emption to 315 * Otherwise, doing clts() in process context requires disabling preemption
317 * be disabled or some heavy lifting like kernel_fpu_begin() 316 * or some heavy lifting like kernel_fpu_begin()
318 */ 317 */
319 if (!in_interrupt()) 318 if (!in_atomic())
320 return 0; 319 return 0;
321 320
322 if (read_cr0() & X86_CR0_TS) { 321 if (read_cr0() & X86_CR0_TS) {
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 1a99e6c092af..58d7091eeb1f 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -60,8 +60,4 @@ extern struct irq_chip i8259A_chip;
60extern void mask_8259A(void); 60extern void mask_8259A(void);
61extern void unmask_8259A(void); 61extern void unmask_8259A(void);
62 62
63#ifdef CONFIG_X86_32
64extern void init_ISA_irqs(void);
65#endif
66
67#endif /* _ASM_X86_I8259_H */ 63#endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2e..000000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9d826e436010..daf866ed0612 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -154,22 +154,19 @@ extern int timer_through_8259;
154extern int io_apic_get_unique_id(int ioapic, int apic_id); 154extern int io_apic_get_unique_id(int ioapic, int apic_id);
155extern int io_apic_get_version(int ioapic); 155extern int io_apic_get_version(int ioapic);
156extern int io_apic_get_redir_entries(int ioapic); 156extern int io_apic_get_redir_entries(int ioapic);
157extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
158 int edge_level, int active_high_low);
159#endif /* CONFIG_ACPI */ 157#endif /* CONFIG_ACPI */
160 158
159struct io_apic_irq_attr;
160extern int io_apic_set_pci_routing(struct device *dev, int irq,
161 struct io_apic_irq_attr *irq_attr);
161extern int (*ioapic_renumber_irq)(int ioapic, int irq); 162extern int (*ioapic_renumber_irq)(int ioapic, int irq);
162extern void ioapic_init_mappings(void); 163extern void ioapic_init_mappings(void);
163 164
164#ifdef CONFIG_X86_64
165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); 166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
167extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 167extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
168extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 168extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
169extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 169extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
170extern void reinit_intr_remapped_IO_APIC(int intr_remapping,
171 struct IO_APIC_route_entry **ioapic_entries);
172#endif
173 170
174extern void probe_nr_irqs_gsi(void); 171extern void probe_nr_irqs_gsi(void);
175 172
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 86af26091d6c..0e9fe1d9d971 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -1,3 +1,6 @@
1#ifndef _ASM_X86_IOMAP_H
2#define _ASM_X86_IOMAP_H
3
1/* 4/*
2 * Copyright © 2008 Ingo Molnar 5 * Copyright © 2008 Ingo Molnar
3 * 6 *
@@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
31 34
32void 35void
33iounmap_atomic(void *kvaddr, enum km_type type); 36iounmap_atomic(void *kvaddr, enum km_type type);
37
38#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index af326a2975b5..fd6d21bbee6c 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -6,6 +6,7 @@ extern void no_iommu_init(void);
6extern struct dma_map_ops nommu_dma_ops; 6extern struct dma_map_ops nommu_dma_ops;
7extern int force_iommu, no_iommu; 7extern int force_iommu, no_iommu;
8extern int iommu_detected; 8extern int iommu_detected;
9extern int iommu_pass_through;
9 10
10/* 10 seconds */ 11/* 10 seconds */
11#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) 12#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 0396760fccb8..f275e2244505 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,6 +1,6 @@
1#ifndef _ASM_X86_IRQ_REMAPPING_H 1#ifndef _ASM_X86_IRQ_REMAPPING_H
2#define _ASM_X86_IRQ_REMAPPING_H 2#define _ASM_X86_IRQ_REMAPPING_H
3 3
4#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8) 4#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
5 5
6#endif /* _ASM_X86_IRQ_REMAPPING_H */ 6#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79bbb47c..5b21f0ec3df2 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#define NMI_VECTOR 0x02 27#define NMI_VECTOR 0x02
28#define MCE_VECTOR 0x12
28 29
29/* 30/*
30 * IDT vectors usable for external interrupt sources start 31 * IDT vectors usable for external interrupt sources start
@@ -34,6 +35,7 @@
34 35
35#ifdef CONFIG_X86_32 36#ifdef CONFIG_X86_32
36# define SYSCALL_VECTOR 0x80 37# define SYSCALL_VECTOR 0x80
38# define IA32_SYSCALL_VECTOR 0x80
37#else 39#else
38# define IA32_SYSCALL_VECTOR 0x80 40# define IA32_SYSCALL_VECTOR 0x80
39#endif 41#endif
@@ -86,13 +88,8 @@
86#define CALL_FUNCTION_VECTOR 0xfc 88#define CALL_FUNCTION_VECTOR 0xfc
87#define CALL_FUNCTION_SINGLE_VECTOR 0xfb 89#define CALL_FUNCTION_SINGLE_VECTOR 0xfb
88#define THERMAL_APIC_VECTOR 0xfa 90#define THERMAL_APIC_VECTOR 0xfa
89 91#define THRESHOLD_APIC_VECTOR 0xf9
90#ifdef CONFIG_X86_32 92#define REBOOT_VECTOR 0xf8
91/* 0xf8 - 0xf9 : free */
92#else
93# define THRESHOLD_APIC_VECTOR 0xf9
94# define UV_BAU_MESSAGE 0xf8
95#endif
96 93
97/* f0-f7 used for spreading out TLB flushes: */ 94/* f0-f7 used for spreading out TLB flushes: */
98#define INVALIDATE_TLB_VECTOR_END 0xf7 95#define INVALIDATE_TLB_VECTOR_END 0xf7
@@ -107,14 +104,21 @@
107#define LOCAL_TIMER_VECTOR 0xef 104#define LOCAL_TIMER_VECTOR 0xef
108 105
109/* 106/*
110 * Performance monitoring interrupt vector: 107 * Generic system vector for platform specific use
111 */ 108 */
112#define LOCAL_PERF_VECTOR 0xee 109#define GENERIC_INTERRUPT_VECTOR 0xed
113 110
114/* 111/*
115 * Generic system vector for platform specific use 112 * Performance monitoring pending work vector:
116 */ 113 */
117#define GENERIC_INTERRUPT_VECTOR 0xed 114#define LOCAL_PENDING_VECTOR 0xec
115
116#define UV_BAU_MESSAGE 0xec
117
118/*
119 * Self IPI vector for machine checks
120 */
121#define MCE_SELF_VECTOR 0xeb
118 122
119/* 123/*
120 * First APIC vector available to drivers: (vectors 0x30-0xee) we 124 * First APIC vector available to drivers: (vectors 0x30-0xee) we
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index 54c8cc53b24d..c2d1f3b58e5f 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -12,4 +12,17 @@ extern int cache_k8_northbridges(void);
12extern void k8_flush_garts(void); 12extern void k8_flush_garts(void);
13extern int k8_scan_nodes(unsigned long start, unsigned long end); 13extern int k8_scan_nodes(unsigned long start, unsigned long end);
14 14
15#ifdef CONFIG_K8_NB
16static inline struct pci_dev *node_to_k8_nb_misc(int node)
17{
18 return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
19}
20#else
21static inline struct pci_dev *node_to_k8_nb_misc(int node)
22{
23 return NULL;
24}
25#endif
26
27
15#endif /* _ASM_X86_K8_H */ 28#endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
index 5759c165a5cf..9e00a731a7fb 100644
--- a/arch/x86/include/asm/kmap_types.h
+++ b/arch/x86/include/asm/kmap_types.h
@@ -2,28 +2,11 @@
2#define _ASM_X86_KMAP_TYPES_H 2#define _ASM_X86_KMAP_TYPES_H
3 3
4#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) 4#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
5# define D(n) __KM_FENCE_##n , 5#define __WITH_KM_FENCE
6#else
7# define D(n)
8#endif 6#endif
9 7
10enum km_type { 8#include <asm-generic/kmap_types.h>
11D(0) KM_BOUNCE_READ,
12D(1) KM_SKB_SUNRPC_DATA,
13D(2) KM_SKB_DATA_SOFTIRQ,
14D(3) KM_USER0,
15D(4) KM_USER1,
16D(5) KM_BIO_SRC_IRQ,
17D(6) KM_BIO_DST_IRQ,
18D(7) KM_PTE0,
19D(8) KM_PTE1,
20D(9) KM_IRQ0,
21D(10) KM_IRQ1,
22D(11) KM_SOFTIRQ0,
23D(12) KM_SOFTIRQ1,
24D(13) KM_TYPE_NR
25};
26 9
27#undef D 10#undef __WITH_KM_FENCE
28 11
29#endif /* _ASM_X86_KMAP_TYPES_H */ 12#endif /* _ASM_X86_KMAP_TYPES_H */
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
new file mode 100644
index 000000000000..ed01518f297e
--- /dev/null
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -0,0 +1,42 @@
1#ifndef ASM_X86_KMEMCHECK_H
2#define ASM_X86_KMEMCHECK_H
3
4#include <linux/types.h>
5#include <asm/ptrace.h>
6
7#ifdef CONFIG_KMEMCHECK
8bool kmemcheck_active(struct pt_regs *regs);
9
10void kmemcheck_show(struct pt_regs *regs);
11void kmemcheck_hide(struct pt_regs *regs);
12
13bool kmemcheck_fault(struct pt_regs *regs,
14 unsigned long address, unsigned long error_code);
15bool kmemcheck_trap(struct pt_regs *regs);
16#else
17static inline bool kmemcheck_active(struct pt_regs *regs)
18{
19 return false;
20}
21
22static inline void kmemcheck_show(struct pt_regs *regs)
23{
24}
25
26static inline void kmemcheck_hide(struct pt_regs *regs)
27{
28}
29
30static inline bool kmemcheck_fault(struct pt_regs *regs,
31 unsigned long address, unsigned long error_code)
32{
33 return false;
34}
35
36static inline bool kmemcheck_trap(struct pt_regs *regs)
37{
38 return false;
39}
40#endif /* CONFIG_KMEMCHECK */
41
42#endif
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index dc3f6cf11704..125be8b19568 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -16,6 +16,7 @@
16#define __KVM_HAVE_MSI 16#define __KVM_HAVE_MSI
17#define __KVM_HAVE_USER_NMI 17#define __KVM_HAVE_USER_NMI
18#define __KVM_HAVE_GUEST_DEBUG 18#define __KVM_HAVE_GUEST_DEBUG
19#define __KVM_HAVE_MSIX
19 20
20/* Architectural interrupt line count. */ 21/* Architectural interrupt line count. */
21#define KVM_NR_INTERRUPTS 256 22#define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f0faf58044ff..eabdc1cfab5c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -185,6 +185,7 @@ union kvm_mmu_page_role {
185 unsigned access:3; 185 unsigned access:3;
186 unsigned invalid:1; 186 unsigned invalid:1;
187 unsigned cr4_pge:1; 187 unsigned cr4_pge:1;
188 unsigned nxe:1;
188 }; 189 };
189}; 190};
190 191
@@ -212,7 +213,6 @@ struct kvm_mmu_page {
212 int multimapped; /* More than one parent_pte? */ 213 int multimapped; /* More than one parent_pte? */
213 int root_count; /* Currently serving as active root */ 214 int root_count; /* Currently serving as active root */
214 bool unsync; 215 bool unsync;
215 bool global;
216 unsigned int unsync_children; 216 unsigned int unsync_children;
217 union { 217 union {
218 u64 *parent_pte; /* !multimapped */ 218 u64 *parent_pte; /* !multimapped */
@@ -261,13 +261,11 @@ struct kvm_mmu {
261 union kvm_mmu_page_role base_role; 261 union kvm_mmu_page_role base_role;
262 262
263 u64 *pae_root; 263 u64 *pae_root;
264 u64 rsvd_bits_mask[2][4];
264}; 265};
265 266
266struct kvm_vcpu_arch { 267struct kvm_vcpu_arch {
267 u64 host_tsc; 268 u64 host_tsc;
268 int interrupt_window_open;
269 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
270 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
271 /* 269 /*
272 * rip and regs accesses must go through 270 * rip and regs accesses must go through
273 * kvm_{register,rip}_{read,write} functions. 271 * kvm_{register,rip}_{read,write} functions.
@@ -286,6 +284,7 @@ struct kvm_vcpu_arch {
286 u64 shadow_efer; 284 u64 shadow_efer;
287 u64 apic_base; 285 u64 apic_base;
288 struct kvm_lapic *apic; /* kernel irqchip context */ 286 struct kvm_lapic *apic; /* kernel irqchip context */
287 int32_t apic_arb_prio;
289 int mp_state; 288 int mp_state;
290 int sipi_vector; 289 int sipi_vector;
291 u64 ia32_misc_enable_msr; 290 u64 ia32_misc_enable_msr;
@@ -320,6 +319,8 @@ struct kvm_vcpu_arch {
320 struct kvm_pio_request pio; 319 struct kvm_pio_request pio;
321 void *pio_data; 320 void *pio_data;
322 321
322 u8 event_exit_inst_len;
323
323 struct kvm_queued_exception { 324 struct kvm_queued_exception {
324 bool pending; 325 bool pending;
325 bool has_error_code; 326 bool has_error_code;
@@ -329,11 +330,12 @@ struct kvm_vcpu_arch {
329 330
330 struct kvm_queued_interrupt { 331 struct kvm_queued_interrupt {
331 bool pending; 332 bool pending;
333 bool soft;
332 u8 nr; 334 u8 nr;
333 } interrupt; 335 } interrupt;
334 336
335 struct { 337 struct {
336 int active; 338 int vm86_active;
337 u8 save_iopl; 339 u8 save_iopl;
338 struct kvm_save_segment { 340 struct kvm_save_segment {
339 u16 selector; 341 u16 selector;
@@ -356,9 +358,9 @@ struct kvm_vcpu_arch {
356 unsigned int time_offset; 358 unsigned int time_offset;
357 struct page *time_page; 359 struct page *time_page;
358 360
361 bool singlestep; /* guest is single stepped by KVM */
359 bool nmi_pending; 362 bool nmi_pending;
360 bool nmi_injected; 363 bool nmi_injected;
361 bool nmi_window_open;
362 364
363 struct mtrr_state_type mtrr_state; 365 struct mtrr_state_type mtrr_state;
364 u32 pat; 366 u32 pat;
@@ -392,15 +394,14 @@ struct kvm_arch{
392 */ 394 */
393 struct list_head active_mmu_pages; 395 struct list_head active_mmu_pages;
394 struct list_head assigned_dev_head; 396 struct list_head assigned_dev_head;
395 struct list_head oos_global_pages;
396 struct iommu_domain *iommu_domain; 397 struct iommu_domain *iommu_domain;
398 int iommu_flags;
397 struct kvm_pic *vpic; 399 struct kvm_pic *vpic;
398 struct kvm_ioapic *vioapic; 400 struct kvm_ioapic *vioapic;
399 struct kvm_pit *vpit; 401 struct kvm_pit *vpit;
400 struct hlist_head irq_ack_notifier_list; 402 struct hlist_head irq_ack_notifier_list;
401 int vapics_in_nmi_mode; 403 int vapics_in_nmi_mode;
402 404
403 int round_robin_prev_vcpu;
404 unsigned int tss_addr; 405 unsigned int tss_addr;
405 struct page *apic_access_page; 406 struct page *apic_access_page;
406 407
@@ -423,7 +424,6 @@ struct kvm_vm_stat {
423 u32 mmu_recycled; 424 u32 mmu_recycled;
424 u32 mmu_cache_miss; 425 u32 mmu_cache_miss;
425 u32 mmu_unsync; 426 u32 mmu_unsync;
426 u32 mmu_unsync_global;
427 u32 remote_tlb_flush; 427 u32 remote_tlb_flush;
428 u32 lpages; 428 u32 lpages;
429}; 429};
@@ -443,7 +443,6 @@ struct kvm_vcpu_stat {
443 u32 halt_exits; 443 u32 halt_exits;
444 u32 halt_wakeup; 444 u32 halt_wakeup;
445 u32 request_irq_exits; 445 u32 request_irq_exits;
446 u32 request_nmi_exits;
447 u32 irq_exits; 446 u32 irq_exits;
448 u32 host_state_reload; 447 u32 host_state_reload;
449 u32 efer_reload; 448 u32 efer_reload;
@@ -511,20 +510,22 @@ struct kvm_x86_ops {
511 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 510 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
512 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); 511 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
513 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 512 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
513 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
514 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
514 void (*patch_hypercall)(struct kvm_vcpu *vcpu, 515 void (*patch_hypercall)(struct kvm_vcpu *vcpu,
515 unsigned char *hypercall_addr); 516 unsigned char *hypercall_addr);
516 int (*get_irq)(struct kvm_vcpu *vcpu); 517 void (*set_irq)(struct kvm_vcpu *vcpu);
517 void (*set_irq)(struct kvm_vcpu *vcpu, int vec); 518 void (*set_nmi)(struct kvm_vcpu *vcpu);
518 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, 519 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
519 bool has_error_code, u32 error_code); 520 bool has_error_code, u32 error_code);
520 bool (*exception_injected)(struct kvm_vcpu *vcpu); 521 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
521 void (*inject_pending_irq)(struct kvm_vcpu *vcpu); 522 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
522 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, 523 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
523 struct kvm_run *run); 524 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
524 525 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
525 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 526 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
526 int (*get_tdp_level)(void); 527 int (*get_tdp_level)(void);
527 int (*get_mt_mask_shift)(void); 528 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
528}; 529};
529 530
530extern struct kvm_x86_ops *kvm_x86_ops; 531extern struct kvm_x86_ops *kvm_x86_ops;
@@ -538,7 +539,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
538void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); 539void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
539void kvm_mmu_set_base_ptes(u64 base_pte); 540void kvm_mmu_set_base_ptes(u64 base_pte);
540void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 541void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
541 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask); 542 u64 dirty_mask, u64 nx_mask, u64 x_mask);
542 543
543int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 544int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
544void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 545void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -552,6 +553,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
552 const void *val, int bytes); 553 const void *val, int bytes);
553int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, 554int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
554 gpa_t addr, unsigned long *ret); 555 gpa_t addr, unsigned long *ret);
556u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
555 557
556extern bool tdp_enabled; 558extern bool tdp_enabled;
557 559
@@ -563,6 +565,7 @@ enum emulation_result {
563 565
564#define EMULTYPE_NO_DECODE (1 << 0) 566#define EMULTYPE_NO_DECODE (1 << 0)
565#define EMULTYPE_TRAP_UD (1 << 1) 567#define EMULTYPE_TRAP_UD (1 << 1)
568#define EMULTYPE_SKIP (1 << 2)
566int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 569int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
567 unsigned long cr2, u16 error_code, int emulation_type); 570 unsigned long cr2, u16 error_code, int emulation_type);
568void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 571void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
@@ -638,7 +641,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
638int kvm_mmu_load(struct kvm_vcpu *vcpu); 641int kvm_mmu_load(struct kvm_vcpu *vcpu);
639void kvm_mmu_unload(struct kvm_vcpu *vcpu); 642void kvm_mmu_unload(struct kvm_vcpu *vcpu);
640void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 643void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
641void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
642 644
643int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 645int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
644 646
@@ -769,6 +771,8 @@ enum {
769#define HF_GIF_MASK (1 << 0) 771#define HF_GIF_MASK (1 << 0)
770#define HF_HIF_MASK (1 << 1) 772#define HF_HIF_MASK (1 << 1)
771#define HF_VINTR_MASK (1 << 2) 773#define HF_VINTR_MASK (1 << 2)
774#define HF_NMI_MASK (1 << 3)
775#define HF_IRET_MASK (1 << 4)
772 776
773/* 777/*
774 * Hardware virtualization extension instructions may fault if a 778 * Hardware virtualization extension instructions may fault if a
@@ -791,5 +795,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
791#define KVM_ARCH_WANT_MMU_NOTIFIER 795#define KVM_ARCH_WANT_MMU_NOTIFIER
792int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 796int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
793int kvm_age_hva(struct kvm *kvm, unsigned long hva); 797int kvm_age_hva(struct kvm *kvm, unsigned long hva);
798int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
794 799
795#endif /* _ASM_X86_KVM_HOST_H */ 800#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 6a159732881a..b7ed2c423116 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -143,6 +143,9 @@ struct decode_cache {
143 struct fetch_cache fetch; 143 struct fetch_cache fetch;
144}; 144};
145 145
146#define X86_SHADOW_INT_MOV_SS 1
147#define X86_SHADOW_INT_STI 2
148
146struct x86_emulate_ctxt { 149struct x86_emulate_ctxt {
147 /* Register state before/after emulation. */ 150 /* Register state before/after emulation. */
148 struct kvm_vcpu *vcpu; 151 struct kvm_vcpu *vcpu;
@@ -152,6 +155,9 @@ struct x86_emulate_ctxt {
152 int mode; 155 int mode;
153 u32 cs_base; 156 u32 cs_base;
154 157
158 /* interruptibility state, as a result of execution of STI or MOV SS */
159 int interruptibility;
160
155 /* decode cache */ 161 /* decode cache */
156 struct decode_cache decode; 162 struct decode_cache decode;
157}; 163};
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 1caf57628b9c..313389cd50d2 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,13 @@
17/* Pages for switcher itself, then two pages per cpu */ 17/* Pages for switcher itself, then two pages per cpu */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) 18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
19 19
20/* We map at -4M for ease of mapping into the guest (one PTE page). */ 20/* We map at -4M (-2M when PAE is activated) for ease of mapping
21 * into the guest (one PTE page). */
22#ifdef CONFIG_X86_PAE
23#define SWITCHER_ADDR 0xFFE00000
24#else
21#define SWITCHER_ADDR 0xFFC00000 25#define SWITCHER_ADDR 0xFFC00000
26#endif
22 27
23/* Found in switcher.S */ 28/* Found in switcher.S */
24extern unsigned long default_idt_entries[]; 29extern unsigned long default_idt_entries[];
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index faae1996487b..d31c4a684078 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -12,11 +12,13 @@
12#define LHCALL_TS 8 12#define LHCALL_TS 8
13#define LHCALL_SET_CLOCKEVENT 9 13#define LHCALL_SET_CLOCKEVENT 9
14#define LHCALL_HALT 10 14#define LHCALL_HALT 10
15#define LHCALL_SET_PMD 13
15#define LHCALL_SET_PTE 14 16#define LHCALL_SET_PTE 14
16#define LHCALL_SET_PMD 15 17#define LHCALL_SET_PGD 15
17#define LHCALL_LOAD_TLS 16 18#define LHCALL_LOAD_TLS 16
18#define LHCALL_NOTIFY 17 19#define LHCALL_NOTIFY 17
19#define LHCALL_LOAD_GDT_ENTRY 18 20#define LHCALL_LOAD_GDT_ENTRY 18
21#define LHCALL_SEND_INTERRUPTS 19
20 22
21#define LGUEST_TRAP_ENTRY 0x1F 23#define LGUEST_TRAP_ENTRY 0x1F
22 24
@@ -32,10 +34,10 @@
32 * operations? There are two ways: the direct way is to make a "hypercall", 34 * operations? There are two ways: the direct way is to make a "hypercall",
33 * to make requests of the Host Itself. 35 * to make requests of the Host Itself.
34 * 36 *
35 * We use the KVM hypercall mechanism. Eighteen hypercalls are 37 * We use the KVM hypercall mechanism. Seventeen hypercalls are
36 * available: the hypercall number is put in the %eax register, and the 38 * available: the hypercall number is put in the %eax register, and the
37 * arguments (when required) are placed in %ebx, %ecx and %edx. If a return 39 * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
38 * value makes sense, it's returned in %eax. 40 * If a return value makes sense, it's returned in %eax.
39 * 41 *
40 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 42 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
41 * Host, rather than returning failure. This reflects Winston Churchill's 43 * Host, rather than returning failure. This reflects Winston Churchill's
@@ -47,8 +49,9 @@
47 49
48#define LHCALL_RING_SIZE 64 50#define LHCALL_RING_SIZE 64
49struct hcall_args { 51struct hcall_args {
50 /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ 52 /* These map directly onto eax, ebx, ecx, edx and esi
51 unsigned long arg0, arg1, arg2, arg3; 53 * in struct lguest_regs */
54 unsigned long arg0, arg1, arg2, arg3, arg4;
52}; 55};
53 56
54#endif /* !__ASSEMBLY__ */ 57#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4f8c199584e7..5cdd8d100ec9 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -1,8 +1,6 @@
1#ifndef _ASM_X86_MCE_H 1#ifndef _ASM_X86_MCE_H
2#define _ASM_X86_MCE_H 2#define _ASM_X86_MCE_H
3 3
4#ifdef __x86_64__
5
6#include <linux/types.h> 4#include <linux/types.h>
7#include <asm/ioctls.h> 5#include <asm/ioctls.h>
8 6
@@ -10,21 +8,35 @@
10 * Machine Check support for x86 8 * Machine Check support for x86
11 */ 9 */
12 10
13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ 11#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
14#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ 12#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */
15#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ 13#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
16 14#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
17#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ 15#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */
18#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ 16#define MCG_EXT_CNT_SHIFT 16
19#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */ 17#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
20 18#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
21#define MCI_STATUS_VAL (1UL<<63) /* valid error */ 19
22#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */ 20#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
23#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */ 21#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
24#define MCI_STATUS_EN (1UL<<60) /* error enabled */ 22#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
25#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */ 23
26#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. valid */ 24#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
27#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */ 25#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
26#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */
27#define MCI_STATUS_EN (1ULL<<60) /* error enabled */
28#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
29#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
30#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
31#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
32#define MCI_STATUS_AR (1ULL<<55) /* Action required */
33
34/* MISC register defines */
35#define MCM_ADDR_SEGOFF 0 /* segment offset */
36#define MCM_ADDR_LINEAR 1 /* linear address */
37#define MCM_ADDR_PHYS 2 /* physical address */
38#define MCM_ADDR_MEM 3 /* memory address */
39#define MCM_ADDR_GENERIC 7 /* generic */
28 40
29/* Fields are zero when not available */ 41/* Fields are zero when not available */
30struct mce { 42struct mce {
@@ -34,13 +46,19 @@ struct mce {
34 __u64 mcgstatus; 46 __u64 mcgstatus;
35 __u64 ip; 47 __u64 ip;
36 __u64 tsc; /* cpu time stamp counter */ 48 __u64 tsc; /* cpu time stamp counter */
37 __u64 res1; /* for future extension */ 49 __u64 time; /* wall time_t when error was detected */
38 __u64 res2; /* dito. */ 50 __u8 cpuvendor; /* cpu vendor as encoded in system.h */
51 __u8 pad1;
52 __u16 pad2;
53 __u32 cpuid; /* CPUID 1 EAX */
39 __u8 cs; /* code segment */ 54 __u8 cs; /* code segment */
40 __u8 bank; /* machine check bank */ 55 __u8 bank; /* machine check bank */
41 __u8 cpu; /* cpu that raised the error */ 56 __u8 cpu; /* cpu number; obsolete; use extcpu now */
42 __u8 finished; /* entry is valid */ 57 __u8 finished; /* entry is valid */
43 __u32 pad; 58 __u32 extcpu; /* linux cpu number that detected the error */
59 __u32 socketid; /* CPU socket ID */
60 __u32 apicid; /* CPU initial apic ID */
61 __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
44}; 62};
45 63
46/* 64/*
@@ -57,7 +75,7 @@ struct mce_log {
57 unsigned len; /* = MCE_LOG_LEN */ 75 unsigned len; /* = MCE_LOG_LEN */
58 unsigned next; 76 unsigned next;
59 unsigned flags; 77 unsigned flags;
60 unsigned pad0; 78 unsigned recordlen; /* length of struct mce */
61 struct mce entry[MCE_LOG_LEN]; 79 struct mce entry[MCE_LOG_LEN];
62}; 80};
63 81
@@ -82,20 +100,41 @@ struct mce_log {
82#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) 100#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
83#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) 101#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)
84 102
85#endif /* __x86_64__ */
86
87#ifdef __KERNEL__ 103#ifdef __KERNEL__
88 104
89#ifdef CONFIG_X86_32 105#include <linux/percpu.h>
106#include <linux/init.h>
107#include <asm/atomic.h>
108
90extern int mce_disabled; 109extern int mce_disabled;
91#else /* CONFIG_X86_32 */ 110extern int mce_p5_enabled;
92 111
93#include <asm/atomic.h> 112#ifdef CONFIG_X86_MCE
113void mcheck_init(struct cpuinfo_x86 *c);
114#else
115static inline void mcheck_init(struct cpuinfo_x86 *c) {}
116#endif
117
118#ifdef CONFIG_X86_OLD_MCE
119extern int nr_mce_banks;
120void amd_mcheck_init(struct cpuinfo_x86 *c);
121void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
122void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
123#endif
124
125#ifdef CONFIG_X86_ANCIENT_MCE
126void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
127void winchip_mcheck_init(struct cpuinfo_x86 *c);
128static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
129#else
130static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
131static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
132static inline void enable_p5_mce(void) {}
133#endif
94 134
95void mce_setup(struct mce *m); 135void mce_setup(struct mce *m);
96void mce_log(struct mce *m); 136void mce_log(struct mce *m);
97DECLARE_PER_CPU(struct sys_device, device_mce); 137DECLARE_PER_CPU(struct sys_device, mce_dev);
98extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
99 138
100/* 139/*
101 * To support more than 128 would need to escape the predefined 140 * To support more than 128 would need to escape the predefined
@@ -104,6 +143,8 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
104#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) 143#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
105 144
106#ifdef CONFIG_X86_MCE_INTEL 145#ifdef CONFIG_X86_MCE_INTEL
146extern int mce_cmci_disabled;
147extern int mce_ignore_ce;
107void mce_intel_feature_init(struct cpuinfo_x86 *c); 148void mce_intel_feature_init(struct cpuinfo_x86 *c);
108void cmci_clear(void); 149void cmci_clear(void);
109void cmci_reenable(void); 150void cmci_reenable(void);
@@ -123,14 +164,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
123static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } 164static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
124#endif 165#endif
125 166
126extern int mce_available(struct cpuinfo_x86 *c); 167int mce_available(struct cpuinfo_x86 *c);
127 168
128void mce_log_therm_throt_event(__u64 status); 169DECLARE_PER_CPU(unsigned, mce_exception_count);
170DECLARE_PER_CPU(unsigned, mce_poll_count);
129 171
130extern atomic_t mce_entry; 172extern atomic_t mce_entry;
131 173
132extern void do_machine_check(struct pt_regs *, long);
133
134typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); 174typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
135DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); 175DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
136 176
@@ -139,19 +179,40 @@ enum mcp_flags {
139 MCP_UC = (1 << 1), /* log uncorrected errors */ 179 MCP_UC = (1 << 1), /* log uncorrected errors */
140 MCP_DONTLOG = (1 << 2), /* only clear, don't log */ 180 MCP_DONTLOG = (1 << 2), /* only clear, don't log */
141}; 181};
142extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); 182void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
143 183
144extern int mce_notify_user(void); 184int mce_notify_irq(void);
185void mce_notify_process(void);
145 186
146#endif /* !CONFIG_X86_32 */ 187DECLARE_PER_CPU(struct mce, injectm);
188extern struct file_operations mce_chrdev_ops;
147 189
148#ifdef CONFIG_X86_MCE 190/*
149extern void mcheck_init(struct cpuinfo_x86 *c); 191 * Exception handler
150#else 192 */
151#define mcheck_init(c) do { } while (0) 193
152#endif 194/* Call the installed machine check handler for this CPU setup. */
195extern void (*machine_check_vector)(struct pt_regs *, long error_code);
196void do_machine_check(struct pt_regs *, long);
197
198/*
199 * Threshold handler
200 */
153 201
154extern void (*mce_threshold_vector)(void); 202extern void (*mce_threshold_vector)(void);
203extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
204
205/*
206 * Thermal handler
207 */
208
209void intel_init_thermal(struct cpuinfo_x86 *c);
210
211#ifdef CONFIG_X86_NEW_MCE
212void mce_log_therm_throt_event(__u64 status);
213#else
214static inline void mce_log_therm_throt_event(__u64 status) {}
215#endif
155 216
156#endif /* __KERNEL__ */ 217#endif /* __KERNEL__ */
157#endif /* _ASM_X86_MCE_H */ 218#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index c882664716c1..ef51b501e22a 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -9,20 +9,31 @@ struct cpu_signature {
9 9
10struct device; 10struct device;
11 11
12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
13
12struct microcode_ops { 14struct microcode_ops {
13 int (*request_microcode_user) (int cpu, const void __user *buf, size_t size); 15 enum ucode_state (*request_microcode_user) (int cpu,
14 int (*request_microcode_fw) (int cpu, struct device *device); 16 const void __user *buf, size_t size);
15 17
16 void (*apply_microcode) (int cpu); 18 enum ucode_state (*request_microcode_fw) (int cpu,
19 struct device *device);
17 20
18 int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
19 void (*microcode_fini_cpu) (int cpu); 21 void (*microcode_fini_cpu) (int cpu);
22
23 /*
24 * The generic 'microcode_core' part guarantees that
25 * the callbacks below run on a target cpu when they
26 * are being called.
27 * See also the "Synchronization" section in microcode_core.c.
28 */
29 int (*apply_microcode) (int cpu);
30 int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
20}; 31};
21 32
22struct ucode_cpu_info { 33struct ucode_cpu_info {
23 struct cpu_signature cpu_sig; 34 struct cpu_signature cpu_sig;
24 int valid; 35 int valid;
25 void *mc; 36 void *mc;
26}; 37};
27extern struct ucode_cpu_info ucode_cpu_info[]; 38extern struct ucode_cpu_info ucode_cpu_info[];
28 39
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
index 90bc4108a4fd..751af2550ed9 100644
--- a/arch/x86/include/asm/mman.h
+++ b/arch/x86/include/asm/mman.h
@@ -1,7 +1,7 @@
1#ifndef _ASM_X86_MMAN_H 1#ifndef _ASM_X86_MMAN_H
2#define _ASM_X86_MMAN_H 2#define _ASM_X86_MMAN_H
3 3
4#include <asm-generic/mman.h> 4#include <asm-generic/mman-common.h>
5 5
6#define MAP_32BIT 0x40 /* only give out 32bit addresses */ 6#define MAP_32BIT 0x40 /* only give out 32bit addresses */
7 7
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 642fc7fc8cdc..e2a1bb6d71ea 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -61,9 +61,11 @@ extern void get_smp_config(void);
61#ifdef CONFIG_X86_MPPARSE 61#ifdef CONFIG_X86_MPPARSE
62extern void find_smp_config(void); 62extern void find_smp_config(void);
63extern void early_reserve_e820_mpc_new(void); 63extern void early_reserve_e820_mpc_new(void);
64extern int enable_update_mptable;
64#else 65#else
65static inline void find_smp_config(void) { } 66static inline void find_smp_config(void) { }
66static inline void early_reserve_e820_mpc_new(void) { } 67static inline void early_reserve_e820_mpc_new(void) { }
68#define enable_update_mptable 0
67#endif 69#endif
68 70
69void __cpuinit generic_processor_info(int apicid, int version); 71void __cpuinit generic_processor_info(int apicid, int version);
@@ -72,20 +74,13 @@ extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
72extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, 74extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
73 u32 gsi); 75 u32 gsi);
74extern void mp_config_acpi_legacy_irqs(void); 76extern void mp_config_acpi_legacy_irqs(void);
75extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); 77struct device;
78extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
79 int active_high_low);
76extern int acpi_probe_gsi(void); 80extern int acpi_probe_gsi(void);
77#ifdef CONFIG_X86_IO_APIC 81#ifdef CONFIG_X86_IO_APIC
78extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
79 u32 gsi, int triggering, int polarity);
80extern int mp_find_ioapic(int gsi); 82extern int mp_find_ioapic(int gsi);
81extern int mp_find_ioapic_pin(int ioapic, int gsi); 83extern int mp_find_ioapic_pin(int ioapic, int gsi);
82#else
83static inline int
84mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
85 u32 gsi, int triggering, int polarity)
86{
87 return 0;
88}
89#endif 84#endif
90#else /* !CONFIG_ACPI: */ 85#else /* !CONFIG_ACPI: */
91static inline int acpi_probe_gsi(void) 86static inline int acpi_probe_gsi(void)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ec41fc16c167..1692fb5050e3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -121,7 +121,6 @@
121#define MSR_K8_TOP_MEM1 0xc001001a 121#define MSR_K8_TOP_MEM1 0xc001001a
122#define MSR_K8_TOP_MEM2 0xc001001d 122#define MSR_K8_TOP_MEM2 0xc001001d
123#define MSR_K8_SYSCFG 0xc0010010 123#define MSR_K8_SYSCFG 0xc0010010
124#define MSR_K8_HWCR 0xc0010015
125#define MSR_K8_INT_PENDING_MSG 0xc0010055 124#define MSR_K8_INT_PENDING_MSG 0xc0010055
126/* C1E active bits in int pending message */ 125/* C1E active bits in int pending message */
127#define K8_INTP_C1E_ACTIVE_MASK 0x18000000 126#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
@@ -208,7 +207,14 @@
208 207
209#define MSR_IA32_THERM_CONTROL 0x0000019a 208#define MSR_IA32_THERM_CONTROL 0x0000019a
210#define MSR_IA32_THERM_INTERRUPT 0x0000019b 209#define MSR_IA32_THERM_INTERRUPT 0x0000019b
210
211#define THERM_INT_LOW_ENABLE (1 << 0)
212#define THERM_INT_HIGH_ENABLE (1 << 1)
213
211#define MSR_IA32_THERM_STATUS 0x0000019c 214#define MSR_IA32_THERM_STATUS 0x0000019c
215
216#define THERM_STATUS_PROCHOT (1 << 0)
217
212#define MSR_IA32_MISC_ENABLE 0x000001a0 218#define MSR_IA32_MISC_ENABLE 0x000001a0
213 219
214/* MISC_ENABLE bits: architectural */ 220/* MISC_ENABLE bits: architectural */
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 638bf6241807..48ad9d29484a 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -3,15 +3,23 @@
3 3
4#include <asm/msr-index.h> 4#include <asm/msr-index.h>
5 5
6#ifndef __ASSEMBLY__
7# include <linux/types.h>
8#endif
9
10#ifdef __KERNEL__ 6#ifdef __KERNEL__
11#ifndef __ASSEMBLY__ 7#ifndef __ASSEMBLY__
12 8
9#include <linux/types.h>
13#include <asm/asm.h> 10#include <asm/asm.h>
14#include <asm/errno.h> 11#include <asm/errno.h>
12#include <asm/cpumask.h>
13
14struct msr {
15 union {
16 struct {
17 u32 l;
18 u32 h;
19 };
20 u64 q;
21 };
22};
15 23
16static inline unsigned long long native_read_tscp(unsigned int *aux) 24static inline unsigned long long native_read_tscp(unsigned int *aux)
17{ 25{
@@ -216,6 +224,8 @@ do { \
216#ifdef CONFIG_SMP 224#ifdef CONFIG_SMP
217int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 225int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
218int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 226int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
227void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
228void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
219int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 229int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
220int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 230int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
221#else /* CONFIG_SMP */ 231#else /* CONFIG_SMP */
@@ -229,6 +239,16 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
229 wrmsr(msr_no, l, h); 239 wrmsr(msr_no, l, h);
230 return 0; 240 return 0;
231} 241}
242static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no,
243 struct msr *msrs)
244{
245 rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
246}
247static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no,
248 struct msr *msrs)
249{
250 wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
251}
232static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, 252static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
233 u32 *l, u32 *h) 253 u32 *l, u32 *h)
234{ 254{
@@ -241,6 +261,4 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
241#endif /* CONFIG_SMP */ 261#endif /* CONFIG_SMP */
242#endif /* __ASSEMBLY__ */ 262#endif /* __ASSEMBLY__ */
243#endif /* __KERNEL__ */ 263#endif /* __KERNEL__ */
244
245
246#endif /* _ASM_X86_MSR_H */ 264#endif /* _ASM_X86_MSR_H */
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c45a0a568dff..c97264409934 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -64,7 +64,7 @@ static inline int nmi_watchdog_active(void)
64 * but since they are power of two we could use a 64 * but since they are power of two we could use a
65 * cheaper way --cvg 65 * cheaper way --cvg
66 */ 66 */
67 return nmi_watchdog & 0x3; 67 return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
68} 68}
69#endif 69#endif
70 70
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 064ed6df4cbe..c4ae822e415f 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
17extern void numa_init_array(void); 17extern void numa_init_array(void);
18extern int numa_off; 18extern int numa_off;
19 19
20extern void srat_reserve_add_area(int nodeid);
21extern int hotadd_percent;
22
23extern s16 apicid_to_node[MAX_LOCAL_APIC]; 20extern s16 apicid_to_node[MAX_LOCAL_APIC];
24 21
25extern unsigned long numa_free_all_bootmem(void); 22extern unsigned long numa_free_all_bootmem(void);
@@ -27,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
27 unsigned long end); 24 unsigned long end);
28 25
29#ifdef CONFIG_NUMA 26#ifdef CONFIG_NUMA
27/*
28 * Too small node sizes may confuse the VM badly. Usually they
29 * result from BIOS bugs. So dont recognize nodes as standalone
30 * NUMA entities that have less than this amount of RAM listed:
31 */
32#define NODE_MIN_SIZE (4*1024*1024)
33
30extern void __init init_cpu_to_node(void); 34extern void __init init_cpu_to_node(void);
31extern void __cpuinit numa_set_node(int cpu, int node); 35extern void __cpuinit numa_set_node(int cpu, int node);
32extern void __cpuinit numa_clear_node(int cpu); 36extern void __cpuinit numa_clear_node(int cpu);
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 89ed9d70b0aa..625c3f0e741a 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -56,7 +56,7 @@ extern bool __virt_addr_valid(unsigned long kaddr);
56#endif /* __ASSEMBLY__ */ 56#endif /* __ASSEMBLY__ */
57 57
58#include <asm-generic/memory_model.h> 58#include <asm-generic/memory_model.h>
59#include <asm-generic/page.h> 59#include <asm-generic/getorder.h>
60 60
61#define __HAVE_ARCH_GATE_AREA 1 61#define __HAVE_ARCH_GATE_AREA 1
62 62
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 0f915ae649a7..6f1b7331313f 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE;
54extern int sysctl_legacy_va_layout; 54extern int sysctl_legacy_va_layout;
55 55
56extern void find_low_pfn_range(void); 56extern void find_low_pfn_range(void);
57extern unsigned long init_memory_mapping(unsigned long start,
58 unsigned long end);
59extern void initmem_init(unsigned long, unsigned long);
60extern void free_initmem(void);
61extern void setup_bootmem_allocator(void); 57extern void setup_bootmem_allocator(void);
62 58
63#endif /* !__ASSEMBLY__ */ 59#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index d38c91b70248..7639dbf5d223 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -32,24 +32,16 @@
32 */ 32 */
33#define __PAGE_OFFSET _AC(0xffff880000000000, UL) 33#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
34 34
35#define __PHYSICAL_START CONFIG_PHYSICAL_START 35#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \
36#define __KERNEL_ALIGN 0x200000 36 (CONFIG_PHYSICAL_ALIGN - 1)) & \
37 37 ~(CONFIG_PHYSICAL_ALIGN - 1))
38/*
39 * Make sure kernel is aligned to 2MB address. Catching it at compile
40 * time is better. Change your config file and compile the kernel
41 * for a 2MB aligned address (CONFIG_PHYSICAL_START)
42 */
43#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
44#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
45#endif
46 38
47#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) 39#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
48#define __START_KERNEL_map _AC(0xffffffff80000000, UL) 40#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
49 41
50/* See Documentation/x86_64/mm.txt for a description of the memory map. */ 42/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
51#define __PHYSICAL_MASK_SHIFT 46 43#define __PHYSICAL_MASK_SHIFT 46
52#define __VIRTUAL_MASK_SHIFT 48 44#define __VIRTUAL_MASK_SHIFT 47
53 45
54/* 46/*
55 * Kernel image size is limited to 512 MB (see level2_kernel_pgt in 47 * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
@@ -71,12 +63,6 @@ extern unsigned long __phys_addr(unsigned long);
71 63
72#define vmemmap ((struct page *)VMEMMAP_START) 64#define vmemmap ((struct page *)VMEMMAP_START)
73 65
74extern unsigned long init_memory_mapping(unsigned long start,
75 unsigned long end);
76
77extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
78extern void free_initmem(void);
79
80extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); 66extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
81extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); 67extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
82 68
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 826ad37006ab..6473f5ccff85 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr);
46extern unsigned long max_low_pfn_mapped; 46extern unsigned long max_low_pfn_mapped;
47extern unsigned long max_pfn_mapped; 47extern unsigned long max_pfn_mapped;
48 48
49extern unsigned long init_memory_mapping(unsigned long start,
50 unsigned long end);
51
52extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
53extern void free_initmem(void);
54
49#endif /* !__ASSEMBLY__ */ 55#endif /* !__ASSEMBLY__ */
50 56
51#endif /* _ASM_X86_PAGE_DEFS_H */ 57#endif /* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a53da004e08e..4fb37c8a0832 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -56,6 +56,7 @@ struct desc_ptr;
56struct tss_struct; 56struct tss_struct;
57struct mm_struct; 57struct mm_struct;
58struct desc_struct; 58struct desc_struct;
59struct task_struct;
59 60
60/* 61/*
61 * Wrapper type for pointers to code which uses the non-standard 62 * Wrapper type for pointers to code which uses the non-standard
@@ -203,7 +204,8 @@ struct pv_cpu_ops {
203 204
204 void (*swapgs)(void); 205 void (*swapgs)(void);
205 206
206 struct pv_lazy_ops lazy_mode; 207 void (*start_context_switch)(struct task_struct *prev);
208 void (*end_context_switch)(struct task_struct *next);
207}; 209};
208 210
209struct pv_irq_ops { 211struct pv_irq_ops {
@@ -1399,25 +1401,23 @@ enum paravirt_lazy_mode {
1399}; 1401};
1400 1402
1401enum paravirt_lazy_mode paravirt_get_lazy_mode(void); 1403enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
1402void paravirt_enter_lazy_cpu(void); 1404void paravirt_start_context_switch(struct task_struct *prev);
1403void paravirt_leave_lazy_cpu(void); 1405void paravirt_end_context_switch(struct task_struct *next);
1406
1404void paravirt_enter_lazy_mmu(void); 1407void paravirt_enter_lazy_mmu(void);
1405void paravirt_leave_lazy_mmu(void); 1408void paravirt_leave_lazy_mmu(void);
1406void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
1407 1409
1408#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE 1410#define __HAVE_ARCH_START_CONTEXT_SWITCH
1409static inline void arch_enter_lazy_cpu_mode(void) 1411static inline void arch_start_context_switch(struct task_struct *prev)
1410{ 1412{
1411 PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter); 1413 PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
1412} 1414}
1413 1415
1414static inline void arch_leave_lazy_cpu_mode(void) 1416static inline void arch_end_context_switch(struct task_struct *next)
1415{ 1417{
1416 PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); 1418 PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
1417} 1419}
1418 1420
1419void arch_flush_lazy_cpu_mode(void);
1420
1421#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE 1421#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
1422static inline void arch_enter_lazy_mmu_mode(void) 1422static inline void arch_enter_lazy_mmu_mode(void)
1423{ 1423{
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index b51a1e8b0baf..927958d13c19 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -130,6 +130,7 @@ extern void pci_iommu_alloc(void);
130 130
131/* generic pci stuff */ 131/* generic pci stuff */
132#include <asm-generic/pci.h> 132#include <asm-generic/pci.h>
133#define PCIBIOS_MAX_MEM_32 0xffffffff
133 134
134#ifdef CONFIG_NUMA 135#ifdef CONFIG_NUMA
135/* Returns the node based on pci bus */ 136/* Returns the node based on pci bus */
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index e60fd3e14bdf..b399988eee3a 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -121,6 +121,9 @@ extern int __init pcibios_init(void);
121extern int __init pci_mmcfg_arch_init(void); 121extern int __init pci_mmcfg_arch_init(void);
122extern void __init pci_mmcfg_arch_free(void); 122extern void __init pci_mmcfg_arch_free(void);
123 123
124extern struct acpi_mcfg_allocation *pci_mmcfg_config;
125extern int pci_mmcfg_config_num;
126
124/* 127/*
125 * AMD Fam10h CPUs are buggy, and cannot access MMIO config space 128 * AMD Fam10h CPUs are buggy, and cannot access MMIO config space
126 * on their northbrige except through the * %eax register. As such, you MUST 129 * on their northbrige except through the * %eax register. As such, you MUST
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 000000000000..5fb33e160ea0
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,95 @@
1#ifndef _ASM_X86_PERF_COUNTER_H
2#define _ASM_X86_PERF_COUNTER_H
3
4/*
5 * Performance counter hw details:
6 */
7
8#define X86_PMC_MAX_GENERIC 8
9#define X86_PMC_MAX_FIXED 3
10
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64
14
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
16#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
17
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
25
26/*
27 * Includes eventsel and unit mask as well:
28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff
30
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36
37#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
38
39/*
40 * Intel "Architectural Performance Monitoring" CPUID
41 * detection/enumeration details:
42 */
43union cpuid10_eax {
44 struct {
45 unsigned int version_id:8;
46 unsigned int num_counters:8;
47 unsigned int bit_width:8;
48 unsigned int mask_length:8;
49 } split;
50 unsigned int full;
51};
52
53union cpuid10_edx {
54 struct {
55 unsigned int num_counters_fixed:4;
56 unsigned int reserved:28;
57 } split;
58 unsigned int full;
59};
60
61
62/*
63 * Fixed-purpose performance counters:
64 */
65
66/*
67 * All 3 fixed-mode PMCs are configured via this single MSR:
68 */
69#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
70
71/*
72 * The counts are available in three separate MSRs:
73 */
74
75/* Instr_Retired.Any: */
76#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
77#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
78
79/* CPU_CLK_Unhalted.Core: */
80#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
81#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
82
83/* CPU_CLK_Unhalted.Ref: */
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86
87#ifdef CONFIG_PERF_COUNTERS
88extern void init_hw_perf_counters(void);
89extern void perf_counters_lapic_init(void);
90#else
91static inline void init_hw_perf_counters(void) { }
92static inline void perf_counters_lapic_init(void) { }
93#endif
94
95#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 29d96d168bc0..3cc06e3fceb8 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -81,6 +81,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
81#define pte_val(x) native_pte_val(x) 81#define pte_val(x) native_pte_val(x)
82#define __pte(x) native_make_pte(x) 82#define __pte(x) native_make_pte(x)
83 83
84#define arch_end_context_switch(prev) do {} while(0)
85
84#endif /* CONFIG_PARAVIRT */ 86#endif /* CONFIG_PARAVIRT */
85 87
86/* 88/*
@@ -315,6 +317,11 @@ static inline int pte_present(pte_t a)
315 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); 317 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
316} 318}
317 319
320static inline int pte_hidden(pte_t pte)
321{
322 return pte_flags(pte) & _PAGE_HIDDEN;
323}
324
318static inline int pmd_present(pmd_t pmd) 325static inline int pmd_present(pmd_t pmd)
319{ 326{
320 return pmd_flags(pmd) & _PAGE_PRESENT; 327 return pmd_flags(pmd) & _PAGE_PRESENT;
@@ -503,6 +510,8 @@ static inline int pgd_none(pgd_t pgd)
503 510
504#ifndef __ASSEMBLY__ 511#ifndef __ASSEMBLY__
505 512
513extern int direct_gbpages;
514
506/* local pte updates need not use xchg for locking */ 515/* local pte updates need not use xchg for locking */
507static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) 516static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
508{ 517{
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 31bd120cf2a2..01fd9461d323 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -49,13 +49,17 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
49#endif 49#endif
50 50
51#if defined(CONFIG_HIGHPTE) 51#if defined(CONFIG_HIGHPTE)
52#define __KM_PTE \
53 (in_nmi() ? KM_NMI_PTE : \
54 in_irq() ? KM_IRQ_PTE : \
55 KM_PTE0)
52#define pte_offset_map(dir, address) \ 56#define pte_offset_map(dir, address) \
53 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \ 57 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \
54 pte_index((address))) 58 pte_index((address)))
55#define pte_offset_map_nested(dir, address) \ 59#define pte_offset_map_nested(dir, address) \
56 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ 60 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
57 pte_index((address))) 61 pte_index((address)))
58#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0) 62#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
59#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) 63#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
60#else 64#else
61#define pte_offset_map(dir, address) \ 65#define pte_offset_map(dir, address) \
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index 2733fad45f98..5e67c1532314 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -46,6 +46,10 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
46# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) 46# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
47#endif 47#endif
48 48
49#define MODULES_VADDR VMALLOC_START
50#define MODULES_END VMALLOC_END
51#define MODULES_LEN (MODULES_VADDR - MODULES_END)
52
49#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) 53#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
50 54
51#endif /* _ASM_X86_PGTABLE_32_DEFS_H */ 55#endif /* _ASM_X86_PGTABLE_32_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 6b87bc6d5018..c57a30117149 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[];
25 25
26extern void paging_init(void); 26extern void paging_init(void);
27 27
28#endif /* !__ASSEMBLY__ */
29
30#ifndef __ASSEMBLY__
31
32#define pte_ERROR(e) \ 28#define pte_ERROR(e) \
33 printk("%s:%d: bad pte %p(%016lx).\n", \ 29 printk("%s:%d: bad pte %p(%016lx).\n", \
34 __FILE__, __LINE__, &(e), pte_val(e)) 30 __FILE__, __LINE__, &(e), pte_val(e))
@@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
135 131
136#define update_mmu_cache(vma, address, pte) do { } while (0) 132#define update_mmu_cache(vma, address, pte) do { } while (0)
137 133
138extern int direct_gbpages;
139
140/* Encode and de-code a swap entry */ 134/* Encode and de-code a swap entry */
141#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE 135#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
142#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) 136#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
@@ -171,10 +165,7 @@ extern void cleanup_highmap(void);
171 165
172/* fs/proc/kcore.c */ 166/* fs/proc/kcore.c */
173#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) 167#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
174#define kc_offset_to_vaddr(o) \ 168#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
175 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
176 ? ((o) | ~__VIRTUAL_MASK) \
177 : (o))
178 169
179#define __HAVE_ARCH_PTE_SAME 170#define __HAVE_ARCH_PTE_SAME
180#endif /* !__ASSEMBLY__ */ 171#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index fbf42b8e0383..766ea16fbbbd 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t;
51#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) 51#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
52#define PGDIR_MASK (~(PGDIR_SIZE - 1)) 52#define PGDIR_MASK (~(PGDIR_SIZE - 1))
53 53
54 54/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
55#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) 55#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
56#define VMALLOC_START _AC(0xffffc20000000000, UL) 56#define VMALLOC_START _AC(0xffffc90000000000, UL)
57#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) 57#define VMALLOC_END _AC(0xffffe8ffffffffff, UL)
58#define VMEMMAP_START _AC(0xffffe20000000000, UL) 58#define VMEMMAP_START _AC(0xffffea0000000000, UL)
59#define MODULES_VADDR _AC(0xffffffffa0000000, UL) 59#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
60#define MODULES_END _AC(0xffffffffff000000, UL) 60#define MODULES_END _AC(0xffffffffff000000, UL)
61#define MODULES_LEN (MODULES_END - MODULES_VADDR) 61#define MODULES_LEN (MODULES_END - MODULES_VADDR)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b8238dc8786d..54cb697f4900 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -18,7 +18,7 @@
18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ 18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
19#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ 19#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
20#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ 20#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
21#define _PAGE_BIT_UNUSED3 11 21#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
@@ -41,13 +41,18 @@
41#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) 41#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
42#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) 42#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
43#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) 43#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
44#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
45#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) 44#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 45#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 46#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 47#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define __HAVE_ARCH_PTE_SPECIAL 48#define __HAVE_ARCH_PTE_SPECIAL
50 49
50#ifdef CONFIG_KMEMCHECK
51#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
52#else
53#define _PAGE_HIDDEN (_AT(pteval_t, 0))
54#endif
55
51#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 56#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
52#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) 57#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
53#else 58#else
@@ -273,7 +278,6 @@ typedef struct page *pgtable_t;
273 278
274extern pteval_t __supported_pte_mask; 279extern pteval_t __supported_pte_mask;
275extern int nx_enabled; 280extern int nx_enabled;
276extern void set_nx(void);
277 281
278#define pgprot_writecombine pgprot_writecombine 282#define pgprot_writecombine pgprot_writecombine
279extern pgprot_t pgprot_writecombine(pgprot_t prot); 283extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c2cceae709c8..c7768269b1cf 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -135,7 +135,8 @@ extern struct cpuinfo_x86 boot_cpu_data;
135extern struct cpuinfo_x86 new_cpu_data; 135extern struct cpuinfo_x86 new_cpu_data;
136 136
137extern struct tss_struct doublefault_tss; 137extern struct tss_struct doublefault_tss;
138extern __u32 cleared_cpu_caps[NCAPINTS]; 138extern __u32 cpu_caps_cleared[NCAPINTS];
139extern __u32 cpu_caps_set[NCAPINTS];
139 140
140#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
141DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 142DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@ -409,9 +410,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary);
409extern unsigned int xstate_size; 410extern unsigned int xstate_size;
410extern void free_thread_xstate(struct task_struct *); 411extern void free_thread_xstate(struct task_struct *);
411extern struct kmem_cache *task_xstate_cachep; 412extern struct kmem_cache *task_xstate_cachep;
412extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
413extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
414extern unsigned short num_cache_leaves;
415 413
416struct thread_struct { 414struct thread_struct {
417 /* Cached TLS descriptors: */ 415 /* Cached TLS descriptors: */
@@ -427,8 +425,12 @@ struct thread_struct {
427 unsigned short fsindex; 425 unsigned short fsindex;
428 unsigned short gsindex; 426 unsigned short gsindex;
429#endif 427#endif
428#ifdef CONFIG_X86_32
430 unsigned long ip; 429 unsigned long ip;
430#endif
431#ifdef CONFIG_X86_64
431 unsigned long fs; 432 unsigned long fs;
433#endif
432 unsigned long gs; 434 unsigned long gs;
433 /* Hardware debugging registers: */ 435 /* Hardware debugging registers: */
434 unsigned long debugreg0; 436 unsigned long debugreg0;
@@ -460,14 +462,8 @@ struct thread_struct {
460 unsigned io_bitmap_max; 462 unsigned io_bitmap_max;
461/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ 463/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
462 unsigned long debugctlmsr; 464 unsigned long debugctlmsr;
463#ifdef CONFIG_X86_DS 465 /* Debug Store context; see asm/ds.h */
464/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
465 struct ds_context *ds_ctx; 466 struct ds_context *ds_ctx;
466#endif /* CONFIG_X86_DS */
467#ifdef CONFIG_X86_PTRACE_BTS
468/* the signal to send on a bts buffer overflow */
469 unsigned int bts_ovfl_signal;
470#endif /* CONFIG_X86_PTRACE_BTS */
471}; 467};
472 468
473static inline unsigned long native_get_debugreg(int regno) 469static inline unsigned long native_get_debugreg(int regno)
@@ -795,6 +791,21 @@ static inline unsigned long get_debugctlmsr(void)
795 return debugctlmsr; 791 return debugctlmsr;
796} 792}
797 793
794static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
795{
796 u64 debugctlmsr = 0;
797 u32 val1, val2;
798
799#ifndef CONFIG_X86_DEBUGCTLMSR
800 if (boot_cpu_data.x86 < 6)
801 return 0;
802#endif
803 rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
804 debugctlmsr = val1 | ((u64)val2 << 32);
805
806 return debugctlmsr;
807}
808
798static inline void update_debugctlmsr(unsigned long debugctlmsr) 809static inline void update_debugctlmsr(unsigned long debugctlmsr)
799{ 810{
800#ifndef CONFIG_X86_DEBUGCTLMSR 811#ifndef CONFIG_X86_DEBUGCTLMSR
@@ -804,6 +815,18 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
804 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); 815 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
805} 816}
806 817
818static inline void update_debugctlmsr_on_cpu(int cpu,
819 unsigned long debugctlmsr)
820{
821#ifndef CONFIG_X86_DEBUGCTLMSR
822 if (boot_cpu_data.x86 < 6)
823 return;
824#endif
825 wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
826 (u32)((u64)debugctlmsr),
827 (u32)((u64)debugctlmsr >> 32));
828}
829
807/* 830/*
808 * from system description table in BIOS. Mostly for MCA use, but 831 * from system description table in BIOS. Mostly for MCA use, but
809 * others may find it useful: 832 * others may find it useful:
@@ -814,6 +837,7 @@ extern unsigned int BIOS_revision;
814 837
815/* Boot loader type from the setup header: */ 838/* Boot loader type from the setup header: */
816extern int bootloader_type; 839extern int bootloader_type;
840extern int bootloader_version;
817 841
818extern char ignore_fpu_irq; 842extern char ignore_fpu_irq;
819 843
@@ -874,7 +898,6 @@ static inline void spin_lock_prefetch(const void *x)
874 .vm86_info = NULL, \ 898 .vm86_info = NULL, \
875 .sysenter_cs = __KERNEL_CS, \ 899 .sysenter_cs = __KERNEL_CS, \
876 .io_bitmap_ptr = NULL, \ 900 .io_bitmap_ptr = NULL, \
877 .fs = __KERNEL_PERCPU, \
878} 901}
879 902
880/* 903/*
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 624f133943ed..0f0d908349aa 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -236,12 +236,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
236extern int do_set_thread_area(struct task_struct *p, int idx, 236extern int do_set_thread_area(struct task_struct *p, int idx,
237 struct user_desc __user *info, int can_allocate); 237 struct user_desc __user *info, int can_allocate);
238 238
239extern void x86_ptrace_untrace(struct task_struct *); 239#ifdef CONFIG_X86_PTRACE_BTS
240extern void x86_ptrace_fork(struct task_struct *child, 240extern void ptrace_bts_untrace(struct task_struct *tsk);
241 unsigned long clone_flags);
242 241
243#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk) 242#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk)
244#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) 243#endif /* CONFIG_X86_PTRACE_BTS */
245 244
246#endif /* __KERNEL__ */ 245#endif /* __KERNEL__ */
247 246
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index a4737dddfd58..64cf2d24fad1 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -48,9 +48,15 @@
48#endif 48#endif
49 49
50#ifdef CONFIG_X86_64 50#ifdef CONFIG_X86_64
51#ifdef CONFIG_PARAVIRT
52/* Paravirtualized systems may not have PSE or PGE available */
51#define NEED_PSE 0 53#define NEED_PSE 0
52#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
53#define NEED_PGE 0 54#define NEED_PGE 0
55#else
56#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31)
57#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31)
58#endif
59#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
54#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) 60#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
55#define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) 61#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
56#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) 62#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index bdc2ada05ae0..4093d1ed6db2 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -33,7 +33,6 @@ struct x86_quirks {
33 int (*setup_ioapic_ids)(void); 33 int (*setup_ioapic_ids)(void);
34}; 34};
35 35
36extern void x86_quirk_pre_intr_init(void);
37extern void x86_quirk_intr_init(void); 36extern void x86_quirk_intr_init(void);
38 37
39extern void x86_quirk_trap_init(void); 38extern void x86_quirk_trap_init(void);
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 7761a5d554bb..598457cbd0f8 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -117,7 +117,7 @@ typedef unsigned long sigset_t;
117#define MINSIGSTKSZ 2048 117#define MINSIGSTKSZ 2048
118#define SIGSTKSZ 8192 118#define SIGSTKSZ 8192
119 119
120#include <asm-generic/signal.h> 120#include <asm-generic/signal-defs.h>
121 121
122#ifndef __ASSEMBLY__ 122#ifndef __ASSEMBLY__
123 123
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 19e0d88b966d..6a84ed166aec 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -180,7 +180,7 @@ extern int safe_smp_processor_id(void);
180static inline int logical_smp_processor_id(void) 180static inline int logical_smp_processor_id(void)
181{ 181{
182 /* we don't want to mark this access volatile - bad code generation */ 182 /* we don't want to mark this access volatile - bad code generation */
183 return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); 183 return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
184} 184}
185 185
186#endif 186#endif
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index e3cc3c063ec5..4517d6b93188 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,7 +27,7 @@
27#else /* CONFIG_X86_32 */ 27#else /* CONFIG_X86_32 */
28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ 28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
29# define MAX_PHYSADDR_BITS 44 29# define MAX_PHYSADDR_BITS 44
30# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */ 30# define MAX_PHYSMEM_BITS 46
31#endif 31#endif
32 32
33#endif /* CONFIG_SPARSEMEM */ 33#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 0e0e3ba827f7..c86f452256de 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -177,10 +177,18 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
177 * No 3D Now! 177 * No 3D Now!
178 */ 178 */
179 179
180#ifndef CONFIG_KMEMCHECK
180#define memcpy(t, f, n) \ 181#define memcpy(t, f, n) \
181 (__builtin_constant_p((n)) \ 182 (__builtin_constant_p((n)) \
182 ? __constant_memcpy((t), (f), (n)) \ 183 ? __constant_memcpy((t), (f), (n)) \
183 : __memcpy((t), (f), (n))) 184 : __memcpy((t), (f), (n)))
185#else
186/*
187 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
188 * because it means that we know both memory operands in advance.
189 */
190#define memcpy(t, f, n) __memcpy((t), (f), (n))
191#endif
184 192
185#endif 193#endif
186 194
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 2afe164bf1e6..19e2c468fc2c 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,6 +27,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
27 function. */ 27 function. */
28 28
29#define __HAVE_ARCH_MEMCPY 1 29#define __HAVE_ARCH_MEMCPY 1
30#ifndef CONFIG_KMEMCHECK
30#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 31#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
31extern void *memcpy(void *to, const void *from, size_t len); 32extern void *memcpy(void *to, const void *from, size_t len);
32#else 33#else
@@ -42,6 +43,13 @@ extern void *__memcpy(void *to, const void *from, size_t len);
42 __ret; \ 43 __ret; \
43}) 44})
44#endif 45#endif
46#else
47/*
48 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
49 * because it means that we know both memory operands in advance.
50 */
51#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
52#endif
45 53
46#define __HAVE_ARCH_MEMSET 54#define __HAVE_ARCH_MEMSET
47void *memset(void *s, int c, size_t n); 55void *memset(void *s, int c, size_t n);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 82ada75f3ebf..85574b7c1bc1 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -225,6 +225,7 @@ struct __attribute__ ((__packed__)) vmcb {
225#define SVM_EVTINJ_VALID_ERR (1 << 11) 225#define SVM_EVTINJ_VALID_ERR (1 << 11)
226 226
227#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK 227#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
228#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
228 229
229#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR 230#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
230#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI 231#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 7043408f6904..372b76edd63f 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * syscalls.h - Linux syscall interfaces (arch-specific) 2 * syscalls.h - Linux syscall interfaces (arch-specific)
3 * 3 *
4 * Copyright (c) 2008 Jaswinder Singh 4 * Copyright (c) 2008 Jaswinder Singh Rajput
5 * 5 *
6 * This file is released under the GPLv2. 6 * This file is released under the GPLv2.
7 * See the file COPYING for more details. 7 * See the file COPYING for more details.
@@ -12,50 +12,55 @@
12 12
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/linkage.h> 14#include <linux/linkage.h>
15#include <linux/types.h>
16#include <linux/signal.h> 15#include <linux/signal.h>
16#include <linux/types.h>
17 17
18/* Common in X86_32 and X86_64 */ 18/* Common in X86_32 and X86_64 */
19/* kernel/ioport.c */ 19/* kernel/ioport.c */
20asmlinkage long sys_ioperm(unsigned long, unsigned long, int); 20asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
21 21
22/* kernel/process.c */
23int sys_fork(struct pt_regs *);
24int sys_vfork(struct pt_regs *);
25
22/* kernel/ldt.c */ 26/* kernel/ldt.c */
23asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); 27asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
24 28
29/* kernel/signal.c */
30long sys_rt_sigreturn(struct pt_regs *);
31
25/* kernel/tls.c */ 32/* kernel/tls.c */
26asmlinkage int sys_set_thread_area(struct user_desc __user *); 33asmlinkage int sys_set_thread_area(struct user_desc __user *);
27asmlinkage int sys_get_thread_area(struct user_desc __user *); 34asmlinkage int sys_get_thread_area(struct user_desc __user *);
28 35
29/* X86_32 only */ 36/* X86_32 only */
30#ifdef CONFIG_X86_32 37#ifdef CONFIG_X86_32
38/* kernel/ioport.c */
39long sys_iopl(struct pt_regs *);
40
31/* kernel/process_32.c */ 41/* kernel/process_32.c */
32int sys_fork(struct pt_regs *);
33int sys_clone(struct pt_regs *); 42int sys_clone(struct pt_regs *);
34int sys_vfork(struct pt_regs *);
35int sys_execve(struct pt_regs *); 43int sys_execve(struct pt_regs *);
36 44
37/* kernel/signal_32.c */ 45/* kernel/signal.c */
38asmlinkage int sys_sigsuspend(int, int, old_sigset_t); 46asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
39asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, 47asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
40 struct old_sigaction __user *); 48 struct old_sigaction __user *);
41int sys_sigaltstack(struct pt_regs *); 49int sys_sigaltstack(struct pt_regs *);
42unsigned long sys_sigreturn(struct pt_regs *); 50unsigned long sys_sigreturn(struct pt_regs *);
43long sys_rt_sigreturn(struct pt_regs *);
44
45/* kernel/ioport.c */
46long sys_iopl(struct pt_regs *);
47 51
48/* kernel/sys_i386_32.c */ 52/* kernel/sys_i386_32.c */
53struct mmap_arg_struct;
54struct sel_arg_struct;
55struct oldold_utsname;
56struct old_utsname;
57
49asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, 58asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
50 unsigned long, unsigned long, unsigned long); 59 unsigned long, unsigned long, unsigned long);
51struct mmap_arg_struct;
52asmlinkage int old_mmap(struct mmap_arg_struct __user *); 60asmlinkage int old_mmap(struct mmap_arg_struct __user *);
53struct sel_arg_struct;
54asmlinkage int old_select(struct sel_arg_struct __user *); 61asmlinkage int old_select(struct sel_arg_struct __user *);
55asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); 62asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
56struct old_utsname;
57asmlinkage int sys_uname(struct old_utsname __user *); 63asmlinkage int sys_uname(struct old_utsname __user *);
58struct oldold_utsname;
59asmlinkage int sys_olduname(struct oldold_utsname __user *); 64asmlinkage int sys_olduname(struct oldold_utsname __user *);
60 65
61/* kernel/vm86_32.c */ 66/* kernel/vm86_32.c */
@@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *);
65#else /* CONFIG_X86_32 */ 70#else /* CONFIG_X86_32 */
66 71
67/* X86_64 only */ 72/* X86_64 only */
73/* kernel/ioport.c */
74asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
75
68/* kernel/process_64.c */ 76/* kernel/process_64.c */
69asmlinkage long sys_fork(struct pt_regs *);
70asmlinkage long sys_clone(unsigned long, unsigned long, 77asmlinkage long sys_clone(unsigned long, unsigned long,
71 void __user *, void __user *, 78 void __user *, void __user *,
72 struct pt_regs *); 79 struct pt_regs *);
73asmlinkage long sys_vfork(struct pt_regs *);
74asmlinkage long sys_execve(char __user *, char __user * __user *, 80asmlinkage long sys_execve(char __user *, char __user * __user *,
75 char __user * __user *, 81 char __user * __user *,
76 struct pt_regs *); 82 struct pt_regs *);
77long sys_arch_prctl(int, unsigned long); 83long sys_arch_prctl(int, unsigned long);
78 84
79/* kernel/ioport.c */ 85/* kernel/signal.c */
80asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
81
82/* kernel/signal_64.c */
83asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, 86asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
84 struct pt_regs *); 87 struct pt_regs *);
85long sys_rt_sigreturn(struct pt_regs *);
86 88
87/* kernel/sys_x86_64.c */ 89/* kernel/sys_x86_64.c */
90struct new_utsname;
91
88asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, 92asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
89 unsigned long, unsigned long, unsigned long); 93 unsigned long, unsigned long, unsigned long);
90struct new_utsname;
91asmlinkage long sys_uname(struct new_utsname __user *); 94asmlinkage long sys_uname(struct new_utsname __user *);
92 95
93#endif /* CONFIG_X86_32 */ 96#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
index f72956331c49..c4ee8056baca 100644
--- a/arch/x86/include/asm/termios.h
+++ b/arch/x86/include/asm/termios.h
@@ -67,6 +67,7 @@ static inline int user_termio_to_kernel_termios(struct ktermios *termios,
67 SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); 67 SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
68 SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); 68 SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
69 SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); 69 SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
70 get_user(termios->c_line, &termio->c_line);
70 return copy_from_user(termios->c_cc, termio->c_cc, NCC); 71 return copy_from_user(termios->c_cc, termio->c_cc, NCC);
71} 72}
72 73
diff --git a/arch/x86/include/asm/therm_throt.h b/arch/x86/include/asm/therm_throt.h
deleted file mode 100644
index c62349ee7860..000000000000
--- a/arch/x86/include/asm/therm_throt.h
+++ /dev/null
@@ -1,9 +0,0 @@
1#ifndef _ASM_X86_THERM_THROT_H
2#define _ASM_X86_THERM_THROT_H
3
4#include <asm/atomic.h>
5
6extern atomic_t therm_throt_en;
7int therm_throt_process(int curr);
8
9#endif /* _ASM_X86_THERM_THROT_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8820a73ae090..b0783520988b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -94,7 +94,8 @@ struct thread_info {
94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ 96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
97#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */ 97#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
98#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */
98 99
99#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 100#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
100#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 101#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -116,6 +117,7 @@ struct thread_info {
116#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 117#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
117#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 118#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
118#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) 119#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
120#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
119#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) 121#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE)
120 122
121/* work to do in syscall_trace_enter() */ 123/* work to do in syscall_trace_enter() */
@@ -152,9 +154,9 @@ struct thread_info {
152 154
153/* thread information allocation */ 155/* thread information allocation */
154#ifdef CONFIG_DEBUG_STACK_USAGE 156#ifdef CONFIG_DEBUG_STACK_USAGE
155#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) 157#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
156#else 158#else
157#define THREAD_FLAGS GFP_KERNEL 159#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK)
158#endif 160#endif
159 161
160#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR 162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index bd37ed444a21..20ca9c4d4686 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -45,12 +45,16 @@ extern int no_timer_check;
45 */ 45 */
46 46
47DECLARE_PER_CPU(unsigned long, cyc2ns); 47DECLARE_PER_CPU(unsigned long, cyc2ns);
48DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
48 49
49#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ 50#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
50 51
51static inline unsigned long long __cycles_2_ns(unsigned long long cyc) 52static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
52{ 53{
53 return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR; 54 int cpu = smp_processor_id();
55 unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
56 ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR;
57 return ns;
54} 58}
55 59
56static inline unsigned long long cycles_2_ns(unsigned long long cyc) 60static inline unsigned long long cycles_2_ns(unsigned long long cyc)
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
index b5c9d45c981f..1375cfc93960 100644
--- a/arch/x86/include/asm/timex.h
+++ b/arch/x86/include/asm/timex.h
@@ -4,9 +4,7 @@
4#include <asm/processor.h> 4#include <asm/processor.h>
5#include <asm/tsc.h> 5#include <asm/tsc.h>
6 6
7/* The PIT ticks at this frequency (in HZ): */ 7/* Assume we use the PIT time source for the clock tick */
8#define PIT_TICK_RATE 1193182
9
10#define CLOCK_TICK_RATE PIT_TICK_RATE 8#define CLOCK_TICK_RATE PIT_TICK_RATE
11 9
12#define ARCH_HAS_READ_CURRENT_TIMER 10#define ARCH_HAS_READ_CURRENT_TIMER
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 16a5c84b0329..7f3eba08e7de 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -17,7 +17,7 @@
17 17
18static inline void __native_flush_tlb(void) 18static inline void __native_flush_tlb(void)
19{ 19{
20 write_cr3(read_cr3()); 20 native_write_cr3(native_read_cr3());
21} 21}
22 22
23static inline void __native_flush_tlb_global(void) 23static inline void __native_flush_tlb_global(void)
@@ -32,11 +32,11 @@ static inline void __native_flush_tlb_global(void)
32 */ 32 */
33 raw_local_irq_save(flags); 33 raw_local_irq_save(flags);
34 34
35 cr4 = read_cr4(); 35 cr4 = native_read_cr4();
36 /* clear PGE */ 36 /* clear PGE */
37 write_cr4(cr4 & ~X86_CR4_PGE); 37 native_write_cr4(cr4 & ~X86_CR4_PGE);
38 /* write old PGE again and flush TLBs */ 38 /* write old PGE again and flush TLBs */
39 write_cr4(cr4); 39 native_write_cr4(cr4);
40 40
41 raw_local_irq_restore(flags); 41 raw_local_irq_restore(flags);
42} 42}
@@ -172,6 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start,
172 flush_tlb_all(); 172 flush_tlb_all();
173} 173}
174 174
175extern void zap_low_mappings(void); 175extern void zap_low_mappings(bool early);
176 176
177#endif /* _ASM_X86_TLBFLUSH_H */ 177#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index f44b49abca49..066ef590d7e0 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -203,7 +203,8 @@ struct pci_bus;
203void x86_pci_root_bus_res_quirks(struct pci_bus *b); 203void x86_pci_root_bus_res_quirks(struct pci_bus *b);
204 204
205#ifdef CONFIG_SMP 205#ifdef CONFIG_SMP
206#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids) 206#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
207 (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
207#define smt_capable() (smp_num_siblings > 1) 208#define smt_capable() (smp_num_siblings > 1)
208#endif 209#endif
209 210
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0d5342515b86..bfd74c032fca 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_TRAPS_H 2#define _ASM_X86_TRAPS_H
3 3
4#include <asm/debugreg.h> 4#include <asm/debugreg.h>
5#include <asm/siginfo.h> /* TRAP_TRACE, ... */
5 6
6#ifdef CONFIG_X86_32 7#ifdef CONFIG_X86_32
7#define dotraplinkage 8#define dotraplinkage
@@ -13,6 +14,9 @@ asmlinkage void divide_error(void);
13asmlinkage void debug(void); 14asmlinkage void debug(void);
14asmlinkage void nmi(void); 15asmlinkage void nmi(void);
15asmlinkage void int3(void); 16asmlinkage void int3(void);
17asmlinkage void xen_debug(void);
18asmlinkage void xen_int3(void);
19asmlinkage void xen_stack_segment(void);
16asmlinkage void overflow(void); 20asmlinkage void overflow(void);
17asmlinkage void bounds(void); 21asmlinkage void bounds(void);
18asmlinkage void invalid_op(void); 22asmlinkage void invalid_op(void);
@@ -74,7 +78,6 @@ static inline int get_si_code(unsigned long condition)
74} 78}
75 79
76extern int panic_on_unrecovered_nmi; 80extern int panic_on_unrecovered_nmi;
77extern int kstack_depth_to_print;
78 81
79void math_error(void __user *); 82void math_error(void __user *);
80void math_emulate(struct math_emu_info *); 83void math_emulate(struct math_emu_info *);
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
index e6f736320077..09b97745772f 100644
--- a/arch/x86/include/asm/types.h
+++ b/arch/x86/include/asm/types.h
@@ -14,12 +14,6 @@ typedef unsigned short umode_t;
14 */ 14 */
15#ifdef __KERNEL__ 15#ifdef __KERNEL__
16 16
17#ifdef CONFIG_X86_32
18# define BITS_PER_LONG 32
19#else
20# define BITS_PER_LONG 64
21#endif
22
23#ifndef __ASSEMBLY__ 17#ifndef __ASSEMBLY__
24 18
25typedef u64 dma64_addr_t; 19typedef u64 dma64_addr_t;
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index b685ece89d5c..20e6a795e160 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -25,7 +25,7 @@
25#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) 25#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
26 26
27#define KERNEL_DS MAKE_MM_SEG(-1UL) 27#define KERNEL_DS MAKE_MM_SEG(-1UL)
28#define USER_DS MAKE_MM_SEG(PAGE_OFFSET) 28#define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX)
29 29
30#define get_ds() (KERNEL_DS) 30#define get_ds() (KERNEL_DS)
31#define get_fs() (current_thread_info()->addr_limit) 31#define get_fs() (current_thread_info()->addr_limit)
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74cf8dc..732a30706153 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,8 @@
340#define __NR_inotify_init1 332 340#define __NR_inotify_init1 332
341#define __NR_preadv 333 341#define __NR_preadv 333
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_counter_open 336
343 345
344#ifdef __KERNEL__ 346#ifdef __KERNEL__
345 347
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index f81829462325..900e1617e672 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -657,7 +657,10 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
657__SYSCALL(__NR_preadv, sys_preadv) 657__SYSCALL(__NR_preadv, sys_preadv)
658#define __NR_pwritev 296 658#define __NR_pwritev 296
659__SYSCALL(__NR_pwritev, sys_pwritev) 659__SYSCALL(__NR_pwritev, sys_pwritev)
660 660#define __NR_rt_tgsigqueueinfo 297
661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662#define __NR_perf_counter_open 298
663__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
661 664
662#ifndef __NO_STUBS 665#ifndef __NO_STUBS
663#define __ARCH_WANT_OLD_READDIR 666#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 9b0e61bf7a88..bddd44f2f0ab 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -37,7 +37,7 @@
37#define UV_CPUS_PER_ACT_STATUS 32 37#define UV_CPUS_PER_ACT_STATUS 32
38#define UV_ACT_STATUS_MASK 0x3 38#define UV_ACT_STATUS_MASK 0x3
39#define UV_ACT_STATUS_SIZE 2 39#define UV_ACT_STATUS_SIZE 2
40#define UV_ACTIVATION_DESCRIPTOR_SIZE 32 40#define UV_ADP_SIZE 32
41#define UV_DISTRIBUTION_SIZE 256 41#define UV_DISTRIBUTION_SIZE 256
42#define UV_SW_ACK_NPENDING 8 42#define UV_SW_ACK_NPENDING 8
43#define UV_NET_ENDPOINT_INTD 0x38 43#define UV_NET_ENDPOINT_INTD 0x38
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index d3a98ea1062e..341070f7ad5c 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -133,6 +133,7 @@ struct uv_scir_s {
133struct uv_hub_info_s { 133struct uv_hub_info_s {
134 unsigned long global_mmr_base; 134 unsigned long global_mmr_base;
135 unsigned long gpa_mask; 135 unsigned long gpa_mask;
136 unsigned int gnode_extra;
136 unsigned long gnode_upper; 137 unsigned long gnode_upper;
137 unsigned long lowmem_remap_top; 138 unsigned long lowmem_remap_top;
138 unsigned long lowmem_remap_base; 139 unsigned long lowmem_remap_base;
@@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
159 * p - PNODE (local part of nsids, right shifted 1) 160 * p - PNODE (local part of nsids, right shifted 1)
160 */ 161 */
161#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) 162#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
162#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper) 163#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
164#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1)
163 165
164#define UV_LOCAL_MMR_BASE 0xf4000000UL 166#define UV_LOCAL_MMR_BASE 0xf4000000UL
165#define UV_GLOBAL_MMR32_BASE 0xf8000000UL 167#define UV_GLOBAL_MMR32_BASE 0xf8000000UL
@@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
173#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) 175#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
174 176
175#define UV_GLOBAL_MMR64_PNODE_BITS(p) \ 177#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
176 ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT) 178 ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
177 179
178#define UV_APIC_PNODE_SHIFT 6 180#define UV_APIC_PNODE_SHIFT 6
179 181
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 498f944010b9..11be5ad2e0e9 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -247,6 +247,7 @@ enum vmcs_field {
247#define EXIT_REASON_MSR_READ 31 247#define EXIT_REASON_MSR_READ 31
248#define EXIT_REASON_MSR_WRITE 32 248#define EXIT_REASON_MSR_WRITE 32
249#define EXIT_REASON_MWAIT_INSTRUCTION 36 249#define EXIT_REASON_MWAIT_INSTRUCTION 36
250#define EXIT_REASON_MCE_DURING_VMENTRY 41
250#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 251#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
251#define EXIT_REASON_APIC_ACCESS 44 252#define EXIT_REASON_APIC_ACCESS 44
252#define EXIT_REASON_EPT_VIOLATION 48 253#define EXIT_REASON_EPT_VIOLATION 48
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 11b3bb86e17b..7fcf6f3dbcc3 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,5 +1,10 @@
1#ifdef CONFIG_KMEMCHECK
2/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
3# include <asm-generic/xor.h>
4#else
1#ifdef CONFIG_X86_32 5#ifdef CONFIG_X86_32
2# include "xor_32.h" 6# include "xor_32.h"
3#else 7#else
4# include "xor_64.h" 8# include "xor_64.h"
5#endif 9#endif
10#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 88d1bfc847d3..6c327b852e23 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,11 +24,13 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
24CFLAGS_hpet.o := $(nostackp) 24CFLAGS_hpet.o := $(nostackp)
25CFLAGS_tsc.o := $(nostackp) 25CFLAGS_tsc.o := $(nostackp)
26CFLAGS_paravirt.o := $(nostackp) 26CFLAGS_paravirt.o := $(nostackp)
27GCOV_PROFILE_vsyscall_64.o := n
28GCOV_PROFILE_hpet.o := n
27 29
28obj-y := process_$(BITS).o signal.o entry_$(BITS).o 30obj-y := process_$(BITS).o signal.o entry_$(BITS).o
29obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 31obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
30obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o 32obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
31obj-y += setup.o i8259.o irqinit_$(BITS).o 33obj-y += setup.o i8259.o irqinit.o
32obj-$(CONFIG_X86_VISWS) += visws_quirks.o 34obj-$(CONFIG_X86_VISWS) += visws_quirks.o
33obj-$(CONFIG_X86_32) += probe_roms_32.o 35obj-$(CONFIG_X86_32) += probe_roms_32.o
34obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 36obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -44,6 +46,7 @@ obj-y += process.o
44obj-y += i387.o xsave.o 46obj-y += i387.o xsave.o
45obj-y += ptrace.o 47obj-y += ptrace.o
46obj-$(CONFIG_X86_DS) += ds.o 48obj-$(CONFIG_X86_DS) += ds.o
49obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
47obj-$(CONFIG_X86_32) += tls.o 50obj-$(CONFIG_X86_32) += tls.o
48obj-$(CONFIG_IA32_EMULATION) += tls.o 51obj-$(CONFIG_IA32_EMULATION) += tls.o
49obj-y += step.o 52obj-y += step.o
@@ -72,7 +75,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
72obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 75obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
73obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 76obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
74obj-$(CONFIG_KPROBES) += kprobes.o 77obj-$(CONFIG_KPROBES) += kprobes.o
75obj-$(CONFIG_MODULES) += module_$(BITS).o 78obj-$(CONFIG_MODULES) += module.o
76obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o 79obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
77obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o 80obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
78obj-$(CONFIG_KGDB) += kgdb.o 81obj-$(CONFIG_KGDB) += kgdb.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 723989d7f802..6b8ca3a0285d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -33,6 +33,7 @@
33#include <linux/irq.h> 33#include <linux/irq.h>
34#include <linux/bootmem.h> 34#include <linux/bootmem.h>
35#include <linux/ioport.h> 35#include <linux/ioport.h>
36#include <linux/pci.h>
36 37
37#include <asm/pgtable.h> 38#include <asm/pgtable.h>
38#include <asm/io_apic.h> 39#include <asm/io_apic.h>
@@ -43,11 +44,7 @@
43 44
44static int __initdata acpi_force = 0; 45static int __initdata acpi_force = 0;
45u32 acpi_rsdt_forced; 46u32 acpi_rsdt_forced;
46#ifdef CONFIG_ACPI 47int acpi_disabled;
47int acpi_disabled = 0;
48#else
49int acpi_disabled = 1;
50#endif
51EXPORT_SYMBOL(acpi_disabled); 48EXPORT_SYMBOL(acpi_disabled);
52 49
53#ifdef CONFIG_X86_64 50#ifdef CONFIG_X86_64
@@ -121,72 +118,6 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
121 early_iounmap(map, size); 118 early_iounmap(map, size);
122} 119}
123 120
124#ifdef CONFIG_PCI_MMCONFIG
125
126static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
127
128/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
129struct acpi_mcfg_allocation *pci_mmcfg_config;
130int pci_mmcfg_config_num;
131
132static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
133{
134 if (!strcmp(mcfg->header.oem_id, "SGI"))
135 acpi_mcfg_64bit_base_addr = TRUE;
136
137 return 0;
138}
139
140int __init acpi_parse_mcfg(struct acpi_table_header *header)
141{
142 struct acpi_table_mcfg *mcfg;
143 unsigned long i;
144 int config_size;
145
146 if (!header)
147 return -EINVAL;
148
149 mcfg = (struct acpi_table_mcfg *)header;
150
151 /* how many config structures do we have */
152 pci_mmcfg_config_num = 0;
153 i = header->length - sizeof(struct acpi_table_mcfg);
154 while (i >= sizeof(struct acpi_mcfg_allocation)) {
155 ++pci_mmcfg_config_num;
156 i -= sizeof(struct acpi_mcfg_allocation);
157 };
158 if (pci_mmcfg_config_num == 0) {
159 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
160 return -ENODEV;
161 }
162
163 config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
164 pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
165 if (!pci_mmcfg_config) {
166 printk(KERN_WARNING PREFIX
167 "No memory for MCFG config tables\n");
168 return -ENOMEM;
169 }
170
171 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
172
173 acpi_mcfg_oem_check(mcfg);
174
175 for (i = 0; i < pci_mmcfg_config_num; ++i) {
176 if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
177 !acpi_mcfg_64bit_base_addr) {
178 printk(KERN_ERR PREFIX
179 "MMCONFIG not in low 4GB of memory\n");
180 kfree(pci_mmcfg_config);
181 pci_mmcfg_config_num = 0;
182 return -ENODEV;
183 }
184 }
185
186 return 0;
187}
188#endif /* CONFIG_PCI_MMCONFIG */
189
190#ifdef CONFIG_X86_LOCAL_APIC 121#ifdef CONFIG_X86_LOCAL_APIC
191static int __init acpi_parse_madt(struct acpi_table_header *table) 122static int __init acpi_parse_madt(struct acpi_table_header *table)
192{ 123{
@@ -522,7 +453,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
522 * success: return IRQ number (>=0) 453 * success: return IRQ number (>=0)
523 * failure: return < 0 454 * failure: return < 0
524 */ 455 */
525int acpi_register_gsi(u32 gsi, int triggering, int polarity) 456int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
526{ 457{
527 unsigned int irq; 458 unsigned int irq;
528 unsigned int plat_gsi = gsi; 459 unsigned int plat_gsi = gsi;
@@ -532,14 +463,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
532 * Make sure all (legacy) PCI IRQs are set as level-triggered. 463 * Make sure all (legacy) PCI IRQs are set as level-triggered.
533 */ 464 */
534 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { 465 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
535 if (triggering == ACPI_LEVEL_SENSITIVE) 466 if (trigger == ACPI_LEVEL_SENSITIVE)
536 eisa_set_level_irq(gsi); 467 eisa_set_level_irq(gsi);
537 } 468 }
538#endif 469#endif
539 470
540#ifdef CONFIG_X86_IO_APIC 471#ifdef CONFIG_X86_IO_APIC
541 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { 472 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
542 plat_gsi = mp_register_gsi(gsi, triggering, polarity); 473 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
543 } 474 }
544#endif 475#endif
545 acpi_gsi_to_irq(plat_gsi, &irq); 476 acpi_gsi_to_irq(plat_gsi, &irq);
@@ -903,10 +834,8 @@ extern int es7000_plat;
903#endif 834#endif
904 835
905static struct { 836static struct {
906 int apic_id;
907 int gsi_base; 837 int gsi_base;
908 int gsi_end; 838 int gsi_end;
909 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
910} mp_ioapic_routing[MAX_IO_APICS]; 839} mp_ioapic_routing[MAX_IO_APICS];
911 840
912int mp_find_ioapic(int gsi) 841int mp_find_ioapic(int gsi)
@@ -986,16 +915,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
986 915
987 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 916 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
988 mp_ioapics[idx].apicid = uniq_ioapic_id(id); 917 mp_ioapics[idx].apicid = uniq_ioapic_id(id);
989#ifdef CONFIG_X86_32
990 mp_ioapics[idx].apicver = io_apic_get_version(idx); 918 mp_ioapics[idx].apicver = io_apic_get_version(idx);
991#else 919
992 mp_ioapics[idx].apicver = 0;
993#endif
994 /* 920 /*
995 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 921 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
996 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 922 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
997 */ 923 */
998 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
999 mp_ioapic_routing[idx].gsi_base = gsi_base; 924 mp_ioapic_routing[idx].gsi_base = gsi_base;
1000 mp_ioapic_routing[idx].gsi_end = gsi_base + 925 mp_ioapic_routing[idx].gsi_end = gsi_base +
1001 io_apic_get_redir_entries(idx); 926 io_apic_get_redir_entries(idx);
@@ -1158,26 +1083,52 @@ void __init mp_config_acpi_legacy_irqs(void)
1158 } 1083 }
1159} 1084}
1160 1085
1161int mp_register_gsi(u32 gsi, int triggering, int polarity) 1086static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
1087 int polarity)
1162{ 1088{
1089#ifdef CONFIG_X86_MPPARSE
1090 struct mpc_intsrc mp_irq;
1091 struct pci_dev *pdev;
1092 unsigned char number;
1093 unsigned int devfn;
1163 int ioapic; 1094 int ioapic;
1164 int ioapic_pin; 1095 u8 pin;
1165#ifdef CONFIG_X86_32
1166#define MAX_GSI_NUM 4096
1167#define IRQ_COMPRESSION_START 64
1168 1096
1169 static int pci_irq = IRQ_COMPRESSION_START; 1097 if (!acpi_ioapic)
1170 /* 1098 return 0;
1171 * Mapping between Global System Interrupts, which 1099 if (!dev)
1172 * represent all possible interrupts, and IRQs 1100 return 0;
1173 * assigned to actual devices. 1101 if (dev->bus != &pci_bus_type)
1174 */ 1102 return 0;
1175 static int gsi_to_irq[MAX_GSI_NUM]; 1103
1176#else 1104 pdev = to_pci_dev(dev);
1105 number = pdev->bus->number;
1106 devfn = pdev->devfn;
1107 pin = pdev->pin;
1108 /* print the entry should happen on mptable identically */
1109 mp_irq.type = MP_INTSRC;
1110 mp_irq.irqtype = mp_INT;
1111 mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1112 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1113 mp_irq.srcbus = number;
1114 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1115 ioapic = mp_find_ioapic(gsi);
1116 mp_irq.dstapic = mp_ioapics[ioapic].apicid;
1117 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1118
1119 save_mp_irq(&mp_irq);
1120#endif
1121 return 0;
1122}
1123
1124int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1125{
1126 int ioapic;
1127 int ioapic_pin;
1128 struct io_apic_irq_attr irq_attr;
1177 1129
1178 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) 1130 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
1179 return gsi; 1131 return gsi;
1180#endif
1181 1132
1182 /* Don't set up the ACPI SCI because it's already set up */ 1133 /* Don't set up the ACPI SCI because it's already set up */
1183 if (acpi_gbl_FADT.sci_interrupt == gsi) 1134 if (acpi_gbl_FADT.sci_interrupt == gsi)
@@ -1196,93 +1147,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
1196 gsi = ioapic_renumber_irq(ioapic, gsi); 1147 gsi = ioapic_renumber_irq(ioapic, gsi);
1197#endif 1148#endif
1198 1149
1199 /*
1200 * Avoid pin reprogramming. PRTs typically include entries
1201 * with redundant pin->gsi mappings (but unique PCI devices);
1202 * we only program the IOAPIC on the first.
1203 */
1204 if (ioapic_pin > MP_MAX_IOAPIC_PIN) { 1150 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1205 printk(KERN_ERR "Invalid reference to IOAPIC pin " 1151 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1206 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 1152 "%d-%d\n", mp_ioapics[ioapic].apicid,
1207 ioapic_pin); 1153 ioapic_pin);
1208 return gsi; 1154 return gsi;
1209 } 1155 }
1210 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
1211 pr_debug("Pin %d-%d already programmed\n",
1212 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1213#ifdef CONFIG_X86_32
1214 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
1215#else
1216 return gsi;
1217#endif
1218 }
1219 1156
1220 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); 1157 if (enable_update_mptable)
1221#ifdef CONFIG_X86_32 1158 mp_config_acpi_gsi(dev, gsi, trigger, polarity);
1222 /*
1223 * For GSI >= 64, use IRQ compression
1224 */
1225 if ((gsi >= IRQ_COMPRESSION_START)
1226 && (triggering == ACPI_LEVEL_SENSITIVE)) {
1227 /*
1228 * For PCI devices assign IRQs in order, avoiding gaps
1229 * due to unused I/O APIC pins.
1230 */
1231 int irq = gsi;
1232 if (gsi < MAX_GSI_NUM) {
1233 /*
1234 * Retain the VIA chipset work-around (gsi > 15), but
1235 * avoid a problem where the 8254 timer (IRQ0) is setup
1236 * via an override (so it's not on pin 0 of the ioapic),
1237 * and at the same time, the pin 0 interrupt is a PCI
1238 * type. The gsi > 15 test could cause these two pins
1239 * to be shared as IRQ0, and they are not shareable.
1240 * So test for this condition, and if necessary, avoid
1241 * the pin collision.
1242 */
1243 gsi = pci_irq++;
1244 /*
1245 * Don't assign IRQ used by ACPI SCI
1246 */
1247 if (gsi == acpi_gbl_FADT.sci_interrupt)
1248 gsi = pci_irq++;
1249 gsi_to_irq[irq] = gsi;
1250 } else {
1251 printk(KERN_ERR "GSI %u is too high\n", gsi);
1252 return gsi;
1253 }
1254 }
1255#endif
1256 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
1257 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
1258 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1259 return gsi;
1260}
1261 1159
1262int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, 1160 set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
1263 u32 gsi, int triggering, int polarity) 1161 trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
1264{ 1162 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1265#ifdef CONFIG_X86_MPPARSE 1163 io_apic_set_pci_routing(dev, gsi, &irq_attr);
1266 struct mpc_intsrc mp_irq;
1267 int ioapic;
1268
1269 if (!acpi_ioapic)
1270 return 0;
1271 1164
1272 /* print the entry should happen on mptable identically */ 1165 return gsi;
1273 mp_irq.type = MP_INTSRC;
1274 mp_irq.irqtype = mp_INT;
1275 mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1276 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1277 mp_irq.srcbus = number;
1278 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1279 ioapic = mp_find_ioapic(gsi);
1280 mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
1281 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1282
1283 save_mp_irq(&mp_irq);
1284#endif
1285 return 0;
1286} 1166}
1287 1167
1288/* 1168/*
@@ -1569,14 +1449,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1569 }, 1449 },
1570 { 1450 {
1571 .callback = force_acpi_ht, 1451 .callback = force_acpi_ht,
1572 .ident = "ASUS P4B266",
1573 .matches = {
1574 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1575 DMI_MATCH(DMI_BOARD_NAME, "P4B266"),
1576 },
1577 },
1578 {
1579 .callback = force_acpi_ht,
1580 .ident = "ASUS P2B-DS", 1452 .ident = "ASUS P2B-DS",
1581 .matches = { 1453 .matches = {
1582 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), 1454 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index bbbe4bbb6f34..8c44c232efcb 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -34,12 +34,22 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
34 flags->bm_check = 1; 34 flags->bm_check = 1;
35 else if (c->x86_vendor == X86_VENDOR_INTEL) { 35 else if (c->x86_vendor == X86_VENDOR_INTEL) {
36 /* 36 /*
37 * Today all CPUs that support C3 share cache. 37 * Today all MP CPUs that support C3 share cache.
38 * TBD: This needs to look at cache shared map, once 38 * And caches should not be flushed by software while
39 * multi-core detection patch makes to the base. 39 * entering C3 type state.
40 */ 40 */
41 flags->bm_check = 1; 41 flags->bm_check = 1;
42 } 42 }
43
44 /*
45 * On all recent Intel platforms, ARB_DISABLE is a nop.
46 * So, set bm_control to zero to indicate that ARB_DISABLE
47 * is not required while entering C3 type state on
48 * P4, Core and beyond CPUs
49 */
50 if (c->x86_vendor == X86_VENDOR_INTEL &&
51 (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14)))
52 flags->bm_control = 0;
43} 53}
44EXPORT_SYMBOL(acpi_processor_power_init_bm_check); 54EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
45 55
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index 7c074eec39fb..d296f4a195c9 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -72,6 +72,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
72 return; 72 return;
73} 73}
74 74
75
75/* Initialize _PDC data based on the CPU vendor */ 76/* Initialize _PDC data based on the CPU vendor */
76void arch_acpi_processor_init_pdc(struct acpi_processor *pr) 77void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
77{ 78{
@@ -85,3 +86,15 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
85} 86}
86 87
87EXPORT_SYMBOL(arch_acpi_processor_init_pdc); 88EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
89
90void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
91{
92 if (pr->pdc) {
93 kfree(pr->pdc->pointer->buffer.pointer);
94 kfree(pr->pdc->pointer);
95 kfree(pr->pdc);
96 pr->pdc = NULL;
97 }
98}
99
100EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc);
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 1c31cc0e9def..6a564ac67ef5 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -9,7 +9,7 @@
9always := wakeup.bin 9always := wakeup.bin
10targets := wakeup.elf wakeup.lds 10targets := wakeup.elf wakeup.lds
11 11
12wakeup-y += wakeup.o wakemain.o video-mode.o copy.o 12wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
13 13
14# The link order of the video-*.o modules can matter. In particular, 14# The link order of the video-*.o modules can matter. In particular,
15# video-vga.o *must* be listed first, followed by video-vesa.o. 15# video-vga.o *must* be listed first, followed by video-vesa.o.
@@ -42,6 +42,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
42 $(call cc-option, -mpreferred-stack-boundary=2) 42 $(call cc-option, -mpreferred-stack-boundary=2)
43KBUILD_CFLAGS += $(call cc-option, -m32) 43KBUILD_CFLAGS += $(call cc-option, -m32)
44KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 44KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
45GCOV_PROFILE := n
45 46
46WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) 47WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
47 48
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
new file mode 100644
index 000000000000..f51eb0bb56ce
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/bioscall.S
@@ -0,0 +1 @@
#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
new file mode 100644
index 000000000000..6206033ba202
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/regs.c
@@ -0,0 +1 @@
#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 7c243a2c5115..ca93638ba430 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -104,7 +104,7 @@ int acpi_save_state_mem(void)
104 initial_gs = per_cpu_offset(smp_processor_id()); 104 initial_gs = per_cpu_offset(smp_processor_id());
105#endif 105#endif
106 initial_code = (unsigned long)wakeup_long64; 106 initial_code = (unsigned long)wakeup_long64;
107 saved_magic = 0x123456789abcdef0; 107 saved_magic = 0x123456789abcdef0L;
108#endif /* CONFIG_64BIT */ 108#endif /* CONFIG_64BIT */
109 109
110 return 0; 110 return 0;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99dad52..9372f0406ad4 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,7 +55,16 @@ struct iommu_cmd {
55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
56 struct unity_map_entry *e); 56 struct unity_map_entry *e);
57static struct dma_ops_domain *find_protection_domain(u16 devid); 57static struct dma_ops_domain *find_protection_domain(u16 devid);
58static u64* alloc_pte(struct protection_domain *dom,
59 unsigned long address, u64
60 **pte_page, gfp_t gfp);
61static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
62 unsigned long start_page,
63 unsigned int pages);
58 64
65#ifndef BUS_NOTIFY_UNBOUND_DRIVER
66#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
67#endif
59 68
60#ifdef CONFIG_AMD_IOMMU_STATS 69#ifdef CONFIG_AMD_IOMMU_STATS
61 70
@@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
213{ 222{
214 struct amd_iommu *iommu; 223 struct amd_iommu *iommu;
215 224
216 list_for_each_entry(iommu, &amd_iommu_list, list) 225 for_each_iommu(iommu)
217 iommu_poll_events(iommu); 226 iommu_poll_events(iommu);
218 227
219 return IRQ_HANDLED; 228 return IRQ_HANDLED;
@@ -425,6 +434,16 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
425 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); 434 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
426} 435}
427 436
437/* Flush the whole IO/TLB for a given protection domain - including PDE */
438static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
439{
440 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
441
442 INC_STATS_COUNTER(domain_flush_single);
443
444 iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
445}
446
428/* 447/*
429 * This function is used to flush the IO/TLB for a given protection domain 448 * This function is used to flush the IO/TLB for a given protection domain
430 * on every IOMMU in the system 449 * on every IOMMU in the system
@@ -440,7 +459,7 @@ static void iommu_flush_domain(u16 domid)
440 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 459 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
441 domid, 1, 1); 460 domid, 1, 1);
442 461
443 list_for_each_entry(iommu, &amd_iommu_list, list) { 462 for_each_iommu(iommu) {
444 spin_lock_irqsave(&iommu->lock, flags); 463 spin_lock_irqsave(&iommu->lock, flags);
445 __iommu_queue_command(iommu, &cmd); 464 __iommu_queue_command(iommu, &cmd);
446 __iommu_completion_wait(iommu); 465 __iommu_completion_wait(iommu);
@@ -449,6 +468,35 @@ static void iommu_flush_domain(u16 domid)
449 } 468 }
450} 469}
451 470
471void amd_iommu_flush_all_domains(void)
472{
473 int i;
474
475 for (i = 1; i < MAX_DOMAIN_ID; ++i) {
476 if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
477 continue;
478 iommu_flush_domain(i);
479 }
480}
481
482void amd_iommu_flush_all_devices(void)
483{
484 struct amd_iommu *iommu;
485 int i;
486
487 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
488 if (amd_iommu_pd_table[i] == NULL)
489 continue;
490
491 iommu = amd_iommu_rlookup_table[i];
492 if (!iommu)
493 continue;
494
495 iommu_queue_inv_dev_entry(iommu, i);
496 iommu_completion_wait(iommu);
497 }
498}
499
452/**************************************************************************** 500/****************************************************************************
453 * 501 *
454 * The functions below are used the create the page table mappings for 502 * The functions below are used the create the page table mappings for
@@ -468,7 +516,7 @@ static int iommu_map_page(struct protection_domain *dom,
468 unsigned long phys_addr, 516 unsigned long phys_addr,
469 int prot) 517 int prot)
470{ 518{
471 u64 __pte, *pte, *page; 519 u64 __pte, *pte;
472 520
473 bus_addr = PAGE_ALIGN(bus_addr); 521 bus_addr = PAGE_ALIGN(bus_addr);
474 phys_addr = PAGE_ALIGN(phys_addr); 522 phys_addr = PAGE_ALIGN(phys_addr);
@@ -477,27 +525,7 @@ static int iommu_map_page(struct protection_domain *dom,
477 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) 525 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
478 return -EINVAL; 526 return -EINVAL;
479 527
480 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; 528 pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
481
482 if (!IOMMU_PTE_PRESENT(*pte)) {
483 page = (u64 *)get_zeroed_page(GFP_KERNEL);
484 if (!page)
485 return -ENOMEM;
486 *pte = IOMMU_L2_PDE(virt_to_phys(page));
487 }
488
489 pte = IOMMU_PTE_PAGE(*pte);
490 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
491
492 if (!IOMMU_PTE_PRESENT(*pte)) {
493 page = (u64 *)get_zeroed_page(GFP_KERNEL);
494 if (!page)
495 return -ENOMEM;
496 *pte = IOMMU_L1_PDE(virt_to_phys(page));
497 }
498
499 pte = IOMMU_PTE_PAGE(*pte);
500 pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
501 529
502 if (IOMMU_PTE_PRESENT(*pte)) 530 if (IOMMU_PTE_PRESENT(*pte))
503 return -EBUSY; 531 return -EBUSY;
@@ -595,7 +623,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
595 * as allocated in the aperture 623 * as allocated in the aperture
596 */ 624 */
597 if (addr < dma_dom->aperture_size) 625 if (addr < dma_dom->aperture_size)
598 __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); 626 __set_bit(addr >> PAGE_SHIFT,
627 dma_dom->aperture[0]->bitmap);
599 } 628 }
600 629
601 return 0; 630 return 0;
@@ -632,42 +661,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
632 ****************************************************************************/ 661 ****************************************************************************/
633 662
634/* 663/*
635 * The address allocator core function. 664 * The address allocator core functions.
636 * 665 *
637 * called with domain->lock held 666 * called with domain->lock held
638 */ 667 */
668
669/*
670 * This function checks if there is a PTE for a given dma address. If
671 * there is one, it returns the pointer to it.
672 */
673static u64* fetch_pte(struct protection_domain *domain,
674 unsigned long address)
675{
676 u64 *pte;
677
678 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
679
680 if (!IOMMU_PTE_PRESENT(*pte))
681 return NULL;
682
683 pte = IOMMU_PTE_PAGE(*pte);
684 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
685
686 if (!IOMMU_PTE_PRESENT(*pte))
687 return NULL;
688
689 pte = IOMMU_PTE_PAGE(*pte);
690 pte = &pte[IOMMU_PTE_L0_INDEX(address)];
691
692 return pte;
693}
694
695/*
696 * This function is used to add a new aperture range to an existing
697 * aperture in case of dma_ops domain allocation or address allocation
698 * failure.
699 */
700static int alloc_new_range(struct amd_iommu *iommu,
701 struct dma_ops_domain *dma_dom,
702 bool populate, gfp_t gfp)
703{
704 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
705 int i;
706
707#ifdef CONFIG_IOMMU_STRESS
708 populate = false;
709#endif
710
711 if (index >= APERTURE_MAX_RANGES)
712 return -ENOMEM;
713
714 dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
715 if (!dma_dom->aperture[index])
716 return -ENOMEM;
717
718 dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
719 if (!dma_dom->aperture[index]->bitmap)
720 goto out_free;
721
722 dma_dom->aperture[index]->offset = dma_dom->aperture_size;
723
724 if (populate) {
725 unsigned long address = dma_dom->aperture_size;
726 int i, num_ptes = APERTURE_RANGE_PAGES / 512;
727 u64 *pte, *pte_page;
728
729 for (i = 0; i < num_ptes; ++i) {
730 pte = alloc_pte(&dma_dom->domain, address,
731 &pte_page, gfp);
732 if (!pte)
733 goto out_free;
734
735 dma_dom->aperture[index]->pte_pages[i] = pte_page;
736
737 address += APERTURE_RANGE_SIZE / 64;
738 }
739 }
740
741 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
742
743 /* Intialize the exclusion range if necessary */
744 if (iommu->exclusion_start &&
745 iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
746 iommu->exclusion_start < dma_dom->aperture_size) {
747 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
748 int pages = iommu_num_pages(iommu->exclusion_start,
749 iommu->exclusion_length,
750 PAGE_SIZE);
751 dma_ops_reserve_addresses(dma_dom, startpage, pages);
752 }
753
754 /*
755 * Check for areas already mapped as present in the new aperture
756 * range and mark those pages as reserved in the allocator. Such
757 * mappings may already exist as a result of requested unity
758 * mappings for devices.
759 */
760 for (i = dma_dom->aperture[index]->offset;
761 i < dma_dom->aperture_size;
762 i += PAGE_SIZE) {
763 u64 *pte = fetch_pte(&dma_dom->domain, i);
764 if (!pte || !IOMMU_PTE_PRESENT(*pte))
765 continue;
766
767 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
768 }
769
770 return 0;
771
772out_free:
773 free_page((unsigned long)dma_dom->aperture[index]->bitmap);
774
775 kfree(dma_dom->aperture[index]);
776 dma_dom->aperture[index] = NULL;
777
778 return -ENOMEM;
779}
780
781static unsigned long dma_ops_area_alloc(struct device *dev,
782 struct dma_ops_domain *dom,
783 unsigned int pages,
784 unsigned long align_mask,
785 u64 dma_mask,
786 unsigned long start)
787{
788 unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
789 int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
790 int i = start >> APERTURE_RANGE_SHIFT;
791 unsigned long boundary_size;
792 unsigned long address = -1;
793 unsigned long limit;
794
795 next_bit >>= PAGE_SHIFT;
796
797 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
798 PAGE_SIZE) >> PAGE_SHIFT;
799
800 for (;i < max_index; ++i) {
801 unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
802
803 if (dom->aperture[i]->offset >= dma_mask)
804 break;
805
806 limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
807 dma_mask >> PAGE_SHIFT);
808
809 address = iommu_area_alloc(dom->aperture[i]->bitmap,
810 limit, next_bit, pages, 0,
811 boundary_size, align_mask);
812 if (address != -1) {
813 address = dom->aperture[i]->offset +
814 (address << PAGE_SHIFT);
815 dom->next_address = address + (pages << PAGE_SHIFT);
816 break;
817 }
818
819 next_bit = 0;
820 }
821
822 return address;
823}
824
639static unsigned long dma_ops_alloc_addresses(struct device *dev, 825static unsigned long dma_ops_alloc_addresses(struct device *dev,
640 struct dma_ops_domain *dom, 826 struct dma_ops_domain *dom,
641 unsigned int pages, 827 unsigned int pages,
642 unsigned long align_mask, 828 unsigned long align_mask,
643 u64 dma_mask) 829 u64 dma_mask)
644{ 830{
645 unsigned long limit;
646 unsigned long address; 831 unsigned long address;
647 unsigned long boundary_size;
648 832
649 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 833#ifdef CONFIG_IOMMU_STRESS
650 PAGE_SIZE) >> PAGE_SHIFT; 834 dom->next_address = 0;
651 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, 835 dom->need_flush = true;
652 dma_mask >> PAGE_SHIFT); 836#endif
653 837
654 if (dom->next_bit >= limit) { 838 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
655 dom->next_bit = 0; 839 dma_mask, dom->next_address);
656 dom->need_flush = true;
657 }
658 840
659 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
660 0 , boundary_size, align_mask);
661 if (address == -1) { 841 if (address == -1) {
662 address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 842 dom->next_address = 0;
663 0, boundary_size, align_mask); 843 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
844 dma_mask, 0);
664 dom->need_flush = true; 845 dom->need_flush = true;
665 } 846 }
666 847
667 if (likely(address != -1)) { 848 if (unlikely(address == -1))
668 dom->next_bit = address + pages;
669 address <<= PAGE_SHIFT;
670 } else
671 address = bad_dma_address; 849 address = bad_dma_address;
672 850
673 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 851 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -684,11 +862,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
684 unsigned long address, 862 unsigned long address,
685 unsigned int pages) 863 unsigned int pages)
686{ 864{
687 address >>= PAGE_SHIFT; 865 unsigned i = address >> APERTURE_RANGE_SHIFT;
688 iommu_area_free(dom->bitmap, address, pages); 866 struct aperture_range *range = dom->aperture[i];
867
868 BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
869
870#ifdef CONFIG_IOMMU_STRESS
871 if (i < 4)
872 return;
873#endif
689 874
690 if (address >= dom->next_bit) 875 if (address >= dom->next_address)
691 dom->need_flush = true; 876 dom->need_flush = true;
877
878 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
879
880 iommu_area_free(range->bitmap, address, pages);
881
692} 882}
693 883
694/**************************************************************************** 884/****************************************************************************
@@ -736,12 +926,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
736 unsigned long start_page, 926 unsigned long start_page,
737 unsigned int pages) 927 unsigned int pages)
738{ 928{
739 unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; 929 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
740 930
741 if (start_page + pages > last_page) 931 if (start_page + pages > last_page)
742 pages = last_page - start_page; 932 pages = last_page - start_page;
743 933
744 iommu_area_reserve(dom->bitmap, start_page, pages); 934 for (i = start_page; i < start_page + pages; ++i) {
935 int index = i / APERTURE_RANGE_PAGES;
936 int page = i % APERTURE_RANGE_PAGES;
937 __set_bit(page, dom->aperture[index]->bitmap);
938 }
745} 939}
746 940
747static void free_pagetable(struct protection_domain *domain) 941static void free_pagetable(struct protection_domain *domain)
@@ -780,14 +974,19 @@ static void free_pagetable(struct protection_domain *domain)
780 */ 974 */
781static void dma_ops_domain_free(struct dma_ops_domain *dom) 975static void dma_ops_domain_free(struct dma_ops_domain *dom)
782{ 976{
977 int i;
978
783 if (!dom) 979 if (!dom)
784 return; 980 return;
785 981
786 free_pagetable(&dom->domain); 982 free_pagetable(&dom->domain);
787 983
788 kfree(dom->pte_pages); 984 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
789 985 if (!dom->aperture[i])
790 kfree(dom->bitmap); 986 continue;
987 free_page((unsigned long)dom->aperture[i]->bitmap);
988 kfree(dom->aperture[i]);
989 }
791 990
792 kfree(dom); 991 kfree(dom);
793} 992}
@@ -797,19 +996,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
797 * It also intializes the page table and the address allocator data 996 * It also intializes the page table and the address allocator data
798 * structures required for the dma_ops interface 997 * structures required for the dma_ops interface
799 */ 998 */
800static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, 999static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
801 unsigned order)
802{ 1000{
803 struct dma_ops_domain *dma_dom; 1001 struct dma_ops_domain *dma_dom;
804 unsigned i, num_pte_pages;
805 u64 *l2_pde;
806 u64 address;
807
808 /*
809 * Currently the DMA aperture must be between 32 MB and 1GB in size
810 */
811 if ((order < 25) || (order > 30))
812 return NULL;
813 1002
814 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); 1003 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
815 if (!dma_dom) 1004 if (!dma_dom)
@@ -826,55 +1015,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
826 dma_dom->domain.priv = dma_dom; 1015 dma_dom->domain.priv = dma_dom;
827 if (!dma_dom->domain.pt_root) 1016 if (!dma_dom->domain.pt_root)
828 goto free_dma_dom; 1017 goto free_dma_dom;
829 dma_dom->aperture_size = (1ULL << order);
830 dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
831 GFP_KERNEL);
832 if (!dma_dom->bitmap)
833 goto free_dma_dom;
834 /*
835 * mark the first page as allocated so we never return 0 as
836 * a valid dma-address. So we can use 0 as error value
837 */
838 dma_dom->bitmap[0] = 1;
839 dma_dom->next_bit = 0;
840 1018
841 dma_dom->need_flush = false; 1019 dma_dom->need_flush = false;
842 dma_dom->target_dev = 0xffff; 1020 dma_dom->target_dev = 0xffff;
843 1021
844 /* Intialize the exclusion range if necessary */ 1022 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
845 if (iommu->exclusion_start && 1023 goto free_dma_dom;
846 iommu->exclusion_start < dma_dom->aperture_size) {
847 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
848 int pages = iommu_num_pages(iommu->exclusion_start,
849 iommu->exclusion_length,
850 PAGE_SIZE);
851 dma_ops_reserve_addresses(dma_dom, startpage, pages);
852 }
853 1024
854 /* 1025 /*
855 * At the last step, build the page tables so we don't need to 1026 * mark the first page as allocated so we never return 0 as
856 * allocate page table pages in the dma_ops mapping/unmapping 1027 * a valid dma-address. So we can use 0 as error value
857 * path.
858 */ 1028 */
859 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); 1029 dma_dom->aperture[0]->bitmap[0] = 1;
860 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), 1030 dma_dom->next_address = 0;
861 GFP_KERNEL);
862 if (!dma_dom->pte_pages)
863 goto free_dma_dom;
864
865 l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
866 if (l2_pde == NULL)
867 goto free_dma_dom;
868 1031
869 dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
870
871 for (i = 0; i < num_pte_pages; ++i) {
872 dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
873 if (!dma_dom->pte_pages[i])
874 goto free_dma_dom;
875 address = virt_to_phys(dma_dom->pte_pages[i]);
876 l2_pde[i] = IOMMU_L1_PDE(address);
877 }
878 1032
879 return dma_dom; 1033 return dma_dom;
880 1034
@@ -934,7 +1088,13 @@ static void attach_device(struct amd_iommu *iommu,
934 amd_iommu_pd_table[devid] = domain; 1088 amd_iommu_pd_table[devid] = domain;
935 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1089 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
936 1090
1091 /*
1092 * We might boot into a crash-kernel here. The crashed kernel
1093 * left the caches in the IOMMU dirty. So we have to flush
1094 * here to evict all dirty stuff.
1095 */
937 iommu_queue_inv_dev_entry(iommu, devid); 1096 iommu_queue_inv_dev_entry(iommu, devid);
1097 iommu_flush_tlb_pde(iommu, domain->id);
938} 1098}
939 1099
940/* 1100/*
@@ -983,7 +1143,6 @@ static int device_change_notifier(struct notifier_block *nb,
983 struct protection_domain *domain; 1143 struct protection_domain *domain;
984 struct dma_ops_domain *dma_domain; 1144 struct dma_ops_domain *dma_domain;
985 struct amd_iommu *iommu; 1145 struct amd_iommu *iommu;
986 int order = amd_iommu_aperture_order;
987 unsigned long flags; 1146 unsigned long flags;
988 1147
989 if (devid > amd_iommu_last_bdf) 1148 if (devid > amd_iommu_last_bdf)
@@ -1002,17 +1161,7 @@ static int device_change_notifier(struct notifier_block *nb,
1002 "to a non-dma-ops domain\n", dev_name(dev)); 1161 "to a non-dma-ops domain\n", dev_name(dev));
1003 1162
1004 switch (action) { 1163 switch (action) {
1005 case BUS_NOTIFY_BOUND_DRIVER: 1164 case BUS_NOTIFY_UNBOUND_DRIVER:
1006 if (domain)
1007 goto out;
1008 dma_domain = find_protection_domain(devid);
1009 if (!dma_domain)
1010 dma_domain = iommu->default_dom;
1011 attach_device(iommu, &dma_domain->domain, devid);
1012 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
1013 "device %s\n", dma_domain->domain.id, dev_name(dev));
1014 break;
1015 case BUS_NOTIFY_UNBIND_DRIVER:
1016 if (!domain) 1165 if (!domain)
1017 goto out; 1166 goto out;
1018 detach_device(domain, devid); 1167 detach_device(domain, devid);
@@ -1022,7 +1171,7 @@ static int device_change_notifier(struct notifier_block *nb,
1022 dma_domain = find_protection_domain(devid); 1171 dma_domain = find_protection_domain(devid);
1023 if (dma_domain) 1172 if (dma_domain)
1024 goto out; 1173 goto out;
1025 dma_domain = dma_ops_domain_alloc(iommu, order); 1174 dma_domain = dma_ops_domain_alloc(iommu);
1026 if (!dma_domain) 1175 if (!dma_domain)
1027 goto out; 1176 goto out;
1028 dma_domain->target_dev = devid; 1177 dma_domain->target_dev = devid;
@@ -1133,8 +1282,8 @@ static int get_device_resources(struct device *dev,
1133 dma_dom = (*iommu)->default_dom; 1282 dma_dom = (*iommu)->default_dom;
1134 *domain = &dma_dom->domain; 1283 *domain = &dma_dom->domain;
1135 attach_device(*iommu, *domain, *bdf); 1284 attach_device(*iommu, *domain, *bdf);
1136 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 1285 DUMP_printk("Using protection domain %d for device %s\n",
1137 "device %s\n", (*domain)->id, dev_name(dev)); 1286 (*domain)->id, dev_name(dev));
1138 } 1287 }
1139 1288
1140 if (domain_for_device(_bdf) == NULL) 1289 if (domain_for_device(_bdf) == NULL)
@@ -1144,6 +1293,66 @@ static int get_device_resources(struct device *dev,
1144} 1293}
1145 1294
1146/* 1295/*
1296 * If the pte_page is not yet allocated this function is called
1297 */
1298static u64* alloc_pte(struct protection_domain *dom,
1299 unsigned long address, u64 **pte_page, gfp_t gfp)
1300{
1301 u64 *pte, *page;
1302
1303 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
1304
1305 if (!IOMMU_PTE_PRESENT(*pte)) {
1306 page = (u64 *)get_zeroed_page(gfp);
1307 if (!page)
1308 return NULL;
1309 *pte = IOMMU_L2_PDE(virt_to_phys(page));
1310 }
1311
1312 pte = IOMMU_PTE_PAGE(*pte);
1313 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
1314
1315 if (!IOMMU_PTE_PRESENT(*pte)) {
1316 page = (u64 *)get_zeroed_page(gfp);
1317 if (!page)
1318 return NULL;
1319 *pte = IOMMU_L1_PDE(virt_to_phys(page));
1320 }
1321
1322 pte = IOMMU_PTE_PAGE(*pte);
1323
1324 if (pte_page)
1325 *pte_page = pte;
1326
1327 pte = &pte[IOMMU_PTE_L0_INDEX(address)];
1328
1329 return pte;
1330}
1331
1332/*
1333 * This function fetches the PTE for a given address in the aperture
1334 */
1335static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1336 unsigned long address)
1337{
1338 struct aperture_range *aperture;
1339 u64 *pte, *pte_page;
1340
1341 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1342 if (!aperture)
1343 return NULL;
1344
1345 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1346 if (!pte) {
1347 pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
1348 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1349 } else
1350 pte += IOMMU_PTE_L0_INDEX(address);
1351
1352 return pte;
1353}
1354
1355/*
1147 * This is the generic map function. It maps one 4kb page at paddr to 1356 * This is the generic map function. It maps one 4kb page at paddr to
1148 * the given address in the DMA address space for the domain. 1357 * the given address in the DMA address space for the domain.
1149 */ 1358 */
@@ -1159,8 +1368,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1159 1368
1160 paddr &= PAGE_MASK; 1369 paddr &= PAGE_MASK;
1161 1370
1162 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; 1371 pte = dma_ops_get_pte(dom, address);
1163 pte += IOMMU_PTE_L0_INDEX(address); 1372 if (!pte)
1373 return bad_dma_address;
1164 1374
1165 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1375 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1166 1376
@@ -1185,14 +1395,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1185 struct dma_ops_domain *dom, 1395 struct dma_ops_domain *dom,
1186 unsigned long address) 1396 unsigned long address)
1187{ 1397{
1398 struct aperture_range *aperture;
1188 u64 *pte; 1399 u64 *pte;
1189 1400
1190 if (address >= dom->aperture_size) 1401 if (address >= dom->aperture_size)
1191 return; 1402 return;
1192 1403
1193 WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); 1404 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1405 if (!aperture)
1406 return;
1407
1408 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1409 if (!pte)
1410 return;
1194 1411
1195 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
1196 pte += IOMMU_PTE_L0_INDEX(address); 1412 pte += IOMMU_PTE_L0_INDEX(address);
1197 1413
1198 WARN_ON(!*pte); 1414 WARN_ON(!*pte);
@@ -1216,7 +1432,7 @@ static dma_addr_t __map_single(struct device *dev,
1216 u64 dma_mask) 1432 u64 dma_mask)
1217{ 1433{
1218 dma_addr_t offset = paddr & ~PAGE_MASK; 1434 dma_addr_t offset = paddr & ~PAGE_MASK;
1219 dma_addr_t address, start; 1435 dma_addr_t address, start, ret;
1220 unsigned int pages; 1436 unsigned int pages;
1221 unsigned long align_mask = 0; 1437 unsigned long align_mask = 0;
1222 int i; 1438 int i;
@@ -1232,14 +1448,33 @@ static dma_addr_t __map_single(struct device *dev,
1232 if (align) 1448 if (align)
1233 align_mask = (1UL << get_order(size)) - 1; 1449 align_mask = (1UL << get_order(size)) - 1;
1234 1450
1451retry:
1235 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1452 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1236 dma_mask); 1453 dma_mask);
1237 if (unlikely(address == bad_dma_address)) 1454 if (unlikely(address == bad_dma_address)) {
1238 goto out; 1455 /*
1456 * setting next_address here will let the address
1457 * allocator only scan the new allocated range in the
1458 * first run. This is a small optimization.
1459 */
1460 dma_dom->next_address = dma_dom->aperture_size;
1461
1462 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
1463 goto out;
1464
1465 /*
1466 * aperture was sucessfully enlarged by 128 MB, try
1467 * allocation again
1468 */
1469 goto retry;
1470 }
1239 1471
1240 start = address; 1472 start = address;
1241 for (i = 0; i < pages; ++i) { 1473 for (i = 0; i < pages; ++i) {
1242 dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1474 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
1475 if (ret == bad_dma_address)
1476 goto out_unmap;
1477
1243 paddr += PAGE_SIZE; 1478 paddr += PAGE_SIZE;
1244 start += PAGE_SIZE; 1479 start += PAGE_SIZE;
1245 } 1480 }
@@ -1255,6 +1490,17 @@ static dma_addr_t __map_single(struct device *dev,
1255 1490
1256out: 1491out:
1257 return address; 1492 return address;
1493
1494out_unmap:
1495
1496 for (--i; i >= 0; --i) {
1497 start -= PAGE_SIZE;
1498 dma_ops_domain_unmap(iommu, dma_dom, start);
1499 }
1500
1501 dma_ops_free_addresses(dma_dom, address, pages);
1502
1503 return bad_dma_address;
1258} 1504}
1259 1505
1260/* 1506/*
@@ -1537,8 +1783,10 @@ static void *alloc_coherent(struct device *dev, size_t size,
1537 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1783 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1538 size, DMA_BIDIRECTIONAL, true, dma_mask); 1784 size, DMA_BIDIRECTIONAL, true, dma_mask);
1539 1785
1540 if (*dma_addr == bad_dma_address) 1786 if (*dma_addr == bad_dma_address) {
1787 spin_unlock_irqrestore(&domain->lock, flags);
1541 goto out_free; 1788 goto out_free;
1789 }
1542 1790
1543 iommu_completion_wait(iommu); 1791 iommu_completion_wait(iommu);
1544 1792
@@ -1625,7 +1873,6 @@ static void prealloc_protection_domains(void)
1625 struct pci_dev *dev = NULL; 1873 struct pci_dev *dev = NULL;
1626 struct dma_ops_domain *dma_dom; 1874 struct dma_ops_domain *dma_dom;
1627 struct amd_iommu *iommu; 1875 struct amd_iommu *iommu;
1628 int order = amd_iommu_aperture_order;
1629 u16 devid; 1876 u16 devid;
1630 1877
1631 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1878 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -1638,7 +1885,7 @@ static void prealloc_protection_domains(void)
1638 iommu = amd_iommu_rlookup_table[devid]; 1885 iommu = amd_iommu_rlookup_table[devid];
1639 if (!iommu) 1886 if (!iommu)
1640 continue; 1887 continue;
1641 dma_dom = dma_ops_domain_alloc(iommu, order); 1888 dma_dom = dma_ops_domain_alloc(iommu);
1642 if (!dma_dom) 1889 if (!dma_dom)
1643 continue; 1890 continue;
1644 init_unity_mappings_for_device(dma_dom, devid); 1891 init_unity_mappings_for_device(dma_dom, devid);
@@ -1664,7 +1911,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {
1664int __init amd_iommu_init_dma_ops(void) 1911int __init amd_iommu_init_dma_ops(void)
1665{ 1912{
1666 struct amd_iommu *iommu; 1913 struct amd_iommu *iommu;
1667 int order = amd_iommu_aperture_order;
1668 int ret; 1914 int ret;
1669 1915
1670 /* 1916 /*
@@ -1672,8 +1918,8 @@ int __init amd_iommu_init_dma_ops(void)
1672 * found in the system. Devices not assigned to any other 1918 * found in the system. Devices not assigned to any other
1673 * protection domain will be assigned to the default one. 1919 * protection domain will be assigned to the default one.
1674 */ 1920 */
1675 list_for_each_entry(iommu, &amd_iommu_list, list) { 1921 for_each_iommu(iommu) {
1676 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1922 iommu->default_dom = dma_ops_domain_alloc(iommu);
1677 if (iommu->default_dom == NULL) 1923 if (iommu->default_dom == NULL)
1678 return -ENOMEM; 1924 return -ENOMEM;
1679 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 1925 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -1710,7 +1956,7 @@ int __init amd_iommu_init_dma_ops(void)
1710 1956
1711free_domains: 1957free_domains:
1712 1958
1713 list_for_each_entry(iommu, &amd_iommu_list, list) { 1959 for_each_iommu(iommu) {
1714 if (iommu->default_dom) 1960 if (iommu->default_dom)
1715 dma_ops_domain_free(iommu->default_dom); 1961 dma_ops_domain_free(iommu->default_dom);
1716 } 1962 }
@@ -1842,7 +2088,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
1842 2088
1843 old_domain = domain_for_device(devid); 2089 old_domain = domain_for_device(devid);
1844 if (old_domain) 2090 if (old_domain)
1845 return -EBUSY; 2091 detach_device(old_domain, devid);
1846 2092
1847 attach_device(iommu, domain, devid); 2093 attach_device(iommu, domain, devid);
1848 2094
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be0902dac..10b2accd12ea 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -115,15 +115,21 @@ struct ivmd_header {
115 u64 range_length; 115 u64 range_length;
116} __attribute__((packed)); 116} __attribute__((packed));
117 117
118bool amd_iommu_dump;
119
118static int __initdata amd_iommu_detected; 120static int __initdata amd_iommu_detected;
119 121
120u16 amd_iommu_last_bdf; /* largest PCI device id we have 122u16 amd_iommu_last_bdf; /* largest PCI device id we have
121 to handle */ 123 to handle */
122LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
123 we find in ACPI */ 125 we find in ACPI */
124unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
125bool amd_iommu_isolate = true; /* if true, device isolation is 129bool amd_iommu_isolate = true; /* if true, device isolation is
126 enabled */ 130 enabled */
131#endif
132
127bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
128 134
129LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid)
175static inline unsigned long tbl_size(int entry_size) 181static inline unsigned long tbl_size(int entry_size)
176{ 182{
177 unsigned shift = PAGE_SHIFT + 183 unsigned shift = PAGE_SHIFT +
178 get_order(amd_iommu_last_bdf * entry_size); 184 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
179 185
180 return 1UL << shift; 186 return 1UL << shift;
181} 187}
@@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size)
193 * This function set the exclusion range in the IOMMU. DMA accesses to the 199 * This function set the exclusion range in the IOMMU. DMA accesses to the
194 * exclusion range are passed through untranslated 200 * exclusion range are passed through untranslated
195 */ 201 */
196static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) 202static void iommu_set_exclusion_range(struct amd_iommu *iommu)
197{ 203{
198 u64 start = iommu->exclusion_start & PAGE_MASK; 204 u64 start = iommu->exclusion_start & PAGE_MASK;
199 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; 205 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
@@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
225} 231}
226 232
227/* Generic functions to enable/disable certain features of the IOMMU. */ 233/* Generic functions to enable/disable certain features of the IOMMU. */
228static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) 234static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
229{ 235{
230 u32 ctrl; 236 u32 ctrl;
231 237
@@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
244} 250}
245 251
246/* Function to enable the hardware */ 252/* Function to enable the hardware */
247static void __init iommu_enable(struct amd_iommu *iommu) 253static void iommu_enable(struct amd_iommu *iommu)
248{ 254{
249 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", 255 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
250 dev_name(&iommu->dev->dev), iommu->cap_ptr); 256 dev_name(&iommu->dev->dev), iommu->cap_ptr);
@@ -252,11 +258,17 @@ static void __init iommu_enable(struct amd_iommu *iommu)
252 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 258 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
253} 259}
254 260
255/* Function to enable IOMMU event logging and event interrupts */ 261static void iommu_disable(struct amd_iommu *iommu)
256static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
257{ 262{
258 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); 263 /* Disable command buffer */
259 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); 264 iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
265
266 /* Disable event logging and event interrupts */
267 iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
268 iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
269
270 /* Disable IOMMU hardware itself */
271 iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
260} 272}
261 273
262/* 274/*
@@ -413,25 +425,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
413{ 425{
414 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 426 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
415 get_order(CMD_BUFFER_SIZE)); 427 get_order(CMD_BUFFER_SIZE));
416 u64 entry;
417 428
418 if (cmd_buf == NULL) 429 if (cmd_buf == NULL)
419 return NULL; 430 return NULL;
420 431
421 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 432 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
422 433
423 entry = (u64)virt_to_phys(cmd_buf); 434 return cmd_buf;
435}
436
437/*
438 * This function writes the command buffer address to the hardware and
439 * enables it.
440 */
441static void iommu_enable_command_buffer(struct amd_iommu *iommu)
442{
443 u64 entry;
444
445 BUG_ON(iommu->cmd_buf == NULL);
446
447 entry = (u64)virt_to_phys(iommu->cmd_buf);
424 entry |= MMIO_CMD_SIZE_512; 448 entry |= MMIO_CMD_SIZE_512;
449
425 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 450 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
426 &entry, sizeof(entry)); 451 &entry, sizeof(entry));
427 452
428 /* set head and tail to zero manually */ 453 /* set head and tail to zero manually */
429 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 454 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
430 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 455 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
431 456
432 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); 457 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
433
434 return cmd_buf;
435} 458}
436 459
437static void __init free_command_buffer(struct amd_iommu *iommu) 460static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -443,20 +466,31 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
443/* allocates the memory where the IOMMU will log its events to */ 466/* allocates the memory where the IOMMU will log its events to */
444static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) 467static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
445{ 468{
446 u64 entry;
447 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 469 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
448 get_order(EVT_BUFFER_SIZE)); 470 get_order(EVT_BUFFER_SIZE));
449 471
450 if (iommu->evt_buf == NULL) 472 if (iommu->evt_buf == NULL)
451 return NULL; 473 return NULL;
452 474
475 return iommu->evt_buf;
476}
477
478static void iommu_enable_event_buffer(struct amd_iommu *iommu)
479{
480 u64 entry;
481
482 BUG_ON(iommu->evt_buf == NULL);
483
453 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; 484 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
485
454 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, 486 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
455 &entry, sizeof(entry)); 487 &entry, sizeof(entry));
456 488
457 iommu->evt_buf_size = EVT_BUFFER_SIZE; 489 /* set head and tail to zero manually */
490 writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
491 writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
458 492
459 return iommu->evt_buf; 493 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
460} 494}
461 495
462static void __init free_event_buffer(struct amd_iommu *iommu) 496static void __init free_event_buffer(struct amd_iommu *iommu)
@@ -596,32 +630,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
596 p += sizeof(struct ivhd_header); 630 p += sizeof(struct ivhd_header);
597 end += h->length; 631 end += h->length;
598 632
633
599 while (p < end) { 634 while (p < end) {
600 e = (struct ivhd_entry *)p; 635 e = (struct ivhd_entry *)p;
601 switch (e->type) { 636 switch (e->type) {
602 case IVHD_DEV_ALL: 637 case IVHD_DEV_ALL:
638
639 DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
640 " last device %02x:%02x.%x flags: %02x\n",
641 PCI_BUS(iommu->first_device),
642 PCI_SLOT(iommu->first_device),
643 PCI_FUNC(iommu->first_device),
644 PCI_BUS(iommu->last_device),
645 PCI_SLOT(iommu->last_device),
646 PCI_FUNC(iommu->last_device),
647 e->flags);
648
603 for (dev_i = iommu->first_device; 649 for (dev_i = iommu->first_device;
604 dev_i <= iommu->last_device; ++dev_i) 650 dev_i <= iommu->last_device; ++dev_i)
605 set_dev_entry_from_acpi(iommu, dev_i, 651 set_dev_entry_from_acpi(iommu, dev_i,
606 e->flags, 0); 652 e->flags, 0);
607 break; 653 break;
608 case IVHD_DEV_SELECT: 654 case IVHD_DEV_SELECT:
655
656 DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
657 "flags: %02x\n",
658 PCI_BUS(e->devid),
659 PCI_SLOT(e->devid),
660 PCI_FUNC(e->devid),
661 e->flags);
662
609 devid = e->devid; 663 devid = e->devid;
610 set_dev_entry_from_acpi(iommu, devid, e->flags, 0); 664 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
611 break; 665 break;
612 case IVHD_DEV_SELECT_RANGE_START: 666 case IVHD_DEV_SELECT_RANGE_START:
667
668 DUMP_printk(" DEV_SELECT_RANGE_START\t "
669 "devid: %02x:%02x.%x flags: %02x\n",
670 PCI_BUS(e->devid),
671 PCI_SLOT(e->devid),
672 PCI_FUNC(e->devid),
673 e->flags);
674
613 devid_start = e->devid; 675 devid_start = e->devid;
614 flags = e->flags; 676 flags = e->flags;
615 ext_flags = 0; 677 ext_flags = 0;
616 alias = false; 678 alias = false;
617 break; 679 break;
618 case IVHD_DEV_ALIAS: 680 case IVHD_DEV_ALIAS:
681
682 DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
683 "flags: %02x devid_to: %02x:%02x.%x\n",
684 PCI_BUS(e->devid),
685 PCI_SLOT(e->devid),
686 PCI_FUNC(e->devid),
687 e->flags,
688 PCI_BUS(e->ext >> 8),
689 PCI_SLOT(e->ext >> 8),
690 PCI_FUNC(e->ext >> 8));
691
619 devid = e->devid; 692 devid = e->devid;
620 devid_to = e->ext >> 8; 693 devid_to = e->ext >> 8;
621 set_dev_entry_from_acpi(iommu, devid, e->flags, 0); 694 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
622 amd_iommu_alias_table[devid] = devid_to; 695 amd_iommu_alias_table[devid] = devid_to;
623 break; 696 break;
624 case IVHD_DEV_ALIAS_RANGE: 697 case IVHD_DEV_ALIAS_RANGE:
698
699 DUMP_printk(" DEV_ALIAS_RANGE\t\t "
700 "devid: %02x:%02x.%x flags: %02x "
701 "devid_to: %02x:%02x.%x\n",
702 PCI_BUS(e->devid),
703 PCI_SLOT(e->devid),
704 PCI_FUNC(e->devid),
705 e->flags,
706 PCI_BUS(e->ext >> 8),
707 PCI_SLOT(e->ext >> 8),
708 PCI_FUNC(e->ext >> 8));
709
625 devid_start = e->devid; 710 devid_start = e->devid;
626 flags = e->flags; 711 flags = e->flags;
627 devid_to = e->ext >> 8; 712 devid_to = e->ext >> 8;
@@ -629,17 +714,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
629 alias = true; 714 alias = true;
630 break; 715 break;
631 case IVHD_DEV_EXT_SELECT: 716 case IVHD_DEV_EXT_SELECT:
717
718 DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
719 "flags: %02x ext: %08x\n",
720 PCI_BUS(e->devid),
721 PCI_SLOT(e->devid),
722 PCI_FUNC(e->devid),
723 e->flags, e->ext);
724
632 devid = e->devid; 725 devid = e->devid;
633 set_dev_entry_from_acpi(iommu, devid, e->flags, 726 set_dev_entry_from_acpi(iommu, devid, e->flags,
634 e->ext); 727 e->ext);
635 break; 728 break;
636 case IVHD_DEV_EXT_SELECT_RANGE: 729 case IVHD_DEV_EXT_SELECT_RANGE:
730
731 DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
732 "%02x:%02x.%x flags: %02x ext: %08x\n",
733 PCI_BUS(e->devid),
734 PCI_SLOT(e->devid),
735 PCI_FUNC(e->devid),
736 e->flags, e->ext);
737
637 devid_start = e->devid; 738 devid_start = e->devid;
638 flags = e->flags; 739 flags = e->flags;
639 ext_flags = e->ext; 740 ext_flags = e->ext;
640 alias = false; 741 alias = false;
641 break; 742 break;
642 case IVHD_DEV_RANGE_END: 743 case IVHD_DEV_RANGE_END:
744
745 DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
746 PCI_BUS(e->devid),
747 PCI_SLOT(e->devid),
748 PCI_FUNC(e->devid));
749
643 devid = e->devid; 750 devid = e->devid;
644 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 751 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
645 if (alias) 752 if (alias)
@@ -679,7 +786,7 @@ static void __init free_iommu_all(void)
679{ 786{
680 struct amd_iommu *iommu, *next; 787 struct amd_iommu *iommu, *next;
681 788
682 list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { 789 for_each_iommu_safe(iommu, next) {
683 list_del(&iommu->list); 790 list_del(&iommu->list);
684 free_iommu_one(iommu); 791 free_iommu_one(iommu);
685 kfree(iommu); 792 kfree(iommu);
@@ -710,7 +817,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
710 if (!iommu->mmio_base) 817 if (!iommu->mmio_base)
711 return -ENOMEM; 818 return -ENOMEM;
712 819
713 iommu_set_device_table(iommu);
714 iommu->cmd_buf = alloc_command_buffer(iommu); 820 iommu->cmd_buf = alloc_command_buffer(iommu);
715 if (!iommu->cmd_buf) 821 if (!iommu->cmd_buf)
716 return -ENOMEM; 822 return -ENOMEM;
@@ -746,6 +852,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)
746 h = (struct ivhd_header *)p; 852 h = (struct ivhd_header *)p;
747 switch (*p) { 853 switch (*p) {
748 case ACPI_IVHD_TYPE: 854 case ACPI_IVHD_TYPE:
855
856 DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
857 "seg: %d flags: %01x info %04x\n",
858 PCI_BUS(h->devid), PCI_SLOT(h->devid),
859 PCI_FUNC(h->devid), h->cap_ptr,
860 h->pci_seg, h->flags, h->info);
861 DUMP_printk(" mmio-addr: %016llx\n",
862 h->mmio_phys);
863
749 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); 864 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
750 if (iommu == NULL) 865 if (iommu == NULL)
751 return -ENOMEM; 866 return -ENOMEM;
@@ -773,56 +888,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)
773 * 888 *
774 ****************************************************************************/ 889 ****************************************************************************/
775 890
776static int __init iommu_setup_msix(struct amd_iommu *iommu)
777{
778 struct amd_iommu *curr;
779 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
780 int nvec = 0, i;
781
782 list_for_each_entry(curr, &amd_iommu_list, list) {
783 if (curr->dev == iommu->dev) {
784 entries[nvec].entry = curr->evt_msi_num;
785 entries[nvec].vector = 0;
786 curr->int_enabled = true;
787 nvec++;
788 }
789 }
790
791 if (pci_enable_msix(iommu->dev, entries, nvec)) {
792 pci_disable_msix(iommu->dev);
793 return 1;
794 }
795
796 for (i = 0; i < nvec; ++i) {
797 int r = request_irq(entries->vector, amd_iommu_int_handler,
798 IRQF_SAMPLE_RANDOM,
799 "AMD IOMMU",
800 NULL);
801 if (r)
802 goto out_free;
803 }
804
805 return 0;
806
807out_free:
808 for (i -= 1; i >= 0; --i)
809 free_irq(entries->vector, NULL);
810
811 pci_disable_msix(iommu->dev);
812
813 return 1;
814}
815
816static int __init iommu_setup_msi(struct amd_iommu *iommu) 891static int __init iommu_setup_msi(struct amd_iommu *iommu)
817{ 892{
818 int r; 893 int r;
819 struct amd_iommu *curr;
820
821 list_for_each_entry(curr, &amd_iommu_list, list) {
822 if (curr->dev == iommu->dev)
823 curr->int_enabled = true;
824 }
825
826 894
827 if (pci_enable_msi(iommu->dev)) 895 if (pci_enable_msi(iommu->dev))
828 return 1; 896 return 1;
@@ -837,17 +905,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
837 return 1; 905 return 1;
838 } 906 }
839 907
908 iommu->int_enabled = true;
909 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
910
840 return 0; 911 return 0;
841} 912}
842 913
843static int __init iommu_init_msi(struct amd_iommu *iommu) 914static int iommu_init_msi(struct amd_iommu *iommu)
844{ 915{
845 if (iommu->int_enabled) 916 if (iommu->int_enabled)
846 return 0; 917 return 0;
847 918
848 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) 919 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
849 return iommu_setup_msix(iommu);
850 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
851 return iommu_setup_msi(iommu); 920 return iommu_setup_msi(iommu);
852 921
853 return 1; 922 return 1;
@@ -899,6 +968,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
899static int __init init_unity_map_range(struct ivmd_header *m) 968static int __init init_unity_map_range(struct ivmd_header *m)
900{ 969{
901 struct unity_map_entry *e = 0; 970 struct unity_map_entry *e = 0;
971 char *s;
902 972
903 e = kzalloc(sizeof(*e), GFP_KERNEL); 973 e = kzalloc(sizeof(*e), GFP_KERNEL);
904 if (e == NULL) 974 if (e == NULL)
@@ -906,14 +976,19 @@ static int __init init_unity_map_range(struct ivmd_header *m)
906 976
907 switch (m->type) { 977 switch (m->type) {
908 default: 978 default:
979 kfree(e);
980 return 0;
909 case ACPI_IVMD_TYPE: 981 case ACPI_IVMD_TYPE:
982 s = "IVMD_TYPEi\t\t\t";
910 e->devid_start = e->devid_end = m->devid; 983 e->devid_start = e->devid_end = m->devid;
911 break; 984 break;
912 case ACPI_IVMD_TYPE_ALL: 985 case ACPI_IVMD_TYPE_ALL:
986 s = "IVMD_TYPE_ALL\t\t";
913 e->devid_start = 0; 987 e->devid_start = 0;
914 e->devid_end = amd_iommu_last_bdf; 988 e->devid_end = amd_iommu_last_bdf;
915 break; 989 break;
916 case ACPI_IVMD_TYPE_RANGE: 990 case ACPI_IVMD_TYPE_RANGE:
991 s = "IVMD_TYPE_RANGE\t\t";
917 e->devid_start = m->devid; 992 e->devid_start = m->devid;
918 e->devid_end = m->aux; 993 e->devid_end = m->aux;
919 break; 994 break;
@@ -922,6 +997,13 @@ static int __init init_unity_map_range(struct ivmd_header *m)
922 e->address_end = e->address_start + PAGE_ALIGN(m->range_length); 997 e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
923 e->prot = m->flags >> 1; 998 e->prot = m->flags >> 1;
924 999
1000 DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
1001 " range_start: %016llx range_end: %016llx flags: %x\n", s,
1002 PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
1003 PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
1004 PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
1005 e->address_start, e->address_end, m->flags);
1006
925 list_add_tail(&e->list, &amd_iommu_unity_map); 1007 list_add_tail(&e->list, &amd_iommu_unity_map);
926 1008
927 return 0; 1009 return 0;
@@ -967,18 +1049,29 @@ static void init_device_table(void)
967 * This function finally enables all IOMMUs found in the system after 1049 * This function finally enables all IOMMUs found in the system after
968 * they have been initialized 1050 * they have been initialized
969 */ 1051 */
970static void __init enable_iommus(void) 1052static void enable_iommus(void)
971{ 1053{
972 struct amd_iommu *iommu; 1054 struct amd_iommu *iommu;
973 1055
974 list_for_each_entry(iommu, &amd_iommu_list, list) { 1056 for_each_iommu(iommu) {
1057 iommu_disable(iommu);
1058 iommu_set_device_table(iommu);
1059 iommu_enable_command_buffer(iommu);
1060 iommu_enable_event_buffer(iommu);
975 iommu_set_exclusion_range(iommu); 1061 iommu_set_exclusion_range(iommu);
976 iommu_init_msi(iommu); 1062 iommu_init_msi(iommu);
977 iommu_enable_event_logging(iommu);
978 iommu_enable(iommu); 1063 iommu_enable(iommu);
979 } 1064 }
980} 1065}
981 1066
1067static void disable_iommus(void)
1068{
1069 struct amd_iommu *iommu;
1070
1071 for_each_iommu(iommu)
1072 iommu_disable(iommu);
1073}
1074
982/* 1075/*
983 * Suspend/Resume support 1076 * Suspend/Resume support
984 * disable suspend until real resume implemented 1077 * disable suspend until real resume implemented
@@ -986,12 +1079,25 @@ static void __init enable_iommus(void)
986 1079
987static int amd_iommu_resume(struct sys_device *dev) 1080static int amd_iommu_resume(struct sys_device *dev)
988{ 1081{
1082 /* re-load the hardware */
1083 enable_iommus();
1084
1085 /*
1086 * we have to flush after the IOMMUs are enabled because a
1087 * disabled IOMMU will never execute the commands we send
1088 */
1089 amd_iommu_flush_all_devices();
1090 amd_iommu_flush_all_domains();
1091
989 return 0; 1092 return 0;
990} 1093}
991 1094
992static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) 1095static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
993{ 1096{
994 return -EINVAL; 1097 /* disable IOMMUs to go out of the way for BIOS */
1098 disable_iommus();
1099
1100 return 0;
995} 1101}
996 1102
997static struct sysdev_class amd_iommu_sysdev_class = { 1103static struct sysdev_class amd_iommu_sysdev_class = {
@@ -1137,9 +1243,6 @@ int __init amd_iommu_init(void)
1137 1243
1138 enable_iommus(); 1244 enable_iommus();
1139 1245
1140 printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
1141 (1 << (amd_iommu_aperture_order-20)));
1142
1143 printk(KERN_INFO "AMD IOMMU: device isolation "); 1246 printk(KERN_INFO "AMD IOMMU: device isolation ");
1144 if (amd_iommu_isolate) 1247 if (amd_iommu_isolate)
1145 printk("enabled\n"); 1248 printk("enabled\n");
@@ -1177,6 +1280,11 @@ free:
1177 goto out; 1280 goto out;
1178} 1281}
1179 1282
1283void amd_iommu_shutdown(void)
1284{
1285 disable_iommus();
1286}
1287
1180/**************************************************************************** 1288/****************************************************************************
1181 * 1289 *
1182 * Early detect code. This code runs at IOMMU detection time in the DMA 1290 * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1211,6 +1319,13 @@ void __init amd_iommu_detect(void)
1211 * 1319 *
1212 ****************************************************************************/ 1320 ****************************************************************************/
1213 1321
1322static int __init parse_amd_iommu_dump(char *str)
1323{
1324 amd_iommu_dump = true;
1325
1326 return 1;
1327}
1328
1214static int __init parse_amd_iommu_options(char *str) 1329static int __init parse_amd_iommu_options(char *str)
1215{ 1330{
1216 for (; *str; ++str) { 1331 for (; *str; ++str) {
@@ -1225,15 +1340,5 @@ static int __init parse_amd_iommu_options(char *str)
1225 return 1; 1340 return 1;
1226} 1341}
1227 1342
1228static int __init parse_amd_iommu_size_options(char *str) 1343__setup("amd_iommu_dump", parse_amd_iommu_dump);
1229{
1230 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
1231
1232 if ((order > 24) && (order < 31))
1233 amd_iommu_aperture_order = order;
1234
1235 return 1;
1236}
1237
1238__setup("amd_iommu=", parse_amd_iommu_options); 1344__setup("amd_iommu=", parse_amd_iommu_options);
1239__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f2870920f246..8c7c042ecad1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
14 * Mikael Pettersson : PM converted to driver model. 14 * Mikael Pettersson : PM converted to driver model.
15 */ 15 */
16 16
17#include <linux/perf_counter.h>
17#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
18#include <linux/mc146818rtc.h> 19#include <linux/mc146818rtc.h>
19#include <linux/acpi_pmtmr.h> 20#include <linux/acpi_pmtmr.h>
@@ -34,6 +35,7 @@
34#include <linux/smp.h> 35#include <linux/smp.h>
35#include <linux/mm.h> 36#include <linux/mm.h>
36 37
38#include <asm/perf_counter.h>
37#include <asm/pgalloc.h> 39#include <asm/pgalloc.h>
38#include <asm/atomic.h> 40#include <asm/atomic.h>
39#include <asm/mpspec.h> 41#include <asm/mpspec.h>
@@ -98,6 +100,29 @@ early_param("lapic", parse_lapic);
98/* Local APIC was disabled by the BIOS and enabled by the kernel */ 100/* Local APIC was disabled by the BIOS and enabled by the kernel */
99static int enabled_via_apicbase; 101static int enabled_via_apicbase;
100 102
103/*
104 * Handle interrupt mode configuration register (IMCR).
105 * This register controls whether the interrupt signals
106 * that reach the BSP come from the master PIC or from the
107 * local APIC. Before entering Symmetric I/O Mode, either
108 * the BIOS or the operating system must switch out of
109 * PIC Mode by changing the IMCR.
110 */
111static inline void imcr_pic_to_apic(void)
112{
113 /* select IMCR register */
114 outb(0x70, 0x22);
115 /* NMI and 8259 INTR go through APIC */
116 outb(0x01, 0x23);
117}
118
119static inline void imcr_apic_to_pic(void)
120{
121 /* select IMCR register */
122 outb(0x70, 0x22);
123 /* NMI and 8259 INTR go directly to BSP */
124 outb(0x00, 0x23);
125}
101#endif 126#endif
102 127
103#ifdef CONFIG_X86_64 128#ifdef CONFIG_X86_64
@@ -111,13 +136,19 @@ static __init int setup_apicpmtimer(char *s)
111__setup("apicpmtimer", setup_apicpmtimer); 136__setup("apicpmtimer", setup_apicpmtimer);
112#endif 137#endif
113 138
139int x2apic_mode;
114#ifdef CONFIG_X86_X2APIC 140#ifdef CONFIG_X86_X2APIC
115int x2apic;
116/* x2apic enabled before OS handover */ 141/* x2apic enabled before OS handover */
117static int x2apic_preenabled; 142static int x2apic_preenabled;
118static int disable_x2apic; 143static int disable_x2apic;
119static __init int setup_nox2apic(char *str) 144static __init int setup_nox2apic(char *str)
120{ 145{
146 if (x2apic_enabled()) {
147 pr_warning("Bios already enabled x2apic, "
148 "can't enforce nox2apic");
149 return 0;
150 }
151
121 disable_x2apic = 1; 152 disable_x2apic = 1;
122 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 153 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
123 return 0; 154 return 0;
@@ -209,6 +240,31 @@ static int modern_apic(void)
209 return lapic_get_version() >= 0x14; 240 return lapic_get_version() >= 0x14;
210} 241}
211 242
243/*
244 * bare function to substitute write operation
245 * and it's _that_ fast :)
246 */
247static void native_apic_write_dummy(u32 reg, u32 v)
248{
249 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
250}
251
252static u32 native_apic_read_dummy(u32 reg)
253{
254 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
255 return 0;
256}
257
258/*
259 * right after this call apic->write/read doesn't do anything
260 * note that there is no restore operation it works one way
261 */
262void apic_disable(void)
263{
264 apic->read = native_apic_read_dummy;
265 apic->write = native_apic_write_dummy;
266}
267
212void native_apic_wait_icr_idle(void) 268void native_apic_wait_icr_idle(void)
213{ 269{
214 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 270 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -348,7 +404,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
348 404
349static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) 405static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
350{ 406{
351 unsigned long reg = (lvt_off << 4) + APIC_EILVT0; 407 unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
352 unsigned int v = (mask << 16) | (msg_type << 8) | vector; 408 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
353 409
354 apic_write(reg, v); 410 apic_write(reg, v);
@@ -815,7 +871,7 @@ void clear_local_APIC(void)
815 u32 v; 871 u32 v;
816 872
817 /* APIC hasn't been mapped yet */ 873 /* APIC hasn't been mapped yet */
818 if (!x2apic && !apic_phys) 874 if (!x2apic_mode && !apic_phys)
819 return; 875 return;
820 876
821 maxlvt = lapic_get_maxlvt(); 877 maxlvt = lapic_get_maxlvt();
@@ -843,7 +899,7 @@ void clear_local_APIC(void)
843 } 899 }
844 900
845 /* lets not touch this if we didn't frob it */ 901 /* lets not touch this if we didn't frob it */
846#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) 902#ifdef CONFIG_X86_THERMAL_VECTOR
847 if (maxlvt >= 5) { 903 if (maxlvt >= 5) {
848 v = apic_read(APIC_LVTTHMR); 904 v = apic_read(APIC_LVTTHMR);
849 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 905 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -1133,6 +1189,7 @@ void __cpuinit setup_local_APIC(void)
1133 apic_write(APIC_ESR, 0); 1189 apic_write(APIC_ESR, 0);
1134 } 1190 }
1135#endif 1191#endif
1192 perf_counters_lapic_init();
1136 1193
1137 preempt_disable(); 1194 preempt_disable();
1138 1195
@@ -1287,7 +1344,7 @@ void check_x2apic(void)
1287{ 1344{
1288 if (x2apic_enabled()) { 1345 if (x2apic_enabled()) {
1289 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); 1346 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
1290 x2apic_preenabled = x2apic = 1; 1347 x2apic_preenabled = x2apic_mode = 1;
1291 } 1348 }
1292} 1349}
1293 1350
@@ -1295,7 +1352,7 @@ void enable_x2apic(void)
1295{ 1352{
1296 int msr, msr2; 1353 int msr, msr2;
1297 1354
1298 if (!x2apic) 1355 if (!x2apic_mode)
1299 return; 1356 return;
1300 1357
1301 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1358 rdmsr(MSR_IA32_APICBASE, msr, msr2);
@@ -1304,6 +1361,7 @@ void enable_x2apic(void)
1304 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1361 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1305 } 1362 }
1306} 1363}
1364#endif /* CONFIG_X86_X2APIC */
1307 1365
1308void __init enable_IR_x2apic(void) 1366void __init enable_IR_x2apic(void)
1309{ 1367{
@@ -1312,32 +1370,21 @@ void __init enable_IR_x2apic(void)
1312 unsigned long flags; 1370 unsigned long flags;
1313 struct IO_APIC_route_entry **ioapic_entries = NULL; 1371 struct IO_APIC_route_entry **ioapic_entries = NULL;
1314 1372
1315 if (!cpu_has_x2apic) 1373 ret = dmar_table_init();
1316 return; 1374 if (ret) {
1317 1375 pr_debug("dmar_table_init() failed with %d:\n", ret);
1318 if (!x2apic_preenabled && disable_x2apic) { 1376 goto ir_failed;
1319 pr_info("Skipped enabling x2apic and Interrupt-remapping "
1320 "because of nox2apic\n");
1321 return;
1322 } 1377 }
1323 1378
1324 if (x2apic_preenabled && disable_x2apic) 1379 if (!intr_remapping_supported()) {
1325 panic("Bios already enabled x2apic, can't enforce nox2apic"); 1380 pr_debug("intr-remapping not supported\n");
1326 1381 goto ir_failed;
1327 if (!x2apic_preenabled && skip_ioapic_setup) {
1328 pr_info("Skipped enabling x2apic and Interrupt-remapping "
1329 "because of skipping io-apic setup\n");
1330 return;
1331 } 1382 }
1332 1383
1333 ret = dmar_table_init();
1334 if (ret) {
1335 pr_info("dmar_table_init() failed with %d:\n", ret);
1336 1384
1337 if (x2apic_preenabled) 1385 if (!x2apic_preenabled && skip_ioapic_setup) {
1338 panic("x2apic enabled by bios. But IR enabling failed"); 1386 pr_info("Skipped enabling intr-remap because of skipping "
1339 else 1387 "io-apic setup\n");
1340 pr_info("Not enabling x2apic,Intr-remapping\n");
1341 return; 1388 return;
1342 } 1389 }
1343 1390
@@ -1357,19 +1404,16 @@ void __init enable_IR_x2apic(void)
1357 mask_IO_APIC_setup(ioapic_entries); 1404 mask_IO_APIC_setup(ioapic_entries);
1358 mask_8259A(); 1405 mask_8259A();
1359 1406
1360 ret = enable_intr_remapping(EIM_32BIT_APIC_ID); 1407 ret = enable_intr_remapping(x2apic_supported());
1361
1362 if (ret && x2apic_preenabled) {
1363 local_irq_restore(flags);
1364 panic("x2apic enabled by bios. But IR enabling failed");
1365 }
1366
1367 if (ret) 1408 if (ret)
1368 goto end_restore; 1409 goto end_restore;
1369 1410
1370 if (!x2apic) { 1411 pr_info("Enabled Interrupt-remapping\n");
1371 x2apic = 1; 1412
1413 if (x2apic_supported() && !x2apic_mode) {
1414 x2apic_mode = 1;
1372 enable_x2apic(); 1415 enable_x2apic();
1416 pr_info("Enabled x2apic\n");
1373 } 1417 }
1374 1418
1375end_restore: 1419end_restore:
@@ -1378,37 +1422,34 @@ end_restore:
1378 * IR enabling failed 1422 * IR enabling failed
1379 */ 1423 */
1380 restore_IO_APIC_setup(ioapic_entries); 1424 restore_IO_APIC_setup(ioapic_entries);
1381 else
1382 reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);
1383 1425
1384 unmask_8259A(); 1426 unmask_8259A();
1385 local_irq_restore(flags); 1427 local_irq_restore(flags);
1386 1428
1387end: 1429end:
1388 if (!ret) {
1389 if (!x2apic_preenabled)
1390 pr_info("Enabled x2apic and interrupt-remapping\n");
1391 else
1392 pr_info("Enabled Interrupt-remapping\n");
1393 } else
1394 pr_err("Failed to enable Interrupt-remapping and x2apic\n");
1395 if (ioapic_entries) 1430 if (ioapic_entries)
1396 free_ioapic_entries(ioapic_entries); 1431 free_ioapic_entries(ioapic_entries);
1432
1433 if (!ret)
1434 return;
1435
1436ir_failed:
1437 if (x2apic_preenabled)
1438 panic("x2apic enabled by bios. But IR enabling failed");
1439 else if (cpu_has_x2apic)
1440 pr_info("Not enabling x2apic,Intr-remapping\n");
1397#else 1441#else
1398 if (!cpu_has_x2apic) 1442 if (!cpu_has_x2apic)
1399 return; 1443 return;
1400 1444
1401 if (x2apic_preenabled) 1445 if (x2apic_preenabled)
1402 panic("x2apic enabled prior OS handover," 1446 panic("x2apic enabled prior OS handover,"
1403 " enable CONFIG_INTR_REMAP"); 1447 " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
1404
1405 pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1406 " and x2apic\n");
1407#endif 1448#endif
1408 1449
1409 return; 1450 return;
1410} 1451}
1411#endif /* CONFIG_X86_X2APIC */ 1452
1412 1453
1413#ifdef CONFIG_X86_64 1454#ifdef CONFIG_X86_64
1414/* 1455/*
@@ -1425,7 +1466,6 @@ static int __init detect_init_APIC(void)
1425 } 1466 }
1426 1467
1427 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; 1468 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1428 boot_cpu_physical_apicid = 0;
1429 return 0; 1469 return 0;
1430} 1470}
1431#else 1471#else
@@ -1539,32 +1579,49 @@ void __init early_init_lapic_mapping(void)
1539 */ 1579 */
1540void __init init_apic_mappings(void) 1580void __init init_apic_mappings(void)
1541{ 1581{
1542 if (x2apic) { 1582 unsigned int new_apicid;
1583
1584 if (x2apic_mode) {
1543 boot_cpu_physical_apicid = read_apic_id(); 1585 boot_cpu_physical_apicid = read_apic_id();
1544 return; 1586 return;
1545 } 1587 }
1546 1588
1547 /* 1589 /* If no local APIC can be found return early */
1548 * If no local APIC can be found then set up a fake all
1549 * zeroes page to simulate the local APIC and another
1550 * one for the IO-APIC.
1551 */
1552 if (!smp_found_config && detect_init_APIC()) { 1590 if (!smp_found_config && detect_init_APIC()) {
1553 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); 1591 /* lets NOP'ify apic operations */
1554 apic_phys = __pa(apic_phys); 1592 pr_info("APIC: disable apic facility\n");
1555 } else 1593 apic_disable();
1594 } else {
1556 apic_phys = mp_lapic_addr; 1595 apic_phys = mp_lapic_addr;
1557 1596
1558 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 1597 /*
1559 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", 1598 * acpi lapic path already maps that address in
1560 APIC_BASE, apic_phys); 1599 * acpi_register_lapic_address()
1600 */
1601 if (!acpi_lapic)
1602 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1603
1604 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
1605 APIC_BASE, apic_phys);
1606 }
1561 1607
1562 /* 1608 /*
1563 * Fetch the APIC ID of the BSP in case we have a 1609 * Fetch the APIC ID of the BSP in case we have a
1564 * default configuration (or the MP table is broken). 1610 * default configuration (or the MP table is broken).
1565 */ 1611 */
1566 if (boot_cpu_physical_apicid == -1U) 1612 new_apicid = read_apic_id();
1567 boot_cpu_physical_apicid = read_apic_id(); 1613 if (boot_cpu_physical_apicid != new_apicid) {
1614 boot_cpu_physical_apicid = new_apicid;
1615 /*
1616 * yeah -- we lie about apic_version
1617 * in case if apic was disabled via boot option
1618 * but it's not a problem for SMP compiled kernel
1619 * since smp_sanity_check is prepared for such a case
1620 * and disable smp mode
1621 */
1622 apic_version[new_apicid] =
1623 GET_APIC_VERSION(apic_read(APIC_LVR));
1624 }
1568} 1625}
1569 1626
1570/* 1627/*
@@ -1733,8 +1790,7 @@ void __init connect_bsp_APIC(void)
1733 */ 1790 */
1734 apic_printk(APIC_VERBOSE, "leaving PIC mode, " 1791 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1735 "enabling APIC mode.\n"); 1792 "enabling APIC mode.\n");
1736 outb(0x70, 0x22); 1793 imcr_pic_to_apic();
1737 outb(0x01, 0x23);
1738 } 1794 }
1739#endif 1795#endif
1740 if (apic->enable_apic_mode) 1796 if (apic->enable_apic_mode)
@@ -1762,8 +1818,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1762 */ 1818 */
1763 apic_printk(APIC_VERBOSE, "disabling APIC mode, " 1819 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1764 "entering PIC mode.\n"); 1820 "entering PIC mode.\n");
1765 outb(0x70, 0x22); 1821 imcr_apic_to_pic();
1766 outb(0x00, 0x23);
1767 return; 1822 return;
1768 } 1823 }
1769#endif 1824#endif
@@ -1962,17 +2017,17 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1962 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 2017 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1963 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 2018 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1964 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 2019 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1965#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) 2020#ifdef CONFIG_X86_THERMAL_VECTOR
1966 if (maxlvt >= 5) 2021 if (maxlvt >= 5)
1967 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 2022 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1968#endif 2023#endif
1969 2024
1970 local_irq_save(flags); 2025 local_irq_save(flags);
1971 disable_local_APIC(); 2026 disable_local_APIC();
1972#ifdef CONFIG_INTR_REMAP 2027
1973 if (intr_remapping_enabled) 2028 if (intr_remapping_enabled)
1974 disable_intr_remapping(); 2029 disable_intr_remapping();
1975#endif 2030
1976 local_irq_restore(flags); 2031 local_irq_restore(flags);
1977 return 0; 2032 return 0;
1978} 2033}
@@ -1982,42 +2037,34 @@ static int lapic_resume(struct sys_device *dev)
1982 unsigned int l, h; 2037 unsigned int l, h;
1983 unsigned long flags; 2038 unsigned long flags;
1984 int maxlvt; 2039 int maxlvt;
1985 2040 int ret = 0;
1986#ifdef CONFIG_INTR_REMAP
1987 int ret;
1988 struct IO_APIC_route_entry **ioapic_entries = NULL; 2041 struct IO_APIC_route_entry **ioapic_entries = NULL;
1989 2042
1990 if (!apic_pm_state.active) 2043 if (!apic_pm_state.active)
1991 return 0; 2044 return 0;
1992 2045
1993 local_irq_save(flags); 2046 local_irq_save(flags);
1994 if (x2apic) { 2047 if (intr_remapping_enabled) {
1995 ioapic_entries = alloc_ioapic_entries(); 2048 ioapic_entries = alloc_ioapic_entries();
1996 if (!ioapic_entries) { 2049 if (!ioapic_entries) {
1997 WARN(1, "Alloc ioapic_entries in lapic resume failed."); 2050 WARN(1, "Alloc ioapic_entries in lapic resume failed.");
1998 return -ENOMEM; 2051 ret = -ENOMEM;
2052 goto restore;
1999 } 2053 }
2000 2054
2001 ret = save_IO_APIC_setup(ioapic_entries); 2055 ret = save_IO_APIC_setup(ioapic_entries);
2002 if (ret) { 2056 if (ret) {
2003 WARN(1, "Saving IO-APIC state failed: %d\n", ret); 2057 WARN(1, "Saving IO-APIC state failed: %d\n", ret);
2004 free_ioapic_entries(ioapic_entries); 2058 free_ioapic_entries(ioapic_entries);
2005 return ret; 2059 goto restore;
2006 } 2060 }
2007 2061
2008 mask_IO_APIC_setup(ioapic_entries); 2062 mask_IO_APIC_setup(ioapic_entries);
2009 mask_8259A(); 2063 mask_8259A();
2010 enable_x2apic();
2011 } 2064 }
2012#else
2013 if (!apic_pm_state.active)
2014 return 0;
2015 2065
2016 local_irq_save(flags); 2066 if (x2apic_mode)
2017 if (x2apic)
2018 enable_x2apic(); 2067 enable_x2apic();
2019#endif
2020
2021 else { 2068 else {
2022 /* 2069 /*
2023 * Make sure the APICBASE points to the right address 2070 * Make sure the APICBASE points to the right address
@@ -2055,21 +2102,16 @@ static int lapic_resume(struct sys_device *dev)
2055 apic_write(APIC_ESR, 0); 2102 apic_write(APIC_ESR, 0);
2056 apic_read(APIC_ESR); 2103 apic_read(APIC_ESR);
2057 2104
2058#ifdef CONFIG_INTR_REMAP 2105 if (intr_remapping_enabled) {
2059 if (intr_remapping_enabled) 2106 reenable_intr_remapping(x2apic_mode);
2060 reenable_intr_remapping(EIM_32BIT_APIC_ID);
2061
2062 if (x2apic) {
2063 unmask_8259A(); 2107 unmask_8259A();
2064 restore_IO_APIC_setup(ioapic_entries); 2108 restore_IO_APIC_setup(ioapic_entries);
2065 free_ioapic_entries(ioapic_entries); 2109 free_ioapic_entries(ioapic_entries);
2066 } 2110 }
2067#endif 2111restore:
2068
2069 local_irq_restore(flags); 2112 local_irq_restore(flags);
2070 2113
2071 2114 return ret;
2072 return 0;
2073} 2115}
2074 2116
2075/* 2117/*
@@ -2117,31 +2159,14 @@ static void apic_pm_activate(void) { }
2117#endif /* CONFIG_PM */ 2159#endif /* CONFIG_PM */
2118 2160
2119#ifdef CONFIG_X86_64 2161#ifdef CONFIG_X86_64
2120/* 2162
2121 * apic_is_clustered_box() -- Check if we can expect good TSC 2163static int __cpuinit apic_cluster_num(void)
2122 *
2123 * Thus far, the major user of this is IBM's Summit2 series:
2124 *
2125 * Clustered boxes may have unsynced TSC problems if they are
2126 * multi-chassis. Use available data to take a good guess.
2127 * If in doubt, go HPET.
2128 */
2129__cpuinit int apic_is_clustered_box(void)
2130{ 2164{
2131 int i, clusters, zeros; 2165 int i, clusters, zeros;
2132 unsigned id; 2166 unsigned id;
2133 u16 *bios_cpu_apicid; 2167 u16 *bios_cpu_apicid;
2134 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); 2168 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
2135 2169
2136 /*
2137 * there is not this kind of box with AMD CPU yet.
2138 * Some AMD box with quadcore cpu and 8 sockets apicid
2139 * will be [4, 0x23] or [8, 0x27] could be thought to
2140 * vsmp box still need checking...
2141 */
2142 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
2143 return 0;
2144
2145 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); 2170 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
2146 bitmap_zero(clustermap, NUM_APIC_CLUSTERS); 2171 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
2147 2172
@@ -2177,18 +2202,67 @@ __cpuinit int apic_is_clustered_box(void)
2177 ++zeros; 2202 ++zeros;
2178 } 2203 }
2179 2204
2180 /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are 2205 return clusters;
2181 * not guaranteed to be synced between boards 2206}
2182 */ 2207
2183 if (is_vsmp_box() && clusters > 1) 2208static int __cpuinitdata multi_checked;
2209static int __cpuinitdata multi;
2210
2211static int __cpuinit set_multi(const struct dmi_system_id *d)
2212{
2213 if (multi)
2214 return 0;
2215 pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
2216 multi = 1;
2217 return 0;
2218}
2219
2220static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
2221 {
2222 .callback = set_multi,
2223 .ident = "IBM System Summit2",
2224 .matches = {
2225 DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
2226 DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
2227 },
2228 },
2229 {}
2230};
2231
2232static void __cpuinit dmi_check_multi(void)
2233{
2234 if (multi_checked)
2235 return;
2236
2237 dmi_check_system(multi_dmi_table);
2238 multi_checked = 1;
2239}
2240
2241/*
2242 * apic_is_clustered_box() -- Check if we can expect good TSC
2243 *
2244 * Thus far, the major user of this is IBM's Summit2 series:
2245 * Clustered boxes may have unsynced TSC problems if they are
2246 * multi-chassis.
2247 * Use DMI to check them
2248 */
2249__cpuinit int apic_is_clustered_box(void)
2250{
2251 dmi_check_multi();
2252 if (multi)
2184 return 1; 2253 return 1;
2185 2254
2255 if (!is_vsmp_box())
2256 return 0;
2257
2186 /* 2258 /*
2187 * If clusters > 2, then should be multi-chassis. 2259 * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
2188 * May have to revisit this when multi-core + hyperthreaded CPUs come 2260 * not guaranteed to be synced between boards
2189 * out, but AFAIK this will work even for them.
2190 */ 2261 */
2191 return (clusters > 2); 2262 if (apic_cluster_num() > 1)
2263 return 1;
2264
2265 return 0;
2192} 2266}
2193#endif 2267#endif
2194 2268
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 306e5e88fb6f..d0c99abc26c3 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -161,7 +161,7 @@ static int flat_apic_id_registered(void)
161 161
162static int flat_phys_pkg_id(int initial_apic_id, int index_msb) 162static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
163{ 163{
164 return hard_smp_processor_id() >> index_msb; 164 return initial_apic_id >> index_msb;
165} 165}
166 166
167struct apic apic_flat = { 167struct apic apic_flat = {
@@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
235 * regardless of how many processors are present (x86_64 ES7000 235 * regardless of how many processors are present (x86_64 ES7000
236 * is an example). 236 * is an example).
237 */ 237 */
238 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && 238 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
239 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { 239 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
240 printk(KERN_DEBUG "system APIC only can use physical flat"); 240 printk(KERN_DEBUG "system APIC only can use physical flat");
241 return 1; 241 return 1;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 302947775575..69328ac8de9c 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi)
145 return gsi; 145 return gsi;
146} 146}
147 147
148static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) 148static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
149{ 149{
150 unsigned long vect = 0, psaival = 0; 150 unsigned long vect = 0, psaival = 0;
151 151
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 30da617d18e4..4d0216fcb36c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -59,6 +59,7 @@
59#include <asm/setup.h> 59#include <asm/setup.h>
60#include <asm/irq_remapping.h> 60#include <asm/irq_remapping.h>
61#include <asm/hpet.h> 61#include <asm/hpet.h>
62#include <asm/hw_irq.h>
62#include <asm/uv/uv_hub.h> 63#include <asm/uv/uv_hub.h>
63#include <asm/uv/uv_irq.h> 64#include <asm/uv/uv_irq.h>
64 65
@@ -129,12 +130,9 @@ struct irq_pin_list {
129 struct irq_pin_list *next; 130 struct irq_pin_list *next;
130}; 131};
131 132
132static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) 133static struct irq_pin_list *get_one_free_irq_2_pin(int node)
133{ 134{
134 struct irq_pin_list *pin; 135 struct irq_pin_list *pin;
135 int node;
136
137 node = cpu_to_node(cpu);
138 136
139 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); 137 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
140 138
@@ -148,9 +146,6 @@ struct irq_cfg {
148 unsigned move_cleanup_count; 146 unsigned move_cleanup_count;
149 u8 vector; 147 u8 vector;
150 u8 move_in_progress : 1; 148 u8 move_in_progress : 1;
151#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
152 u8 move_desc_pending : 1;
153#endif
154}; 149};
155 150
156/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 151/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -182,16 +177,18 @@ int __init arch_early_irq_init(void)
182 struct irq_cfg *cfg; 177 struct irq_cfg *cfg;
183 struct irq_desc *desc; 178 struct irq_desc *desc;
184 int count; 179 int count;
180 int node;
185 int i; 181 int i;
186 182
187 cfg = irq_cfgx; 183 cfg = irq_cfgx;
188 count = ARRAY_SIZE(irq_cfgx); 184 count = ARRAY_SIZE(irq_cfgx);
185 node= cpu_to_node(boot_cpu_id);
189 186
190 for (i = 0; i < count; i++) { 187 for (i = 0; i < count; i++) {
191 desc = irq_to_desc(i); 188 desc = irq_to_desc(i);
192 desc->chip_data = &cfg[i]; 189 desc->chip_data = &cfg[i];
193 alloc_bootmem_cpumask_var(&cfg[i].domain); 190 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
194 alloc_bootmem_cpumask_var(&cfg[i].old_domain); 191 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
195 if (i < NR_IRQS_LEGACY) 192 if (i < NR_IRQS_LEGACY)
196 cpumask_setall(cfg[i].domain); 193 cpumask_setall(cfg[i].domain);
197 } 194 }
@@ -212,12 +209,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
212 return cfg; 209 return cfg;
213} 210}
214 211
215static struct irq_cfg *get_one_free_irq_cfg(int cpu) 212static struct irq_cfg *get_one_free_irq_cfg(int node)
216{ 213{
217 struct irq_cfg *cfg; 214 struct irq_cfg *cfg;
218 int node;
219
220 node = cpu_to_node(cpu);
221 215
222 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); 216 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
223 if (cfg) { 217 if (cfg) {
@@ -238,13 +232,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
238 return cfg; 232 return cfg;
239} 233}
240 234
241int arch_init_chip_data(struct irq_desc *desc, int cpu) 235int arch_init_chip_data(struct irq_desc *desc, int node)
242{ 236{
243 struct irq_cfg *cfg; 237 struct irq_cfg *cfg;
244 238
245 cfg = desc->chip_data; 239 cfg = desc->chip_data;
246 if (!cfg) { 240 if (!cfg) {
247 desc->chip_data = get_one_free_irq_cfg(cpu); 241 desc->chip_data = get_one_free_irq_cfg(node);
248 if (!desc->chip_data) { 242 if (!desc->chip_data) {
249 printk(KERN_ERR "can not alloc irq_cfg\n"); 243 printk(KERN_ERR "can not alloc irq_cfg\n");
250 BUG_ON(1); 244 BUG_ON(1);
@@ -254,10 +248,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)
254 return 0; 248 return 0;
255} 249}
256 250
257#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC 251/* for move_irq_desc */
258
259static void 252static void
260init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) 253init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
261{ 254{
262 struct irq_pin_list *old_entry, *head, *tail, *entry; 255 struct irq_pin_list *old_entry, *head, *tail, *entry;
263 256
@@ -266,7 +259,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
266 if (!old_entry) 259 if (!old_entry)
267 return; 260 return;
268 261
269 entry = get_one_free_irq_2_pin(cpu); 262 entry = get_one_free_irq_2_pin(node);
270 if (!entry) 263 if (!entry)
271 return; 264 return;
272 265
@@ -276,7 +269,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
276 tail = entry; 269 tail = entry;
277 old_entry = old_entry->next; 270 old_entry = old_entry->next;
278 while (old_entry) { 271 while (old_entry) {
279 entry = get_one_free_irq_2_pin(cpu); 272 entry = get_one_free_irq_2_pin(node);
280 if (!entry) { 273 if (!entry) {
281 entry = head; 274 entry = head;
282 while (entry) { 275 while (entry) {
@@ -316,12 +309,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
316} 309}
317 310
318void arch_init_copy_chip_data(struct irq_desc *old_desc, 311void arch_init_copy_chip_data(struct irq_desc *old_desc,
319 struct irq_desc *desc, int cpu) 312 struct irq_desc *desc, int node)
320{ 313{
321 struct irq_cfg *cfg; 314 struct irq_cfg *cfg;
322 struct irq_cfg *old_cfg; 315 struct irq_cfg *old_cfg;
323 316
324 cfg = get_one_free_irq_cfg(cpu); 317 cfg = get_one_free_irq_cfg(node);
325 318
326 if (!cfg) 319 if (!cfg)
327 return; 320 return;
@@ -332,7 +325,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
332 325
333 memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); 326 memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
334 327
335 init_copy_irq_2_pin(old_cfg, cfg, cpu); 328 init_copy_irq_2_pin(old_cfg, cfg, node);
336} 329}
337 330
338static void free_irq_cfg(struct irq_cfg *old_cfg) 331static void free_irq_cfg(struct irq_cfg *old_cfg)
@@ -356,19 +349,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
356 old_desc->chip_data = NULL; 349 old_desc->chip_data = NULL;
357 } 350 }
358} 351}
359 352/* end for move_irq_desc */
360static void
361set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
362{
363 struct irq_cfg *cfg = desc->chip_data;
364
365 if (!cfg->move_in_progress) {
366 /* it means that domain is not changed */
367 if (!cpumask_intersects(desc->affinity, mask))
368 cfg->move_desc_pending = 1;
369 }
370}
371#endif
372 353
373#else 354#else
374static struct irq_cfg *irq_cfg(unsigned int irq) 355static struct irq_cfg *irq_cfg(unsigned int irq)
@@ -378,13 +359,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
378 359
379#endif 360#endif
380 361
381#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
382static inline void
383set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
384{
385}
386#endif
387
388struct io_apic { 362struct io_apic {
389 unsigned int index; 363 unsigned int index;
390 unsigned int unused[3]; 364 unsigned int unused[3];
@@ -488,7 +462,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
488static void 462static void
489__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 463__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
490{ 464{
491 union entry_union eu; 465 union entry_union eu = {{0, 0}};
466
492 eu.entry = e; 467 eu.entry = e;
493 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 468 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
494 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 469 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
@@ -518,132 +493,18 @@ static void ioapic_mask_entry(int apic, int pin)
518 spin_unlock_irqrestore(&ioapic_lock, flags); 493 spin_unlock_irqrestore(&ioapic_lock, flags);
519} 494}
520 495
521#ifdef CONFIG_SMP
522static void send_cleanup_vector(struct irq_cfg *cfg)
523{
524 cpumask_var_t cleanup_mask;
525
526 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
527 unsigned int i;
528 cfg->move_cleanup_count = 0;
529 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
530 cfg->move_cleanup_count++;
531 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
532 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
533 } else {
534 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
535 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
536 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
537 free_cpumask_var(cleanup_mask);
538 }
539 cfg->move_in_progress = 0;
540}
541
542static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
543{
544 int apic, pin;
545 struct irq_pin_list *entry;
546 u8 vector = cfg->vector;
547
548 entry = cfg->irq_2_pin;
549 for (;;) {
550 unsigned int reg;
551
552 if (!entry)
553 break;
554
555 apic = entry->apic;
556 pin = entry->pin;
557 /*
558 * With interrupt-remapping, destination information comes
559 * from interrupt-remapping table entry.
560 */
561 if (!irq_remapped(irq))
562 io_apic_write(apic, 0x11 + pin*2, dest);
563 reg = io_apic_read(apic, 0x10 + pin*2);
564 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
565 reg |= vector;
566 io_apic_modify(apic, 0x10 + pin*2, reg);
567 if (!entry->next)
568 break;
569 entry = entry->next;
570 }
571}
572
573static int
574assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
575
576/*
577 * Either sets desc->affinity to a valid value, and returns
578 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
579 * leaves desc->affinity untouched.
580 */
581static unsigned int
582set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
583{
584 struct irq_cfg *cfg;
585 unsigned int irq;
586
587 if (!cpumask_intersects(mask, cpu_online_mask))
588 return BAD_APICID;
589
590 irq = desc->irq;
591 cfg = desc->chip_data;
592 if (assign_irq_vector(irq, cfg, mask))
593 return BAD_APICID;
594
595 /* check that before desc->addinity get updated */
596 set_extra_move_desc(desc, mask);
597
598 cpumask_copy(desc->affinity, mask);
599
600 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
601}
602
603static void
604set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
605{
606 struct irq_cfg *cfg;
607 unsigned long flags;
608 unsigned int dest;
609 unsigned int irq;
610
611 irq = desc->irq;
612 cfg = desc->chip_data;
613
614 spin_lock_irqsave(&ioapic_lock, flags);
615 dest = set_desc_affinity(desc, mask);
616 if (dest != BAD_APICID) {
617 /* Only the high 8 bits are valid. */
618 dest = SET_APIC_LOGICAL_ID(dest);
619 __target_IO_APIC_irq(irq, dest, cfg);
620 }
621 spin_unlock_irqrestore(&ioapic_lock, flags);
622}
623
624static void
625set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
626{
627 struct irq_desc *desc;
628
629 desc = irq_to_desc(irq);
630
631 set_ioapic_affinity_irq_desc(desc, mask);
632}
633#endif /* CONFIG_SMP */
634
635/* 496/*
636 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are 497 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
637 * shared ISA-space IRQs, so we have to support them. We are super 498 * shared ISA-space IRQs, so we have to support them. We are super
638 * fast in the common case, and fast for shared ISA-space IRQs. 499 * fast in the common case, and fast for shared ISA-space IRQs.
639 */ 500 */
640static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) 501static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
641{ 502{
642 struct irq_pin_list *entry; 503 struct irq_pin_list *entry;
643 504
644 entry = cfg->irq_2_pin; 505 entry = cfg->irq_2_pin;
645 if (!entry) { 506 if (!entry) {
646 entry = get_one_free_irq_2_pin(cpu); 507 entry = get_one_free_irq_2_pin(node);
647 if (!entry) { 508 if (!entry) {
648 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", 509 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
649 apic, pin); 510 apic, pin);
@@ -663,7 +524,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
663 entry = entry->next; 524 entry = entry->next;
664 } 525 }
665 526
666 entry->next = get_one_free_irq_2_pin(cpu); 527 entry->next = get_one_free_irq_2_pin(node);
667 entry = entry->next; 528 entry = entry->next;
668 entry->apic = apic; 529 entry->apic = apic;
669 entry->pin = pin; 530 entry->pin = pin;
@@ -672,7 +533,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
672/* 533/*
673 * Reroute an IRQ to a different pin. 534 * Reroute an IRQ to a different pin.
674 */ 535 */
675static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, 536static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
676 int oldapic, int oldpin, 537 int oldapic, int oldpin,
677 int newapic, int newpin) 538 int newapic, int newpin)
678{ 539{
@@ -692,7 +553,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
692 553
693 /* why? call replace before add? */ 554 /* why? call replace before add? */
694 if (!replaced) 555 if (!replaced)
695 add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); 556 add_pin_to_irq_node(cfg, node, newapic, newpin);
696} 557}
697 558
698static inline void io_apic_modify_irq(struct irq_cfg *cfg, 559static inline void io_apic_modify_irq(struct irq_cfg *cfg,
@@ -850,7 +711,6 @@ static int __init ioapic_pirq_setup(char *str)
850__setup("pirq=", ioapic_pirq_setup); 711__setup("pirq=", ioapic_pirq_setup);
851#endif /* CONFIG_X86_32 */ 712#endif /* CONFIG_X86_32 */
852 713
853#ifdef CONFIG_INTR_REMAP
854struct IO_APIC_route_entry **alloc_ioapic_entries(void) 714struct IO_APIC_route_entry **alloc_ioapic_entries(void)
855{ 715{
856 int apic; 716 int apic;
@@ -948,20 +808,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
948 return 0; 808 return 0;
949} 809}
950 810
951void reinit_intr_remapped_IO_APIC(int intr_remapping,
952 struct IO_APIC_route_entry **ioapic_entries)
953
954{
955 /*
956 * for now plain restore of previous settings.
957 * TBD: In the case of OS enabling interrupt-remapping,
958 * IO-APIC RTE's need to be setup to point to interrupt-remapping
959 * table entries. for now, do a plain restore, and wait for
960 * the setup_IO_APIC_irqs() to do proper initialization.
961 */
962 restore_IO_APIC_setup(ioapic_entries);
963}
964
965void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) 811void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
966{ 812{
967 int apic; 813 int apic;
@@ -971,7 +817,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
971 817
972 kfree(ioapic_entries); 818 kfree(ioapic_entries);
973} 819}
974#endif
975 820
976/* 821/*
977 * Find the IRQ entry number of a certain pin. 822 * Find the IRQ entry number of a certain pin.
@@ -1032,54 +877,6 @@ static int __init find_isa_irq_apic(int irq, int type)
1032 return -1; 877 return -1;
1033} 878}
1034 879
1035/*
1036 * Find a specific PCI IRQ entry.
1037 * Not an __init, possibly needed by modules
1038 */
1039static int pin_2_irq(int idx, int apic, int pin);
1040
1041int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
1042{
1043 int apic, i, best_guess = -1;
1044
1045 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
1046 bus, slot, pin);
1047 if (test_bit(bus, mp_bus_not_pci)) {
1048 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
1049 return -1;
1050 }
1051 for (i = 0; i < mp_irq_entries; i++) {
1052 int lbus = mp_irqs[i].srcbus;
1053
1054 for (apic = 0; apic < nr_ioapics; apic++)
1055 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
1056 mp_irqs[i].dstapic == MP_APIC_ALL)
1057 break;
1058
1059 if (!test_bit(lbus, mp_bus_not_pci) &&
1060 !mp_irqs[i].irqtype &&
1061 (bus == lbus) &&
1062 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
1063 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
1064
1065 if (!(apic || IO_APIC_IRQ(irq)))
1066 continue;
1067
1068 if (pin == (mp_irqs[i].srcbusirq & 3))
1069 return irq;
1070 /*
1071 * Use the first all-but-pin matching entry as a
1072 * best-guess fuzzy result for broken mptables.
1073 */
1074 if (best_guess < 0)
1075 best_guess = irq;
1076 }
1077 }
1078 return best_guess;
1079}
1080
1081EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
1082
1083#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 880#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1084/* 881/*
1085 * EISA Edge/Level control register, ELCR 882 * EISA Edge/Level control register, ELCR
@@ -1298,6 +1095,64 @@ static int pin_2_irq(int idx, int apic, int pin)
1298 return irq; 1095 return irq;
1299} 1096}
1300 1097
1098/*
1099 * Find a specific PCI IRQ entry.
1100 * Not an __init, possibly needed by modules
1101 */
1102int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
1103 struct io_apic_irq_attr *irq_attr)
1104{
1105 int apic, i, best_guess = -1;
1106
1107 apic_printk(APIC_DEBUG,
1108 "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
1109 bus, slot, pin);
1110 if (test_bit(bus, mp_bus_not_pci)) {
1111 apic_printk(APIC_VERBOSE,
1112 "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
1113 return -1;
1114 }
1115 for (i = 0; i < mp_irq_entries; i++) {
1116 int lbus = mp_irqs[i].srcbus;
1117
1118 for (apic = 0; apic < nr_ioapics; apic++)
1119 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
1120 mp_irqs[i].dstapic == MP_APIC_ALL)
1121 break;
1122
1123 if (!test_bit(lbus, mp_bus_not_pci) &&
1124 !mp_irqs[i].irqtype &&
1125 (bus == lbus) &&
1126 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
1127 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
1128
1129 if (!(apic || IO_APIC_IRQ(irq)))
1130 continue;
1131
1132 if (pin == (mp_irqs[i].srcbusirq & 3)) {
1133 set_io_apic_irq_attr(irq_attr, apic,
1134 mp_irqs[i].dstirq,
1135 irq_trigger(i),
1136 irq_polarity(i));
1137 return irq;
1138 }
1139 /*
1140 * Use the first all-but-pin matching entry as a
1141 * best-guess fuzzy result for broken mptables.
1142 */
1143 if (best_guess < 0) {
1144 set_io_apic_irq_attr(irq_attr, apic,
1145 mp_irqs[i].dstirq,
1146 irq_trigger(i),
1147 irq_polarity(i));
1148 best_guess = irq;
1149 }
1150 }
1151 }
1152 return best_guess;
1153}
1154EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
1155
1301void lock_vector_lock(void) 1156void lock_vector_lock(void)
1302{ 1157{
1303 /* Used to the online set of cpus does not change 1158 /* Used to the online set of cpus does not change
@@ -1559,6 +1414,9 @@ int setup_ioapic_entry(int apic_id, int irq,
1559 irte.vector = vector; 1414 irte.vector = vector;
1560 irte.dest_id = IRTE_DEST(destination); 1415 irte.dest_id = IRTE_DEST(destination);
1561 1416
1417 /* Set source-id of interrupt request */
1418 set_ioapic_sid(&irte, apic_id);
1419
1562 modify_irte(irq, &irte); 1420 modify_irte(irq, &irte);
1563 1421
1564 ir_entry->index2 = (index >> 15) & 0x1; 1422 ir_entry->index2 = (index >> 15) & 0x1;
@@ -1628,58 +1486,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1628 ioapic_write_entry(apic_id, pin, entry); 1486 ioapic_write_entry(apic_id, pin, entry);
1629} 1487}
1630 1488
1489static struct {
1490 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
1491} mp_ioapic_routing[MAX_IO_APICS];
1492
1631static void __init setup_IO_APIC_irqs(void) 1493static void __init setup_IO_APIC_irqs(void)
1632{ 1494{
1633 int apic_id, pin, idx, irq; 1495 int apic_id = 0, pin, idx, irq;
1634 int notcon = 0; 1496 int notcon = 0;
1635 struct irq_desc *desc; 1497 struct irq_desc *desc;
1636 struct irq_cfg *cfg; 1498 struct irq_cfg *cfg;
1637 int cpu = boot_cpu_id; 1499 int node = cpu_to_node(boot_cpu_id);
1638 1500
1639 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1501 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1640 1502
1641 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { 1503#ifdef CONFIG_ACPI
1642 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1504 if (!acpi_disabled && acpi_ioapic) {
1643 1505 apic_id = mp_find_ioapic(0);
1644 idx = find_irq_entry(apic_id, pin, mp_INT); 1506 if (apic_id < 0)
1645 if (idx == -1) { 1507 apic_id = 0;
1646 if (!notcon) { 1508 }
1647 notcon = 1; 1509#endif
1648 apic_printk(APIC_VERBOSE,
1649 KERN_DEBUG " %d-%d",
1650 mp_ioapics[apic_id].apicid, pin);
1651 } else
1652 apic_printk(APIC_VERBOSE, " %d-%d",
1653 mp_ioapics[apic_id].apicid, pin);
1654 continue;
1655 }
1656 if (notcon) {
1657 apic_printk(APIC_VERBOSE,
1658 " (apicid-pin) not connected\n");
1659 notcon = 0;
1660 }
1661 1510
1662 irq = pin_2_irq(idx, apic_id, pin); 1511 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1512 idx = find_irq_entry(apic_id, pin, mp_INT);
1513 if (idx == -1) {
1514 if (!notcon) {
1515 notcon = 1;
1516 apic_printk(APIC_VERBOSE,
1517 KERN_DEBUG " %d-%d",
1518 mp_ioapics[apic_id].apicid, pin);
1519 } else
1520 apic_printk(APIC_VERBOSE, " %d-%d",
1521 mp_ioapics[apic_id].apicid, pin);
1522 continue;
1523 }
1524 if (notcon) {
1525 apic_printk(APIC_VERBOSE,
1526 " (apicid-pin) not connected\n");
1527 notcon = 0;
1528 }
1663 1529
1664 /* 1530 irq = pin_2_irq(idx, apic_id, pin);
1665 * Skip the timer IRQ if there's a quirk handler
1666 * installed and if it returns 1:
1667 */
1668 if (apic->multi_timer_check &&
1669 apic->multi_timer_check(apic_id, irq))
1670 continue;
1671 1531
1672 desc = irq_to_desc_alloc_cpu(irq, cpu); 1532 /*
1673 if (!desc) { 1533 * Skip the timer IRQ if there's a quirk handler
1674 printk(KERN_INFO "can not get irq_desc for %d\n", irq); 1534 * installed and if it returns 1:
1675 continue; 1535 */
1676 } 1536 if (apic->multi_timer_check &&
1677 cfg = desc->chip_data; 1537 apic->multi_timer_check(apic_id, irq))
1678 add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); 1538 continue;
1679 1539
1680 setup_IO_APIC_irq(apic_id, pin, irq, desc, 1540 desc = irq_to_desc_alloc_node(irq, node);
1681 irq_trigger(idx), irq_polarity(idx)); 1541 if (!desc) {
1542 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1543 continue;
1682 } 1544 }
1545 cfg = desc->chip_data;
1546 add_pin_to_irq_node(cfg, node, apic_id, pin);
1547 /*
1548 * don't mark it in pin_programmed, so later acpi could
1549 * set it correctly when irq < 16
1550 */
1551 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1552 irq_trigger(idx), irq_polarity(idx));
1683 } 1553 }
1684 1554
1685 if (notcon) 1555 if (notcon)
@@ -1869,7 +1739,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base)
1869 1739
1870__apicdebuginit(void) print_local_APIC(void *dummy) 1740__apicdebuginit(void) print_local_APIC(void *dummy)
1871{ 1741{
1872 unsigned int v, ver, maxlvt; 1742 unsigned int i, v, ver, maxlvt;
1873 u64 icr; 1743 u64 icr;
1874 1744
1875 if (apic_verbosity == APIC_QUIET) 1745 if (apic_verbosity == APIC_QUIET)
@@ -1957,6 +1827,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1957 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); 1827 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1958 v = apic_read(APIC_TDCR); 1828 v = apic_read(APIC_TDCR);
1959 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); 1829 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1830
1831 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
1832 v = apic_read(APIC_EFEAT);
1833 maxlvt = (v >> 16) & 0xff;
1834 printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
1835 v = apic_read(APIC_ECTRL);
1836 printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
1837 for (i = 0; i < maxlvt; i++) {
1838 v = apic_read(APIC_EILVTn(i));
1839 printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
1840 }
1841 }
1960 printk("\n"); 1842 printk("\n");
1961} 1843}
1962 1844
@@ -2005,6 +1887,11 @@ __apicdebuginit(void) print_PIC(void)
2005__apicdebuginit(int) print_all_ICs(void) 1887__apicdebuginit(int) print_all_ICs(void)
2006{ 1888{
2007 print_PIC(); 1889 print_PIC();
1890
1891 /* don't print out if apic is not there */
1892 if (!cpu_has_apic || disable_apic)
1893 return 0;
1894
2008 print_all_local_APICs(); 1895 print_all_local_APICs();
2009 print_IO_APIC(); 1896 print_IO_APIC();
2010 1897
@@ -2120,7 +2007,9 @@ void disable_IO_APIC(void)
2120 /* 2007 /*
2121 * Use virtual wire A mode when interrupt remapping is enabled. 2008 * Use virtual wire A mode when interrupt remapping is enabled.
2122 */ 2009 */
2123 disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); 2010 if (cpu_has_apic)
2011 disconnect_bsp_APIC(!intr_remapping_enabled &&
2012 ioapic_i8259.pin != -1);
2124} 2013}
2125 2014
2126#ifdef CONFIG_X86_32 2015#ifdef CONFIG_X86_32
@@ -2360,6 +2249,118 @@ static int ioapic_retrigger_irq(unsigned int irq)
2360 */ 2249 */
2361 2250
2362#ifdef CONFIG_SMP 2251#ifdef CONFIG_SMP
2252static void send_cleanup_vector(struct irq_cfg *cfg)
2253{
2254 cpumask_var_t cleanup_mask;
2255
2256 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
2257 unsigned int i;
2258 cfg->move_cleanup_count = 0;
2259 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2260 cfg->move_cleanup_count++;
2261 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2262 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
2263 } else {
2264 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
2265 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
2266 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2267 free_cpumask_var(cleanup_mask);
2268 }
2269 cfg->move_in_progress = 0;
2270}
2271
2272static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
2273{
2274 int apic, pin;
2275 struct irq_pin_list *entry;
2276 u8 vector = cfg->vector;
2277
2278 entry = cfg->irq_2_pin;
2279 for (;;) {
2280 unsigned int reg;
2281
2282 if (!entry)
2283 break;
2284
2285 apic = entry->apic;
2286 pin = entry->pin;
2287 /*
2288 * With interrupt-remapping, destination information comes
2289 * from interrupt-remapping table entry.
2290 */
2291 if (!irq_remapped(irq))
2292 io_apic_write(apic, 0x11 + pin*2, dest);
2293 reg = io_apic_read(apic, 0x10 + pin*2);
2294 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2295 reg |= vector;
2296 io_apic_modify(apic, 0x10 + pin*2, reg);
2297 if (!entry->next)
2298 break;
2299 entry = entry->next;
2300 }
2301}
2302
2303static int
2304assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
2305
2306/*
2307 * Either sets desc->affinity to a valid value, and returns
2308 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
2309 * leaves desc->affinity untouched.
2310 */
2311static unsigned int
2312set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
2313{
2314 struct irq_cfg *cfg;
2315 unsigned int irq;
2316
2317 if (!cpumask_intersects(mask, cpu_online_mask))
2318 return BAD_APICID;
2319
2320 irq = desc->irq;
2321 cfg = desc->chip_data;
2322 if (assign_irq_vector(irq, cfg, mask))
2323 return BAD_APICID;
2324
2325 cpumask_copy(desc->affinity, mask);
2326
2327 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
2328}
2329
2330static int
2331set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2332{
2333 struct irq_cfg *cfg;
2334 unsigned long flags;
2335 unsigned int dest;
2336 unsigned int irq;
2337 int ret = -1;
2338
2339 irq = desc->irq;
2340 cfg = desc->chip_data;
2341
2342 spin_lock_irqsave(&ioapic_lock, flags);
2343 dest = set_desc_affinity(desc, mask);
2344 if (dest != BAD_APICID) {
2345 /* Only the high 8 bits are valid. */
2346 dest = SET_APIC_LOGICAL_ID(dest);
2347 __target_IO_APIC_irq(irq, dest, cfg);
2348 ret = 0;
2349 }
2350 spin_unlock_irqrestore(&ioapic_lock, flags);
2351
2352 return ret;
2353}
2354
2355static int
2356set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
2357{
2358 struct irq_desc *desc;
2359
2360 desc = irq_to_desc(irq);
2361
2362 return set_ioapic_affinity_irq_desc(desc, mask);
2363}
2363 2364
2364#ifdef CONFIG_INTR_REMAP 2365#ifdef CONFIG_INTR_REMAP
2365 2366
@@ -2374,26 +2375,25 @@ static int ioapic_retrigger_irq(unsigned int irq)
2374 * Real vector that is used for interrupting cpu will be coming from 2375 * Real vector that is used for interrupting cpu will be coming from
2375 * the interrupt-remapping table entry. 2376 * the interrupt-remapping table entry.
2376 */ 2377 */
2377static void 2378static int
2378migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2379migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2379{ 2380{
2380 struct irq_cfg *cfg; 2381 struct irq_cfg *cfg;
2381 struct irte irte; 2382 struct irte irte;
2382 unsigned int dest; 2383 unsigned int dest;
2383 unsigned int irq; 2384 unsigned int irq;
2385 int ret = -1;
2384 2386
2385 if (!cpumask_intersects(mask, cpu_online_mask)) 2387 if (!cpumask_intersects(mask, cpu_online_mask))
2386 return; 2388 return ret;
2387 2389
2388 irq = desc->irq; 2390 irq = desc->irq;
2389 if (get_irte(irq, &irte)) 2391 if (get_irte(irq, &irte))
2390 return; 2392 return ret;
2391 2393
2392 cfg = desc->chip_data; 2394 cfg = desc->chip_data;
2393 if (assign_irq_vector(irq, cfg, mask)) 2395 if (assign_irq_vector(irq, cfg, mask))
2394 return; 2396 return ret;
2395
2396 set_extra_move_desc(desc, mask);
2397 2397
2398 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); 2398 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2399 2399
@@ -2409,27 +2409,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2409 send_cleanup_vector(cfg); 2409 send_cleanup_vector(cfg);
2410 2410
2411 cpumask_copy(desc->affinity, mask); 2411 cpumask_copy(desc->affinity, mask);
2412
2413 return 0;
2412} 2414}
2413 2415
2414/* 2416/*
2415 * Migrates the IRQ destination in the process context. 2417 * Migrates the IRQ destination in the process context.
2416 */ 2418 */
2417static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2419static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2418 const struct cpumask *mask) 2420 const struct cpumask *mask)
2419{ 2421{
2420 migrate_ioapic_irq_desc(desc, mask); 2422 return migrate_ioapic_irq_desc(desc, mask);
2421} 2423}
2422static void set_ir_ioapic_affinity_irq(unsigned int irq, 2424static int set_ir_ioapic_affinity_irq(unsigned int irq,
2423 const struct cpumask *mask) 2425 const struct cpumask *mask)
2424{ 2426{
2425 struct irq_desc *desc = irq_to_desc(irq); 2427 struct irq_desc *desc = irq_to_desc(irq);
2426 2428
2427 set_ir_ioapic_affinity_irq_desc(desc, mask); 2429 return set_ir_ioapic_affinity_irq_desc(desc, mask);
2428} 2430}
2429#else 2431#else
2430static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2432static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2431 const struct cpumask *mask) 2433 const struct cpumask *mask)
2432{ 2434{
2435 return 0;
2433} 2436}
2434#endif 2437#endif
2435 2438
@@ -2491,86 +2494,19 @@ static void irq_complete_move(struct irq_desc **descp)
2491 struct irq_cfg *cfg = desc->chip_data; 2494 struct irq_cfg *cfg = desc->chip_data;
2492 unsigned vector, me; 2495 unsigned vector, me;
2493 2496
2494 if (likely(!cfg->move_in_progress)) { 2497 if (likely(!cfg->move_in_progress))
2495#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2496 if (likely(!cfg->move_desc_pending))
2497 return;
2498
2499 /* domain has not changed, but affinity did */
2500 me = smp_processor_id();
2501 if (cpumask_test_cpu(me, desc->affinity)) {
2502 *descp = desc = move_irq_desc(desc, me);
2503 /* get the new one */
2504 cfg = desc->chip_data;
2505 cfg->move_desc_pending = 0;
2506 }
2507#endif
2508 return; 2498 return;
2509 }
2510 2499
2511 vector = ~get_irq_regs()->orig_ax; 2500 vector = ~get_irq_regs()->orig_ax;
2512 me = smp_processor_id(); 2501 me = smp_processor_id();
2513 2502
2514 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { 2503 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2515#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2516 *descp = desc = move_irq_desc(desc, me);
2517 /* get the new one */
2518 cfg = desc->chip_data;
2519#endif
2520 send_cleanup_vector(cfg); 2504 send_cleanup_vector(cfg);
2521 }
2522} 2505}
2523#else 2506#else
2524static inline void irq_complete_move(struct irq_desc **descp) {} 2507static inline void irq_complete_move(struct irq_desc **descp) {}
2525#endif 2508#endif
2526 2509
2527static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2528{
2529 int apic, pin;
2530 struct irq_pin_list *entry;
2531
2532 entry = cfg->irq_2_pin;
2533 for (;;) {
2534
2535 if (!entry)
2536 break;
2537
2538 apic = entry->apic;
2539 pin = entry->pin;
2540 io_apic_eoi(apic, pin);
2541 entry = entry->next;
2542 }
2543}
2544
2545static void
2546eoi_ioapic_irq(struct irq_desc *desc)
2547{
2548 struct irq_cfg *cfg;
2549 unsigned long flags;
2550 unsigned int irq;
2551
2552 irq = desc->irq;
2553 cfg = desc->chip_data;
2554
2555 spin_lock_irqsave(&ioapic_lock, flags);
2556 __eoi_ioapic_irq(irq, cfg);
2557 spin_unlock_irqrestore(&ioapic_lock, flags);
2558}
2559
2560#ifdef CONFIG_X86_X2APIC
2561static void ack_x2apic_level(unsigned int irq)
2562{
2563 struct irq_desc *desc = irq_to_desc(irq);
2564 ack_x2APIC_irq();
2565 eoi_ioapic_irq(desc);
2566}
2567
2568static void ack_x2apic_edge(unsigned int irq)
2569{
2570 ack_x2APIC_irq();
2571}
2572#endif
2573
2574static void ack_apic_edge(unsigned int irq) 2510static void ack_apic_edge(unsigned int irq)
2575{ 2511{
2576 struct irq_desc *desc = irq_to_desc(irq); 2512 struct irq_desc *desc = irq_to_desc(irq);
@@ -2634,9 +2570,6 @@ static void ack_apic_level(unsigned int irq)
2634 */ 2570 */
2635 ack_APIC_irq(); 2571 ack_APIC_irq();
2636 2572
2637 if (irq_remapped(irq))
2638 eoi_ioapic_irq(desc);
2639
2640 /* Now we can move and renable the irq */ 2573 /* Now we can move and renable the irq */
2641 if (unlikely(do_unmask_irq)) { 2574 if (unlikely(do_unmask_irq)) {
2642 /* Only migrate the irq if the ack has been received. 2575 /* Only migrate the irq if the ack has been received.
@@ -2683,22 +2616,50 @@ static void ack_apic_level(unsigned int irq)
2683} 2616}
2684 2617
2685#ifdef CONFIG_INTR_REMAP 2618#ifdef CONFIG_INTR_REMAP
2619static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2620{
2621 int apic, pin;
2622 struct irq_pin_list *entry;
2623
2624 entry = cfg->irq_2_pin;
2625 for (;;) {
2626
2627 if (!entry)
2628 break;
2629
2630 apic = entry->apic;
2631 pin = entry->pin;
2632 io_apic_eoi(apic, pin);
2633 entry = entry->next;
2634 }
2635}
2636
2637static void
2638eoi_ioapic_irq(struct irq_desc *desc)
2639{
2640 struct irq_cfg *cfg;
2641 unsigned long flags;
2642 unsigned int irq;
2643
2644 irq = desc->irq;
2645 cfg = desc->chip_data;
2646
2647 spin_lock_irqsave(&ioapic_lock, flags);
2648 __eoi_ioapic_irq(irq, cfg);
2649 spin_unlock_irqrestore(&ioapic_lock, flags);
2650}
2651
2686static void ir_ack_apic_edge(unsigned int irq) 2652static void ir_ack_apic_edge(unsigned int irq)
2687{ 2653{
2688#ifdef CONFIG_X86_X2APIC 2654 ack_APIC_irq();
2689 if (x2apic_enabled())
2690 return ack_x2apic_edge(irq);
2691#endif
2692 return ack_apic_edge(irq);
2693} 2655}
2694 2656
2695static void ir_ack_apic_level(unsigned int irq) 2657static void ir_ack_apic_level(unsigned int irq)
2696{ 2658{
2697#ifdef CONFIG_X86_X2APIC 2659 struct irq_desc *desc = irq_to_desc(irq);
2698 if (x2apic_enabled()) 2660
2699 return ack_x2apic_level(irq); 2661 ack_APIC_irq();
2700#endif 2662 eoi_ioapic_irq(desc);
2701 return ack_apic_level(irq);
2702} 2663}
2703#endif /* CONFIG_INTR_REMAP */ 2664#endif /* CONFIG_INTR_REMAP */
2704 2665
@@ -2903,7 +2864,7 @@ static inline void __init check_timer(void)
2903{ 2864{
2904 struct irq_desc *desc = irq_to_desc(0); 2865 struct irq_desc *desc = irq_to_desc(0);
2905 struct irq_cfg *cfg = desc->chip_data; 2866 struct irq_cfg *cfg = desc->chip_data;
2906 int cpu = boot_cpu_id; 2867 int node = cpu_to_node(boot_cpu_id);
2907 int apic1, pin1, apic2, pin2; 2868 int apic1, pin1, apic2, pin2;
2908 unsigned long flags; 2869 unsigned long flags;
2909 int no_pin1 = 0; 2870 int no_pin1 = 0;
@@ -2969,7 +2930,7 @@ static inline void __init check_timer(void)
2969 * Ok, does IRQ0 through the IOAPIC work? 2930 * Ok, does IRQ0 through the IOAPIC work?
2970 */ 2931 */
2971 if (no_pin1) { 2932 if (no_pin1) {
2972 add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); 2933 add_pin_to_irq_node(cfg, node, apic1, pin1);
2973 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2934 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
2974 } else { 2935 } else {
2975 /* for edge trigger, setup_IO_APIC_irq already 2936 /* for edge trigger, setup_IO_APIC_irq already
@@ -3006,7 +2967,7 @@ static inline void __init check_timer(void)
3006 /* 2967 /*
3007 * legacy devices should be connected to IO APIC #0 2968 * legacy devices should be connected to IO APIC #0
3008 */ 2969 */
3009 replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); 2970 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
3010 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2971 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
3011 enable_8259A_irq(0); 2972 enable_8259A_irq(0);
3012 if (timer_irq_works()) { 2973 if (timer_irq_works()) {
@@ -3218,14 +3179,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY;
3218/* 3179/*
3219 * Dynamic irq allocate and deallocation 3180 * Dynamic irq allocate and deallocation
3220 */ 3181 */
3221unsigned int create_irq_nr(unsigned int irq_want) 3182unsigned int create_irq_nr(unsigned int irq_want, int node)
3222{ 3183{
3223 /* Allocate an unused irq */ 3184 /* Allocate an unused irq */
3224 unsigned int irq; 3185 unsigned int irq;
3225 unsigned int new; 3186 unsigned int new;
3226 unsigned long flags; 3187 unsigned long flags;
3227 struct irq_cfg *cfg_new = NULL; 3188 struct irq_cfg *cfg_new = NULL;
3228 int cpu = boot_cpu_id;
3229 struct irq_desc *desc_new = NULL; 3189 struct irq_desc *desc_new = NULL;
3230 3190
3231 irq = 0; 3191 irq = 0;
@@ -3234,7 +3194,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
3234 3194
3235 spin_lock_irqsave(&vector_lock, flags); 3195 spin_lock_irqsave(&vector_lock, flags);
3236 for (new = irq_want; new < nr_irqs; new++) { 3196 for (new = irq_want; new < nr_irqs; new++) {
3237 desc_new = irq_to_desc_alloc_cpu(new, cpu); 3197 desc_new = irq_to_desc_alloc_node(new, node);
3238 if (!desc_new) { 3198 if (!desc_new) {
3239 printk(KERN_INFO "can not get irq_desc for %d\n", new); 3199 printk(KERN_INFO "can not get irq_desc for %d\n", new);
3240 continue; 3200 continue;
@@ -3243,6 +3203,9 @@ unsigned int create_irq_nr(unsigned int irq_want)
3243 3203
3244 if (cfg_new->vector != 0) 3204 if (cfg_new->vector != 0)
3245 continue; 3205 continue;
3206
3207 desc_new = move_irq_desc(desc_new, node);
3208
3246 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 3209 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3247 irq = new; 3210 irq = new;
3248 break; 3211 break;
@@ -3260,11 +3223,12 @@ unsigned int create_irq_nr(unsigned int irq_want)
3260 3223
3261int create_irq(void) 3224int create_irq(void)
3262{ 3225{
3226 int node = cpu_to_node(boot_cpu_id);
3263 unsigned int irq_want; 3227 unsigned int irq_want;
3264 int irq; 3228 int irq;
3265 3229
3266 irq_want = nr_irqs_gsi; 3230 irq_want = nr_irqs_gsi;
3267 irq = create_irq_nr(irq_want); 3231 irq = create_irq_nr(irq_want, node);
3268 3232
3269 if (irq == 0) 3233 if (irq == 0)
3270 irq = -1; 3234 irq = -1;
@@ -3329,6 +3293,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3329 irte.vector = cfg->vector; 3293 irte.vector = cfg->vector;
3330 irte.dest_id = IRTE_DEST(dest); 3294 irte.dest_id = IRTE_DEST(dest);
3331 3295
3296 /* Set source-id of interrupt request */
3297 set_msi_sid(&irte, pdev);
3298
3332 modify_irte(irq, &irte); 3299 modify_irte(irq, &irte);
3333 3300
3334 msg->address_hi = MSI_ADDR_BASE_HI; 3301 msg->address_hi = MSI_ADDR_BASE_HI;
@@ -3366,7 +3333,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3366} 3333}
3367 3334
3368#ifdef CONFIG_SMP 3335#ifdef CONFIG_SMP
3369static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3336static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3370{ 3337{
3371 struct irq_desc *desc = irq_to_desc(irq); 3338 struct irq_desc *desc = irq_to_desc(irq);
3372 struct irq_cfg *cfg; 3339 struct irq_cfg *cfg;
@@ -3375,7 +3342,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3375 3342
3376 dest = set_desc_affinity(desc, mask); 3343 dest = set_desc_affinity(desc, mask);
3377 if (dest == BAD_APICID) 3344 if (dest == BAD_APICID)
3378 return; 3345 return -1;
3379 3346
3380 cfg = desc->chip_data; 3347 cfg = desc->chip_data;
3381 3348
@@ -3387,13 +3354,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3387 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3354 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3388 3355
3389 write_msi_msg_desc(desc, &msg); 3356 write_msi_msg_desc(desc, &msg);
3357
3358 return 0;
3390} 3359}
3391#ifdef CONFIG_INTR_REMAP 3360#ifdef CONFIG_INTR_REMAP
3392/* 3361/*
3393 * Migrate the MSI irq to another cpumask. This migration is 3362 * Migrate the MSI irq to another cpumask. This migration is
3394 * done in the process context using interrupt-remapping hardware. 3363 * done in the process context using interrupt-remapping hardware.
3395 */ 3364 */
3396static void 3365static int
3397ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3366ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3398{ 3367{
3399 struct irq_desc *desc = irq_to_desc(irq); 3368 struct irq_desc *desc = irq_to_desc(irq);
@@ -3402,11 +3371,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3402 struct irte irte; 3371 struct irte irte;
3403 3372
3404 if (get_irte(irq, &irte)) 3373 if (get_irte(irq, &irte))
3405 return; 3374 return -1;
3406 3375
3407 dest = set_desc_affinity(desc, mask); 3376 dest = set_desc_affinity(desc, mask);
3408 if (dest == BAD_APICID) 3377 if (dest == BAD_APICID)
3409 return; 3378 return -1;
3410 3379
3411 irte.vector = cfg->vector; 3380 irte.vector = cfg->vector;
3412 irte.dest_id = IRTE_DEST(dest); 3381 irte.dest_id = IRTE_DEST(dest);
@@ -3423,6 +3392,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3423 */ 3392 */
3424 if (cfg->move_in_progress) 3393 if (cfg->move_in_progress)
3425 send_cleanup_vector(cfg); 3394 send_cleanup_vector(cfg);
3395
3396 return 0;
3426} 3397}
3427 3398
3428#endif 3399#endif
@@ -3518,15 +3489,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3518 unsigned int irq_want; 3489 unsigned int irq_want;
3519 struct intel_iommu *iommu = NULL; 3490 struct intel_iommu *iommu = NULL;
3520 int index = 0; 3491 int index = 0;
3492 int node;
3521 3493
3522 /* x86 doesn't support multiple MSI yet */ 3494 /* x86 doesn't support multiple MSI yet */
3523 if (type == PCI_CAP_ID_MSI && nvec > 1) 3495 if (type == PCI_CAP_ID_MSI && nvec > 1)
3524 return 1; 3496 return 1;
3525 3497
3498 node = dev_to_node(&dev->dev);
3526 irq_want = nr_irqs_gsi; 3499 irq_want = nr_irqs_gsi;
3527 sub_handle = 0; 3500 sub_handle = 0;
3528 list_for_each_entry(msidesc, &dev->msi_list, list) { 3501 list_for_each_entry(msidesc, &dev->msi_list, list) {
3529 irq = create_irq_nr(irq_want); 3502 irq = create_irq_nr(irq_want, node);
3530 if (irq == 0) 3503 if (irq == 0)
3531 return -1; 3504 return -1;
3532 irq_want = irq + 1; 3505 irq_want = irq + 1;
@@ -3576,7 +3549,7 @@ void arch_teardown_msi_irq(unsigned int irq)
3576 3549
3577#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) 3550#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
3578#ifdef CONFIG_SMP 3551#ifdef CONFIG_SMP
3579static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3552static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3580{ 3553{
3581 struct irq_desc *desc = irq_to_desc(irq); 3554 struct irq_desc *desc = irq_to_desc(irq);
3582 struct irq_cfg *cfg; 3555 struct irq_cfg *cfg;
@@ -3585,7 +3558,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3585 3558
3586 dest = set_desc_affinity(desc, mask); 3559 dest = set_desc_affinity(desc, mask);
3587 if (dest == BAD_APICID) 3560 if (dest == BAD_APICID)
3588 return; 3561 return -1;
3589 3562
3590 cfg = desc->chip_data; 3563 cfg = desc->chip_data;
3591 3564
@@ -3597,11 +3570,13 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3597 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3570 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3598 3571
3599 dmar_msi_write(irq, &msg); 3572 dmar_msi_write(irq, &msg);
3573
3574 return 0;
3600} 3575}
3601 3576
3602#endif /* CONFIG_SMP */ 3577#endif /* CONFIG_SMP */
3603 3578
3604struct irq_chip dmar_msi_type = { 3579static struct irq_chip dmar_msi_type = {
3605 .name = "DMAR_MSI", 3580 .name = "DMAR_MSI",
3606 .unmask = dmar_msi_unmask, 3581 .unmask = dmar_msi_unmask,
3607 .mask = dmar_msi_mask, 3582 .mask = dmar_msi_mask,
@@ -3630,7 +3605,7 @@ int arch_setup_dmar_msi(unsigned int irq)
3630#ifdef CONFIG_HPET_TIMER 3605#ifdef CONFIG_HPET_TIMER
3631 3606
3632#ifdef CONFIG_SMP 3607#ifdef CONFIG_SMP
3633static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3608static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3634{ 3609{
3635 struct irq_desc *desc = irq_to_desc(irq); 3610 struct irq_desc *desc = irq_to_desc(irq);
3636 struct irq_cfg *cfg; 3611 struct irq_cfg *cfg;
@@ -3639,7 +3614,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3639 3614
3640 dest = set_desc_affinity(desc, mask); 3615 dest = set_desc_affinity(desc, mask);
3641 if (dest == BAD_APICID) 3616 if (dest == BAD_APICID)
3642 return; 3617 return -1;
3643 3618
3644 cfg = desc->chip_data; 3619 cfg = desc->chip_data;
3645 3620
@@ -3651,6 +3626,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3651 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3626 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3652 3627
3653 hpet_msi_write(irq, &msg); 3628 hpet_msi_write(irq, &msg);
3629
3630 return 0;
3654} 3631}
3655 3632
3656#endif /* CONFIG_SMP */ 3633#endif /* CONFIG_SMP */
@@ -3707,7 +3684,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3707 write_ht_irq_msg(irq, &msg); 3684 write_ht_irq_msg(irq, &msg);
3708} 3685}
3709 3686
3710static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) 3687static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3711{ 3688{
3712 struct irq_desc *desc = irq_to_desc(irq); 3689 struct irq_desc *desc = irq_to_desc(irq);
3713 struct irq_cfg *cfg; 3690 struct irq_cfg *cfg;
@@ -3715,11 +3692,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3715 3692
3716 dest = set_desc_affinity(desc, mask); 3693 dest = set_desc_affinity(desc, mask);
3717 if (dest == BAD_APICID) 3694 if (dest == BAD_APICID)
3718 return; 3695 return -1;
3719 3696
3720 cfg = desc->chip_data; 3697 cfg = desc->chip_data;
3721 3698
3722 target_ht_irq(irq, dest, cfg->vector); 3699 target_ht_irq(irq, dest, cfg->vector);
3700
3701 return 0;
3723} 3702}
3724 3703
3725#endif 3704#endif
@@ -3794,6 +3773,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3794 unsigned long flags; 3773 unsigned long flags;
3795 int err; 3774 int err;
3796 3775
3776 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3777
3797 cfg = irq_cfg(irq); 3778 cfg = irq_cfg(irq);
3798 3779
3799 err = assign_irq_vector(irq, cfg, eligible_cpu); 3780 err = assign_irq_vector(irq, cfg, eligible_cpu);
@@ -3807,15 +3788,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3807 3788
3808 mmr_value = 0; 3789 mmr_value = 0;
3809 entry = (struct uv_IO_APIC_route_entry *)&mmr_value; 3790 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3810 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); 3791 entry->vector = cfg->vector;
3811 3792 entry->delivery_mode = apic->irq_delivery_mode;
3812 entry->vector = cfg->vector; 3793 entry->dest_mode = apic->irq_dest_mode;
3813 entry->delivery_mode = apic->irq_delivery_mode; 3794 entry->polarity = 0;
3814 entry->dest_mode = apic->irq_dest_mode; 3795 entry->trigger = 0;
3815 entry->polarity = 0; 3796 entry->mask = 0;
3816 entry->trigger = 0; 3797 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3817 entry->mask = 0;
3818 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3819 3798
3820 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3799 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3821 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 3800 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3833,10 +3812,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3833 struct uv_IO_APIC_route_entry *entry; 3812 struct uv_IO_APIC_route_entry *entry;
3834 int mmr_pnode; 3813 int mmr_pnode;
3835 3814
3815 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3816
3836 mmr_value = 0; 3817 mmr_value = 0;
3837 entry = (struct uv_IO_APIC_route_entry *)&mmr_value; 3818 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3838 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3839
3840 entry->mask = 1; 3819 entry->mask = 1;
3841 3820
3842 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3821 mmr_pnode = uv_blade_to_pnode(mmr_blade);
@@ -3900,6 +3879,71 @@ int __init arch_probe_nr_irqs(void)
3900} 3879}
3901#endif 3880#endif
3902 3881
3882static int __io_apic_set_pci_routing(struct device *dev, int irq,
3883 struct io_apic_irq_attr *irq_attr)
3884{
3885 struct irq_desc *desc;
3886 struct irq_cfg *cfg;
3887 int node;
3888 int ioapic, pin;
3889 int trigger, polarity;
3890
3891 ioapic = irq_attr->ioapic;
3892 if (!IO_APIC_IRQ(irq)) {
3893 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3894 ioapic);
3895 return -EINVAL;
3896 }
3897
3898 if (dev)
3899 node = dev_to_node(dev);
3900 else
3901 node = cpu_to_node(boot_cpu_id);
3902
3903 desc = irq_to_desc_alloc_node(irq, node);
3904 if (!desc) {
3905 printk(KERN_INFO "can not get irq_desc %d\n", irq);
3906 return 0;
3907 }
3908
3909 pin = irq_attr->ioapic_pin;
3910 trigger = irq_attr->trigger;
3911 polarity = irq_attr->polarity;
3912
3913 /*
3914 * IRQs < 16 are already in the irq_2_pin[] map
3915 */
3916 if (irq >= NR_IRQS_LEGACY) {
3917 cfg = desc->chip_data;
3918 add_pin_to_irq_node(cfg, node, ioapic, pin);
3919 }
3920
3921 setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
3922
3923 return 0;
3924}
3925
3926int io_apic_set_pci_routing(struct device *dev, int irq,
3927 struct io_apic_irq_attr *irq_attr)
3928{
3929 int ioapic, pin;
3930 /*
3931 * Avoid pin reprogramming. PRTs typically include entries
3932 * with redundant pin->gsi mappings (but unique PCI devices);
3933 * we only program the IOAPIC on the first.
3934 */
3935 ioapic = irq_attr->ioapic;
3936 pin = irq_attr->ioapic_pin;
3937 if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
3938 pr_debug("Pin %d-%d already programmed\n",
3939 mp_ioapics[ioapic].apicid, pin);
3940 return 0;
3941 }
3942 set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
3943
3944 return __io_apic_set_pci_routing(dev, irq, irq_attr);
3945}
3946
3903/* -------------------------------------------------------------------------- 3947/* --------------------------------------------------------------------------
3904 ACPI-based IOAPIC Configuration 3948 ACPI-based IOAPIC Configuration
3905 -------------------------------------------------------------------------- */ 3949 -------------------------------------------------------------------------- */
@@ -3980,6 +4024,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3980 4024
3981 return apic_id; 4025 return apic_id;
3982} 4026}
4027#endif
3983 4028
3984int __init io_apic_get_version(int ioapic) 4029int __init io_apic_get_version(int ioapic)
3985{ 4030{
@@ -3992,39 +4037,6 @@ int __init io_apic_get_version(int ioapic)
3992 4037
3993 return reg_01.bits.version; 4038 return reg_01.bits.version;
3994} 4039}
3995#endif
3996
3997int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
3998{
3999 struct irq_desc *desc;
4000 struct irq_cfg *cfg;
4001 int cpu = boot_cpu_id;
4002
4003 if (!IO_APIC_IRQ(irq)) {
4004 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
4005 ioapic);
4006 return -EINVAL;
4007 }
4008
4009 desc = irq_to_desc_alloc_cpu(irq, cpu);
4010 if (!desc) {
4011 printk(KERN_INFO "can not get irq_desc %d\n", irq);
4012 return 0;
4013 }
4014
4015 /*
4016 * IRQs < 16 are already in the irq_2_pin[] map
4017 */
4018 if (irq >= NR_IRQS_LEGACY) {
4019 cfg = desc->chip_data;
4020 add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
4021 }
4022
4023 setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
4024
4025 return 0;
4026}
4027
4028 4040
4029int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) 4041int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4030{ 4042{
@@ -4055,51 +4067,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4055#ifdef CONFIG_SMP 4067#ifdef CONFIG_SMP
4056void __init setup_ioapic_dest(void) 4068void __init setup_ioapic_dest(void)
4057{ 4069{
4058 int pin, ioapic, irq, irq_entry; 4070 int pin, ioapic = 0, irq, irq_entry;
4059 struct irq_desc *desc; 4071 struct irq_desc *desc;
4060 struct irq_cfg *cfg;
4061 const struct cpumask *mask; 4072 const struct cpumask *mask;
4062 4073
4063 if (skip_ioapic_setup == 1) 4074 if (skip_ioapic_setup == 1)
4064 return; 4075 return;
4065 4076
4066 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { 4077#ifdef CONFIG_ACPI
4067 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 4078 if (!acpi_disabled && acpi_ioapic) {
4068 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 4079 ioapic = mp_find_ioapic(0);
4069 if (irq_entry == -1) 4080 if (ioapic < 0)
4070 continue; 4081 ioapic = 0;
4071 irq = pin_2_irq(irq_entry, ioapic, pin); 4082 }
4072 4083#endif
4073 /* setup_IO_APIC_irqs could fail to get vector for some device
4074 * when you have too many devices, because at that time only boot
4075 * cpu is online.
4076 */
4077 desc = irq_to_desc(irq);
4078 cfg = desc->chip_data;
4079 if (!cfg->vector) {
4080 setup_IO_APIC_irq(ioapic, pin, irq, desc,
4081 irq_trigger(irq_entry),
4082 irq_polarity(irq_entry));
4083 continue;
4084 4084
4085 } 4085 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4086 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4087 if (irq_entry == -1)
4088 continue;
4089 irq = pin_2_irq(irq_entry, ioapic, pin);
4086 4090
4087 /* 4091 desc = irq_to_desc(irq);
4088 * Honour affinities which have been set in early boot
4089 */
4090 if (desc->status &
4091 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4092 mask = desc->affinity;
4093 else
4094 mask = apic->target_cpus();
4095 4092
4096 if (intr_remapping_enabled) 4093 /*
4097 set_ir_ioapic_affinity_irq_desc(desc, mask); 4094 * Honour affinities which have been set in early boot
4098 else 4095 */
4099 set_ioapic_affinity_irq_desc(desc, mask); 4096 if (desc->status &
4100 } 4097 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4098 mask = desc->affinity;
4099 else
4100 mask = apic->target_cpus();
4101 4101
4102 if (intr_remapping_enabled)
4103 set_ir_ioapic_affinity_irq_desc(desc, mask);
4104 else
4105 set_ioapic_affinity_irq_desc(desc, mask);
4102 } 4106 }
4107
4103} 4108}
4104#endif 4109#endif
4105 4110
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index ce4fbfa315a1..b3025b43b63a 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
66 66
67static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
68{ 68{
69#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) 69#if defined(CONFIG_X86_NEW_MCE)
70 return atomic_read(&mce_entry) > 0; 70 return atomic_read(&mce_entry) > 0;
71#endif 71#endif
72 return 0; 72 return 0;
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)
104} 104}
105#endif 105#endif
106 106
107static void report_broken_nmi(int cpu, int *prev_nmi_count) 107static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
108{ 108{
109 printk(KERN_CONT "\n"); 109 printk(KERN_CONT "\n");
110 110
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 01eda2ac65e4..0c0182cc947d 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -20,23 +20,12 @@
20#include <asm/apic.h> 20#include <asm/apic.h>
21#include <asm/setup.h> 21#include <asm/setup.h>
22 22
23#include <linux/threads.h>
24#include <linux/cpumask.h>
25#include <asm/mpspec.h>
26#include <asm/fixmap.h>
27#include <asm/apicdef.h>
28#include <linux/kernel.h>
29#include <linux/string.h>
30#include <linux/smp.h> 23#include <linux/smp.h>
31#include <linux/init.h>
32#include <asm/ipi.h> 24#include <asm/ipi.h>
33 25
34#include <linux/smp.h>
35#include <linux/init.h>
36#include <linux/interrupt.h> 26#include <linux/interrupt.h>
37#include <asm/acpi.h> 27#include <asm/acpi.h>
38#include <asm/e820.h> 28#include <asm/e820.h>
39#include <asm/setup.h>
40 29
41#ifdef CONFIG_HOTPLUG_CPU 30#ifdef CONFIG_HOTPLUG_CPU
42#define DEFAULT_SEND_IPI (1) 31#define DEFAULT_SEND_IPI (1)
@@ -160,7 +149,6 @@ extern struct apic apic_summit;
160extern struct apic apic_bigsmp; 149extern struct apic apic_bigsmp;
161extern struct apic apic_es7000; 150extern struct apic apic_es7000;
162extern struct apic apic_es7000_cluster; 151extern struct apic apic_es7000_cluster;
163extern struct apic apic_default;
164 152
165struct apic *apic = &apic_default; 153struct apic *apic = &apic_default;
166EXPORT_SYMBOL_GPL(apic); 154EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 1783652bb0e5..bc3e880f9b82 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = {
50void __init default_setup_apic_routing(void) 50void __init default_setup_apic_routing(void)
51{ 51{
52#ifdef CONFIG_X86_X2APIC 52#ifdef CONFIG_X86_X2APIC
53 if (x2apic && (apic != &apic_x2apic_phys && 53 if (x2apic_mode && (apic != &apic_x2apic_phys &&
54#ifdef CONFIG_X86_UV 54#ifdef CONFIG_X86_UV
55 apic != &apic_x2apic_uv_x && 55 apic != &apic_x2apic_uv_x &&
56#endif 56#endif
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9cfe1f415d81..eafdfbd1ea95 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -44,7 +44,6 @@
44#include <asm/ipi.h> 44#include <asm/ipi.h>
45#include <linux/kernel.h> 45#include <linux/kernel.h>
46#include <linux/string.h> 46#include <linux/string.h>
47#include <linux/init.h>
48#include <linux/gfp.h> 47#include <linux/gfp.h>
49#include <linux/smp.h> 48#include <linux/smp.h>
50 49
@@ -173,13 +172,6 @@ static inline int is_WPEG(struct rio_detail *rio){
173 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); 172 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
174} 173}
175 174
176
177/* In clustered mode, the high nibble of APIC ID is a cluster number.
178 * The low nibble is a 4-bit bitmap. */
179#define XAPIC_DEST_CPUS_SHIFT 4
180#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
181#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
182
183#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) 175#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
184 176
185static const struct cpumask *summit_target_cpus(void) 177static const struct cpumask *summit_target_cpus(void)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 4a903e2f0d17..8e4cbb255c38 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -10,7 +10,7 @@
10#include <asm/apic.h> 10#include <asm/apic.h>
11#include <asm/ipi.h> 11#include <asm/ipi.h>
12 12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); 13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14 14
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{ 16{
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 2bda69352976..096d19aea2f7 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
105 cpumask_set_cpu(cpu, retmask); 105 cpumask_set_cpu(cpu, retmask);
106} 106}
107 107
108static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) 108static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
109{ 109{
110#ifdef CONFIG_SMP 110#ifdef CONFIG_SMP
111 unsigned long val; 111 unsigned long val;
@@ -463,7 +463,7 @@ static void uv_heartbeat(unsigned long ignored)
463 uv_set_scir_bits(bits); 463 uv_set_scir_bits(bits);
464 464
465 /* enable next timer period */ 465 /* enable next timer period */
466 mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); 466 mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
467} 467}
468 468
469static void __cpuinit uv_heartbeat_enable(int cpu) 469static void __cpuinit uv_heartbeat_enable(int cpu)
@@ -562,7 +562,7 @@ void __init uv_system_init(void)
562 union uvh_node_id_u node_id; 562 union uvh_node_id_u node_id;
563 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; 563 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
564 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; 564 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
565 int max_pnode = 0; 565 int gnode_extra, max_pnode = 0;
566 unsigned long mmr_base, present, paddr; 566 unsigned long mmr_base, present, paddr;
567 unsigned short pnode_mask; 567 unsigned short pnode_mask;
568 568
@@ -574,6 +574,13 @@ void __init uv_system_init(void)
574 mmr_base = 574 mmr_base =
575 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 575 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
576 ~UV_MMR_ENABLE; 576 ~UV_MMR_ENABLE;
577 pnode_mask = (1 << n_val) - 1;
578 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
579 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
580 gnode_upper = ((unsigned long)gnode_extra << m_val);
581 printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
582 n_val, m_val, gnode_upper, gnode_extra);
583
577 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 584 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
578 585
579 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) 586 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@ -583,15 +590,18 @@ void __init uv_system_init(void)
583 590
584 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 591 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
585 uv_blade_info = kmalloc(bytes, GFP_KERNEL); 592 uv_blade_info = kmalloc(bytes, GFP_KERNEL);
593 BUG_ON(!uv_blade_info);
586 594
587 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); 595 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
588 596
589 bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); 597 bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
590 uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); 598 uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
599 BUG_ON(!uv_node_to_blade);
591 memset(uv_node_to_blade, 255, bytes); 600 memset(uv_node_to_blade, 255, bytes);
592 601
593 bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); 602 bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
594 uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); 603 uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
604 BUG_ON(!uv_cpu_to_blade);
595 memset(uv_cpu_to_blade, 255, bytes); 605 memset(uv_cpu_to_blade, 255, bytes);
596 606
597 blade = 0; 607 blade = 0;
@@ -607,11 +617,6 @@ void __init uv_system_init(void)
607 } 617 }
608 } 618 }
609 619
610 pnode_mask = (1 << n_val) - 1;
611 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
612 gnode_upper = (((unsigned long)node_id.s.node_id) &
613 ~((1 << n_val) - 1)) << m_val;
614
615 uv_bios_init(); 620 uv_bios_init();
616 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 621 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
617 &sn_coherency_id, &sn_region_size); 622 &sn_coherency_id, &sn_region_size);
@@ -634,6 +639,7 @@ void __init uv_system_init(void)
634 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; 639 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
635 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 640 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
636 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 641 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
642 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
637 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 643 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
638 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 644 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
639 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 645 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 49e0939bac42..79302e9a33a4 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1233,9 +1233,9 @@ static int suspend(int vetoable)
1233 int err; 1233 int err;
1234 struct apm_user *as; 1234 struct apm_user *as;
1235 1235
1236 device_suspend(PMSG_SUSPEND); 1236 dpm_suspend_start(PMSG_SUSPEND);
1237 1237
1238 device_power_down(PMSG_SUSPEND); 1238 dpm_suspend_noirq(PMSG_SUSPEND);
1239 1239
1240 local_irq_disable(); 1240 local_irq_disable();
1241 sysdev_suspend(PMSG_SUSPEND); 1241 sysdev_suspend(PMSG_SUSPEND);
@@ -1259,9 +1259,9 @@ static int suspend(int vetoable)
1259 sysdev_resume(); 1259 sysdev_resume();
1260 local_irq_enable(); 1260 local_irq_enable();
1261 1261
1262 device_power_up(PMSG_RESUME); 1262 dpm_resume_noirq(PMSG_RESUME);
1263 1263
1264 device_resume(PMSG_RESUME); 1264 dpm_resume_end(PMSG_RESUME);
1265 queue_event(APM_NORMAL_RESUME, NULL); 1265 queue_event(APM_NORMAL_RESUME, NULL);
1266 spin_lock(&user_list_lock); 1266 spin_lock(&user_list_lock);
1267 for (as = user_list; as != NULL; as = as->next) { 1267 for (as = user_list; as != NULL; as = as->next) {
@@ -1277,7 +1277,7 @@ static void standby(void)
1277{ 1277{
1278 int err; 1278 int err;
1279 1279
1280 device_power_down(PMSG_SUSPEND); 1280 dpm_suspend_noirq(PMSG_SUSPEND);
1281 1281
1282 local_irq_disable(); 1282 local_irq_disable();
1283 sysdev_suspend(PMSG_SUSPEND); 1283 sysdev_suspend(PMSG_SUSPEND);
@@ -1291,7 +1291,7 @@ static void standby(void)
1291 sysdev_resume(); 1291 sysdev_resume();
1292 local_irq_enable(); 1292 local_irq_enable();
1293 1293
1294 device_power_up(PMSG_RESUME); 1294 dpm_resume_noirq(PMSG_RESUME);
1295} 1295}
1296 1296
1297static apm_event_t get_event(void) 1297static apm_event_t get_event(void)
@@ -1376,7 +1376,7 @@ static void check_events(void)
1376 ignore_bounce = 1; 1376 ignore_bounce = 1;
1377 if ((event != APM_NORMAL_RESUME) 1377 if ((event != APM_NORMAL_RESUME)
1378 || (ignore_normal_resume == 0)) { 1378 || (ignore_normal_resume == 0)) {
1379 device_resume(PMSG_RESUME); 1379 dpm_resume_end(PMSG_RESUME);
1380 queue_event(event, NULL); 1380 queue_event(event, NULL);
1381 } 1381 }
1382 ignore_normal_resume = 0; 1382 ignore_normal_resume = 0;
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 5a6aa1c1162f..dfdbf6403895 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -126,6 +126,7 @@ void foo(void)
126#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 126#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
127 BLANK(); 127 BLANK();
128 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 128 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
129 OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
129 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); 130 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
130 131
131 BLANK(); 132 BLANK();
@@ -146,4 +147,5 @@ void foo(void)
146 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 147 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
147 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 148 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
148 OFFSET(BP_version, boot_params, hdr.version); 149 OFFSET(BP_version, boot_params, hdr.version);
150 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
149} 151}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72f062fb4b5..898ecc47e129 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -125,6 +125,7 @@ int main(void)
125 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 125 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
126 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 126 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
127 OFFSET(BP_version, boot_params, hdr.version); 127 OFFSET(BP_version, boot_params, hdr.version);
128 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
128 129
129 BLANK(); 130 BLANK();
130 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 131 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e4..3efcb2b96a15 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details, features and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
25 25
26obj-$(CONFIG_X86_MCE) += mcheck/ 26obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
27obj-$(CONFIG_MTRR) += mtrr/
28obj-$(CONFIG_CPU_FREQ) += cpufreq/
29 27
30obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 28obj-$(CONFIG_X86_MCE) += mcheck/
29obj-$(CONFIG_MTRR) += mtrr/
30obj-$(CONFIG_CPU_FREQ) += cpufreq/
31
32obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
31 33
32quiet_cmd_mkcapflags = MKCAP $@ 34quiet_cmd_mkcapflags = MKCAP $@
33 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 35 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7e4a459daa64..e5b27d8f1b47 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -6,6 +6,7 @@
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/apic.h> 7#include <asm/apic.h>
8#include <asm/cpu.h> 8#include <asm/cpu.h>
9#include <asm/pci-direct.h>
9 10
10#ifdef CONFIG_X86_64 11#ifdef CONFIG_X86_64
11# include <asm/numa_64.h> 12# include <asm/numa_64.h>
@@ -272,7 +273,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
272#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 273#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
273 int cpu = smp_processor_id(); 274 int cpu = smp_processor_id();
274 int node; 275 int node;
275 unsigned apicid = hard_smp_processor_id(); 276 unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
276 277
277 node = c->phys_proc_id; 278 node = c->phys_proc_id;
278 if (apicid_to_node[apicid] != NUMA_NO_NODE) 279 if (apicid_to_node[apicid] != NUMA_NO_NODE)
@@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
351 (c->x86_model == 8 && c->x86_mask >= 8)) 352 (c->x86_model == 8 && c->x86_mask >= 8))
352 set_cpu_cap(c, X86_FEATURE_K6_MTRR); 353 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
353#endif 354#endif
355#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
356 /* check CPU config space for extended APIC ID */
357 if (c->x86 >= 0xf) {
358 unsigned int val;
359 val = read_pci_config(0, 24, 0, 0x68);
360 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
361 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
362 }
363#endif
354} 364}
355 365
356static void __cpuinit init_amd(struct cpuinfo_x86 *c) 366static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 77848d9fca68..6b26d4deada0 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
13#include <linux/io.h> 13#include <linux/io.h>
14 14
15#include <asm/stackprotector.h> 15#include <asm/stackprotector.h>
16#include <asm/perf_counter.h>
16#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
17#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -107,7 +108,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
107 /* data */ 108 /* data */
108 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, 109 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
109 110
110 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 111 [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } },
111 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, 112 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
112 GDT_STACK_CANARY_INIT 113 GDT_STACK_CANARY_INIT
113#endif 114#endif
@@ -299,7 +300,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
299 return NULL; /* Not found */ 300 return NULL; /* Not found */
300} 301}
301 302
302__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 303__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
304__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
303 305
304void load_percpu_segment(int cpu) 306void load_percpu_segment(int cpu)
305{ 307{
@@ -485,7 +487,6 @@ out:
485static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) 487static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
486{ 488{
487 char *v = c->x86_vendor_id; 489 char *v = c->x86_vendor_id;
488 static int printed;
489 int i; 490 int i;
490 491
491 for (i = 0; i < X86_VENDOR_NUM; i++) { 492 for (i = 0; i < X86_VENDOR_NUM; i++) {
@@ -502,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
502 } 503 }
503 } 504 }
504 505
505 if (!printed) { 506 printk_once(KERN_ERR
506 printed++; 507 "CPU: vendor_id '%s' unknown, using generic init.\n" \
507 printk(KERN_ERR 508 "CPU: Your system may be unstable.\n", v);
508 "CPU: vendor_id '%s' unknown, using generic init.\n", v);
509
510 printk(KERN_ERR "CPU: Your system may be unstable.\n");
511 }
512 509
513 c->x86_vendor = X86_VENDOR_UNKNOWN; 510 c->x86_vendor = X86_VENDOR_UNKNOWN;
514 this_cpu = &default_cpu; 511 this_cpu = &default_cpu;
@@ -768,6 +765,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
768 if (this_cpu->c_identify) 765 if (this_cpu->c_identify)
769 this_cpu->c_identify(c); 766 this_cpu->c_identify(c);
770 767
768 /* Clear/Set all flags overriden by options, after probe */
769 for (i = 0; i < NCAPINTS; i++) {
770 c->x86_capability[i] &= ~cpu_caps_cleared[i];
771 c->x86_capability[i] |= cpu_caps_set[i];
772 }
773
771#ifdef CONFIG_X86_64 774#ifdef CONFIG_X86_64
772 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); 775 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
773#endif 776#endif
@@ -813,6 +816,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
813#endif 816#endif
814 817
815 init_hypervisor(c); 818 init_hypervisor(c);
819
820 /*
821 * Clear/Set all flags overriden by options, need do it
822 * before following smp all cpus cap AND.
823 */
824 for (i = 0; i < NCAPINTS; i++) {
825 c->x86_capability[i] &= ~cpu_caps_cleared[i];
826 c->x86_capability[i] |= cpu_caps_set[i];
827 }
828
816 /* 829 /*
817 * On SMP, boot_cpu_data holds the common feature set between 830 * On SMP, boot_cpu_data holds the common feature set between
818 * all CPUs; so make sure that we indicate which features are 831 * all CPUs; so make sure that we indicate which features are
@@ -825,10 +838,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
825 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 838 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
826 } 839 }
827 840
828 /* Clear all flags overriden by options */
829 for (i = 0; i < NCAPINTS; i++)
830 c->x86_capability[i] &= ~cleared_cpu_caps[i];
831
832#ifdef CONFIG_X86_MCE 841#ifdef CONFIG_X86_MCE
833 /* Init Machine Check Exception if available. */ 842 /* Init Machine Check Exception if available. */
834 mcheck_init(c); 843 mcheck_init(c);
@@ -839,6 +848,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
839#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 848#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
840 numa_add_cpu(smp_processor_id()); 849 numa_add_cpu(smp_processor_id());
841#endif 850#endif
851
852 /* Cap the iomem address space to what is addressable on all CPUs */
853 iomem_resource.end &= (1ULL << c->x86_phys_bits) - 1;
842} 854}
843 855
844#ifdef CONFIG_X86_64 856#ifdef CONFIG_X86_64
@@ -861,6 +873,7 @@ void __init identify_boot_cpu(void)
861#else 873#else
862 vgetcpu_set_mode(); 874 vgetcpu_set_mode();
863#endif 875#endif
876 init_hw_perf_counters();
864} 877}
865 878
866void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 879void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 46e29ab96c6a..6b2a52dd0403 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -32,9 +32,7 @@
32 32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); 33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); 34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
35static DEFINE_PER_CPU(unsigned, cpu_modelflag);
36static DEFINE_PER_CPU(int, cpu_priv_count); 35static DEFINE_PER_CPU(int, cpu_priv_count);
37static DEFINE_PER_CPU(unsigned, cpu_model);
38 36
39static DEFINE_MUTEX(cpu_debug_lock); 37static DEFINE_MUTEX(cpu_debug_lock);
40 38
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {
80 { "value", CPU_REG_ALL, 1 }, 78 { "value", CPU_REG_ALL, 1 },
81}; 79};
82 80
83/* Intel Registers Range */ 81/* CPU Registers Range */
84static struct cpu_debug_range cpu_intel_range[] = { 82static struct cpu_debug_range cpu_reg_range[] = {
85 { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, 83 { 0x00000000, 0x00000001, CPU_MC, },
86 { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, 84 { 0x00000006, 0x00000007, CPU_MONITOR, },
87 { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, 85 { 0x00000010, 0x00000010, CPU_TIME, },
88 { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, 86 { 0x00000011, 0x00000013, CPU_PMC, },
89 { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, 87 { 0x00000017, 0x00000017, CPU_PLATFORM, },
90 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, 88 { 0x0000001B, 0x0000001B, CPU_APIC, },
91 89 { 0x0000002A, 0x0000002B, CPU_POWERON, },
92 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, 90 { 0x0000002C, 0x0000002C, CPU_FREQ, },
93 { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, 91 { 0x0000003A, 0x0000003A, CPU_CONTROL, },
94 { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, 92 { 0x00000040, 0x00000047, CPU_LBRANCH, },
95 { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, 93 { 0x00000060, 0x00000067, CPU_LBRANCH, },
96 94 { 0x00000079, 0x00000079, CPU_BIOS, },
97 { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, 95 { 0x00000088, 0x0000008A, CPU_CACHE, },
98 { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, 96 { 0x0000008B, 0x0000008B, CPU_BIOS, },
99 { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, 97 { 0x0000009B, 0x0000009B, CPU_MONITOR, },
100 { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, 98 { 0x000000C1, 0x000000C4, CPU_PMC, },
101 99 { 0x000000CD, 0x000000CD, CPU_FREQ, },
102 { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, 100 { 0x000000E7, 0x000000E8, CPU_PERF, },
103 { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, 101 { 0x000000FE, 0x000000FE, CPU_MTRR, },
104 { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, 102
105 { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, 103 { 0x00000116, 0x0000011E, CPU_CACHE, },
106 104 { 0x00000174, 0x00000176, CPU_SYSENTER, },
107 { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, 105 { 0x00000179, 0x0000017B, CPU_MC, },
108 { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, 106 { 0x00000186, 0x00000189, CPU_PMC, },
109 { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, 107 { 0x00000198, 0x00000199, CPU_PERF, },
110 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, 108 { 0x0000019A, 0x0000019A, CPU_TIME, },
111 109 { 0x0000019B, 0x0000019D, CPU_THERM, },
112 { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, 110 { 0x000001A0, 0x000001A0, CPU_MISC, },
113 { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, 111 { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
114 { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, 112 { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
115 { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, 113 { 0x000001D9, 0x000001D9, CPU_DEBUG, },
116 { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, 114 { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
117 115
118 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, 116 { 0x00000200, 0x0000020F, CPU_MTRR, },
119 { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, 117 { 0x00000250, 0x00000250, CPU_MTRR, },
120 { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, 118 { 0x00000258, 0x00000259, CPU_MTRR, },
121 { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, 119 { 0x00000268, 0x0000026F, CPU_MTRR, },
122 { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, 120 { 0x00000277, 0x00000277, CPU_PAT, },
123 { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, 121 { 0x000002FF, 0x000002FF, CPU_MTRR, },
124 { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, 122
125 { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, 123 { 0x00000300, 0x00000311, CPU_PMC, },
126 124 { 0x00000345, 0x00000345, CPU_PMC, },
127 { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, 125 { 0x00000360, 0x00000371, CPU_PMC, },
128 { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, 126 { 0x0000038D, 0x00000390, CPU_PMC, },
129 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, 127 { 0x000003A0, 0x000003BE, CPU_PMC, },
130 { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, 128 { 0x000003C0, 0x000003CD, CPU_PMC, },
131 { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, 129 { 0x000003E0, 0x000003E1, CPU_PMC, },
132 { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, 130 { 0x000003F0, 0x000003F2, CPU_PMC, },
133 { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, 131
134 { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, 132 { 0x00000400, 0x00000417, CPU_MC, },
135 133 { 0x00000480, 0x0000048B, CPU_VMX, },
136 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, 134
137 { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, 135 { 0x00000600, 0x00000600, CPU_DEBUG, },
138 { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, 136 { 0x00000680, 0x0000068F, CPU_LBRANCH, },
139 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, 137 { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
140 { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, 138
141 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, 139 { 0x000107CC, 0x000107D3, CPU_PMC, },
142 140
143 { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, 141 { 0xC0000080, 0xC0000080, CPU_FEATURES, },
144 { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, 142 { 0xC0000081, 0xC0000084, CPU_CALL, },
145 { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, 143 { 0xC0000100, 0xC0000102, CPU_BASE, },
146 { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, 144 { 0xC0000103, 0xC0000103, CPU_TIME, },
147 { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, 145
148 { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, 146 { 0xC0010000, 0xC0010007, CPU_PMC, },
149 { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, 147 { 0xC0010010, 0xC0010010, CPU_CONF, },
150 { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, 148 { 0xC0010015, 0xC0010015, CPU_CONF, },
151 { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, 149 { 0xC0010016, 0xC001001A, CPU_MTRR, },
152 { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, 150 { 0xC001001D, 0xC001001D, CPU_MTRR, },
153 { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, 151 { 0xC001001F, 0xC001001F, CPU_CONF, },
154 { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, 152 { 0xC0010030, 0xC0010035, CPU_BIOS, },
155 153 { 0xC0010044, 0xC0010048, CPU_MC, },
156 { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, 154 { 0xC0010050, 0xC0010056, CPU_SMM, },
157 { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, 155 { 0xC0010058, 0xC0010058, CPU_CONF, },
158 { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, 156 { 0xC0010060, 0xC0010060, CPU_CACHE, },
159 { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, 157 { 0xC0010061, 0xC0010068, CPU_SMM, },
160 { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, 158 { 0xC0010069, 0xC001006B, CPU_SMM, },
161 { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, 159 { 0xC0010070, 0xC0010071, CPU_SMM, },
162 { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, 160 { 0xC0010111, 0xC0010113, CPU_SMM, },
163 { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, 161 { 0xC0010114, 0xC0010118, CPU_SVM, },
164 { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, 162 { 0xC0010140, 0xC0010141, CPU_OSVM, },
165 { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, 163 { 0xC0011022, 0xC0011023, CPU_CONF, },
166 { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
167
168 { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
169 { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
170 { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
171
172 { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
173
174 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
175 { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
176 { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
177 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
178}; 164};
179 165
180/* AMD Registers Range */
181static struct cpu_debug_range cpu_amd_range[] = {
182 { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
183 { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
184 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
185 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
186 { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
187 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
188
189 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
190 { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
191 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
192 { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
193
194 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
195 { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
196 { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
197 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
198 { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
199 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
200
201 { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
202
203 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
204 { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
205 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
206 { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
207
208 { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
209 { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
210 { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
211 { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
212 { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
213 { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
214 { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
215 { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
216 { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
217 { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
218 { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
219 { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
220 { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
221 { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
222 { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
223 { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
224 { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
225 { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
226};
227
228
229/* Intel */
230static int get_intel_modelflag(unsigned model)
231{
232 int flag;
233
234 switch (model) {
235 case 0x0501:
236 case 0x0502:
237 case 0x0504:
238 flag = CPU_INTEL_PENTIUM;
239 break;
240 case 0x0601:
241 case 0x0603:
242 case 0x0605:
243 case 0x0607:
244 case 0x0608:
245 case 0x060A:
246 case 0x060B:
247 flag = CPU_INTEL_P6;
248 break;
249 case 0x0609:
250 case 0x060D:
251 flag = CPU_INTEL_PENTIUM_M;
252 break;
253 case 0x060E:
254 flag = CPU_INTEL_CORE;
255 break;
256 case 0x060F:
257 case 0x0617:
258 flag = CPU_INTEL_CORE2;
259 break;
260 case 0x061C:
261 flag = CPU_INTEL_ATOM;
262 break;
263 case 0x0F00:
264 case 0x0F01:
265 case 0x0F02:
266 case 0x0F03:
267 case 0x0F04:
268 flag = CPU_INTEL_XEON_P4;
269 break;
270 case 0x0F06:
271 flag = CPU_INTEL_XEON_MP;
272 break;
273 default:
274 flag = CPU_NONE;
275 break;
276 }
277
278 return flag;
279}
280
281/* AMD */
282static int get_amd_modelflag(unsigned model)
283{
284 int flag;
285
286 switch (model >> 8) {
287 case 0x6:
288 flag = CPU_AMD_K6;
289 break;
290 case 0x7:
291 flag = CPU_AMD_K7;
292 break;
293 case 0x8:
294 flag = CPU_AMD_K8;
295 break;
296 case 0xf:
297 flag = CPU_AMD_0F;
298 break;
299 case 0x10:
300 flag = CPU_AMD_10;
301 break;
302 case 0x11:
303 flag = CPU_AMD_11;
304 break;
305 default:
306 flag = CPU_NONE;
307 break;
308 }
309
310 return flag;
311}
312
313static int get_cpu_modelflag(unsigned cpu)
314{
315 int flag;
316
317 flag = per_cpu(cpu_model, cpu);
318
319 switch (flag >> 16) {
320 case X86_VENDOR_INTEL:
321 flag = get_intel_modelflag(flag);
322 break;
323 case X86_VENDOR_AMD:
324 flag = get_amd_modelflag(flag & 0xffff);
325 break;
326 default:
327 flag = CPU_NONE;
328 break;
329 }
330
331 return flag;
332}
333
334static int get_cpu_range_count(unsigned cpu)
335{
336 int index;
337
338 switch (per_cpu(cpu_model, cpu) >> 16) {
339 case X86_VENDOR_INTEL:
340 index = ARRAY_SIZE(cpu_intel_range);
341 break;
342 case X86_VENDOR_AMD:
343 index = ARRAY_SIZE(cpu_amd_range);
344 break;
345 default:
346 index = 0;
347 break;
348 }
349
350 return index;
351}
352
353static int is_typeflag_valid(unsigned cpu, unsigned flag) 166static int is_typeflag_valid(unsigned cpu, unsigned flag)
354{ 167{
355 unsigned vendor, modelflag; 168 int i;
356 int i, index;
357 169
358 /* Standard Registers should be always valid */ 170 /* Standard Registers should be always valid */
359 if (flag >= CPU_TSS) 171 if (flag >= CPU_TSS)
360 return 1; 172 return 1;
361 173
362 modelflag = per_cpu(cpu_modelflag, cpu); 174 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
363 vendor = per_cpu(cpu_model, cpu) >> 16; 175 if (cpu_reg_range[i].flag == flag)
364 index = get_cpu_range_count(cpu); 176 return 1;
365
366 for (i = 0; i < index; i++) {
367 switch (vendor) {
368 case X86_VENDOR_INTEL:
369 if ((cpu_intel_range[i].model & modelflag) &&
370 (cpu_intel_range[i].flag & flag))
371 return 1;
372 break;
373 case X86_VENDOR_AMD:
374 if ((cpu_amd_range[i].model & modelflag) &&
375 (cpu_amd_range[i].flag & flag))
376 return 1;
377 break;
378 }
379 } 177 }
380 178
381 /* Invalid */ 179 /* Invalid */
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
385static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, 183static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
386 int index, unsigned flag) 184 int index, unsigned flag)
387{ 185{
388 unsigned modelflag; 186 if (cpu_reg_range[index].flag == flag) {
389 187 *min = cpu_reg_range[index].min;
390 modelflag = per_cpu(cpu_modelflag, cpu); 188 *max = cpu_reg_range[index].max;
391 *max = 0; 189 } else
392 switch (per_cpu(cpu_model, cpu) >> 16) { 190 *max = 0;
393 case X86_VENDOR_INTEL:
394 if ((cpu_intel_range[index].model & modelflag) &&
395 (cpu_intel_range[index].flag & flag)) {
396 *min = cpu_intel_range[index].min;
397 *max = cpu_intel_range[index].max;
398 }
399 break;
400 case X86_VENDOR_AMD:
401 if ((cpu_amd_range[index].model & modelflag) &&
402 (cpu_amd_range[index].flag & flag)) {
403 *min = cpu_amd_range[index].min;
404 *max = cpu_amd_range[index].max;
405 }
406 break;
407 }
408 191
409 return *max; 192 return *max;
410} 193}
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
434 unsigned msr, msr_min, msr_max; 217 unsigned msr, msr_min, msr_max;
435 struct cpu_private *priv; 218 struct cpu_private *priv;
436 u32 low, high; 219 u32 low, high;
437 int i, range; 220 int i;
438 221
439 if (seq) { 222 if (seq) {
440 priv = seq->private; 223 priv = seq->private;
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
446 } 229 }
447 } 230 }
448 231
449 range = get_cpu_range_count(cpu); 232 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
450
451 for (i = 0; i < range; i++) {
452 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) 233 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
453 continue; 234 continue;
454 235
@@ -588,8 +369,20 @@ static void print_apic(void *arg)
588 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); 369 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
589 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); 370 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
590 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); 371 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
591#endif /* CONFIG_X86_LOCAL_APIC */ 372 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
373 unsigned int i, v, maxeilvt;
374
375 v = apic_read(APIC_EFEAT);
376 maxeilvt = (v >> 16) & 0xff;
377 seq_printf(seq, " EFEAT\t\t: %08x\n", v);
378 seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
592 379
380 for (i = 0; i < maxeilvt; i++) {
381 v = apic_read(APIC_EILVTn(i));
382 seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
383 }
384 }
385#endif /* CONFIG_X86_LOCAL_APIC */
593 seq_printf(seq, "\n MSR\t:\n"); 386 seq_printf(seq, "\n MSR\t:\n");
594} 387}
595 388
@@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
788{ 581{
789 struct dentry *cpu_dentry = NULL; 582 struct dentry *cpu_dentry = NULL;
790 unsigned reg, reg_min, reg_max; 583 unsigned reg, reg_min, reg_max;
791 int i, range, err = 0; 584 int i, err = 0;
792 char reg_dir[12]; 585 char reg_dir[12];
793 u32 low, high; 586 u32 low, high;
794 587
795 range = get_cpu_range_count(cpu); 588 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
796
797 for (i = 0; i < range; i++) {
798 if (!get_cpu_range(cpu, &reg_min, &reg_max, i, 589 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
799 cpu_base[type].flag)) 590 cpu_base[type].flag))
800 continue; 591 continue;
@@ -850,10 +641,6 @@ static int cpu_init_cpu(void)
850 cpui = &cpu_data(cpu); 641 cpui = &cpu_data(cpu);
851 if (!cpu_has(cpui, X86_FEATURE_MSR)) 642 if (!cpu_has(cpui, X86_FEATURE_MSR))
852 continue; 643 continue;
853 per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
854 (cpui->x86 << 8) |
855 (cpui->x86_model));
856 per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
857 644
858 sprintf(cpu_dir, "cpu%d", cpu); 645 sprintf(cpu_dir, "cpu%d", cpu);
859 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); 646 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 52c839875478..f138c6c389b9 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -220,11 +220,14 @@ config X86_LONGHAUL
220 If in doubt, say N. 220 If in doubt, say N.
221 221
222config X86_E_POWERSAVER 222config X86_E_POWERSAVER
223 tristate "VIA C7 Enhanced PowerSaver" 223 tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
224 select CPU_FREQ_TABLE 224 select CPU_FREQ_TABLE
225 depends on X86_32 225 depends on X86_32 && EXPERIMENTAL
226 help 226 help
227 This adds the CPUFreq driver for VIA C7 processors. 227 This adds the CPUFreq driver for VIA C7 processors. However, this driver
228 does not have any safeguards to prevent operating the CPU out of spec
229 and is thus considered dangerous. Please use the regular ACPI cpufreq
230 driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
228 231
229 If in doubt, say N. 232 If in doubt, say N.
230 233
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 752e8c6b2c7e..ae9b503220ca 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)
90{ 90{
91 struct cpuinfo_x86 *cpu = &cpu_data(cpuid); 91 struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
92 92
93 if (cpu->x86_vendor != X86_VENDOR_INTEL || 93 return cpu_has(cpu, X86_FEATURE_EST);
94 !cpu_has(cpu, X86_FEATURE_EST))
95 return 0;
96
97 return 1;
98} 94}
99 95
100static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) 96static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index cf52215d9eb1..81cbe64ed6b4 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,3 +1,4 @@
1
1/* 2/*
2 * (c) 2003-2006 Advanced Micro Devices, Inc. 3 * (c) 2003-2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the 4 * Your use of this code is subject to the terms and conditions of the
@@ -117,20 +118,17 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
117 u32 i = 0; 118 u32 i = 0;
118 119
119 if (cpu_family == CPU_HW_PSTATE) { 120 if (cpu_family == CPU_HW_PSTATE) {
120 if (data->currpstate == HW_PSTATE_INVALID) { 121 rdmsr(MSR_PSTATE_STATUS, lo, hi);
121 /* read (initial) hw pstate if not yet set */ 122 i = lo & HW_PSTATE_MASK;
122 rdmsr(MSR_PSTATE_STATUS, lo, hi); 123 data->currpstate = i;
123 i = lo & HW_PSTATE_MASK; 124
124 125 /*
125 /* 126 * a workaround for family 11h erratum 311 might cause
126 * a workaround for family 11h erratum 311 might cause 127 * an "out-of-range Pstate if the core is in Pstate-0
127 * an "out-of-range Pstate if the core is in Pstate-0 128 */
128 */ 129 if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
129 if (i >= data->numps) 130 data->currpstate = HW_PSTATE_0;
130 data->currpstate = HW_PSTATE_0; 131
131 else
132 data->currpstate = i;
133 }
134 return 0; 132 return 0;
135 } 133 }
136 do { 134 do {
@@ -510,41 +508,34 @@ static int core_voltage_post_transition(struct powernow_k8_data *data,
510 return 0; 508 return 0;
511} 509}
512 510
513static int check_supported_cpu(unsigned int cpu) 511static void check_supported_cpu(void *_rc)
514{ 512{
515 cpumask_t oldmask;
516 u32 eax, ebx, ecx, edx; 513 u32 eax, ebx, ecx, edx;
517 unsigned int rc = 0; 514 int *rc = _rc;
518
519 oldmask = current->cpus_allowed;
520 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
521 515
522 if (smp_processor_id() != cpu) { 516 *rc = -ENODEV;
523 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
524 goto out;
525 }
526 517
527 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) 518 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
528 goto out; 519 return;
529 520
530 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); 521 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
531 if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && 522 if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
532 ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) 523 ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
533 goto out; 524 return;
534 525
535 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { 526 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
536 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || 527 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
537 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { 528 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
538 printk(KERN_INFO PFX 529 printk(KERN_INFO PFX
539 "Processor cpuid %x not supported\n", eax); 530 "Processor cpuid %x not supported\n", eax);
540 goto out; 531 return;
541 } 532 }
542 533
543 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); 534 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
544 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { 535 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
545 printk(KERN_INFO PFX 536 printk(KERN_INFO PFX
546 "No frequency change capabilities detected\n"); 537 "No frequency change capabilities detected\n");
547 goto out; 538 return;
548 } 539 }
549 540
550 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); 541 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
@@ -552,21 +543,17 @@ static int check_supported_cpu(unsigned int cpu)
552 != P_STATE_TRANSITION_CAPABLE) { 543 != P_STATE_TRANSITION_CAPABLE) {
553 printk(KERN_INFO PFX 544 printk(KERN_INFO PFX
554 "Power state transitions not supported\n"); 545 "Power state transitions not supported\n");
555 goto out; 546 return;
556 } 547 }
557 } else { /* must be a HW Pstate capable processor */ 548 } else { /* must be a HW Pstate capable processor */
558 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); 549 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
559 if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) 550 if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
560 cpu_family = CPU_HW_PSTATE; 551 cpu_family = CPU_HW_PSTATE;
561 else 552 else
562 goto out; 553 return;
563 } 554 }
564 555
565 rc = 1; 556 *rc = 0;
566
567out:
568 set_cpus_allowed_ptr(current, &oldmask);
569 return rc;
570} 557}
571 558
572static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, 559static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
@@ -823,13 +810,14 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
823 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 810 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
824 return; 811 return;
825 812
826 control = data->acpi_data.states[index].control; data->irt = (control 813 control = data->acpi_data.states[index].control;
827 >> IRT_SHIFT) & IRT_MASK; data->rvo = (control >> 814 data->irt = (control >> IRT_SHIFT) & IRT_MASK;
828 RVO_SHIFT) & RVO_MASK; data->exttype = (control 815 data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
829 >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; 816 data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
830 data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; data->vidmvs = 1 817 data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
831 << ((control >> MVS_SHIFT) & MVS_MASK); data->vstable = 818 data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
832 (control >> VST_SHIFT) & VST_MASK; } 819 data->vstable = (control >> VST_SHIFT) & VST_MASK;
820}
833 821
834static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) 822static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
835{ 823{
@@ -1046,6 +1034,19 @@ static int get_transition_latency(struct powernow_k8_data *data)
1046 if (cur_latency > max_latency) 1034 if (cur_latency > max_latency)
1047 max_latency = cur_latency; 1035 max_latency = cur_latency;
1048 } 1036 }
1037 if (max_latency == 0) {
1038 /*
1039 * Fam 11h always returns 0 as transition latency.
1040 * This is intended and means "very fast". While cpufreq core
1041 * and governors currently can handle that gracefully, better
1042 * set it to 1 to avoid problems in the future.
1043 * For all others it's a BIOS bug.
1044 */
1045 if (!boot_cpu_data.x86 == 0x11)
1046 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1047 "latency\n");
1048 max_latency = 1;
1049 }
1049 /* value in usecs, needs to be in nanoseconds */ 1050 /* value in usecs, needs to be in nanoseconds */
1050 return 1000 * max_latency; 1051 return 1000 * max_latency;
1051} 1052}
@@ -1093,7 +1094,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,
1093 freqs.old = find_khz_freq_from_fid(data->currfid); 1094 freqs.old = find_khz_freq_from_fid(data->currfid);
1094 freqs.new = find_khz_freq_from_fid(fid); 1095 freqs.new = find_khz_freq_from_fid(fid);
1095 1096
1096 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1097 for_each_cpu(i, data->available_cores) {
1097 freqs.cpu = i; 1098 freqs.cpu = i;
1098 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1099 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1099 } 1100 }
@@ -1101,7 +1102,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,
1101 res = transition_fid_vid(data, fid, vid); 1102 res = transition_fid_vid(data, fid, vid);
1102 freqs.new = find_khz_freq_from_fid(data->currfid); 1103 freqs.new = find_khz_freq_from_fid(data->currfid);
1103 1104
1104 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1105 for_each_cpu(i, data->available_cores) {
1105 freqs.cpu = i; 1106 freqs.cpu = i;
1106 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1107 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1107 } 1108 }
@@ -1126,7 +1127,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
1126 data->currpstate); 1127 data->currpstate);
1127 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1128 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1128 1129
1129 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1130 for_each_cpu(i, data->available_cores) {
1130 freqs.cpu = i; 1131 freqs.cpu = i;
1131 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1132 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1132 } 1133 }
@@ -1134,7 +1135,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
1134 res = transition_pstate(data, pstate); 1135 res = transition_pstate(data, pstate);
1135 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1136 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1136 1137
1137 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1138 for_each_cpu(i, data->available_cores) {
1138 freqs.cpu = i; 1139 freqs.cpu = i;
1139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1140 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1140 } 1141 }
@@ -1235,21 +1236,47 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
1235 return cpufreq_frequency_table_verify(pol, data->powernow_table); 1236 return cpufreq_frequency_table_verify(pol, data->powernow_table);
1236} 1237}
1237 1238
1238static const char ACPI_PSS_BIOS_BUG_MSG[] = 1239struct init_on_cpu {
1239 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" 1240 struct powernow_k8_data *data;
1240 KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n"; 1241 int rc;
1242};
1243
1244static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
1245{
1246 struct init_on_cpu *init_on_cpu = _init_on_cpu;
1247
1248 if (pending_bit_stuck()) {
1249 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1250 init_on_cpu->rc = -ENODEV;
1251 return;
1252 }
1253
1254 if (query_current_values_with_pending_wait(init_on_cpu->data)) {
1255 init_on_cpu->rc = -ENODEV;
1256 return;
1257 }
1258
1259 if (cpu_family == CPU_OPTERON)
1260 fidvid_msr_init();
1261
1262 init_on_cpu->rc = 0;
1263}
1241 1264
1242/* per CPU init entry point to the driver */ 1265/* per CPU init entry point to the driver */
1243static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) 1266static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1244{ 1267{
1268 static const char ACPI_PSS_BIOS_BUG_MSG[] =
1269 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
1270 KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n";
1245 struct powernow_k8_data *data; 1271 struct powernow_k8_data *data;
1246 cpumask_t oldmask; 1272 struct init_on_cpu init_on_cpu;
1247 int rc; 1273 int rc;
1248 1274
1249 if (!cpu_online(pol->cpu)) 1275 if (!cpu_online(pol->cpu))
1250 return -ENODEV; 1276 return -ENODEV;
1251 1277
1252 if (!check_supported_cpu(pol->cpu)) 1278 smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
1279 if (rc)
1253 return -ENODEV; 1280 return -ENODEV;
1254 1281
1255 data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); 1282 data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
@@ -1289,27 +1316,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1289 pol->cpuinfo.transition_latency = get_transition_latency(data); 1316 pol->cpuinfo.transition_latency = get_transition_latency(data);
1290 1317
1291 /* only run on specific CPU from here on */ 1318 /* only run on specific CPU from here on */
1292 oldmask = current->cpus_allowed; 1319 init_on_cpu.data = data;
1293 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1320 smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
1294 1321 &init_on_cpu, 1);
1295 if (smp_processor_id() != pol->cpu) { 1322 rc = init_on_cpu.rc;
1296 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1323 if (rc != 0)
1297 goto err_out_unmask; 1324 goto err_out_exit_acpi;
1298 }
1299
1300 if (pending_bit_stuck()) {
1301 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1302 goto err_out_unmask;
1303 }
1304
1305 if (query_current_values_with_pending_wait(data))
1306 goto err_out_unmask;
1307
1308 if (cpu_family == CPU_OPTERON)
1309 fidvid_msr_init();
1310
1311 /* run on any CPU again */
1312 set_cpus_allowed_ptr(current, &oldmask);
1313 1325
1314 if (cpu_family == CPU_HW_PSTATE) 1326 if (cpu_family == CPU_HW_PSTATE)
1315 cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); 1327 cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
@@ -1346,8 +1358,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1346 1358
1347 return 0; 1359 return 0;
1348 1360
1349err_out_unmask: 1361err_out_exit_acpi:
1350 set_cpus_allowed_ptr(current, &oldmask);
1351 powernow_k8_cpu_exit_acpi(data); 1362 powernow_k8_cpu_exit_acpi(data);
1352 1363
1353err_out: 1364err_out:
@@ -1372,28 +1383,25 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1372 return 0; 1383 return 0;
1373} 1384}
1374 1385
1386static void query_values_on_cpu(void *_err)
1387{
1388 int *err = _err;
1389 struct powernow_k8_data *data = __get_cpu_var(powernow_data);
1390
1391 *err = query_current_values_with_pending_wait(data);
1392}
1393
1375static unsigned int powernowk8_get(unsigned int cpu) 1394static unsigned int powernowk8_get(unsigned int cpu)
1376{ 1395{
1377 struct powernow_k8_data *data; 1396 struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
1378 cpumask_t oldmask = current->cpus_allowed;
1379 unsigned int khz = 0; 1397 unsigned int khz = 0;
1380 unsigned int first; 1398 int err;
1381
1382 first = cpumask_first(cpu_core_mask(cpu));
1383 data = per_cpu(powernow_data, first);
1384 1399
1385 if (!data) 1400 if (!data)
1386 return -EINVAL; 1401 return -EINVAL;
1387 1402
1388 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 1403 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1389 if (smp_processor_id() != cpu) { 1404 if (err)
1390 printk(KERN_ERR PFX
1391 "limiting to CPU %d failed in powernowk8_get\n", cpu);
1392 set_cpus_allowed_ptr(current, &oldmask);
1393 return 0;
1394 }
1395
1396 if (query_current_values_with_pending_wait(data))
1397 goto out; 1405 goto out;
1398 1406
1399 if (cpu_family == CPU_HW_PSTATE) 1407 if (cpu_family == CPU_HW_PSTATE)
@@ -1404,7 +1412,6 @@ static unsigned int powernowk8_get(unsigned int cpu)
1404 1412
1405 1413
1406out: 1414out:
1407 set_cpus_allowed_ptr(current, &oldmask);
1408 return khz; 1415 return khz;
1409} 1416}
1410 1417
@@ -1430,7 +1437,9 @@ static int __cpuinit powernowk8_init(void)
1430 unsigned int i, supported_cpus = 0; 1437 unsigned int i, supported_cpus = 0;
1431 1438
1432 for_each_online_cpu(i) { 1439 for_each_online_cpu(i) {
1433 if (check_supported_cpu(i)) 1440 int rc;
1441 smp_call_function_single(i, check_supported_cpu, &rc, 1);
1442 if (rc == 0)
1434 supported_cpus++; 1443 supported_cpus++;
1435 } 1444 }
1436 1445
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 6c6698feade1..c9c1190b5e1f 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -223,14 +223,3 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned
223 223
224static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); 224static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
225static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); 225static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
226
227#ifdef CONFIG_SMP
228static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
229{
230}
231#else
232static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
233{
234 cpu_set(0, cpu_sharedcore_mask[0]);
235}
236#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 55c831ed71ce..8d672ef162ce 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -323,14 +323,8 @@ static unsigned int get_cur_freq(unsigned int cpu)
323{ 323{
324 unsigned l, h; 324 unsigned l, h;
325 unsigned clock_freq; 325 unsigned clock_freq;
326 cpumask_t saved_mask;
327 326
328 saved_mask = current->cpus_allowed; 327 rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
329 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
330 if (smp_processor_id() != cpu)
331 return 0;
332
333 rdmsr(MSR_IA32_PERF_STATUS, l, h);
334 clock_freq = extract_clock(l, cpu, 0); 328 clock_freq = extract_clock(l, cpu, 0);
335 329
336 if (unlikely(clock_freq == 0)) { 330 if (unlikely(clock_freq == 0)) {
@@ -340,11 +334,9 @@ static unsigned int get_cur_freq(unsigned int cpu)
340 * P-state transition (like TM2). Get the last freq set 334 * P-state transition (like TM2). Get the last freq set
341 * in PERF_CTL. 335 * in PERF_CTL.
342 */ 336 */
343 rdmsr(MSR_IA32_PERF_CTL, l, h); 337 rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
344 clock_freq = extract_clock(l, cpu, 1); 338 clock_freq = extract_clock(l, cpu, 1);
345 } 339 }
346
347 set_cpus_allowed_ptr(current, &saved_mask);
348 return clock_freq; 340 return clock_freq;
349} 341}
350 342
@@ -467,15 +459,10 @@ static int centrino_target (struct cpufreq_policy *policy,
467 struct cpufreq_freqs freqs; 459 struct cpufreq_freqs freqs;
468 int retval = 0; 460 int retval = 0;
469 unsigned int j, k, first_cpu, tmp; 461 unsigned int j, k, first_cpu, tmp;
470 cpumask_var_t saved_mask, covered_cpus; 462 cpumask_var_t covered_cpus;
471 463
472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) 464 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
473 return -ENOMEM;
474 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
475 free_cpumask_var(saved_mask);
476 return -ENOMEM; 465 return -ENOMEM;
477 }
478 cpumask_copy(saved_mask, &current->cpus_allowed);
479 466
480 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { 467 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
481 retval = -ENODEV; 468 retval = -ENODEV;
@@ -493,7 +480,7 @@ static int centrino_target (struct cpufreq_policy *policy,
493 480
494 first_cpu = 1; 481 first_cpu = 1;
495 for_each_cpu(j, policy->cpus) { 482 for_each_cpu(j, policy->cpus) {
496 const struct cpumask *mask; 483 int good_cpu;
497 484
498 /* cpufreq holds the hotplug lock, so we are safe here */ 485 /* cpufreq holds the hotplug lock, so we are safe here */
499 if (!cpu_online(j)) 486 if (!cpu_online(j))
@@ -504,32 +491,30 @@ static int centrino_target (struct cpufreq_policy *policy,
504 * Make sure we are running on CPU that wants to change freq 491 * Make sure we are running on CPU that wants to change freq
505 */ 492 */
506 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 493 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
507 mask = policy->cpus; 494 good_cpu = cpumask_any_and(policy->cpus,
495 cpu_online_mask);
508 else 496 else
509 mask = cpumask_of(j); 497 good_cpu = j;
510 498
511 set_cpus_allowed_ptr(current, mask); 499 if (good_cpu >= nr_cpu_ids) {
512 preempt_disable();
513 if (unlikely(!cpu_isset(smp_processor_id(), *mask))) {
514 dprintk("couldn't limit to CPUs in this domain\n"); 500 dprintk("couldn't limit to CPUs in this domain\n");
515 retval = -EAGAIN; 501 retval = -EAGAIN;
516 if (first_cpu) { 502 if (first_cpu) {
517 /* We haven't started the transition yet. */ 503 /* We haven't started the transition yet. */
518 goto migrate_end; 504 goto out;
519 } 505 }
520 preempt_enable();
521 break; 506 break;
522 } 507 }
523 508
524 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; 509 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
525 510
526 if (first_cpu) { 511 if (first_cpu) {
527 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); 512 rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
528 if (msr == (oldmsr & 0xffff)) { 513 if (msr == (oldmsr & 0xffff)) {
529 dprintk("no change needed - msr was and needs " 514 dprintk("no change needed - msr was and needs "
530 "to be %x\n", oldmsr); 515 "to be %x\n", oldmsr);
531 retval = 0; 516 retval = 0;
532 goto migrate_end; 517 goto out;
533 } 518 }
534 519
535 freqs.old = extract_clock(oldmsr, cpu, 0); 520 freqs.old = extract_clock(oldmsr, cpu, 0);
@@ -553,14 +538,11 @@ static int centrino_target (struct cpufreq_policy *policy,
553 oldmsr |= msr; 538 oldmsr |= msr;
554 } 539 }
555 540
556 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 541 wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
557 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { 542 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
558 preempt_enable();
559 break; 543 break;
560 }
561 544
562 cpu_set(j, *covered_cpus); 545 cpumask_set_cpu(j, covered_cpus);
563 preempt_enable();
564 } 546 }
565 547
566 for_each_cpu(k, policy->cpus) { 548 for_each_cpu(k, policy->cpus) {
@@ -578,10 +560,8 @@ static int centrino_target (struct cpufreq_policy *policy,
578 * Best effort undo.. 560 * Best effort undo..
579 */ 561 */
580 562
581 for_each_cpu_mask_nr(j, *covered_cpus) { 563 for_each_cpu(j, covered_cpus)
582 set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); 564 wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
583 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
584 }
585 565
586 tmp = freqs.new; 566 tmp = freqs.new;
587 freqs.new = freqs.old; 567 freqs.new = freqs.old;
@@ -593,15 +573,9 @@ static int centrino_target (struct cpufreq_policy *policy,
593 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 573 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
594 } 574 }
595 } 575 }
596 set_cpus_allowed_ptr(current, saved_mask);
597 retval = 0; 576 retval = 0;
598 goto out;
599 577
600migrate_end:
601 preempt_enable();
602 set_cpus_allowed_ptr(current, saved_mask);
603out: 578out:
604 free_cpumask_var(saved_mask);
605 free_cpumask_var(covered_cpus); 579 free_cpumask_var(covered_cpus);
606 return retval; 580 return retval;
607} 581}
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 016c1a4fa3fc..6911e91fb4f6 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -89,7 +89,8 @@ static int speedstep_find_register(void)
89 * speedstep_set_state - set the SpeedStep state 89 * speedstep_set_state - set the SpeedStep state
90 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) 90 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
91 * 91 *
92 * Tries to change the SpeedStep state. 92 * Tries to change the SpeedStep state. Can be called from
93 * smp_call_function_single.
93 */ 94 */
94static void speedstep_set_state(unsigned int state) 95static void speedstep_set_state(unsigned int state)
95{ 96{
@@ -143,6 +144,11 @@ static void speedstep_set_state(unsigned int state)
143 return; 144 return;
144} 145}
145 146
147/* Wrapper for smp_call_function_single. */
148static void _speedstep_set_state(void *_state)
149{
150 speedstep_set_state(*(unsigned int *)_state);
151}
146 152
147/** 153/**
148 * speedstep_activate - activate SpeedStep control in the chipset 154 * speedstep_activate - activate SpeedStep control in the chipset
@@ -226,22 +232,28 @@ static unsigned int speedstep_detect_chipset(void)
226 return 0; 232 return 0;
227} 233}
228 234
229static unsigned int _speedstep_get(const struct cpumask *cpus) 235struct get_freq_data {
230{
231 unsigned int speed; 236 unsigned int speed;
232 cpumask_t cpus_allowed; 237 unsigned int processor;
233 238};
234 cpus_allowed = current->cpus_allowed; 239
235 set_cpus_allowed_ptr(current, cpus); 240static void get_freq_data(void *_data)
236 speed = speedstep_get_frequency(speedstep_processor); 241{
237 set_cpus_allowed_ptr(current, &cpus_allowed); 242 struct get_freq_data *data = _data;
238 dprintk("detected %u kHz as current frequency\n", speed); 243
239 return speed; 244 data->speed = speedstep_get_frequency(data->processor);
240} 245}
241 246
242static unsigned int speedstep_get(unsigned int cpu) 247static unsigned int speedstep_get(unsigned int cpu)
243{ 248{
244 return _speedstep_get(cpumask_of(cpu)); 249 struct get_freq_data data = { .processor = cpu };
250
251 /* You're supposed to ensure CPU is online. */
252 if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0)
253 BUG();
254
255 dprintk("detected %u kHz as current frequency\n", data.speed);
256 return data.speed;
245} 257}
246 258
247/** 259/**
@@ -257,16 +269,16 @@ static int speedstep_target(struct cpufreq_policy *policy,
257 unsigned int target_freq, 269 unsigned int target_freq,
258 unsigned int relation) 270 unsigned int relation)
259{ 271{
260 unsigned int newstate = 0; 272 unsigned int newstate = 0, policy_cpu;
261 struct cpufreq_freqs freqs; 273 struct cpufreq_freqs freqs;
262 cpumask_t cpus_allowed;
263 int i; 274 int i;
264 275
265 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], 276 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
266 target_freq, relation, &newstate)) 277 target_freq, relation, &newstate))
267 return -EINVAL; 278 return -EINVAL;
268 279
269 freqs.old = _speedstep_get(policy->cpus); 280 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
281 freqs.old = speedstep_get(policy_cpu);
270 freqs.new = speedstep_freqs[newstate].frequency; 282 freqs.new = speedstep_freqs[newstate].frequency;
271 freqs.cpu = policy->cpu; 283 freqs.cpu = policy->cpu;
272 284
@@ -276,20 +288,13 @@ static int speedstep_target(struct cpufreq_policy *policy,
276 if (freqs.old == freqs.new) 288 if (freqs.old == freqs.new)
277 return 0; 289 return 0;
278 290
279 cpus_allowed = current->cpus_allowed;
280
281 for_each_cpu(i, policy->cpus) { 291 for_each_cpu(i, policy->cpus) {
282 freqs.cpu = i; 292 freqs.cpu = i;
283 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 293 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
284 } 294 }
285 295
286 /* switch to physical CPU where state is to be changed */ 296 smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
287 set_cpus_allowed_ptr(current, policy->cpus); 297 true);
288
289 speedstep_set_state(newstate);
290
291 /* allow to be run on all CPUs */
292 set_cpus_allowed_ptr(current, &cpus_allowed);
293 298
294 for_each_cpu(i, policy->cpus) { 299 for_each_cpu(i, policy->cpus) {
295 freqs.cpu = i; 300 freqs.cpu = i;
@@ -312,33 +317,43 @@ static int speedstep_verify(struct cpufreq_policy *policy)
312 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); 317 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
313} 318}
314 319
320struct get_freqs {
321 struct cpufreq_policy *policy;
322 int ret;
323};
324
325static void get_freqs_on_cpu(void *_get_freqs)
326{
327 struct get_freqs *get_freqs = _get_freqs;
328
329 get_freqs->ret =
330 speedstep_get_freqs(speedstep_processor,
331 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
332 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
333 &get_freqs->policy->cpuinfo.transition_latency,
334 &speedstep_set_state);
335}
315 336
316static int speedstep_cpu_init(struct cpufreq_policy *policy) 337static int speedstep_cpu_init(struct cpufreq_policy *policy)
317{ 338{
318 int result = 0; 339 int result;
319 unsigned int speed; 340 unsigned int policy_cpu, speed;
320 cpumask_t cpus_allowed; 341 struct get_freqs gf;
321 342
322 /* only run on CPU to be set, or on its sibling */ 343 /* only run on CPU to be set, or on its sibling */
323#ifdef CONFIG_SMP 344#ifdef CONFIG_SMP
324 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); 345 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
325#endif 346#endif
326 347 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
327 cpus_allowed = current->cpus_allowed;
328 set_cpus_allowed_ptr(current, policy->cpus);
329 348
330 /* detect low and high frequency and transition latency */ 349 /* detect low and high frequency and transition latency */
331 result = speedstep_get_freqs(speedstep_processor, 350 gf.policy = policy;
332 &speedstep_freqs[SPEEDSTEP_LOW].frequency, 351 smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
333 &speedstep_freqs[SPEEDSTEP_HIGH].frequency, 352 if (gf.ret)
334 &policy->cpuinfo.transition_latency, 353 return gf.ret;
335 &speedstep_set_state);
336 set_cpus_allowed_ptr(current, &cpus_allowed);
337 if (result)
338 return result;
339 354
340 /* get current speed setting */ 355 /* get current speed setting */
341 speed = _speedstep_get(policy->cpus); 356 speed = speedstep_get(policy_cpu);
342 if (!speed) 357 if (!speed)
343 return -EIO; 358 return -EIO;
344 359
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index 2e3c6862657b..f4c290b8482f 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -226,6 +226,7 @@ static unsigned int pentium4_get_frequency(void)
226} 226}
227 227
228 228
229/* Warning: may get called from smp_call_function_single. */
229unsigned int speedstep_get_frequency(unsigned int processor) 230unsigned int speedstep_get_frequency(unsigned int processor)
230{ 231{
231 switch (processor) { 232 switch (processor) {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7437fa133c02..3260ab044996 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
86 */ 86 */
87 if (c->x86 == 6 && c->x86_model < 15) 87 if (c->x86 == 6 && c->x86_model < 15)
88 clear_cpu_cap(c, X86_FEATURE_PAT); 88 clear_cpu_cap(c, X86_FEATURE_PAT);
89
90#ifdef CONFIG_KMEMCHECK
91 /*
92 * P4s have a "fast strings" feature which causes single-
93 * stepping REP instructions to only generate a #DB on
94 * cache-line boundaries.
95 *
96 * Ingo Molnar reported a Pentium D (model 6) and a Xeon
97 * (model 2) with the same problem.
98 */
99 if (c->x86 == 15) {
100 u64 misc_enable;
101
102 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
103
104 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
105 printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
106
107 misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
108 wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
109 }
110 }
111#endif
89} 112}
90 113
91#ifdef CONFIG_X86_32 114#ifdef CONFIG_X86_32
@@ -229,12 +252,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
229} 252}
230#endif 253#endif
231 254
232static void __cpuinit srat_detect_node(void) 255static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
233{ 256{
234#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 257#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
235 unsigned node; 258 unsigned node;
236 int cpu = smp_processor_id(); 259 int cpu = smp_processor_id();
237 int apicid = hard_smp_processor_id(); 260 int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
238 261
239 /* Don't do the funky fallback heuristics the AMD version employs 262 /* Don't do the funky fallback heuristics the AMD version employs
240 for now. */ 263 for now. */
@@ -400,7 +423,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
400 } 423 }
401 424
402 /* Work around errata */ 425 /* Work around errata */
403 srat_detect_node(); 426 srat_detect_node(c);
404 427
405 if (cpu_has(c, X86_FEATURE_VMX)) 428 if (cpu_has(c, X86_FEATURE_VMX))
406 detect_vmx_virtcap(c); 429 detect_vmx_virtcap(c);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 483eda96e102..789efe217e1a 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,6 +17,7 @@
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20#include <asm/k8.h>
20 21
21#define LVL_1_INST 1 22#define LVL_1_INST 1
22#define LVL_1_DATA 2 23#define LVL_1_DATA 2
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs {
159 unsigned long can_disable; 160 unsigned long can_disable;
160}; 161};
161 162
162#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
163static struct pci_device_id k8_nb_id[] = {
164 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
165 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
166 {}
167};
168#endif
169
170unsigned short num_cache_leaves; 163unsigned short num_cache_leaves;
171 164
172/* AMD doesn't have CPUID4. Emulate it here to report the same 165/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -207,10 +200,17 @@ union l3_cache {
207}; 200};
208 201
209static const unsigned short __cpuinitconst assocs[] = { 202static const unsigned short __cpuinitconst assocs[] = {
210 [1] = 1, [2] = 2, [4] = 4, [6] = 8, 203 [1] = 1,
211 [8] = 16, [0xa] = 32, [0xb] = 48, 204 [2] = 2,
205 [4] = 4,
206 [6] = 8,
207 [8] = 16,
208 [0xa] = 32,
209 [0xb] = 48,
212 [0xc] = 64, 210 [0xc] = 64,
213 [0xf] = 0xffff // ?? 211 [0xd] = 96,
212 [0xe] = 128,
213 [0xf] = 0xffff /* fully associative - no way to show this currently */
214}; 214};
215 215
216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; 216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
@@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
271 eax->split.type = types[leaf]; 271 eax->split.type = types[leaf];
272 eax->split.level = levels[leaf]; 272 eax->split.level = levels[leaf];
273 if (leaf == 3) 273 if (leaf == 3)
274 eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; 274 eax->split.num_threads_sharing =
275 current_cpu_data.x86_max_cores - 1;
275 else 276 else
276 eax->split.num_threads_sharing = 0; 277 eax->split.num_threads_sharing = 0;
277 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; 278 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
@@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
291{ 292{
292 if (index < 3) 293 if (index < 3)
293 return; 294 return;
295
296 if (boot_cpu_data.x86 == 0x11)
297 return;
298
299 /* see erratum #382 */
300 if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
301 return;
302
294 this_leaf->can_disable = 1; 303 this_leaf->can_disable = 1;
295} 304}
296 305
@@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
696#define to_object(k) container_of(k, struct _index_kobject, kobj) 705#define to_object(k) container_of(k, struct _index_kobject, kobj)
697#define to_attr(a) container_of(a, struct _cache_attr, attr) 706#define to_attr(a) container_of(a, struct _cache_attr, attr)
698 707
699#ifdef CONFIG_PCI 708static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
700static struct pci_dev *get_k8_northbridge(int node) 709 unsigned int index)
701{
702 struct pci_dev *dev = NULL;
703 int i;
704
705 for (i = 0; i <= node; i++) {
706 do {
707 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
708 if (!dev)
709 break;
710 } while (!pci_match_id(&k8_nb_id[0], dev));
711 if (!dev)
712 break;
713 }
714 return dev;
715}
716#else
717static struct pci_dev *get_k8_northbridge(int node)
718{
719 return NULL;
720}
721#endif
722
723static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
724{ 710{
725 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); 711 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
726 int node = cpu_to_node(cpumask_first(mask)); 712 int node = cpu_to_node(cpu);
727 struct pci_dev *dev = NULL; 713 struct pci_dev *dev = node_to_k8_nb_misc(node);
728 ssize_t ret = 0; 714 unsigned int reg = 0;
729 int i;
730 715
731 if (!this_leaf->can_disable) 716 if (!this_leaf->can_disable)
732 return sprintf(buf, "Feature not enabled\n");
733
734 dev = get_k8_northbridge(node);
735 if (!dev) {
736 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
737 return -EINVAL; 717 return -EINVAL;
738 }
739 718
740 for (i = 0; i < 2; i++) { 719 if (!dev)
741 unsigned int reg; 720 return -EINVAL;
742 721
743 pci_read_config_dword(dev, 0x1BC + i * 4, &reg); 722 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
723 return sprintf(buf, "%x\n", reg);
724}
744 725
745 ret += sprintf(buf, "%sEntry: %d\n", buf, i); 726#define SHOW_CACHE_DISABLE(index) \
746 ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", 727static ssize_t \
747 buf, 728show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
748 reg & 0x80000000 ? "Disabled" : "Allowed", 729{ \
749 reg & 0x40000000 ? "Disabled" : "Allowed"); 730 return show_cache_disable(this_leaf, buf, index); \
750 ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
751 buf, (reg & 0x30000) >> 16, reg & 0xfff);
752 }
753 return ret;
754} 731}
732SHOW_CACHE_DISABLE(0)
733SHOW_CACHE_DISABLE(1)
755 734
756static ssize_t 735static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
757store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, 736 const char *buf, size_t count, unsigned int index)
758 size_t count)
759{ 737{
760 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); 738 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
761 int node = cpu_to_node(cpumask_first(mask)); 739 int node = cpu_to_node(cpu);
762 struct pci_dev *dev = NULL; 740 struct pci_dev *dev = node_to_k8_nb_misc(node);
763 unsigned int ret, index, val; 741 unsigned long val = 0;
742 unsigned int scrubber = 0;
764 743
765 if (!this_leaf->can_disable) 744 if (!this_leaf->can_disable)
766 return 0;
767
768 if (strlen(buf) > 15)
769 return -EINVAL; 745 return -EINVAL;
770 746
771 ret = sscanf(buf, "%x %x", &index, &val); 747 if (!capable(CAP_SYS_ADMIN))
772 if (ret != 2) 748 return -EPERM;
749
750 if (!dev)
773 return -EINVAL; 751 return -EINVAL;
774 if (index > 1) 752
753 if (strict_strtoul(buf, 10, &val) < 0)
775 return -EINVAL; 754 return -EINVAL;
776 755
777 val |= 0xc0000000; 756 val |= 0xc0000000;
778 dev = get_k8_northbridge(node); 757
779 if (!dev) { 758 pci_read_config_dword(dev, 0x58, &scrubber);
780 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); 759 scrubber &= ~0x1f000000;
781 return -EINVAL; 760 pci_write_config_dword(dev, 0x58, scrubber);
782 }
783 761
784 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); 762 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
785 wbinvd(); 763 wbinvd();
786 pci_write_config_dword(dev, 0x1BC + index * 4, val); 764 pci_write_config_dword(dev, 0x1BC + index * 4, val);
765 return count;
766}
787 767
788 return 1; 768#define STORE_CACHE_DISABLE(index) \
769static ssize_t \
770store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
771 const char *buf, size_t count) \
772{ \
773 return store_cache_disable(this_leaf, buf, count, index); \
789} 774}
775STORE_CACHE_DISABLE(0)
776STORE_CACHE_DISABLE(1)
790 777
791struct _cache_attr { 778struct _cache_attr {
792 struct attribute attr; 779 struct attribute attr;
@@ -808,7 +795,10 @@ define_one_ro(size);
808define_one_ro(shared_cpu_map); 795define_one_ro(shared_cpu_map);
809define_one_ro(shared_cpu_list); 796define_one_ro(shared_cpu_list);
810 797
811static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); 798static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
799 show_cache_disable_0, store_cache_disable_0);
800static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
801 show_cache_disable_1, store_cache_disable_1);
812 802
813static struct attribute * default_attrs[] = { 803static struct attribute * default_attrs[] = {
814 &type.attr, 804 &type.attr,
@@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = {
820 &size.attr, 810 &size.attr,
821 &shared_cpu_map.attr, 811 &shared_cpu_map.attr,
822 &shared_cpu_list.attr, 812 &shared_cpu_list.attr,
823 &cache_disable.attr, 813 &cache_disable_0.attr,
814 &cache_disable_1.attr,
824 NULL 815 NULL
825}; 816};
826 817
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index b2f89829bbe8..188a1ca5ad2b 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,7 +1,12 @@
1obj-y = mce_$(BITS).o therm_throt.o 1obj-y = mce.o
2 2
3obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o 3obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o 4obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o 5obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
6obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
7obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o 8obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
7obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o 9obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
10obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
11
12obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index dd3af6e7b39a..b945d5dbc609 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -2,25 +2,23 @@
2 * Athlon specific Machine Check Exception Reporting 2 * Athlon specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@redhat.com> 3 * (C) Copyright 2002 Dave Jones <davej@redhat.com>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
13#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15 15
16#include "mce.h" 16/* Machine Check Handler For AMD Athlon/Duron: */
17
18/* Machine Check Handler For AMD Athlon/Duron */
19static void k7_machine_check(struct pt_regs *regs, long error_code) 17static void k7_machine_check(struct pt_regs *regs, long error_code)
20{ 18{
21 int recover = 1;
22 u32 alow, ahigh, high, low; 19 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth; 20 u32 mcgstl, mcgsth;
21 int recover = 1;
24 int i; 22 int i;
25 23
26 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 24 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -32,15 +30,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
32 30
33 for (i = 1; i < nr_mce_banks; i++) { 31 for (i = 1; i < nr_mce_banks; i++) {
34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); 32 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
35 if (high&(1<<31)) { 33 if (high & (1<<31)) {
36 char misc[20]; 34 char misc[20];
37 char addr[24]; 35 char addr[24];
38 misc[0] = addr[0] = '\0'; 36
37 misc[0] = '\0';
38 addr[0] = '\0';
39
39 if (high & (1<<29)) 40 if (high & (1<<29))
40 recover |= 1; 41 recover |= 1;
41 if (high & (1<<25)) 42 if (high & (1<<25))
42 recover |= 2; 43 recover |= 2;
43 high &= ~(1<<31); 44 high &= ~(1<<31);
45
44 if (high & (1<<27)) { 46 if (high & (1<<27)) {
45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); 47 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow); 48 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,27 +51,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 51 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
50 snprintf(addr, 24, " at %08x%08x", ahigh, alow); 52 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
51 } 53 }
54
52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", 55 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr); 56 smp_processor_id(), i, high, low, misc, addr);
54 /* Clear it */ 57
58 /* Clear it: */
55 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); 59 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
56 /* Serialize */ 60 /* Serialize: */
57 wmb(); 61 wmb();
58 add_taint(TAINT_MACHINE_CHECK); 62 add_taint(TAINT_MACHINE_CHECK);
59 } 63 }
60 } 64 }
61 65
62 if (recover&2) 66 if (recover & 2)
63 panic("CPU context corrupt"); 67 panic("CPU context corrupt");
64 if (recover&1) 68 if (recover & 1)
65 panic("Unable to continue"); 69 panic("Unable to continue");
70
66 printk(KERN_EMERG "Attempting to continue.\n"); 71 printk(KERN_EMERG "Attempting to continue.\n");
72
67 mcgstl &= ~(1<<2); 73 mcgstl &= ~(1<<2);
68 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 74 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
69} 75}
70 76
71 77
72/* AMD K7 machine check is Intel like */ 78/* AMD K7 machine check is Intel like: */
73void amd_mcheck_init(struct cpuinfo_x86 *c) 79void amd_mcheck_init(struct cpuinfo_x86 *c)
74{ 80{
75 u32 l, h; 81 u32 l, h;
@@ -79,21 +85,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
79 return; 85 return;
80 86
81 machine_check_vector = k7_machine_check; 87 machine_check_vector = k7_machine_check;
88 /* Make sure the vector pointer is visible before we enable MCEs: */
82 wmb(); 89 wmb();
83 90
84 printk(KERN_INFO "Intel machine check architecture supported.\n"); 91 printk(KERN_INFO "Intel machine check architecture supported.\n");
92
85 rdmsr(MSR_IA32_MCG_CAP, l, h); 93 rdmsr(MSR_IA32_MCG_CAP, l, h);
86 if (l & (1<<8)) /* Control register present ? */ 94 if (l & (1<<8)) /* Control register present ? */
87 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 95 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
88 nr_mce_banks = l & 0xff; 96 nr_mce_banks = l & 0xff;
89 97
90 /* Clear status for MC index 0 separately, we don't touch CTL, 98 /*
91 * as some K7 Athlons cause spurious MCEs when its enabled. */ 99 * Clear status for MC index 0 separately, we don't touch CTL,
100 * as some K7 Athlons cause spurious MCEs when its enabled:
101 */
92 if (boot_cpu_data.x86 == 6) { 102 if (boot_cpu_data.x86 == 6) {
93 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); 103 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
94 i = 1; 104 i = 1;
95 } else 105 } else
96 i = 0; 106 i = 0;
107
97 for (; i < nr_mce_banks; i++) { 108 for (; i < nr_mce_banks; i++) {
98 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); 109 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
99 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); 110 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
new file mode 100644
index 000000000000..a3a235a53f09
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -0,0 +1,127 @@
1/*
2 * Machine check injection support.
3 * Copyright 2008 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Authors:
11 * Andi Kleen
12 * Ying Huang
13 */
14#include <linux/uaccess.h>
15#include <linux/module.h>
16#include <linux/timer.h>
17#include <linux/kernel.h>
18#include <linux/string.h>
19#include <linux/fs.h>
20#include <linux/smp.h>
21#include <asm/mce.h>
22
23/* Update fake mce registers on current CPU. */
24static void inject_mce(struct mce *m)
25{
26 struct mce *i = &per_cpu(injectm, m->extcpu);
27
28 /* Make sure noone reads partially written injectm */
29 i->finished = 0;
30 mb();
31 m->finished = 0;
32 /* First set the fields after finished */
33 i->extcpu = m->extcpu;
34 mb();
35 /* Now write record in order, finished last (except above) */
36 memcpy(i, m, sizeof(struct mce));
37 /* Finally activate it */
38 mb();
39 i->finished = 1;
40}
41
42struct delayed_mce {
43 struct timer_list timer;
44 struct mce m;
45};
46
47/* Inject mce on current CPU */
48static void raise_mce(unsigned long data)
49{
50 struct delayed_mce *dm = (struct delayed_mce *)data;
51 struct mce *m = &dm->m;
52 int cpu = m->extcpu;
53
54 inject_mce(m);
55 if (m->status & MCI_STATUS_UC) {
56 struct pt_regs regs;
57 memset(&regs, 0, sizeof(struct pt_regs));
58 regs.ip = m->ip;
59 regs.cs = m->cs;
60 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
61 do_machine_check(&regs, 0);
62 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
63 } else {
64 mce_banks_t b;
65 memset(&b, 0xff, sizeof(mce_banks_t));
66 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
67 machine_check_poll(0, &b);
68 mce_notify_irq();
69 printk(KERN_INFO "Finished machine check poll on CPU %d\n",
70 cpu);
71 }
72 kfree(dm);
73}
74
75/* Error injection interface */
76static ssize_t mce_write(struct file *filp, const char __user *ubuf,
77 size_t usize, loff_t *off)
78{
79 struct delayed_mce *dm;
80 struct mce m;
81
82 if (!capable(CAP_SYS_ADMIN))
83 return -EPERM;
84 /*
85 * There are some cases where real MSR reads could slip
86 * through.
87 */
88 if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
89 return -EIO;
90
91 if ((unsigned long)usize > sizeof(struct mce))
92 usize = sizeof(struct mce);
93 if (copy_from_user(&m, ubuf, usize))
94 return -EFAULT;
95
96 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
97 return -EINVAL;
98
99 dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
100 if (!dm)
101 return -ENOMEM;
102
103 /*
104 * Need to give user space some time to set everything up,
105 * so do it a jiffie or two later everywhere.
106 * Should we use a hrtimer here for better synchronization?
107 */
108 memcpy(&dm->m, &m, sizeof(struct mce));
109 setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
110 dm->timer.expires = jiffies + 2;
111 add_timer_on(&dm->timer, m.extcpu);
112 return usize;
113}
114
115static int inject_init(void)
116{
117 printk(KERN_INFO "Machine check injector initialized\n");
118 mce_chrdev_ops.write = mce_write;
119 return 0;
120}
121
122module_init(inject_init);
123/*
124 * Cannot tolerate unloading currently because we cannot
125 * guarantee all openers of mce_chrdev will get a reference to us.
126 */
127MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
new file mode 100644
index 000000000000..54dcb8ff12e5
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -0,0 +1,15 @@
1#include <asm/mce.h>
2
3enum severity_level {
4 MCE_NO_SEVERITY,
5 MCE_KEEP_SEVERITY,
6 MCE_SOME_SEVERITY,
7 MCE_AO_SEVERITY,
8 MCE_UC_SEVERITY,
9 MCE_AR_SEVERITY,
10 MCE_PANIC_SEVERITY,
11};
12
13int mce_severity(struct mce *a, int tolerant, char **msg);
14
15extern int mce_ser;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
new file mode 100644
index 000000000000..ff0807f97056
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -0,0 +1,218 @@
1/*
2 * MCE grading rules.
3 * Copyright 2008, 2009 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Author: Andi Kleen
11 */
12#include <linux/kernel.h>
13#include <linux/seq_file.h>
14#include <linux/init.h>
15#include <linux/debugfs.h>
16#include <asm/mce.h>
17
18#include "mce-internal.h"
19
20/*
21 * Grade an mce by severity. In general the most severe ones are processed
22 * first. Since there are quite a lot of combinations test the bits in a
23 * table-driven way. The rules are simply processed in order, first
24 * match wins.
25 *
26 * Note this is only used for machine check exceptions, the corrected
27 * errors use much simpler rules. The exceptions still check for the corrected
28 * errors, but only to leave them alone for the CMCI handler (except for
29 * panic situations)
30 */
31
32enum context { IN_KERNEL = 1, IN_USER = 2 };
33enum ser { SER_REQUIRED = 1, NO_SER = 2 };
34
35static struct severity {
36 u64 mask;
37 u64 result;
38 unsigned char sev;
39 unsigned char mcgmask;
40 unsigned char mcgres;
41 unsigned char ser;
42 unsigned char context;
43 unsigned char covered;
44 char *msg;
45} severities[] = {
46#define KERNEL .context = IN_KERNEL
47#define USER .context = IN_USER
48#define SER .ser = SER_REQUIRED
49#define NOSER .ser = NO_SER
50#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
51#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
52#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
53#define MCGMASK(x, res, s, m, r...) \
54 { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
55#define MASK(x, y, s, m, r...) \
56 { .mask = x, .result = y, SEV(s), .msg = m, ## r }
57#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
58#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
59#define MCACOD 0xffff
60
61 BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
62 BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
63 BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
64 /* When MCIP is not set something is very confused */
65 MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
66 /* Neither return not error IP -- no chance to recover -> PANIC */
67 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
68 "Neither restart nor error IP"),
69 MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
70 KERNEL),
71 BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
72 MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
73 "Spurious not enabled", SER),
74
75 /* ignore OVER for UCNA */
76 MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
77 "Uncorrected no action required", SER),
78 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
79 "Illegal combination (UCNA with AR=1)", SER),
80 MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
81
82 /* AR add known MCACODs here */
83 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
84 "Action required with lost events", SER),
85 MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
86 "Action required; unknown MCACOD", SER),
87
88 /* known AO MCACODs: */
89 MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
90 "Action optional: memory scrubbing error", SER),
91 MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
92 "Action optional: last level cache writeback error", SER),
93
94 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
95 "Action optional unknown MCACOD", SER),
96 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
97 "Action optional with lost events", SER),
98 BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
99 BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
100 BITSET(0, SOME, "No match") /* always matches. keep at end */
101};
102
103/*
104 * If the EIPV bit is set, it means the saved IP is the
105 * instruction which caused the MCE.
106 */
107static int error_context(struct mce *m)
108{
109 if (m->mcgstatus & MCG_STATUS_EIPV)
110 return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
111 /* Unknown, assume kernel */
112 return IN_KERNEL;
113}
114
115int mce_severity(struct mce *a, int tolerant, char **msg)
116{
117 enum context ctx = error_context(a);
118 struct severity *s;
119
120 for (s = severities;; s++) {
121 if ((a->status & s->mask) != s->result)
122 continue;
123 if ((a->mcgstatus & s->mcgmask) != s->mcgres)
124 continue;
125 if (s->ser == SER_REQUIRED && !mce_ser)
126 continue;
127 if (s->ser == NO_SER && mce_ser)
128 continue;
129 if (s->context && ctx != s->context)
130 continue;
131 if (msg)
132 *msg = s->msg;
133 s->covered = 1;
134 if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
135 if (panic_on_oops || tolerant < 1)
136 return MCE_PANIC_SEVERITY;
137 }
138 return s->sev;
139 }
140}
141
142static void *s_start(struct seq_file *f, loff_t *pos)
143{
144 if (*pos >= ARRAY_SIZE(severities))
145 return NULL;
146 return &severities[*pos];
147}
148
149static void *s_next(struct seq_file *f, void *data, loff_t *pos)
150{
151 if (++(*pos) >= ARRAY_SIZE(severities))
152 return NULL;
153 return &severities[*pos];
154}
155
156static void s_stop(struct seq_file *f, void *data)
157{
158}
159
160static int s_show(struct seq_file *f, void *data)
161{
162 struct severity *ser = data;
163 seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
164 return 0;
165}
166
167static const struct seq_operations severities_seq_ops = {
168 .start = s_start,
169 .next = s_next,
170 .stop = s_stop,
171 .show = s_show,
172};
173
174static int severities_coverage_open(struct inode *inode, struct file *file)
175{
176 return seq_open(file, &severities_seq_ops);
177}
178
179static ssize_t severities_coverage_write(struct file *file,
180 const char __user *ubuf,
181 size_t count, loff_t *ppos)
182{
183 int i;
184 for (i = 0; i < ARRAY_SIZE(severities); i++)
185 severities[i].covered = 0;
186 return count;
187}
188
189static const struct file_operations severities_coverage_fops = {
190 .open = severities_coverage_open,
191 .release = seq_release,
192 .read = seq_read,
193 .write = severities_coverage_write,
194};
195
196static int __init severities_debugfs_init(void)
197{
198 struct dentry *dmce = NULL, *fseverities_coverage = NULL;
199
200 dmce = debugfs_create_dir("mce", NULL);
201 if (dmce == NULL)
202 goto err_out;
203 fseverities_coverage = debugfs_create_file("severities-coverage",
204 0444, dmce, NULL,
205 &severities_coverage_fops);
206 if (fseverities_coverage == NULL)
207 goto err_out;
208
209 return 0;
210
211err_out:
212 if (fseverities_coverage)
213 debugfs_remove(fseverities_coverage);
214 if (dmce)
215 debugfs_remove(dmce);
216 return -ENOMEM;
217}
218late_initcall(severities_debugfs_init);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
new file mode 100644
index 000000000000..284d1de968bc
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,2049 @@
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h>
15#include <linux/kallsyms.h>
16#include <linux/rcupdate.h>
17#include <linux/kobject.h>
18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
22#include <linux/string.h>
23#include <linux/sysdev.h>
24#include <linux/delay.h>
25#include <linux/ctype.h>
26#include <linux/sched.h>
27#include <linux/sysfs.h>
28#include <linux/types.h>
29#include <linux/init.h>
30#include <linux/kmod.h>
31#include <linux/poll.h>
32#include <linux/nmi.h>
33#include <linux/cpu.h>
34#include <linux/smp.h>
35#include <linux/fs.h>
36#include <linux/mm.h>
37
38#include <asm/processor.h>
39#include <asm/hw_irq.h>
40#include <asm/apic.h>
41#include <asm/idle.h>
42#include <asm/ipi.h>
43#include <asm/mce.h>
44#include <asm/msr.h>
45
46#include "mce-internal.h"
47
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/*
 * Call the installed machine check handler for this CPU setup.
 * Points at the stub above until a real handler is installed.
 */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;
58
59int mce_disabled __read_mostly;
60
61#ifdef CONFIG_X86_NEW_MCE
62
63#define MISC_MCELOG_MINOR 227
64
65#define SPINUNIT 100 /* 100ns */
66
67atomic_t mce_entry;
68
69DEFINE_PER_CPU(unsigned, mce_exception_count);
70
71/*
72 * Tolerant levels:
73 * 0: always panic on uncorrected errors, log corrected errors
74 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
75 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
76 * 3: never panic or SIGBUS, log all errors (for testing only)
77 */
78static int tolerant __read_mostly = 1;
79static int banks __read_mostly;
80static u64 *bank __read_mostly;
81static int rip_msr __read_mostly;
82static int mce_bootlog __read_mostly = -1;
83static int monarch_timeout __read_mostly = -1;
84static int mce_panic_timeout __read_mostly;
85static int mce_dont_log_ce __read_mostly;
86int mce_cmci_disabled __read_mostly;
87int mce_ignore_ce __read_mostly;
88int mce_ser __read_mostly;
89
90/* User mode helper program triggered by machine check event */
91static unsigned long mce_need_notify;
92static char mce_helper[128];
93static char *mce_helper_argv[2] = { mce_helper, NULL };
94
95static unsigned long dont_init_banks;
96
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
98static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing;
100
101
102/* MCA banks polled by the period polling timer for corrected events */
103DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
105};
106
107static inline int skip_bank_init(int i)
108{
109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
110}
111
112static DEFINE_PER_CPU(struct work_struct, mce_work);
113
/*
 * Do initial initialization of a struct mce: zero it and fill in the
 * fields identifying where and when the record was taken (CPU number,
 * TSC, wall time, CPU vendor/cpuid, socket and APIC id, MCG_CAP).
 */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}
130
131DEFINE_PER_CPU(struct mce, injectm);
132EXPORT_PER_CPU_SYMBOL_GPL(injectm);
133
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature = MCE_LOG_SIGNATURE,
	.len = MCE_LOG_LEN,
	.recordlen = sizeof(struct mce),
};

/*
 * Append one record to the global mcelog buffer. Safe to call from
 * machine-check/NMI context: a cmpxchg loop reserves a slot, and the
 * per-entry "finished" flag plus write barriers publish the payload
 * to concurrent readers in a defined order.
 */
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		/* Claim the slot; retry if another CPU raced us to it. */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	/* Publish the record only after the payload is globally visible. */
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}
185
/*
 * Dump one MCE record to the console. Only called on the panic path,
 * where normal printk locking concerns no longer apply.
 */
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		/* Only kernel-mode IPs can be resolved to symbols here. */
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
	       m->cpuvendor, m->cpuid, m->time, m->socketid,
	       m->apicid);
}

/* Banner printed once before the MCE records on the panic path. */
static void print_mce_head(void)
{
	printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
}

/* Trailer printed once after all MCE records on the panic path. */
static void print_mce_tail(void)
{
	printk(KERN_EMERG "This is not a software problem!\n"
	       KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}
220
#define PANIC_TIMEOUT 5 /* 5 seconds */

/* Non-zero once any CPU has entered mce_panic(). */
static atomic_t mce_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	/* The panicking CPU never stopped us: panic ourselves. */
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicing machine check CPU died");
}

/*
 * Panic with all pending MCE records dumped to the console.
 * @msg:   panic reason
 * @final: the record that triggered the panic (printed last), may be NULL
 * @exp:   optional explanation string from the severity grader
 */
static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	/*
	 * Make sure only one CPU runs in machine check panic
	 */
	if (atomic_add_return(1, &mce_paniced) > 1)
		wait_for_panic();
	barrier();

	bust_spinlocks(1);
	console_verbose();
	print_mce_head();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];
		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		/* Skip the final record here; it is printed below. */
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	print_mce_tail();
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic(msg);
}
281
/* Support code for software error injection */

/*
 * Map an MCE-related MSR number to the offset of the corresponding
 * field in struct mce, so injected values can be read/written from
 * the per-cpu injectm record instead of hardware. Returns -1 for
 * MSRs that have no struct mce field.
 */
static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);
	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MC0_STATUS + bank*4)
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MC0_ADDR + bank*4)
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MC0_MISC + bank*4)
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;
	/* An injected record is pending: read from it, not the CPU. */
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);
		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

/* Write wrapper: redirects to the injected record when one is active. */
static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);
		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}
324
/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;	/* reader position */
	unsigned short end;	/* writer position */
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

/*
 * Pop one PFN from this CPU's ring into *pfn.
 * Returns 1 on success, 0 if the ring was empty (*pfn set to 0).
 */
static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	/* Ring full: drop the PFN (returns -1). */
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	/* Publish the entry before advancing the writer index. */
	wmb();
	r->end = next;
	return 0;
}
379
380int mce_available(struct cpuinfo_x86 *c)
381{
382 if (mce_disabled)
383 return 0;
384 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
385}
386
/*
 * Kick the per-cpu work item that processes queued PFNs, but only when
 * the ring actually has entries and no run is already pending.
 */
static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);
		if (!work_pending(work))
			schedule_work(work);
	}
}
395
/*
 * Get the address of the instruction at the time of the machine check
 * error. Uses regs->ip only when the hardware says it is meaningful
 * (RIPV or EIPV set); prefers the dedicated RIP MSR when available.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{

	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when a MCE happened during an interrupts off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_irq();
	mce_schedule_work();
	irq_exit();
}
#endif
430
/*
 * Notify about a logged event. If interrupts were enabled at the time
 * of the MCE we can notify directly; otherwise defer via a self-IPI
 * that runs once interrupts are reenabled.
 */
static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger an self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}
469
470DEFINE_PER_CPU(unsigned, mce_poll_count);
471
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	/* Scan only the banks enabled globally and requested in *b. */
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);
551
/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 * Returns 1 (and sets *msg) if any bank grades at panic severity.
 * Note: m->status is clobbered with the last bank's status read.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}
567
/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 * Decrements *t by one spin unit; returns 1 once the budget is spent
 * (or panics first when tolerant == 0 and a timeout is configured).
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}
608
/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in a unrecoverable case
 * and also makes sure always all CPU's errors are examined.
 *
 * Also this detects the case of an machine check event coming from outer
 * space (not detected by any CPUs) In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in a unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let continue the others, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (!m && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}
684
/* Global count of CPUs that saw a no-way-out condition this event. */
static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * Returns this CPU's callin order (1 == Monarch), or -1 when
 * synchronization is disabled or timed out. *no_way_out is updated
 * with the global consensus.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_add_return(1, &mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}
754
/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 * Returns 0 when synchronization completed, -1 on timeout/disabled
 * (in which case the caller falls back to its local state).
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}
821
/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	/* MISC[5:0] is the least significant valid address bit. */
	if ((m->misc & 0x3f) > PAGE_SHIFT)
		return 0;
	/* MISC[8:6] is the address mode; only physical is usable. */
	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
		return 0;
	return 1;
}

/* Clear the status MSR of every bank marked in the toclear bitmap. */
static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
848
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	no_way_out = mce_no_way_out(&m, &msg);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	barrier();

	/*
	 * When no restart IP must always kill or panic.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected or non signaled errors are handled by
		 * machine_check_poll. Leave them alone, unless this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
			!no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When machine check was for corrected handler don't touch,
		 * unless we're panicing.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/*
		 * Kill on action required.
		 */
		if (severity == MCE_AR_SEVERITY)
			kill_it = 1;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Track the worst record seen locally for mces_seen. */
		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */

	if (kill_it && tolerant < 3)
		force_sig(SIGBUS, current);

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
1031
/* dummy to break dependency. actual code is in mm/memory-failure.c */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
}

/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;
	mce_notify_irq();
	/* Drain every PFN queued by the exception handler. */
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}

/* Workqueue entry point: just defer to mce_notify_process(). */
static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}
1061
#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
1086
/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

/* Per-cpu timer callback; data is the owning CPU number. */
static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				&__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	/* Re-arm ourselves with the adapted interval. */
	t->expires = jiffies + *n;
	add_timer(t);
}
1122
/* Work item: run the user mode helper configured in mce_helper. */
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 * Returns 1 if there were new events to report, 0 otherwise.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
1161
/*
 * Initialize Machine Checks for a CPU.
 * Reads MCG_CAP to discover the bank count (clamped to MAX_NR_BANKS),
 * allocates the per-bank control array on first call, and detects the
 * extended RIP MSR and software error recovery (SER) capabilities.
 * Returns 0 on success or -ENOMEM.
 */
static int mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		/* Default: all error types enabled in every bank. */
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}
1201
/*
 * Enable machine checks on this CPU: log leftovers from before reset,
 * set CR4.MCE and program the per-bank control/status MSRs.
 */
static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		/* Quirk code may have asked us to leave some banks alone */
		if (skip_bank_init(i))
			continue;
		/* Bank MSRs are spaced 4 apart (CTL/STATUS/ADDR/MISC) */
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
1227
/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&bank[4]);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && banks > 0)
			bank[0] = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */

		if (c->x86 == 6 && c->x86_model < 0x1A)
			__set_bit(0, &dont_init_banks);

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
			monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;
	}
	/* Values still negative were neither set on the command line
	 * nor by the vendor sections above: apply defaults. */
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;
}
1282
1283static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1284{
1285 if (c->x86 != 5)
1286 return;
1287 switch (c->x86_vendor) {
1288 case X86_VENDOR_INTEL:
1289 intel_p5_mcheck_init(c);
1290 break;
1291 case X86_VENDOR_CENTAUR:
1292 winchip_mcheck_init(c);
1293 break;
1294 }
1295}
1296
1297static void mce_cpu_features(struct cpuinfo_x86 *c)
1298{
1299 switch (c->x86_vendor) {
1300 case X86_VENDOR_INTEL:
1301 mce_intel_feature_init(c);
1302 break;
1303 case X86_VENDOR_AMD:
1304 mce_amd_feature_init(c);
1305 break;
1306 default:
1307 break;
1308 }
1309}
1310
/*
 * Arm this CPU's corrected-event polling timer, unless polling is
 * disabled (mce=ignore_ce or check_interval == 0).
 */
static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	/* The timer callback gets the owning CPU number as data */
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}
1326
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	/* Family 5 CPUs get vendor-specific handling only */
	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		/* Could not size/allocate the bank array: turn MCE off */
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	/* Install the real #MC handler in place of the default stub */
	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
}
1354
1355/*
1356 * Character device to read and clear the MCE log.
1357 */
1358
1359static DEFINE_SPINLOCK(mce_state_lock);
1360static int open_count; /* #times opened */
1361static int open_exclu; /* already open exclusive? */
1362
1363static int mce_open(struct inode *inode, struct file *file)
1364{
1365 spin_lock(&mce_state_lock);
1366
1367 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
1368 spin_unlock(&mce_state_lock);
1369
1370 return -EBUSY;
1371 }
1372
1373 if (file->f_flags & O_EXCL)
1374 open_exclu = 1;
1375 open_count++;
1376
1377 spin_unlock(&mce_state_lock);
1378
1379 return nonseekable_open(inode, file);
1380}
1381
/* Release /dev/mcelog: drop the open count and any exclusive claim. */
static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}
1393
/* on_each_cpu callback: record this CPU's TSC into the passed array,
 * indexed by CPU number. */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}
1400
1401static DEFINE_MUTEX(mce_read_mutex);
1402
/*
 * Read and clear the MCE log.  Only whole-buffer reads are supported:
 * the caller must supply room for MCE_LOG_LEN records at offset 0.
 * Copies out all finished entries, zeroes them, and finally sweeps up
 * entries that were still being written when the drain started.
 * Returns bytes copied or a negative errno.
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		/* Copy out every entry logged so far in [prev, next) */
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			/*
			 * A writer reserved this slot but has not set
			 * ->finished yet; give it a couple of jiffies,
			 * then give up and discard the slot.
			 */
			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			/* Order the ->finished check before reading the data */
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		/* Reset ->next to 0; loop again if new entries raced in */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	/* An entry with a TSC older than its CPU's current TSC is done */
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}
1477
1478static unsigned int mce_poll(struct file *file, poll_table *wait)
1479{
1480 poll_wait(file, &mce_wait, wait);
1481 if (rcu_dereference(mcelog.next))
1482 return POLLIN | POLLRDNORM;
1483 return 0;
1484}
1485
/*
 * ioctl interface for /dev/mcelog.  Requires CAP_SYS_ADMIN.
 * Supports querying the record and log lengths, and an atomic
 * read-and-clear of the log flags.
 */
static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		/* Retry until no writer changed the flags underneath us */
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
1511
/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

/* The /dev/mcelog misc device, registered in mce_init_device() */
static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
1527
1528/*
1529 * mce=off Disables machine check
1530 * mce=no_cmci Disables CMCI
1531 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1532 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1533 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1534 * monarchtimeout is how long to wait for other CPUs on machine
1535 * check, or 0 to not wait
1536 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1537 * mce=nobootlog Don't log MCEs from before booting.
1538 */
/*
 * Parse the "mce" kernel parameter (see the option list in the comment
 * above).  A bare "mce" with no argument enables P5 machine checks.
 * Returns 1 when the option was consumed, 0 when it was unrecognized.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0)
		enable_p5_mce();
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		/* mce=TOLERANCELEVEL[,monarchtimeout] */
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);
1569
1570/*
1571 * Sysfs support
1572 */
1573
1574/*
1575 * Disable machine checks on suspend and shutdown. We can't really handle
1576 * them later.
1577 */
1578static int mce_disable(void)
1579{
1580 int i;
1581
1582 for (i = 0; i < banks; i++) {
1583 if (!skip_bank_init(i))
1584 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1585 }
1586 return 0;
1587}
1588
/* Sysdev suspend hook: turn off all machine check banks. */
static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}
1593
/* Sysdev shutdown hook: turn off all machine check banks. */
static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}
1598
/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}
1611
/* Per-CPU part of mce_restart(): stop the poll timer, then reprogram
 * the banks and re-arm the timer. */
static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (!mce_available(&current_cpu_data))
		return;
	mce_init();
	mce_init_timer();
}
1620
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}
1626
/* Toggle features for corrected errors */
/* Per-CPU disable of CE handling; non-NULL 'all' also stops the
 * polling timer (used for ignore_ce, not for plain cmci_disabled). */
static void mce_disable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	if (all)
		del_timer_sync(&__get_cpu_var(mce_timer));
	cmci_clear();
}
1636
/* Per-CPU re-enable of CE handling; non-NULL 'all' also re-arms the
 * polling timer. Mirror image of mce_disable_ce(). */
static void mce_enable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		mce_init_timer();
}
1646
/* Sysdev class backing /sys/devices/system/machinecheck */
static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

/* One sysdev node per CPU, registered/removed on hotplug below */
DEFINE_PER_CPU(struct sys_device, mce_dev);

/* Optional hook invoked from mce_cpu_callback(); assigned outside this
 * file (presumably by vendor threshold code — not visible here). */
__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

/* Dynamically allocated "bank%d" attributes, one per MCE bank */
static struct sysdev_attribute *bank_attrs;
1660
1661static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1662 char *buf)
1663{
1664 u64 b = bank[attr - bank_attrs];
1665
1666 return sprintf(buf, "%llx\n", b);
1667}
1668
/* sysfs store for bank%d: parse a new control word and reprogram all
 * CPUs via mce_restart().  Returns bytes consumed or -EINVAL. */
static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	bank[attr - bank_attrs] = new;
	mce_restart();

	return size;
}
1682
1683static ssize_t
1684show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1685{
1686 strcpy(buf, mce_helper);
1687 strcat(buf, "\n");
1688 return strlen(mce_helper) + 1;
1689}
1690
1691static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1692 const char *buf, size_t siz)
1693{
1694 char *p;
1695 int len;
1696
1697 strncpy(mce_helper, buf, sizeof(mce_helper));
1698 mce_helper[sizeof(mce_helper)-1] = 0;
1699 len = strlen(mce_helper);
1700 p = strchr(mce_helper, '\n');
1701
1702 if (*p)
1703 *p = 0;
1704
1705 return len;
1706}
1707
/* sysfs store for ignore_ce: on a 0<->1 transition, disable or enable
 * all corrected-error features (poll timer + CMCI) on every CPU. */
static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	/* Only act when the value actually changes */
	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			on_each_cpu(mce_disable_ce, (void *)1, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}
1730
/* sysfs store for cmci_disabled: on a 0<->1 transition, clear or
 * re-enable CMCI on every CPU (the poll timer is left alone: NULL arg). */
static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	/* Only act when the value actually changes */
	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_ce, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}
1753
/* Generic integer store that also reinitializes MCE on all CPUs,
 * used for tunables (check_interval) that require re-arming timers. */
static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}
1762
/* Trigger helper path plus plain integer tunables */
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

/* check_interval changes must re-arm the timers: restart on store */
static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

/* ignore_ce / cmci_disabled use custom stores that toggle CE features */
static struct sysdev_ext_attribute attr_ignore_ce = {
	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

/* NULL-terminated list of per-CPU attributes created for each device */
static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr,
	&attr_check_interval.attr,
	&attr_trigger,
	&attr_monarch_timeout.attr,
	&attr_dont_log_ce.attr,
	&attr_ignore_ce.attr,
	&attr_cmci_disabled.attr,
	NULL
};
1794
1795static cpumask_var_t mce_dev_initialized;
1796
/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
/* Returns 0 on success or a negative errno; on failure everything
 * created so far for this CPU is rolled back. */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id = cpu;
	per_cpu(mce_dev, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	/* Common tunables first, then one file per bank */
	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &bank_attrs[j]);
		if (err)
			goto error2;
	}
	/* Mark done so mce_remove_device() knows this CPU is populated */
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--j >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}
1839
/* Tear down the sysfs files and sysdev created by mce_create_device().
 * No-op for CPUs whose device was never fully created. */
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}
1856
/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	/* Skip CMCI teardown during suspend/resume (frozen) transitions */
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	}
}
1872
/* Re-enable machine checks on a CPU whose offline was aborted;
 * mirror image of mce_disable_cpu(). */
static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	/* Skip CMCI setup during suspend/resume (frozen) transitions */
	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		if (!skip_bank_init(i))
			wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
	}
}
1888
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		/* Create sysfs files, then let vendor code add its own */
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		/* Teardown in reverse order of creation */
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		/* Stop polling and quiesce MCE on the departing CPU */
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		/* Offline aborted: re-arm the timer and re-enable banks */
		t->expires = round_jiffies(jiffies +
						__get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}
1928
1929static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1930 .notifier_call = mce_cpu_callback,
1931};
1932
/*
 * Allocate and fill in one sysfs attribute ("bank%d") per MCE bank.
 * Returns 0 or -ENOMEM; on failure everything allocated here is freed.
 */
static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
				GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name	= kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
	return 0;

nomem:
	/* Unwind the names allocated before the failure */
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;

	return -ENOMEM;
}
1963
1964static __init int mce_init_device(void)
1965{
1966 int err;
1967 int i = 0;
1968
1969 if (!mce_available(&boot_cpu_data))
1970 return -EIO;
1971
1972 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1973
1974 err = mce_init_banks();
1975 if (err)
1976 return err;
1977
1978 err = sysdev_class_register(&mce_sysclass);
1979 if (err)
1980 return err;
1981
1982 for_each_online_cpu(i) {
1983 err = mce_create_device(i);
1984 if (err)
1985 return err;
1986 }
1987
1988 register_hotcpu_notifier(&mce_cpu_notifier);
1989 misc_register(&mce_log_device);
1990
1991 return err;
1992}
1993
1994device_initcall(mce_init_device);
1995
1996#else /* CONFIG_X86_OLD_MCE: */
1997
1998int nr_mce_banks;
1999EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
2000
/* This has to be run for each processor */
/* CONFIG_X86_OLD_MCE variant: dispatch straight to the per-vendor,
 * per-family legacy machine check initializers. */
void mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	switch (c->x86_vendor) {
	case X86_VENDOR_AMD:
		amd_mcheck_init(c);
		break;

	case X86_VENDOR_INTEL:
		if (c->x86 == 5)
			intel_p5_mcheck_init(c);
		if (c->x86 == 6)
			intel_p6_mcheck_init(c);
		if (c->x86 == 15)
			intel_p4_mcheck_init(c);
		break;

	case X86_VENDOR_CENTAUR:
		if (c->x86 == 5)
			winchip_mcheck_init(c);
		break;

	default:
		break;
	}
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
}
2031
/* "mce" parameter in the old-MCE configuration: just enable P5 MCE.
 * The argument string is ignored here. */
static int __init mcheck_enable(char *str)
{
	mce_p5_enabled = 1;
	return 1;
}
__setup("mce", mcheck_enable);
2038
2039#endif /* CONFIG_X86_OLD_MCE */
2040
/*
 * Old style boot options parsing. Only for compatibility.
 */
/* "nomce" disables machine checks entirely, in both configurations. */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
deleted file mode 100644
index ae9f628838f1..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ /dev/null
@@ -1,14 +0,0 @@
1#include <linux/init.h>
2#include <asm/mce.h>
3
4void amd_mcheck_init(struct cpuinfo_x86 *c);
5void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
6void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
7void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
8void winchip_mcheck_init(struct cpuinfo_x86 *c);
9
10/* Call the installed machine check handler for this CPU setup. */
11extern void (*machine_check_vector)(struct pt_regs *, long error_code);
12
13extern int nr_mce_banks;
14
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
deleted file mode 100644
index 3552119b091d..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ /dev/null
@@ -1,76 +0,0 @@
1/*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/smp.h>
11#include <linux/thread_info.h>
12
13#include <asm/processor.h>
14#include <asm/system.h>
15#include <asm/mce.h>
16
17#include "mce.h"
18
19int mce_disabled;
20int nr_mce_banks;
21
22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
23
24/* Handle unconfigured int18 (should never happen) */
25static void unexpected_machine_check(struct pt_regs *regs, long error_code)
26{
27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
28}
29
30/* Call the installed machine check handler for this CPU setup. */
31void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
32
33/* This has to be run for each processor */
34void mcheck_init(struct cpuinfo_x86 *c)
35{
36 if (mce_disabled == 1)
37 return;
38
39 switch (c->x86_vendor) {
40 case X86_VENDOR_AMD:
41 amd_mcheck_init(c);
42 break;
43
44 case X86_VENDOR_INTEL:
45 if (c->x86 == 5)
46 intel_p5_mcheck_init(c);
47 if (c->x86 == 6)
48 intel_p6_mcheck_init(c);
49 if (c->x86 == 15)
50 intel_p4_mcheck_init(c);
51 break;
52
53 case X86_VENDOR_CENTAUR:
54 if (c->x86 == 5)
55 winchip_mcheck_init(c);
56 break;
57
58 default:
59 break;
60 }
61}
62
63static int __init mcheck_disable(char *str)
64{
65 mce_disabled = 1;
66 return 1;
67}
68
69static int __init mcheck_enable(char *str)
70{
71 mce_disabled = -1;
72 return 1;
73}
74
75__setup("nomce", mcheck_disable);
76__setup("mce", mcheck_enable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
deleted file mode 100644
index 09dd1d414fc3..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ /dev/null
@@ -1,1187 +0,0 @@
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/smp_lock.h>
15#include <linux/string.h>
16#include <linux/rcupdate.h>
17#include <linux/kallsyms.h>
18#include <linux/sysdev.h>
19#include <linux/miscdevice.h>
20#include <linux/fs.h>
21#include <linux/capability.h>
22#include <linux/cpu.h>
23#include <linux/percpu.h>
24#include <linux/poll.h>
25#include <linux/thread_info.h>
26#include <linux/ctype.h>
27#include <linux/kmod.h>
28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
32#include <asm/processor.h>
33#include <asm/msr.h>
34#include <asm/mce.h>
35#include <asm/uaccess.h>
36#include <asm/smp.h>
37#include <asm/idle.h>
38
39#define MISC_MCELOG_MINOR 227
40
41atomic_t mce_entry;
42
43static int mce_dont_init;
44
45/*
46 * Tolerant levels:
47 * 0: always panic on uncorrected errors, log corrected errors
48 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
49 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
50 * 3: never panic or SIGBUS, log all errors (for testing only)
51 */
52static int tolerant = 1;
53static int banks;
54static u64 *bank;
55static unsigned long notify_user;
56static int rip_msr;
57static int mce_bootlog = -1;
58static atomic_t mce_events;
59
60static char trigger[128];
61static char *trigger_argv[2] = { trigger, NULL };
62
63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
64
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
78/*
79 * Lockless MCE logging infrastructure.
80 * This avoids deadlocks on printk locks without having to break locks. Also
81 * separate MCEs from kernel messages to avoid bogus bug reports.
82 */
83
84static struct mce_log mcelog = {
85 MCE_LOG_SIGNATURE,
86 MCE_LOG_LEN,
87};
88
89void mce_log(struct mce *mce)
90{
91 unsigned next, entry;
92 atomic_inc(&mce_events);
93 mce->finished = 0;
94 wmb();
95 for (;;) {
96 entry = rcu_dereference(mcelog.next);
97 for (;;) {
98 /* When the buffer fills up discard new entries. Assume
99 that the earlier errors are the more interesting. */
100 if (entry >= MCE_LOG_LEN) {
101 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
102 return;
103 }
104 /* Old left over entry. Skip. */
105 if (mcelog.entry[entry].finished) {
106 entry++;
107 continue;
108 }
109 break;
110 }
111 smp_rmb();
112 next = entry + 1;
113 if (cmpxchg(&mcelog.next, entry, next) == entry)
114 break;
115 }
116 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
117 wmb();
118 mcelog.entry[entry].finished = 1;
119 wmb();
120
121 set_bit(0, &notify_user);
122}
123
124static void print_mce(struct mce *m)
125{
126 printk(KERN_EMERG "\n"
127 KERN_EMERG "HARDWARE ERROR\n"
128 KERN_EMERG
129 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
130 m->cpu, m->mcgstatus, m->bank, m->status);
131 if (m->ip) {
132 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
133 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
134 m->cs, m->ip);
135 if (m->cs == __KERNEL_CS)
136 print_symbol("{%s}", m->ip);
137 printk("\n");
138 }
139 printk(KERN_EMERG "TSC %llx ", m->tsc);
140 if (m->addr)
141 printk("ADDR %llx ", m->addr);
142 if (m->misc)
143 printk("MISC %llx ", m->misc);
144 printk("\n");
145 printk(KERN_EMERG "This is not a software problem!\n");
146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
147 "and contact your hardware vendor\n");
148}
149
150static void mce_panic(char *msg, struct mce *backup, unsigned long start)
151{
152 int i;
153
154 oops_begin();
155 for (i = 0; i < MCE_LOG_LEN; i++) {
156 unsigned long tsc = mcelog.entry[i].tsc;
157
158 if (time_before(tsc, start))
159 continue;
160 print_mce(&mcelog.entry[i]);
161 if (backup && mcelog.entry[i].tsc == backup->tsc)
162 backup = NULL;
163 }
164 if (backup)
165 print_mce(backup);
166 panic(msg);
167}
168
169int mce_available(struct cpuinfo_x86 *c)
170{
171 if (mce_dont_init)
172 return 0;
173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
174}
175
176static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
177{
178 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
179 m->ip = regs->ip;
180 m->cs = regs->cs;
181 } else {
182 m->ip = 0;
183 m->cs = 0;
184 }
185 if (rip_msr) {
186 /* Assume the RIP in the MSR is exact. Is this true? */
187 m->mcgstatus |= MCG_STATUS_EIPV;
188 rdmsrl(rip_msr, m->ip);
189 m->cs = 0;
190 }
191}
192
193/*
194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242 if (!(flags & MCP_DONTLOG)) {
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245 }
246
247 /*
248 * Clear state for this bank.
249 */
250 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
251 }
252
253 /*
254 * Don't clear MCG_STATUS here because it's only defined for
255 * exceptions.
256 */
257}
258
259/*
260 * The actual machine check handler. This only handles real
261 * exceptions when something got corrupted coming in through int 18.
262 *
263 * This is executed in NMI context not subject to normal locking rules. This
264 * implies that most kernel services cannot be safely used. Don't even
265 * think about putting a printk in there!
266 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	/* Banks whose MCi_STATUS we must clear before returning. */
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	/* Give NMI-chain users (e.g. kdump) a chance to claim the event. */
	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
		goto out2;
	if (!banks)
		goto out2;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	/* Scan every enabled bank for a valid, uncorrected error. */
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/* Pick up the auxiliary registers the bank advertises. */
		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				&panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}
423
424#ifdef CONFIG_X86_MCE_INTEL
425/***
426 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
427 * @cpu: The CPU on which the event occurred.
428 * @status: Event status information
429 *
430 * This function should be called by the thermal interrupt after the
431 * event has been processed and the decision was made to log the event
432 * further.
433 *
434 * The status parameter will be saved to the 'status' field of 'struct mce'
435 * and historically has been the register value of the
436 * MSR_IA32_THERMAL_STATUS (Intel) msr.
437 */
438void mce_log_therm_throt_event(__u64 status)
439{
440 struct mce m;
441
442 mce_setup(&m);
443 m.bank = MCE_THERMAL_BANK;
444 m.status = status;
445 mce_log(&m);
446}
447#endif /* CONFIG_X86_MCE_INTEL */
448
449/*
450 * Periodic polling timer for "silent" machine check errors. If the
451 * poller finds an MCE, poll 2x faster. When the poller finds no more
452 * errors, poll 2x slower (up to check_interval seconds).
453 */
454
455static int check_interval = 5 * 60; /* 5 minutes */
456static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
457static void mcheck_timer(unsigned long);
458static DEFINE_PER_CPU(struct timer_list, mce_timer);
459
460static void mcheck_timer(unsigned long data)
461{
462 struct timer_list *t = &per_cpu(mce_timer, data);
463 int *n;
464
465 WARN_ON(smp_processor_id() != data);
466
467 if (mce_available(&current_cpu_data))
468 machine_check_poll(MCP_TIMESTAMP,
469 &__get_cpu_var(mce_poll_banks));
470
471 /*
472 * Alert userspace if needed. If we logged an MCE, reduce the
473 * polling interval, otherwise increase the polling interval.
474 */
475 n = &__get_cpu_var(next_interval);
476 if (mce_notify_user()) {
477 *n = max(*n/2, HZ/100);
478 } else {
479 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
480 }
481
482 t->expires = jiffies + *n;
483 add_timer(t);
484}
485
/* Run the userspace trigger program; fired from the workqueue, never NMI. */
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}
490
491static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
492
493/*
494 * Notify the user(s) about new machine check events.
495 * Can be called from interrupt context, but not from machine check/NMI
496 * context.
497 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	/* notify_user is set by mce_log(); consume it exactly once */
	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		/* Returns 1 so the poll timer speeds up */
		return 1;
	}
	return 0;
}
522
523/* see if the idle task needs to notify userspace */
524static int
525mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
526{
527 /* IDLE_END should be safe - interrupts are back on */
528 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
529 mce_notify_user();
530
531 return NOTIFY_OK;
532}
533
/* Hook into the idle loop so pending MCE notifications get delivered. */
static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

/* Late-boot registration of the idle notifier above. */
static __init int periodic_mcheck_init(void)
{
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);
544
545/*
546 * Initialize Machine Checks for a CPU.
547 */
/*
 * Read MCG_CAP and size the global bank array accordingly.
 * Returns 0 on success, -ENOMEM if the bank control array cannot
 * be allocated.
 */
static int mce_cap_init(void)
{
	u64 cap;
	unsigned b;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	/* Low byte of MCG_CAP is the bank count */
	b = cap & 0xff;
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		/* Default every bank control to all-ones (fully enabled) */
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	/* NOTE(review): bit 9 appears to be MCG_EXT_P with ext count in
	   bits 16-23 — confirm against the SDM before renaming. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}
578
/* Program the MCE MSRs on the calling CPU and flush leftover events. */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;
	mce_banks_t all_banks;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	/* Enable machine-check exceptions in CR4 */
	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	/* Apply the per-bank enable masks and clear stale status */
	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
602
603/* Add per CPU specific workarounds here */
604static void mce_cpu_quirks(struct cpuinfo_x86 *c)
605{
606 /* This should be disabled by the BIOS, but isn't always */
607 if (c->x86_vendor == X86_VENDOR_AMD) {
608 if (c->x86 == 15 && banks > 4)
609 /* disable GART TBL walk error reporting, which trips off
610 incorrectly with the IOMMU & 3ware & Cerberus. */
611 clear_bit(10, (unsigned long *)&bank[4]);
612 if(c->x86 <= 17 && mce_bootlog < 0)
613 /* Lots of broken BIOS around that don't clear them
614 by default and leave crap in there. Don't log. */
615 mce_bootlog = 0;
616 }
617
618}
619
620static void mce_cpu_features(struct cpuinfo_x86 *c)
621{
622 switch (c->x86_vendor) {
623 case X86_VENDOR_INTEL:
624 mce_intel_feature_init(c);
625 break;
626 case X86_VENDOR_AMD:
627 mce_amd_feature_init(c);
628 break;
629 default:
630 break;
631 }
632}
633
/* Arm this CPU's polling timer; a zero check_interval disables polling. */
static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	/* Timer data is the CPU number, checked in mcheck_timer() */
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}
646
647/*
648 * Called for each booted CPU to set up machine checks.
649 * Must be called with preempt off.
650 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	/* Bail out if this CPU has no MCE support at all */
	if (!mce_available(c))
		return;

	/* Size the bank array; on allocation failure disable MCE globally */
	if (mce_cap_init() < 0) {
		mce_dont_init = 1;
		return;
	}
	mce_cpu_quirks(c);

	mce_init(NULL);
	mce_cpu_features(c);
	mce_init_timer();
}
666
667/*
668 * Character device to read and clear the MCE log.
669 */
670
671static DEFINE_SPINLOCK(mce_state_lock);
672static int open_count; /* #times opened */
673static int open_exclu; /* already open exclusive? */
674
/*
 * Open /dev/mcelog.  O_EXCL gives the caller exclusive access; any
 * later open (exclusive or not) then fails with -EBUSY.
 */
static int mce_open(struct inode *inode, struct file *file)
{
	/* BKL kept for legacy open() serialisation */
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	/* The log device does not support seeking */
	return nonseekable_open(inode, file);
}
695
696static int mce_release(struct inode *inode, struct file *file)
697{
698 spin_lock(&mce_state_lock);
699
700 open_count--;
701 open_exclu = 0;
702
703 spin_unlock(&mce_state_lock);
704
705 return 0;
706}
707
/*
 * IPI helper for mce_read(): record the local TSC into the caller's
 * per-CPU array so stale log entries can be aged out.
 * NOTE(review): the value is stored as unsigned long, which truncates
 * the 64-bit TSC on 32-bit kernels — confirm whether that matters for
 * the comparison in mce_read().
 */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}
714
/*
 * Read (and consume) the MCE log.  Only whole-buffer reads are
 * supported: userspace must offer room for MCE_LOG_LEN records.
 * Returns the number of bytes copied, -EINVAL for short reads,
 * -EFAULT on copy failure, -ENOMEM on allocation failure.
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	/* Drain entries while writers may still be appending */
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			/* Writer sets 'finished' last; wait briefly for it */
			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					/* Give up on a stuck entry; zap it */
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		/* Atomically reset the index; loop if more arrived meanwhile */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
787
788static unsigned int mce_poll(struct file *file, poll_table *wait)
789{
790 poll_wait(file, &mce_wait, wait);
791 if (rcu_dereference(mcelog.next))
792 return POLLIN | POLLRDNORM;
793 return 0;
794}
795
/*
 * ioctl interface for mcelog: query record/log sizes and atomically
 * fetch-and-clear the log flags.  Requires CAP_SYS_ADMIN.
 */
static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		/* cmpxchg loop: snapshot and clear without losing new flags */
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
819
/* File operations for the /dev/mcelog character device. */
static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};
827
828static struct miscdevice mce_log_device = {
829 MISC_MCELOG_MINOR,
830 "mcelog",
831 &mce_chrdev_ops,
832};
833
834/*
835 * Old style boot options parsing. Only for compatibility.
836 */
/* "nomce" boot parameter: disable machine checks entirely. */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}
842
843/* mce=off disables machine check.
844 mce=TOLERANCELEVEL (number, see above)
845 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
846 mce=nobootlog Don't log MCEs from before booting. */
847static int __init mcheck_enable(char *str)
848{
849 if (!strcmp(str, "off"))
850 mce_dont_init = 1;
851 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
852 mce_bootlog = str[0] == 'b';
853 else if (isdigit(str[0]))
854 get_option(&str, &tolerant);
855 else
856 printk("mce= argument %s ignored. Please use /sys", str);
857 return 1;
858}
859
860__setup("nomce", mcheck_disable);
861__setup("mce=", mcheck_enable);
862
863/*
864 * Sysfs support
865 */
866
867/*
868 * Disable machine checks on suspend and shutdown. We can't really handle
869 * them later.
870 */
871static int mce_disable(void)
872{
873 int i;
874
875 for (i = 0; i < banks; i++)
876 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
877 return 0;
878}
879
/* Suspend hook: machine checks cannot be handled across suspend. */
static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}
884
/* Shutdown hook: quiesce machine checks before power-off/reboot. */
static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}
889
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}
899
/* Per-CPU part of mce_restart(): reprogram MSRs and rearm the timer. */
static void mce_cpu_restart(void *data)
{
	/* Stop the poll timer before reinitialising */
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}
907
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	/* Run mce_cpu_restart() on every online CPU, including this one */
	on_each_cpu(mce_cpu_restart, NULL, 1);
}
913
/* sysdev class backing /sys/devices/system/machinecheck */
static struct sysdev_class mce_sysclass = {
	.suspend = mce_suspend,
	.shutdown = mce_shutdown,
	.resume = mce_resume,
	.name = "machinecheck",
};
920
921DEFINE_PER_CPU(struct sys_device, device_mce);
922void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
923
/*
 * Generate a sysfs show/store pair plus attribute for a simple
 * integral variable.  'start' is a statement executed after a
 * successful store (e.g. mce_restart()).
 * Why are there no generic functions for this?
 */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s,		\
				     struct sysdev_attribute *attr,	\
				     char *buf) {			\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    struct sysdev_attribute *attr,	\
				    const char *buf, size_t siz) {	\
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
		if (end == buf) return -EINVAL;				\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
942
943static struct sysdev_attribute *bank_attrs;
944
945static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
946 char *buf)
947{
948 u64 b = bank[attr - bank_attrs];
949 return sprintf(buf, "%llx\n", b);
950}
951
952static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
953 const char *buf, size_t siz)
954{
955 char *end;
956 u64 new = simple_strtoull(buf, &end, 0);
957 if (end == buf)
958 return -EINVAL;
959 bank[attr - bank_attrs] = new;
960 mce_restart();
961 return end-buf;
962}
963
964static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
965 char *buf)
966{
967 strcpy(buf, trigger);
968 strcat(buf, "\n");
969 return strlen(trigger) + 1;
970}
971
972static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
973 const char *buf,size_t siz)
974{
975 char *p;
976 int len;
977 strncpy(trigger, buf, sizeof(trigger));
978 trigger[sizeof(trigger)-1] = 0;
979 len = strlen(trigger);
980 p = strchr(trigger, '\n');
981 if (*p) *p = 0;
982 return len;
983}
984
/* Per-device sysfs attributes common to every CPU's machinecheck node. */
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval,check_interval,mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};
992
993static cpumask_var_t mce_device_initialized;
994
995/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
996static __cpuinit int mce_create_device(unsigned int cpu)
997{
998 int err;
999 int i;
1000
1001 if (!mce_available(&boot_cpu_data))
1002 return -EIO;
1003
1004 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
1005 per_cpu(device_mce,cpu).id = cpu;
1006 per_cpu(device_mce,cpu).cls = &mce_sysclass;
1007
1008 err = sysdev_register(&per_cpu(device_mce,cpu));
1009 if (err)
1010 return err;
1011
1012 for (i = 0; mce_attributes[i]; i++) {
1013 err = sysdev_create_file(&per_cpu(device_mce,cpu),
1014 mce_attributes[i]);
1015 if (err)
1016 goto error;
1017 }
1018 for (i = 0; i < banks; i++) {
1019 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1020 &bank_attrs[i]);
1021 if (err)
1022 goto error2;
1023 }
1024 cpumask_set_cpu(cpu, mce_device_initialized);
1025
1026 return 0;
1027error2:
1028 while (--i >= 0) {
1029 sysdev_remove_file(&per_cpu(device_mce, cpu),
1030 &bank_attrs[i]);
1031 }
1032error:
1033 while (--i >= 0) {
1034 sysdev_remove_file(&per_cpu(device_mce,cpu),
1035 mce_attributes[i]);
1036 }
1037 sysdev_unregister(&per_cpu(device_mce,cpu));
1038
1039 return err;
1040}
1041
/* Tear down the sysfs device created by mce_create_device(). */
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	/* Nothing to do if creation never succeeded for this CPU */
	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce,cpu),
			mce_attributes[i]);
	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
			&bank_attrs[i]);
	sysdev_unregister(&per_cpu(device_mce,cpu));
	cpumask_clear_cpu(cpu, mce_device_initialized);
}
1058
/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	int i;
	unsigned long action = *(unsigned long *)h;

	if (!mce_available(&current_cpu_data))
		return;
	/* Keep CMCI ownership through suspend/resume (frozen) transitions */
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}
1072
/* Re-enable machine checks on a CPU whose offlining was aborted. */
static void mce_reenable_cpu(void *h)
{
	int i;
	unsigned long action = *(unsigned long *)h;

	if (!mce_available(&current_cpu_data))
		return;
	/* Mirror of mce_disable_cpu(): reclaim CMCI unless frozen */
	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}
1085
1086/* Get notified when a cpu comes on/off. Be hotplug friendly. */
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		/* Create the sysfs node, then let the threshold code hook in */
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		/* Reverse order of CPU_ONLINE */
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		/* Stop polling and silence the banks before the CPU goes */
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		/* Offlining aborted: rearm the timer and re-enable banks */
		t->expires = round_jiffies(jiffies +
						__get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}
1125
1126static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1127 .notifier_call = mce_cpu_callback,
1128};
1129
1130static __init int mce_init_banks(void)
1131{
1132 int i;
1133
1134 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1135 GFP_KERNEL);
1136 if (!bank_attrs)
1137 return -ENOMEM;
1138
1139 for (i = 0; i < banks; i++) {
1140 struct sysdev_attribute *a = &bank_attrs[i];
1141 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1142 if (!a->attr.name)
1143 goto nomem;
1144 a->attr.mode = 0644;
1145 a->show = show_bank;
1146 a->store = set_bank;
1147 }
1148 return 0;
1149
1150nomem:
1151 while (--i >= 0)
1152 kfree(bank_attrs[i].attr.name);
1153 kfree(bank_attrs);
1154 bank_attrs = NULL;
1155 return -ENOMEM;
1156}
1157
1158static __init int mce_init_device(void)
1159{
1160 int err;
1161 int i = 0;
1162
1163 if (!mce_available(&boot_cpu_data))
1164 return -EIO;
1165
1166 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
1167
1168 err = mce_init_banks();
1169 if (err)
1170 return err;
1171
1172 err = sysdev_class_register(&mce_sysclass);
1173 if (err)
1174 return err;
1175
1176 for_each_online_cpu(i) {
1177 err = mce_create_device(i);
1178 if (err)
1179 return err;
1180 }
1181
1182 register_hotcpu_notifier(&mce_cpu_notifier);
1183 misc_register(&mce_log_device);
1184 return err;
1185}
1186
1187device_initcall(mce_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 56dde9c4bc96..ddae21620bda 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -13,22 +13,22 @@
13 * 13 *
14 * All MC4_MISCi registers are shared between multi-cores 14 * All MC4_MISCi registers are shared between multi-cores
15 */ 15 */
16
17#include <linux/cpu.h>
18#include <linux/errno.h>
19#include <linux/init.h>
20#include <linux/interrupt.h> 16#include <linux/interrupt.h>
21#include <linux/kobject.h>
22#include <linux/notifier.h> 17#include <linux/notifier.h>
23#include <linux/sched.h> 18#include <linux/kobject.h>
24#include <linux/smp.h> 19#include <linux/percpu.h>
25#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/errno.h>
22#include <linux/sched.h>
26#include <linux/sysfs.h> 23#include <linux/sysfs.h>
24#include <linux/init.h>
25#include <linux/cpu.h>
26#include <linux/smp.h>
27
27#include <asm/apic.h> 28#include <asm/apic.h>
29#include <asm/idle.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
29#include <asm/msr.h> 31#include <asm/msr.h>
30#include <asm/percpu.h>
31#include <asm/idle.h>
32 32
33#define PFX "mce_threshold: " 33#define PFX "mce_threshold: "
34#define VERSION "version 1.1.1" 34#define VERSION "version 1.1.1"
@@ -48,26 +48,26 @@
48#define MCG_XBLK_ADDR 0xC0000400 48#define MCG_XBLK_ADDR 0xC0000400
49 49
50struct threshold_block { 50struct threshold_block {
51 unsigned int block; 51 unsigned int block;
52 unsigned int bank; 52 unsigned int bank;
53 unsigned int cpu; 53 unsigned int cpu;
54 u32 address; 54 u32 address;
55 u16 interrupt_enable; 55 u16 interrupt_enable;
56 u16 threshold_limit; 56 u16 threshold_limit;
57 struct kobject kobj; 57 struct kobject kobj;
58 struct list_head miscj; 58 struct list_head miscj;
59}; 59};
60 60
61/* defaults used early on boot */ 61/* defaults used early on boot */
62static struct threshold_block threshold_defaults = { 62static struct threshold_block threshold_defaults = {
63 .interrupt_enable = 0, 63 .interrupt_enable = 0,
64 .threshold_limit = THRESHOLD_MAX, 64 .threshold_limit = THRESHOLD_MAX,
65}; 65};
66 66
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject *kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73 73
@@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void);
86 */ 86 */
87 87
88struct thresh_restart { 88struct thresh_restart {
89 struct threshold_block *b; 89 struct threshold_block *b;
90 int reset; 90 int reset;
91 u16 old_limit; 91 u16 old_limit;
92}; 92};
93 93
94/* must be called with correct cpu affinity */ 94/* must be called with correct cpu affinity */
@@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr)
110 } else if (tr->old_limit) { /* change limit w/o reset */ 110 } else if (tr->old_limit) { /* change limit w/o reset */
111 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 111 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
112 (tr->old_limit - tr->b->threshold_limit); 112 (tr->old_limit - tr->b->threshold_limit);
113
113 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 114 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
114 (new_count & THRESHOLD_MAX); 115 (new_count & THRESHOLD_MAX);
115 } 116 }
@@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr)
125/* cpu init entry point, called from mce.c with preempt off */ 126/* cpu init entry point, called from mce.c with preempt off */
126void mce_amd_feature_init(struct cpuinfo_x86 *c) 127void mce_amd_feature_init(struct cpuinfo_x86 *c)
127{ 128{
128 unsigned int bank, block;
129 unsigned int cpu = smp_processor_id(); 129 unsigned int cpu = smp_processor_id();
130 u8 lvt_off;
131 u32 low = 0, high = 0, address = 0; 130 u32 low = 0, high = 0, address = 0;
131 unsigned int bank, block;
132 struct thresh_restart tr; 132 struct thresh_restart tr;
133 u8 lvt_off;
133 134
134 for (bank = 0; bank < NR_BANKS; ++bank) { 135 for (bank = 0; bank < NR_BANKS; ++bank) {
135 for (block = 0; block < NR_BLOCKS; ++block) { 136 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
140 if (!address) 141 if (!address)
141 break; 142 break;
142 address += MCG_XBLK_ADDR; 143 address += MCG_XBLK_ADDR;
143 } 144 } else
144 else
145 ++address; 145 ++address;
146 146
147 if (rdmsr_safe(address, &low, &high)) 147 if (rdmsr_safe(address, &low, &high))
@@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
193 */ 193 */
194static void amd_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
195{ 195{
196 u32 low = 0, high = 0, address = 0;
196 unsigned int bank, block; 197 unsigned int bank, block;
197 struct mce m; 198 struct mce m;
198 u32 low = 0, high = 0, address = 0;
199 199
200 mce_setup(&m); 200 mce_setup(&m);
201 201
@@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void)
204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) 204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
205 continue; 205 continue;
206 for (block = 0; block < NR_BLOCKS; ++block) { 206 for (block = 0; block < NR_BLOCKS; ++block) {
207 if (block == 0) 207 if (block == 0) {
208 address = MSR_IA32_MC0_MISC + bank * 4; 208 address = MSR_IA32_MC0_MISC + bank * 4;
209 else if (block == 1) { 209 } else if (block == 1) {
210 address = (low & MASK_BLKPTR_LO) >> 21; 210 address = (low & MASK_BLKPTR_LO) >> 21;
211 if (!address) 211 if (!address)
212 break; 212 break;
213 address += MCG_XBLK_ADDR; 213 address += MCG_XBLK_ADDR;
214 } 214 } else {
215 else
216 ++address; 215 ++address;
216 }
217 217
218 if (rdmsr_safe(address, &low, &high)) 218 if (rdmsr_safe(address, &low, &high))
219 break; 219 break;
@@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void)
229 (high & MASK_LOCKED_HI)) 229 (high & MASK_LOCKED_HI))
230 continue; 230 continue;
231 231
232 /* Log the machine check that caused the threshold 232 /*
233 event. */ 233 * Log the machine check that caused the threshold
234 * event.
235 */
234 machine_check_poll(MCP_TIMESTAMP, 236 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks)); 237 &__get_cpu_var(mce_poll_banks));
236 238
@@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void)
254 256
255struct threshold_attr { 257struct threshold_attr {
256 struct attribute attr; 258 struct attribute attr;
257 ssize_t(*show) (struct threshold_block *, char *); 259 ssize_t (*show) (struct threshold_block *, char *);
258 ssize_t(*store) (struct threshold_block *, const char *, size_t count); 260 ssize_t (*store) (struct threshold_block *, const char *, size_t count);
259}; 261};
260 262
261#define SHOW_FIELDS(name) \ 263#define SHOW_FIELDS(name) \
262static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ 264static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
263{ \ 265{ \
264 return sprintf(buf, "%lx\n", (unsigned long) b->name); \ 266 return sprintf(buf, "%lx\n", (unsigned long) b->name); \
265} 267}
266SHOW_FIELDS(interrupt_enable) 268SHOW_FIELDS(interrupt_enable)
267SHOW_FIELDS(threshold_limit) 269SHOW_FIELDS(threshold_limit)
268 270
269static ssize_t store_interrupt_enable(struct threshold_block *b, 271static ssize_t
270 const char *buf, size_t count) 272store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
271{ 273{
272 char *end;
273 struct thresh_restart tr; 274 struct thresh_restart tr;
274 unsigned long new = simple_strtoul(buf, &end, 0); 275 unsigned long new;
275 if (end == buf) 276
277 if (strict_strtoul(buf, 0, &new) < 0)
276 return -EINVAL; 278 return -EINVAL;
279
277 b->interrupt_enable = !!new; 280 b->interrupt_enable = !!new;
278 281
279 tr.b = b; 282 tr.b = b;
280 tr.reset = 0; 283 tr.reset = 0;
281 tr.old_limit = 0; 284 tr.old_limit = 0;
285
282 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 286 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
283 287
284 return end - buf; 288 return size;
285} 289}
286 290
287static ssize_t store_threshold_limit(struct threshold_block *b, 291static ssize_t
288 const char *buf, size_t count) 292store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
289{ 293{
290 char *end;
291 struct thresh_restart tr; 294 struct thresh_restart tr;
292 unsigned long new = simple_strtoul(buf, &end, 0); 295 unsigned long new;
293 if (end == buf) 296
297 if (strict_strtoul(buf, 0, &new) < 0)
294 return -EINVAL; 298 return -EINVAL;
299
295 if (new > THRESHOLD_MAX) 300 if (new > THRESHOLD_MAX)
296 new = THRESHOLD_MAX; 301 new = THRESHOLD_MAX;
297 if (new < 1) 302 if (new < 1)
298 new = 1; 303 new = 1;
304
299 tr.old_limit = b->threshold_limit; 305 tr.old_limit = b->threshold_limit;
300 b->threshold_limit = new; 306 b->threshold_limit = new;
301 tr.b = b; 307 tr.b = b;
@@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
303 309
304 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 310 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
305 311
306 return end - buf; 312 return size;
307} 313}
308 314
309struct threshold_block_cross_cpu { 315struct threshold_block_cross_cpu {
310 struct threshold_block *tb; 316 struct threshold_block *tb;
311 long retval; 317 long retval;
312}; 318};
313 319
314static void local_error_count_handler(void *_tbcc) 320static void local_error_count_handler(void *_tbcc)
@@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b,
338 return 1; 344 return 1;
339} 345}
340 346
341#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ 347#define RW_ATTR(val) \
342 .attr = {.name = __stringify(_name), .mode = _mode }, \ 348static struct threshold_attr val = { \
343 .show = _show, \ 349 .attr = {.name = __stringify(val), .mode = 0644 }, \
344 .store = _store, \ 350 .show = show_## val, \
351 .store = store_## val, \
345}; 352};
346 353
347#define RW_ATTR(name) \
348static struct threshold_attr name = \
349 THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
350
351RW_ATTR(interrupt_enable); 354RW_ATTR(interrupt_enable);
352RW_ATTR(threshold_limit); 355RW_ATTR(threshold_limit);
353RW_ATTR(error_count); 356RW_ATTR(error_count);
@@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = {
359 NULL 362 NULL
360}; 363};
361 364
362#define to_block(k) container_of(k, struct threshold_block, kobj) 365#define to_block(k) container_of(k, struct threshold_block, kobj)
363#define to_attr(a) container_of(a, struct threshold_attr, attr) 366#define to_attr(a) container_of(a, struct threshold_attr, attr)
364 367
365static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) 368static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
366{ 369{
367 struct threshold_block *b = to_block(kobj); 370 struct threshold_block *b = to_block(kobj);
368 struct threshold_attr *a = to_attr(attr); 371 struct threshold_attr *a = to_attr(attr);
369 ssize_t ret; 372 ssize_t ret;
373
370 ret = a->show ? a->show(b, buf) : -EIO; 374 ret = a->show ? a->show(b, buf) : -EIO;
375
371 return ret; 376 return ret;
372} 377}
373 378
@@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
377 struct threshold_block *b = to_block(kobj); 382 struct threshold_block *b = to_block(kobj);
378 struct threshold_attr *a = to_attr(attr); 383 struct threshold_attr *a = to_attr(attr);
379 ssize_t ret; 384 ssize_t ret;
385
380 ret = a->store ? a->store(b, buf, count) : -EIO; 386 ret = a->store ? a->store(b, buf, count) : -EIO;
387
381 return ret; 388 return ret;
382} 389}
383 390
384static struct sysfs_ops threshold_ops = { 391static struct sysfs_ops threshold_ops = {
385 .show = show, 392 .show = show,
386 .store = store, 393 .store = store,
387}; 394};
388 395
389static struct kobj_type threshold_ktype = { 396static struct kobj_type threshold_ktype = {
390 .sysfs_ops = &threshold_ops, 397 .sysfs_ops = &threshold_ops,
391 .default_attrs = default_attrs, 398 .default_attrs = default_attrs,
392}; 399};
393 400
394static __cpuinit int allocate_threshold_blocks(unsigned int cpu, 401static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
@@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
396 unsigned int block, 403 unsigned int block,
397 u32 address) 404 u32 address)
398{ 405{
399 int err;
400 u32 low, high;
401 struct threshold_block *b = NULL; 406 struct threshold_block *b = NULL;
407 u32 low, high;
408 int err;
402 409
403 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) 410 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
404 return 0; 411 return 0;
@@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
421 if (!b) 428 if (!b)
422 return -ENOMEM; 429 return -ENOMEM;
423 430
424 b->block = block; 431 b->block = block;
425 b->bank = bank; 432 b->bank = bank;
426 b->cpu = cpu; 433 b->cpu = cpu;
427 b->address = address; 434 b->address = address;
428 b->interrupt_enable = 0; 435 b->interrupt_enable = 0;
429 b->threshold_limit = THRESHOLD_MAX; 436 b->threshold_limit = THRESHOLD_MAX;
430 437
431 INIT_LIST_HEAD(&b->miscj); 438 INIT_LIST_HEAD(&b->miscj);
432 439
433 if (per_cpu(threshold_banks, cpu)[bank]->blocks) 440 if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
434 list_add(&b->miscj, 441 list_add(&b->miscj,
435 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); 442 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
436 else 443 } else {
437 per_cpu(threshold_banks, cpu)[bank]->blocks = b; 444 per_cpu(threshold_banks, cpu)[bank]->blocks = b;
445 }
438 446
439 err = kobject_init_and_add(&b->kobj, &threshold_ktype, 447 err = kobject_init_and_add(&b->kobj, &threshold_ktype,
440 per_cpu(threshold_banks, cpu)[bank]->kobj, 448 per_cpu(threshold_banks, cpu)[bank]->kobj,
@@ -447,8 +455,9 @@ recurse:
447 if (!address) 455 if (!address)
448 return 0; 456 return 0;
449 address += MCG_XBLK_ADDR; 457 address += MCG_XBLK_ADDR;
450 } else 458 } else {
451 ++address; 459 ++address;
460 }
452 461
453 err = allocate_threshold_blocks(cpu, bank, ++block, address); 462 err = allocate_threshold_blocks(cpu, bank, ++block, address);
454 if (err) 463 if (err)
@@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
500 if (!b) 509 if (!b)
501 goto out; 510 goto out;
502 511
503 err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, 512 err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
504 b->kobj, name); 513 b->kobj, name);
505 if (err) 514 if (err)
506 goto out; 515 goto out;
507 516
508 cpumask_copy(b->cpus, cpu_core_mask(cpu)); 517 cpumask_copy(b->cpus, cpu_core_mask(cpu));
509 per_cpu(threshold_banks, cpu)[bank] = b; 518 per_cpu(threshold_banks, cpu)[bank] = b;
519
510 goto out; 520 goto out;
511 } 521 }
512#endif 522#endif
@@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
522 goto out; 532 goto out;
523 } 533 }
524 534
525 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); 535 b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
526 if (!b->kobj) 536 if (!b->kobj)
527 goto out_free; 537 goto out_free;
528 538
@@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
542 if (i == cpu) 552 if (i == cpu)
543 continue; 553 continue;
544 554
545 err = sysfs_create_link(&per_cpu(device_mce, i).kobj, 555 err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
546 b->kobj, name); 556 b->kobj, name);
547 if (err) 557 if (err)
548 goto out; 558 goto out;
@@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu,
605 615
606static void threshold_remove_bank(unsigned int cpu, int bank) 616static void threshold_remove_bank(unsigned int cpu, int bank)
607{ 617{
608 int i = 0;
609 struct threshold_bank *b; 618 struct threshold_bank *b;
610 char name[32]; 619 char name[32];
620 int i = 0;
611 621
612 b = per_cpu(threshold_banks, cpu)[bank]; 622 b = per_cpu(threshold_banks, cpu)[bank];
613
614 if (!b) 623 if (!b)
615 return; 624 return;
616
617 if (!b->blocks) 625 if (!b->blocks)
618 goto free_out; 626 goto free_out;
619 627
@@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
622#ifdef CONFIG_SMP 630#ifdef CONFIG_SMP
623 /* sibling symlink */ 631 /* sibling symlink */
624 if (shared_bank[bank] && b->blocks->cpu != cpu) { 632 if (shared_bank[bank] && b->blocks->cpu != cpu) {
625 sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); 633 sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
626 per_cpu(threshold_banks, cpu)[bank] = NULL; 634 per_cpu(threshold_banks, cpu)[bank] = NULL;
635
627 return; 636 return;
628 } 637 }
629#endif 638#endif
@@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
633 if (i == cpu) 642 if (i == cpu)
634 continue; 643 continue;
635 644
636 sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); 645 sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
637 per_cpu(threshold_banks, i)[bank] = NULL; 646 per_cpu(threshold_banks, i)[bank] = NULL;
638 } 647 }
639 648
@@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu)
659} 668}
660 669
661/* get notified when a cpu comes on/off */ 670/* get notified when a cpu comes on/off */
662static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, 671static void __cpuinit
663 unsigned int cpu) 672amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
664{ 673{
665 if (cpu >= NR_CPUS)
666 return;
667
668 switch (action) { 674 switch (action) {
669 case CPU_ONLINE: 675 case CPU_ONLINE:
670 case CPU_ONLINE_FROZEN: 676 case CPU_ONLINE_FROZEN:
@@ -686,11 +692,12 @@ static __init int threshold_init_device(void)
686 /* to hit CPUs online before the notifier is up */ 692 /* to hit CPUs online before the notifier is up */
687 for_each_online_cpu(lcpu) { 693 for_each_online_cpu(lcpu) {
688 int err = threshold_create_device(lcpu); 694 int err = threshold_create_device(lcpu);
695
689 if (err) 696 if (err)
690 return err; 697 return err;
691 } 698 }
692 threshold_cpu_callback = amd_64_threshold_cpu_callback; 699 threshold_cpu_callback = amd_64_threshold_cpu_callback;
700
693 return 0; 701 return 0;
694} 702}
695
696device_initcall(threshold_init_device); 703device_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index cef3ee30744b..e1acec0f7a32 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -8,85 +8,10 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/interrupt.h> 9#include <linux/interrupt.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <asm/processor.h>
12#include <asm/apic.h> 11#include <asm/apic.h>
12#include <asm/processor.h>
13#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/mce.h> 14#include <asm/mce.h>
15#include <asm/hw_irq.h>
16#include <asm/idle.h>
17#include <asm/therm_throt.h>
18#include <asm/apic.h>
19
20asmlinkage void smp_thermal_interrupt(void)
21{
22 __u64 msr_val;
23
24 ack_APIC_irq();
25
26 exit_idle();
27 irq_enter();
28
29 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
30 if (therm_throt_process(msr_val & 1))
31 mce_log_therm_throt_event(msr_val);
32
33 inc_irq_stat(irq_thermal_count);
34 irq_exit();
35}
36
37static void intel_init_thermal(struct cpuinfo_x86 *c)
38{
39 u32 l, h;
40 int tm2 = 0;
41 unsigned int cpu = smp_processor_id();
42
43 if (!cpu_has(c, X86_FEATURE_ACPI))
44 return;
45
46 if (!cpu_has(c, X86_FEATURE_ACC))
47 return;
48
49 /* first check if TM1 is already enabled by the BIOS, in which
50 * case there might be some SMM goo which handles it, so we can't even
51 * put a handler since it might be delivered via SMI already.
52 */
53 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
54 h = apic_read(APIC_LVTTHMR);
55 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
56 printk(KERN_DEBUG
57 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
58 return;
59 }
60
61 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
62 tm2 = 1;
63
64 if (h & APIC_VECTOR_MASK) {
65 printk(KERN_DEBUG
66 "CPU%d: Thermal LVT vector (%#x) already "
67 "installed\n", cpu, (h & APIC_VECTOR_MASK));
68 return;
69 }
70
71 h = THERMAL_APIC_VECTOR;
72 h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
73 apic_write(APIC_LVTTHMR, h);
74
75 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
76 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
77
78 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
79 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
80
81 l = apic_read(APIC_LVTTHMR);
82 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
83 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
84 cpu, tm2 ? "TM2" : "TM1");
85
86 /* enable thermal throttle processing */
87 atomic_set(&therm_throt_en, 1);
88 return;
89}
90 15
91/* 16/*
92 * Support for Intel Correct Machine Check Interrupts. This allows 17 * Support for Intel Correct Machine Check Interrupts. This allows
@@ -109,6 +34,9 @@ static int cmci_supported(int *banks)
109{ 34{
110 u64 cap; 35 u64 cap;
111 36
37 if (mce_cmci_disabled || mce_ignore_ce)
38 return 0;
39
112 /* 40 /*
113 * Vendor check is not strictly needed, but the initial 41 * Vendor check is not strictly needed, but the initial
114 * initialization is vendor keyed and this 42 * initialization is vendor keyed and this
@@ -132,7 +60,7 @@ static int cmci_supported(int *banks)
132static void intel_threshold_interrupt(void) 60static void intel_threshold_interrupt(void)
133{ 61{
134 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); 62 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
135 mce_notify_user(); 63 mce_notify_irq();
136} 64}
137 65
138static void print_update(char *type, int *hdr, int num) 66static void print_update(char *type, int *hdr, int num)
@@ -248,7 +176,7 @@ void cmci_rediscover(int dying)
248 return; 176 return;
249 cpumask_copy(old, &current->cpus_allowed); 177 cpumask_copy(old, &current->cpus_allowed);
250 178
251 for_each_online_cpu (cpu) { 179 for_each_online_cpu(cpu) {
252 if (cpu == dying) 180 if (cpu == dying)
253 continue; 181 continue;
254 if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) 182 if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index a74af128efc9..f5f2d6f71fb6 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -6,25 +6,23 @@
6 * This file contains routines to check for non-fatal MCEs every 15s 6 * This file contains routines to check for non-fatal MCEs every 15s
7 * 7 *
8 */ 8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/jiffies.h>
14#include <linux/workqueue.h>
15#include <linux/interrupt.h> 9#include <linux/interrupt.h>
16#include <linux/smp.h> 10#include <linux/workqueue.h>
11#include <linux/jiffies.h>
12#include <linux/kernel.h>
17#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/init.h>
16#include <linux/smp.h>
18 17
19#include <asm/processor.h> 18#include <asm/processor.h>
20#include <asm/system.h> 19#include <asm/system.h>
20#include <asm/mce.h>
21#include <asm/msr.h> 21#include <asm/msr.h>
22 22
23#include "mce.h" 23static int firstbank;
24 24
25static int firstbank; 25#define MCE_RATE (15*HZ) /* timer rate is 15s */
26
27#define MCE_RATE 15*HZ /* timer rate is 15s */
28 26
29static void mce_checkregs(void *info) 27static void mce_checkregs(void *info)
30{ 28{
@@ -34,23 +32,24 @@ static void mce_checkregs(void *info)
34 for (i = firstbank; i < nr_mce_banks; i++) { 32 for (i = firstbank; i < nr_mce_banks; i++) {
35 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); 33 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
36 34
37 if (high & (1<<31)) { 35 if (!(high & (1<<31)))
38 printk(KERN_INFO "MCE: The hardware reports a non " 36 continue;
39 "fatal, correctable incident occurred on " 37
40 "CPU %d.\n", 38 printk(KERN_INFO "MCE: The hardware reports a non fatal, "
39 "correctable incident occurred on CPU %d.\n",
41 smp_processor_id()); 40 smp_processor_id());
42 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); 41
43 42 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
44 /* 43
45 * Scrub the error so we don't pick it up in MCE_RATE 44 /*
46 * seconds time. 45 * Scrub the error so we don't pick it up in MCE_RATE
47 */ 46 * seconds time:
48 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); 47 */
49 48 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
50 /* Serialize */ 49
51 wmb(); 50 /* Serialize: */
52 add_taint(TAINT_MACHINE_CHECK); 51 wmb();
53 } 52 add_taint(TAINT_MACHINE_CHECK);
54 } 53 }
55} 54}
56 55
@@ -77,16 +76,17 @@ static int __init init_nonfatal_mce_checker(void)
77 76
78 /* Some Athlons misbehave when we frob bank 0 */ 77 /* Some Athlons misbehave when we frob bank 0 */
79 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && 78 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
80 boot_cpu_data.x86 == 6) 79 boot_cpu_data.x86 == 6)
81 firstbank = 1; 80 firstbank = 1;
82 else 81 else
83 firstbank = 0; 82 firstbank = 0;
84 83
85 /* 84 /*
86 * Check for non-fatal errors every MCE_RATE s 85 * Check for non-fatal errors every MCE_RATE s
87 */ 86 */
88 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); 87 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
89 printk(KERN_INFO "Machine check exception polling timer started.\n"); 88 printk(KERN_INFO "Machine check exception polling timer started.\n");
89
90 return 0; 90 return 0;
91} 91}
92module_init(init_nonfatal_mce_checker); 92module_init(init_nonfatal_mce_checker);
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index f53bdcbaf382..4482aea9aa2e 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -1,21 +1,14 @@
1/* 1/*
2 * P4 specific Machine Check Exception Reporting 2 * P4 specific Machine Check Exception Reporting
3 */ 3 */
4
5#include <linux/init.h>
6#include <linux/types.h>
7#include <linux/kernel.h> 4#include <linux/kernel.h>
8#include <linux/interrupt.h> 5#include <linux/types.h>
6#include <linux/init.h>
9#include <linux/smp.h> 7#include <linux/smp.h>
10 8
11#include <asm/processor.h> 9#include <asm/processor.h>
12#include <asm/system.h> 10#include <asm/mce.h>
13#include <asm/msr.h> 11#include <asm/msr.h>
14#include <asm/apic.h>
15
16#include <asm/therm_throt.h>
17
18#include "mce.h"
19 12
20/* as supported by the P4/Xeon family */ 13/* as supported by the P4/Xeon family */
21struct intel_mce_extended_msrs { 14struct intel_mce_extended_msrs {
@@ -34,98 +27,8 @@ struct intel_mce_extended_msrs {
34 27
35static int mce_num_extended_msrs; 28static int mce_num_extended_msrs;
36 29
37
38#ifdef CONFIG_X86_MCE_P4THERMAL
39static void unexpected_thermal_interrupt(struct pt_regs *regs)
40{
41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
42 smp_processor_id());
43 add_taint(TAINT_MACHINE_CHECK);
44}
45
46/* P4/Xeon Thermal transition interrupt handler */
47static void intel_thermal_interrupt(struct pt_regs *regs)
48{
49 __u64 msr_val;
50
51 ack_APIC_irq();
52
53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
54 therm_throt_process(msr_val & 0x1);
55}
56
57/* Thermal interrupt handler for this CPU setup */
58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
59
60void smp_thermal_interrupt(struct pt_regs *regs)
61{
62 irq_enter();
63 vendor_thermal_interrupt(regs);
64 __get_cpu_var(irq_stat).irq_thermal_count++;
65 irq_exit();
66}
67
68/* P4/Xeon Thermal regulation detect and init */
69static void intel_init_thermal(struct cpuinfo_x86 *c)
70{
71 u32 l, h;
72 unsigned int cpu = smp_processor_id();
73
74 /* Thermal monitoring */
75 if (!cpu_has(c, X86_FEATURE_ACPI))
76 return; /* -ENODEV */
77
78 /* Clock modulation */
79 if (!cpu_has(c, X86_FEATURE_ACC))
80 return; /* -ENODEV */
81
82 /* first check if its enabled already, in which case there might
83 * be some SMM goo which handles it, so we can't even put a handler
84 * since it might be delivered via SMI already -zwanem.
85 */
86 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
87 h = apic_read(APIC_LVTTHMR);
88 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
90 cpu);
91 return; /* -EBUSY */
92 }
93
94 /* check whether a vector already exists, temporarily masked? */
95 if (h & APIC_VECTOR_MASK) {
96 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
97 "installed\n",
98 cpu, (h & APIC_VECTOR_MASK));
99 return; /* -EBUSY */
100 }
101
102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write(APIC_LVTTHMR, h);
106
107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
109
110 /* ok we're good to go... */
111 vendor_thermal_interrupt = intel_thermal_interrupt;
112
113 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
114 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
115
116 l = apic_read(APIC_LVTTHMR);
117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119
120 /* enable thermal throttle processing */
121 atomic_set(&therm_throt_en, 1);
122 return;
123}
124#endif /* CONFIG_X86_MCE_P4THERMAL */
125
126
127/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ 30/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
128static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) 31static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
129{ 32{
130 u32 h; 33 u32 h;
131 34
@@ -143,9 +46,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
143 46
144static void intel_machine_check(struct pt_regs *regs, long error_code) 47static void intel_machine_check(struct pt_regs *regs, long error_code)
145{ 48{
146 int recover = 1;
147 u32 alow, ahigh, high, low; 49 u32 alow, ahigh, high, low;
148 u32 mcgstl, mcgsth; 50 u32 mcgstl, mcgsth;
51 int recover = 1;
149 int i; 52 int i;
150 53
151 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 54 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -157,7 +60,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
157 60
158 if (mce_num_extended_msrs > 0) { 61 if (mce_num_extended_msrs > 0) {
159 struct intel_mce_extended_msrs dbg; 62 struct intel_mce_extended_msrs dbg;
63
160 intel_get_extended_msrs(&dbg); 64 intel_get_extended_msrs(&dbg);
65
161 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" 66 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
162 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" 67 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
163 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", 68 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
@@ -171,6 +76,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
171 if (high & (1<<31)) { 76 if (high & (1<<31)) {
172 char misc[20]; 77 char misc[20];
173 char addr[24]; 78 char addr[24];
79
174 misc[0] = addr[0] = '\0'; 80 misc[0] = addr[0] = '\0';
175 if (high & (1<<29)) 81 if (high & (1<<29))
176 recover |= 1; 82 recover |= 1;
@@ -196,6 +102,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
196 panic("Unable to continue"); 102 panic("Unable to continue");
197 103
198 printk(KERN_EMERG "Attempting to continue.\n"); 104 printk(KERN_EMERG "Attempting to continue.\n");
105
199 /* 106 /*
200 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 107 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
201 * recoverable/continuable.This will allow BIOS to look at the MSRs 108 * recoverable/continuable.This will allow BIOS to look at the MSRs
@@ -217,7 +124,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
217 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 124 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
218} 125}
219 126
220
221void intel_p4_mcheck_init(struct cpuinfo_x86 *c) 127void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
222{ 128{
223 u32 l, h; 129 u32 l, h;
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index c9f77ea69edc..5c0e6533d9bc 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -2,52 +2,67 @@
2 * P5 specific Machine Check Exception Reporting 2 * P5 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
13#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15 15
16#include "mce.h" 16/* By default disabled */
17int mce_p5_enabled __read_mostly;
17 18
18/* Machine check handler for Pentium class Intel */ 19/* Machine check handler for Pentium class Intel CPUs: */
19static void pentium_machine_check(struct pt_regs *regs, long error_code) 20static void pentium_machine_check(struct pt_regs *regs, long error_code)
20{ 21{
21 u32 loaddr, hi, lotype; 22 u32 loaddr, hi, lotype;
23
22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); 24 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
23 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); 25 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
24 printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); 26
25 if (lotype&(1<<5)) 27 printk(KERN_EMERG
26 printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); 28 "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
29 smp_processor_id(), loaddr, lotype);
30
31 if (lotype & (1<<5)) {
32 printk(KERN_EMERG
33 "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
34 smp_processor_id());
35 }
36
27 add_taint(TAINT_MACHINE_CHECK); 37 add_taint(TAINT_MACHINE_CHECK);
28} 38}
29 39
30/* Set up machine check reporting for processors with Intel style MCE */ 40/* Set up machine check reporting for processors with Intel style MCE: */
31void intel_p5_mcheck_init(struct cpuinfo_x86 *c) 41void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
32{ 42{
33 u32 l, h; 43 u32 l, h;
34 44
35 /*Check for MCE support */ 45 /* Default P5 to off as its often misconnected: */
36 if (!cpu_has(c, X86_FEATURE_MCE)) 46 if (!mce_p5_enabled)
37 return; 47 return;
38 48
39 /* Default P5 to off as its often misconnected */ 49 /* Check for MCE support: */
40 if (mce_disabled != -1) 50 if (!cpu_has(c, X86_FEATURE_MCE))
41 return; 51 return;
52
42 machine_check_vector = pentium_machine_check; 53 machine_check_vector = pentium_machine_check;
54 /* Make sure the vector pointer is visible before we enable MCEs: */
43 wmb(); 55 wmb();
44 56
45 /* Read registers before enabling */ 57 /* Read registers before enabling: */
46 rdmsr(MSR_IA32_P5_MC_ADDR, l, h); 58 rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
47 rdmsr(MSR_IA32_P5_MC_TYPE, l, h); 59 rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
48 printk(KERN_INFO "Intel old style machine check architecture supported.\n"); 60 printk(KERN_INFO
61 "Intel old style machine check architecture supported.\n");
49 62
50 /* Enable MCE */ 63 /* Enable MCE: */
51 set_in_cr4(X86_CR4_MCE); 64 set_in_cr4(X86_CR4_MCE);
52 printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); 65 printk(KERN_INFO
66 "Intel old style machine check reporting enabled on CPU#%d.\n",
67 smp_processor_id());
53} 68}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 2ac52d7b434b..01e4f8178183 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -2,25 +2,23 @@
2 * P6 specific Machine Check Exception Reporting 2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
13#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15 15
16#include "mce.h"
17
18/* Machine Check Handler For PII/PIII */ 16/* Machine Check Handler For PII/PIII */
19static void intel_machine_check(struct pt_regs *regs, long error_code) 17static void intel_machine_check(struct pt_regs *regs, long error_code)
20{ 18{
21 int recover = 1;
22 u32 alow, ahigh, high, low; 19 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth; 20 u32 mcgstl, mcgsth;
21 int recover = 1;
24 int i; 22 int i;
25 23
26 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 24 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -35,12 +33,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
35 if (high & (1<<31)) { 33 if (high & (1<<31)) {
36 char misc[20]; 34 char misc[20];
37 char addr[24]; 35 char addr[24];
38 misc[0] = addr[0] = '\0'; 36
37 misc[0] = '\0';
38 addr[0] = '\0';
39
39 if (high & (1<<29)) 40 if (high & (1<<29))
40 recover |= 1; 41 recover |= 1;
41 if (high & (1<<25)) 42 if (high & (1<<25))
42 recover |= 2; 43 recover |= 2;
43 high &= ~(1<<31); 44 high &= ~(1<<31);
45
44 if (high & (1<<27)) { 46 if (high & (1<<27)) {
45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); 47 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow); 48 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,6 +51,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 51 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
50 snprintf(addr, 24, " at %08x%08x", ahigh, alow); 52 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
51 } 53 }
54
52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", 55 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr); 56 smp_processor_id(), i, high, low, misc, addr);
54 } 57 }
@@ -63,16 +66,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
63 /* 66 /*
64 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 67 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
65 * recoverable/continuable.This will allow BIOS to look at the MSRs 68 * recoverable/continuable.This will allow BIOS to look at the MSRs
66 * for errors if the OS could not log the error. 69 * for errors if the OS could not log the error:
67 */ 70 */
68 for (i = 0; i < nr_mce_banks; i++) { 71 for (i = 0; i < nr_mce_banks; i++) {
69 unsigned int msr; 72 unsigned int msr;
73
70 msr = MSR_IA32_MC0_STATUS+i*4; 74 msr = MSR_IA32_MC0_STATUS+i*4;
71 rdmsr(msr, low, high); 75 rdmsr(msr, low, high);
72 if (high & (1<<31)) { 76 if (high & (1<<31)) {
73 /* Clear it */ 77 /* Clear it: */
74 wrmsr(msr, 0UL, 0UL); 78 wrmsr(msr, 0UL, 0UL);
75 /* Serialize */ 79 /* Serialize: */
76 wmb(); 80 wmb();
77 add_taint(TAINT_MACHINE_CHECK); 81 add_taint(TAINT_MACHINE_CHECK);
78 } 82 }
@@ -81,7 +85,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
81 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 85 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
82} 86}
83 87
84/* Set up machine check reporting for processors with Intel style MCE */ 88/* Set up machine check reporting for processors with Intel style MCE: */
85void intel_p6_mcheck_init(struct cpuinfo_x86 *c) 89void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
86{ 90{
87 u32 l, h; 91 u32 l, h;
@@ -97,6 +101,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
97 101
98 /* Ok machine check is available */ 102 /* Ok machine check is available */
99 machine_check_vector = intel_machine_check; 103 machine_check_vector = intel_machine_check;
104 /* Make sure the vector pointer is visible before we enable MCEs: */
100 wmb(); 105 wmb();
101 106
102 printk(KERN_INFO "Intel machine check architecture supported.\n"); 107 printk(KERN_INFO "Intel machine check architecture supported.\n");
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d5ae2243f0b9..bff8dd191dd5 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -1,7 +1,7 @@
1/* 1/*
2 *
3 * Thermal throttle event support code (such as syslog messaging and rate 2 * Thermal throttle event support code (such as syslog messaging and rate
4 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). 3 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
4 *
5 * This allows consistent reporting of CPU thermal throttle events. 5 * This allows consistent reporting of CPU thermal throttle events.
6 * 6 *
7 * Maintains a counter in /sys that keeps track of the number of thermal 7 * Maintains a counter in /sys that keeps track of the number of thermal
@@ -13,43 +13,53 @@
13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. 13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
14 * Inspired by Ross Biro's and Al Borchers' counter code. 14 * Inspired by Ross Biro's and Al Borchers' counter code.
15 */ 15 */
16 16#include <linux/interrupt.h>
17#include <linux/notifier.h>
18#include <linux/jiffies.h>
19#include <linux/kernel.h>
17#include <linux/percpu.h> 20#include <linux/percpu.h>
18#include <linux/sysdev.h> 21#include <linux/sysdev.h>
22#include <linux/types.h>
23#include <linux/init.h>
24#include <linux/smp.h>
19#include <linux/cpu.h> 25#include <linux/cpu.h>
20#include <asm/cpu.h> 26
21#include <linux/notifier.h> 27#include <asm/processor.h>
22#include <linux/jiffies.h> 28#include <asm/system.h>
23#include <asm/therm_throt.h> 29#include <asm/apic.h>
30#include <asm/idle.h>
31#include <asm/mce.h>
32#include <asm/msr.h>
24 33
25/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
26#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
27 36
28static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; 37static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
29static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); 38static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
30atomic_t therm_throt_en = ATOMIC_INIT(0); 39
40static atomic_t therm_throt_en = ATOMIC_INIT(0);
31 41
32#ifdef CONFIG_SYSFS 42#ifdef CONFIG_SYSFS
33#define define_therm_throt_sysdev_one_ro(_name) \ 43#define define_therm_throt_sysdev_one_ro(_name) \
34 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 44 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
35 45
36#define define_therm_throt_sysdev_show_func(name) \ 46#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 47static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \ 48 struct sysdev_attribute *attr, \
39 char *buf) \ 49 char *buf) \
40{ \ 50{ \
41 unsigned int cpu = dev->id; \ 51 unsigned int cpu = dev->id; \
42 ssize_t ret; \ 52 ssize_t ret; \
43 \ 53 \
44 preempt_disable(); /* CPU hotplug */ \ 54 preempt_disable(); /* CPU hotplug */ \
45 if (cpu_online(cpu)) \ 55 if (cpu_online(cpu)) \
46 ret = sprintf(buf, "%lu\n", \ 56 ret = sprintf(buf, "%lu\n", \
47 per_cpu(thermal_throttle_##name, cpu)); \ 57 per_cpu(thermal_throttle_##name, cpu)); \
48 else \ 58 else \
49 ret = 0; \ 59 ret = 0; \
50 preempt_enable(); \ 60 preempt_enable(); \
51 \ 61 \
52 return ret; \ 62 return ret; \
53} 63}
54 64
55define_therm_throt_sysdev_show_func(count); 65define_therm_throt_sysdev_show_func(count);
@@ -61,8 +71,8 @@ static struct attribute *thermal_throttle_attrs[] = {
61}; 71};
62 72
63static struct attribute_group thermal_throttle_attr_group = { 73static struct attribute_group thermal_throttle_attr_group = {
64 .attrs = thermal_throttle_attrs, 74 .attrs = thermal_throttle_attrs,
65 .name = "thermal_throttle" 75 .name = "thermal_throttle"
66}; 76};
67#endif /* CONFIG_SYSFS */ 77#endif /* CONFIG_SYSFS */
68 78
@@ -82,7 +92,7 @@ static struct attribute_group thermal_throttle_attr_group = {
82 * 1 : Event should be logged further, and a message has been 92 * 1 : Event should be logged further, and a message has been
83 * printed to the syslog. 93 * printed to the syslog.
84 */ 94 */
85int therm_throt_process(int curr) 95static int therm_throt_process(int curr)
86{ 96{
87 unsigned int cpu = smp_processor_id(); 97 unsigned int cpu = smp_processor_id();
88 __u64 tmp_jiffs = get_jiffies_64(); 98 __u64 tmp_jiffs = get_jiffies_64();
@@ -110,10 +120,11 @@ int therm_throt_process(int curr)
110} 120}
111 121
112#ifdef CONFIG_SYSFS 122#ifdef CONFIG_SYSFS
113/* Add/Remove thermal_throttle interface for CPU device */ 123/* Add/Remove thermal_throttle interface for CPU device: */
114static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 124static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
115{ 125{
116 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); 126 return sysfs_create_group(&sys_dev->kobj,
127 &thermal_throttle_attr_group);
117} 128}
118 129
119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 130static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
@@ -121,19 +132,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
121 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 132 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
122} 133}
123 134
124/* Mutex protecting device creation against CPU hotplug */ 135/* Mutex protecting device creation against CPU hotplug: */
125static DEFINE_MUTEX(therm_cpu_lock); 136static DEFINE_MUTEX(therm_cpu_lock);
126 137
127/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 138/* Get notified when a cpu comes on/off. Be hotplug friendly. */
128static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, 139static __cpuinit int
129 unsigned long action, 140thermal_throttle_cpu_callback(struct notifier_block *nfb,
130 void *hcpu) 141 unsigned long action,
142 void *hcpu)
131{ 143{
132 unsigned int cpu = (unsigned long)hcpu; 144 unsigned int cpu = (unsigned long)hcpu;
133 struct sys_device *sys_dev; 145 struct sys_device *sys_dev;
134 int err = 0; 146 int err = 0;
135 147
136 sys_dev = get_cpu_sysdev(cpu); 148 sys_dev = get_cpu_sysdev(cpu);
149
137 switch (action) { 150 switch (action) {
138 case CPU_UP_PREPARE: 151 case CPU_UP_PREPARE:
139 case CPU_UP_PREPARE_FROZEN: 152 case CPU_UP_PREPARE_FROZEN:
@@ -183,6 +196,94 @@ static __init int thermal_throttle_init_device(void)
183 196
184 return 0; 197 return 0;
185} 198}
186
187device_initcall(thermal_throttle_init_device); 199device_initcall(thermal_throttle_init_device);
200
188#endif /* CONFIG_SYSFS */ 201#endif /* CONFIG_SYSFS */
202
203/* Thermal transition interrupt handler */
204static void intel_thermal_interrupt(void)
205{
206 __u64 msr_val;
207
208 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
209 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
210 mce_log_therm_throt_event(msr_val);
211}
212
213static void unexpected_thermal_interrupt(void)
214{
215 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
216 smp_processor_id());
217 add_taint(TAINT_MACHINE_CHECK);
218}
219
220static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
221
222asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
223{
224 exit_idle();
225 irq_enter();
226 inc_irq_stat(irq_thermal_count);
227 smp_thermal_vector();
228 irq_exit();
229 /* Ack only at the end to avoid potential reentry */
230 ack_APIC_irq();
231}
232
233void intel_init_thermal(struct cpuinfo_x86 *c)
234{
235 unsigned int cpu = smp_processor_id();
236 int tm2 = 0;
237 u32 l, h;
238
239 /* Thermal monitoring depends on ACPI and clock modulation*/
240 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
241 return;
242
243 /*
244 * First check if its enabled already, in which case there might
245 * be some SMM goo which handles it, so we can't even put a handler
246 * since it might be delivered via SMI already:
247 */
248 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
249 h = apic_read(APIC_LVTTHMR);
250 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
251 printk(KERN_DEBUG
252 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
253 return;
254 }
255
256 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
257 tm2 = 1;
258
259 /* Check whether a vector already exists */
260 if (h & APIC_VECTOR_MASK) {
261 printk(KERN_DEBUG
262 "CPU%d: Thermal LVT vector (%#x) already installed\n",
263 cpu, (h & APIC_VECTOR_MASK));
264 return;
265 }
266
267 /* We'll mask the thermal vector in the lapic till we're ready: */
268 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
269 apic_write(APIC_LVTTHMR, h);
270
271 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
272 wrmsr(MSR_IA32_THERM_INTERRUPT,
273 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
274
275 smp_thermal_vector = intel_thermal_interrupt;
276
277 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
278 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
279
280 /* Unmask the thermal vector: */
281 l = apic_read(APIC_LVTTHMR);
282 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
283
284 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
285 cpu, tm2 ? "TM2" : "TM1");
286
287 /* enable thermal throttle processing */
288 atomic_set(&therm_throt_en, 1);
289}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 23ee9e730f78..d746df2909c9 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -17,7 +17,7 @@ static void default_threshold_interrupt(void)
17 17
18void (*mce_threshold_vector)(void) = default_threshold_interrupt; 18void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19 19
20asmlinkage void mce_threshold_interrupt(void) 20asmlinkage void smp_threshold_interrupt(void)
21{ 21{
22 exit_idle(); 22 exit_idle();
23 irq_enter(); 23 irq_enter();
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 2a043d89811d..54060f565974 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -2,19 +2,17 @@
2 * IDT Winchip specific Machine Check Exception Reporting 2 * IDT Winchip specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10 9
11#include <asm/processor.h> 10#include <asm/processor.h>
12#include <asm/system.h> 11#include <asm/system.h>
12#include <asm/mce.h>
13#include <asm/msr.h> 13#include <asm/msr.h>
14 14
15#include "mce.h" 15/* Machine check handler for WinChip C6: */
16
17/* Machine check handler for WinChip C6 */
18static void winchip_machine_check(struct pt_regs *regs, long error_code) 16static void winchip_machine_check(struct pt_regs *regs, long error_code)
19{ 17{
20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 18 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
@@ -25,12 +23,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
25void winchip_mcheck_init(struct cpuinfo_x86 *c) 23void winchip_mcheck_init(struct cpuinfo_x86 *c)
26{ 24{
27 u32 lo, hi; 25 u32 lo, hi;
26
28 machine_check_vector = winchip_machine_check; 27 machine_check_vector = winchip_machine_check;
28 /* Make sure the vector pointer is visible before we enable MCEs: */
29 wmb(); 29 wmb();
30
30 rdmsr(MSR_IDT_FCR1, lo, hi); 31 rdmsr(MSR_IDT_FCR1, lo, hi);
31 lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ 32 lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
32 lo &= ~(1<<4); /* Enable MCE */ 33 lo &= ~(1<<4); /* Enable MCE */
33 wrmsr(MSR_IDT_FCR1, lo, hi); 34 wrmsr(MSR_IDT_FCR1, lo, hi);
35
34 set_in_cr4(X86_CR4_MCE); 36 set_in_cr4(X86_CR4_MCE);
35 printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); 37
38 printk(KERN_INFO
39 "Winchip machine check reporting enabled on CPU#0.\n");
36} 40}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ce0fe4b5c04f..1d584a18a50d 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)
808 808
809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) 809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
810 return 0; 810 return 0;
811 rdmsr(MTRRdefType_MSR, def, dummy); 811 rdmsr(MSR_MTRRdefType, def, dummy);
812 def &= 0xff; 812 def &= 0xff;
813 if (def != MTRR_TYPE_UNCACHABLE) 813 if (def != MTRR_TYPE_UNCACHABLE)
814 return 0; 814 return 0;
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1003 */ 1003 */
1004 if (!is_cpu(INTEL) || disable_mtrr_trim) 1004 if (!is_cpu(INTEL) || disable_mtrr_trim)
1005 return 0; 1005 return 0;
1006 rdmsr(MTRRdefType_MSR, def, dummy); 1006 rdmsr(MSR_MTRRdefType, def, dummy);
1007 def &= 0xff; 1007 def &= 0xff;
1008 if (def != MTRR_TYPE_UNCACHABLE) 1008 if (def != MTRR_TYPE_UNCACHABLE)
1009 return 0; 1009 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index d21d4fb161f7..0543f69f0b27 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -20,9 +20,9 @@ struct fixed_range_block {
20}; 20};
21 21
22static struct fixed_range_block fixed_range_blocks[] = { 22static struct fixed_range_block fixed_range_blocks[] = {
23 { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ 23 { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
24 { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ 24 { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
25 { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ 25 { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
26 {} 26 {}
27}; 27};
28 28
@@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs)
194 194
195 k8_check_syscfg_dram_mod_en(); 195 k8_check_syscfg_dram_mod_en();
196 196
197 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); 197 rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
198 198
199 for (i = 0; i < 2; i++) 199 for (i = 0; i < 2; i++)
200 rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); 200 rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
201 for (i = 0; i < 8; i++) 201 for (i = 0; i < 8; i++)
202 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); 202 rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
203} 203}
204 204
205void mtrr_save_fixed_ranges(void *info) 205void mtrr_save_fixed_ranges(void *info)
@@ -310,7 +310,7 @@ void __init get_mtrr_state(void)
310 310
311 vrs = mtrr_state.var_ranges; 311 vrs = mtrr_state.var_ranges;
312 312
313 rdmsr(MTRRcap_MSR, lo, dummy); 313 rdmsr(MSR_MTRRcap, lo, dummy);
314 mtrr_state.have_fixed = (lo >> 8) & 1; 314 mtrr_state.have_fixed = (lo >> 8) & 1;
315 315
316 for (i = 0; i < num_var_ranges; i++) 316 for (i = 0; i < num_var_ranges; i++)
@@ -318,7 +318,7 @@ void __init get_mtrr_state(void)
318 if (mtrr_state.have_fixed) 318 if (mtrr_state.have_fixed)
319 get_fixed_ranges(mtrr_state.fixed_ranges); 319 get_fixed_ranges(mtrr_state.fixed_ranges);
320 320
321 rdmsr(MTRRdefType_MSR, lo, dummy); 321 rdmsr(MSR_MTRRdefType, lo, dummy);
322 mtrr_state.def_type = (lo & 0xff); 322 mtrr_state.def_type = (lo & 0xff);
323 mtrr_state.enabled = (lo & 0xc00) >> 10; 323 mtrr_state.enabled = (lo & 0xc00) >> 10;
324 324
@@ -583,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
583 __flush_tlb(); 583 __flush_tlb();
584 584
585 /* Save MTRR state */ 585 /* Save MTRR state */
586 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 586 rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
587 587
588 /* Disable MTRRs, and set the default type to uncached */ 588 /* Disable MTRRs, and set the default type to uncached */
589 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); 589 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
590} 590}
591 591
592static void post_set(void) __releases(set_atomicity_lock) 592static void post_set(void) __releases(set_atomicity_lock)
@@ -595,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock)
595 __flush_tlb(); 595 __flush_tlb();
596 596
597 /* Intel (P6) standard MTRRs */ 597 /* Intel (P6) standard MTRRs */
598 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 598 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
599 599
600 /* Enable caches */ 600 /* Enable caches */
601 write_cr0(read_cr0() & 0xbfffffff); 601 write_cr0(read_cr0() & 0xbfffffff);
@@ -707,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
707static int generic_have_wrcomb(void) 707static int generic_have_wrcomb(void)
708{ 708{
709 unsigned long config, dummy; 709 unsigned long config, dummy;
710 rdmsr(MTRRcap_MSR, config, dummy); 710 rdmsr(MSR_MTRRcap, config, dummy);
711 return (config & (1 << 10)); 711 return (config & (1 << 10));
712} 712}
713 713
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 03cda01f57c7..8fc248b5aeaf 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)
104 unsigned long config = 0, dummy; 104 unsigned long config = 0, dummy;
105 105
106 if (use_intel()) { 106 if (use_intel()) {
107 rdmsr(MTRRcap_MSR, config, dummy); 107 rdmsr(MSR_MTRRcap, config, dummy);
108 } else if (is_cpu(AMD)) 108 } else if (is_cpu(AMD))
109 config = 2; 109 config = 2;
110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) 110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 77f67f7b347a..7538b767f206 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,21 +5,6 @@
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/stddef.h> 6#include <linux/stddef.h>
7 7
8#define MTRRcap_MSR 0x0fe
9#define MTRRdefType_MSR 0x2ff
10
11#define MTRRfix64K_00000_MSR 0x250
12#define MTRRfix16K_80000_MSR 0x258
13#define MTRRfix16K_A0000_MSR 0x259
14#define MTRRfix4K_C0000_MSR 0x268
15#define MTRRfix4K_C8000_MSR 0x269
16#define MTRRfix4K_D0000_MSR 0x26a
17#define MTRRfix4K_D8000_MSR 0x26b
18#define MTRRfix4K_E0000_MSR 0x26c
19#define MTRRfix4K_E8000_MSR 0x26d
20#define MTRRfix4K_F0000_MSR 0x26e
21#define MTRRfix4K_F8000_MSR 0x26f
22
23#define MTRR_CHANGE_MASK_FIXED 0x01 8#define MTRR_CHANGE_MASK_FIXED 0x01
24#define MTRR_CHANGE_MASK_VARIABLE 0x02 9#define MTRR_CHANGE_MASK_VARIABLE 0x02
25#define MTRR_CHANGE_MASK_DEFTYPE 0x04 10#define MTRR_CHANGE_MASK_DEFTYPE 0x04
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 7f7e2753685b..1f5fb1588d1f 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
35 35
36 if (use_intel()) 36 if (use_intel())
37 /* Save MTRR state */ 37 /* Save MTRR state */
38 rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); 38 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
39 else 39 else
40 /* Cyrix ARRs - everything else were excluded at the top */ 40 /* Cyrix ARRs - everything else were excluded at the top */
41 ctxt->ccr3 = getCx86(CX86_CCR3); 41 ctxt->ccr3 = getCx86(CX86_CCR3);
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
46{ 46{
47 if (use_intel()) 47 if (use_intel())
48 /* Disable MTRRs, and set the default type to uncached */ 48 /* Disable MTRRs, and set the default type to uncached */
49 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, 49 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
50 ctxt->deftype_hi); 50 ctxt->deftype_hi);
51 else if (is_cpu(CYRIX)) 51 else if (is_cpu(CYRIX))
52 /* Cyrix ARRs - everything else were excluded at the top */ 52 /* Cyrix ARRs - everything else were excluded at the top */
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
64 /* Restore MTRRdefType */ 64 /* Restore MTRRdefType */
65 if (use_intel()) 65 if (use_intel())
66 /* Intel (P6) standard MTRRs */ 66 /* Intel (P6) standard MTRRs */
67 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); 67 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
68 else 68 else
69 /* Cyrix ARRs - everything else was excluded at the top */ 69 /* Cyrix ARRs - everything else was excluded at the top */
70 setCx86(CX86_CCR3, ctxt->ccr3); 70 setCx86(CX86_CCR3, ctxt->ccr3);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..76dfef23f789
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1721 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 *
10 * For licencing details see kernel-base/COPYING
11 */
12
13#include <linux/perf_counter.h>
14#include <linux/capability.h>
15#include <linux/notifier.h>
16#include <linux/hardirq.h>
17#include <linux/kprobes.h>
18#include <linux/module.h>
19#include <linux/kdebug.h>
20#include <linux/sched.h>
21#include <linux/uaccess.h>
22#include <linux/highmem.h>
23
24#include <asm/apic.h>
25#include <asm/stacktrace.h>
26#include <asm/nmi.h>
27
28static u64 perf_counter_mask __read_mostly;
29
30struct cpu_hw_counters {
31 struct perf_counter *counters[X86_PMC_IDX_MAX];
32 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
33 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
34 unsigned long interrupts;
35 int enabled;
36};
37
38/*
39 * struct x86_pmu - generic x86 pmu
40 */
41struct x86_pmu {
42 const char *name;
43 int version;
44 int (*handle_irq)(struct pt_regs *);
45 void (*disable_all)(void);
46 void (*enable_all)(void);
47 void (*enable)(struct hw_perf_counter *, int);
48 void (*disable)(struct hw_perf_counter *, int);
49 unsigned eventsel;
50 unsigned perfctr;
51 u64 (*event_map)(int);
52 u64 (*raw_event)(u64);
53 int max_events;
54 int num_counters;
55 int num_counters_fixed;
56 int counter_bits;
57 u64 counter_mask;
58 u64 max_period;
59 u64 intel_ctrl;
60};
61
62static struct x86_pmu x86_pmu __read_mostly;
63
64static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
65 .enabled = 1,
66};
67
68/*
69 * Intel PerfMon v3. Used on Core2 and later.
70 */
71static const u64 intel_perfmon_event_map[] =
72{
73 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
74 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
75 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
76 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
77 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
78 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
79 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
80};
81
82static u64 intel_pmu_event_map(int event)
83{
84 return intel_perfmon_event_map[event];
85}
86
87/*
88 * Generalized hw caching related event table, filled
89 * in on a per model basis. A value of 0 means
90 * 'not supported', -1 means 'event makes no sense on
91 * this CPU', any other value means the raw event
92 * ID.
93 */
94
95#define C(x) PERF_COUNT_HW_CACHE_##x
96
97static u64 __read_mostly hw_cache_event_ids
98 [PERF_COUNT_HW_CACHE_MAX]
99 [PERF_COUNT_HW_CACHE_OP_MAX]
100 [PERF_COUNT_HW_CACHE_RESULT_MAX];
101
102static const u64 nehalem_hw_cache_event_ids
103 [PERF_COUNT_HW_CACHE_MAX]
104 [PERF_COUNT_HW_CACHE_OP_MAX]
105 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
106{
107 [ C(L1D) ] = {
108 [ C(OP_READ) ] = {
109 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
110 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
111 },
112 [ C(OP_WRITE) ] = {
113 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
114 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
115 },
116 [ C(OP_PREFETCH) ] = {
117 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
118 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
119 },
120 },
121 [ C(L1I ) ] = {
122 [ C(OP_READ) ] = {
123 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
124 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
125 },
126 [ C(OP_WRITE) ] = {
127 [ C(RESULT_ACCESS) ] = -1,
128 [ C(RESULT_MISS) ] = -1,
129 },
130 [ C(OP_PREFETCH) ] = {
131 [ C(RESULT_ACCESS) ] = 0x0,
132 [ C(RESULT_MISS) ] = 0x0,
133 },
134 },
135 [ C(LL ) ] = {
136 [ C(OP_READ) ] = {
137 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
138 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
139 },
140 [ C(OP_WRITE) ] = {
141 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
142 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
143 },
144 [ C(OP_PREFETCH) ] = {
145 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
146 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
147 },
148 },
149 [ C(DTLB) ] = {
150 [ C(OP_READ) ] = {
151 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
152 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
153 },
154 [ C(OP_WRITE) ] = {
155 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
156 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
157 },
158 [ C(OP_PREFETCH) ] = {
159 [ C(RESULT_ACCESS) ] = 0x0,
160 [ C(RESULT_MISS) ] = 0x0,
161 },
162 },
163 [ C(ITLB) ] = {
164 [ C(OP_READ) ] = {
165 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
166 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
167 },
168 [ C(OP_WRITE) ] = {
169 [ C(RESULT_ACCESS) ] = -1,
170 [ C(RESULT_MISS) ] = -1,
171 },
172 [ C(OP_PREFETCH) ] = {
173 [ C(RESULT_ACCESS) ] = -1,
174 [ C(RESULT_MISS) ] = -1,
175 },
176 },
177 [ C(BPU ) ] = {
178 [ C(OP_READ) ] = {
179 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
180 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
181 },
182 [ C(OP_WRITE) ] = {
183 [ C(RESULT_ACCESS) ] = -1,
184 [ C(RESULT_MISS) ] = -1,
185 },
186 [ C(OP_PREFETCH) ] = {
187 [ C(RESULT_ACCESS) ] = -1,
188 [ C(RESULT_MISS) ] = -1,
189 },
190 },
191};
192
193static const u64 core2_hw_cache_event_ids
194 [PERF_COUNT_HW_CACHE_MAX]
195 [PERF_COUNT_HW_CACHE_OP_MAX]
196 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
197{
198 [ C(L1D) ] = {
199 [ C(OP_READ) ] = {
200 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
201 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
202 },
203 [ C(OP_WRITE) ] = {
204 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
205 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
206 },
207 [ C(OP_PREFETCH) ] = {
208 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
209 [ C(RESULT_MISS) ] = 0,
210 },
211 },
212 [ C(L1I ) ] = {
213 [ C(OP_READ) ] = {
214 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
215 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
216 },
217 [ C(OP_WRITE) ] = {
218 [ C(RESULT_ACCESS) ] = -1,
219 [ C(RESULT_MISS) ] = -1,
220 },
221 [ C(OP_PREFETCH) ] = {
222 [ C(RESULT_ACCESS) ] = 0,
223 [ C(RESULT_MISS) ] = 0,
224 },
225 },
226 [ C(LL ) ] = {
227 [ C(OP_READ) ] = {
228 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
229 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
230 },
231 [ C(OP_WRITE) ] = {
232 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
233 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
234 },
235 [ C(OP_PREFETCH) ] = {
236 [ C(RESULT_ACCESS) ] = 0,
237 [ C(RESULT_MISS) ] = 0,
238 },
239 },
240 [ C(DTLB) ] = {
241 [ C(OP_READ) ] = {
242 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
243 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
244 },
245 [ C(OP_WRITE) ] = {
246 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
247 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
248 },
249 [ C(OP_PREFETCH) ] = {
250 [ C(RESULT_ACCESS) ] = 0,
251 [ C(RESULT_MISS) ] = 0,
252 },
253 },
254 [ C(ITLB) ] = {
255 [ C(OP_READ) ] = {
256 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
257 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
258 },
259 [ C(OP_WRITE) ] = {
260 [ C(RESULT_ACCESS) ] = -1,
261 [ C(RESULT_MISS) ] = -1,
262 },
263 [ C(OP_PREFETCH) ] = {
264 [ C(RESULT_ACCESS) ] = -1,
265 [ C(RESULT_MISS) ] = -1,
266 },
267 },
268 [ C(BPU ) ] = {
269 [ C(OP_READ) ] = {
270 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
271 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
272 },
273 [ C(OP_WRITE) ] = {
274 [ C(RESULT_ACCESS) ] = -1,
275 [ C(RESULT_MISS) ] = -1,
276 },
277 [ C(OP_PREFETCH) ] = {
278 [ C(RESULT_ACCESS) ] = -1,
279 [ C(RESULT_MISS) ] = -1,
280 },
281 },
282};
283
284static const u64 atom_hw_cache_event_ids
285 [PERF_COUNT_HW_CACHE_MAX]
286 [PERF_COUNT_HW_CACHE_OP_MAX]
287 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
288{
289 [ C(L1D) ] = {
290 [ C(OP_READ) ] = {
291 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
292 [ C(RESULT_MISS) ] = 0,
293 },
294 [ C(OP_WRITE) ] = {
295 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
296 [ C(RESULT_MISS) ] = 0,
297 },
298 [ C(OP_PREFETCH) ] = {
299 [ C(RESULT_ACCESS) ] = 0x0,
300 [ C(RESULT_MISS) ] = 0,
301 },
302 },
303 [ C(L1I ) ] = {
304 [ C(OP_READ) ] = {
305 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
306 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
307 },
308 [ C(OP_WRITE) ] = {
309 [ C(RESULT_ACCESS) ] = -1,
310 [ C(RESULT_MISS) ] = -1,
311 },
312 [ C(OP_PREFETCH) ] = {
313 [ C(RESULT_ACCESS) ] = 0,
314 [ C(RESULT_MISS) ] = 0,
315 },
316 },
317 [ C(LL ) ] = {
318 [ C(OP_READ) ] = {
319 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
320 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
321 },
322 [ C(OP_WRITE) ] = {
323 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
324 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
325 },
326 [ C(OP_PREFETCH) ] = {
327 [ C(RESULT_ACCESS) ] = 0,
328 [ C(RESULT_MISS) ] = 0,
329 },
330 },
331 [ C(DTLB) ] = {
332 [ C(OP_READ) ] = {
333 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
334 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
335 },
336 [ C(OP_WRITE) ] = {
337 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
338 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
339 },
340 [ C(OP_PREFETCH) ] = {
341 [ C(RESULT_ACCESS) ] = 0,
342 [ C(RESULT_MISS) ] = 0,
343 },
344 },
345 [ C(ITLB) ] = {
346 [ C(OP_READ) ] = {
347 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
348 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
349 },
350 [ C(OP_WRITE) ] = {
351 [ C(RESULT_ACCESS) ] = -1,
352 [ C(RESULT_MISS) ] = -1,
353 },
354 [ C(OP_PREFETCH) ] = {
355 [ C(RESULT_ACCESS) ] = -1,
356 [ C(RESULT_MISS) ] = -1,
357 },
358 },
359 [ C(BPU ) ] = {
360 [ C(OP_READ) ] = {
361 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
362 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
363 },
364 [ C(OP_WRITE) ] = {
365 [ C(RESULT_ACCESS) ] = -1,
366 [ C(RESULT_MISS) ] = -1,
367 },
368 [ C(OP_PREFETCH) ] = {
369 [ C(RESULT_ACCESS) ] = -1,
370 [ C(RESULT_MISS) ] = -1,
371 },
372 },
373};
374
375static u64 intel_pmu_raw_event(u64 event)
376{
377#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
378#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
379#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
380#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
381#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
382
383#define CORE_EVNTSEL_MASK \
384 (CORE_EVNTSEL_EVENT_MASK | \
385 CORE_EVNTSEL_UNIT_MASK | \
386 CORE_EVNTSEL_EDGE_MASK | \
387 CORE_EVNTSEL_INV_MASK | \
388 CORE_EVNTSEL_COUNTER_MASK)
389
390 return event & CORE_EVNTSEL_MASK;
391}
392
/*
 * AMD event codes for the generic hw-cache events, indexed by
 * [cache][operation][result].  0 means "no event available",
 * -1 means "combination not supported" (see set_ext_hw_attr()).
 */
static const u64 amd_hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
 [ C(L1D) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
		[ C(RESULT_MISS)   ] = 0x0041, /* Data Cache Misses          */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x0042, /* Data Cache Refills from L2 */
		[ C(RESULT_MISS)   ] = 0,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
		[ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
	},
 },
 [ C(L1I ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
		[ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(LL  ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
		[ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
		[ C(RESULT_MISS)   ] = 0,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(DTLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
		[ C(RESULT_MISS)   ] = 0x0046, /* L1 DTLB and L2 DTLB Miss   */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = 0,
		[ C(RESULT_MISS)   ] = 0,
	},
 },
 [ C(ITLB) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches        */
		[ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
 [ C(BPU ) ] = {
	[ C(OP_READ) ] = {
		[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
		[ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
	},
	[ C(OP_WRITE) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
	[ C(OP_PREFETCH) ] = {
		[ C(RESULT_ACCESS) ] = -1,
		[ C(RESULT_MISS)   ] = -1,
	},
 },
};
483
/*
 * AMD Performance Monitor K7 and later.
 * Maps the generic PERF_COUNT_HW_* events to AMD event-select codes.
 */
static const u64 amd_perfmon_event_map[] =
{
  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0076,
  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0080,
  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0081,
  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
};
496
/*
 * Generic-event lookup hook for the AMD PMU; the caller
 * (__hw_perf_counter_init) bounds-checks against x86_pmu.max_events.
 */
static u64 amd_pmu_event_map(int event)
{
	return amd_perfmon_event_map[event];
}

/*
 * Sanitize a user-supplied raw config for AMD K7+.  Note the event
 * mask (0x7000000FF) also covers the extended event-select bits above
 * bit 32 of PERF_CTL.  Enable/interrupt bits stay kernel-controlled.
 */
static u64 amd_pmu_raw_event(u64 event)
{
#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
#define K7_EVNTSEL_INV_MASK	0x000800000ULL
#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL

#define K7_EVNTSEL_MASK			\
	(K7_EVNTSEL_EVENT_MASK |	\
	 K7_EVNTSEL_UNIT_MASK  |	\
	 K7_EVNTSEL_EDGE_MASK  |	\
	 K7_EVNTSEL_INV_MASK   |	\
	 K7_EVNTSEL_COUNTER_MASK)

	return event & K7_EVNTSEL_MASK;
}
519
/*
 * Propagate counter elapsed time into the generic counter.
 * Can only be executed on the CPU where the counter is active.
 * Returns the delta events processed.
 */
static u64
x86_perf_counter_update(struct perf_counter *counter,
			struct hw_perf_counter *hwc, int idx)
{
	/* Shift distance that sign-extends a counter_bits-wide value. */
	int shift = 64 - x86_pmu.counter_bits;
	u64 prev_raw_count, new_raw_count;
	s64 delta;

	/*
	 * Careful: an NMI might modify the previous counter value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic counter atomically:
	 */
again:
	prev_raw_count = atomic64_read(&hwc->prev_count);
	rdmsrl(hwc->counter_base + idx, new_raw_count);

	/* Retry if an NMI updated prev_count between the read and here. */
	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (counter-)time and add that to the generic counter.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.  Shifting up and (arithmetically) back down
	 * sign-extends both values before subtracting.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	atomic64_add(delta, &counter->count);
	atomic64_sub(delta, &hwc->period_left);

	return new_raw_count;
}
564
/* Number of live hardware counters; PMC MSRs are reserved while > 0. */
static atomic_t active_counters;
/* Serializes reserve/release of the PMC hardware on 0 <-> 1 transitions. */
static DEFINE_MUTEX(pmc_reserve_mutex);
567
/*
 * Claim every perfctr and eventsel MSR from the NMI-watchdog reservation
 * system, pausing the LAPIC NMI watchdog while we own them.  On failure
 * all partially-taken reservations are unwound and the watchdog resumes.
 * Returns true on success.
 */
static bool reserve_pmc_hardware(void)
{
	int i;

	if (nmi_watchdog == NMI_LOCAL_APIC)
		disable_lapic_nmi_watchdog();

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	/* Release the eventsels taken so far ... */
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu.eventsel + i);

	/* ... then fall through to release all perfctrs. */
	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu.perfctr + i);

	if (nmi_watchdog == NMI_LOCAL_APIC)
		enable_lapic_nmi_watchdog();

	return false;
}
602
603static void release_pmc_hardware(void)
604{
605 int i;
606
607 for (i = 0; i < x86_pmu.num_counters; i++) {
608 release_perfctr_nmi(x86_pmu.perfctr + i);
609 release_evntsel_nmi(x86_pmu.eventsel + i);
610 }
611
612 if (nmi_watchdog == NMI_LOCAL_APIC)
613 enable_lapic_nmi_watchdog();
614}
615
/*
 * Counter teardown hook: when the last active counter goes away,
 * release the PMC hardware.  atomic_dec_and_mutex_lock() only takes
 * the mutex (and returns true) on the 1 -> 0 transition.
 */
static void hw_perf_counter_destroy(struct perf_counter *counter)
{
	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		mutex_unlock(&pmc_reserve_mutex);
	}
}
623
/* A PMU driver was installed iff its IRQ handler hook is set. */
static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}
628
/*
 * Decode a PERF_TYPE_HW_CACHE config — packed as (type | op << 8 |
 * result << 16) — and OR the matching raw event id from the installed
 * hw_cache_event_ids table into hwc->config.
 *
 * Returns 0 on success, -EINVAL for out-of-range fields or unsupported
 * (-1) combinations, -ENOENT when no event exists (table entry 0).
 */
static inline int
set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
{
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	/* -1 converts to all-ones u64, matching the table's sentinel. */
	if (val == -1)
		return -EINVAL;

	hwc->config |= val;

	return 0;
}
661
/*
 * Setup the hardware configuration for a given attr_type.
 *
 * Takes a reference on the PMC hardware (reserving it on the first
 * user), then translates attr into hwc->config via the raw, hw-cache
 * or generic event path.  Returns 0 or a -errno.
 */
static int __hw_perf_counter_init(struct perf_counter *counter)
{
	struct perf_counter_attr *attr = &counter->attr;
	struct hw_perf_counter *hwc = &counter->hw;
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = 0;
	if (!atomic_inc_not_zero(&active_counters)) {
		/* Possibly the first counter: reserve under the mutex. */
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
			err = -EBUSY;
		else
			atomic_inc(&active_counters);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	hwc->config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to.
	 */
	if (!attr->exclude_user)
		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!attr->exclude_kernel)
		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;

	/* Counting (non-sampling) counters get the maximum period. */
	if (!hwc->sample_period) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		atomic64_set(&hwc->period_left, hwc->sample_period);
	}

	counter->destroy = hw_perf_counter_destroy;

	/*
	 * Raw event type provide the config in the event structure
	 */
	if (attr->type == PERF_TYPE_RAW) {
		hwc->config |= x86_pmu.raw_event(attr->config);
		return 0;
	}

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, attr);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;
	/*
	 * The generic map:
	 */
	hwc->config |= x86_pmu.event_map(attr->config);

	return 0;
}
728
/* Intel: one global-control MSR masks all counters at once. */
static void intel_pmu_disable_all(void)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
}

/*
 * AMD has no global control, so clear the enable bit in each active
 * counter's eventsel MSR individually.
 */
static void amd_pmu_disable_all(void)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	int idx;

	if (!cpuc->enabled)
		return;

	cpuc->enabled = 0;
	/*
	 * ensure we write the disable before we start disabling the
	 * counters proper, so that amd_pmu_enable_counter() does the
	 * right thing.
	 */
	barrier();

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
	}
}
762
763void hw_perf_disable(void)
764{
765 if (!x86_pmu_initialized())
766 return;
767 return x86_pmu.disable_all();
768}
769
/* Intel: restore the cached global-control mask, re-arming all counters. */
static void intel_pmu_enable_all(void)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
}

/*
 * AMD: mark the CPU enabled, then set the enable bit in each active
 * counter's eventsel MSR.  The barrier pairs with amd_pmu_disable_all().
 */
static void amd_pmu_enable_all(void)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	int idx;

	if (cpuc->enabled)
		return;

	cpuc->enabled = 1;
	barrier();

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
			continue;
		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
	}
}

/* Re-enable all counters on the current CPU via the vendor hook. */
void hw_perf_enable(void)
{
	if (!x86_pmu_initialized())
		return;
	x86_pmu.enable_all();
}
805
/* Read the global overflow-status MSR (one bit per overflowed counter). */
static inline u64 intel_pmu_get_status(void)
{
	u64 status;

	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);

	return status;
}

/* Clear the given overflow-status bits via the overflow-control MSR. */
static inline void intel_pmu_ack_status(u64 ack)
{
	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}
819
/* Write config with the enable bit set into the counter's eventsel MSR. */
static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
	int err;
	err = checking_wrmsrl(hwc->config_base + idx,
			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
}

/* Write config without the enable bit, stopping the counter. */
static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	int err;
	err = checking_wrmsrl(hwc->config_base + idx,
			      hwc->config);
}
833
/*
 * Stop a fixed-purpose counter by clearing its 4-bit control field in
 * the shared FIXED_CTR_CTRL MSR.  __idx is the global counter index;
 * the fixed counters start at X86_PMC_IDX_FIXED.
 */
static inline void
intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, mask;
	int err;

	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

/* Intel disable: dispatch on fixed vs. generic counter. */
static inline void
intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
		intel_pmu_disable_fixed(hwc, idx);
		return;
	}

	x86_pmu_disable_counter(hwc, idx);
}

/* AMD has generic counters only. */
static inline void
amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
	x86_pmu_disable_counter(hwc, idx);
}
864
/* Last period programmed into each counter, per CPU; for debug output. */
static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
866
/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the counter disabled in hw:
 *
 * Returns 1 when a new period was started (the overflow handlers use
 * this to decide whether a sample should be emitted), 0 otherwise.
 */
static int
x86_perf_counter_set_period(struct perf_counter *counter,
			     struct hw_perf_counter *hwc, int idx)
{
	s64 left = atomic64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int err, ret = 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		atomic64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		atomic64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs dont like it if just 1 event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	per_cpu(prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw counter starts counting from this counter offset,
	 * mark it to be able to extract future deltas:
	 */
	atomic64_set(&hwc->prev_count, (u64)-left);

	/* Counter counts up; it overflows after 'left' more events. */
	err = checking_wrmsrl(hwc->counter_base + idx,
			     (u64)(-left) & x86_pmu.counter_mask);

	return ret;
}
917
/*
 * Program and arm a fixed-purpose counter's 4-bit control field in the
 * shared FIXED_CTR_CTRL MSR (read-modify-write of just this counter's
 * nibble).
 */
static inline void
intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
{
	int idx = __idx - X86_PMC_IDX_FIXED;
	u64 ctrl_val, bits, mask;
	int err;

	/*
	 * Enable IRQ generation (0x8),
	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
	 * if requested:
	 */
	bits = 0x8ULL;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
		bits |= 0x2;
	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
		bits |= 0x1;
	bits <<= (idx * 4);
	mask = 0xfULL << (idx * 4);

	rdmsrl(hwc->config_base, ctrl_val);
	ctrl_val &= ~mask;
	ctrl_val |= bits;
	err = checking_wrmsrl(hwc->config_base, ctrl_val);
}

/* Intel enable: dispatch on fixed vs. generic counter. */
static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
		intel_pmu_enable_fixed(hwc, idx);
		return;
	}

	x86_pmu_enable_counter(hwc, idx);
}

/*
 * AMD enable: honour the per-cpu enabled flag set/cleared by
 * amd_pmu_enable_all()/amd_pmu_disable_all() — while globally disabled
 * the counter is (re)written in the disabled state instead.
 */
static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

	if (cpuc->enabled)
		x86_pmu_enable_counter(hwc, idx);
	else
		x86_pmu_disable_counter(hwc, idx);
}
963
/*
 * If this event can run on an Intel fixed-purpose counter, return that
 * counter's global index (X86_PMC_IDX_FIXED_*); otherwise return -1 and
 * the caller falls back to a generic counter.
 */
static int
fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
	unsigned int event;

	if (!x86_pmu.num_counters_fixed)
		return -1;

	/*
	 * Quirk, IA32_FIXED_CTRs do not work on current Atom processors:
	 */
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
	    boot_cpu_data.x86_model == 28)
		return -1;

	event = hwc->config & ARCH_PERFMON_EVENT_MASK;

	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
		return X86_PMC_IDX_FIXED_CPU_CYCLES;
	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
		return X86_PMC_IDX_FIXED_BUS_CYCLES;

	return -1;
}
990
/*
 * Find a PMC slot for the freshly enabled / scheduled in counter:
 * preferring a fixed-purpose counter, then the previously used generic
 * slot, then any free generic slot.  Returns 0 or -EAGAIN when all
 * counters are taken.
 */
static int x86_pmu_enable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	int idx;

	idx = fixed_mode_idx(counter, hwc);
	if (idx >= 0) {
		/*
		 * Try to get the fixed counter, if that is already taken
		 * then try to get a generic counter:
		 */
		if (test_and_set_bit(idx, cpuc->used_mask))
			goto try_generic;

		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		/*
		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
		 */
		hwc->counter_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
		hwc->idx = idx;
	} else {
		idx = hwc->idx;
		/* Try to get the previous generic counter again */
		if (test_and_set_bit(idx, cpuc->used_mask)) {
try_generic:
			idx = find_first_zero_bit(cpuc->used_mask,
						  x86_pmu.num_counters);
			if (idx == x86_pmu.num_counters)
				return -EAGAIN;

			set_bit(idx, cpuc->used_mask);
			hwc->idx = idx;
		}
		hwc->config_base  = x86_pmu.eventsel;
		hwc->counter_base = x86_pmu.perfctr;
	}

	perf_counters_lapic_init();

	/* Program the counter from a known-disabled state. */
	x86_pmu.disable(hwc, idx);

	cpuc->counters[idx] = counter;
	set_bit(idx, cpuc->active_mask);

	x86_perf_counter_set_period(counter, hwc, idx);
	x86_pmu.enable(hwc, idx);

	return 0;
}
1046
/*
 * Re-arm a counter that was disabled by interrupt-rate throttling.
 * Bails out if the counter no longer owns its hardware slot.
 */
static void x86_pmu_unthrottle(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;

	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
				cpuc->counters[hwc->idx] != counter))
		return;

	x86_pmu.enable(hwc, hwc->idx);
}
1058
/*
 * Dump the full PMU state of the current CPU (global control/status
 * MSRs, every generic and fixed counter) to the kernel log; used from
 * the stuck-IRQ path in intel_pmu_handle_irq().
 */
void perf_counter_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	struct cpu_hw_counters *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	/* Keep the snapshot on one CPU and consistent. */
	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	/* The global control/status MSRs exist from arch perfmon v2 on. */
	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
	}
	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);

		prev_left = per_cpu(prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}
1109
/*
 * Schedule a counter out: stop the hardware, drain the residual delta
 * into the generic counter and free the PMC slot.
 */
static void x86_pmu_disable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	int idx = hwc->idx;

	/*
	 * Must be done before we disable, otherwise the nmi handler
	 * could reenable again:
	 */
	clear_bit(idx, cpuc->active_mask);
	x86_pmu.disable(hwc, idx);

	/*
	 * Make sure the cleared pointer becomes visible before we
	 * (potentially) free the counter:
	 */
	barrier();

	/*
	 * Drain the remaining delta count out of a counter
	 * that we are disabling:
	 */
	x86_perf_counter_update(counter, hwc, idx);
	cpuc->counters[idx] = NULL;
	clear_bit(idx, cpuc->used_mask);
}
1137
/*
 * Save and restart an expired counter. Called by NMI contexts,
 * so it has to be careful about preempting normal counter ops:
 *
 * Returns the value of x86_perf_counter_set_period(): non-zero when a
 * new period was started, i.e. when the caller should emit a sample.
 */
static int intel_pmu_save_and_restart(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;
	int idx = hwc->idx;
	int ret;

	x86_perf_counter_update(counter, hwc, idx);
	ret = x86_perf_counter_set_period(counter, hwc, idx);

	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
		intel_pmu_enable_counter(hwc, idx);

	return ret;
}
1156
/*
 * Last-resort recovery: zero every eventsel and counter MSR on this
 * CPU.  Used when the overflow IRQ handler appears stuck in a loop.
 */
static void intel_pmu_reset(void)
{
	unsigned long flags;
	int idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	printk("clearing PMU state on CPU#%d\n", smp_processor_id());

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
	}

	local_irq_restore(flags);
}
1179
1180
/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 *
 * Processes the global overflow-status bits, restarting each overflowed
 * counter and emitting samples.  Returns non-zero when any overflow was
 * handled (i.e. the NMI was ours).
 */
static int intel_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_counters *cpuc;
	int bit, cpu, loops;
	u64 ack, status;

	data.regs = regs;
	data.addr = 0;

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	perf_disable();
	status = intel_pmu_get_status();
	if (!status) {
		perf_enable();
		return 0;
	}

	loops = 0;
again:
	/* Safety net: if the status never clears, hard-reset the PMU. */
	if (++loops > 100) {
		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
		perf_counter_print_debug();
		intel_pmu_reset();
		perf_enable();
		return 1;
	}

	inc_irq_stat(apic_perf_irqs);
	ack = status;
	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
		struct perf_counter *counter = cpuc->counters[bit];

		clear_bit(bit, (unsigned long *) &status);
		if (!test_bit(bit, cpuc->active_mask))
			continue;

		if (!intel_pmu_save_and_restart(counter))
			continue;

		data.period = counter->hw.last_period;

		/* Throttled: leave the counter off until unthrottled. */
		if (perf_counter_overflow(counter, 1, &data))
			intel_pmu_disable_counter(&counter->hw, bit);
	}

	intel_pmu_ack_status(ack);

	/*
	 * Repeat if there is more work to be done:
	 */
	status = intel_pmu_get_status();
	if (status)
		goto again;

	perf_enable();

	return 1;
}
1246
/*
 * AMD overflow handler: no global status register, so poll every active
 * counter and detect overflow by the top counter bit having cleared
 * (counters are programmed to count up towards overflow).  Returns
 * non-zero when any overflow was handled.
 */
static int amd_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_counters *cpuc;
	struct perf_counter *counter;
	struct hw_perf_counter *hwc;
	int cpu, idx, handled = 0;
	u64 val;

	data.regs = regs;
	data.addr = 0;

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask))
			continue;

		counter = cpuc->counters[idx];
		hwc = &counter->hw;

		val = x86_perf_counter_update(counter, hwc, idx);
		/* Top bit still set => no overflow yet on this counter. */
		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
			continue;

		/*
		 * counter overflow
		 */
		handled = 1;
		data.period = counter->hw.last_period;

		if (!x86_perf_counter_set_period(counter, hwc, idx))
			continue;

		/* Throttled: leave the counter off until unthrottled. */
		if (perf_counter_overflow(counter, 1, &data))
			amd_pmu_disable_counter(hwc, idx);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}
1291
/*
 * Self-IPI handler for deferred perf-counter work (wakeups etc.)
 * raised via set_perf_counter_pending().
 */
void smp_perf_pending_interrupt(struct pt_regs *regs)
{
	irq_enter();
	ack_APIC_irq();
	inc_irq_stat(apic_pending_irqs);
	perf_counter_do_pending();
	irq_exit();
}

/* Queue deferred perf work by sending ourselves the pending vector. */
void set_perf_counter_pending(void)
{
	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
}

/* Route PMU overflow interrupts to NMI via the LAPIC LVTPC entry. */
void perf_counters_lapic_init(void)
{
	if (!x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}
1316
/*
 * die-notifier entry point: hand PMU NMIs to the vendor IRQ handler.
 * Marked __kprobes so it can't be probed while handling an NMI.
 */
static int __kprobes
perf_counter_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	struct pt_regs *regs;

	if (!atomic_read(&active_counters))
		return NOTIFY_DONE;

	switch (cmd) {
	case DIE_NMI:
	case DIE_NMI_IPI:
		break;

	default:
		return NOTIFY_DONE;
	}

	regs = args->regs;

	/* Re-arm LVTPC: the APIC masks it when the NMI is delivered. */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
	/*
	 * Can't rely on the handled return value to say it was our NMI, two
	 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
	 *
	 * If the first NMI handles both, the latter will be empty and daze
	 * the CPU.
	 */
	x86_pmu.handle_irq(regs);

	return NOTIFY_STOP;
}

/* Registered from init_hw_perf_counters(); priority 1 runs us early. */
static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
	.notifier_call		= perf_counter_nmi_handler,
	.next			= NULL,
	.priority		= 1
};
1356
/*
 * Vendor description for architectural-perfmon Intel CPUs; counter
 * geometry (num_counters, counter_bits, ...) is filled in at runtime
 * by intel_pmu_init() from CPUID leaf 0xA.
 */
static struct x86_pmu intel_pmu = {
	.name			= "Intel",
	.handle_irq		= intel_pmu_handle_irq,
	.disable_all		= intel_pmu_disable_all,
	.enable_all		= intel_pmu_enable_all,
	.enable			= intel_pmu_enable_counter,
	.disable		= intel_pmu_disable_counter,
	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
	.event_map		= intel_pmu_event_map,
	.raw_event		= intel_pmu_raw_event,
	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
	/*
	 * Intel PMCs cannot be accessed sanely above 32 bit width,
	 * so we install an artificial 1<<31 period regardless of
	 * the generic counter period:
	 */
	.max_period		= (1ULL << 31) - 1,
};

/* Vendor description for AMD K7+; geometry is fixed (4 x 48-bit). */
static struct x86_pmu amd_pmu = {
	.name			= "AMD",
	.handle_irq		= amd_pmu_handle_irq,
	.disable_all		= amd_pmu_disable_all,
	.enable_all		= amd_pmu_enable_all,
	.enable			= amd_pmu_enable_counter,
	.disable		= amd_pmu_disable_counter,
	.eventsel		= MSR_K7_EVNTSEL0,
	.perfctr		= MSR_K7_PERFCTR0,
	.event_map		= amd_pmu_event_map,
	.raw_event		= amd_pmu_raw_event,
	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
	.num_counters		= 4,
	.counter_bits		= 48,
	.counter_mask		= (1ULL << 48) - 1,
	/* use highest bit to detect overflow */
	.max_period		= (1ULL << 47) - 1,
};
1395
/*
 * Probe the Intel architectural PMU via CPUID leaf 0xA, require
 * perfmon version >= 2, install intel_pmu as x86_pmu and select the
 * model-specific hw-cache event table.  Returns 0 or -ENODEV.
 */
static int intel_pmu_init(void)
{
	union cpuid10_edx edx;
	union cpuid10_eax eax;
	unsigned int unused;
	unsigned int ebx;
	int version;

	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
		return -ENODEV;

	/*
	 * Check whether the Architectural PerfMon supports
	 * Branch Misses Retired Event or not.
	 */
	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
		return -ENODEV;

	version = eax.split.version_id;
	if (version < 2)
		return -ENODEV;

	x86_pmu				= intel_pmu;
	x86_pmu.version			= version;
	x86_pmu.num_counters		= eax.split.num_counters;
	x86_pmu.counter_bits		= eax.split.bit_width;
	x86_pmu.counter_mask		= (1ULL << eax.split.bit_width) - 1;

	/*
	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
	 * assume at least 3 counters:
	 */
	x86_pmu.num_counters_fixed	= max((int)edx.split.num_counters_fixed, 3);

	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);

	/*
	 * Install the hw-cache-events table:
	 *
	 * Note the deliberate placement of "default:" next to case 26 —
	 * unknown models share the Nehalem table.
	 */
	switch (boot_cpu_data.x86_model) {
	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
	case 29: /* six-core 45 nm xeon "Dunnington" */
		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
		       sizeof(hw_cache_event_ids));

		pr_cont("Core2 events, ");
		break;
	default:
	case 26:
		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
		       sizeof(hw_cache_event_ids));

		pr_cont("Nehalem/Corei7 events, ");
		break;
	case 28:
		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
		       sizeof(hw_cache_event_ids));

		pr_cont("Atom events, ");
		break;
	}
	return 0;
}
1462
/*
 * Probe/install the AMD PMU.  Family 6 is K7, the first AMD CPU with
 * these performance counters, hence the "< 6" cut-off.
 */
static int amd_pmu_init(void)
{
	/* Performance-monitoring supported from K7 and later: */
	if (boot_cpu_data.x86 < 6)
		return -ENODEV;

	x86_pmu = amd_pmu;

	/* Events are common for all AMDs */
	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
	       sizeof(hw_cache_event_ids));

	return 0;
}
1477
1478void __init init_hw_perf_counters(void)
1479{
1480 int err;
1481
1482 pr_info("Performance Counters: ");
1483
1484 switch (boot_cpu_data.x86_vendor) {
1485 case X86_VENDOR_INTEL:
1486 err = intel_pmu_init();
1487 break;
1488 case X86_VENDOR_AMD:
1489 err = amd_pmu_init();
1490 break;
1491 default:
1492 return;
1493 }
1494 if (err != 0) {
1495 pr_cont("no PMU driver, software counters only.\n");
1496 return;
1497 }
1498
1499 pr_cont("%s PMU driver.\n", x86_pmu.name);
1500
1501 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1502 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1503 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
1504 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1505 }
1506 perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
1507 perf_max_counters = x86_pmu.num_counters;
1508
1509 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1510 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1511 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
1512 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1513 }
1514
1515 perf_counter_mask |=
1516 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1517
1518 perf_counters_lapic_init();
1519 register_die_notifier(&perf_counter_nmi_notifier);
1520
1521 pr_info("... version: %d\n", x86_pmu.version);
1522 pr_info("... bit width: %d\n", x86_pmu.counter_bits);
1523 pr_info("... generic counters: %d\n", x86_pmu.num_counters);
1524 pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
1525 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1526 pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed);
1527 pr_info("... counter mask: %016Lx\n", perf_counter_mask);
1528}
1529
/* ->read hook: fold the current hardware delta into counter->count. */
static inline void x86_pmu_read(struct perf_counter *counter)
{
	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
}

/* Generic-PMU operations exported through hw_perf_counter_init(). */
static const struct pmu pmu = {
	.enable		= x86_pmu_enable,
	.disable	= x86_pmu_disable,
	.read		= x86_pmu_read,
	.unthrottle	= x86_pmu_unthrottle,
};

/*
 * Core-perf entry point: configure the hardware part of a new counter
 * and hand back the x86 pmu ops, or an ERR_PTR on failure.
 */
const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
{
	int err;

	err = __hw_perf_counter_init(counter);
	if (err)
		return ERR_PTR(err);

	return &pmu;
}
1552
1553/*
1554 * callchain support
1555 */
1556
1557static inline
1558void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1559{
1560 if (entry->nr < PERF_MAX_STACK_DEPTH)
1561 entry->ip[entry->nr++] = ip;
1562}
1563
/* Per-cpu callchain scratch buffers, one each for IRQ and NMI context. */
static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1566
1567
/* stacktrace_ops callbacks used to walk the kernel stack for callchains. */

static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	/* Ignore warnings */
}

static void backtrace_warning(void *data, char *msg)
{
	/* Ignore warnings */
}

static int backtrace_stack(void *data, char *name)
{
	/* Process all stacks: */
	return 0;
}

/* Record only addresses the unwinder considers reliable. */
static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	if (reliable)
		callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.warning		= backtrace_warning,
	.warning_symbol		= backtrace_warning_symbol,
	.stack			= backtrace_stack,
	.address		= backtrace_address,
};
1599
1600#include "../dumpstack.h"
1601
/*
 * Record the kernel-side callchain: a KERNEL context marker, the
 * interrupted IP, then the frames found by dump_trace().
 */
static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	callchain_store(entry, PERF_CONTEXT_KERNEL);
	callchain_store(entry, regs->ip);

	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
}
1610
/*
 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
 *
 * Copies up to n bytes page by page via __get_user_pages_fast() +
 * kmap_atomic(); returns the number of bytes actually copied (may be
 * short if a user page could not be pinned).
 */
static unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
{
	unsigned long offset, addr = (unsigned long)from;
	/* Pick the kmap slot matching the interrupt context we're in. */
	int type = in_nmi() ? KM_NMI : KM_IRQ0;
	unsigned long size, len = 0;
	struct page *page;
	void *map;
	int ret;

	do {
		ret = __get_user_pages_fast(addr, 1, 0, &page);
		if (!ret)
			break;

		/* Clamp to the remainder of this page and of the request. */
		offset = addr & (PAGE_SIZE - 1);
		size = min(PAGE_SIZE - offset, n - len);

		map = kmap_atomic(page, type);
		memcpy(to, map+offset, size);
		kunmap_atomic(map, type);
		put_page(page);

		len  += size;
		to   += size;
		addr += size;

	} while (len < n);

	return len;
}
1645
1646static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
1647{
1648 unsigned long bytes;
1649
1650 bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
1651
1652 return bytes == sizeof(*frame);
1653}
1654
/*
 * Record the user-side callchain by walking the saved frame-pointer
 * chain from regs->bp.  Stops at depth limit, on an unreadable frame,
 * or when a frame pointer moves below the stack pointer (corrupt or
 * finished chain).
 */
static void
perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	struct stack_frame frame;
	const void __user *fp;

	/* Interrupted in the kernel: use the task's saved user regs. */
	if (!user_mode(regs))
		regs = task_pt_regs(current);

	fp = (void __user *)regs->bp;

	callchain_store(entry, PERF_CONTEXT_USER);
	callchain_store(entry, regs->ip);

	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		frame.next_frame	     = NULL;
		frame.return_address = 0;

		if (!copy_stack_frame(fp, &frame))
			break;

		if ((unsigned long)fp < regs->sp)
			break;

		callchain_store(entry, frame.return_address);
		fp = frame.next_frame;
	}
}
1683
/*
 * Fill in kernel and/or user callchains for the interrupted context,
 * skipping the idle task and (for user samples) tasks that are not
 * running.
 */
static void
perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	int is_user;

	if (!regs)
		return;

	is_user = user_mode(regs);

	/* pid 0 is the idle task — nothing useful to unwind. */
	if (!current || current->pid == 0)
		return;

	if (is_user && current->state != TASK_RUNNING)
		return;

	if (!is_user)
		perf_callchain_kernel(regs, entry);

	/* Kernel threads have no mm and hence no user stack. */
	if (current->mm)
		perf_callchain_user(regs, entry);
}
1706
/*
 * Capture a callchain for the current sample into the per-cpu buffer
 * matching our context (separate NMI and IRQ buffers, since an NMI can
 * interrupt IRQ-context unwinding).
 */
struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
	struct perf_callchain_entry *entry;

	if (in_nmi())
		entry = &__get_cpu_var(nmi_entry);
	else
		entry = &__get_cpu_var(irq_entry);

	entry->nr = 0;

	perf_do_callchain(regs, entry);

	return entry;
}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e32..5c481f6205bf 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/genapic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
@@ -716,11 +716,15 @@ static void probe_nmi_watchdog(void)
716 wd_ops = &k7_wd_ops; 716 wd_ops = &k7_wd_ops;
717 break; 717 break;
718 case X86_VENDOR_INTEL: 718 case X86_VENDOR_INTEL:
719 /* 719 /* Work around where perfctr1 doesn't have a working enable
720 * Work around Core Duo (Yonah) errata AE49 where perfctr1 720 * bit as described in the following errata:
721 * doesn't have a working enable bit. 721 * AE49 Core Duo and Intel Core Solo 65 nm
722 * AN49 Intel Pentium Dual-Core
723 * AF49 Dual-Core Intel Xeon Processor LV
722 */ 724 */
723 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { 725 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
726 ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
727 boot_cpu_data.x86_mask == 4))) {
724 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; 728 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
725 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; 729 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
726 } 730 }
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2ac1f0c2beb3..b07af8861244 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -182,6 +182,11 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
182 .notifier_call = cpuid_class_cpu_callback, 182 .notifier_call = cpuid_class_cpu_callback,
183}; 183};
184 184
185static char *cpuid_nodename(struct device *dev)
186{
187 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
188}
189
185static int __init cpuid_init(void) 190static int __init cpuid_init(void)
186{ 191{
187 int i, err = 0; 192 int i, err = 0;
@@ -198,6 +203,7 @@ static int __init cpuid_init(void)
198 err = PTR_ERR(cpuid_class); 203 err = PTR_ERR(cpuid_class);
199 goto out_chrdev; 204 goto out_chrdev;
200 } 205 }
206 cpuid_class->nodename = cpuid_nodename;
201 for_each_online_cpu(i) { 207 for_each_online_cpu(i) {
202 err = cpuid_device_create(i); 208 err = cpuid_device_create(i);
203 if (err != 0) 209 if (err != 0)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ff958248e61d..5e409dc298a4 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,6 +27,7 @@
27#include <asm/cpu.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30#include <asm/iommu.h>
30 31
31 32
32#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
@@ -103,5 +104,10 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
103#ifdef CONFIG_HPET_TIMER 104#ifdef CONFIG_HPET_TIMER
104 hpet_disable(); 105 hpet_disable();
105#endif 106#endif
107
108#ifdef CONFIG_X86_64
109 pci_iommu_shutdown();
110#endif
111
106 crash_save_cpu(regs, safe_smp_processor_id()); 112 crash_save_cpu(regs, safe_smp_processor_id());
107} 113}
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 87b67e3a765a..48bfe1386038 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -19,45 +19,61 @@
19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009 19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
20 */ 20 */
21 21
22 22#include <linux/kernel.h>
23#include <asm/ds.h>
24
25#include <linux/errno.h>
26#include <linux/string.h> 23#include <linux/string.h>
27#include <linux/slab.h> 24#include <linux/errno.h>
28#include <linux/sched.h> 25#include <linux/sched.h>
26#include <linux/slab.h>
29#include <linux/mm.h> 27#include <linux/mm.h>
30#include <linux/kernel.h> 28#include <linux/trace_clock.h>
29
30#include <asm/ds.h>
31 31
32#include "ds_selftest.h"
32 33
33/* 34/*
34 * The configuration for a particular DS hardware implementation. 35 * The configuration for a particular DS hardware implementation:
35 */ 36 */
36struct ds_configuration { 37struct ds_configuration {
37 /* the name of the configuration */ 38 /* The name of the configuration: */
38 const char *name; 39 const char *name;
39 /* the size of one pointer-typed field in the DS structure and 40
40 in the BTS and PEBS buffers in bytes; 41 /* The size of pointer-typed fields in DS, BTS, and PEBS: */
41 this covers the first 8 DS fields related to buffer management. */ 42 unsigned char sizeof_ptr_field;
42 unsigned char sizeof_field; 43
43 /* the size of a BTS/PEBS record in bytes */ 44 /* The size of a BTS/PEBS record in bytes: */
44 unsigned char sizeof_rec[2]; 45 unsigned char sizeof_rec[2];
45 /* a series of bit-masks to control various features indexed 46
46 * by enum ds_feature */ 47 /* The number of pebs counter reset values in the DS structure. */
47 unsigned long ctl[dsf_ctl_max]; 48 unsigned char nr_counter_reset;
49
50 /* Control bit-masks indexed by enum ds_feature: */
51 unsigned long ctl[dsf_ctl_max];
48}; 52};
49static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); 53static struct ds_configuration ds_cfg __read_mostly;
54
55
56/* Maximal size of a DS configuration: */
57#define MAX_SIZEOF_DS 0x80
50 58
51#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) 59/* Maximal size of a BTS record: */
60#define MAX_SIZEOF_BTS (3 * 8)
52 61
53#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ 62/* BTS and PEBS buffer alignment: */
54#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ 63#define DS_ALIGNMENT (1 << 3)
55#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
56 64
57#define BTS_CONTROL \ 65/* Number of buffer pointers in DS: */
58 (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ 66#define NUM_DS_PTR_FIELDS 8
59 ds_cfg.ctl[dsf_bts_overflow])
60 67
68/* Size of a pebs reset value in DS: */
69#define PEBS_RESET_FIELD_SIZE 8
70
71/* Mask of control bits in the DS MSR register: */
72#define BTS_CONTROL \
73 ( ds_cfg.ctl[dsf_bts] | \
74 ds_cfg.ctl[dsf_bts_kernel] | \
75 ds_cfg.ctl[dsf_bts_user] | \
76 ds_cfg.ctl[dsf_bts_overflow] )
61 77
62/* 78/*
63 * A BTS or PEBS tracer. 79 * A BTS or PEBS tracer.
@@ -66,29 +82,36 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
66 * to identify tracers. 82 * to identify tracers.
67 */ 83 */
68struct ds_tracer { 84struct ds_tracer {
69 /* the DS context (partially) owned by this tracer */ 85 /* The DS context (partially) owned by this tracer. */
70 struct ds_context *context; 86 struct ds_context *context;
71 /* the buffer provided on ds_request() and its size in bytes */ 87 /* The buffer provided on ds_request() and its size in bytes. */
72 void *buffer; 88 void *buffer;
73 size_t size; 89 size_t size;
74}; 90};
75 91
76struct bts_tracer { 92struct bts_tracer {
77 /* the common DS part */ 93 /* The common DS part: */
78 struct ds_tracer ds; 94 struct ds_tracer ds;
79 /* the trace including the DS configuration */ 95
80 struct bts_trace trace; 96 /* The trace including the DS configuration: */
81 /* buffer overflow notification function */ 97 struct bts_trace trace;
82 bts_ovfl_callback_t ovfl; 98
99 /* Buffer overflow notification function: */
100 bts_ovfl_callback_t ovfl;
101
102 /* Active flags affecting trace collection. */
103 unsigned int flags;
83}; 104};
84 105
85struct pebs_tracer { 106struct pebs_tracer {
86 /* the common DS part */ 107 /* The common DS part: */
87 struct ds_tracer ds; 108 struct ds_tracer ds;
88 /* the trace including the DS configuration */ 109
89 struct pebs_trace trace; 110 /* The trace including the DS configuration: */
90 /* buffer overflow notification function */ 111 struct pebs_trace trace;
91 pebs_ovfl_callback_t ovfl; 112
113 /* Buffer overflow notification function: */
114 pebs_ovfl_callback_t ovfl;
92}; 115};
93 116
94/* 117/*
@@ -97,6 +120,7 @@ struct pebs_tracer {
97 * 120 *
98 * The DS configuration consists of the following fields; different 121 * The DS configuration consists of the following fields; different
99 * architetures vary in the size of those fields. 122 * architetures vary in the size of those fields.
123 *
100 * - double-word aligned base linear address of the BTS buffer 124 * - double-word aligned base linear address of the BTS buffer
101 * - write pointer into the BTS buffer 125 * - write pointer into the BTS buffer
102 * - end linear address of the BTS buffer (one byte beyond the end of 126 * - end linear address of the BTS buffer (one byte beyond the end of
@@ -135,21 +159,22 @@ enum ds_field {
135}; 159};
136 160
137enum ds_qualifier { 161enum ds_qualifier {
138 ds_bts = 0, 162 ds_bts = 0,
139 ds_pebs 163 ds_pebs
140}; 164};
141 165
142static inline unsigned long ds_get(const unsigned char *base, 166static inline unsigned long
143 enum ds_qualifier qual, enum ds_field field) 167ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
144{ 168{
145 base += (ds_cfg.sizeof_field * (field + (4 * qual))); 169 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
146 return *(unsigned long *)base; 170 return *(unsigned long *)base;
147} 171}
148 172
149static inline void ds_set(unsigned char *base, enum ds_qualifier qual, 173static inline void
150 enum ds_field field, unsigned long value) 174ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
175 unsigned long value)
151{ 176{
152 base += (ds_cfg.sizeof_field * (field + (4 * qual))); 177 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
153 (*(unsigned long *)base) = value; 178 (*(unsigned long *)base) = value;
154} 179}
155 180
@@ -159,7 +184,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
159 */ 184 */
160static DEFINE_SPINLOCK(ds_lock); 185static DEFINE_SPINLOCK(ds_lock);
161 186
162
163/* 187/*
164 * We either support (system-wide) per-cpu or per-thread allocation. 188 * We either support (system-wide) per-cpu or per-thread allocation.
165 * We distinguish the two based on the task_struct pointer, where a 189 * We distinguish the two based on the task_struct pointer, where a
@@ -178,12 +202,28 @@ static DEFINE_SPINLOCK(ds_lock);
178 */ 202 */
179static atomic_t tracers = ATOMIC_INIT(0); 203static atomic_t tracers = ATOMIC_INIT(0);
180 204
181static inline void get_tracer(struct task_struct *task) 205static inline int get_tracer(struct task_struct *task)
182{ 206{
183 if (task) 207 int error;
208
209 spin_lock_irq(&ds_lock);
210
211 if (task) {
212 error = -EPERM;
213 if (atomic_read(&tracers) < 0)
214 goto out;
184 atomic_inc(&tracers); 215 atomic_inc(&tracers);
185 else 216 } else {
217 error = -EPERM;
218 if (atomic_read(&tracers) > 0)
219 goto out;
186 atomic_dec(&tracers); 220 atomic_dec(&tracers);
221 }
222
223 error = 0;
224out:
225 spin_unlock_irq(&ds_lock);
226 return error;
187} 227}
188 228
189static inline void put_tracer(struct task_struct *task) 229static inline void put_tracer(struct task_struct *task)
@@ -194,14 +234,6 @@ static inline void put_tracer(struct task_struct *task)
194 atomic_inc(&tracers); 234 atomic_inc(&tracers);
195} 235}
196 236
197static inline int check_tracer(struct task_struct *task)
198{
199 return task ?
200 (atomic_read(&tracers) >= 0) :
201 (atomic_read(&tracers) <= 0);
202}
203
204
205/* 237/*
206 * The DS context is either attached to a thread or to a cpu: 238 * The DS context is either attached to a thread or to a cpu:
207 * - in the former case, the thread_struct contains a pointer to the 239 * - in the former case, the thread_struct contains a pointer to the
@@ -213,61 +245,58 @@ static inline int check_tracer(struct task_struct *task)
213 * deallocated when the last user puts the context. 245 * deallocated when the last user puts the context.
214 */ 246 */
215struct ds_context { 247struct ds_context {
216 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ 248 /* The DS configuration; goes into MSR_IA32_DS_AREA: */
217 unsigned char ds[MAX_SIZEOF_DS]; 249 unsigned char ds[MAX_SIZEOF_DS];
218 /* the owner of the BTS and PEBS configuration, respectively */ 250
219 struct bts_tracer *bts_master; 251 /* The owner of the BTS and PEBS configuration, respectively: */
220 struct pebs_tracer *pebs_master; 252 struct bts_tracer *bts_master;
221 /* use count */ 253 struct pebs_tracer *pebs_master;
222 unsigned long count;
223 /* a pointer to the context location inside the thread_struct
224 * or the per_cpu context array */
225 struct ds_context **this;
226 /* a pointer to the task owning this context, or NULL, if the
227 * context is owned by a cpu */
228 struct task_struct *task;
229};
230 254
231static DEFINE_PER_CPU(struct ds_context *, system_context_array); 255 /* Use count: */
256 unsigned long count;
232 257
233#define system_context per_cpu(system_context_array, smp_processor_id()) 258 /* Pointer to the context pointer field: */
259 struct ds_context **this;
260
261 /* The traced task; NULL for cpu tracing: */
262 struct task_struct *task;
263
264 /* The traced cpu; only valid if task is NULL: */
265 int cpu;
266};
234 267
268static DEFINE_PER_CPU(struct ds_context *, cpu_context);
235 269
236static inline struct ds_context *ds_get_context(struct task_struct *task) 270
271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
237{ 272{
238 struct ds_context **p_context = 273 struct ds_context **p_context =
239 (task ? &task->thread.ds_ctx : &system_context); 274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
240 struct ds_context *context = NULL; 275 struct ds_context *context = NULL;
241 struct ds_context *new_context = NULL; 276 struct ds_context *new_context = NULL;
242 unsigned long irq;
243 277
244 /* Chances are small that we already have a context. */ 278 /* Chances are small that we already have a context. */
245 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); 279 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
246 if (!new_context) 280 if (!new_context)
247 return NULL; 281 return NULL;
248 282
249 spin_lock_irqsave(&ds_lock, irq); 283 spin_lock_irq(&ds_lock);
250 284
251 context = *p_context; 285 context = *p_context;
252 if (!context) { 286 if (likely(!context)) {
253 context = new_context; 287 context = new_context;
254 288
255 context->this = p_context; 289 context->this = p_context;
256 context->task = task; 290 context->task = task;
291 context->cpu = cpu;
257 context->count = 0; 292 context->count = 0;
258 293
259 if (task)
260 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
261
262 if (!task || (task == current))
263 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
264
265 *p_context = context; 294 *p_context = context;
266 } 295 }
267 296
268 context->count++; 297 context->count++;
269 298
270 spin_unlock_irqrestore(&ds_lock, irq); 299 spin_unlock_irq(&ds_lock);
271 300
272 if (context != new_context) 301 if (context != new_context)
273 kfree(new_context); 302 kfree(new_context);
@@ -275,8 +304,9 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
275 return context; 304 return context;
276} 305}
277 306
278static inline void ds_put_context(struct ds_context *context) 307static void ds_put_context(struct ds_context *context)
279{ 308{
309 struct task_struct *task;
280 unsigned long irq; 310 unsigned long irq;
281 311
282 if (!context) 312 if (!context)
@@ -291,17 +321,55 @@ static inline void ds_put_context(struct ds_context *context)
291 321
292 *(context->this) = NULL; 322 *(context->this) = NULL;
293 323
294 if (context->task) 324 task = context->task;
295 clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); 325
326 if (task)
327 clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
296 328
297 if (!context->task || (context->task == current)) 329 /*
298 wrmsrl(MSR_IA32_DS_AREA, 0); 330 * We leave the (now dangling) pointer to the DS configuration in
331 * the DS_AREA msr. This is as good or as bad as replacing it with
332 * NULL - the hardware would crash if we enabled tracing.
333 *
334 * This saves us some problems with having to write an msr on a
335 * different cpu while preventing others from doing the same for the
336 * next context for that same cpu.
337 */
299 338
300 spin_unlock_irqrestore(&ds_lock, irq); 339 spin_unlock_irqrestore(&ds_lock, irq);
301 340
341 /* The context might still be in use for context switching. */
342 if (task && (task != current))
343 wait_task_context_switch(task);
344
302 kfree(context); 345 kfree(context);
303} 346}
304 347
348static void ds_install_ds_area(struct ds_context *context)
349{
350 unsigned long ds;
351
352 ds = (unsigned long)context->ds;
353
354 /*
355 * There is a race between the bts master and the pebs master.
356 *
357 * The thread/cpu access is synchronized via get/put_cpu() for
358 * task tracing and via wrmsr_on_cpu for cpu tracing.
359 *
360 * If bts and pebs are collected for the same task or same cpu,
361 * the same confiuration is written twice.
362 */
363 if (context->task) {
364 get_cpu();
365 if (context->task == current)
366 wrmsrl(MSR_IA32_DS_AREA, ds);
367 set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
368 put_cpu();
369 } else
370 wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
371 (u32)((u64)ds), (u32)((u64)ds >> 32));
372}
305 373
306/* 374/*
307 * Call the tracer's callback on a buffer overflow. 375 * Call the tracer's callback on a buffer overflow.
@@ -332,9 +400,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
332 * The remainder of any partially written record is zeroed out. 400 * The remainder of any partially written record is zeroed out.
333 * 401 *
334 * context: the DS context 402 * context: the DS context
335 * qual: the buffer type 403 * qual: the buffer type
336 * record: the data to write 404 * record: the data to write
337 * size: the size of the data 405 * size: the size of the data
338 */ 406 */
339static int ds_write(struct ds_context *context, enum ds_qualifier qual, 407static int ds_write(struct ds_context *context, enum ds_qualifier qual,
340 const void *record, size_t size) 408 const void *record, size_t size)
@@ -349,14 +417,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
349 unsigned long write_size, adj_write_size; 417 unsigned long write_size, adj_write_size;
350 418
351 /* 419 /*
352 * write as much as possible without producing an 420 * Write as much as possible without producing an
353 * overflow interrupt. 421 * overflow interrupt.
354 * 422 *
355 * interrupt_threshold must either be 423 * Interrupt_threshold must either be
356 * - bigger than absolute_maximum or 424 * - bigger than absolute_maximum or
357 * - point to a record between buffer_base and absolute_maximum 425 * - point to a record between buffer_base and absolute_maximum
358 * 426 *
359 * index points to a valid record. 427 * Index points to a valid record.
360 */ 428 */
361 base = ds_get(context->ds, qual, ds_buffer_base); 429 base = ds_get(context->ds, qual, ds_buffer_base);
362 index = ds_get(context->ds, qual, ds_index); 430 index = ds_get(context->ds, qual, ds_index);
@@ -365,8 +433,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
365 433
366 write_end = min(end, int_th); 434 write_end = min(end, int_th);
367 435
368 /* if we are already beyond the interrupt threshold, 436 /*
369 * we fill the entire buffer */ 437 * If we are already beyond the interrupt threshold,
438 * we fill the entire buffer.
439 */
370 if (write_end <= index) 440 if (write_end <= index)
371 write_end = end; 441 write_end = end;
372 442
@@ -383,7 +453,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
383 adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; 453 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
384 adj_write_size *= ds_cfg.sizeof_rec[qual]; 454 adj_write_size *= ds_cfg.sizeof_rec[qual];
385 455
386 /* zero out trailing bytes */ 456 /* Zero out trailing bytes. */
387 memset((char *)index + write_size, 0, 457 memset((char *)index + write_size, 0,
388 adj_write_size - write_size); 458 adj_write_size - write_size);
389 index += adj_write_size; 459 index += adj_write_size;
@@ -410,7 +480,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
410 * Later architectures use 64bit pointers throughout, whereas earlier 480 * Later architectures use 64bit pointers throughout, whereas earlier
411 * architectures use 32bit pointers in 32bit mode. 481 * architectures use 32bit pointers in 32bit mode.
412 * 482 *
413 * We compute the base address for the first 8 fields based on: 483 * We compute the base address for the fields based on:
414 * - the field size stored in the DS configuration 484 * - the field size stored in the DS configuration
415 * - the relative field position 485 * - the relative field position
416 * 486 *
@@ -431,23 +501,23 @@ enum bts_field {
431 bts_to, 501 bts_to,
432 bts_flags, 502 bts_flags,
433 503
434 bts_qual = bts_from, 504 bts_qual = bts_from,
435 bts_jiffies = bts_to, 505 bts_clock = bts_to,
436 bts_pid = bts_flags, 506 bts_pid = bts_flags,
437 507
438 bts_qual_mask = (bts_qual_max - 1), 508 bts_qual_mask = (bts_qual_max - 1),
439 bts_escape = ((unsigned long)-1 & ~bts_qual_mask) 509 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
440}; 510};
441 511
442static inline unsigned long bts_get(const char *base, enum bts_field field) 512static inline unsigned long bts_get(const char *base, enum bts_field field)
443{ 513{
444 base += (ds_cfg.sizeof_field * field); 514 base += (ds_cfg.sizeof_ptr_field * field);
445 return *(unsigned long *)base; 515 return *(unsigned long *)base;
446} 516}
447 517
448static inline void bts_set(char *base, enum bts_field field, unsigned long val) 518static inline void bts_set(char *base, enum bts_field field, unsigned long val)
449{ 519{
450 base += (ds_cfg.sizeof_field * field);; 520 base += (ds_cfg.sizeof_ptr_field * field);;
451 (*(unsigned long *)base) = val; 521 (*(unsigned long *)base) = val;
452} 522}
453 523
@@ -463,8 +533,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val)
463 * 533 *
464 * return: bytes read/written on success; -Eerrno, otherwise 534 * return: bytes read/written on success; -Eerrno, otherwise
465 */ 535 */
466static int bts_read(struct bts_tracer *tracer, const void *at, 536static int
467 struct bts_struct *out) 537bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
468{ 538{
469 if (!tracer) 539 if (!tracer)
470 return -EINVAL; 540 return -EINVAL;
@@ -478,8 +548,8 @@ static int bts_read(struct bts_tracer *tracer, const void *at,
478 memset(out, 0, sizeof(*out)); 548 memset(out, 0, sizeof(*out));
479 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { 549 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
480 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); 550 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
481 out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); 551 out->variant.event.clock = bts_get(at, bts_clock);
482 out->variant.timestamp.pid = bts_get(at, bts_pid); 552 out->variant.event.pid = bts_get(at, bts_pid);
483 } else { 553 } else {
484 out->qualifier = bts_branch; 554 out->qualifier = bts_branch;
485 out->variant.lbr.from = bts_get(at, bts_from); 555 out->variant.lbr.from = bts_get(at, bts_from);
@@ -516,8 +586,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
516 case bts_task_arrives: 586 case bts_task_arrives:
517 case bts_task_departs: 587 case bts_task_departs:
518 bts_set(raw, bts_qual, (bts_escape | in->qualifier)); 588 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
519 bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); 589 bts_set(raw, bts_clock, in->variant.event.clock);
520 bts_set(raw, bts_pid, in->variant.timestamp.pid); 590 bts_set(raw, bts_pid, in->variant.event.pid);
521 break; 591 break;
522 default: 592 default:
523 return -EINVAL; 593 return -EINVAL;
@@ -555,7 +625,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
555 unsigned int flags) { 625 unsigned int flags) {
556 unsigned long buffer, adj; 626 unsigned long buffer, adj;
557 627
558 /* adjust the buffer address and size to meet alignment 628 /*
629 * Adjust the buffer address and size to meet alignment
559 * constraints: 630 * constraints:
560 * - buffer is double-word aligned 631 * - buffer is double-word aligned
561 * - size is multiple of record size 632 * - size is multiple of record size
@@ -577,9 +648,11 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
577 trace->begin = (void *)buffer; 648 trace->begin = (void *)buffer;
578 trace->top = trace->begin; 649 trace->top = trace->begin;
579 trace->end = (void *)(buffer + size); 650 trace->end = (void *)(buffer + size);
580 /* The value for 'no threshold' is -1, which will set the 651 /*
652 * The value for 'no threshold' is -1, which will set the
581 * threshold outside of the buffer, just like we want it. 653 * threshold outside of the buffer, just like we want it.
582 */ 654 */
655 ith *= ds_cfg.sizeof_rec[qual];
583 trace->ith = (void *)(buffer + size - ith); 656 trace->ith = (void *)(buffer + size - ith);
584 657
585 trace->flags = flags; 658 trace->flags = flags;
@@ -588,18 +661,27 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
588 661
589static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, 662static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
590 enum ds_qualifier qual, struct task_struct *task, 663 enum ds_qualifier qual, struct task_struct *task,
591 void *base, size_t size, size_t th, unsigned int flags) 664 int cpu, void *base, size_t size, size_t th)
592{ 665{
593 struct ds_context *context; 666 struct ds_context *context;
594 int error; 667 int error;
668 size_t req_size;
669
670 error = -EOPNOTSUPP;
671 if (!ds_cfg.sizeof_rec[qual])
672 goto out;
595 673
596 error = -EINVAL; 674 error = -EINVAL;
597 if (!base) 675 if (!base)
598 goto out; 676 goto out;
599 677
600 /* we require some space to do alignment adjustments below */ 678 req_size = ds_cfg.sizeof_rec[qual];
679 /* We might need space for alignment adjustments. */
680 if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
681 req_size += DS_ALIGNMENT;
682
601 error = -EINVAL; 683 error = -EINVAL;
602 if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) 684 if (size < req_size)
603 goto out; 685 goto out;
604 686
605 if (th != (size_t)-1) { 687 if (th != (size_t)-1) {
@@ -614,182 +696,318 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
614 tracer->size = size; 696 tracer->size = size;
615 697
616 error = -ENOMEM; 698 error = -ENOMEM;
617 context = ds_get_context(task); 699 context = ds_get_context(task, cpu);
618 if (!context) 700 if (!context)
619 goto out; 701 goto out;
620 tracer->context = context; 702 tracer->context = context;
621 703
622 ds_init_ds_trace(trace, qual, base, size, th, flags); 704 /*
705 * Defer any tracer-specific initialization work for the context until
706 * context ownership has been clarified.
707 */
623 708
624 error = 0; 709 error = 0;
625 out: 710 out:
626 return error; 711 return error;
627} 712}
628 713
629struct bts_tracer *ds_request_bts(struct task_struct *task, 714static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
630 void *base, size_t size, 715 void *base, size_t size,
631 bts_ovfl_callback_t ovfl, size_t th, 716 bts_ovfl_callback_t ovfl, size_t th,
632 unsigned int flags) 717 unsigned int flags)
633{ 718{
634 struct bts_tracer *tracer; 719 struct bts_tracer *tracer;
635 unsigned long irq;
636 int error; 720 int error;
637 721
722 /* Buffer overflow notification is not yet implemented. */
638 error = -EOPNOTSUPP; 723 error = -EOPNOTSUPP;
639 if (!ds_cfg.ctl[dsf_bts]) 724 if (ovfl)
640 goto out; 725 goto out;
641 726
642 /* buffer overflow notification is not yet implemented */ 727 error = get_tracer(task);
643 error = -EOPNOTSUPP; 728 if (error < 0)
644 if (ovfl)
645 goto out; 729 goto out;
646 730
647 error = -ENOMEM; 731 error = -ENOMEM;
648 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); 732 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
649 if (!tracer) 733 if (!tracer)
650 goto out; 734 goto out_put_tracer;
651 tracer->ovfl = ovfl; 735 tracer->ovfl = ovfl;
652 736
737 /* Do some more error checking and acquire a tracing context. */
653 error = ds_request(&tracer->ds, &tracer->trace.ds, 738 error = ds_request(&tracer->ds, &tracer->trace.ds,
654 ds_bts, task, base, size, th, flags); 739 ds_bts, task, cpu, base, size, th);
655 if (error < 0) 740 if (error < 0)
656 goto out_tracer; 741 goto out_tracer;
657 742
658 743 /* Claim the bts part of the tracing context we acquired above. */
659 spin_lock_irqsave(&ds_lock, irq); 744 spin_lock_irq(&ds_lock);
660
661 error = -EPERM;
662 if (!check_tracer(task))
663 goto out_unlock;
664 get_tracer(task);
665 745
666 error = -EPERM; 746 error = -EPERM;
667 if (tracer->ds.context->bts_master) 747 if (tracer->ds.context->bts_master)
668 goto out_put_tracer; 748 goto out_unlock;
669 tracer->ds.context->bts_master = tracer; 749 tracer->ds.context->bts_master = tracer;
670 750
671 spin_unlock_irqrestore(&ds_lock, irq); 751 spin_unlock_irq(&ds_lock);
672 752
753 /*
754 * Now that we own the bts part of the context, let's complete the
755 * initialization for that part.
756 */
757 ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
758 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
759 ds_install_ds_area(tracer->ds.context);
673 760
674 tracer->trace.read = bts_read; 761 tracer->trace.read = bts_read;
675 tracer->trace.write = bts_write; 762 tracer->trace.write = bts_write;
676 763
677 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); 764 /* Start tracing. */
678 ds_resume_bts(tracer); 765 ds_resume_bts(tracer);
679 766
680 return tracer; 767 return tracer;
681 768
682 out_put_tracer:
683 put_tracer(task);
684 out_unlock: 769 out_unlock:
685 spin_unlock_irqrestore(&ds_lock, irq); 770 spin_unlock_irq(&ds_lock);
686 ds_put_context(tracer->ds.context); 771 ds_put_context(tracer->ds.context);
687 out_tracer: 772 out_tracer:
688 kfree(tracer); 773 kfree(tracer);
774 out_put_tracer:
775 put_tracer(task);
689 out: 776 out:
690 return ERR_PTR(error); 777 return ERR_PTR(error);
691} 778}
692 779
693struct pebs_tracer *ds_request_pebs(struct task_struct *task, 780struct bts_tracer *ds_request_bts_task(struct task_struct *task,
694 void *base, size_t size, 781 void *base, size_t size,
695 pebs_ovfl_callback_t ovfl, size_t th, 782 bts_ovfl_callback_t ovfl,
696 unsigned int flags) 783 size_t th, unsigned int flags)
784{
785 return ds_request_bts(task, 0, base, size, ovfl, th, flags);
786}
787
788struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
789 bts_ovfl_callback_t ovfl,
790 size_t th, unsigned int flags)
791{
792 return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
793}
794
795static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
796 void *base, size_t size,
797 pebs_ovfl_callback_t ovfl, size_t th,
798 unsigned int flags)
697{ 799{
698 struct pebs_tracer *tracer; 800 struct pebs_tracer *tracer;
699 unsigned long irq;
700 int error; 801 int error;
701 802
702 /* buffer overflow notification is not yet implemented */ 803 /* Buffer overflow notification is not yet implemented. */
703 error = -EOPNOTSUPP; 804 error = -EOPNOTSUPP;
704 if (ovfl) 805 if (ovfl)
705 goto out; 806 goto out;
706 807
808 error = get_tracer(task);
809 if (error < 0)
810 goto out;
811
707 error = -ENOMEM; 812 error = -ENOMEM;
708 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); 813 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
709 if (!tracer) 814 if (!tracer)
710 goto out; 815 goto out_put_tracer;
711 tracer->ovfl = ovfl; 816 tracer->ovfl = ovfl;
712 817
818 /* Do some more error checking and acquire a tracing context. */
713 error = ds_request(&tracer->ds, &tracer->trace.ds, 819 error = ds_request(&tracer->ds, &tracer->trace.ds,
714 ds_pebs, task, base, size, th, flags); 820 ds_pebs, task, cpu, base, size, th);
715 if (error < 0) 821 if (error < 0)
716 goto out_tracer; 822 goto out_tracer;
717 823
718 spin_lock_irqsave(&ds_lock, irq); 824 /* Claim the pebs part of the tracing context we acquired above. */
719 825 spin_lock_irq(&ds_lock);
720 error = -EPERM;
721 if (!check_tracer(task))
722 goto out_unlock;
723 get_tracer(task);
724 826
725 error = -EPERM; 827 error = -EPERM;
726 if (tracer->ds.context->pebs_master) 828 if (tracer->ds.context->pebs_master)
727 goto out_put_tracer; 829 goto out_unlock;
728 tracer->ds.context->pebs_master = tracer; 830 tracer->ds.context->pebs_master = tracer;
729 831
730 spin_unlock_irqrestore(&ds_lock, irq); 832 spin_unlock_irq(&ds_lock);
731 833
834 /*
835 * Now that we own the pebs part of the context, let's complete the
836 * initialization for that part.
837 */
838 ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); 839 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
840 ds_install_ds_area(tracer->ds.context);
841
842 /* Start tracing. */
733 ds_resume_pebs(tracer); 843 ds_resume_pebs(tracer);
734 844
735 return tracer; 845 return tracer;
736 846
737 out_put_tracer:
738 put_tracer(task);
739 out_unlock: 847 out_unlock:
740 spin_unlock_irqrestore(&ds_lock, irq); 848 spin_unlock_irq(&ds_lock);
741 ds_put_context(tracer->ds.context); 849 ds_put_context(tracer->ds.context);
742 out_tracer: 850 out_tracer:
743 kfree(tracer); 851 kfree(tracer);
852 out_put_tracer:
853 put_tracer(task);
744 out: 854 out:
745 return ERR_PTR(error); 855 return ERR_PTR(error);
746} 856}
747 857
748void ds_release_bts(struct bts_tracer *tracer) 858struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
859 void *base, size_t size,
860 pebs_ovfl_callback_t ovfl,
861 size_t th, unsigned int flags)
749{ 862{
750 if (!tracer) 863 return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
751 return; 864}
752 865
753 ds_suspend_bts(tracer); 866struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
867 pebs_ovfl_callback_t ovfl,
868 size_t th, unsigned int flags)
869{
870 return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
871}
872
873static void ds_free_bts(struct bts_tracer *tracer)
874{
875 struct task_struct *task;
876
877 task = tracer->ds.context->task;
754 878
755 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); 879 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
756 tracer->ds.context->bts_master = NULL; 880 tracer->ds.context->bts_master = NULL;
757 881
758 put_tracer(tracer->ds.context->task); 882 /* Make sure tracing stopped and the tracer is not in use. */
883 if (task && (task != current))
884 wait_task_context_switch(task);
885
759 ds_put_context(tracer->ds.context); 886 ds_put_context(tracer->ds.context);
887 put_tracer(task);
760 888
761 kfree(tracer); 889 kfree(tracer);
762} 890}
763 891
892void ds_release_bts(struct bts_tracer *tracer)
893{
894 might_sleep();
895
896 if (!tracer)
897 return;
898
899 ds_suspend_bts(tracer);
900 ds_free_bts(tracer);
901}
902
903int ds_release_bts_noirq(struct bts_tracer *tracer)
904{
905 struct task_struct *task;
906 unsigned long irq;
907 int error;
908
909 if (!tracer)
910 return 0;
911
912 task = tracer->ds.context->task;
913
914 local_irq_save(irq);
915
916 error = -EPERM;
917 if (!task &&
918 (tracer->ds.context->cpu != smp_processor_id()))
919 goto out;
920
921 error = -EPERM;
922 if (task && (task != current))
923 goto out;
924
925 ds_suspend_bts_noirq(tracer);
926 ds_free_bts(tracer);
927
928 error = 0;
929 out:
930 local_irq_restore(irq);
931 return error;
932}
933
934static void update_task_debugctlmsr(struct task_struct *task,
935 unsigned long debugctlmsr)
936{
937 task->thread.debugctlmsr = debugctlmsr;
938
939 get_cpu();
940 if (task == current)
941 update_debugctlmsr(debugctlmsr);
942 put_cpu();
943}
944
764void ds_suspend_bts(struct bts_tracer *tracer) 945void ds_suspend_bts(struct bts_tracer *tracer)
765{ 946{
766 struct task_struct *task; 947 struct task_struct *task;
948 unsigned long debugctlmsr;
949 int cpu;
767 950
768 if (!tracer) 951 if (!tracer)
769 return; 952 return;
770 953
954 tracer->flags = 0;
955
771 task = tracer->ds.context->task; 956 task = tracer->ds.context->task;
957 cpu = tracer->ds.context->cpu;
772 958
773 if (!task || (task == current)) 959 WARN_ON(!task && irqs_disabled());
774 update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
775 960
776 if (task) { 961 debugctlmsr = (task ?
777 task->thread.debugctlmsr &= ~BTS_CONTROL; 962 task->thread.debugctlmsr :
963 get_debugctlmsr_on_cpu(cpu));
964 debugctlmsr &= ~BTS_CONTROL;
778 965
779 if (!task->thread.debugctlmsr) 966 if (task)
780 clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); 967 update_task_debugctlmsr(task, debugctlmsr);
781 } 968 else
969 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
782} 970}
783 971
784void ds_resume_bts(struct bts_tracer *tracer) 972int ds_suspend_bts_noirq(struct bts_tracer *tracer)
785{ 973{
786 struct task_struct *task; 974 struct task_struct *task;
787 unsigned long control; 975 unsigned long debugctlmsr, irq;
976 int cpu, error = 0;
788 977
789 if (!tracer) 978 if (!tracer)
790 return; 979 return 0;
980
981 tracer->flags = 0;
791 982
792 task = tracer->ds.context->task; 983 task = tracer->ds.context->task;
984 cpu = tracer->ds.context->cpu;
985
986 local_irq_save(irq);
987
988 error = -EPERM;
989 if (!task && (cpu != smp_processor_id()))
990 goto out;
991
992 debugctlmsr = (task ?
993 task->thread.debugctlmsr :
994 get_debugctlmsr());
995 debugctlmsr &= ~BTS_CONTROL;
996
997 if (task)
998 update_task_debugctlmsr(task, debugctlmsr);
999 else
1000 update_debugctlmsr(debugctlmsr);
1001
1002 error = 0;
1003 out:
1004 local_irq_restore(irq);
1005 return error;
1006}
1007
1008static unsigned long ds_bts_control(struct bts_tracer *tracer)
1009{
1010 unsigned long control;
793 1011
794 control = ds_cfg.ctl[dsf_bts]; 1012 control = ds_cfg.ctl[dsf_bts];
795 if (!(tracer->trace.ds.flags & BTS_KERNEL)) 1013 if (!(tracer->trace.ds.flags & BTS_KERNEL))
@@ -797,41 +1015,149 @@ void ds_resume_bts(struct bts_tracer *tracer)
797 if (!(tracer->trace.ds.flags & BTS_USER)) 1015 if (!(tracer->trace.ds.flags & BTS_USER))
798 control |= ds_cfg.ctl[dsf_bts_user]; 1016 control |= ds_cfg.ctl[dsf_bts_user];
799 1017
800 if (task) { 1018 return control;
801 task->thread.debugctlmsr |= control;
802 set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
803 }
804
805 if (!task || (task == current))
806 update_debugctlmsr(get_debugctlmsr() | control);
807} 1019}
808 1020
809void ds_release_pebs(struct pebs_tracer *tracer) 1021void ds_resume_bts(struct bts_tracer *tracer)
810{ 1022{
1023 struct task_struct *task;
1024 unsigned long debugctlmsr;
1025 int cpu;
1026
811 if (!tracer) 1027 if (!tracer)
812 return; 1028 return;
813 1029
814 ds_suspend_pebs(tracer); 1030 tracer->flags = tracer->trace.ds.flags;
1031
1032 task = tracer->ds.context->task;
1033 cpu = tracer->ds.context->cpu;
1034
1035 WARN_ON(!task && irqs_disabled());
1036
1037 debugctlmsr = (task ?
1038 task->thread.debugctlmsr :
1039 get_debugctlmsr_on_cpu(cpu));
1040 debugctlmsr |= ds_bts_control(tracer);
1041
1042 if (task)
1043 update_task_debugctlmsr(task, debugctlmsr);
1044 else
1045 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
1046}
1047
1048int ds_resume_bts_noirq(struct bts_tracer *tracer)
1049{
1050 struct task_struct *task;
1051 unsigned long debugctlmsr, irq;
1052 int cpu, error = 0;
1053
1054 if (!tracer)
1055 return 0;
1056
1057 tracer->flags = tracer->trace.ds.flags;
1058
1059 task = tracer->ds.context->task;
1060 cpu = tracer->ds.context->cpu;
1061
1062 local_irq_save(irq);
1063
1064 error = -EPERM;
1065 if (!task && (cpu != smp_processor_id()))
1066 goto out;
1067
1068 debugctlmsr = (task ?
1069 task->thread.debugctlmsr :
1070 get_debugctlmsr());
1071 debugctlmsr |= ds_bts_control(tracer);
1072
1073 if (task)
1074 update_task_debugctlmsr(task, debugctlmsr);
1075 else
1076 update_debugctlmsr(debugctlmsr);
1077
1078 error = 0;
1079 out:
1080 local_irq_restore(irq);
1081 return error;
1082}
1083
1084static void ds_free_pebs(struct pebs_tracer *tracer)
1085{
1086 struct task_struct *task;
1087
1088 task = tracer->ds.context->task;
815 1089
816 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); 1090 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
817 tracer->ds.context->pebs_master = NULL; 1091 tracer->ds.context->pebs_master = NULL;
818 1092
819 put_tracer(tracer->ds.context->task);
820 ds_put_context(tracer->ds.context); 1093 ds_put_context(tracer->ds.context);
1094 put_tracer(task);
821 1095
822 kfree(tracer); 1096 kfree(tracer);
823} 1097}
824 1098
1099void ds_release_pebs(struct pebs_tracer *tracer)
1100{
1101 might_sleep();
1102
1103 if (!tracer)
1104 return;
1105
1106 ds_suspend_pebs(tracer);
1107 ds_free_pebs(tracer);
1108}
1109
1110int ds_release_pebs_noirq(struct pebs_tracer *tracer)
1111{
1112 struct task_struct *task;
1113 unsigned long irq;
1114 int error;
1115
1116 if (!tracer)
1117 return 0;
1118
1119 task = tracer->ds.context->task;
1120
1121 local_irq_save(irq);
1122
1123 error = -EPERM;
1124 if (!task &&
1125 (tracer->ds.context->cpu != smp_processor_id()))
1126 goto out;
1127
1128 error = -EPERM;
1129 if (task && (task != current))
1130 goto out;
1131
1132 ds_suspend_pebs_noirq(tracer);
1133 ds_free_pebs(tracer);
1134
1135 error = 0;
1136 out:
1137 local_irq_restore(irq);
1138 return error;
1139}
1140
825void ds_suspend_pebs(struct pebs_tracer *tracer) 1141void ds_suspend_pebs(struct pebs_tracer *tracer)
826{ 1142{
827 1143
828} 1144}
829 1145
1146int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
1147{
1148 return 0;
1149}
1150
830void ds_resume_pebs(struct pebs_tracer *tracer) 1151void ds_resume_pebs(struct pebs_tracer *tracer)
831{ 1152{
832 1153
833} 1154}
834 1155
1156int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
1157{
1158 return 0;
1159}
1160
835const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) 1161const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
836{ 1162{
837 if (!tracer) 1163 if (!tracer)
@@ -847,8 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
847 return NULL; 1173 return NULL;
848 1174
849 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); 1175 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
850 tracer->trace.reset_value = 1176
851 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); 1177 tracer->trace.counters = ds_cfg.nr_counter_reset;
1178 memcpy(tracer->trace.counter_reset,
1179 tracer->ds.context->ds +
1180 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
1181 ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
852 1182
853 return &tracer->trace; 1183 return &tracer->trace;
854} 1184}
@@ -873,18 +1203,24 @@ int ds_reset_pebs(struct pebs_tracer *tracer)
873 1203
874 tracer->trace.ds.top = tracer->trace.ds.begin; 1204 tracer->trace.ds.top = tracer->trace.ds.begin;
875 1205
876 ds_set(tracer->ds.context->ds, ds_bts, ds_index, 1206 ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
877 (unsigned long)tracer->trace.ds.top); 1207 (unsigned long)tracer->trace.ds.top);
878 1208
879 return 0; 1209 return 0;
880} 1210}
881 1211
882int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) 1212int ds_set_pebs_reset(struct pebs_tracer *tracer,
1213 unsigned int counter, u64 value)
883{ 1214{
884 if (!tracer) 1215 if (!tracer)
885 return -EINVAL; 1216 return -EINVAL;
886 1217
887 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; 1218 if (ds_cfg.nr_counter_reset < counter)
1219 return -EINVAL;
1220
1221 *(u64 *)(tracer->ds.context->ds +
1222 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
1223 (counter * PEBS_RESET_FIELD_SIZE)) = value;
888 1224
889 return 0; 1225 return 0;
890} 1226}
@@ -894,73 +1230,117 @@ static const struct ds_configuration ds_cfg_netburst = {
894 .ctl[dsf_bts] = (1 << 2) | (1 << 3), 1230 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
895 .ctl[dsf_bts_kernel] = (1 << 5), 1231 .ctl[dsf_bts_kernel] = (1 << 5),
896 .ctl[dsf_bts_user] = (1 << 6), 1232 .ctl[dsf_bts_user] = (1 << 6),
897 1233 .nr_counter_reset = 1,
898 .sizeof_field = sizeof(long),
899 .sizeof_rec[ds_bts] = sizeof(long) * 3,
900#ifdef __i386__
901 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
902#else
903 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
904#endif
905}; 1234};
906static const struct ds_configuration ds_cfg_pentium_m = { 1235static const struct ds_configuration ds_cfg_pentium_m = {
907 .name = "Pentium M", 1236 .name = "Pentium M",
908 .ctl[dsf_bts] = (1 << 6) | (1 << 7), 1237 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
909 1238 .nr_counter_reset = 1,
910 .sizeof_field = sizeof(long),
911 .sizeof_rec[ds_bts] = sizeof(long) * 3,
912#ifdef __i386__
913 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
914#else
915 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
916#endif
917}; 1239};
918static const struct ds_configuration ds_cfg_core2_atom = { 1240static const struct ds_configuration ds_cfg_core2_atom = {
919 .name = "Core 2/Atom", 1241 .name = "Core 2/Atom",
920 .ctl[dsf_bts] = (1 << 6) | (1 << 7), 1242 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
921 .ctl[dsf_bts_kernel] = (1 << 9), 1243 .ctl[dsf_bts_kernel] = (1 << 9),
922 .ctl[dsf_bts_user] = (1 << 10), 1244 .ctl[dsf_bts_user] = (1 << 10),
923 1245 .nr_counter_reset = 1,
924 .sizeof_field = 8, 1246};
925 .sizeof_rec[ds_bts] = 8 * 3, 1247static const struct ds_configuration ds_cfg_core_i7 = {
926 .sizeof_rec[ds_pebs] = 8 * 18, 1248 .name = "Core i7",
1249 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1250 .ctl[dsf_bts_kernel] = (1 << 9),
1251 .ctl[dsf_bts_user] = (1 << 10),
1252 .nr_counter_reset = 4,
927}; 1253};
928 1254
929static void 1255static void
930ds_configure(const struct ds_configuration *cfg) 1256ds_configure(const struct ds_configuration *cfg,
1257 struct cpuinfo_x86 *cpu)
931{ 1258{
1259 unsigned long nr_pebs_fields = 0;
1260
1261 printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
1262
1263#ifdef __i386__
1264 nr_pebs_fields = 10;
1265#else
1266 nr_pebs_fields = 18;
1267#endif
1268
1269 /*
1270 * Starting with version 2, architectural performance
1271 * monitoring supports a format specifier.
1272 */
1273 if ((cpuid_eax(0xa) & 0xff) > 1) {
1274 unsigned long perf_capabilities, format;
1275
1276 rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
1277
1278 format = (perf_capabilities >> 8) & 0xf;
1279
1280 switch (format) {
1281 case 0:
1282 nr_pebs_fields = 18;
1283 break;
1284 case 1:
1285 nr_pebs_fields = 22;
1286 break;
1287 default:
1288 printk(KERN_INFO
1289 "[ds] unknown PEBS format: %lu\n", format);
1290 nr_pebs_fields = 0;
1291 break;
1292 }
1293 }
1294
932 memset(&ds_cfg, 0, sizeof(ds_cfg)); 1295 memset(&ds_cfg, 0, sizeof(ds_cfg));
933 ds_cfg = *cfg; 1296 ds_cfg = *cfg;
934 1297
935 printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); 1298 ds_cfg.sizeof_ptr_field =
1299 (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
1300
1301 ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
1302 ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
936 1303
937 if (!cpu_has_bts) { 1304 if (!cpu_has(cpu, X86_FEATURE_BTS)) {
938 ds_cfg.ctl[dsf_bts] = 0; 1305 ds_cfg.sizeof_rec[ds_bts] = 0;
939 printk(KERN_INFO "[ds] bts not available\n"); 1306 printk(KERN_INFO "[ds] bts not available\n");
940 } 1307 }
941 if (!cpu_has_pebs) 1308 if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
1309 ds_cfg.sizeof_rec[ds_pebs] = 0;
942 printk(KERN_INFO "[ds] pebs not available\n"); 1310 printk(KERN_INFO "[ds] pebs not available\n");
1311 }
1312
1313 printk(KERN_INFO "[ds] sizes: address: %u bit, ",
1314 8 * ds_cfg.sizeof_ptr_field);
1315 printk("bts/pebs record: %u/%u bytes\n",
1316 ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
943 1317
944 WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); 1318 WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
945} 1319}
946 1320
947void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) 1321void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
948{ 1322{
1323 /* Only configure the first cpu. Others are identical. */
1324 if (ds_cfg.name)
1325 return;
1326
949 switch (c->x86) { 1327 switch (c->x86) {
950 case 0x6: 1328 case 0x6:
951 switch (c->x86_model) { 1329 switch (c->x86_model) {
952 case 0x9: 1330 case 0x9:
953 case 0xd: /* Pentium M */ 1331 case 0xd: /* Pentium M */
954 ds_configure(&ds_cfg_pentium_m); 1332 ds_configure(&ds_cfg_pentium_m, c);
955 break; 1333 break;
956 case 0xf: 1334 case 0xf:
957 case 0x17: /* Core2 */ 1335 case 0x17: /* Core2 */
958 case 0x1c: /* Atom */ 1336 case 0x1c: /* Atom */
959 ds_configure(&ds_cfg_core2_atom); 1337 ds_configure(&ds_cfg_core2_atom, c);
1338 break;
1339 case 0x1a: /* Core i7 */
1340 ds_configure(&ds_cfg_core_i7, c);
960 break; 1341 break;
961 case 0x1a: /* i7 */
962 default: 1342 default:
963 /* sorry, don't know about them */ 1343 /* Sorry, don't know about them. */
964 break; 1344 break;
965 } 1345 }
966 break; 1346 break;
@@ -969,64 +1349,89 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
969 case 0x0: 1349 case 0x0:
970 case 0x1: 1350 case 0x1:
971 case 0x2: /* Netburst */ 1351 case 0x2: /* Netburst */
972 ds_configure(&ds_cfg_netburst); 1352 ds_configure(&ds_cfg_netburst, c);
973 break; 1353 break;
974 default: 1354 default:
975 /* sorry, don't know about them */ 1355 /* Sorry, don't know about them. */
976 break; 1356 break;
977 } 1357 }
978 break; 1358 break;
979 default: 1359 default:
980 /* sorry, don't know about them */ 1360 /* Sorry, don't know about them. */
981 break; 1361 break;
982 } 1362 }
983} 1363}
984 1364
1365static inline void ds_take_timestamp(struct ds_context *context,
1366 enum bts_qualifier qualifier,
1367 struct task_struct *task)
1368{
1369 struct bts_tracer *tracer = context->bts_master;
1370 struct bts_struct ts;
1371
1372 /* Prevent compilers from reading the tracer pointer twice. */
1373 barrier();
1374
1375 if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
1376 return;
1377
1378 memset(&ts, 0, sizeof(ts));
1379 ts.qualifier = qualifier;
1380 ts.variant.event.clock = trace_clock_global();
1381 ts.variant.event.pid = task->pid;
1382
1383 bts_write(tracer, &ts);
1384}
1385
985/* 1386/*
986 * Change the DS configuration from tracing prev to tracing next. 1387 * Change the DS configuration from tracing prev to tracing next.
987 */ 1388 */
988void ds_switch_to(struct task_struct *prev, struct task_struct *next) 1389void ds_switch_to(struct task_struct *prev, struct task_struct *next)
989{ 1390{
990 struct ds_context *prev_ctx = prev->thread.ds_ctx; 1391 struct ds_context *prev_ctx = prev->thread.ds_ctx;
991 struct ds_context *next_ctx = next->thread.ds_ctx; 1392 struct ds_context *next_ctx = next->thread.ds_ctx;
1393 unsigned long debugctlmsr = next->thread.debugctlmsr;
1394
1395 /* Make sure all data is read before we start. */
1396 barrier();
992 1397
993 if (prev_ctx) { 1398 if (prev_ctx) {
994 update_debugctlmsr(0); 1399 update_debugctlmsr(0);
995 1400
996 if (prev_ctx->bts_master && 1401 ds_take_timestamp(prev_ctx, bts_task_departs, prev);
997 (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
998 struct bts_struct ts = {
999 .qualifier = bts_task_departs,
1000 .variant.timestamp.jiffies = jiffies_64,
1001 .variant.timestamp.pid = prev->pid
1002 };
1003 bts_write(prev_ctx->bts_master, &ts);
1004 }
1005 } 1402 }
1006 1403
1007 if (next_ctx) { 1404 if (next_ctx) {
1008 if (next_ctx->bts_master && 1405 ds_take_timestamp(next_ctx, bts_task_arrives, next);
1009 (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
1010 struct bts_struct ts = {
1011 .qualifier = bts_task_arrives,
1012 .variant.timestamp.jiffies = jiffies_64,
1013 .variant.timestamp.pid = next->pid
1014 };
1015 bts_write(next_ctx->bts_master, &ts);
1016 }
1017 1406
1018 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); 1407 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
1019 } 1408 }
1020 1409
1021 update_debugctlmsr(next->thread.debugctlmsr); 1410 update_debugctlmsr(debugctlmsr);
1022} 1411}
1023 1412
1024void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) 1413static __init int ds_selftest(void)
1025{ 1414{
1026 clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); 1415 if (ds_cfg.sizeof_rec[ds_bts]) {
1027 tsk->thread.ds_ctx = NULL; 1416 int error;
1028}
1029 1417
1030void ds_exit_thread(struct task_struct *tsk) 1418 error = ds_selftest_bts();
1031{ 1419 if (error) {
1420 WARN(1, "[ds] selftest failed. disabling bts.\n");
1421 ds_cfg.sizeof_rec[ds_bts] = 0;
1422 }
1423 }
1424
1425 if (ds_cfg.sizeof_rec[ds_pebs]) {
1426 int error;
1427
1428 error = ds_selftest_pebs();
1429 if (error) {
1430 WARN(1, "[ds] selftest failed. disabling pebs.\n");
1431 ds_cfg.sizeof_rec[ds_pebs] = 0;
1432 }
1433 }
1434
1435 return 0;
1032} 1436}
1437device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
new file mode 100644
index 000000000000..6bc7c199ab99
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.c
@@ -0,0 +1,408 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#include "ds_selftest.h"
10
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/smp.h>
14#include <linux/cpu.h>
15
16#include <asm/ds.h>
17
18
19#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */
20#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */
21
22struct ds_selftest_bts_conf {
23 struct bts_tracer *tracer;
24 int error;
25 int (*suspend)(struct bts_tracer *);
26 int (*resume)(struct bts_tracer *);
27};
28
29static int ds_selftest_bts_consistency(const struct bts_trace *trace)
30{
31 int error = 0;
32
33 if (!trace) {
34 printk(KERN_CONT "failed to access trace...");
35 /* Bail out. Other tests are pointless. */
36 return -1;
37 }
38
39 if (!trace->read) {
40 printk(KERN_CONT "bts read not available...");
41 error = -1;
42 }
43
44 /* Do some sanity checks on the trace configuration. */
45 if (!trace->ds.n) {
46 printk(KERN_CONT "empty bts buffer...");
47 error = -1;
48 }
49 if (!trace->ds.size) {
50 printk(KERN_CONT "bad bts trace setup...");
51 error = -1;
52 }
53 if (trace->ds.end !=
54 (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
55 printk(KERN_CONT "bad bts buffer setup...");
56 error = -1;
57 }
58 /*
59 * We allow top in [begin; end], since its not clear when the
60 * overflow adjustment happens: after the increment or before the
61 * write.
62 */
63 if ((trace->ds.top < trace->ds.begin) ||
64 (trace->ds.end < trace->ds.top)) {
65 printk(KERN_CONT "bts top out of bounds...");
66 error = -1;
67 }
68
69 return error;
70}
71
72static int ds_selftest_bts_read(struct bts_tracer *tracer,
73 const struct bts_trace *trace,
74 const void *from, const void *to)
75{
76 const unsigned char *at;
77
78 /*
79 * Check a few things which do not belong to this test.
80 * They should be covered by other tests.
81 */
82 if (!trace)
83 return -1;
84
85 if (!trace->read)
86 return -1;
87
88 if (to < from)
89 return -1;
90
91 if (from < trace->ds.begin)
92 return -1;
93
94 if (trace->ds.end < to)
95 return -1;
96
97 if (!trace->ds.size)
98 return -1;
99
100 /* Now to the test itself. */
101 for (at = from; (void *)at < to; at += trace->ds.size) {
102 struct bts_struct bts;
103 unsigned long index;
104 int error;
105
106 if (((void *)at - trace->ds.begin) % trace->ds.size) {
107 printk(KERN_CONT
108 "read from non-integer index...");
109 return -1;
110 }
111 index = ((void *)at - trace->ds.begin) / trace->ds.size;
112
113 memset(&bts, 0, sizeof(bts));
114 error = trace->read(tracer, at, &bts);
115 if (error < 0) {
116 printk(KERN_CONT
117 "error reading bts trace at [%lu] (0x%p)...",
118 index, at);
119 return error;
120 }
121
122 switch (bts.qualifier) {
123 case BTS_BRANCH:
124 break;
125 default:
126 printk(KERN_CONT
127 "unexpected bts entry %llu at [%lu] (0x%p)...",
128 bts.qualifier, index, at);
129 return -1;
130 }
131 }
132
133 return 0;
134}
135
136static void ds_selftest_bts_cpu(void *arg)
137{
138 struct ds_selftest_bts_conf *conf = arg;
139 const struct bts_trace *trace;
140 void *top;
141
142 if (IS_ERR(conf->tracer)) {
143 conf->error = PTR_ERR(conf->tracer);
144 conf->tracer = NULL;
145
146 printk(KERN_CONT
147 "initialization failed (err: %d)...", conf->error);
148 return;
149 }
150
151 /* We should meanwhile have enough trace. */
152 conf->error = conf->suspend(conf->tracer);
153 if (conf->error < 0)
154 return;
155
156 /* Let's see if we can access the trace. */
157 trace = ds_read_bts(conf->tracer);
158
159 conf->error = ds_selftest_bts_consistency(trace);
160 if (conf->error < 0)
161 return;
162
163 /* If everything went well, we should have a few trace entries. */
164 if (trace->ds.top == trace->ds.begin) {
165 /*
166 * It is possible but highly unlikely that we got a
167 * buffer overflow and end up at exactly the same
168 * position we started from.
169 * Let's issue a warning, but continue.
170 */
171 printk(KERN_CONT "no trace/overflow...");
172 }
173
174 /* Let's try to read the trace we collected. */
175 conf->error =
176 ds_selftest_bts_read(conf->tracer, trace,
177 trace->ds.begin, trace->ds.top);
178 if (conf->error < 0)
179 return;
180
181 /*
182 * Let's read the trace again.
183 * Since we suspended tracing, we should get the same result.
184 */
185 top = trace->ds.top;
186
187 trace = ds_read_bts(conf->tracer);
188 conf->error = ds_selftest_bts_consistency(trace);
189 if (conf->error < 0)
190 return;
191
192 if (top != trace->ds.top) {
193 printk(KERN_CONT "suspend not working...");
194 conf->error = -1;
195 return;
196 }
197
198 /* Let's collect some more trace - see if resume is working. */
199 conf->error = conf->resume(conf->tracer);
200 if (conf->error < 0)
201 return;
202
203 conf->error = conf->suspend(conf->tracer);
204 if (conf->error < 0)
205 return;
206
207 trace = ds_read_bts(conf->tracer);
208
209 conf->error = ds_selftest_bts_consistency(trace);
210 if (conf->error < 0)
211 return;
212
213 if (trace->ds.top == top) {
214 /*
215 * It is possible but highly unlikely that we got a
216 * buffer overflow and end up at exactly the same
217 * position we started from.
218 * Let's issue a warning and check the full trace.
219 */
220 printk(KERN_CONT
221 "no resume progress/overflow...");
222
223 conf->error =
224 ds_selftest_bts_read(conf->tracer, trace,
225 trace->ds.begin, trace->ds.end);
226 } else if (trace->ds.top < top) {
227 /*
228 * We had a buffer overflow - the entire buffer should
229 * contain trace records.
230 */
231 conf->error =
232 ds_selftest_bts_read(conf->tracer, trace,
233 trace->ds.begin, trace->ds.end);
234 } else {
235 /*
236 * It is quite likely that the buffer did not overflow.
237 * Let's just check the delta trace.
238 */
239 conf->error =
240 ds_selftest_bts_read(conf->tracer, trace, top,
241 trace->ds.top);
242 }
243 if (conf->error < 0)
244 return;
245
246 conf->error = 0;
247}
248
/*
 * Adapt ds_suspend_bts() to the int-returning suspend callback slot in
 * struct ds_selftest_bts_conf.
 *
 * Always returns 0; ds_suspend_bts() itself reports no error here.
 */
static int ds_suspend_bts_wrap(struct bts_tracer *bts)
{
	ds_suspend_bts(bts);

	return 0;
}
254
/*
 * Adapt ds_resume_bts() to the int-returning resume callback slot in
 * struct ds_selftest_bts_conf.
 *
 * Always returns 0; ds_resume_bts() itself reports no error here.
 */
static int ds_resume_bts_wrap(struct bts_tracer *bts)
{
	ds_resume_bts(bts);

	return 0;
}
260
/*
 * smp_call_function_single() callback: release a BTS tracer on the cpu
 * it was requested on. The release result is deliberately ignored.
 */
static void ds_release_bts_noirq_wrap(void *tracer)
{
	struct bts_tracer *bts = tracer;

	(void)ds_release_bts_noirq(bts);
}
265
266static int ds_selftest_bts_bad_release_noirq(int cpu,
267 struct bts_tracer *tracer)
268{
269 int error = -EPERM;
270
271 /* Try to release the tracer on the wrong cpu. */
272 get_cpu();
273 if (cpu != smp_processor_id()) {
274 error = ds_release_bts_noirq(tracer);
275 if (error != -EPERM)
276 printk(KERN_CONT "release on wrong cpu...");
277 }
278 put_cpu();
279
280 return error ? 0 : -1;
281}
282
283static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
284{
285 struct bts_tracer *tracer;
286 int error;
287
288 /* Try to request cpu tracing while task tracing is active. */
289 tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
290 (size_t)-1, BTS_KERNEL);
291 error = PTR_ERR(tracer);
292 if (!IS_ERR(tracer)) {
293 ds_release_bts(tracer);
294 error = 0;
295 }
296
297 if (error != -EPERM)
298 printk(KERN_CONT "cpu/task tracing overlap...");
299
300 return error ? 0 : -1;
301}
302
/*
 * Check that requesting task tracing while cpu tracing is active is
 * rejected with -EPERM.
 *
 * Returns 0 if the overlapping request was rejected as expected,
 * -1 otherwise.
 */
static int ds_selftest_bts_bad_request_task(void *buffer)
{
	struct bts_tracer *tracer;
	int error;

	/* Try to request task tracing while cpu tracing is active. */
	tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
				     (size_t)-1, BTS_KERNEL);
	error = PTR_ERR(tracer);
	if (!IS_ERR(tracer)) {
		error = 0;
		/* The overlap was wrongly accepted; clean up the tracer. */
		ds_release_bts(tracer);
	}

	if (error != -EPERM)
		printk(KERN_CONT "task/cpu tracing overlap...");

	return error ? 0 : -1;
}
322
/*
 * BTS selftest entry point.
 *
 * Exercises BTS tracing for every online cpu and for the current task,
 * in both the normal and the _noirq variants of the API, and verifies
 * that conflicting cpu/task tracing requests are rejected.
 *
 * Returns 0 when all tests passed, a negative value otherwise.
 */
int ds_selftest_bts(void)
{
	struct ds_selftest_bts_conf conf;
	unsigned char buffer[BUFFER_SIZE], *small_buffer;
	unsigned long irq;
	int cpu;

	printk(KERN_INFO "[ds] bts selftest...");
	conf.error = 0;

	/*
	 * Carve an 8-byte aligned sub-buffer out of buffer[] for the
	 * small-buffer test below.
	 * NOTE(review): assumes BUFFER_SIZE leaves at least
	 * SMALL_BUFFER_SIZE bytes past the alignment + 8 offset -- the
	 * constants are defined outside this view, confirm.
	 */
	small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;

	get_online_cpus();
	for_each_online_cpu(cpu) {
		/* Per-cpu tracing with the irq-safe suspend/resume API. */
		conf.suspend = ds_suspend_bts_wrap;
		conf.resume = ds_resume_bts_wrap;
		conf.tracer =
			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
					   NULL, (size_t)-1, BTS_KERNEL);
		ds_selftest_bts_cpu(&conf);
		/* While cpu tracing is active, task requests must fail. */
		if (conf.error >= 0)
			conf.error = ds_selftest_bts_bad_request_task(buffer);
		ds_release_bts(conf.tracer);
		if (conf.error < 0)
			goto out;

		/*
		 * Per-cpu tracing with the _noirq API; the test has to run
		 * on the traced cpu itself, hence the cross-call.
		 */
		conf.suspend = ds_suspend_bts_noirq;
		conf.resume = ds_resume_bts_noirq;
		conf.tracer =
			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
					   NULL, (size_t)-1, BTS_KERNEL);
		smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
		if (conf.error >= 0) {
			conf.error =
				ds_selftest_bts_bad_release_noirq(cpu,
								  conf.tracer);
			/* We must not release the tracer twice. */
			if (conf.error < 0)
				conf.tracer = NULL;
		}
		if (conf.error >= 0)
			conf.error = ds_selftest_bts_bad_request_task(buffer);
		/* Release on the traced cpu (no-op if tracer is NULL). */
		smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
					 conf.tracer, 1);
		if (conf.error < 0)
			goto out;
	}

	/* Task tracing with the irq-safe suspend/resume API. */
	conf.suspend = ds_suspend_bts_wrap;
	conf.resume = ds_resume_bts_wrap;
	conf.tracer =
		ds_request_bts_task(current, buffer, BUFFER_SIZE,
				    NULL, (size_t)-1, BTS_KERNEL);
	ds_selftest_bts_cpu(&conf);
	/* While task tracing is active, cpu requests must fail. */
	if (conf.error >= 0)
		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
	ds_release_bts(conf.tracer);
	if (conf.error < 0)
		goto out;

	/*
	 * Task tracing with the _noirq API on the small buffer, with
	 * interrupts disabled across the test.
	 */
	conf.suspend = ds_suspend_bts_noirq;
	conf.resume = ds_resume_bts_noirq;
	conf.tracer =
		ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
				    NULL, (size_t)-1, BTS_KERNEL);
	local_irq_save(irq);
	ds_selftest_bts_cpu(&conf);
	if (conf.error >= 0)
		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
	ds_release_bts_noirq(conf.tracer);
	local_irq_restore(irq);
	if (conf.error < 0)
		goto out;

	conf.error = 0;
 out:
	put_online_cpus();
	printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));

	return conf.error;
}
404
/*
 * PEBS selftest placeholder: no PEBS tests are implemented yet.
 *
 * Returns 0 (success) unconditionally.
 */
int ds_selftest_pebs(void)
{
	int error = 0;

	return error;
}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
new file mode 100644
index 000000000000..2ba8745c6663
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.h
@@ -0,0 +1,15 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#ifdef CONFIG_X86_DS_SELFTEST
10extern int ds_selftest_bts(void);
11extern int ds_selftest_pebs(void);
12#else
13static inline int ds_selftest_bts(void) { return 0; }
14static inline int ds_selftest_pebs(void) { return 0; }
15#endif
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index da87590b8698..81086c227ab7 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
29 unsigned long *sp, unsigned long bp, char *log_lvl); 29 unsigned long *sp, unsigned long bp, char *log_lvl);
30 30
31extern unsigned int code_bytes; 31extern unsigned int code_bytes;
32extern int kstack_depth_to_print;
33 32
34/* The form of the top of the frame on the stack */ 33/* The form of the top of the frame on the stack */
35struct stack_frame { 34struct stack_frame {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 006281302925..7271fa33d791 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
617 */ 617 */
618__init void e820_setup_gap(void) 618__init void e820_setup_gap(void)
619{ 619{
620 unsigned long gapstart, gapsize, round; 620 unsigned long gapstart, gapsize;
621 int found; 621 int found;
622 622
623 gapstart = 0x10000000; 623 gapstart = 0x10000000;
@@ -635,14 +635,9 @@ __init void e820_setup_gap(void)
635#endif 635#endif
636 636
637 /* 637 /*
638 * See how much we want to round up: start off with 638 * e820_reserve_resources_late protect stolen RAM already
639 * rounding to the next 1MB area.
640 */ 639 */
641 round = 0x100000; 640 pci_mem_start = gapstart;
642 while ((gapsize >> 4) > round)
643 round += round;
644 /* Fun with two's complement */
645 pci_mem_start = (gapstart + round) & -round;
646 641
647 printk(KERN_INFO 642 printk(KERN_INFO
648 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", 643 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
@@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void)
1371 } 1366 }
1372} 1367}
1373 1368
1369/* How much should we pad RAM ending depending on where it is? */
1370static unsigned long ram_alignment(resource_size_t pos)
1371{
1372 unsigned long mb = pos >> 20;
1373
1374 /* To 64kB in the first megabyte */
1375 if (!mb)
1376 return 64*1024;
1377
1378 /* To 1MB in the first 16MB */
1379 if (mb < 16)
1380 return 1024*1024;
1381
1382 /* To 32MB for anything above that */
1383 return 32*1024*1024;
1384}
1385
1374void __init e820_reserve_resources_late(void) 1386void __init e820_reserve_resources_late(void)
1375{ 1387{
1376 int i; 1388 int i;
@@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void)
1382 insert_resource_expand_to_fit(&iomem_resource, res); 1394 insert_resource_expand_to_fit(&iomem_resource, res);
1383 res++; 1395 res++;
1384 } 1396 }
1397
1398 /*
1399 * Try to bump up RAM regions to reasonable boundaries to
1400 * avoid stolen RAM:
1401 */
1402 for (i = 0; i < e820.nr_map; i++) {
1403 struct e820entry *entry = &e820_saved.map[i];
1404 resource_size_t start, end;
1405
1406 if (entry->type != E820_RAM)
1407 continue;
1408 start = entry->addr + entry->size;
1409 end = round_up(start, ram_alignment(start));
1410 if (start == end)
1411 continue;
1412 reserve_region_with_split(&iomem_resource, start,
1413 end - 1, "RAM buffer");
1414 }
1385} 1415}
1386 1416
1387char *__init default_machine_specific_memory_setup(void) 1417char *__init default_machine_specific_memory_setup(void)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 76b8cd953dee..ebdb85cf2686 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func)
97} 97}
98 98
99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) 99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100static u32 __init ati_ixp4x0_rev(int num, int slot, int func) 101static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
101{ 102{
102 u32 d; 103 u32 d;
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
114 d &= 0xff; 115 d &= 0xff;
115 return d; 116 return d;
116} 117}
118#endif
117 119
118static void __init ati_bugs(int num, int slot, int func) 120static void __init ati_bugs(int num, int slot, int func)
119{ 121{
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 1736acc4d7aa..96f7ac0bbf01 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -240,10 +240,35 @@ static void __init do_add_efi_memmap(void)
240 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; 240 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
241 int e820_type; 241 int e820_type;
242 242
243 if (md->attribute & EFI_MEMORY_WB) 243 switch (md->type) {
244 e820_type = E820_RAM; 244 case EFI_LOADER_CODE:
245 else 245 case EFI_LOADER_DATA:
246 case EFI_BOOT_SERVICES_CODE:
247 case EFI_BOOT_SERVICES_DATA:
248 case EFI_CONVENTIONAL_MEMORY:
249 if (md->attribute & EFI_MEMORY_WB)
250 e820_type = E820_RAM;
251 else
252 e820_type = E820_RESERVED;
253 break;
254 case EFI_ACPI_RECLAIM_MEMORY:
255 e820_type = E820_ACPI;
256 break;
257 case EFI_ACPI_MEMORY_NVS:
258 e820_type = E820_NVS;
259 break;
260 case EFI_UNUSABLE_MEMORY:
261 e820_type = E820_UNUSABLE;
262 break;
263 default:
264 /*
265 * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE
266 * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO
267 * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE
268 */
246 e820_type = E820_RESERVED; 269 e820_type = E820_RESERVED;
270 break;
271 }
247 e820_add_region(start, size, e820_type); 272 e820_add_region(start, size, e820_type);
248 } 273 }
249 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 274 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c929add475c9..c097e7d607c6 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -48,7 +48,6 @@
48#include <asm/segment.h> 48#include <asm/segment.h>
49#include <asm/smp.h> 49#include <asm/smp.h>
50#include <asm/page_types.h> 50#include <asm/page_types.h>
51#include <asm/desc.h>
52#include <asm/percpu.h> 51#include <asm/percpu.h>
53#include <asm/dwarf2.h> 52#include <asm/dwarf2.h>
54#include <asm/processor-flags.h> 53#include <asm/processor-flags.h>
@@ -84,7 +83,7 @@
84#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF 83#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
85#else 84#else
86#define preempt_stop(clobbers) 85#define preempt_stop(clobbers)
87#define resume_kernel restore_nocheck 86#define resume_kernel restore_all
88#endif 87#endif
89 88
90.macro TRACE_IRQS_IRET 89.macro TRACE_IRQS_IRET
@@ -372,7 +371,7 @@ END(ret_from_exception)
372ENTRY(resume_kernel) 371ENTRY(resume_kernel)
373 DISABLE_INTERRUPTS(CLBR_ANY) 372 DISABLE_INTERRUPTS(CLBR_ANY)
374 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 373 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
375 jnz restore_nocheck 374 jnz restore_all
376need_resched: 375need_resched:
377 movl TI_flags(%ebp), %ecx # need_resched set ? 376 movl TI_flags(%ebp), %ecx # need_resched set ?
378 testb $_TIF_NEED_RESCHED, %cl 377 testb $_TIF_NEED_RESCHED, %cl
@@ -540,6 +539,8 @@ syscall_exit:
540 jne syscall_exit_work 539 jne syscall_exit_work
541 540
542restore_all: 541restore_all:
542 TRACE_IRQS_IRET
543restore_all_notrace:
543 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS 544 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
544 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we 545 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
545 # are returning to the kernel. 546 # are returning to the kernel.
@@ -551,8 +552,6 @@ restore_all:
551 CFI_REMEMBER_STATE 552 CFI_REMEMBER_STATE
552 je ldt_ss # returning to user-space with LDT SS 553 je ldt_ss # returning to user-space with LDT SS
553restore_nocheck: 554restore_nocheck:
554 TRACE_IRQS_IRET
555restore_nocheck_notrace:
556 RESTORE_REGS 4 # skip orig_eax/error_code 555 RESTORE_REGS 4 # skip orig_eax/error_code
557 CFI_ADJUST_CFA_OFFSET -4 556 CFI_ADJUST_CFA_OFFSET -4
558irq_return: 557irq_return:
@@ -588,22 +587,34 @@ ldt_ss:
588 jne restore_nocheck 587 jne restore_nocheck
589#endif 588#endif
590 589
591 /* If returning to userspace with 16bit stack, 590/*
592 * try to fix the higher word of ESP, as the CPU 591 * Setup and switch to ESPFIX stack
593 * won't restore it. 592 *
594 * This is an "official" bug of all the x86-compatible 593 * We're returning to userspace with a 16 bit stack. The CPU will not
595 * CPUs, which we can try to work around to make 594 * restore the high word of ESP for us on executing iret... This is an
596 * dosemu and wine happy. */ 595 * "official" bug of all the x86-compatible CPUs, which we can work
597 movl PT_OLDESP(%esp), %eax 596 * around to make dosemu and wine happy. We do this by preloading the
598 movl %esp, %edx 597 * high word of ESP with the high word of the userspace ESP while
599 call patch_espfix_desc 598 * compensating for the offset by changing to the ESPFIX segment with
599 * a base address that matches for the difference.
600 */
601 mov %esp, %edx /* load kernel esp */
602 mov PT_OLDESP(%esp), %eax /* load userspace esp */
603 mov %dx, %ax /* eax: new kernel esp */
604 sub %eax, %edx /* offset (low word is 0) */
605 PER_CPU(gdt_page, %ebx)
606 shr $16, %edx
607 mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */
608 mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */
600 pushl $__ESPFIX_SS 609 pushl $__ESPFIX_SS
601 CFI_ADJUST_CFA_OFFSET 4 610 CFI_ADJUST_CFA_OFFSET 4
602 pushl %eax 611 push %eax /* new kernel esp */
603 CFI_ADJUST_CFA_OFFSET 4 612 CFI_ADJUST_CFA_OFFSET 4
613 /* Disable interrupts, but do not irqtrace this section: we
614 * will soon execute iret and the tracer was already set to
615 * the irqstate after the iret */
604 DISABLE_INTERRUPTS(CLBR_EAX) 616 DISABLE_INTERRUPTS(CLBR_EAX)
605 TRACE_IRQS_OFF 617 lss (%esp), %esp /* switch to espfix segment */
606 lss (%esp), %esp
607 CFI_ADJUST_CFA_OFFSET -8 618 CFI_ADJUST_CFA_OFFSET -8
608 jmp restore_nocheck 619 jmp restore_nocheck
609 CFI_ENDPROC 620 CFI_ENDPROC
@@ -716,15 +727,24 @@ PTREGSCALL(vm86)
716PTREGSCALL(vm86old) 727PTREGSCALL(vm86old)
717 728
718.macro FIXUP_ESPFIX_STACK 729.macro FIXUP_ESPFIX_STACK
719 /* since we are on a wrong stack, we cant make it a C code :( */ 730/*
731 * Switch back for ESPFIX stack to the normal zerobased stack
732 *
733 * We can't call C functions using the ESPFIX stack. This code reads
734 * the high word of the segment base from the GDT and swiches to the
735 * normal stack and adjusts ESP with the matching offset.
736 */
737 /* fixup the stack */
720 PER_CPU(gdt_page, %ebx) 738 PER_CPU(gdt_page, %ebx)
721 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) 739 mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
722 addl %esp, %eax 740 mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
741 shl $16, %eax
742 addl %esp, %eax /* the adjusted stack pointer */
723 pushl $__KERNEL_DS 743 pushl $__KERNEL_DS
724 CFI_ADJUST_CFA_OFFSET 4 744 CFI_ADJUST_CFA_OFFSET 4
725 pushl %eax 745 pushl %eax
726 CFI_ADJUST_CFA_OFFSET 4 746 CFI_ADJUST_CFA_OFFSET 4
727 lss (%esp), %esp 747 lss (%esp), %esp /* switch to the normal stack segment */
728 CFI_ADJUST_CFA_OFFSET -8 748 CFI_ADJUST_CFA_OFFSET -8
729.endm 749.endm
730.macro UNWIND_ESPFIX_STACK 750.macro UNWIND_ESPFIX_STACK
@@ -1154,6 +1174,7 @@ ENTRY(ftrace_graph_caller)
1154 pushl %edx 1174 pushl %edx
1155 movl 0xc(%esp), %edx 1175 movl 0xc(%esp), %edx
1156 lea 0x4(%ebp), %eax 1176 lea 0x4(%ebp), %eax
1177 movl (%ebp), %ecx
1157 subl $MCOUNT_INSN_SIZE, %edx 1178 subl $MCOUNT_INSN_SIZE, %edx
1158 call prepare_ftrace_return 1179 call prepare_ftrace_return
1159 popl %edx 1180 popl %edx
@@ -1168,6 +1189,7 @@ return_to_handler:
1168 pushl %eax 1189 pushl %eax
1169 pushl %ecx 1190 pushl %ecx
1170 pushl %edx 1191 pushl %edx
1192 movl %ebp, %eax
1171 call ftrace_return_to_handler 1193 call ftrace_return_to_handler
1172 movl %eax, 0xc(%esp) 1194 movl %eax, 0xc(%esp)
1173 popl %edx 1195 popl %edx
@@ -1329,7 +1351,7 @@ nmi_stack_correct:
1329 xorl %edx,%edx # zero error code 1351 xorl %edx,%edx # zero error code
1330 movl %esp,%eax # pt_regs pointer 1352 movl %esp,%eax # pt_regs pointer
1331 call do_nmi 1353 call do_nmi
1332 jmp restore_nocheck_notrace 1354 jmp restore_all_notrace
1333 CFI_ENDPROC 1355 CFI_ENDPROC
1334 1356
1335nmi_stack_fixup: 1357nmi_stack_fixup:
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 38946c6e8433..c251be745107 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -135,6 +135,7 @@ ENTRY(ftrace_graph_caller)
135 135
136 leaq 8(%rbp), %rdi 136 leaq 8(%rbp), %rdi
137 movq 0x38(%rsp), %rsi 137 movq 0x38(%rsp), %rsi
138 movq (%rbp), %rdx
138 subq $MCOUNT_INSN_SIZE, %rsi 139 subq $MCOUNT_INSN_SIZE, %rsi
139 140
140 call prepare_ftrace_return 141 call prepare_ftrace_return
@@ -147,27 +148,15 @@ END(ftrace_graph_caller)
147GLOBAL(return_to_handler) 148GLOBAL(return_to_handler)
148 subq $80, %rsp 149 subq $80, %rsp
149 150
151 /* Save the return values */
150 movq %rax, (%rsp) 152 movq %rax, (%rsp)
151 movq %rcx, 8(%rsp) 153 movq %rdx, 8(%rsp)
152 movq %rdx, 16(%rsp) 154 movq %rbp, %rdi
153 movq %rsi, 24(%rsp)
154 movq %rdi, 32(%rsp)
155 movq %r8, 40(%rsp)
156 movq %r9, 48(%rsp)
157 movq %r10, 56(%rsp)
158 movq %r11, 64(%rsp)
159 155
160 call ftrace_return_to_handler 156 call ftrace_return_to_handler
161 157
162 movq %rax, 72(%rsp) 158 movq %rax, 72(%rsp)
163 movq 64(%rsp), %r11 159 movq 8(%rsp), %rdx
164 movq 56(%rsp), %r10
165 movq 48(%rsp), %r9
166 movq 40(%rsp), %r8
167 movq 32(%rsp), %rdi
168 movq 24(%rsp), %rsi
169 movq 16(%rsp), %rdx
170 movq 8(%rsp), %rcx
171 movq (%rsp), %rax 160 movq (%rsp), %rax
172 addq $72, %rsp 161 addq $72, %rsp
173 retq 162 retq
@@ -976,6 +965,8 @@ END(\sym)
976#ifdef CONFIG_SMP 965#ifdef CONFIG_SMP
977apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ 966apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
978 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt 967 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
968apicinterrupt REBOOT_VECTOR \
969 reboot_interrupt smp_reboot_interrupt
979#endif 970#endif
980 971
981#ifdef CONFIG_X86_UV 972#ifdef CONFIG_X86_UV
@@ -1007,10 +998,15 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
1007#endif 998#endif
1008 999
1009apicinterrupt THRESHOLD_APIC_VECTOR \ 1000apicinterrupt THRESHOLD_APIC_VECTOR \
1010 threshold_interrupt mce_threshold_interrupt 1001 threshold_interrupt smp_threshold_interrupt
1011apicinterrupt THERMAL_APIC_VECTOR \ 1002apicinterrupt THERMAL_APIC_VECTOR \
1012 thermal_interrupt smp_thermal_interrupt 1003 thermal_interrupt smp_thermal_interrupt
1013 1004
1005#ifdef CONFIG_X86_MCE
1006apicinterrupt MCE_SELF_VECTOR \
1007 mce_self_interrupt smp_mce_self_interrupt
1008#endif
1009
1014#ifdef CONFIG_SMP 1010#ifdef CONFIG_SMP
1015apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ 1011apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1016 call_function_single_interrupt smp_call_function_single_interrupt 1012 call_function_single_interrupt smp_call_function_single_interrupt
@@ -1025,6 +1021,11 @@ apicinterrupt ERROR_APIC_VECTOR \
1025apicinterrupt SPURIOUS_APIC_VECTOR \ 1021apicinterrupt SPURIOUS_APIC_VECTOR \
1026 spurious_interrupt smp_spurious_interrupt 1022 spurious_interrupt smp_spurious_interrupt
1027 1023
1024#ifdef CONFIG_PERF_COUNTERS
1025apicinterrupt LOCAL_PENDING_VECTOR \
1026 perf_pending_interrupt smp_perf_pending_interrupt
1027#endif
1028
1028/* 1029/*
1029 * Exception entry points. 1030 * Exception entry points.
1030 */ 1031 */
@@ -1379,10 +1380,15 @@ END(xen_failsafe_callback)
1379paranoidzeroentry_ist debug do_debug DEBUG_STACK 1380paranoidzeroentry_ist debug do_debug DEBUG_STACK
1380paranoidzeroentry_ist int3 do_int3 DEBUG_STACK 1381paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1381paranoiderrorentry stack_segment do_stack_segment 1382paranoiderrorentry stack_segment do_stack_segment
1383#ifdef CONFIG_XEN
1384zeroentry xen_debug do_debug
1385zeroentry xen_int3 do_int3
1386errorentry xen_stack_segment do_stack_segment
1387#endif
1382errorentry general_protection do_general_protection 1388errorentry general_protection do_general_protection
1383errorentry page_fault do_page_fault 1389errorentry page_fault do_page_fault
1384#ifdef CONFIG_X86_MCE 1390#ifdef CONFIG_X86_MCE
1385paranoidzeroentry machine_check do_machine_check 1391paranoidzeroentry machine_check *machine_check_vector(%rip)
1386#endif 1392#endif
1387 1393
1388 /* 1394 /*
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index b79c5533c421..d94e1ea3b9fe 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -408,7 +408,8 @@ int ftrace_disable_ftrace_graph_caller(void)
408 * Hook the return address and push it in the stack of return addrs 408 * Hook the return address and push it in the stack of return addrs
409 * in current thread info. 409 * in current thread info.
410 */ 410 */
411void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) 411void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
412 unsigned long frame_pointer)
412{ 413{
413 unsigned long old; 414 unsigned long old;
414 int faulted; 415 int faulted;
@@ -453,7 +454,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
453 return; 454 return;
454 } 455 }
455 456
456 if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { 457 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
458 frame_pointer) == -EBUSY) {
457 *parent = old; 459 *parent = old;
458 return; 460 return;
459 } 461 }
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 30683883e0cd..8663afb56535 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -13,7 +13,6 @@
13#include <asm/segment.h> 13#include <asm/segment.h>
14#include <asm/page_types.h> 14#include <asm/page_types.h>
15#include <asm/pgtable_types.h> 15#include <asm/pgtable_types.h>
16#include <asm/desc.h>
17#include <asm/cache.h> 16#include <asm/cache.h>
18#include <asm/thread_info.h> 17#include <asm/thread_info.h>
19#include <asm/asm-offsets.h> 18#include <asm/asm-offsets.h>
@@ -608,13 +607,6 @@ ignore_int:
608ENTRY(initial_code) 607ENTRY(initial_code)
609 .long i386_start_kernel 608 .long i386_start_kernel
610 609
611.section .text
612/*
613 * Real beginning of normal "text" segment
614 */
615ENTRY(stext)
616ENTRY(_stext)
617
618/* 610/*
619 * BSS section 611 * BSS section
620 */ 612 */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 54b29bb24e71..fa54f78e2a05 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -12,7 +12,6 @@
12#include <linux/linkage.h> 12#include <linux/linkage.h>
13#include <linux/threads.h> 13#include <linux/threads.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <asm/desc.h>
16#include <asm/segment.h> 15#include <asm/segment.h>
17#include <asm/pgtable.h> 16#include <asm/pgtable.h>
18#include <asm/page.h> 17#include <asm/page.h>
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 81408b93f887..dedc2bddf7a5 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -510,7 +510,8 @@ static int hpet_setup_irq(struct hpet_dev *dev)
510{ 510{
511 511
512 if (request_irq(dev->irq, hpet_interrupt_handler, 512 if (request_irq(dev->irq, hpet_interrupt_handler,
513 IRQF_DISABLED|IRQF_NOBALANCING, dev->name, dev)) 513 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
514 dev->name, dev))
514 return -1; 515 return -1;
515 516
516 disable_irq(dev->irq); 517 disable_irq(dev->irq);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c2e0bb0890d4..5cf36c053ac4 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -7,6 +7,7 @@
7#include <linux/spinlock.h> 7#include <linux/spinlock.h>
8#include <linux/jiffies.h> 8#include <linux/jiffies.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/timex.h>
10#include <linux/delay.h> 11#include <linux/delay.h>
11#include <linux/init.h> 12#include <linux/init.h>
12#include <linux/io.h> 13#include <linux/io.h>
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index df3bf269beab..270ff83efc11 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -12,7 +12,6 @@
12 12
13static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 13static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
15struct mm_struct init_mm = INIT_MM(init_mm);
16 15
17/* 16/*
18 * Initial thread structure. 17 * Initial thread structure.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c3fe010d74c8..b0cdde6932f5 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,8 @@
12#include <asm/io_apic.h> 12#include <asm/io_apic.h>
13#include <asm/irq.h> 13#include <asm/irq.h>
14#include <asm/idle.h> 14#include <asm/idle.h>
15#include <asm/mce.h>
16#include <asm/hw_irq.h>
15 17
16atomic_t irq_err_count; 18atomic_t irq_err_count;
17 19
@@ -24,9 +26,9 @@ void (*generic_interrupt_extension)(void) = NULL;
24 */ 26 */
25void ack_bad_irq(unsigned int irq) 27void ack_bad_irq(unsigned int irq)
26{ 28{
27 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); 29 if (printk_ratelimit())
30 pr_err("unexpected IRQ trap at vector %02x\n", irq);
28 31
29#ifdef CONFIG_X86_LOCAL_APIC
30 /* 32 /*
31 * Currently unexpected vectors happen only on SMP and APIC. 33 * Currently unexpected vectors happen only on SMP and APIC.
32 * We _must_ ack these because every local APIC has only N 34 * We _must_ ack these because every local APIC has only N
@@ -36,9 +38,7 @@ void ack_bad_irq(unsigned int irq)
36 * completely. 38 * completely.
37 * But only ack when the APIC is enabled -AK 39 * But only ack when the APIC is enabled -AK
38 */ 40 */
39 if (cpu_has_apic) 41 ack_APIC_irq();
40 ack_APIC_irq();
41#endif
42} 42}
43 43
44#define irq_stats(x) (&per_cpu(irq_stat, x)) 44#define irq_stats(x) (&per_cpu(irq_stat, x))
@@ -63,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
63 for_each_online_cpu(j) 63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n"); 65 seq_printf(p, " Spurious interrupts\n");
66 seq_printf(p, "%*s: ", prec, "CNT");
67 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance counter interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND");
71 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n");
66#endif 74#endif
67 if (generic_interrupt_extension) { 75 if (generic_interrupt_extension) {
68 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
@@ -89,13 +97,23 @@ static int show_other_interrupts(struct seq_file *p, int prec)
89 for_each_online_cpu(j) 97 for_each_online_cpu(j)
90 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
91 seq_printf(p, " Thermal event interrupts\n"); 99 seq_printf(p, " Thermal event interrupts\n");
92# ifdef CONFIG_X86_64 100# ifdef CONFIG_X86_MCE_THRESHOLD
93 seq_printf(p, "%*s: ", prec, "THR"); 101 seq_printf(p, "%*s: ", prec, "THR");
94 for_each_online_cpu(j) 102 for_each_online_cpu(j)
95 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 103 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
96 seq_printf(p, " Threshold APIC interrupts\n"); 104 seq_printf(p, " Threshold APIC interrupts\n");
97# endif 105# endif
98#endif 106#endif
107#ifdef CONFIG_X86_NEW_MCE
108 seq_printf(p, "%*s: ", prec, "MCE");
109 for_each_online_cpu(j)
110 seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
111 seq_printf(p, " Machine check exceptions\n");
112 seq_printf(p, "%*s: ", prec, "MCP");
113 for_each_online_cpu(j)
114 seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
115 seq_printf(p, " Machine check polls\n");
116#endif
99 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); 117 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
100#if defined(CONFIG_X86_IO_APIC) 118#if defined(CONFIG_X86_IO_APIC)
101 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); 119 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
@@ -166,6 +184,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
166#ifdef CONFIG_X86_LOCAL_APIC 184#ifdef CONFIG_X86_LOCAL_APIC
167 sum += irq_stats(cpu)->apic_timer_irqs; 185 sum += irq_stats(cpu)->apic_timer_irqs;
168 sum += irq_stats(cpu)->irq_spurious_count; 186 sum += irq_stats(cpu)->irq_spurious_count;
187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs;
169#endif 189#endif
170 if (generic_interrupt_extension) 190 if (generic_interrupt_extension)
171 sum += irq_stats(cpu)->generic_irqs; 191 sum += irq_stats(cpu)->generic_irqs;
@@ -176,9 +196,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
176#endif 196#endif
177#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_MCE
178 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
179# ifdef CONFIG_X86_64 199# ifdef CONFIG_X86_MCE_THRESHOLD
180 sum += irq_stats(cpu)->irq_threshold_count; 200 sum += irq_stats(cpu)->irq_threshold_count;
201# endif
181#endif 202#endif
203#ifdef CONFIG_X86_NEW_MCE
204 sum += per_cpu(mce_exception_count, cpu);
205 sum += per_cpu(mce_poll_count, cpu);
182#endif 206#endif
183 return sum; 207 return sum;
184} 208}
@@ -213,14 +237,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
213 irq = __get_cpu_var(vector_irq)[vector]; 237 irq = __get_cpu_var(vector_irq)[vector];
214 238
215 if (!handle_irq(irq, regs)) { 239 if (!handle_irq(irq, regs)) {
216#ifdef CONFIG_X86_64 240 ack_APIC_irq();
217 if (!disable_apic)
218 ack_APIC_irq();
219#endif
220 241
221 if (printk_ratelimit()) 242 if (printk_ratelimit())
222 printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", 243 pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
223 __func__, smp_processor_id(), vector, irq); 244 __func__, smp_processor_id(), vector, irq);
224 } 245 }
225 246
226 irq_exit(); 247 irq_exit();
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c
index 368b0a8836f9..696f0e475c2d 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit.c
@@ -1,20 +1,25 @@
1#include <linux/linkage.h>
1#include <linux/errno.h> 2#include <linux/errno.h>
2#include <linux/signal.h> 3#include <linux/signal.h>
3#include <linux/sched.h> 4#include <linux/sched.h>
4#include <linux/ioport.h> 5#include <linux/ioport.h>
5#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h>
6#include <linux/slab.h> 8#include <linux/slab.h>
7#include <linux/random.h> 9#include <linux/random.h>
10#include <linux/kprobes.h>
8#include <linux/init.h> 11#include <linux/init.h>
9#include <linux/kernel_stat.h> 12#include <linux/kernel_stat.h>
10#include <linux/sysdev.h> 13#include <linux/sysdev.h>
11#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/acpi.h>
12#include <linux/io.h> 16#include <linux/io.h>
13#include <linux/delay.h> 17#include <linux/delay.h>
14 18
15#include <asm/atomic.h> 19#include <asm/atomic.h>
16#include <asm/system.h> 20#include <asm/system.h>
17#include <asm/timer.h> 21#include <asm/timer.h>
22#include <asm/hw_irq.h>
18#include <asm/pgtable.h> 23#include <asm/pgtable.h>
19#include <asm/desc.h> 24#include <asm/desc.h>
20#include <asm/apic.h> 25#include <asm/apic.h>
@@ -22,7 +27,23 @@
22#include <asm/i8259.h> 27#include <asm/i8259.h>
23#include <asm/traps.h> 28#include <asm/traps.h>
24 29
30/*
31 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
32 * (these are usually mapped to vectors 0x30-0x3f)
33 */
34
35/*
36 * The IO-APIC gives us many more interrupt sources. Most of these
37 * are unused but an SMP system is supposed to have enough memory ...
38 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
39 * across the spectrum, so we really want to be prepared to get all
40 * of these. Plus, more powerful systems might have more than 64
41 * IO-APIC registers.
42 *
43 * (these are usually mapped into the 0x30-0xff vector range)
44 */
25 45
46#ifdef CONFIG_X86_32
26/* 47/*
27 * Note that on a 486, we don't want to do a SIGFPE on an irq13 48 * Note that on a 486, we don't want to do a SIGFPE on an irq13
28 * as the irq is unreliable, and exception 16 works correctly 49 * as the irq is unreliable, and exception 16 works correctly
@@ -52,30 +73,7 @@ static struct irqaction fpu_irq = {
52 .handler = math_error_irq, 73 .handler = math_error_irq,
53 .name = "fpu", 74 .name = "fpu",
54}; 75};
55
56void __init init_ISA_irqs(void)
57{
58 int i;
59
60#ifdef CONFIG_X86_LOCAL_APIC
61 init_bsp_APIC();
62#endif 76#endif
63 init_8259A(0);
64
65 /*
66 * 16 old-style INTA-cycle interrupts:
67 */
68 for (i = 0; i < NR_IRQS_LEGACY; i++) {
69 struct irq_desc *desc = irq_to_desc(i);
70
71 desc->status = IRQ_DISABLED;
72 desc->action = NULL;
73 desc->depth = 1;
74
75 set_irq_chip_and_handler_name(i, &i8259A_chip,
76 handle_level_irq, "XT");
77 }
78}
79 77
80/* 78/*
81 * IRQ2 is cascade interrupt to second interrupt controller 79 * IRQ2 is cascade interrupt to second interrupt controller
@@ -118,29 +116,37 @@ int vector_used_by_percpu_irq(unsigned int vector)
118 return 0; 116 return 0;
119} 117}
120 118
121/* Overridden in paravirt.c */ 119static void __init init_ISA_irqs(void)
122void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
123
124void __init native_init_IRQ(void)
125{ 120{
126 int i; 121 int i;
127 122
128 /* Execute any quirks before the call gates are initialised: */ 123#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
129 x86_quirk_pre_intr_init(); 124 init_bsp_APIC();
125#endif
126 init_8259A(0);
130 127
131 /* 128 /*
132 * Cover the whole vector space, no vector can escape 129 * 16 old-style INTA-cycle interrupts:
133 * us. (some of these will be overridden and become
134 * 'special' SMP interrupts)
135 */ 130 */
136 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 131 for (i = 0; i < NR_IRQS_LEGACY; i++) {
137 /* SYSCALL_VECTOR was reserved in trap_init. */ 132 struct irq_desc *desc = irq_to_desc(i);
138 if (i != SYSCALL_VECTOR) 133
139 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); 134 desc->status = IRQ_DISABLED;
135 desc->action = NULL;
136 desc->depth = 1;
137
138 set_irq_chip_and_handler_name(i, &i8259A_chip,
139 handle_level_irq, "XT");
140 } 140 }
141}
141 142
143/* Overridden in paravirt.c */
144void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
142 145
143#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) 146static void __init smp_intr_init(void)
147{
148#ifdef CONFIG_SMP
149#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
144 /* 150 /*
145 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 151 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
146 * IPI, driven by wakeup. 152 * IPI, driven by wakeup.
@@ -160,16 +166,35 @@ void __init native_init_IRQ(void)
160 /* IPI for generic function call */ 166 /* IPI for generic function call */
161 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 167 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
162 168
163 /* IPI for single call function */ 169 /* IPI for generic single function call */
164 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, 170 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
165 call_function_single_interrupt); 171 call_function_single_interrupt);
166 172
167 /* Low priority IPI to cleanup after moving an irq */ 173 /* Low priority IPI to cleanup after moving an irq */
168 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 174 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
169 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); 175 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
176
177 /* IPI used for rebooting/stopping */
178 alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
170#endif 179#endif
180#endif /* CONFIG_SMP */
181}
182
183static void __init apic_intr_init(void)
184{
185 smp_intr_init();
171 186
172#ifdef CONFIG_X86_LOCAL_APIC 187#ifdef CONFIG_X86_THERMAL_VECTOR
188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
189#endif
190#ifdef CONFIG_X86_THRESHOLD
191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
192#endif
193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
194 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
195#endif
196
197#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
173 /* self generated IPI for local APIC timer */ 198 /* self generated IPI for local APIC timer */
174 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 199 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
175 200
@@ -179,16 +204,59 @@ void __init native_init_IRQ(void)
179 /* IPI vectors for APIC spurious and error interrupts */ 204 /* IPI vectors for APIC spurious and error interrupts */
180 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 205 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
181 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 206 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
207
208 /* Performance monitoring interrupts: */
209# ifdef CONFIG_PERF_COUNTERS
210 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
211# endif
212
182#endif 213#endif
214}
183 215
184#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) 216/**
185 /* thermal monitor LVT interrupt */ 217 * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
186 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 218 *
219 * Description:
220 * Perform any necessary interrupt initialisation prior to setting up
221 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
222 * interrupts should be initialised here if the machine emulates a PC
223 * in any way.
224 **/
225static void __init x86_quirk_pre_intr_init(void)
226{
227#ifdef CONFIG_X86_32
228 if (x86_quirks->arch_pre_intr_init) {
229 if (x86_quirks->arch_pre_intr_init())
230 return;
231 }
187#endif 232#endif
233 init_ISA_irqs();
234}
235
236void __init native_init_IRQ(void)
237{
238 int i;
239
240 /* Execute any quirks before the call gates are initialised: */
241 x86_quirk_pre_intr_init();
242
243 apic_intr_init();
244
245 /*
246 * Cover the whole vector space, no vector can escape
247 * us. (some of these will be overridden and become
248 * 'special' SMP interrupts)
249 */
250 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
251 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
252 if (!test_bit(i, used_vectors))
253 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
254 }
188 255
189 if (!acpi_ioapic) 256 if (!acpi_ioapic)
190 setup_irq(2, &irq2); 257 setup_irq(2, &irq2);
191 258
259#ifdef CONFIG_X86_32
192 /* 260 /*
193 * Call quirks after call gates are initialised (usually add in 261 * Call quirks after call gates are initialised (usually add in
194 * the architecture specific gates): 262 * the architecture specific gates):
@@ -203,4 +271,5 @@ void __init native_init_IRQ(void)
203 setup_irq(FPU_IRQ, &fpu_irq); 271 setup_irq(FPU_IRQ, &fpu_irq);
204 272
205 irq_ctx_init(smp_processor_id()); 273 irq_ctx_init(smp_processor_id());
274#endif
206} 275}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
deleted file mode 100644
index 8cd10537fd46..000000000000
--- a/arch/x86/kernel/irqinit_64.c
+++ /dev/null
@@ -1,177 +0,0 @@
1#include <linux/linkage.h>
2#include <linux/errno.h>
3#include <linux/signal.h>
4#include <linux/sched.h>
5#include <linux/ioport.h>
6#include <linux/interrupt.h>
7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h>
10#include <linux/init.h>
11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h>
13#include <linux/bitops.h>
14#include <linux/acpi.h>
15#include <linux/io.h>
16#include <linux/delay.h>
17
18#include <asm/atomic.h>
19#include <asm/system.h>
20#include <asm/hw_irq.h>
21#include <asm/pgtable.h>
22#include <asm/desc.h>
23#include <asm/apic.h>
24#include <asm/i8259.h>
25
26/*
27 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
28 * (these are usually mapped to vectors 0x30-0x3f)
29 */
30
31/*
32 * The IO-APIC gives us many more interrupt sources. Most of these
33 * are unused but an SMP system is supposed to have enough memory ...
34 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
35 * across the spectrum, so we really want to be prepared to get all
36 * of these. Plus, more powerful systems might have more than 64
37 * IO-APIC registers.
38 *
39 * (these are usually mapped into the 0x30-0xff vector range)
40 */
41
42/*
43 * IRQ2 is cascade interrupt to second interrupt controller
44 */
45
46static struct irqaction irq2 = {
47 .handler = no_action,
48 .name = "cascade",
49};
50DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
51 [0 ... IRQ0_VECTOR - 1] = -1,
52 [IRQ0_VECTOR] = 0,
53 [IRQ1_VECTOR] = 1,
54 [IRQ2_VECTOR] = 2,
55 [IRQ3_VECTOR] = 3,
56 [IRQ4_VECTOR] = 4,
57 [IRQ5_VECTOR] = 5,
58 [IRQ6_VECTOR] = 6,
59 [IRQ7_VECTOR] = 7,
60 [IRQ8_VECTOR] = 8,
61 [IRQ9_VECTOR] = 9,
62 [IRQ10_VECTOR] = 10,
63 [IRQ11_VECTOR] = 11,
64 [IRQ12_VECTOR] = 12,
65 [IRQ13_VECTOR] = 13,
66 [IRQ14_VECTOR] = 14,
67 [IRQ15_VECTOR] = 15,
68 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
69};
70
71int vector_used_by_percpu_irq(unsigned int vector)
72{
73 int cpu;
74
75 for_each_online_cpu(cpu) {
76 if (per_cpu(vector_irq, cpu)[vector] != -1)
77 return 1;
78 }
79
80 return 0;
81}
82
83static void __init init_ISA_irqs(void)
84{
85 int i;
86
87 init_bsp_APIC();
88 init_8259A(0);
89
90 for (i = 0; i < NR_IRQS_LEGACY; i++) {
91 struct irq_desc *desc = irq_to_desc(i);
92
93 desc->status = IRQ_DISABLED;
94 desc->action = NULL;
95 desc->depth = 1;
96
97 /*
98 * 16 old-style INTA-cycle interrupts:
99 */
100 set_irq_chip_and_handler_name(i, &i8259A_chip,
101 handle_level_irq, "XT");
102 }
103}
104
105void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
106
107static void __init smp_intr_init(void)
108{
109#ifdef CONFIG_SMP
110 /*
111 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
112 * IPI, driven by wakeup.
113 */
114 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
115
116 /* IPIs for invalidation */
117 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
118 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
119 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
120 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
121 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
122 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
123 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
124 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
125
126 /* IPI for generic function call */
127 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
128
129 /* IPI for generic single function call */
130 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
131 call_function_single_interrupt);
132
133 /* Low priority IPI to cleanup after moving an irq */
134 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
135 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
136#endif
137}
138
139static void __init apic_intr_init(void)
140{
141 smp_intr_init();
142
143 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
144 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
145
146 /* self generated IPI for local APIC timer */
147 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
148
149 /* generic IPI for platform specific use */
150 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
151
152 /* IPI vectors for APIC spurious and error interrupts */
153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
155}
156
157void __init native_init_IRQ(void)
158{
159 int i;
160
161 init_ISA_irqs();
162 /*
163 * Cover the whole vector space, no vector can escape
164 * us. (some of these will be overridden and become
165 * 'special' SMP interrupts)
166 */
167 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
168 int vector = FIRST_EXTERNAL_VECTOR + i;
169 if (vector != IA32_SYSCALL_VECTOR)
170 set_intr_gate(vector, interrupt[i]);
171 }
172
173 apic_intr_init();
174
175 if (!acpi_ioapic)
176 setup_irq(2, &irq2);
177}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index b1f4dffb919e..8d82a77a3f3b 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -142,7 +142,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
142 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 142 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
143 gdb_regs32[GDB_CS] = __KERNEL_CS; 143 gdb_regs32[GDB_CS] = __KERNEL_CS;
144 gdb_regs32[GDB_SS] = __KERNEL_DS; 144 gdb_regs32[GDB_SS] = __KERNEL_DS;
145 gdb_regs[GDB_PC] = p->thread.ip; 145 gdb_regs[GDB_PC] = 0;
146 gdb_regs[GDB_R8] = 0; 146 gdb_regs[GDB_R8] = 0;
147 gdb_regs[GDB_R9] = 0; 147 gdb_regs[GDB_R9] = 0;
148 gdb_regs[GDB_R10] = 0; 148 gdb_regs[GDB_R10] = 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33019ddb56b4..a78ecad0c900 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,7 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <asm/timer.h>
30 31
31#define MMU_QUEUE_SIZE 1024 32#define MMU_QUEUE_SIZE 1024
32 33
@@ -195,7 +196,7 @@ static void kvm_leave_lazy_mmu(void)
195 struct kvm_para_state *state = kvm_para_state(); 196 struct kvm_para_state *state = kvm_para_state();
196 197
197 mmu_queue_flush(state); 198 mmu_queue_flush(state);
198 paravirt_leave_lazy(paravirt_get_lazy_mode()); 199 paravirt_leave_lazy_mmu();
199 state->mode = paravirt_get_lazy_mode(); 200 state->mode = paravirt_get_lazy_mode();
200} 201}
201 202
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void)
230 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; 231 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
231 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; 232 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
232 } 233 }
234#ifdef CONFIG_X86_IO_APIC
235 no_timer_check = 1;
236#endif
233} 237}
234 238
235void __init kvm_guest_init(void) 239void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 453b5795a5c6..366baa179913 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,25 +13,13 @@
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15 */ 15 */
16#include <linux/platform_device.h>
17#include <linux/capability.h>
18#include <linux/miscdevice.h>
19#include <linux/firmware.h> 16#include <linux/firmware.h>
20#include <linux/spinlock.h>
21#include <linux/cpumask.h>
22#include <linux/pci_ids.h> 17#include <linux/pci_ids.h>
23#include <linux/uaccess.h> 18#include <linux/uaccess.h>
24#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
25#include <linux/kernel.h> 20#include <linux/kernel.h>
26#include <linux/module.h> 21#include <linux/module.h>
27#include <linux/mutex.h>
28#include <linux/sched.h>
29#include <linux/init.h>
30#include <linux/slab.h>
31#include <linux/cpu.h>
32#include <linux/pci.h> 22#include <linux/pci.h>
33#include <linux/fs.h>
34#include <linux/mm.h>
35 23
36#include <asm/microcode.h> 24#include <asm/microcode.h>
37#include <asm/processor.h> 25#include <asm/processor.h>
@@ -79,9 +67,6 @@ struct microcode_amd {
79#define UCODE_CONTAINER_SECTION_HDR 8 67#define UCODE_CONTAINER_SECTION_HDR 8
80#define UCODE_CONTAINER_HEADER_SIZE 12 68#define UCODE_CONTAINER_HEADER_SIZE 12
81 69
82/* serialize access to the physical write */
83static DEFINE_SPINLOCK(microcode_update_lock);
84
85static struct equiv_cpu_entry *equiv_cpu_table; 70static struct equiv_cpu_entry *equiv_cpu_table;
86 71
87static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 72static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
144 return 1; 129 return 1;
145} 130}
146 131
147static void apply_microcode_amd(int cpu) 132static int apply_microcode_amd(int cpu)
148{ 133{
149 unsigned long flags;
150 u32 rev, dummy; 134 u32 rev, dummy;
151 int cpu_num = raw_smp_processor_id(); 135 int cpu_num = raw_smp_processor_id();
152 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; 136 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)
156 BUG_ON(cpu_num != cpu); 140 BUG_ON(cpu_num != cpu);
157 141
158 if (mc_amd == NULL) 142 if (mc_amd == NULL)
159 return; 143 return 0;
160 144
161 spin_lock_irqsave(&microcode_update_lock, flags);
162 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); 145 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
163 /* get patch id after patching */ 146 /* get patch id after patching */
164 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); 147 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
165 spin_unlock_irqrestore(&microcode_update_lock, flags);
166 148
167 /* check current patch id and patch's id for match */ 149 /* check current patch id and patch's id for match */
168 if (rev != mc_amd->hdr.patch_id) { 150 if (rev != mc_amd->hdr.patch_id) {
169 printk(KERN_ERR "microcode: CPU%d: update failed " 151 printk(KERN_ERR "microcode: CPU%d: update failed "
170 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
171 return; 153 return -1;
172 } 154 }
173 155
174 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
175 cpu, rev); 157 cpu, rev);
176 158
177 uci->cpu_sig.rev = rev; 159 uci->cpu_sig.rev = rev;
160
161 return 0;
178} 162}
179 163
180static int get_ucode_data(void *to, const u8 *from, size_t n) 164static int get_ucode_data(void *to, const u8 *from, size_t n)
@@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf)
257 241
258static void free_equiv_cpu_table(void) 242static void free_equiv_cpu_table(void)
259{ 243{
260 if (equiv_cpu_table) { 244 vfree(equiv_cpu_table);
261 vfree(equiv_cpu_table); 245 equiv_cpu_table = NULL;
262 equiv_cpu_table = NULL;
263 }
264} 246}
265 247
266static int generic_load_microcode(int cpu, const u8 *data, size_t size) 248static enum ucode_state
249generic_load_microcode(int cpu, const u8 *data, size_t size)
267{ 250{
268 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 251 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
269 const u8 *ucode_ptr = data; 252 const u8 *ucode_ptr = data;
@@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
272 int new_rev = uci->cpu_sig.rev; 255 int new_rev = uci->cpu_sig.rev;
273 unsigned int leftover; 256 unsigned int leftover;
274 unsigned long offset; 257 unsigned long offset;
258 enum ucode_state state = UCODE_OK;
275 259
276 offset = install_equiv_cpu_table(ucode_ptr); 260 offset = install_equiv_cpu_table(ucode_ptr);
277 if (!offset) { 261 if (!offset) {
278 printk(KERN_ERR "microcode: failed to create " 262 printk(KERN_ERR "microcode: failed to create "
279 "equivalent cpu table\n"); 263 "equivalent cpu table\n");
280 return -EINVAL; 264 return UCODE_ERROR;
281 } 265 }
282 266
283 ucode_ptr += offset; 267 ucode_ptr += offset;
@@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
293 277
294 mc_header = (struct microcode_header_amd *)mc; 278 mc_header = (struct microcode_header_amd *)mc;
295 if (get_matching_microcode(cpu, mc, new_rev)) { 279 if (get_matching_microcode(cpu, mc, new_rev)) {
296 if (new_mc) 280 vfree(new_mc);
297 vfree(new_mc);
298 new_rev = mc_header->patch_id; 281 new_rev = mc_header->patch_id;
299 new_mc = mc; 282 new_mc = mc;
300 } else 283 } else
@@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
306 289
307 if (new_mc) { 290 if (new_mc) {
308 if (!leftover) { 291 if (!leftover) {
309 if (uci->mc) 292 vfree(uci->mc);
310 vfree(uci->mc);
311 uci->mc = new_mc; 293 uci->mc = new_mc;
312 pr_debug("microcode: CPU%d found a matching microcode " 294 pr_debug("microcode: CPU%d found a matching microcode "
313 "update with version 0x%x (current=0x%x)\n", 295 "update with version 0x%x (current=0x%x)\n",
314 cpu, new_rev, uci->cpu_sig.rev); 296 cpu, new_rev, uci->cpu_sig.rev);
315 } else 297 } else {
316 vfree(new_mc); 298 vfree(new_mc);
317 } 299 state = UCODE_ERROR;
300 }
301 } else
302 state = UCODE_NFOUND;
318 303
319 free_equiv_cpu_table(); 304 free_equiv_cpu_table();
320 305
321 return (int)leftover; 306 return state;
322} 307}
323 308
324static int request_microcode_fw(int cpu, struct device *device) 309static enum ucode_state request_microcode_fw(int cpu, struct device *device)
325{ 310{
326 const char *fw_name = "amd-ucode/microcode_amd.bin"; 311 const char *fw_name = "amd-ucode/microcode_amd.bin";
327 const struct firmware *firmware; 312 const struct firmware *firmware;
328 int ret; 313 enum ucode_state ret;
329
330 /* We should bind the task to the CPU */
331 BUG_ON(cpu != raw_smp_processor_id());
332 314
333 ret = request_firmware(&firmware, fw_name, device); 315 if (request_firmware(&firmware, fw_name, device)) {
334 if (ret) {
335 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); 316 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
336 return ret; 317 return UCODE_NFOUND;
337 } 318 }
338 319
339 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 320 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
@@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device)
343 return ret; 324 return ret;
344} 325}
345 326
346static int request_microcode_user(int cpu, const void __user *buf, size_t size) 327static enum ucode_state
328request_microcode_user(int cpu, const void __user *buf, size_t size)
347{ 329{
348 printk(KERN_INFO "microcode: AMD microcode update via " 330 printk(KERN_INFO "microcode: AMD microcode update via "
349 "/dev/cpu/microcode not supported\n"); 331 "/dev/cpu/microcode not supported\n");
350 return -1; 332 return UCODE_ERROR;
351} 333}
352 334
353static void microcode_fini_cpu_amd(int cpu) 335static void microcode_fini_cpu_amd(int cpu)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 98c470c069d1..9371448290ac 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -71,27 +71,18 @@
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h> 73#include <linux/platform_device.h>
74#include <linux/capability.h>
75#include <linux/miscdevice.h> 74#include <linux/miscdevice.h>
76#include <linux/firmware.h> 75#include <linux/capability.h>
77#include <linux/smp_lock.h> 76#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
79#include <linux/cpumask.h>
80#include <linux/uaccess.h>
81#include <linux/vmalloc.h>
82#include <linux/kernel.h> 77#include <linux/kernel.h>
83#include <linux/module.h> 78#include <linux/module.h>
84#include <linux/mutex.h> 79#include <linux/mutex.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
88#include <linux/cpu.h> 80#include <linux/cpu.h>
89#include <linux/fs.h> 81#include <linux/fs.h>
90#include <linux/mm.h> 82#include <linux/mm.h>
91 83
92#include <asm/microcode.h> 84#include <asm/microcode.h>
93#include <asm/processor.h> 85#include <asm/processor.h>
94#include <asm/msr.h>
95 86
96MODULE_DESCRIPTION("Microcode Update Driver"); 87MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 88MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");
101 92
102static struct microcode_ops *microcode_ops; 93static struct microcode_ops *microcode_ops;
103 94
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ 95/*
96 * Synchronization.
97 *
98 * All non cpu-hotplug-callback call sites use:
99 *
100 * - microcode_mutex to synchronize with each other;
101 * - get/put_online_cpus() to synchronize with
102 * the cpu-hotplug-callback call sites.
103 *
104 * We guarantee that only a single cpu is being
105 * updated at any particular moment of time.
106 */
105static DEFINE_MUTEX(microcode_mutex); 107static DEFINE_MUTEX(microcode_mutex);
106 108
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; 109struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info); 110EXPORT_SYMBOL_GPL(ucode_cpu_info);
109 111
112/*
113 * Operations that are run on a target cpu:
114 */
115
116struct cpu_info_ctx {
117 struct cpu_signature *cpu_sig;
118 int err;
119};
120
121static void collect_cpu_info_local(void *arg)
122{
123 struct cpu_info_ctx *ctx = arg;
124
125 ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
126 ctx->cpu_sig);
127}
128
129static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
130{
131 struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
132 int ret;
133
134 ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
135 if (!ret)
136 ret = ctx.err;
137
138 return ret;
139}
140
141static int collect_cpu_info(int cpu)
142{
143 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
144 int ret;
145
146 memset(uci, 0, sizeof(*uci));
147
148 ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
149 if (!ret)
150 uci->valid = 1;
151
152 return ret;
153}
154
155struct apply_microcode_ctx {
156 int err;
157};
158
159static void apply_microcode_local(void *arg)
160{
161 struct apply_microcode_ctx *ctx = arg;
162
163 ctx->err = microcode_ops->apply_microcode(smp_processor_id());
164}
165
166static int apply_microcode_on_target(int cpu)
167{
168 struct apply_microcode_ctx ctx = { .err = 0 };
169 int ret;
170
171 ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
172 if (!ret)
173 ret = ctx.err;
174
175 return ret;
176}
177
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE 178#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111static int do_microcode_update(const void __user *buf, size_t size) 179static int do_microcode_update(const void __user *buf, size_t size)
112{ 180{
113 cpumask_t old;
114 int error = 0; 181 int error = 0;
115 int cpu; 182 int cpu;
116 183
117 old = current->cpus_allowed;
118
119 for_each_online_cpu(cpu) { 184 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 185 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
186 enum ucode_state ustate;
121 187
122 if (!uci->valid) 188 if (!uci->valid)
123 continue; 189 continue;
124 190
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 191 ustate = microcode_ops->request_microcode_user(cpu, buf, size);
126 error = microcode_ops->request_microcode_user(cpu, buf, size); 192 if (ustate == UCODE_ERROR) {
127 if (error < 0) 193 error = -1;
128 goto out; 194 break;
129 if (!error) 195 } else if (ustate == UCODE_OK)
130 microcode_ops->apply_microcode(cpu); 196 apply_microcode_on_target(cpu);
131 } 197 }
132out: 198
133 set_cpus_allowed_ptr(current, &old);
134 return error; 199 return error;
135} 200}
136 201
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)
143static ssize_t microcode_write(struct file *file, const char __user *buf, 208static ssize_t microcode_write(struct file *file, const char __user *buf,
144 size_t len, loff_t *ppos) 209 size_t len, loff_t *ppos)
145{ 210{
146 ssize_t ret; 211 ssize_t ret = -EINVAL;
147 212
148 if ((len >> PAGE_SHIFT) > num_physpages) { 213 if ((len >> PAGE_SHIFT) > num_physpages) {
149 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", 214 pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
150 num_physpages); 215 return ret;
151 return -EINVAL;
152 } 216 }
153 217
154 get_online_cpus(); 218 get_online_cpus();
155 mutex_lock(&microcode_mutex); 219 mutex_lock(&microcode_mutex);
156 220
157 ret = do_microcode_update(buf, len); 221 if (do_microcode_update(buf, len) == 0)
158 if (!ret)
159 ret = (ssize_t)len; 222 ret = (ssize_t)len;
160 223
161 mutex_unlock(&microcode_mutex); 224 mutex_unlock(&microcode_mutex);
@@ -165,15 +228,16 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
165} 228}
166 229
167static const struct file_operations microcode_fops = { 230static const struct file_operations microcode_fops = {
168 .owner = THIS_MODULE, 231 .owner = THIS_MODULE,
169 .write = microcode_write, 232 .write = microcode_write,
170 .open = microcode_open, 233 .open = microcode_open,
171}; 234};
172 235
173static struct miscdevice microcode_dev = { 236static struct miscdevice microcode_dev = {
174 .minor = MICROCODE_MINOR, 237 .minor = MICROCODE_MINOR,
175 .name = "microcode", 238 .name = "microcode",
176 .fops = &microcode_fops, 239 .devnode = "cpu/microcode",
240 .fops = &microcode_fops,
177}; 241};
178 242
179static int __init microcode_dev_init(void) 243static int __init microcode_dev_init(void)
@@ -182,9 +246,7 @@ static int __init microcode_dev_init(void)
182 246
183 error = misc_register(&microcode_dev); 247 error = misc_register(&microcode_dev);
184 if (error) { 248 if (error) {
185 printk(KERN_ERR 249 pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
186 "microcode: can't misc_register on minor=%d\n",
187 MICROCODE_MINOR);
188 return error; 250 return error;
189 } 251 }
190 252
@@ -205,42 +267,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
205/* fake device for request_firmware */ 267/* fake device for request_firmware */
206static struct platform_device *microcode_pdev; 268static struct platform_device *microcode_pdev;
207 269
208static long reload_for_cpu(void *unused) 270static int reload_for_cpu(int cpu)
209{ 271{
210 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); 272 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
211 int err = 0; 273 int err = 0;
212 274
213 mutex_lock(&microcode_mutex); 275 mutex_lock(&microcode_mutex);
214 if (uci->valid) { 276 if (uci->valid) {
215 err = microcode_ops->request_microcode_fw(smp_processor_id(), 277 enum ucode_state ustate;
216 &microcode_pdev->dev); 278
217 if (!err) 279 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
218 microcode_ops->apply_microcode(smp_processor_id()); 280 if (ustate == UCODE_OK)
281 apply_microcode_on_target(cpu);
282 else
283 if (ustate == UCODE_ERROR)
284 err = -EINVAL;
219 } 285 }
220 mutex_unlock(&microcode_mutex); 286 mutex_unlock(&microcode_mutex);
287
221 return err; 288 return err;
222} 289}
223 290
224static ssize_t reload_store(struct sys_device *dev, 291static ssize_t reload_store(struct sys_device *dev,
225 struct sysdev_attribute *attr, 292 struct sysdev_attribute *attr,
226 const char *buf, size_t sz) 293 const char *buf, size_t size)
227{ 294{
228 char *end; 295 unsigned long val;
229 unsigned long val = simple_strtoul(buf, &end, 0);
230 int err = 0;
231 int cpu = dev->id; 296 int cpu = dev->id;
297 int ret = 0;
298 char *end;
232 299
300 val = simple_strtoul(buf, &end, 0);
233 if (end == buf) 301 if (end == buf)
234 return -EINVAL; 302 return -EINVAL;
303
235 if (val == 1) { 304 if (val == 1) {
236 get_online_cpus(); 305 get_online_cpus();
237 if (cpu_online(cpu)) 306 if (cpu_online(cpu))
238 err = work_on_cpu(cpu, reload_for_cpu, NULL); 307 ret = reload_for_cpu(cpu);
239 put_online_cpus(); 308 put_online_cpus();
240 } 309 }
241 if (err) 310
242 return err; 311 if (!ret)
243 return sz; 312 ret = size;
313
314 return ret;
244} 315}
245 316
246static ssize_t version_show(struct sys_device *dev, 317static ssize_t version_show(struct sys_device *dev,
@@ -271,11 +342,11 @@ static struct attribute *mc_default_attrs[] = {
271}; 342};
272 343
273static struct attribute_group mc_attr_group = { 344static struct attribute_group mc_attr_group = {
274 .attrs = mc_default_attrs, 345 .attrs = mc_default_attrs,
275 .name = "microcode", 346 .name = "microcode",
276}; 347};
277 348
278static void __microcode_fini_cpu(int cpu) 349static void microcode_fini_cpu(int cpu)
279{ 350{
280 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 351 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
281 352
@@ -283,103 +354,68 @@ static void __microcode_fini_cpu(int cpu)
283 uci->valid = 0; 354 uci->valid = 0;
284} 355}
285 356
286static void microcode_fini_cpu(int cpu) 357static enum ucode_state microcode_resume_cpu(int cpu)
287{
288 mutex_lock(&microcode_mutex);
289 __microcode_fini_cpu(cpu);
290 mutex_unlock(&microcode_mutex);
291}
292
293static void collect_cpu_info(int cpu)
294{ 358{
295 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 359 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
296 360
297 memset(uci, 0, sizeof(*uci)); 361 if (!uci->mc)
298 if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) 362 return UCODE_NFOUND;
299 uci->valid = 1; 363
364 pr_debug("microcode: CPU%d updated upon resume\n", cpu);
365 apply_microcode_on_target(cpu);
366
367 return UCODE_OK;
300} 368}
301 369
302static int microcode_resume_cpu(int cpu) 370static enum ucode_state microcode_init_cpu(int cpu)
303{ 371{
304 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 372 enum ucode_state ustate;
305 struct cpu_signature nsig;
306 373
307 pr_debug("microcode: CPU%d resumed\n", cpu); 374 if (collect_cpu_info(cpu))
375 return UCODE_ERROR;
308 376
309 if (!uci->mc) 377 /* --dimm. Trigger a delayed update? */
310 return 1; 378 if (system_state != SYSTEM_RUNNING)
379 return UCODE_NFOUND;
311 380
312 /* 381 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
313 * Let's verify that the 'cached' ucode does belong
314 * to this cpu (a bit of paranoia):
315 */
316 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
317 __microcode_fini_cpu(cpu);
318 printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
319 cpu);
320 return -1;
321 }
322 382
323 if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { 383 if (ustate == UCODE_OK) {
324 __microcode_fini_cpu(cpu); 384 pr_debug("microcode: CPU%d updated upon init\n", cpu);
325 printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", 385 apply_microcode_on_target(cpu);
326 cpu);
327 /* Should we look for a new ucode here? */
328 return 1;
329 } 386 }
330 387
331 return 0; 388 return ustate;
332} 389}
333 390
334static long microcode_update_cpu(void *unused) 391static enum ucode_state microcode_update_cpu(int cpu)
335{ 392{
336 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); 393 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
337 int err = 0; 394 enum ucode_state ustate;
338 395
339 /* 396 if (uci->valid)
340 * Check if the system resume is in progress (uci->valid != NULL), 397 ustate = microcode_resume_cpu(cpu);
341 * otherwise just request a firmware: 398 else
342 */ 399 ustate = microcode_init_cpu(cpu);
343 if (uci->valid) {
344 err = microcode_resume_cpu(smp_processor_id());
345 } else {
346 collect_cpu_info(smp_processor_id());
347 if (uci->valid && system_state == SYSTEM_RUNNING)
348 err = microcode_ops->request_microcode_fw(
349 smp_processor_id(),
350 &microcode_pdev->dev);
351 }
352 if (!err)
353 microcode_ops->apply_microcode(smp_processor_id());
354 return err;
355}
356 400
357static int microcode_init_cpu(int cpu) 401 return ustate;
358{
359 int err;
360 mutex_lock(&microcode_mutex);
361 err = work_on_cpu(cpu, microcode_update_cpu, NULL);
362 mutex_unlock(&microcode_mutex);
363
364 return err;
365} 402}
366 403
367static int mc_sysdev_add(struct sys_device *sys_dev) 404static int mc_sysdev_add(struct sys_device *sys_dev)
368{ 405{
369 int err, cpu = sys_dev->id; 406 int err, cpu = sys_dev->id;
370 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
371 407
372 if (!cpu_online(cpu)) 408 if (!cpu_online(cpu))
373 return 0; 409 return 0;
374 410
375 pr_debug("microcode: CPU%d added\n", cpu); 411 pr_debug("microcode: CPU%d added\n", cpu);
376 memset(uci, 0, sizeof(*uci));
377 412
378 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 413 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
379 if (err) 414 if (err)
380 return err; 415 return err;
381 416
382 err = microcode_init_cpu(cpu); 417 if (microcode_init_cpu(cpu) == UCODE_ERROR)
418 err = -EINVAL;
383 419
384 return err; 420 return err;
385} 421}
@@ -400,19 +436,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
400static int mc_sysdev_resume(struct sys_device *dev) 436static int mc_sysdev_resume(struct sys_device *dev)
401{ 437{
402 int cpu = dev->id; 438 int cpu = dev->id;
439 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
403 440
404 if (!cpu_online(cpu)) 441 if (!cpu_online(cpu))
405 return 0; 442 return 0;
406 443
407 /* only CPU 0 will apply ucode here */ 444 /*
408 microcode_update_cpu(NULL); 445 * All non-bootup cpus are still disabled,
446 * so only CPU 0 will apply ucode here.
447 *
448 * Moreover, there can be no concurrent
449 * updates from any other places at this point.
450 */
451 WARN_ON(cpu != 0);
452
453 if (uci->valid && uci->mc)
454 microcode_ops->apply_microcode(cpu);
455
409 return 0; 456 return 0;
410} 457}
411 458
412static struct sysdev_driver mc_sysdev_driver = { 459static struct sysdev_driver mc_sysdev_driver = {
413 .add = mc_sysdev_add, 460 .add = mc_sysdev_add,
414 .remove = mc_sysdev_remove, 461 .remove = mc_sysdev_remove,
415 .resume = mc_sysdev_resume, 462 .resume = mc_sysdev_resume,
416}; 463};
417 464
418static __cpuinit int 465static __cpuinit int
@@ -425,15 +472,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
425 switch (action) { 472 switch (action) {
426 case CPU_ONLINE: 473 case CPU_ONLINE:
427 case CPU_ONLINE_FROZEN: 474 case CPU_ONLINE_FROZEN:
428 if (microcode_init_cpu(cpu)) 475 microcode_update_cpu(cpu);
429 printk(KERN_ERR "microcode: failed to init CPU%d\n",
430 cpu);
431 case CPU_DOWN_FAILED: 476 case CPU_DOWN_FAILED:
432 case CPU_DOWN_FAILED_FROZEN: 477 case CPU_DOWN_FAILED_FROZEN:
433 pr_debug("microcode: CPU%d added\n", cpu); 478 pr_debug("microcode: CPU%d added\n", cpu);
434 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 479 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
435 printk(KERN_ERR "microcode: Failed to create the sysfs " 480 pr_err("microcode: Failed to create group for CPU%d\n", cpu);
436 "group for CPU%d\n", cpu);
437 break; 481 break;
438 case CPU_DOWN_PREPARE: 482 case CPU_DOWN_PREPARE:
439 case CPU_DOWN_PREPARE_FROZEN: 483 case CPU_DOWN_PREPARE_FROZEN:
@@ -465,13 +509,10 @@ static int __init microcode_init(void)
465 microcode_ops = init_amd_microcode(); 509 microcode_ops = init_amd_microcode();
466 510
467 if (!microcode_ops) { 511 if (!microcode_ops) {
468 printk(KERN_ERR "microcode: no support for this CPU vendor\n"); 512 pr_err("microcode: no support for this CPU vendor\n");
469 return -ENODEV; 513 return -ENODEV;
470 } 514 }
471 515
472 error = microcode_dev_init();
473 if (error)
474 return error;
475 microcode_pdev = platform_device_register_simple("microcode", -1, 516 microcode_pdev = platform_device_register_simple("microcode", -1,
476 NULL, 0); 517 NULL, 0);
477 if (IS_ERR(microcode_pdev)) { 518 if (IS_ERR(microcode_pdev)) {
@@ -480,23 +521,31 @@ static int __init microcode_init(void)
480 } 521 }
481 522
482 get_online_cpus(); 523 get_online_cpus();
524 mutex_lock(&microcode_mutex);
525
483 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); 526 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
527
528 mutex_unlock(&microcode_mutex);
484 put_online_cpus(); 529 put_online_cpus();
530
485 if (error) { 531 if (error) {
486 microcode_dev_exit();
487 platform_device_unregister(microcode_pdev); 532 platform_device_unregister(microcode_pdev);
488 return error; 533 return error;
489 } 534 }
490 535
536 error = microcode_dev_init();
537 if (error)
538 return error;
539
491 register_hotcpu_notifier(&mc_cpu_notifier); 540 register_hotcpu_notifier(&mc_cpu_notifier);
492 541
493 printk(KERN_INFO 542 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
494 "Microcode Update Driver: v" MICROCODE_VERSION
495 " <tigran@aivazian.fsnet.co.uk>," 543 " <tigran@aivazian.fsnet.co.uk>,"
496 " Peter Oruba\n"); 544 " Peter Oruba\n");
497 545
498 return 0; 546 return 0;
499} 547}
548module_init(microcode_init);
500 549
501static void __exit microcode_exit(void) 550static void __exit microcode_exit(void)
502{ 551{
@@ -505,16 +554,17 @@ static void __exit microcode_exit(void)
505 unregister_hotcpu_notifier(&mc_cpu_notifier); 554 unregister_hotcpu_notifier(&mc_cpu_notifier);
506 555
507 get_online_cpus(); 556 get_online_cpus();
557 mutex_lock(&microcode_mutex);
558
508 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); 559 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
560
561 mutex_unlock(&microcode_mutex);
509 put_online_cpus(); 562 put_online_cpus();
510 563
511 platform_device_unregister(microcode_pdev); 564 platform_device_unregister(microcode_pdev);
512 565
513 microcode_ops = NULL; 566 microcode_ops = NULL;
514 567
515 printk(KERN_INFO 568 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
516 "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
517} 569}
518
519module_init(microcode_init);
520module_exit(microcode_exit); 570module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 149b9ec7c1ab..0d334ddd0a96 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,24 +70,11 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h>
74#include <linux/capability.h>
75#include <linux/miscdevice.h>
76#include <linux/firmware.h> 73#include <linux/firmware.h>
77#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
79#include <linux/cpumask.h>
80#include <linux/uaccess.h> 74#include <linux/uaccess.h>
81#include <linux/vmalloc.h>
82#include <linux/kernel.h> 75#include <linux/kernel.h>
83#include <linux/module.h> 76#include <linux/module.h>
84#include <linux/mutex.h> 77#include <linux/vmalloc.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
88#include <linux/cpu.h>
89#include <linux/fs.h>
90#include <linux/mm.h>
91 78
92#include <asm/microcode.h> 79#include <asm/microcode.h>
93#include <asm/processor.h> 80#include <asm/processor.h>
@@ -150,13 +137,9 @@ struct extended_sigtable {
150 137
151#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) 138#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
152 139
153/* serialize access to the physical write to MSR 0x79 */
154static DEFINE_SPINLOCK(microcode_update_lock);
155
156static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) 140static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
157{ 141{
158 struct cpuinfo_x86 *c = &cpu_data(cpu_num); 142 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
159 unsigned long flags;
160 unsigned int val[2]; 143 unsigned int val[2];
161 144
162 memset(csig, 0, sizeof(*csig)); 145 memset(csig, 0, sizeof(*csig));
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
176 csig->pf = 1 << ((val[1] >> 18) & 7); 159 csig->pf = 1 << ((val[1] >> 18) & 7);
177 } 160 }
178 161
179 /* serialize access to the physical write to MSR 0x79 */
180 spin_lock_irqsave(&microcode_update_lock, flags);
181
182 wrmsr(MSR_IA32_UCODE_REV, 0, 0); 162 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
183 /* see notes above for revision 1.07. Apparent chip bug */ 163 /* see notes above for revision 1.07. Apparent chip bug */
184 sync_core(); 164 sync_core();
185 /* get the current revision from MSR 0x8B */ 165 /* get the current revision from MSR 0x8B */
186 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 166 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
187 spin_unlock_irqrestore(&microcode_update_lock, flags);
188 167
189 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", 168 printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
190 csig->sig, csig->pf, csig->rev); 169 cpu_num, csig->sig, csig->pf, csig->rev);
191 170
192 return 0; 171 return 0;
193} 172}
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
318 return 0; 297 return 0;
319} 298}
320 299
321static void apply_microcode(int cpu) 300static int apply_microcode(int cpu)
322{ 301{
323 struct microcode_intel *mc_intel; 302 struct microcode_intel *mc_intel;
324 struct ucode_cpu_info *uci; 303 struct ucode_cpu_info *uci;
325 unsigned long flags;
326 unsigned int val[2]; 304 unsigned int val[2];
327 int cpu_num; 305 int cpu_num;
328 306
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu)
334 BUG_ON(cpu_num != cpu); 312 BUG_ON(cpu_num != cpu);
335 313
336 if (mc_intel == NULL) 314 if (mc_intel == NULL)
337 return; 315 return 0;
338
339 /* serialize access to the physical write to MSR 0x79 */
340 spin_lock_irqsave(&microcode_update_lock, flags);
341 316
342 /* write microcode via MSR 0x79 */ 317 /* write microcode via MSR 0x79 */
343 wrmsr(MSR_IA32_UCODE_WRITE, 318 wrmsr(MSR_IA32_UCODE_WRITE,
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu)
351 /* get the current revision from MSR 0x8B */ 326 /* get the current revision from MSR 0x8B */
352 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 327 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
353 328
354 spin_unlock_irqrestore(&microcode_update_lock, flags);
355 if (val[1] != mc_intel->hdr.rev) { 329 if (val[1] != mc_intel->hdr.rev) {
356 printk(KERN_ERR "microcode: CPU%d update from revision " 330 printk(KERN_ERR "microcode: CPU%d update "
357 "0x%x to 0x%x failed\n", 331 "to revision 0x%x failed\n",
358 cpu_num, uci->cpu_sig.rev, val[1]); 332 cpu_num, mc_intel->hdr.rev);
359 return; 333 return -1;
360 } 334 }
361 printk(KERN_INFO "microcode: CPU%d updated from revision " 335 printk(KERN_INFO "microcode: CPU%d updated to revision "
362 "0x%x to 0x%x, date = %04x-%02x-%02x \n", 336 "0x%x, date = %04x-%02x-%02x \n",
363 cpu_num, uci->cpu_sig.rev, val[1], 337 cpu_num, val[1],
364 mc_intel->hdr.date & 0xffff, 338 mc_intel->hdr.date & 0xffff,
365 mc_intel->hdr.date >> 24, 339 mc_intel->hdr.date >> 24,
366 (mc_intel->hdr.date >> 16) & 0xff); 340 (mc_intel->hdr.date >> 16) & 0xff);
367 341
368 uci->cpu_sig.rev = val[1]; 342 uci->cpu_sig.rev = val[1];
343
344 return 0;
369} 345}
370 346
371static int generic_load_microcode(int cpu, void *data, size_t size, 347static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
372 int (*get_ucode_data)(void *, const void *, size_t)) 348 int (*get_ucode_data)(void *, const void *, size_t))
373{ 349{
374 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 350 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
375 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 351 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
376 int new_rev = uci->cpu_sig.rev; 352 int new_rev = uci->cpu_sig.rev;
377 unsigned int leftover = size; 353 unsigned int leftover = size;
354 enum ucode_state state = UCODE_OK;
378 355
379 while (leftover) { 356 while (leftover) {
380 struct microcode_header_intel mc_header; 357 struct microcode_header_intel mc_header;
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
412 leftover -= mc_size; 389 leftover -= mc_size;
413 } 390 }
414 391
415 if (!new_mc) 392 if (leftover) {
393 if (new_mc)
394 vfree(new_mc);
395 state = UCODE_ERROR;
416 goto out; 396 goto out;
397 }
417 398
418 if (leftover) { 399 if (!new_mc) {
419 vfree(new_mc); 400 state = UCODE_NFOUND;
420 goto out; 401 goto out;
421 } 402 }
422 403
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
427 pr_debug("microcode: CPU%d found a matching microcode update with" 408 pr_debug("microcode: CPU%d found a matching microcode update with"
428 " version 0x%x (current=0x%x)\n", 409 " version 0x%x (current=0x%x)\n",
429 cpu, new_rev, uci->cpu_sig.rev); 410 cpu, new_rev, uci->cpu_sig.rev);
430 411out:
431 out: 412 return state;
432 return (int)leftover;
433} 413}
434 414
435static int get_ucode_fw(void *to, const void *from, size_t n) 415static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
438 return 0; 418 return 0;
439} 419}
440 420
441static int request_microcode_fw(int cpu, struct device *device) 421static enum ucode_state request_microcode_fw(int cpu, struct device *device)
442{ 422{
443 char name[30]; 423 char name[30];
444 struct cpuinfo_x86 *c = &cpu_data(cpu); 424 struct cpuinfo_x86 *c = &cpu_data(cpu);
445 const struct firmware *firmware; 425 const struct firmware *firmware;
446 int ret; 426 enum ucode_state ret;
447 427
448 /* We should bind the task to the CPU */
449 BUG_ON(cpu != raw_smp_processor_id());
450 sprintf(name, "intel-ucode/%02x-%02x-%02x", 428 sprintf(name, "intel-ucode/%02x-%02x-%02x",
451 c->x86, c->x86_model, c->x86_mask); 429 c->x86, c->x86_model, c->x86_mask);
452 ret = request_firmware(&firmware, name, device); 430
453 if (ret) { 431 if (request_firmware(&firmware, name, device)) {
454 pr_debug("microcode: data file %s load failed\n", name); 432 pr_debug("microcode: data file %s load failed\n", name);
455 return ret; 433 return UCODE_NFOUND;
456 } 434 }
457 435
458 ret = generic_load_microcode(cpu, (void *)firmware->data, 436 ret = generic_load_microcode(cpu, (void *)firmware->data,
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
468 return copy_from_user(to, from, n); 446 return copy_from_user(to, from, n);
469} 447}
470 448
471static int request_microcode_user(int cpu, const void __user *buf, size_t size) 449static enum ucode_state
450request_microcode_user(int cpu, const void __user *buf, size_t size)
472{ 451{
473 /* We should bind the task to the CPU */
474 BUG_ON(cpu != raw_smp_processor_id());
475
476 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); 452 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
477} 453}
478 454
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module.c
index c23880b90b5c..89f386f044e4 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module.c
@@ -1,6 +1,5 @@
1/* Kernel module help for x86-64 1/* Kernel module help for x86.
2 Copyright (C) 2001 Rusty Russell. 2 Copyright (C) 2001 Rusty Russell.
3 Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
4 3
5 This program is free software; you can redistribute it and/or modify 4 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 5 it under the terms of the GNU General Public License as published by
@@ -22,23 +21,18 @@
22#include <linux/fs.h> 21#include <linux/fs.h>
23#include <linux/string.h> 22#include <linux/string.h>
24#include <linux/kernel.h> 23#include <linux/kernel.h>
25#include <linux/mm.h>
26#include <linux/slab.h>
27#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h>
28 26
29#include <asm/system.h> 27#include <asm/system.h>
30#include <asm/page.h> 28#include <asm/page.h>
31#include <asm/pgtable.h> 29#include <asm/pgtable.h>
32 30
31#if 0
32#define DEBUGP printk
33#else
33#define DEBUGP(fmt...) 34#define DEBUGP(fmt...)
34 35#endif
35#ifndef CONFIG_UML
36void module_free(struct module *mod, void *module_region)
37{
38 vfree(module_region);
39 /* FIXME: If module_region == mod->init_region, trim exception
40 table entries. */
41}
42 36
43void *module_alloc(unsigned long size) 37void *module_alloc(unsigned long size)
44{ 38{
@@ -54,9 +48,15 @@ void *module_alloc(unsigned long size)
54 if (!area) 48 if (!area)
55 return NULL; 49 return NULL;
56 50
57 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); 51 return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
52 PAGE_KERNEL_EXEC);
53}
54
55/* Free memory returned from module_alloc */
56void module_free(struct module *mod, void *module_region)
57{
58 vfree(module_region);
58} 59}
59#endif
60 60
61/* We don't need anything special. */ 61/* We don't need anything special. */
62int module_frob_arch_sections(Elf_Ehdr *hdr, 62int module_frob_arch_sections(Elf_Ehdr *hdr,
@@ -67,6 +67,58 @@ int module_frob_arch_sections(Elf_Ehdr *hdr,
67 return 0; 67 return 0;
68} 68}
69 69
70#ifdef CONFIG_X86_32
71int apply_relocate(Elf32_Shdr *sechdrs,
72 const char *strtab,
73 unsigned int symindex,
74 unsigned int relsec,
75 struct module *me)
76{
77 unsigned int i;
78 Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
79 Elf32_Sym *sym;
80 uint32_t *location;
81
82 DEBUGP("Applying relocate section %u to %u\n", relsec,
83 sechdrs[relsec].sh_info);
84 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
85 /* This is where to make the change */
86 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
87 + rel[i].r_offset;
88 /* This is the symbol it is referring to. Note that all
89 undefined symbols have been resolved. */
90 sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
91 + ELF32_R_SYM(rel[i].r_info);
92
93 switch (ELF32_R_TYPE(rel[i].r_info)) {
94 case R_386_32:
95 /* We add the value into the location given */
96 *location += sym->st_value;
97 break;
98 case R_386_PC32:
99 /* Add the value, subtract its postition */
100 *location += sym->st_value - (uint32_t)location;
101 break;
102 default:
103 printk(KERN_ERR "module %s: Unknown relocation: %u\n",
104 me->name, ELF32_R_TYPE(rel[i].r_info));
105 return -ENOEXEC;
106 }
107 }
108 return 0;
109}
110
111int apply_relocate_add(Elf32_Shdr *sechdrs,
112 const char *strtab,
113 unsigned int symindex,
114 unsigned int relsec,
115 struct module *me)
116{
117 printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
118 me->name);
119 return -ENOEXEC;
120}
121#else /*X86_64*/
70int apply_relocate_add(Elf64_Shdr *sechdrs, 122int apply_relocate_add(Elf64_Shdr *sechdrs,
71 const char *strtab, 123 const char *strtab,
72 unsigned int symindex, 124 unsigned int symindex,
@@ -147,6 +199,8 @@ int apply_relocate(Elf_Shdr *sechdrs,
147 return -ENOSYS; 199 return -ENOSYS;
148} 200}
149 201
202#endif
203
150int module_finalize(const Elf_Ehdr *hdr, 204int module_finalize(const Elf_Ehdr *hdr,
151 const Elf_Shdr *sechdrs, 205 const Elf_Shdr *sechdrs,
152 struct module *me) 206 struct module *me)
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
deleted file mode 100644
index 0edd819050e7..000000000000
--- a/arch/x86/kernel/module_32.c
+++ /dev/null
@@ -1,152 +0,0 @@
1/* Kernel module help for i386.
2 Copyright (C) 2001 Rusty Russell.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18#include <linux/moduleloader.h>
19#include <linux/elf.h>
20#include <linux/vmalloc.h>
21#include <linux/fs.h>
22#include <linux/string.h>
23#include <linux/kernel.h>
24#include <linux/bug.h>
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(fmt...)
30#endif
31
32void *module_alloc(unsigned long size)
33{
34 if (size == 0)
35 return NULL;
36 return vmalloc_exec(size);
37}
38
39
40/* Free memory returned from module_alloc */
41void module_free(struct module *mod, void *module_region)
42{
43 vfree(module_region);
44 /* FIXME: If module_region == mod->init_region, trim exception
45 table entries. */
46}
47
48/* We don't need anything special. */
49int module_frob_arch_sections(Elf_Ehdr *hdr,
50 Elf_Shdr *sechdrs,
51 char *secstrings,
52 struct module *mod)
53{
54 return 0;
55}
56
57int apply_relocate(Elf32_Shdr *sechdrs,
58 const char *strtab,
59 unsigned int symindex,
60 unsigned int relsec,
61 struct module *me)
62{
63 unsigned int i;
64 Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
65 Elf32_Sym *sym;
66 uint32_t *location;
67
68 DEBUGP("Applying relocate section %u to %u\n", relsec,
69 sechdrs[relsec].sh_info);
70 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
71 /* This is where to make the change */
72 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
73 + rel[i].r_offset;
74 /* This is the symbol it is referring to. Note that all
75 undefined symbols have been resolved. */
76 sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
77 + ELF32_R_SYM(rel[i].r_info);
78
79 switch (ELF32_R_TYPE(rel[i].r_info)) {
80 case R_386_32:
81 /* We add the value into the location given */
82 *location += sym->st_value;
83 break;
84 case R_386_PC32:
85 /* Add the value, subtract its postition */
86 *location += sym->st_value - (uint32_t)location;
87 break;
88 default:
89 printk(KERN_ERR "module %s: Unknown relocation: %u\n",
90 me->name, ELF32_R_TYPE(rel[i].r_info));
91 return -ENOEXEC;
92 }
93 }
94 return 0;
95}
96
97int apply_relocate_add(Elf32_Shdr *sechdrs,
98 const char *strtab,
99 unsigned int symindex,
100 unsigned int relsec,
101 struct module *me)
102{
103 printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
104 me->name);
105 return -ENOEXEC;
106}
107
108int module_finalize(const Elf_Ehdr *hdr,
109 const Elf_Shdr *sechdrs,
110 struct module *me)
111{
112 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
113 *para = NULL;
114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
115
116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
117 if (!strcmp(".text", secstrings + s->sh_name))
118 text = s;
119 if (!strcmp(".altinstructions", secstrings + s->sh_name))
120 alt = s;
121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
122 locks = s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s;
125 }
126
127 if (alt) {
128 /* patch .altinstructions */
129 void *aseg = (void *)alt->sh_addr;
130 apply_alternatives(aseg, aseg + alt->sh_size);
131 }
132 if (locks && text) {
133 void *lseg = (void *)locks->sh_addr;
134 void *tseg = (void *)text->sh_addr;
135 alternatives_smp_module_add(me, me->name,
136 lseg, lseg + locks->sh_size,
137 tseg, tseg + text->sh_size);
138 }
139
140 if (para) {
141 void *pseg = (void *)para->sh_addr;
142 apply_paravirt(pseg, pseg + para->sh_size);
143 }
144
145 return module_bug_finalize(hdr, sechdrs, me);
146}
147
148void module_arch_cleanup(struct module *mod)
149{
150 alternatives_smp_module_del(mod);
151 module_bug_cleanup(mod);
152}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 70fd7e414c15..651c93b28862 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -17,6 +17,7 @@
17#include <linux/acpi.h> 17#include <linux/acpi.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/pci.h>
20 21
21#include <asm/mtrr.h> 22#include <asm/mtrr.h>
22#include <asm/mpspec.h> 23#include <asm/mpspec.h>
@@ -870,24 +871,17 @@ static
870inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} 871inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
871#endif /* CONFIG_X86_IO_APIC */ 872#endif /* CONFIG_X86_IO_APIC */
872 873
873static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, 874static int
874 int count) 875check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
875{ 876{
876 if (!mpc_new_phys) { 877 int ret = 0;
877 pr_info("No spare slots, try to append...take your risk, " 878
878 "new mpc_length %x\n", count); 879 if (!mpc_new_phys || count <= mpc_new_length) {
879 } else { 880 WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
880 if (count <= mpc_new_length) 881 return -1;
881 pr_info("No spare slots, try to append..., "
882 "new mpc_length %x\n", count);
883 else {
884 pr_err("mpc_new_length %lx is too small\n",
885 mpc_new_length);
886 return -1;
887 }
888 } 882 }
889 883
890 return 0; 884 return ret;
891} 885}
892 886
893static int __init replace_intsrc_all(struct mpc_table *mpc, 887static int __init replace_intsrc_all(struct mpc_table *mpc,
@@ -946,7 +940,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
946 } else { 940 } else {
947 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 941 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
948 count += sizeof(struct mpc_intsrc); 942 count += sizeof(struct mpc_intsrc);
949 if (!check_slot(mpc_new_phys, mpc_new_length, count)) 943 if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
950 goto out; 944 goto out;
951 assign_to_mpc_intsrc(&mp_irqs[i], m); 945 assign_to_mpc_intsrc(&mp_irqs[i], m);
952 mpc->length = count; 946 mpc->length = count;
@@ -963,11 +957,14 @@ out:
963 return 0; 957 return 0;
964} 958}
965 959
966static int __initdata enable_update_mptable; 960int enable_update_mptable;
967 961
968static int __init update_mptable_setup(char *str) 962static int __init update_mptable_setup(char *str)
969{ 963{
970 enable_update_mptable = 1; 964 enable_update_mptable = 1;
965#ifdef CONFIG_PCI
966 pci_routeirq = 1;
967#endif
971 return 0; 968 return 0;
972} 969}
973early_param("update_mptable", update_mptable_setup); 970early_param("update_mptable", update_mptable_setup);
@@ -980,6 +977,9 @@ static int __initdata alloc_mptable;
980static int __init parse_alloc_mptable_opt(char *p) 977static int __init parse_alloc_mptable_opt(char *p)
981{ 978{
982 enable_update_mptable = 1; 979 enable_update_mptable = 1;
980#ifdef CONFIG_PCI
981 pci_routeirq = 1;
982#endif
983 alloc_mptable = 1; 983 alloc_mptable = 1;
984 if (!p) 984 if (!p)
985 return 0; 985 return 0;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 3cf3413ec626..98fd6cd4e3a4 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -196,6 +196,11 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
196 .notifier_call = msr_class_cpu_callback, 196 .notifier_call = msr_class_cpu_callback,
197}; 197};
198 198
199static char *msr_nodename(struct device *dev)
200{
201 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
202}
203
199static int __init msr_init(void) 204static int __init msr_init(void)
200{ 205{
201 int i, err = 0; 206 int i, err = 0;
@@ -212,6 +217,7 @@ static int __init msr_init(void)
212 err = PTR_ERR(msr_class); 217 err = PTR_ERR(msr_class);
213 goto out_chrdev; 218 goto out_chrdev;
214 } 219 }
220 msr_class->nodename = msr_nodename;
215 for_each_online_cpu(i) { 221 for_each_online_cpu(i) {
216 err = msr_device_create(i); 222 err = msr_device_create(i);
217 if (err != 0) 223 if (err != 0)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 9faf43bea336..70ec9b951d76 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -248,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
248 248
249static inline void enter_lazy(enum paravirt_lazy_mode mode) 249static inline void enter_lazy(enum paravirt_lazy_mode mode)
250{ 250{
251 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); 251 BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
252 BUG_ON(preemptible());
253 252
254 __get_cpu_var(paravirt_lazy_mode) = mode; 253 percpu_write(paravirt_lazy_mode, mode);
255} 254}
256 255
257void paravirt_leave_lazy(enum paravirt_lazy_mode mode) 256static void leave_lazy(enum paravirt_lazy_mode mode)
258{ 257{
259 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); 258 BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
260 BUG_ON(preemptible());
261 259
262 __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; 260 percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
263} 261}
264 262
265void paravirt_enter_lazy_mmu(void) 263void paravirt_enter_lazy_mmu(void)
@@ -269,22 +267,36 @@ void paravirt_enter_lazy_mmu(void)
269 267
270void paravirt_leave_lazy_mmu(void) 268void paravirt_leave_lazy_mmu(void)
271{ 269{
272 paravirt_leave_lazy(PARAVIRT_LAZY_MMU); 270 leave_lazy(PARAVIRT_LAZY_MMU);
273} 271}
274 272
275void paravirt_enter_lazy_cpu(void) 273void paravirt_start_context_switch(struct task_struct *prev)
276{ 274{
275 BUG_ON(preemptible());
276
277 if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
278 arch_leave_lazy_mmu_mode();
279 set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
280 }
277 enter_lazy(PARAVIRT_LAZY_CPU); 281 enter_lazy(PARAVIRT_LAZY_CPU);
278} 282}
279 283
280void paravirt_leave_lazy_cpu(void) 284void paravirt_end_context_switch(struct task_struct *next)
281{ 285{
282 paravirt_leave_lazy(PARAVIRT_LAZY_CPU); 286 BUG_ON(preemptible());
287
288 leave_lazy(PARAVIRT_LAZY_CPU);
289
290 if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
291 arch_enter_lazy_mmu_mode();
283} 292}
284 293
285enum paravirt_lazy_mode paravirt_get_lazy_mode(void) 294enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
286{ 295{
287 return __get_cpu_var(paravirt_lazy_mode); 296 if (in_interrupt())
297 return PARAVIRT_LAZY_NONE;
298
299 return percpu_read(paravirt_lazy_mode);
288} 300}
289 301
290void arch_flush_lazy_mmu_mode(void) 302void arch_flush_lazy_mmu_mode(void)
@@ -292,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void)
292 preempt_disable(); 304 preempt_disable();
293 305
294 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 306 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
295 WARN_ON(preempt_count() == 1);
296 arch_leave_lazy_mmu_mode(); 307 arch_leave_lazy_mmu_mode();
297 arch_enter_lazy_mmu_mode(); 308 arch_enter_lazy_mmu_mode();
298 } 309 }
@@ -300,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void)
300 preempt_enable(); 311 preempt_enable();
301} 312}
302 313
303void arch_flush_lazy_cpu_mode(void)
304{
305 preempt_disable();
306
307 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
308 WARN_ON(preempt_count() == 1);
309 arch_leave_lazy_cpu_mode();
310 arch_enter_lazy_cpu_mode();
311 }
312
313 preempt_enable();
314}
315
316struct pv_info pv_info = { 314struct pv_info pv_info = {
317 .name = "bare hardware", 315 .name = "bare hardware",
318 .paravirt_enabled = 0, 316 .paravirt_enabled = 0,
@@ -404,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = {
404 .set_iopl_mask = native_set_iopl_mask, 402 .set_iopl_mask = native_set_iopl_mask,
405 .io_delay = native_io_delay, 403 .io_delay = native_io_delay,
406 404
407 .lazy_mode = { 405 .start_context_switch = paravirt_nop,
408 .enter = paravirt_nop, 406 .end_context_switch = paravirt_nop,
409 .leave = paravirt_nop,
410 },
411}; 407};
412 408
413struct pv_apic_ops pv_apic_ops = { 409struct pv_apic_ops pv_apic_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 755c21e906f3..971a3bec47a8 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {
186 186
187static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; 187static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
188 188
189/* enable this to stress test the chip's TCE cache */
190#ifdef CONFIG_IOMMU_DEBUG
191static int debugging = 1;
192
193static inline unsigned long verify_bit_range(unsigned long* bitmap,
194 int expected, unsigned long start, unsigned long end)
195{
196 unsigned long idx = start;
197
198 BUG_ON(start >= end);
199
200 while (idx < end) {
201 if (!!test_bit(idx, bitmap) != expected)
202 return idx;
203 ++idx;
204 }
205
206 /* all bits have the expected value */
207 return ~0UL;
208}
209#else /* debugging is disabled */
210static int debugging;
211
212static inline unsigned long verify_bit_range(unsigned long* bitmap,
213 int expected, unsigned long start, unsigned long end)
214{
215 return ~0UL;
216}
217
218#endif /* CONFIG_IOMMU_DEBUG */
219
220static inline int translation_enabled(struct iommu_table *tbl) 189static inline int translation_enabled(struct iommu_table *tbl)
221{ 190{
222 /* only PHBs with translation enabled have an IOMMU table */ 191 /* only PHBs with translation enabled have an IOMMU table */
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
228{ 197{
229 unsigned long index; 198 unsigned long index;
230 unsigned long end; 199 unsigned long end;
231 unsigned long badbit;
232 unsigned long flags; 200 unsigned long flags;
233 201
234 index = start_addr >> PAGE_SHIFT; 202 index = start_addr >> PAGE_SHIFT;
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
243 211
244 spin_lock_irqsave(&tbl->it_lock, flags); 212 spin_lock_irqsave(&tbl->it_lock, flags);
245 213
246 badbit = verify_bit_range(tbl->it_map, 0, index, end);
247 if (badbit != ~0UL) {
248 if (printk_ratelimit())
249 printk(KERN_ERR "Calgary: entry already allocated at "
250 "0x%lx tbl %p dma 0x%lx npages %u\n",
251 badbit, tbl, start_addr, npages);
252 }
253
254 iommu_area_reserve(tbl->it_map, index, npages); 214 iommu_area_reserve(tbl->it_map, index, npages);
255 215
256 spin_unlock_irqrestore(&tbl->it_lock, flags); 216 spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
326 unsigned int npages) 286 unsigned int npages)
327{ 287{
328 unsigned long entry; 288 unsigned long entry;
329 unsigned long badbit;
330 unsigned long badend; 289 unsigned long badend;
331 unsigned long flags; 290 unsigned long flags;
332 291
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
346 305
347 spin_lock_irqsave(&tbl->it_lock, flags); 306 spin_lock_irqsave(&tbl->it_lock, flags);
348 307
349 badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
350 if (badbit != ~0UL) {
351 if (printk_ratelimit())
352 printk(KERN_ERR "Calgary: bit is off at 0x%lx "
353 "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
354 badbit, tbl, dma_addr, entry, npages);
355 }
356
357 iommu_area_free(tbl->it_map, entry, npages); 308 iommu_area_free(tbl->it_map, entry, npages);
358 309
359 spin_unlock_irqrestore(&tbl->it_lock, flags); 310 spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void)
1488 iommu_detected = 1; 1439 iommu_detected = 1;
1489 calgary_detected = 1; 1440 calgary_detected = 1;
1490 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); 1441 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
1491 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1442 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
1492 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1443 specified_table_size);
1493 debugging ? "enabled" : "disabled");
1494 1444
1495 /* swiotlb for devices that aren't behind the Calgary. */ 1445 /* swiotlb for devices that aren't behind the Calgary. */
1496 if (max_pfn > MAX_DMA32_PFN) 1446 if (max_pfn > MAX_DMA32_PFN)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 745579bc8256..47630479b067 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -32,6 +32,8 @@ int no_iommu __read_mostly;
32/* Set this to 1 if there is a HW IOMMU in the system */ 32/* Set this to 1 if there is a HW IOMMU in the system */
33int iommu_detected __read_mostly = 0; 33int iommu_detected __read_mostly = 0;
34 34
35int iommu_pass_through;
36
35dma_addr_t bad_dma_address __read_mostly = 0; 37dma_addr_t bad_dma_address __read_mostly = 0;
36EXPORT_SYMBOL(bad_dma_address); 38EXPORT_SYMBOL(bad_dma_address);
37 39
@@ -209,6 +211,10 @@ static __init int iommu_setup(char *p)
209#ifdef CONFIG_SWIOTLB 211#ifdef CONFIG_SWIOTLB
210 if (!strncmp(p, "soft", 4)) 212 if (!strncmp(p, "soft", 4))
211 swiotlb = 1; 213 swiotlb = 1;
214 if (!strncmp(p, "pt", 2)) {
215 iommu_pass_through = 1;
216 return 1;
217 }
212#endif 218#endif
213 219
214 gart_parse_options(p); 220 gart_parse_options(p);
@@ -290,6 +296,8 @@ static int __init pci_iommu_init(void)
290void pci_iommu_shutdown(void) 296void pci_iommu_shutdown(void)
291{ 297{
292 gart_iommu_shutdown(); 298 gart_iommu_shutdown();
299
300 amd_iommu_shutdown();
293} 301}
294/* Must execute after PCI subsystem */ 302/* Must execute after PCI subsystem */
295fs_initcall(pci_iommu_init); 303fs_initcall(pci_iommu_init);
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index b284b58c035c..cfd9f9063896 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -144,48 +144,21 @@ static void flush_gart(void)
144} 144}
145 145
146#ifdef CONFIG_IOMMU_LEAK 146#ifdef CONFIG_IOMMU_LEAK
147
148#define SET_LEAK(x) \
149 do { \
150 if (iommu_leak_tab) \
151 iommu_leak_tab[x] = __builtin_return_address(0);\
152 } while (0)
153
154#define CLEAR_LEAK(x) \
155 do { \
156 if (iommu_leak_tab) \
157 iommu_leak_tab[x] = NULL; \
158 } while (0)
159
160/* Debugging aid for drivers that don't free their IOMMU tables */ 147/* Debugging aid for drivers that don't free their IOMMU tables */
161static void **iommu_leak_tab;
162static int leak_trace; 148static int leak_trace;
163static int iommu_leak_pages = 20; 149static int iommu_leak_pages = 20;
164 150
165static void dump_leak(void) 151static void dump_leak(void)
166{ 152{
167 int i;
168 static int dump; 153 static int dump;
169 154
170 if (dump || !iommu_leak_tab) 155 if (dump)
171 return; 156 return;
172 dump = 1; 157 dump = 1;
173 show_stack(NULL, NULL);
174 158
175 /* Very crude. dump some from the end of the table too */ 159 show_stack(NULL, NULL);
176 printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", 160 debug_dma_dump_mappings(NULL);
177 iommu_leak_pages);
178 for (i = 0; i < iommu_leak_pages; i += 2) {
179 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
180 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
181 0);
182 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
183 }
184 printk(KERN_DEBUG "\n");
185} 161}
186#else
187# define SET_LEAK(x)
188# define CLEAR_LEAK(x)
189#endif 162#endif
190 163
191static void iommu_full(struct device *dev, size_t size, int dir) 164static void iommu_full(struct device *dev, size_t size, int dir)
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
248 221
249 for (i = 0; i < npages; i++) { 222 for (i = 0; i < npages; i++) {
250 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); 223 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
251 SET_LEAK(iommu_page + i);
252 phys_mem += PAGE_SIZE; 224 phys_mem += PAGE_SIZE;
253 } 225 }
254 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 226 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
294 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); 266 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
295 for (i = 0; i < npages; i++) { 267 for (i = 0; i < npages; i++) {
296 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 268 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
297 CLEAR_LEAK(iommu_page + i);
298 } 269 }
299 free_iommu(iommu_page, npages); 270 free_iommu(iommu_page, npages);
300} 271}
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
377 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); 348 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
378 while (pages--) { 349 while (pages--) {
379 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 350 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
380 SET_LEAK(iommu_page);
381 addr += PAGE_SIZE; 351 addr += PAGE_SIZE;
382 iommu_page++; 352 iommu_page++;
383 } 353 }
@@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
688 658
689 agp_gatt_table = gatt; 659 agp_gatt_table = gatt;
690 660
691 enable_gart_translations();
692
693 error = sysdev_class_register(&gart_sysdev_class); 661 error = sysdev_class_register(&gart_sysdev_class);
694 if (!error) 662 if (!error)
695 error = sysdev_register(&device_gart); 663 error = sysdev_register(&device_gart);
@@ -801,11 +769,12 @@ void __init gart_iommu_init(void)
801 769
802#ifdef CONFIG_IOMMU_LEAK 770#ifdef CONFIG_IOMMU_LEAK
803 if (leak_trace) { 771 if (leak_trace) {
804 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 772 int ret;
805 get_order(iommu_pages*sizeof(void *))); 773
806 if (!iommu_leak_tab) 774 ret = dma_debug_resize_entries(iommu_pages);
775 if (ret)
807 printk(KERN_DEBUG 776 printk(KERN_DEBUG
808 "PCI-DMA: Cannot allocate leak trace area\n"); 777 "PCI-DMA: Cannot trace all the entries\n");
809 } 778 }
810#endif 779#endif
811 780
@@ -845,6 +814,14 @@ void __init gart_iommu_init(void)
845 * the pages as Not-Present: 814 * the pages as Not-Present:
846 */ 815 */
847 wbinvd(); 816 wbinvd();
817
818 /*
819 * Now all caches are flushed and we can safely enable
820 * GART hardware. Doing it early leaves the possibility
821 * of stale cache entries that can lead to GART PTE
822 * errors.
823 */
824 enable_gart_translations();
848 825
849 /* 826 /*
850 * Try to workaround a bug (thanks to BenH): 827 * Try to workaround a bug (thanks to BenH):
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 221a3853e268..6af96ee44200 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
28 return paddr; 28 return paddr;
29} 29}
30 30
31phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) 31phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
32{ 32{
33 return baddr; 33 return baddr;
34} 34}
@@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void)
71{ 71{
72 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 72 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
73#ifdef CONFIG_X86_64 73#ifdef CONFIG_X86_64
74 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) 74 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
75 iommu_pass_through)
75 swiotlb = 1; 76 swiotlb = 1;
76#endif 77#endif
77 if (swiotlb_force) 78 if (swiotlb_force)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ca989158e847..994dd6a4a2a0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,12 +8,15 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h>
11#include <trace/power.h> 12#include <trace/power.h>
12#include <asm/system.h> 13#include <asm/system.h>
13#include <asm/apic.h> 14#include <asm/apic.h>
15#include <asm/syscalls.h>
14#include <asm/idle.h> 16#include <asm/idle.h>
15#include <asm/uaccess.h> 17#include <asm/uaccess.h>
16#include <asm/i387.h> 18#include <asm/i387.h>
19#include <asm/ds.h>
17 20
18unsigned long idle_halt; 21unsigned long idle_halt;
19EXPORT_SYMBOL(idle_halt); 22EXPORT_SYMBOL(idle_halt);
@@ -45,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk)
45 kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); 48 kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
46 tsk->thread.xstate = NULL; 49 tsk->thread.xstate = NULL;
47 } 50 }
51
52 WARN(tsk->thread.ds_ctx, "leaking DS context\n");
48} 53}
49 54
50void free_thread_info(struct thread_info *ti) 55void free_thread_info(struct thread_info *ti)
@@ -58,7 +63,7 @@ void arch_task_cache_init(void)
58 task_xstate_cachep = 63 task_xstate_cachep =
59 kmem_cache_create("task_xstate", xstate_size, 64 kmem_cache_create("task_xstate", xstate_size,
60 __alignof__(union thread_xstate), 65 __alignof__(union thread_xstate),
61 SLAB_PANIC, NULL); 66 SLAB_PANIC | SLAB_NOTRACK, NULL);
62} 67}
63 68
64/* 69/*
@@ -83,8 +88,6 @@ void exit_thread(void)
83 put_cpu(); 88 put_cpu();
84 kfree(bp); 89 kfree(bp);
85 } 90 }
86
87 ds_exit_thread(current);
88} 91}
89 92
90void flush_thread(void) 93void flush_thread(void)
@@ -613,3 +616,16 @@ static int __init idle_setup(char *str)
613} 616}
614early_param("idle", idle_setup); 617early_param("idle", idle_setup);
615 618
619unsigned long arch_align_stack(unsigned long sp)
620{
621 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
622 sp -= get_random_int() % 8192;
623 return sp & ~0xf;
624}
625
626unsigned long arch_randomize_brk(struct mm_struct *mm)
627{
628 unsigned long range_end = mm->brk + 0x02000000;
629 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
630}
631
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84043a2..59f4524984af 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,8 +9,6 @@
9 * This file handles the architecture-dependent parts of process handling.. 9 * This file handles the architecture-dependent parts of process handling..
10 */ 10 */
11 11
12#include <stdarg.h>
13
14#include <linux/stackprotector.h> 12#include <linux/stackprotector.h>
15#include <linux/cpu.h> 13#include <linux/cpu.h>
16#include <linux/errno.h> 14#include <linux/errno.h>
@@ -33,7 +31,6 @@
33#include <linux/module.h> 31#include <linux/module.h>
34#include <linux/kallsyms.h> 32#include <linux/kallsyms.h>
35#include <linux/ptrace.h> 33#include <linux/ptrace.h>
36#include <linux/random.h>
37#include <linux/personality.h> 34#include <linux/personality.h>
38#include <linux/tick.h> 35#include <linux/tick.h>
39#include <linux/percpu.h> 36#include <linux/percpu.h>
@@ -290,7 +287,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
290 p->thread.io_bitmap_max = 0; 287 p->thread.io_bitmap_max = 0;
291 } 288 }
292 289
293 ds_copy_thread(p, current); 290 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
291 p->thread.ds_ctx = NULL;
294 292
295 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); 293 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
296 p->thread.debugctlmsr = 0; 294 p->thread.debugctlmsr = 0;
@@ -407,7 +405,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
407 * done before math_state_restore, so the TS bit is up 405 * done before math_state_restore, so the TS bit is up
408 * to date. 406 * to date.
409 */ 407 */
410 arch_leave_lazy_cpu_mode(); 408 arch_end_context_switch(next_p);
411 409
412 /* If the task has used fpu the last 5 timeslices, just do a full 410 /* If the task has used fpu the last 5 timeslices, just do a full
413 * restore of the math state immediately to avoid the trap; the 411 * restore of the math state immediately to avoid the trap; the
@@ -497,15 +495,3 @@ unsigned long get_wchan(struct task_struct *p)
497 return 0; 495 return 0;
498} 496}
499 497
500unsigned long arch_align_stack(unsigned long sp)
501{
502 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
503 sp -= get_random_int() % 8192;
504 return sp & ~0xf;
505}
506
507unsigned long arch_randomize_brk(struct mm_struct *mm)
508{
509 unsigned long range_end = mm->brk + 0x02000000;
510 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
511}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b751a41392b1..ebefb5407b9d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,8 +14,6 @@
14 * This file handles the architecture-dependent parts of process handling.. 14 * This file handles the architecture-dependent parts of process handling..
15 */ 15 */
16 16
17#include <stdarg.h>
18
19#include <linux/stackprotector.h> 17#include <linux/stackprotector.h>
20#include <linux/cpu.h> 18#include <linux/cpu.h>
21#include <linux/errno.h> 19#include <linux/errno.h>
@@ -32,7 +30,6 @@
32#include <linux/delay.h> 30#include <linux/delay.h>
33#include <linux/module.h> 31#include <linux/module.h>
34#include <linux/ptrace.h> 32#include <linux/ptrace.h>
35#include <linux/random.h>
36#include <linux/notifier.h> 33#include <linux/notifier.h>
37#include <linux/kprobes.h> 34#include <linux/kprobes.h>
38#include <linux/kdebug.h> 35#include <linux/kdebug.h>
@@ -335,7 +332,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
335 goto out; 332 goto out;
336 } 333 }
337 334
338 ds_copy_thread(p, me); 335 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
336 p->thread.ds_ctx = NULL;
339 337
340 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); 338 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
341 p->thread.debugctlmsr = 0; 339 p->thread.debugctlmsr = 0;
@@ -428,7 +426,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
428 * done before math_state_restore, so the TS bit is up 426 * done before math_state_restore, so the TS bit is up
429 * to date. 427 * to date.
430 */ 428 */
431 arch_leave_lazy_cpu_mode(); 429 arch_end_context_switch(next_p);
432 430
433 /* 431 /*
434 * Switch FS and GS. 432 * Switch FS and GS.
@@ -660,15 +658,3 @@ long sys_arch_prctl(int code, unsigned long addr)
660 return do_arch_prctl(current, code, addr); 658 return do_arch_prctl(current, code, addr);
661} 659}
662 660
663unsigned long arch_align_stack(unsigned long sp)
664{
665 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
666 sp -= get_random_int() % 8192;
667 return sp & ~0xf;
668}
669
670unsigned long arch_randomize_brk(struct mm_struct *mm)
671{
672 unsigned long range_end = mm->brk + 0x02000000;
673 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
674}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 23b7c8f017e2..09ecbde91c13 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/seccomp.h> 22#include <linux/seccomp.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/workqueue.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
@@ -578,17 +579,130 @@ static int ioperm_get(struct task_struct *target,
578} 579}
579 580
580#ifdef CONFIG_X86_PTRACE_BTS 581#ifdef CONFIG_X86_PTRACE_BTS
582/*
583 * A branch trace store context.
584 *
585 * Contexts may only be installed by ptrace_bts_config() and only for
586 * ptraced tasks.
587 *
588 * Contexts are destroyed when the tracee is detached from the tracer.
589 * The actual destruction work requires interrupts enabled, so the
590 * work is deferred and will be scheduled during __ptrace_unlink().
591 *
592 * Contexts hold an additional task_struct reference on the traced
593 * task, as well as a reference on the tracer's mm.
594 *
595 * Ptrace already holds a task_struct for the duration of ptrace operations,
596 * but since destruction is deferred, it may be executed after both
597 * tracer and tracee exited.
598 */
599struct bts_context {
600 /* The branch trace handle. */
601 struct bts_tracer *tracer;
602
603 /* The buffer used to store the branch trace and its size. */
604 void *buffer;
605 unsigned int size;
606
607 /* The mm that paid for the above buffer. */
608 struct mm_struct *mm;
609
610 /* The task this context belongs to. */
611 struct task_struct *task;
612
613 /* The signal to send on a bts buffer overflow. */
614 unsigned int bts_ovfl_signal;
615
616 /* The work struct to destroy a context. */
617 struct work_struct work;
618};
619
620static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
621{
622 void *buffer = NULL;
623 int err = -ENOMEM;
624
625 err = account_locked_memory(current->mm, current->signal->rlim, size);
626 if (err < 0)
627 return err;
628
629 buffer = kzalloc(size, GFP_KERNEL);
630 if (!buffer)
631 goto out_refund;
632
633 context->buffer = buffer;
634 context->size = size;
635 context->mm = get_task_mm(current);
636
637 return 0;
638
639 out_refund:
640 refund_locked_memory(current->mm, size);
641 return err;
642}
643
644static inline void free_bts_buffer(struct bts_context *context)
645{
646 if (!context->buffer)
647 return;
648
649 kfree(context->buffer);
650 context->buffer = NULL;
651
652 refund_locked_memory(context->mm, context->size);
653 context->size = 0;
654
655 mmput(context->mm);
656 context->mm = NULL;
657}
658
659static void free_bts_context_work(struct work_struct *w)
660{
661 struct bts_context *context;
662
663 context = container_of(w, struct bts_context, work);
664
665 ds_release_bts(context->tracer);
666 put_task_struct(context->task);
667 free_bts_buffer(context);
668 kfree(context);
669}
670
671static inline void free_bts_context(struct bts_context *context)
672{
673 INIT_WORK(&context->work, free_bts_context_work);
674 schedule_work(&context->work);
675}
676
677static inline struct bts_context *alloc_bts_context(struct task_struct *task)
678{
679 struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
680 if (context) {
681 context->task = task;
682 task->bts = context;
683
684 get_task_struct(task);
685 }
686
687 return context;
688}
689
581static int ptrace_bts_read_record(struct task_struct *child, size_t index, 690static int ptrace_bts_read_record(struct task_struct *child, size_t index,
582 struct bts_struct __user *out) 691 struct bts_struct __user *out)
583{ 692{
693 struct bts_context *context;
584 const struct bts_trace *trace; 694 const struct bts_trace *trace;
585 struct bts_struct bts; 695 struct bts_struct bts;
586 const unsigned char *at; 696 const unsigned char *at;
587 int error; 697 int error;
588 698
589 trace = ds_read_bts(child->bts); 699 context = child->bts;
700 if (!context)
701 return -ESRCH;
702
703 trace = ds_read_bts(context->tracer);
590 if (!trace) 704 if (!trace)
591 return -EPERM; 705 return -ESRCH;
592 706
593 at = trace->ds.top - ((index + 1) * trace->ds.size); 707 at = trace->ds.top - ((index + 1) * trace->ds.size);
594 if ((void *)at < trace->ds.begin) 708 if ((void *)at < trace->ds.begin)
@@ -597,7 +711,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
597 if (!trace->read) 711 if (!trace->read)
598 return -EOPNOTSUPP; 712 return -EOPNOTSUPP;
599 713
600 error = trace->read(child->bts, at, &bts); 714 error = trace->read(context->tracer, at, &bts);
601 if (error < 0) 715 if (error < 0)
602 return error; 716 return error;
603 717
@@ -611,13 +725,18 @@ static int ptrace_bts_drain(struct task_struct *child,
611 long size, 725 long size,
612 struct bts_struct __user *out) 726 struct bts_struct __user *out)
613{ 727{
728 struct bts_context *context;
614 const struct bts_trace *trace; 729 const struct bts_trace *trace;
615 const unsigned char *at; 730 const unsigned char *at;
616 int error, drained = 0; 731 int error, drained = 0;
617 732
618 trace = ds_read_bts(child->bts); 733 context = child->bts;
734 if (!context)
735 return -ESRCH;
736
737 trace = ds_read_bts(context->tracer);
619 if (!trace) 738 if (!trace)
620 return -EPERM; 739 return -ESRCH;
621 740
622 if (!trace->read) 741 if (!trace->read)
623 return -EOPNOTSUPP; 742 return -EOPNOTSUPP;
@@ -628,9 +747,8 @@ static int ptrace_bts_drain(struct task_struct *child,
628 for (at = trace->ds.begin; (void *)at < trace->ds.top; 747 for (at = trace->ds.begin; (void *)at < trace->ds.top;
629 out++, drained++, at += trace->ds.size) { 748 out++, drained++, at += trace->ds.size) {
630 struct bts_struct bts; 749 struct bts_struct bts;
631 int error;
632 750
633 error = trace->read(child->bts, at, &bts); 751 error = trace->read(context->tracer, at, &bts);
634 if (error < 0) 752 if (error < 0)
635 return error; 753 return error;
636 754
@@ -640,35 +758,18 @@ static int ptrace_bts_drain(struct task_struct *child,
640 758
641 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); 759 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
642 760
643 error = ds_reset_bts(child->bts); 761 error = ds_reset_bts(context->tracer);
644 if (error < 0) 762 if (error < 0)
645 return error; 763 return error;
646 764
647 return drained; 765 return drained;
648} 766}
649 767
650static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
651{
652 child->bts_buffer = alloc_locked_buffer(size);
653 if (!child->bts_buffer)
654 return -ENOMEM;
655
656 child->bts_size = size;
657
658 return 0;
659}
660
661static void ptrace_bts_free_buffer(struct task_struct *child)
662{
663 free_locked_buffer(child->bts_buffer, child->bts_size);
664 child->bts_buffer = NULL;
665 child->bts_size = 0;
666}
667
668static int ptrace_bts_config(struct task_struct *child, 768static int ptrace_bts_config(struct task_struct *child,
669 long cfg_size, 769 long cfg_size,
670 const struct ptrace_bts_config __user *ucfg) 770 const struct ptrace_bts_config __user *ucfg)
671{ 771{
772 struct bts_context *context;
672 struct ptrace_bts_config cfg; 773 struct ptrace_bts_config cfg;
673 unsigned int flags = 0; 774 unsigned int flags = 0;
674 775
@@ -678,28 +779,33 @@ static int ptrace_bts_config(struct task_struct *child,
678 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 779 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
679 return -EFAULT; 780 return -EFAULT;
680 781
681 if (child->bts) { 782 context = child->bts;
682 ds_release_bts(child->bts); 783 if (!context)
683 child->bts = NULL; 784 context = alloc_bts_context(child);
684 } 785 if (!context)
786 return -ENOMEM;
685 787
686 if (cfg.flags & PTRACE_BTS_O_SIGNAL) { 788 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
687 if (!cfg.signal) 789 if (!cfg.signal)
688 return -EINVAL; 790 return -EINVAL;
689 791
690 child->thread.bts_ovfl_signal = cfg.signal;
691 return -EOPNOTSUPP; 792 return -EOPNOTSUPP;
793 context->bts_ovfl_signal = cfg.signal;
692 } 794 }
693 795
694 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && 796 ds_release_bts(context->tracer);
695 (cfg.size != child->bts_size)) { 797 context->tracer = NULL;
696 int error;
697 798
698 ptrace_bts_free_buffer(child); 799 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
800 int err;
699 801
700 error = ptrace_bts_allocate_buffer(child, cfg.size); 802 free_bts_buffer(context);
701 if (error < 0) 803 if (!cfg.size)
702 return error; 804 return 0;
805
806 err = alloc_bts_buffer(context, cfg.size);
807 if (err < 0)
808 return err;
703 } 809 }
704 810
705 if (cfg.flags & PTRACE_BTS_O_TRACE) 811 if (cfg.flags & PTRACE_BTS_O_TRACE)
@@ -708,15 +814,14 @@ static int ptrace_bts_config(struct task_struct *child,
708 if (cfg.flags & PTRACE_BTS_O_SCHED) 814 if (cfg.flags & PTRACE_BTS_O_SCHED)
709 flags |= BTS_TIMESTAMPS; 815 flags |= BTS_TIMESTAMPS;
710 816
711 child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, 817 context->tracer =
712 /* ovfl = */ NULL, /* th = */ (size_t)-1, 818 ds_request_bts_task(child, context->buffer, context->size,
713 flags); 819 NULL, (size_t)-1, flags);
714 if (IS_ERR(child->bts)) { 820 if (unlikely(IS_ERR(context->tracer))) {
715 int error = PTR_ERR(child->bts); 821 int error = PTR_ERR(context->tracer);
716
717 ptrace_bts_free_buffer(child);
718 child->bts = NULL;
719 822
823 free_bts_buffer(context);
824 context->tracer = NULL;
720 return error; 825 return error;
721 } 826 }
722 827
@@ -727,20 +832,25 @@ static int ptrace_bts_status(struct task_struct *child,
727 long cfg_size, 832 long cfg_size,
728 struct ptrace_bts_config __user *ucfg) 833 struct ptrace_bts_config __user *ucfg)
729{ 834{
835 struct bts_context *context;
730 const struct bts_trace *trace; 836 const struct bts_trace *trace;
731 struct ptrace_bts_config cfg; 837 struct ptrace_bts_config cfg;
732 838
839 context = child->bts;
840 if (!context)
841 return -ESRCH;
842
733 if (cfg_size < sizeof(cfg)) 843 if (cfg_size < sizeof(cfg))
734 return -EIO; 844 return -EIO;
735 845
736 trace = ds_read_bts(child->bts); 846 trace = ds_read_bts(context->tracer);
737 if (!trace) 847 if (!trace)
738 return -EPERM; 848 return -ESRCH;
739 849
740 memset(&cfg, 0, sizeof(cfg)); 850 memset(&cfg, 0, sizeof(cfg));
741 cfg.size = trace->ds.end - trace->ds.begin; 851 cfg.size = trace->ds.end - trace->ds.begin;
742 cfg.signal = child->thread.bts_ovfl_signal; 852 cfg.signal = context->bts_ovfl_signal;
743 cfg.bts_size = sizeof(struct bts_struct); 853 cfg.bts_size = sizeof(struct bts_struct);
744 854
745 if (cfg.signal) 855 if (cfg.signal)
746 cfg.flags |= PTRACE_BTS_O_SIGNAL; 856 cfg.flags |= PTRACE_BTS_O_SIGNAL;
@@ -759,80 +869,51 @@ static int ptrace_bts_status(struct task_struct *child,
759 869
760static int ptrace_bts_clear(struct task_struct *child) 870static int ptrace_bts_clear(struct task_struct *child)
761{ 871{
872 struct bts_context *context;
762 const struct bts_trace *trace; 873 const struct bts_trace *trace;
763 874
764 trace = ds_read_bts(child->bts); 875 context = child->bts;
876 if (!context)
877 return -ESRCH;
878
879 trace = ds_read_bts(context->tracer);
765 if (!trace) 880 if (!trace)
766 return -EPERM; 881 return -ESRCH;
767 882
768 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); 883 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
769 884
770 return ds_reset_bts(child->bts); 885 return ds_reset_bts(context->tracer);
771} 886}
772 887
773static int ptrace_bts_size(struct task_struct *child) 888static int ptrace_bts_size(struct task_struct *child)
774{ 889{
890 struct bts_context *context;
775 const struct bts_trace *trace; 891 const struct bts_trace *trace;
776 892
777 trace = ds_read_bts(child->bts); 893 context = child->bts;
894 if (!context)
895 return -ESRCH;
896
897 trace = ds_read_bts(context->tracer);
778 if (!trace) 898 if (!trace)
779 return -EPERM; 899 return -ESRCH;
780 900
781 return (trace->ds.top - trace->ds.begin) / trace->ds.size; 901 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
782} 902}
783 903
784static void ptrace_bts_fork(struct task_struct *tsk) 904/*
785{ 905 * Called from __ptrace_unlink() after the child has been moved back
786 tsk->bts = NULL; 906 * to its original parent.
787 tsk->bts_buffer = NULL; 907 */
788 tsk->bts_size = 0; 908void ptrace_bts_untrace(struct task_struct *child)
789 tsk->thread.bts_ovfl_signal = 0;
790}
791
792static void ptrace_bts_untrace(struct task_struct *child)
793{ 909{
794 if (unlikely(child->bts)) { 910 if (unlikely(child->bts)) {
795 ds_release_bts(child->bts); 911 free_bts_context(child->bts);
796 child->bts = NULL; 912 child->bts = NULL;
797
798 /* We cannot update total_vm and locked_vm since
799 child's mm is already gone. But we can reclaim the
800 memory. */
801 kfree(child->bts_buffer);
802 child->bts_buffer = NULL;
803 child->bts_size = 0;
804 } 913 }
805} 914}
806
807static void ptrace_bts_detach(struct task_struct *child)
808{
809 /*
810 * Ptrace_detach() races with ptrace_untrace() in case
811 * the child dies and is reaped by another thread.
812 *
813 * We only do the memory accounting at this point and
814 * leave the buffer deallocation and the bts tracer
815 * release to ptrace_bts_untrace() which will be called
816 * later on with tasklist_lock held.
817 */
818 release_locked_buffer(child->bts_buffer, child->bts_size);
819}
820#else
821static inline void ptrace_bts_fork(struct task_struct *tsk) {}
822static inline void ptrace_bts_detach(struct task_struct *child) {}
823static inline void ptrace_bts_untrace(struct task_struct *child) {}
824#endif /* CONFIG_X86_PTRACE_BTS */ 915#endif /* CONFIG_X86_PTRACE_BTS */
825 916
826void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
827{
828 ptrace_bts_fork(child);
829}
830
831void x86_ptrace_untrace(struct task_struct *child)
832{
833 ptrace_bts_untrace(child);
834}
835
836/* 917/*
837 * Called by kernel/ptrace.c when detaching.. 918 * Called by kernel/ptrace.c when detaching..
838 * 919 *
@@ -844,7 +925,6 @@ void ptrace_disable(struct task_struct *child)
844#ifdef TIF_SYSCALL_EMU 925#ifdef TIF_SYSCALL_EMU
845 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 926 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
846#endif 927#endif
847 ptrace_bts_detach(child);
848} 928}
849 929
850#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 930#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 7563b31b4f03..af71d06624bf 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,5 +491,42 @@ void force_hpet_resume(void)
491 break; 491 break;
492 } 492 }
493} 493}
494#endif
495
496#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
497/* Set correct numa_node information for AMD NB functions */
498static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{
500 struct pci_dev *nb_ht;
501 unsigned int devfn;
502 u32 val;
503
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
505 nb_ht = pci_get_slot(dev->bus, devfn);
506 if (!nb_ht)
507 return;
508
509 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7);
511 pci_dev_put(dev);
512}
494 513
514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
515 quirk_amd_nb_node);
516DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
517 quirk_amd_nb_node);
518DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
519 quirk_amd_nb_node);
520DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
521 quirk_amd_nb_node);
522DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
523 quirk_amd_nb_node);
524DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
525 quirk_amd_nb_node);
526DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
527 quirk_amd_nb_node);
528DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
529 quirk_amd_nb_node);
530DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
531 quirk_amd_nb_node);
495#endif 532#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 667188e0b5a0..d2d1ce8170f0 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
192 DMI_MATCH(DMI_BOARD_NAME, "0KP561"), 192 DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
193 }, 193 },
194 }, 194 },
195 { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
196 .callback = set_bios_reboot,
197 .ident = "Dell OptiPlex 360",
198 .matches = {
199 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
200 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
201 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
202 },
203 },
195 { /* Handle problems with rebooting on Dell 2400's */ 204 { /* Handle problems with rebooting on Dell 2400's */
196 .callback = set_bios_reboot, 205 .callback = set_bios_reboot,
197 .ident = "Dell PowerEdge 2400", 206 .ident = "Dell PowerEdge 2400",
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4158439bf63..be5ae80f897f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,14 @@
112#define ARCH_SETUP 112#define ARCH_SETUP
113#endif 113#endif
114 114
115/*
116 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
117 * The direct mapping extends to max_pfn_mapped, so that we can directly access
118 * apertures, ACPI and other tables without having to play with fixmaps.
119 */
120unsigned long max_low_pfn_mapped;
121unsigned long max_pfn_mapped;
122
115RESERVE_BRK(dmi_alloc, 65536); 123RESERVE_BRK(dmi_alloc, 65536);
116 124
117unsigned int boot_cpu_id __read_mostly; 125unsigned int boot_cpu_id __read_mostly;
@@ -214,8 +222,8 @@ unsigned long mmu_cr4_features;
214unsigned long mmu_cr4_features = X86_CR4_PAE; 222unsigned long mmu_cr4_features = X86_CR4_PAE;
215#endif 223#endif
216 224
217/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 225/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
218int bootloader_type; 226int bootloader_type, bootloader_version;
219 227
220/* 228/*
221 * Setup options 229 * Setup options
@@ -293,15 +301,13 @@ static void __init reserve_brk(void)
293 301
294#ifdef CONFIG_BLK_DEV_INITRD 302#ifdef CONFIG_BLK_DEV_INITRD
295 303
296#ifdef CONFIG_X86_32
297
298#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 304#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
299static void __init relocate_initrd(void) 305static void __init relocate_initrd(void)
300{ 306{
301 307
302 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 308 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
303 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 309 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
304 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; 310 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
305 u64 ramdisk_here; 311 u64 ramdisk_here;
306 unsigned long slop, clen, mapaddr; 312 unsigned long slop, clen, mapaddr;
307 char *p, *q; 313 char *p, *q;
@@ -357,14 +363,13 @@ static void __init relocate_initrd(void)
357 ramdisk_image, ramdisk_image + ramdisk_size - 1, 363 ramdisk_image, ramdisk_image + ramdisk_size - 1,
358 ramdisk_here, ramdisk_here + ramdisk_size - 1); 364 ramdisk_here, ramdisk_here + ramdisk_size - 1);
359} 365}
360#endif
361 366
362static void __init reserve_initrd(void) 367static void __init reserve_initrd(void)
363{ 368{
364 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 369 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
365 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 370 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
366 u64 ramdisk_end = ramdisk_image + ramdisk_size; 371 u64 ramdisk_end = ramdisk_image + ramdisk_size;
367 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; 372 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
368 373
369 if (!boot_params.hdr.type_of_loader || 374 if (!boot_params.hdr.type_of_loader ||
370 !ramdisk_image || !ramdisk_size) 375 !ramdisk_image || !ramdisk_size)
@@ -394,14 +399,8 @@ static void __init reserve_initrd(void)
394 return; 399 return;
395 } 400 }
396 401
397#ifdef CONFIG_X86_32
398 relocate_initrd(); 402 relocate_initrd();
399#else 403
400 printk(KERN_ERR "initrd extends beyond end of memory "
401 "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
402 ramdisk_end, end_of_lowmem);
403 initrd_start = 0;
404#endif
405 free_early(ramdisk_image, ramdisk_end); 404 free_early(ramdisk_image, ramdisk_end);
406} 405}
407#else 406#else
@@ -706,6 +705,12 @@ void __init setup_arch(char **cmdline_p)
706#endif 705#endif
707 saved_video_mode = boot_params.hdr.vid_mode; 706 saved_video_mode = boot_params.hdr.vid_mode;
708 bootloader_type = boot_params.hdr.type_of_loader; 707 bootloader_type = boot_params.hdr.type_of_loader;
708 if ((bootloader_type >> 4) == 0xe) {
709 bootloader_type &= 0xf;
710 bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
711 }
712 bootloader_version = bootloader_type & 0xf;
713 bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
709 714
710#ifdef CONFIG_BLK_DEV_RAM 715#ifdef CONFIG_BLK_DEV_RAM
711 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; 716 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
@@ -854,12 +859,16 @@ void __init setup_arch(char **cmdline_p)
854 max_low_pfn = max_pfn; 859 max_low_pfn = max_pfn;
855 860
856 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 861 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
862 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
857#endif 863#endif
858 864
859#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION 865#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
860 setup_bios_corruption_check(); 866 setup_bios_corruption_check();
861#endif 867#endif
862 868
869 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
870 max_pfn_mapped<<PAGE_SHIFT);
871
863 reserve_brk(); 872 reserve_brk();
864 873
865 /* max_pfn_mapped is updated here */ 874 /* max_pfn_mapped is updated here */
@@ -997,24 +1006,6 @@ void __init setup_arch(char **cmdline_p)
997#ifdef CONFIG_X86_32 1006#ifdef CONFIG_X86_32
998 1007
999/** 1008/**
1000 * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
1001 *
1002 * Description:
1003 * Perform any necessary interrupt initialisation prior to setting up
1004 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
1005 * interrupts should be initialised here if the machine emulates a PC
1006 * in any way.
1007 **/
1008void __init x86_quirk_pre_intr_init(void)
1009{
1010 if (x86_quirks->arch_pre_intr_init) {
1011 if (x86_quirks->arch_pre_intr_init())
1012 return;
1013 }
1014 init_ISA_irqs();
1015}
1016
1017/**
1018 * x86_quirk_intr_init - post gate setup interrupt initialisation 1009 * x86_quirk_intr_init - post gate setup interrupt initialisation
1019 * 1010 *
1020 * Description: 1011 * Description:
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 8f0e13be36b3..9c3f0823e6aa 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -425,6 +425,14 @@ void __init setup_per_cpu_areas(void)
425 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 425 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
426#endif 426#endif
427 427
428#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
429 /*
430 * make sure boot cpu node_number is right, when boot cpu is on the
431 * node that doesn't have mem installed
432 */
433 per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
434#endif
435
428 /* Setup node to cpumask map */ 436 /* Setup node to cpumask map */
429 setup_node_to_cpumask_map(); 437 setup_node_to_cpumask_map();
430 438
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 14425166b8e3..4c578751e94e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#include <linux/sched.h> 9#include <linux/sched.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
@@ -25,11 +24,11 @@
25#include <asm/ucontext.h> 24#include <asm/ucontext.h>
26#include <asm/i387.h> 25#include <asm/i387.h>
27#include <asm/vdso.h> 26#include <asm/vdso.h>
27#include <asm/mce.h>
28 28
29#ifdef CONFIG_X86_64 29#ifdef CONFIG_X86_64
30#include <asm/proto.h> 30#include <asm/proto.h>
31#include <asm/ia32_unistd.h> 31#include <asm/ia32_unistd.h>
32#include <asm/mce.h>
33#endif /* CONFIG_X86_64 */ 32#endif /* CONFIG_X86_64 */
34 33
35#include <asm/syscall.h> 34#include <asm/syscall.h>
@@ -857,10 +856,10 @@ static void do_signal(struct pt_regs *regs)
857void 856void
858do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 857do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
859{ 858{
860#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) 859#ifdef CONFIG_X86_NEW_MCE
861 /* notify userspace of pending MCEs */ 860 /* notify userspace of pending MCEs */
862 if (thread_info_flags & _TIF_MCE_NOTIFY) 861 if (thread_info_flags & _TIF_MCE_NOTIFY)
863 mce_notify_user(); 862 mce_notify_process();
864#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ 863#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
865 864
866 /* deal with pending signal delivery */ 865 /* deal with pending signal delivery */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 13f33ea8ccaa..ec1de97600e7 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask)
150 * this function calls the 'stop' function on all other CPUs in the system. 150 * this function calls the 'stop' function on all other CPUs in the system.
151 */ 151 */
152 152
153asmlinkage void smp_reboot_interrupt(void)
154{
155 ack_APIC_irq();
156 irq_enter();
157 stop_this_cpu(NULL);
158 irq_exit();
159}
160
153static void native_smp_send_stop(void) 161static void native_smp_send_stop(void)
154{ 162{
155 unsigned long flags; 163 unsigned long flags;
164 unsigned long wait;
156 165
157 if (reboot_force) 166 if (reboot_force)
158 return; 167 return;
159 168
160 smp_call_function(stop_this_cpu, NULL, 0); 169 /*
170 * Use an own vector here because smp_call_function
171 * does lots of things not suitable in a panic situation.
172 * On most systems we could also use an NMI here,
173 * but there are a few systems around where NMI
174 * is problematic so stay with an non NMI for now
175 * (this implies we cannot stop CPUs spinning with irq off
176 * currently)
177 */
178 if (num_online_cpus() > 1) {
179 apic->send_IPI_allbutself(REBOOT_VECTOR);
180
181 /* Don't wait longer than a second */
182 wait = USEC_PER_SEC;
183 while (num_online_cpus() > 1 && wait--)
184 udelay(1);
185 }
186
161 local_irq_save(flags); 187 local_irq_save(flags);
162 disable_local_APIC(); 188 disable_local_APIC();
163 local_irq_restore(flags); 189 local_irq_restore(flags);
@@ -172,6 +198,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
172{ 198{
173 ack_APIC_irq(); 199 ack_APIC_irq();
174 inc_irq_stat(irq_resched_count); 200 inc_irq_stat(irq_resched_count);
201 /*
202 * KVM uses this interrupt to force a cpu out of guest mode
203 */
175} 204}
176 205
177void smp_call_function_interrupt(struct pt_regs *regs) 206void smp_call_function_interrupt(struct pt_regs *regs)
@@ -193,19 +222,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
193} 222}
194 223
195struct smp_ops smp_ops = { 224struct smp_ops smp_ops = {
196 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 225 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
197 .smp_prepare_cpus = native_smp_prepare_cpus, 226 .smp_prepare_cpus = native_smp_prepare_cpus,
198 .smp_cpus_done = native_smp_cpus_done, 227 .smp_cpus_done = native_smp_cpus_done,
199 228
200 .smp_send_stop = native_smp_send_stop, 229 .smp_send_stop = native_smp_send_stop,
201 .smp_send_reschedule = native_smp_send_reschedule, 230 .smp_send_reschedule = native_smp_send_reschedule,
202 231
203 .cpu_up = native_cpu_up, 232 .cpu_up = native_cpu_up,
204 .cpu_die = native_cpu_die, 233 .cpu_die = native_cpu_die,
205 .cpu_disable = native_cpu_disable, 234 .cpu_disable = native_cpu_disable,
206 .play_dead = native_play_dead, 235 .play_dead = native_play_dead,
207 236
208 .send_call_func_ipi = native_send_call_func_ipi, 237 .send_call_func_ipi = native_send_call_func_ipi,
209 .send_call_func_single_ipi = native_send_call_func_single_ipi, 238 .send_call_func_single_ipi = native_send_call_func_single_ipi,
210}; 239};
211EXPORT_SYMBOL_GPL(smp_ops); 240EXPORT_SYMBOL_GPL(smp_ops);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 58d24ef917d8..2fecda69ee64 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid)
504 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this 504 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
505 * won't ... remember to clear down the APIC, etc later. 505 * won't ... remember to clear down the APIC, etc later.
506 */ 506 */
507int __devinit 507int __cpuinit
508wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) 508wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
509{ 509{
510 unsigned long send_status, accept_status = 0; 510 unsigned long send_status, accept_status = 0;
@@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
538 return (send_status | accept_status); 538 return (send_status | accept_status);
539} 539}
540 540
541int __devinit 541static int __cpuinit
542wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) 542wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
543{ 543{
544 unsigned long send_status, accept_status = 0; 544 unsigned long send_status, accept_status = 0;
@@ -822,10 +822,12 @@ do_rest:
822 /* mark "stuck" area as not stuck */ 822 /* mark "stuck" area as not stuck */
823 *((volatile unsigned long *)trampoline_base) = 0; 823 *((volatile unsigned long *)trampoline_base) = 0;
824 824
825 /* 825 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
826 * Cleanup possible dangling ends... 826 /*
827 */ 827 * Cleanup possible dangling ends...
828 smpboot_restore_warm_reset_vector(); 828 */
829 smpboot_restore_warm_reset_vector();
830 }
829 831
830 return boot_error; 832 return boot_error;
831} 833}
@@ -871,7 +873,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
871 873
872 err = do_boot_cpu(apicid, cpu); 874 err = do_boot_cpu(apicid, cpu);
873 875
874 zap_low_mappings(); 876 zap_low_mappings(false);
875 low_mappings = 0; 877 low_mappings = 0;
876#else 878#else
877 err = do_boot_cpu(apicid, cpu); 879 err = do_boot_cpu(apicid, cpu);
@@ -990,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
990 */ 992 */
991 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && 993 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
992 !cpu_has_apic) { 994 !cpu_has_apic) {
993 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 995 if (!disable_apic) {
994 boot_cpu_physical_apicid); 996 pr_err("BIOS bug, local APIC #%d not detected!...\n",
995 printk(KERN_ERR "... forcing use of dummy APIC emulation." 997 boot_cpu_physical_apicid);
998 pr_err("... forcing use of dummy APIC emulation."
996 "(tell your hw vendor)\n"); 999 "(tell your hw vendor)\n");
1000 }
997 smpboot_clear_io_apic(); 1001 smpboot_clear_io_apic();
998 arch_disable_smp_support(); 1002 arch_disable_smp_support();
999 return -1; 1003 return -1;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index f7bddc2e37d1..c3eb207181fe 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
20 20
21static int save_stack_stack(void *data, char *name) 21static int save_stack_stack(void *data, char *name)
22{ 22{
23 return -1; 23 return 0;
24} 24}
25 25
26static void save_stack_address(void *data, unsigned long addr, int reliable) 26static void save_stack_address(void *data, unsigned long addr, int reliable)
@@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace)
77} 77}
78EXPORT_SYMBOL_GPL(save_stack_trace); 78EXPORT_SYMBOL_GPL(save_stack_trace);
79 79
80void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
81{
82 dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
83 if (trace->nr_entries < trace->max_entries)
84 trace->entries[trace->nr_entries++] = ULONG_MAX;
85}
86
80void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 87void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
81{ 88{
82 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); 89 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c8736b491..d51321ddafda 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,5 @@ ENTRY(sys_call_table)
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_preadv 335 .long sys_preadv
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_counter_open
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 8c7b03b0cfcb..124d40c575df 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode)
715 struct bau_desc *adp; 715 struct bau_desc *adp;
716 struct bau_desc *ad2; 716 struct bau_desc *ad2;
717 717
718 adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); 718 /*
719 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
720 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
721 */
722 adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
723 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
719 BUG_ON(!adp); 724 BUG_ON(!adp);
720 725
721 pa = uv_gpa(adp); /* need the real nasid*/ 726 pa = uv_gpa(adp); /* need the real nasid*/
@@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode)
729 (n << UV_DESC_BASE_PNODE_SHIFT | m)); 734 (n << UV_DESC_BASE_PNODE_SHIFT | m));
730 } 735 }
731 736
732 for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { 737 /*
738 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
739 * cpu even though we only use the first one; one descriptor can
740 * describe a broadcast to 256 nodes.
741 */
742 for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
743 i++, ad2++) {
733 memset(ad2, 0, sizeof(struct bau_desc)); 744 memset(ad2, 0, sizeof(struct bau_desc));
734 ad2->header.sw_ack_flag = 1; 745 ad2->header.sw_ack_flag = 1;
735 /* 746 /*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d288327ff0..a0f48f5671c0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -45,6 +45,7 @@
45#include <linux/edac.h> 45#include <linux/edac.h>
46#endif 46#endif
47 47
48#include <asm/kmemcheck.h>
48#include <asm/stacktrace.h> 49#include <asm/stacktrace.h>
49#include <asm/processor.h> 50#include <asm/processor.h>
50#include <asm/debugreg.h> 51#include <asm/debugreg.h>
@@ -53,6 +54,7 @@
53#include <asm/traps.h> 54#include <asm/traps.h>
54#include <asm/desc.h> 55#include <asm/desc.h>
55#include <asm/i387.h> 56#include <asm/i387.h>
57#include <asm/mce.h>
56 58
57#include <asm/mach_traps.h> 59#include <asm/mach_traps.h>
58 60
@@ -64,8 +66,6 @@
64#include <asm/setup.h> 66#include <asm/setup.h>
65#include <asm/traps.h> 67#include <asm/traps.h>
66 68
67#include "cpu/mcheck/mce.h"
68
69asmlinkage int system_call(void); 69asmlinkage int system_call(void);
70 70
71/* Do we ignore FPU interrupts ? */ 71/* Do we ignore FPU interrupts ? */
@@ -534,6 +534,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
534 534
535 get_debugreg(condition, 6); 535 get_debugreg(condition, 6);
536 536
537 /* Catch kmemcheck conditions first of all! */
538 if (condition & DR_STEP && kmemcheck_trap(regs))
539 return;
540
537 /* 541 /*
538 * The processor cleared BTF, so don't mark that we need it set. 542 * The processor cleared BTF, so don't mark that we need it set.
539 */ 543 */
@@ -798,15 +802,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
798 802
799 return new_kesp; 803 return new_kesp;
800} 804}
801#else 805#endif
806
802asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) 807asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
803{ 808{
804} 809}
805 810
806asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) 811asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
807{ 812{
808} 813}
809#endif
810 814
811/* 815/*
812 * 'math_state_restore()' saves the current math information in the 816 * 'math_state_restore()' saves the current math information in the
@@ -839,9 +843,6 @@ asmlinkage void math_state_restore(void)
839 } 843 }
840 844
841 clts(); /* Allow maths ops (or we recurse) */ 845 clts(); /* Allow maths ops (or we recurse) */
842#ifdef CONFIG_X86_32
843 restore_fpu(tsk);
844#else
845 /* 846 /*
846 * Paranoid restore. send a SIGSEGV if we fail to restore the state. 847 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
847 */ 848 */
@@ -850,7 +851,7 @@ asmlinkage void math_state_restore(void)
850 force_sig(SIGSEGV, tsk); 851 force_sig(SIGSEGV, tsk);
851 return; 852 return;
852 } 853 }
853#endif 854
854 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 855 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
855 tsk->fpu_counter++; 856 tsk->fpu_counter++;
856} 857}
@@ -945,8 +946,13 @@ void __init trap_init(void)
945#endif 946#endif
946 set_intr_gate(19, &simd_coprocessor_error); 947 set_intr_gate(19, &simd_coprocessor_error);
947 948
949 /* Reserve all the builtin and the syscall vector: */
950 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
951 set_bit(i, used_vectors);
952
948#ifdef CONFIG_IA32_EMULATION 953#ifdef CONFIG_IA32_EMULATION
949 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 954 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
955 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
950#endif 956#endif
951 957
952#ifdef CONFIG_X86_32 958#ifdef CONFIG_X86_32
@@ -963,17 +969,9 @@ void __init trap_init(void)
963 } 969 }
964 970
965 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 971 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
966#endif
967
968 /* Reserve all the builtin and the syscall vector: */
969 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
970 set_bit(i, used_vectors);
971
972#ifdef CONFIG_X86_64
973 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
974#else
975 set_bit(SYSCALL_VECTOR, used_vectors); 972 set_bit(SYSCALL_VECTOR, used_vectors);
976#endif 973#endif
974
977 /* 975 /*
978 * Should be a barrier for any external CPU state: 976 * Should be a barrier for any external CPU state:
979 */ 977 */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index d57de05dc430..6e1a368d21d4 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -9,6 +9,7 @@
9#include <linux/delay.h> 9#include <linux/delay.h>
10#include <linux/clocksource.h> 10#include <linux/clocksource.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/timex.h>
12 13
13#include <asm/hpet.h> 14#include <asm/hpet.h>
14#include <asm/timer.h> 15#include <asm/timer.h>
@@ -384,13 +385,13 @@ unsigned long native_calibrate_tsc(void)
384{ 385{
385 u64 tsc1, tsc2, delta, ref1, ref2; 386 u64 tsc1, tsc2, delta, ref1, ref2;
386 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 387 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
387 unsigned long flags, latch, ms, fast_calibrate, tsc_khz; 388 unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
388 int hpet = is_hpet_enabled(), i, loopmin; 389 int hpet = is_hpet_enabled(), i, loopmin;
389 390
390 tsc_khz = get_hypervisor_tsc_freq(); 391 hv_tsc_khz = get_hypervisor_tsc_freq();
391 if (tsc_khz) { 392 if (hv_tsc_khz) {
392 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); 393 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
393 return tsc_khz; 394 return hv_tsc_khz;
394 } 395 }
395 396
396 local_irq_save(flags); 397 local_irq_save(flags);
@@ -589,22 +590,26 @@ EXPORT_SYMBOL(recalibrate_cpu_khz);
589 */ 590 */
590 591
591DEFINE_PER_CPU(unsigned long, cyc2ns); 592DEFINE_PER_CPU(unsigned long, cyc2ns);
593DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
592 594
593static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) 595static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
594{ 596{
595 unsigned long long tsc_now, ns_now; 597 unsigned long long tsc_now, ns_now, *offset;
596 unsigned long flags, *scale; 598 unsigned long flags, *scale;
597 599
598 local_irq_save(flags); 600 local_irq_save(flags);
599 sched_clock_idle_sleep_event(); 601 sched_clock_idle_sleep_event();
600 602
601 scale = &per_cpu(cyc2ns, cpu); 603 scale = &per_cpu(cyc2ns, cpu);
604 offset = &per_cpu(cyc2ns_offset, cpu);
602 605
603 rdtscll(tsc_now); 606 rdtscll(tsc_now);
604 ns_now = __cycles_2_ns(tsc_now); 607 ns_now = __cycles_2_ns(tsc_now);
605 608
606 if (cpu_khz) 609 if (cpu_khz) {
607 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; 610 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
611 *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
612 }
608 613
609 sched_clock_idle_wakeup_event(0); 614 sched_clock_idle_wakeup_event(0);
610 local_irq_restore(flags); 615 local_irq_restore(flags);
@@ -631,17 +636,15 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
631 void *data) 636 void *data)
632{ 637{
633 struct cpufreq_freqs *freq = data; 638 struct cpufreq_freqs *freq = data;
634 unsigned long *lpj, dummy; 639 unsigned long *lpj;
635 640
636 if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) 641 if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
637 return 0; 642 return 0;
638 643
639 lpj = &dummy; 644 lpj = &boot_cpu_data.loops_per_jiffy;
640 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
641#ifdef CONFIG_SMP 645#ifdef CONFIG_SMP
646 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
642 lpj = &cpu_data(freq->cpu).loops_per_jiffy; 647 lpj = &cpu_data(freq->cpu).loops_per_jiffy;
643#else
644 lpj = &boot_cpu_data.loops_per_jiffy;
645#endif 648#endif
646 649
647 if (!ref_freq) { 650 if (!ref_freq) {
@@ -710,7 +713,16 @@ static cycle_t read_tsc(struct clocksource *cs)
710#ifdef CONFIG_X86_64 713#ifdef CONFIG_X86_64
711static cycle_t __vsyscall_fn vread_tsc(void) 714static cycle_t __vsyscall_fn vread_tsc(void)
712{ 715{
713 cycle_t ret = (cycle_t)vget_cycles(); 716 cycle_t ret;
717
718 /*
719 * Surround the RDTSC by barriers, to make sure it's not
720 * speculated to outside the seqlock critical section and
721 * does not cause time warps:
722 */
723 rdtsc_barrier();
724 ret = (cycle_t)vget_cycles();
725 rdtsc_barrier();
714 726
715 return ret >= __vsyscall_gtod_data.clock.cycle_last ? 727 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
716 ret : __vsyscall_gtod_data.clock.cycle_last; 728 ret : __vsyscall_gtod_data.clock.cycle_last;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bf36328f6ef9..027b5b498993 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;
34 * of a critical section, to be able to prove TSC time-warps: 34 * of a critical section, to be able to prove TSC time-warps:
35 */ 35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; 36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
37
37static __cpuinitdata cycles_t last_tsc; 38static __cpuinitdata cycles_t last_tsc;
38static __cpuinitdata cycles_t max_warp; 39static __cpuinitdata cycles_t max_warp;
39static __cpuinitdata int nr_warps; 40static __cpuinitdata int nr_warps;
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
113 return; 114 return;
114 115
115 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
116 printk(KERN_INFO 117 pr_info("Skipping synchronization checks as TSC is reliable.\n");
117 "Skipping synchronization checks as TSC is reliable.\n");
118 return; 118 return;
119 } 119 }
120 120
121 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", 121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu); 122 smp_processor_id(), cpu);
123 123
124 /* 124 /*
125 * Reset it - in case this is a second bootup: 125 * Reset it - in case this is a second bootup:
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)
143 143
144 if (nr_warps) { 144 if (nr_warps) {
145 printk("\n"); 145 printk("\n");
146 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 " turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 printk(" passed.\n");
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)
195 while (atomic_read(&stop_count) != cpus) 195 while (atomic_read(&stop_count) != cpus)
196 cpu_relax(); 196 cpu_relax();
197} 197}
198#undef NR_LOOPS
199
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index d7ac84e7fc1c..9c4e62539058 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
287 info->regs.pt.ds = 0; 287 info->regs.pt.ds = 0;
288 info->regs.pt.es = 0; 288 info->regs.pt.es = 0;
289 info->regs.pt.fs = 0; 289 info->regs.pt.fs = 0;
290 290#ifndef CONFIG_X86_32_LAZY_GS
291/* we are clearing gs later just before "jmp resume_userspace", 291 info->regs.pt.gs = 0;
292 * because it is not saved/restored. 292#endif
293 */
294 293
295/* 294/*
296 * The flags register is also special: we cannot trust that the user 295 * The flags register is also special: we cannot trust that the user
@@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
318 } 317 }
319 318
320/* 319/*
321 * Save old state, set default return value (%ax) to 0 320 * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
322 */ 321 */
323 info->regs32->ax = 0; 322 info->regs32->ax = VM86_SIGNAL;
324 tsk->thread.saved_sp0 = tsk->thread.sp0; 323 tsk->thread.saved_sp0 = tsk->thread.sp0;
325 tsk->thread.saved_fs = info->regs32->fs; 324 tsk->thread.saved_fs = info->regs32->fs;
326 tsk->thread.saved_gs = get_user_gs(info->regs32); 325 tsk->thread.saved_gs = get_user_gs(info->regs32);
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
343 __asm__ __volatile__( 342 __asm__ __volatile__(
344 "movl %0,%%esp\n\t" 343 "movl %0,%%esp\n\t"
345 "movl %1,%%ebp\n\t" 344 "movl %1,%%ebp\n\t"
345#ifdef CONFIG_X86_32_LAZY_GS
346 "mov %2, %%gs\n\t" 346 "mov %2, %%gs\n\t"
347#endif
347 "jmp resume_userspace" 348 "jmp resume_userspace"
348 : /* no outputs */ 349 : /* no outputs */
349 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); 350 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95deb9f2211e..b263423fbe2a 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
462} 462}
463#endif 463#endif
464 464
465static void vmi_enter_lazy_cpu(void) 465static void vmi_start_context_switch(struct task_struct *prev)
466{ 466{
467 paravirt_enter_lazy_cpu(); 467 paravirt_start_context_switch(prev);
468 vmi_ops.set_lazy_mode(2); 468 vmi_ops.set_lazy_mode(2);
469} 469}
470 470
471static void vmi_end_context_switch(struct task_struct *next)
472{
473 vmi_ops.set_lazy_mode(0);
474 paravirt_end_context_switch(next);
475}
476
471static void vmi_enter_lazy_mmu(void) 477static void vmi_enter_lazy_mmu(void)
472{ 478{
473 paravirt_enter_lazy_mmu(); 479 paravirt_enter_lazy_mmu();
474 vmi_ops.set_lazy_mode(1); 480 vmi_ops.set_lazy_mode(1);
475} 481}
476 482
477static void vmi_leave_lazy(void) 483static void vmi_leave_lazy_mmu(void)
478{ 484{
479 paravirt_leave_lazy(paravirt_get_lazy_mode());
480 vmi_ops.set_lazy_mode(0); 485 vmi_ops.set_lazy_mode(0);
486 paravirt_leave_lazy_mmu();
481} 487}
482 488
483static inline int __init check_vmi_rom(struct vrom_header *rom) 489static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -711,14 +717,14 @@ static inline int __init activate_vmi(void)
711 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); 717 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
712 para_fill(pv_cpu_ops.io_delay, IODelay); 718 para_fill(pv_cpu_ops.io_delay, IODelay);
713 719
714 para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, 720 para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
715 set_lazy_mode, SetLazyMode); 721 set_lazy_mode, SetLazyMode);
716 para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, 722 para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
717 set_lazy_mode, SetLazyMode); 723 set_lazy_mode, SetLazyMode);
718 724
719 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, 725 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
720 set_lazy_mode, SetLazyMode); 726 set_lazy_mode, SetLazyMode);
721 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, 727 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
722 set_lazy_mode, SetLazyMode); 728 set_lazy_mode, SetLazyMode);
723 729
724 /* user and kernel flush are just handled with different flags to FlushTLB */ 730 /* user and kernel flush are just handled with different flags to FlushTLB */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 849ee611f013..367e87882041 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,5 +1,433 @@
1/*
2 * ld script for the x86 kernel
3 *
4 * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
5 *
6 * Modernisation, unification and other changes and fixes:
7 * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org>
8 *
9 *
10 * Don't define absolute symbols until and unless you know that symbol
11 * value is should remain constant even if kernel image is relocated
12 * at run time. Absolute symbols are not relocated. If symbol value should
13 * change if kernel is relocated, make the symbol section relative and
14 * put it inside the section definition.
15 */
16
1#ifdef CONFIG_X86_32 17#ifdef CONFIG_X86_32
2# include "vmlinux_32.lds.S" 18#define LOAD_OFFSET __PAGE_OFFSET
3#else 19#else
4# include "vmlinux_64.lds.S" 20#define LOAD_OFFSET __START_KERNEL_map
5#endif 21#endif
22
23#include <asm-generic/vmlinux.lds.h>
24#include <asm/asm-offsets.h>
25#include <asm/thread_info.h>
26#include <asm/page_types.h>
27#include <asm/cache.h>
28#include <asm/boot.h>
29
30#undef i386 /* in case the preprocessor is a 32bit one */
31
32OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
33
34#ifdef CONFIG_X86_32
35OUTPUT_ARCH(i386)
36ENTRY(phys_startup_32)
37jiffies = jiffies_64;
38#else
39OUTPUT_ARCH(i386:x86-64)
40ENTRY(phys_startup_64)
41jiffies_64 = jiffies;
42#endif
43
44PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */
47#ifdef CONFIG_X86_64
48 user PT_LOAD FLAGS(7); /* RWE */
49 data.init PT_LOAD FLAGS(7); /* RWE */
50#ifdef CONFIG_SMP
51 percpu PT_LOAD FLAGS(7); /* RWE */
52#endif
53 data.init2 PT_LOAD FLAGS(7); /* RWE */
54#endif
55 note PT_NOTE FLAGS(0); /* ___ */
56}
57
58SECTIONS
59{
60#ifdef CONFIG_X86_32
61 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
62 phys_startup_32 = startup_32 - LOAD_OFFSET;
63#else
64 . = __START_KERNEL;
65 phys_startup_64 = startup_64 - LOAD_OFFSET;
66#endif
67
68 /* Text and read-only data */
69
70 /* bootstrapping code */
71 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
72 _text = .;
73 *(.text.head)
74 } :text = 0x9090
75
76 /* The rest of the text */
77 .text : AT(ADDR(.text) - LOAD_OFFSET) {
78#ifdef CONFIG_X86_32
79 /* not really needed, already page aligned */
80 . = ALIGN(PAGE_SIZE);
81 *(.text.page_aligned)
82#endif
83 . = ALIGN(8);
84 _stext = .;
85 TEXT_TEXT
86 SCHED_TEXT
87 LOCK_TEXT
88 KPROBES_TEXT
89 IRQENTRY_TEXT
90 *(.fixup)
91 *(.gnu.warning)
92 /* End of text section */
93 _etext = .;
94 } :text = 0x9090
95
96 NOTES :text :note
97
98 /* Exception table */
99 . = ALIGN(16);
100 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
101 __start___ex_table = .;
102 *(__ex_table)
103 __stop___ex_table = .;
104 } :text = 0x9090
105
106 RODATA
107
108 /* Data */
109 . = ALIGN(PAGE_SIZE);
110 .data : AT(ADDR(.data) - LOAD_OFFSET) {
111 /* Start of data section */
112 _sdata = .;
113 DATA_DATA
114 CONSTRUCTORS
115
116#ifdef CONFIG_X86_64
117 /* End of data section */
118 _edata = .;
119#endif
120 } :data
121
122#ifdef CONFIG_X86_32
123 /* 32 bit has nosave before _edata */
124 . = ALIGN(PAGE_SIZE);
125 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
126 __nosave_begin = .;
127 *(.data.nosave)
128 . = ALIGN(PAGE_SIZE);
129 __nosave_end = .;
130 }
131#endif
132
133 . = ALIGN(PAGE_SIZE);
134 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
135 *(.data.page_aligned)
136 *(.data.idt)
137 }
138
139#ifdef CONFIG_X86_32
140 . = ALIGN(32);
141#else
142 . = ALIGN(PAGE_SIZE);
143 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
144#endif
145 .data.cacheline_aligned :
146 AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
147 *(.data.cacheline_aligned)
148 }
149
150 /* rarely changed data like cpu maps */
151#ifdef CONFIG_X86_32
152 . = ALIGN(32);
153#else
154 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
155#endif
156 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
157 *(.data.read_mostly)
158
159#ifdef CONFIG_X86_32
160 /* End of data section */
161 _edata = .;
162#endif
163 }
164
165#ifdef CONFIG_X86_64
166
167#define VSYSCALL_ADDR (-10*1024*1024)
168#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
169 SIZEOF(.data.read_mostly) + 4095) & ~(4095))
170#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
171 SIZEOF(.data.read_mostly) + 4095) & ~(4095))
172
173#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
174#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
175
176#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
177#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
178
179 . = VSYSCALL_ADDR;
180 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
181 *(.vsyscall_0)
182 } :user
183
184 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
185
186 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
187 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
188 *(.vsyscall_fn)
189 }
190
191 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
192 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
193 *(.vsyscall_gtod_data)
194 }
195
196 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
197 .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
198 *(.vsyscall_clock)
199 }
200 vsyscall_clock = VVIRT(.vsyscall_clock);
201
202
203 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
204 *(.vsyscall_1)
205 }
206 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
207 *(.vsyscall_2)
208 }
209
210 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
211 *(.vgetcpu_mode)
212 }
213 vgetcpu_mode = VVIRT(.vgetcpu_mode);
214
215 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
216 .jiffies : AT(VLOAD(.jiffies)) {
217 *(.jiffies)
218 }
219 jiffies = VVIRT(.jiffies);
220
221 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
222 *(.vsyscall_3)
223 }
224
225 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
226
227#undef VSYSCALL_ADDR
228#undef VSYSCALL_PHYS_ADDR
229#undef VSYSCALL_VIRT_ADDR
230#undef VLOAD_OFFSET
231#undef VLOAD
232#undef VVIRT_OFFSET
233#undef VVIRT
234
235#endif /* CONFIG_X86_64 */
236
237 /* init_task */
238 . = ALIGN(THREAD_SIZE);
239 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
240 *(.data.init_task)
241 }
242#ifdef CONFIG_X86_64
243 :data.init
244#endif
245
246 /*
247 * smp_locks might be freed after init
248 * start/end must be page aligned
249 */
250 . = ALIGN(PAGE_SIZE);
251 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
252 __smp_locks = .;
253 *(.smp_locks)
254 __smp_locks_end = .;
255 . = ALIGN(PAGE_SIZE);
256 }
257
258 /* Init code and data - will be freed after init */
259 . = ALIGN(PAGE_SIZE);
260 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
261 __init_begin = .; /* paired with __init_end */
262 _sinittext = .;
263 INIT_TEXT
264 _einittext = .;
265 }
266
267 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
268 INIT_DATA
269 }
270
271 . = ALIGN(16);
272 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
273 __setup_start = .;
274 *(.init.setup)
275 __setup_end = .;
276 }
277 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
278 __initcall_start = .;
279 INITCALLS
280 __initcall_end = .;
281 }
282
283 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
284 __con_initcall_start = .;
285 *(.con_initcall.init)
286 __con_initcall_end = .;
287 }
288
289 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
290 __x86_cpu_dev_start = .;
291 *(.x86_cpu_dev.init)
292 __x86_cpu_dev_end = .;
293 }
294
295 SECURITY_INIT
296
297 . = ALIGN(8);
298 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
299 __parainstructions = .;
300 *(.parainstructions)
301 __parainstructions_end = .;
302 }
303
304 . = ALIGN(8);
305 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
306 __alt_instructions = .;
307 *(.altinstructions)
308 __alt_instructions_end = .;
309 }
310
311 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
312 *(.altinstr_replacement)
313 }
314
315 /*
316 * .exit.text is discard at runtime, not link time, to deal with
317 * references from .altinstructions and .eh_frame
318 */
319 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
320 EXIT_TEXT
321 }
322
323 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
324 EXIT_DATA
325 }
326
327#ifdef CONFIG_BLK_DEV_INITRD
328 . = ALIGN(PAGE_SIZE);
329 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
330 __initramfs_start = .;
331 *(.init.ramfs)
332 __initramfs_end = .;
333 }
334#endif
335
336#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
337 /*
338 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
339 * output PHDR, so the next output section - __data_nosave - should
340 * start another section data.init2. Also, pda should be at the head of
341 * percpu area. Preallocate it and define the percpu offset symbol
342 * so that it can be accessed as a percpu variable.
343 */
344 . = ALIGN(PAGE_SIZE);
345 PERCPU_VADDR(0, :percpu)
346#else
347 PERCPU(PAGE_SIZE)
348#endif
349
350 . = ALIGN(PAGE_SIZE);
351
352 /* freed after init ends here */
353 .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
354 __init_end = .;
355 }
356
357#ifdef CONFIG_X86_64
358 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
359 . = ALIGN(PAGE_SIZE);
360 __nosave_begin = .;
361 *(.data.nosave)
362 . = ALIGN(PAGE_SIZE);
363 __nosave_end = .;
364 } :data.init2
365 /* use another section data.init2, see PERCPU_VADDR() above */
366#endif
367
368 /* BSS */
369 . = ALIGN(PAGE_SIZE);
370 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
371 __bss_start = .;
372 *(.bss.page_aligned)
373 *(.bss)
374 . = ALIGN(4);
375 __bss_stop = .;
376 }
377
378 . = ALIGN(PAGE_SIZE);
379 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
380 __brk_base = .;
381 . += 64 * 1024; /* 64k alignment slop space */
382 *(.brk_reservation) /* areas brk users have reserved */
383 __brk_limit = .;
384 }
385
386 .end : AT(ADDR(.end) - LOAD_OFFSET) {
387 _end = .;
388 }
389
390 /* Sections to be discarded */
391 /DISCARD/ : {
392 *(.exitcall.exit)
393 *(.eh_frame)
394 *(.discard)
395 }
396
397 STABS_DEBUG
398 DWARF_DEBUG
399}
400
401
402#ifdef CONFIG_X86_32
403ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
404 "kernel image bigger than KERNEL_IMAGE_SIZE")
405#else
406/*
407 * Per-cpu symbols which need to be offset from __per_cpu_load
408 * for the boot processor.
409 */
410#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
411INIT_PER_CPU(gdt_page);
412INIT_PER_CPU(irq_stack_union);
413
414/*
415 * Build-time check on the image size:
416 */
417ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
418 "kernel image bigger than KERNEL_IMAGE_SIZE")
419
420#ifdef CONFIG_SMP
421ASSERT((per_cpu__irq_stack_union == 0),
422 "irq_stack_union is not at start of per-cpu area");
423#endif
424
425#endif /* CONFIG_X86_32 */
426
427#ifdef CONFIG_KEXEC
428#include <asm/kexec.h>
429
430ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
431 "kexec control code size is too big")
432#endif
433
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
deleted file mode 100644
index 62ad500d55f3..000000000000
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,229 +0,0 @@
1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 *
4 * Don't define absolute symbols until and unless you know that symbol
5 * value is should remain constant even if kernel image is relocated
6 * at run time. Absolute symbols are not relocated. If symbol value should
7 * change if kernel is relocated, make the symbol section relative and
8 * put it inside the section definition.
9 */
10
11#define LOAD_OFFSET __PAGE_OFFSET
12
13#include <asm-generic/vmlinux.lds.h>
14#include <asm/thread_info.h>
15#include <asm/page_types.h>
16#include <asm/cache.h>
17#include <asm/boot.h>
18
19OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
20OUTPUT_ARCH(i386)
21ENTRY(phys_startup_32)
22jiffies = jiffies_64;
23
24PHDRS {
25 text PT_LOAD FLAGS(5); /* R_E */
26 data PT_LOAD FLAGS(7); /* RWE */
27 note PT_NOTE FLAGS(0); /* ___ */
28}
29SECTIONS
30{
31 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
32 phys_startup_32 = startup_32 - LOAD_OFFSET;
33
34 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
35 _text = .; /* Text and read-only data */
36 *(.text.head)
37 } :text = 0x9090
38
39 /* read-only */
40 .text : AT(ADDR(.text) - LOAD_OFFSET) {
41 . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */
42 *(.text.page_aligned)
43 TEXT_TEXT
44 SCHED_TEXT
45 LOCK_TEXT
46 KPROBES_TEXT
47 IRQENTRY_TEXT
48 *(.fixup)
49 *(.gnu.warning)
50 _etext = .; /* End of text section */
51 } :text = 0x9090
52
53 NOTES :text :note
54
55 . = ALIGN(16); /* Exception table */
56 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
57 __start___ex_table = .;
58 *(__ex_table)
59 __stop___ex_table = .;
60 } :text = 0x9090
61
62 RODATA
63
64 /* writeable */
65 . = ALIGN(PAGE_SIZE);
66 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
67 DATA_DATA
68 CONSTRUCTORS
69 } :data
70
71 . = ALIGN(PAGE_SIZE);
72 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
73 __nosave_begin = .;
74 *(.data.nosave)
75 . = ALIGN(PAGE_SIZE);
76 __nosave_end = .;
77 }
78
79 . = ALIGN(PAGE_SIZE);
80 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
81 *(.data.page_aligned)
82 *(.data.idt)
83 }
84
85 . = ALIGN(32);
86 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
87 *(.data.cacheline_aligned)
88 }
89
90 /* rarely changed data like cpu maps */
91 . = ALIGN(32);
92 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
93 *(.data.read_mostly)
94 _edata = .; /* End of data section */
95 }
96
97 . = ALIGN(THREAD_SIZE); /* init_task */
98 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
99 *(.data.init_task)
100 }
101
102 /* might get freed after init */
103 . = ALIGN(PAGE_SIZE);
104 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
105 __smp_locks = .;
106 *(.smp_locks)
107 __smp_locks_end = .;
108 }
109 /* will be freed after init
110 * Following ALIGN() is required to make sure no other data falls on the
111 * same page where __smp_alt_end is pointing as that page might be freed
112 * after boot. Always make sure that ALIGN() directive is present after
113 * the section which contains __smp_alt_end.
114 */
115 . = ALIGN(PAGE_SIZE);
116
117 /* will be freed after init */
118 . = ALIGN(PAGE_SIZE); /* Init code and data */
119 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
120 __init_begin = .;
121 _sinittext = .;
122 INIT_TEXT
123 _einittext = .;
124 }
125 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
126 INIT_DATA
127 }
128 . = ALIGN(16);
129 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
130 __setup_start = .;
131 *(.init.setup)
132 __setup_end = .;
133 }
134 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
135 __initcall_start = .;
136 INITCALLS
137 __initcall_end = .;
138 }
139 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
140 __con_initcall_start = .;
141 *(.con_initcall.init)
142 __con_initcall_end = .;
143 }
144 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
145 __x86_cpu_dev_start = .;
146 *(.x86_cpu_dev.init)
147 __x86_cpu_dev_end = .;
148 }
149 SECURITY_INIT
150 . = ALIGN(4);
151 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
152 __alt_instructions = .;
153 *(.altinstructions)
154 __alt_instructions_end = .;
155 }
156 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
157 *(.altinstr_replacement)
158 }
159 . = ALIGN(4);
160 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
161 __parainstructions = .;
162 *(.parainstructions)
163 __parainstructions_end = .;
164 }
165 /* .exit.text is discard at runtime, not link time, to deal with references
166 from .altinstructions and .eh_frame */
167 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
168 EXIT_TEXT
169 }
170 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
171 EXIT_DATA
172 }
173#if defined(CONFIG_BLK_DEV_INITRD)
174 . = ALIGN(PAGE_SIZE);
175 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
176 __initramfs_start = .;
177 *(.init.ramfs)
178 __initramfs_end = .;
179 }
180#endif
181 PERCPU(PAGE_SIZE)
182 . = ALIGN(PAGE_SIZE);
183 /* freed after init ends here */
184
185 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
186 __init_end = .;
187 __bss_start = .; /* BSS */
188 *(.bss.page_aligned)
189 *(.bss)
190 . = ALIGN(4);
191 __bss_stop = .;
192 }
193
194 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
195 . = ALIGN(PAGE_SIZE);
196 __brk_base = . ;
197 . += 64 * 1024 ; /* 64k alignment slop space */
198 *(.brk_reservation) /* areas brk users have reserved */
199 __brk_limit = . ;
200 }
201
202 .end : AT(ADDR(.end) - LOAD_OFFSET) {
203 _end = . ;
204 }
205
206 /* Sections to be discarded */
207 /DISCARD/ : {
208 *(.exitcall.exit)
209 *(.discard)
210 }
211
212 STABS_DEBUG
213
214 DWARF_DEBUG
215}
216
217/*
218 * Build-time check on the image size:
219 */
220ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
221 "kernel image bigger than KERNEL_IMAGE_SIZE")
222
223#ifdef CONFIG_KEXEC
224/* Link time checks */
225#include <asm/kexec.h>
226
227ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
228 "kexec control code size is too big")
229#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
deleted file mode 100644
index c8742507b030..000000000000
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,298 +0,0 @@
1/* ld script to make x86-64 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */
4
5#define LOAD_OFFSET __START_KERNEL_map
6
7#include <asm-generic/vmlinux.lds.h>
8#include <asm/asm-offsets.h>
9#include <asm/page_types.h>
10
11#undef i386 /* in case the preprocessor is a 32bit one */
12
13OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
14OUTPUT_ARCH(i386:x86-64)
15ENTRY(phys_startup_64)
16jiffies_64 = jiffies;
17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */
22#ifdef CONFIG_SMP
23 percpu PT_LOAD FLAGS(7); /* RWE */
24#endif
25 data.init2 PT_LOAD FLAGS(7); /* RWE */
26 note PT_NOTE FLAGS(0); /* ___ */
27}
28SECTIONS
29{
30 . = __START_KERNEL;
31 phys_startup_64 = startup_64 - LOAD_OFFSET;
32 .text : AT(ADDR(.text) - LOAD_OFFSET) {
33 _text = .; /* Text and read-only data */
34 /* First the code that has to be first for bootstrapping */
35 *(.text.head)
36 _stext = .;
37 /* Then the rest */
38 TEXT_TEXT
39 SCHED_TEXT
40 LOCK_TEXT
41 KPROBES_TEXT
42 IRQENTRY_TEXT
43 *(.fixup)
44 *(.gnu.warning)
45 _etext = .; /* End of text section */
46 } :text = 0x9090
47
48 NOTES :text :note
49
50 . = ALIGN(16); /* Exception table */
51 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
52 __start___ex_table = .;
53 *(__ex_table)
54 __stop___ex_table = .;
55 } :text = 0x9090
56
57 RODATA
58
59 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
60 /* Data */
61 .data : AT(ADDR(.data) - LOAD_OFFSET) {
62 DATA_DATA
63 CONSTRUCTORS
64 _edata = .; /* End of data section */
65 } :data
66
67
68 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
69 . = ALIGN(PAGE_SIZE);
70 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
71 *(.data.cacheline_aligned)
72 }
73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
74 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
75 *(.data.read_mostly)
76 }
77
78#define VSYSCALL_ADDR (-10*1024*1024)
79#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
80#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
81
82#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
83#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
84
85#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
86#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
87
88 . = VSYSCALL_ADDR;
89 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
90 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
91
92 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
93 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
94 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
95 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
96 { *(.vsyscall_gtod_data) }
97 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
98 .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
99 { *(.vsyscall_clock) }
100 vsyscall_clock = VVIRT(.vsyscall_clock);
101
102
103 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
104 { *(.vsyscall_1) }
105 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
106 { *(.vsyscall_2) }
107
108 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
109 vgetcpu_mode = VVIRT(.vgetcpu_mode);
110
111 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
112 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
113 jiffies = VVIRT(.jiffies);
114
115 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
116 { *(.vsyscall_3) }
117
118 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
119
120#undef VSYSCALL_ADDR
121#undef VSYSCALL_PHYS_ADDR
122#undef VSYSCALL_VIRT_ADDR
123#undef VLOAD_OFFSET
124#undef VLOAD
125#undef VVIRT_OFFSET
126#undef VVIRT
127
128 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
129 . = ALIGN(THREAD_SIZE); /* init_task */
130 *(.data.init_task)
131 }:data.init
132
133 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
134 . = ALIGN(PAGE_SIZE);
135 *(.data.page_aligned)
136 }
137
138 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
139 /* might get freed after init */
140 . = ALIGN(PAGE_SIZE);
141 __smp_alt_begin = .;
142 __smp_locks = .;
143 *(.smp_locks)
144 __smp_locks_end = .;
145 . = ALIGN(PAGE_SIZE);
146 __smp_alt_end = .;
147 }
148
149 . = ALIGN(PAGE_SIZE); /* Init code and data */
150 __init_begin = .; /* paired with __init_end */
151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
152 _sinittext = .;
153 INIT_TEXT
154 _einittext = .;
155 }
156 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
157 __initdata_begin = .;
158 INIT_DATA
159 __initdata_end = .;
160 }
161
162 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
163 . = ALIGN(16);
164 __setup_start = .;
165 *(.init.setup)
166 __setup_end = .;
167 }
168 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
169 __initcall_start = .;
170 INITCALLS
171 __initcall_end = .;
172 }
173 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
174 __con_initcall_start = .;
175 *(.con_initcall.init)
176 __con_initcall_end = .;
177 }
178 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
179 __x86_cpu_dev_start = .;
180 *(.x86_cpu_dev.init)
181 __x86_cpu_dev_end = .;
182 }
183 SECURITY_INIT
184
185 . = ALIGN(8);
186 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
187 __parainstructions = .;
188 *(.parainstructions)
189 __parainstructions_end = .;
190 }
191
192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
193 . = ALIGN(8);
194 __alt_instructions = .;
195 *(.altinstructions)
196 __alt_instructions_end = .;
197 }
198 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
199 *(.altinstr_replacement)
200 }
201 /* .exit.text is discard at runtime, not link time, to deal with references
202 from .altinstructions and .eh_frame */
203 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
204 EXIT_TEXT
205 }
206 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
207 EXIT_DATA
208 }
209
210#ifdef CONFIG_BLK_DEV_INITRD
211 . = ALIGN(PAGE_SIZE);
212 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
213 __initramfs_start = .;
214 *(.init.ramfs)
215 __initramfs_end = .;
216 }
217#endif
218
219#ifdef CONFIG_SMP
220 /*
221 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
222 * output PHDR, so the next output section - __data_nosave - should
223 * start another section data.init2. Also, pda should be at the head of
224 * percpu area. Preallocate it and define the percpu offset symbol
225 * so that it can be accessed as a percpu variable.
226 */
227 . = ALIGN(PAGE_SIZE);
228 PERCPU_VADDR(0, :percpu)
229#else
230 PERCPU(PAGE_SIZE)
231#endif
232
233 . = ALIGN(PAGE_SIZE);
234 __init_end = .;
235
236 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
237 . = ALIGN(PAGE_SIZE);
238 __nosave_begin = .;
239 *(.data.nosave)
240 . = ALIGN(PAGE_SIZE);
241 __nosave_end = .;
242 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
243
244 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
245 . = ALIGN(PAGE_SIZE);
246 __bss_start = .; /* BSS */
247 *(.bss.page_aligned)
248 *(.bss)
249 __bss_stop = .;
250 }
251
252 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
253 . = ALIGN(PAGE_SIZE);
254 __brk_base = . ;
255 . += 64 * 1024 ; /* 64k alignment slop space */
256 *(.brk_reservation) /* areas brk users have reserved */
257 __brk_limit = . ;
258 }
259
260 _end = . ;
261
262 /* Sections to be discarded */
263 /DISCARD/ : {
264 *(.exitcall.exit)
265 *(.eh_frame)
266 *(.discard)
267 }
268
269 STABS_DEBUG
270
271 DWARF_DEBUG
272}
273
274 /*
275 * Per-cpu symbols which need to be offset from __per_cpu_load
276 * for the boot processor.
277 */
278#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
279INIT_PER_CPU(gdt_page);
280INIT_PER_CPU(irq_stack_union);
281
282/*
283 * Build-time check on the image size:
284 */
285ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
286 "kernel image bigger than KERNEL_IMAGE_SIZE")
287
288#ifdef CONFIG_SMP
289ASSERT((per_cpu__irq_stack_union == 0),
290 "irq_stack_union is not at start of per-cpu area");
291#endif
292
293#ifdef CONFIG_KEXEC
294#include <asm/kexec.h>
295
296ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
297 "kexec control code size is too big")
298#endif
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 44153afc9067..25ee06a80aad 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
132 return; 132 return;
133 } 133 }
134 134
135 /*
136 * Surround the RDTSC by barriers, to make sure it's not
137 * speculated to outside the seqlock critical section and
138 * does not cause time warps:
139 */
140 rdtsc_barrier();
141 now = vread(); 135 now = vread();
142 rdtsc_barrier();
143
144 base = __vsyscall_gtod_data.clock.cycle_last; 136 base = __vsyscall_gtod_data.clock.cycle_last;
145 mask = __vsyscall_gtod_data.clock.mask; 137 mask = __vsyscall_gtod_data.clock.mask;
146 mult = __vsyscall_gtod_data.clock.mult; 138 mult = __vsyscall_gtod_data.clock.mult;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a58504ea78cc..8600a09e0c6c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -50,6 +50,9 @@ config KVM_INTEL
50 Provides support for KVM on Intel processors equipped with the VT 50 Provides support for KVM on Intel processors equipped with the VT
51 extensions. 51 extensions.
52 52
53 To compile this as a module, choose M here: the module
54 will be called kvm-intel.
55
53config KVM_AMD 56config KVM_AMD
54 tristate "KVM for AMD processors support" 57 tristate "KVM for AMD processors support"
55 depends on KVM 58 depends on KVM
@@ -57,6 +60,9 @@ config KVM_AMD
57 Provides support for KVM on AMD processors equipped with the AMD-V 60 Provides support for KVM on AMD processors equipped with the AMD-V
58 (SVM) extensions. 61 (SVM) extensions.
59 62
63 To compile this as a module, choose M here: the module
64 will be called kvm-amd.
65
60config KVM_TRACE 66config KVM_TRACE
61 bool "KVM trace support" 67 bool "KVM trace support"
62 depends on KVM && SYSFS 68 depends on KVM && SYSFS
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d3ec292f00f2..b43c4efafe80 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ endif
14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
15 15
16kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ 16kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
17 i8254.o 17 i8254.o timer.o
18obj-$(CONFIG_KVM) += kvm.o 18obj-$(CONFIG_KVM) += kvm.o
19kvm-intel-objs = vmx.o 19kvm-intel-objs = vmx.o
20obj-$(CONFIG_KVM_INTEL) += kvm-intel.o 20obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c13bb92d3157..4d6f0d293ee2 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -98,6 +98,37 @@ static int pit_get_gate(struct kvm *kvm, int channel)
98 return kvm->arch.vpit->pit_state.channels[channel].gate; 98 return kvm->arch.vpit->pit_state.channels[channel].gate;
99} 99}
100 100
101static s64 __kpit_elapsed(struct kvm *kvm)
102{
103 s64 elapsed;
104 ktime_t remaining;
105 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
106
107 /*
108 * The Counter does not stop when it reaches zero. In
109 * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
110 * the highest count, either FFFF hex for binary counting
111 * or 9999 for BCD counting, and continues counting.
112 * Modes 2 and 3 are periodic; the Counter reloads
113 * itself with the initial count and continues counting
114 * from there.
115 */
116 remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
117 elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
118 elapsed = mod_64(elapsed, ps->pit_timer.period);
119
120 return elapsed;
121}
122
123static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
124 int channel)
125{
126 if (channel == 0)
127 return __kpit_elapsed(kvm);
128
129 return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
130}
131
101static int pit_get_count(struct kvm *kvm, int channel) 132static int pit_get_count(struct kvm *kvm, int channel)
102{ 133{
103 struct kvm_kpit_channel_state *c = 134 struct kvm_kpit_channel_state *c =
@@ -107,7 +138,7 @@ static int pit_get_count(struct kvm *kvm, int channel)
107 138
108 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 139 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
109 140
110 t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); 141 t = kpit_elapsed(kvm, c, channel);
111 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); 142 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
112 143
113 switch (c->mode) { 144 switch (c->mode) {
@@ -137,7 +168,7 @@ static int pit_get_out(struct kvm *kvm, int channel)
137 168
138 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 169 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
139 170
140 t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); 171 t = kpit_elapsed(kvm, c, channel);
141 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); 172 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
142 173
143 switch (c->mode) { 174 switch (c->mode) {
@@ -193,28 +224,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
193 } 224 }
194} 225}
195 226
196static int __pit_timer_fn(struct kvm_kpit_state *ps)
197{
198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
199 struct kvm_kpit_timer *pt = &ps->pit_timer;
200
201 if (!atomic_inc_and_test(&pt->pending))
202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
203
204 if (!pt->reinject)
205 atomic_set(&pt->pending, 1);
206
207 if (vcpu0 && waitqueue_active(&vcpu0->wq))
208 wake_up_interruptible(&vcpu0->wq);
209
210 hrtimer_add_expires_ns(&pt->timer, pt->period);
211 pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
212 if (pt->period)
213 ps->channels[0].count_load_time = ktime_get();
214
215 return (pt->period == 0 ? 0 : 1);
216}
217
218int pit_has_pending_timer(struct kvm_vcpu *vcpu) 227int pit_has_pending_timer(struct kvm_vcpu *vcpu)
219{ 228{
220 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 229 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -235,21 +244,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
235 spin_unlock(&ps->inject_lock); 244 spin_unlock(&ps->inject_lock);
236} 245}
237 246
238static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
239{
240 struct kvm_kpit_state *ps;
241 int restart_timer = 0;
242
243 ps = container_of(data, struct kvm_kpit_state, pit_timer.timer);
244
245 restart_timer = __pit_timer_fn(ps);
246
247 if (restart_timer)
248 return HRTIMER_RESTART;
249 else
250 return HRTIMER_NORESTART;
251}
252
253void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 247void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
254{ 248{
255 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 249 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -263,15 +257,26 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
263 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 257 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
264} 258}
265 259
266static void destroy_pit_timer(struct kvm_kpit_timer *pt) 260static void destroy_pit_timer(struct kvm_timer *pt)
267{ 261{
268 pr_debug("pit: execute del timer!\n"); 262 pr_debug("pit: execute del timer!\n");
269 hrtimer_cancel(&pt->timer); 263 hrtimer_cancel(&pt->timer);
270} 264}
271 265
266static bool kpit_is_periodic(struct kvm_timer *ktimer)
267{
268 struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
269 pit_timer);
270 return ps->is_periodic;
271}
272
273static struct kvm_timer_ops kpit_ops = {
274 .is_periodic = kpit_is_periodic,
275};
276
272static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) 277static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
273{ 278{
274 struct kvm_kpit_timer *pt = &ps->pit_timer; 279 struct kvm_timer *pt = &ps->pit_timer;
275 s64 interval; 280 s64 interval;
276 281
277 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 282 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -280,8 +285,14 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
280 285
281 /* TODO The new value only affected after the retriggered */ 286 /* TODO The new value only affected after the retriggered */
282 hrtimer_cancel(&pt->timer); 287 hrtimer_cancel(&pt->timer);
283 pt->period = (is_period == 0) ? 0 : interval; 288 pt->period = interval;
284 pt->timer.function = pit_timer_fn; 289 ps->is_periodic = is_period;
290
291 pt->timer.function = kvm_timer_fn;
292 pt->t_ops = &kpit_ops;
293 pt->kvm = ps->pit->kvm;
294 pt->vcpu_id = 0;
295
285 atomic_set(&pt->pending, 0); 296 atomic_set(&pt->pending, 0);
286 ps->irq_ack = 1; 297 ps->irq_ack = 1;
287 298
@@ -298,23 +309,23 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
298 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); 309 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
299 310
300 /* 311 /*
301 * Though spec said the state of 8254 is undefined after power-up, 312 * The largest possible initial count is 0; this is equivalent
302 * seems some tricky OS like Windows XP depends on IRQ0 interrupt 313 * to 216 for binary counting and 104 for BCD counting.
303 * when booting up.
304 * So here setting initialize rate for it, and not a specific number
305 */ 314 */
306 if (val == 0) 315 if (val == 0)
307 val = 0x10000; 316 val = 0x10000;
308 317
309 ps->channels[channel].count_load_time = ktime_get();
310 ps->channels[channel].count = val; 318 ps->channels[channel].count = val;
311 319
312 if (channel != 0) 320 if (channel != 0) {
321 ps->channels[channel].count_load_time = ktime_get();
313 return; 322 return;
323 }
314 324
315 /* Two types of timer 325 /* Two types of timer
316 * mode 1 is one shot, mode 2 is period, otherwise del timer */ 326 * mode 1 is one shot, mode 2 is period, otherwise del timer */
317 switch (ps->channels[0].mode) { 327 switch (ps->channels[0].mode) {
328 case 0:
318 case 1: 329 case 1:
319 /* FIXME: enhance mode 4 precision */ 330 /* FIXME: enhance mode 4 precision */
320 case 4: 331 case 4:
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 6acbe4b505d5..bbd863ff60b7 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -3,15 +3,6 @@
3 3
4#include "iodev.h" 4#include "iodev.h"
5 5
6struct kvm_kpit_timer {
7 struct hrtimer timer;
8 int irq;
9 s64 period; /* unit: ns */
10 s64 scheduled;
11 atomic_t pending;
12 bool reinject;
13};
14
15struct kvm_kpit_channel_state { 6struct kvm_kpit_channel_state {
16 u32 count; /* can be 65536 */ 7 u32 count; /* can be 65536 */
17 u16 latched_count; 8 u16 latched_count;
@@ -30,7 +21,8 @@ struct kvm_kpit_channel_state {
30 21
31struct kvm_kpit_state { 22struct kvm_kpit_state {
32 struct kvm_kpit_channel_state channels[3]; 23 struct kvm_kpit_channel_state channels[3];
33 struct kvm_kpit_timer pit_timer; 24 struct kvm_timer pit_timer;
25 bool is_periodic;
34 u32 speaker_data_on; 26 u32 speaker_data_on;
35 struct mutex lock; 27 struct mutex lock;
36 struct kvm_pit *pit; 28 struct kvm_pit *pit;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index cf17ed52f6fb..96dfbb6ad2a9 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -24,6 +24,7 @@
24 24
25#include "irq.h" 25#include "irq.h"
26#include "i8254.h" 26#include "i8254.h"
27#include "x86.h"
27 28
28/* 29/*
29 * check if there are pending timer events 30 * check if there are pending timer events
@@ -48,6 +49,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
48{ 49{
49 struct kvm_pic *s; 50 struct kvm_pic *s;
50 51
52 if (!irqchip_in_kernel(v->kvm))
53 return v->arch.interrupt.pending;
54
51 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ 55 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
52 if (kvm_apic_accept_pic_intr(v)) { 56 if (kvm_apic_accept_pic_intr(v)) {
53 s = pic_irqchip(v->kvm); /* PIC */ 57 s = pic_irqchip(v->kvm); /* PIC */
@@ -67,6 +71,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
67 struct kvm_pic *s; 71 struct kvm_pic *s;
68 int vector; 72 int vector;
69 73
74 if (!irqchip_in_kernel(v->kvm))
75 return v->arch.interrupt.nr;
76
70 vector = kvm_get_apic_interrupt(v); /* APIC */ 77 vector = kvm_get_apic_interrupt(v); /* APIC */
71 if (vector == -1) { 78 if (vector == -1) {
72 if (kvm_apic_accept_pic_intr(v)) { 79 if (kvm_apic_accept_pic_intr(v)) {
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
new file mode 100644
index 000000000000..26bd6ba74e1c
--- /dev/null
+++ b/arch/x86/kvm/kvm_timer.h
@@ -0,0 +1,18 @@
1
2struct kvm_timer {
3 struct hrtimer timer;
4 s64 period; /* unit: ns */
5 atomic_t pending; /* accumulated triggered timers */
6 bool reinject;
7 struct kvm_timer_ops *t_ops;
8 struct kvm *kvm;
9 int vcpu_id;
10};
11
12struct kvm_timer_ops {
13 bool (*is_periodic)(struct kvm_timer *);
14};
15
16
17enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
18
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f0b67f2cdd69..ae99d83f81a3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -196,20 +196,15 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
196} 196}
197EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); 197EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
198 198
199int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) 199static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
200 int vector, int level, int trig_mode);
201
202int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
200{ 203{
201 struct kvm_lapic *apic = vcpu->arch.apic; 204 struct kvm_lapic *apic = vcpu->arch.apic;
202 205
203 if (!apic_test_and_set_irr(vec, apic)) { 206 return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
204 /* a new pending irq is set in IRR */ 207 irq->level, irq->trig_mode);
205 if (trig)
206 apic_set_vector(vec, apic->regs + APIC_TMR);
207 else
208 apic_clear_vector(vec, apic->regs + APIC_TMR);
209 kvm_vcpu_kick(apic->vcpu);
210 return 1;
211 }
212 return 0;
213} 208}
214 209
215static inline int apic_find_highest_isr(struct kvm_lapic *apic) 210static inline int apic_find_highest_isr(struct kvm_lapic *apic)
@@ -250,7 +245,7 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
250 245
251int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) 246int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
252{ 247{
253 return kvm_apic_id(apic) == dest; 248 return dest == 0xff || kvm_apic_id(apic) == dest;
254} 249}
255 250
256int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) 251int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
@@ -279,37 +274,34 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
279 return result; 274 return result;
280} 275}
281 276
282static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 277int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
283 int short_hand, int dest, int dest_mode) 278 int short_hand, int dest, int dest_mode)
284{ 279{
285 int result = 0; 280 int result = 0;
286 struct kvm_lapic *target = vcpu->arch.apic; 281 struct kvm_lapic *target = vcpu->arch.apic;
287 282
288 apic_debug("target %p, source %p, dest 0x%x, " 283 apic_debug("target %p, source %p, dest 0x%x, "
289 "dest_mode 0x%x, short_hand 0x%x", 284 "dest_mode 0x%x, short_hand 0x%x\n",
290 target, source, dest, dest_mode, short_hand); 285 target, source, dest, dest_mode, short_hand);
291 286
292 ASSERT(!target); 287 ASSERT(!target);
293 switch (short_hand) { 288 switch (short_hand) {
294 case APIC_DEST_NOSHORT: 289 case APIC_DEST_NOSHORT:
295 if (dest_mode == 0) { 290 if (dest_mode == 0)
296 /* Physical mode. */ 291 /* Physical mode. */
297 if ((dest == 0xFF) || (dest == kvm_apic_id(target))) 292 result = kvm_apic_match_physical_addr(target, dest);
298 result = 1; 293 else
299 } else
300 /* Logical mode. */ 294 /* Logical mode. */
301 result = kvm_apic_match_logical_addr(target, dest); 295 result = kvm_apic_match_logical_addr(target, dest);
302 break; 296 break;
303 case APIC_DEST_SELF: 297 case APIC_DEST_SELF:
304 if (target == source) 298 result = (target == source);
305 result = 1;
306 break; 299 break;
307 case APIC_DEST_ALLINC: 300 case APIC_DEST_ALLINC:
308 result = 1; 301 result = 1;
309 break; 302 break;
310 case APIC_DEST_ALLBUT: 303 case APIC_DEST_ALLBUT:
311 if (target != source) 304 result = (target != source);
312 result = 1;
313 break; 305 break;
314 default: 306 default:
315 printk(KERN_WARNING "Bad dest shorthand value %x\n", 307 printk(KERN_WARNING "Bad dest shorthand value %x\n",
@@ -327,20 +319,22 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
327static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 319static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
328 int vector, int level, int trig_mode) 320 int vector, int level, int trig_mode)
329{ 321{
330 int orig_irr, result = 0; 322 int result = 0;
331 struct kvm_vcpu *vcpu = apic->vcpu; 323 struct kvm_vcpu *vcpu = apic->vcpu;
332 324
333 switch (delivery_mode) { 325 switch (delivery_mode) {
334 case APIC_DM_FIXED:
335 case APIC_DM_LOWEST: 326 case APIC_DM_LOWEST:
327 vcpu->arch.apic_arb_prio++;
328 case APIC_DM_FIXED:
336 /* FIXME add logic for vcpu on reset */ 329 /* FIXME add logic for vcpu on reset */
337 if (unlikely(!apic_enabled(apic))) 330 if (unlikely(!apic_enabled(apic)))
338 break; 331 break;
339 332
340 orig_irr = apic_test_and_set_irr(vector, apic); 333 result = !apic_test_and_set_irr(vector, apic);
341 if (orig_irr && trig_mode) { 334 if (!result) {
342 apic_debug("level trig mode repeatedly for vector %d", 335 if (trig_mode)
343 vector); 336 apic_debug("level trig mode repeatedly for "
337 "vector %d", vector);
344 break; 338 break;
345 } 339 }
346 340
@@ -349,10 +343,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
349 apic_set_vector(vector, apic->regs + APIC_TMR); 343 apic_set_vector(vector, apic->regs + APIC_TMR);
350 } else 344 } else
351 apic_clear_vector(vector, apic->regs + APIC_TMR); 345 apic_clear_vector(vector, apic->regs + APIC_TMR);
352
353 kvm_vcpu_kick(vcpu); 346 kvm_vcpu_kick(vcpu);
354
355 result = (orig_irr == 0);
356 break; 347 break;
357 348
358 case APIC_DM_REMRD: 349 case APIC_DM_REMRD:
@@ -364,12 +355,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
364 break; 355 break;
365 356
366 case APIC_DM_NMI: 357 case APIC_DM_NMI:
358 result = 1;
367 kvm_inject_nmi(vcpu); 359 kvm_inject_nmi(vcpu);
368 kvm_vcpu_kick(vcpu); 360 kvm_vcpu_kick(vcpu);
369 break; 361 break;
370 362
371 case APIC_DM_INIT: 363 case APIC_DM_INIT:
372 if (level) { 364 if (level) {
365 result = 1;
373 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 366 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
374 printk(KERN_DEBUG 367 printk(KERN_DEBUG
375 "INIT on a runnable vcpu %d\n", 368 "INIT on a runnable vcpu %d\n",
@@ -386,6 +379,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
386 apic_debug("SIPI to vcpu %d vector 0x%02x\n", 379 apic_debug("SIPI to vcpu %d vector 0x%02x\n",
387 vcpu->vcpu_id, vector); 380 vcpu->vcpu_id, vector);
388 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 381 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
382 result = 1;
389 vcpu->arch.sipi_vector = vector; 383 vcpu->arch.sipi_vector = vector;
390 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 384 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
391 kvm_vcpu_kick(vcpu); 385 kvm_vcpu_kick(vcpu);
@@ -408,43 +402,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
408 return result; 402 return result;
409} 403}
410 404
411static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, 405int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
412 unsigned long bitmap)
413{
414 int last;
415 int next;
416 struct kvm_lapic *apic = NULL;
417
418 last = kvm->arch.round_robin_prev_vcpu;
419 next = last;
420
421 do {
422 if (++next == KVM_MAX_VCPUS)
423 next = 0;
424 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
425 continue;
426 apic = kvm->vcpus[next]->arch.apic;
427 if (apic && apic_enabled(apic))
428 break;
429 apic = NULL;
430 } while (next != last);
431 kvm->arch.round_robin_prev_vcpu = next;
432
433 if (!apic)
434 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
435
436 return apic;
437}
438
439struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
440 unsigned long bitmap)
441{ 406{
442 struct kvm_lapic *apic; 407 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
443
444 apic = kvm_apic_round_robin(kvm, vector, bitmap);
445 if (apic)
446 return apic->vcpu;
447 return NULL;
448} 408}
449 409
450static void apic_set_eoi(struct kvm_lapic *apic) 410static void apic_set_eoi(struct kvm_lapic *apic)
@@ -472,47 +432,24 @@ static void apic_send_ipi(struct kvm_lapic *apic)
472{ 432{
473 u32 icr_low = apic_get_reg(apic, APIC_ICR); 433 u32 icr_low = apic_get_reg(apic, APIC_ICR);
474 u32 icr_high = apic_get_reg(apic, APIC_ICR2); 434 u32 icr_high = apic_get_reg(apic, APIC_ICR2);
435 struct kvm_lapic_irq irq;
475 436
476 unsigned int dest = GET_APIC_DEST_FIELD(icr_high); 437 irq.vector = icr_low & APIC_VECTOR_MASK;
477 unsigned int short_hand = icr_low & APIC_SHORT_MASK; 438 irq.delivery_mode = icr_low & APIC_MODE_MASK;
478 unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; 439 irq.dest_mode = icr_low & APIC_DEST_MASK;
479 unsigned int level = icr_low & APIC_INT_ASSERT; 440 irq.level = icr_low & APIC_INT_ASSERT;
480 unsigned int dest_mode = icr_low & APIC_DEST_MASK; 441 irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
481 unsigned int delivery_mode = icr_low & APIC_MODE_MASK; 442 irq.shorthand = icr_low & APIC_SHORT_MASK;
482 unsigned int vector = icr_low & APIC_VECTOR_MASK; 443 irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
483
484 struct kvm_vcpu *target;
485 struct kvm_vcpu *vcpu;
486 unsigned long lpr_map = 0;
487 int i;
488 444
489 apic_debug("icr_high 0x%x, icr_low 0x%x, " 445 apic_debug("icr_high 0x%x, icr_low 0x%x, "
490 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " 446 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
491 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", 447 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
492 icr_high, icr_low, short_hand, dest, 448 icr_high, icr_low, irq.shorthand, irq.dest_id,
493 trig_mode, level, dest_mode, delivery_mode, vector); 449 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
494 450 irq.vector);
495 for (i = 0; i < KVM_MAX_VCPUS; i++) {
496 vcpu = apic->vcpu->kvm->vcpus[i];
497 if (!vcpu)
498 continue;
499
500 if (vcpu->arch.apic &&
501 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
502 if (delivery_mode == APIC_DM_LOWEST)
503 set_bit(vcpu->vcpu_id, &lpr_map);
504 else
505 __apic_accept_irq(vcpu->arch.apic, delivery_mode,
506 vector, level, trig_mode);
507 }
508 }
509 451
510 if (delivery_mode == APIC_DM_LOWEST) { 452 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
511 target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
512 if (target != NULL)
513 __apic_accept_irq(target->arch.apic, delivery_mode,
514 vector, level, trig_mode);
515 }
516} 453}
517 454
518static u32 apic_get_tmcct(struct kvm_lapic *apic) 455static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -527,12 +464,13 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
527 if (apic_get_reg(apic, APIC_TMICT) == 0) 464 if (apic_get_reg(apic, APIC_TMICT) == 0)
528 return 0; 465 return 0;
529 466
530 remaining = hrtimer_expires_remaining(&apic->timer.dev); 467 remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer);
531 if (ktime_to_ns(remaining) < 0) 468 if (ktime_to_ns(remaining) < 0)
532 remaining = ktime_set(0, 0); 469 remaining = ktime_set(0, 0);
533 470
534 ns = mod_64(ktime_to_ns(remaining), apic->timer.period); 471 ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
535 tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); 472 tmcct = div64_u64(ns,
473 (APIC_BUS_CYCLE_NS * apic->divide_count));
536 474
537 return tmcct; 475 return tmcct;
538} 476}
@@ -619,25 +557,25 @@ static void update_divide_count(struct kvm_lapic *apic)
619 tdcr = apic_get_reg(apic, APIC_TDCR); 557 tdcr = apic_get_reg(apic, APIC_TDCR);
620 tmp1 = tdcr & 0xf; 558 tmp1 = tdcr & 0xf;
621 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; 559 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
622 apic->timer.divide_count = 0x1 << (tmp2 & 0x7); 560 apic->divide_count = 0x1 << (tmp2 & 0x7);
623 561
624 apic_debug("timer divide count is 0x%x\n", 562 apic_debug("timer divide count is 0x%x\n",
625 apic->timer.divide_count); 563 apic->divide_count);
626} 564}
627 565
628static void start_apic_timer(struct kvm_lapic *apic) 566static void start_apic_timer(struct kvm_lapic *apic)
629{ 567{
630 ktime_t now = apic->timer.dev.base->get_time(); 568 ktime_t now = apic->lapic_timer.timer.base->get_time();
631 569
632 apic->timer.period = apic_get_reg(apic, APIC_TMICT) * 570 apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) *
633 APIC_BUS_CYCLE_NS * apic->timer.divide_count; 571 APIC_BUS_CYCLE_NS * apic->divide_count;
634 atomic_set(&apic->timer.pending, 0); 572 atomic_set(&apic->lapic_timer.pending, 0);
635 573
636 if (!apic->timer.period) 574 if (!apic->lapic_timer.period)
637 return; 575 return;
638 576
639 hrtimer_start(&apic->timer.dev, 577 hrtimer_start(&apic->lapic_timer.timer,
640 ktime_add_ns(now, apic->timer.period), 578 ktime_add_ns(now, apic->lapic_timer.period),
641 HRTIMER_MODE_ABS); 579 HRTIMER_MODE_ABS);
642 580
643 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" 581 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
@@ -646,9 +584,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
646 "expire @ 0x%016" PRIx64 ".\n", __func__, 584 "expire @ 0x%016" PRIx64 ".\n", __func__,
647 APIC_BUS_CYCLE_NS, ktime_to_ns(now), 585 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
648 apic_get_reg(apic, APIC_TMICT), 586 apic_get_reg(apic, APIC_TMICT),
649 apic->timer.period, 587 apic->lapic_timer.period,
650 ktime_to_ns(ktime_add_ns(now, 588 ktime_to_ns(ktime_add_ns(now,
651 apic->timer.period))); 589 apic->lapic_timer.period)));
652} 590}
653 591
654static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) 592static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -730,7 +668,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
730 apic_set_reg(apic, APIC_LVTT + 0x10 * i, 668 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
731 lvt_val | APIC_LVT_MASKED); 669 lvt_val | APIC_LVT_MASKED);
732 } 670 }
733 atomic_set(&apic->timer.pending, 0); 671 atomic_set(&apic->lapic_timer.pending, 0);
734 672
735 } 673 }
736 break; 674 break;
@@ -762,7 +700,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
762 break; 700 break;
763 701
764 case APIC_TMICT: 702 case APIC_TMICT:
765 hrtimer_cancel(&apic->timer.dev); 703 hrtimer_cancel(&apic->lapic_timer.timer);
766 apic_set_reg(apic, APIC_TMICT, val); 704 apic_set_reg(apic, APIC_TMICT, val);
767 start_apic_timer(apic); 705 start_apic_timer(apic);
768 return; 706 return;
@@ -802,7 +740,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
802 if (!vcpu->arch.apic) 740 if (!vcpu->arch.apic)
803 return; 741 return;
804 742
805 hrtimer_cancel(&vcpu->arch.apic->timer.dev); 743 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
806 744
807 if (vcpu->arch.apic->regs_page) 745 if (vcpu->arch.apic->regs_page)
808 __free_page(vcpu->arch.apic->regs_page); 746 __free_page(vcpu->arch.apic->regs_page);
@@ -880,7 +818,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
880 ASSERT(apic != NULL); 818 ASSERT(apic != NULL);
881 819
882 /* Stop the timer in case it's a reset to an active apic */ 820 /* Stop the timer in case it's a reset to an active apic */
883 hrtimer_cancel(&apic->timer.dev); 821 hrtimer_cancel(&apic->lapic_timer.timer);
884 822
885 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); 823 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
886 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 824 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
@@ -905,11 +843,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
905 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 843 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
906 } 844 }
907 update_divide_count(apic); 845 update_divide_count(apic);
908 atomic_set(&apic->timer.pending, 0); 846 atomic_set(&apic->lapic_timer.pending, 0);
909 if (vcpu->vcpu_id == 0) 847 if (vcpu->vcpu_id == 0)
910 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 848 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
911 apic_update_ppr(apic); 849 apic_update_ppr(apic);
912 850
851 vcpu->arch.apic_arb_prio = 0;
852
913 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" 853 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
914 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, 854 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
915 vcpu, kvm_apic_id(apic), 855 vcpu, kvm_apic_id(apic),
@@ -917,16 +857,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
917} 857}
918EXPORT_SYMBOL_GPL(kvm_lapic_reset); 858EXPORT_SYMBOL_GPL(kvm_lapic_reset);
919 859
920int kvm_lapic_enabled(struct kvm_vcpu *vcpu) 860bool kvm_apic_present(struct kvm_vcpu *vcpu)
921{ 861{
922 struct kvm_lapic *apic = vcpu->arch.apic; 862 return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
923 int ret = 0; 863}
924
925 if (!apic)
926 return 0;
927 ret = apic_enabled(apic);
928 864
929 return ret; 865int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
866{
867 return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
930} 868}
931EXPORT_SYMBOL_GPL(kvm_lapic_enabled); 869EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
932 870
@@ -936,22 +874,11 @@ EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
936 *---------------------------------------------------------------------- 874 *----------------------------------------------------------------------
937 */ 875 */
938 876
939/* TODO: make sure __apic_timer_fn runs in current pCPU */ 877static bool lapic_is_periodic(struct kvm_timer *ktimer)
940static int __apic_timer_fn(struct kvm_lapic *apic)
941{ 878{
942 int result = 0; 879 struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
943 wait_queue_head_t *q = &apic->vcpu->wq; 880 lapic_timer);
944 881 return apic_lvtt_period(apic);
945 if(!atomic_inc_and_test(&apic->timer.pending))
946 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
947 if (waitqueue_active(q))
948 wake_up_interruptible(q);
949
950 if (apic_lvtt_period(apic)) {
951 result = 1;
952 hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
953 }
954 return result;
955} 882}
956 883
957int apic_has_pending_timer(struct kvm_vcpu *vcpu) 884int apic_has_pending_timer(struct kvm_vcpu *vcpu)
@@ -959,7 +886,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
959 struct kvm_lapic *lapic = vcpu->arch.apic; 886 struct kvm_lapic *lapic = vcpu->arch.apic;
960 887
961 if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) 888 if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
962 return atomic_read(&lapic->timer.pending); 889 return atomic_read(&lapic->lapic_timer.pending);
963 890
964 return 0; 891 return 0;
965} 892}
@@ -986,20 +913,9 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
986 kvm_apic_local_deliver(apic, APIC_LVT0); 913 kvm_apic_local_deliver(apic, APIC_LVT0);
987} 914}
988 915
989static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) 916static struct kvm_timer_ops lapic_timer_ops = {
990{ 917 .is_periodic = lapic_is_periodic,
991 struct kvm_lapic *apic; 918};
992 int restart_timer = 0;
993
994 apic = container_of(data, struct kvm_lapic, timer.dev);
995
996 restart_timer = __apic_timer_fn(apic);
997
998 if (restart_timer)
999 return HRTIMER_RESTART;
1000 else
1001 return HRTIMER_NORESTART;
1002}
1003 919
1004int kvm_create_lapic(struct kvm_vcpu *vcpu) 920int kvm_create_lapic(struct kvm_vcpu *vcpu)
1005{ 921{
@@ -1024,8 +940,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1024 memset(apic->regs, 0, PAGE_SIZE); 940 memset(apic->regs, 0, PAGE_SIZE);
1025 apic->vcpu = vcpu; 941 apic->vcpu = vcpu;
1026 942
1027 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 943 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
1028 apic->timer.dev.function = apic_timer_fn; 944 HRTIMER_MODE_ABS);
945 apic->lapic_timer.timer.function = kvm_timer_fn;
946 apic->lapic_timer.t_ops = &lapic_timer_ops;
947 apic->lapic_timer.kvm = vcpu->kvm;
948 apic->lapic_timer.vcpu_id = vcpu->vcpu_id;
949
1029 apic->base_address = APIC_DEFAULT_PHYS_BASE; 950 apic->base_address = APIC_DEFAULT_PHYS_BASE;
1030 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; 951 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
1031 952
@@ -1078,9 +999,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1078{ 999{
1079 struct kvm_lapic *apic = vcpu->arch.apic; 1000 struct kvm_lapic *apic = vcpu->arch.apic;
1080 1001
1081 if (apic && atomic_read(&apic->timer.pending) > 0) { 1002 if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
1082 if (kvm_apic_local_deliver(apic, APIC_LVTT)) 1003 if (kvm_apic_local_deliver(apic, APIC_LVTT))
1083 atomic_dec(&apic->timer.pending); 1004 atomic_dec(&apic->lapic_timer.pending);
1084 } 1005 }
1085} 1006}
1086 1007
@@ -1106,7 +1027,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1106 MSR_IA32_APICBASE_BASE; 1027 MSR_IA32_APICBASE_BASE;
1107 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 1028 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1108 apic_update_ppr(apic); 1029 apic_update_ppr(apic);
1109 hrtimer_cancel(&apic->timer.dev); 1030 hrtimer_cancel(&apic->lapic_timer.timer);
1110 update_divide_count(apic); 1031 update_divide_count(apic);
1111 start_apic_timer(apic); 1032 start_apic_timer(apic);
1112} 1033}
@@ -1119,7 +1040,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1119 if (!apic) 1040 if (!apic)
1120 return; 1041 return;
1121 1042
1122 timer = &apic->timer.dev; 1043 timer = &apic->lapic_timer.timer;
1123 if (hrtimer_cancel(timer)) 1044 if (hrtimer_cancel(timer))
1124 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 1045 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
1125} 1046}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 45ab6ee71209..a587f8349c46 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -2,18 +2,15 @@
2#define __KVM_X86_LAPIC_H 2#define __KVM_X86_LAPIC_H
3 3
4#include "iodev.h" 4#include "iodev.h"
5#include "kvm_timer.h"
5 6
6#include <linux/kvm_host.h> 7#include <linux/kvm_host.h>
7 8
8struct kvm_lapic { 9struct kvm_lapic {
9 unsigned long base_address; 10 unsigned long base_address;
10 struct kvm_io_device dev; 11 struct kvm_io_device dev;
11 struct { 12 struct kvm_timer lapic_timer;
12 atomic_t pending; 13 u32 divide_count;
13 s64 period; /* unit: ns */
14 u32 divide_count;
15 struct hrtimer dev;
16 } timer;
17 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
18 struct page *regs_page; 15 struct page *regs_page;
19 void *regs; 16 void *regs;
@@ -34,12 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
34 31
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 32int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 33int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
37int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); 34int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
38 35
39u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); 36u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
40void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); 37void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
41void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); 38void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
42int kvm_lapic_enabled(struct kvm_vcpu *vcpu); 39int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
40bool kvm_apic_present(struct kvm_vcpu *vcpu);
43int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); 41int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
44 42
45void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); 43void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 32cf11e5728a..5c3d6e81a7dc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -126,6 +126,7 @@ module_param(oos_shadow, bool, 0644);
126#define PFERR_PRESENT_MASK (1U << 0) 126#define PFERR_PRESENT_MASK (1U << 0)
127#define PFERR_WRITE_MASK (1U << 1) 127#define PFERR_WRITE_MASK (1U << 1)
128#define PFERR_USER_MASK (1U << 2) 128#define PFERR_USER_MASK (1U << 2)
129#define PFERR_RSVD_MASK (1U << 3)
129#define PFERR_FETCH_MASK (1U << 4) 130#define PFERR_FETCH_MASK (1U << 4)
130 131
131#define PT_DIRECTORY_LEVEL 2 132#define PT_DIRECTORY_LEVEL 2
@@ -177,7 +178,11 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
177static u64 __read_mostly shadow_user_mask; 178static u64 __read_mostly shadow_user_mask;
178static u64 __read_mostly shadow_accessed_mask; 179static u64 __read_mostly shadow_accessed_mask;
179static u64 __read_mostly shadow_dirty_mask; 180static u64 __read_mostly shadow_dirty_mask;
180static u64 __read_mostly shadow_mt_mask; 181
182static inline u64 rsvd_bits(int s, int e)
183{
184 return ((1ULL << (e - s + 1)) - 1) << s;
185}
181 186
182void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) 187void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
183{ 188{
@@ -193,14 +198,13 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
193EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); 198EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
194 199
195void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 200void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
196 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask) 201 u64 dirty_mask, u64 nx_mask, u64 x_mask)
197{ 202{
198 shadow_user_mask = user_mask; 203 shadow_user_mask = user_mask;
199 shadow_accessed_mask = accessed_mask; 204 shadow_accessed_mask = accessed_mask;
200 shadow_dirty_mask = dirty_mask; 205 shadow_dirty_mask = dirty_mask;
201 shadow_nx_mask = nx_mask; 206 shadow_nx_mask = nx_mask;
202 shadow_x_mask = x_mask; 207 shadow_x_mask = x_mask;
203 shadow_mt_mask = mt_mask;
204} 208}
205EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 209EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
206 210
@@ -219,11 +223,6 @@ static int is_nx(struct kvm_vcpu *vcpu)
219 return vcpu->arch.shadow_efer & EFER_NX; 223 return vcpu->arch.shadow_efer & EFER_NX;
220} 224}
221 225
222static int is_present_pte(unsigned long pte)
223{
224 return pte & PT_PRESENT_MASK;
225}
226
227static int is_shadow_present_pte(u64 pte) 226static int is_shadow_present_pte(u64 pte)
228{ 227{
229 return pte != shadow_trap_nonpresent_pte 228 return pte != shadow_trap_nonpresent_pte
@@ -1074,18 +1073,10 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1074 return NULL; 1073 return NULL;
1075} 1074}
1076 1075
1077static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
1078{
1079 list_del(&sp->oos_link);
1080 --kvm->stat.mmu_unsync_global;
1081}
1082
1083static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1076static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1084{ 1077{
1085 WARN_ON(!sp->unsync); 1078 WARN_ON(!sp->unsync);
1086 sp->unsync = 0; 1079 sp->unsync = 0;
1087 if (sp->global)
1088 kvm_unlink_unsync_global(kvm, sp);
1089 --kvm->stat.mmu_unsync; 1080 --kvm->stat.mmu_unsync;
1090} 1081}
1091 1082
@@ -1248,7 +1239,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1248 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); 1239 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
1249 sp->gfn = gfn; 1240 sp->gfn = gfn;
1250 sp->role = role; 1241 sp->role = role;
1251 sp->global = 0;
1252 hlist_add_head(&sp->hash_link, bucket); 1242 hlist_add_head(&sp->hash_link, bucket);
1253 if (!direct) { 1243 if (!direct) {
1254 if (rmap_write_protect(vcpu->kvm, gfn)) 1244 if (rmap_write_protect(vcpu->kvm, gfn))
@@ -1616,7 +1606,7 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1616 return mtrr_state->def_type; 1606 return mtrr_state->def_type;
1617} 1607}
1618 1608
1619static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) 1609u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1620{ 1610{
1621 u8 mtrr; 1611 u8 mtrr;
1622 1612
@@ -1626,6 +1616,7 @@ static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1626 mtrr = MTRR_TYPE_WRBACK; 1616 mtrr = MTRR_TYPE_WRBACK;
1627 return mtrr; 1617 return mtrr;
1628} 1618}
1619EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1629 1620
1630static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1621static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1631{ 1622{
@@ -1646,11 +1637,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1646 ++vcpu->kvm->stat.mmu_unsync; 1637 ++vcpu->kvm->stat.mmu_unsync;
1647 sp->unsync = 1; 1638 sp->unsync = 1;
1648 1639
1649 if (sp->global) { 1640 kvm_mmu_mark_parents_unsync(vcpu, sp);
1650 list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
1651 ++vcpu->kvm->stat.mmu_unsync_global;
1652 } else
1653 kvm_mmu_mark_parents_unsync(vcpu, sp);
1654 1641
1655 mmu_convert_notrap(sp); 1642 mmu_convert_notrap(sp);
1656 return 0; 1643 return 0;
@@ -1677,21 +1664,11 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1677static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1664static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1678 unsigned pte_access, int user_fault, 1665 unsigned pte_access, int user_fault,
1679 int write_fault, int dirty, int largepage, 1666 int write_fault, int dirty, int largepage,
1680 int global, gfn_t gfn, pfn_t pfn, bool speculative, 1667 gfn_t gfn, pfn_t pfn, bool speculative,
1681 bool can_unsync) 1668 bool can_unsync)
1682{ 1669{
1683 u64 spte; 1670 u64 spte;
1684 int ret = 0; 1671 int ret = 0;
1685 u64 mt_mask = shadow_mt_mask;
1686 struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
1687
1688 if (!global && sp->global) {
1689 sp->global = 0;
1690 if (sp->unsync) {
1691 kvm_unlink_unsync_global(vcpu->kvm, sp);
1692 kvm_mmu_mark_parents_unsync(vcpu, sp);
1693 }
1694 }
1695 1672
1696 /* 1673 /*
1697 * We don't set the accessed bit, since we sometimes want to see 1674 * We don't set the accessed bit, since we sometimes want to see
@@ -1711,16 +1688,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1711 spte |= shadow_user_mask; 1688 spte |= shadow_user_mask;
1712 if (largepage) 1689 if (largepage)
1713 spte |= PT_PAGE_SIZE_MASK; 1690 spte |= PT_PAGE_SIZE_MASK;
1714 if (mt_mask) { 1691 if (tdp_enabled)
1715 if (!kvm_is_mmio_pfn(pfn)) { 1692 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1716 mt_mask = get_memory_type(vcpu, gfn) << 1693 kvm_is_mmio_pfn(pfn));
1717 kvm_x86_ops->get_mt_mask_shift();
1718 mt_mask |= VMX_EPT_IGMT_BIT;
1719 } else
1720 mt_mask = MTRR_TYPE_UNCACHABLE <<
1721 kvm_x86_ops->get_mt_mask_shift();
1722 spte |= mt_mask;
1723 }
1724 1694
1725 spte |= (u64)pfn << PAGE_SHIFT; 1695 spte |= (u64)pfn << PAGE_SHIFT;
1726 1696
@@ -1765,8 +1735,8 @@ set_pte:
1765static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1735static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1766 unsigned pt_access, unsigned pte_access, 1736 unsigned pt_access, unsigned pte_access,
1767 int user_fault, int write_fault, int dirty, 1737 int user_fault, int write_fault, int dirty,
1768 int *ptwrite, int largepage, int global, 1738 int *ptwrite, int largepage, gfn_t gfn,
1769 gfn_t gfn, pfn_t pfn, bool speculative) 1739 pfn_t pfn, bool speculative)
1770{ 1740{
1771 int was_rmapped = 0; 1741 int was_rmapped = 0;
1772 int was_writeble = is_writeble_pte(*shadow_pte); 1742 int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1795,7 +1765,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1795 was_rmapped = 1; 1765 was_rmapped = 1;
1796 } 1766 }
1797 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, 1767 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1798 dirty, largepage, global, gfn, pfn, speculative, true)) { 1768 dirty, largepage, gfn, pfn, speculative, true)) {
1799 if (write_fault) 1769 if (write_fault)
1800 *ptwrite = 1; 1770 *ptwrite = 1;
1801 kvm_x86_ops->tlb_flush(vcpu); 1771 kvm_x86_ops->tlb_flush(vcpu);
@@ -1843,7 +1813,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1843 || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { 1813 || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
1844 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 1814 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
1845 0, write, 1, &pt_write, 1815 0, write, 1, &pt_write,
1846 largepage, 0, gfn, pfn, false); 1816 largepage, gfn, pfn, false);
1847 ++vcpu->stat.pf_fixed; 1817 ++vcpu->stat.pf_fixed;
1848 break; 1818 break;
1849 } 1819 }
@@ -1942,7 +1912,19 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
1942 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 1912 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1943} 1913}
1944 1914
1945static void mmu_alloc_roots(struct kvm_vcpu *vcpu) 1915static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
1916{
1917 int ret = 0;
1918
1919 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
1920 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
1921 ret = 1;
1922 }
1923
1924 return ret;
1925}
1926
1927static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
1946{ 1928{
1947 int i; 1929 int i;
1948 gfn_t root_gfn; 1930 gfn_t root_gfn;
@@ -1957,13 +1939,15 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1957 ASSERT(!VALID_PAGE(root)); 1939 ASSERT(!VALID_PAGE(root));
1958 if (tdp_enabled) 1940 if (tdp_enabled)
1959 direct = 1; 1941 direct = 1;
1942 if (mmu_check_root(vcpu, root_gfn))
1943 return 1;
1960 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 1944 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1961 PT64_ROOT_LEVEL, direct, 1945 PT64_ROOT_LEVEL, direct,
1962 ACC_ALL, NULL); 1946 ACC_ALL, NULL);
1963 root = __pa(sp->spt); 1947 root = __pa(sp->spt);
1964 ++sp->root_count; 1948 ++sp->root_count;
1965 vcpu->arch.mmu.root_hpa = root; 1949 vcpu->arch.mmu.root_hpa = root;
1966 return; 1950 return 0;
1967 } 1951 }
1968 direct = !is_paging(vcpu); 1952 direct = !is_paging(vcpu);
1969 if (tdp_enabled) 1953 if (tdp_enabled)
@@ -1980,6 +1964,8 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1980 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; 1964 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1981 } else if (vcpu->arch.mmu.root_level == 0) 1965 } else if (vcpu->arch.mmu.root_level == 0)
1982 root_gfn = 0; 1966 root_gfn = 0;
1967 if (mmu_check_root(vcpu, root_gfn))
1968 return 1;
1983 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 1969 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1984 PT32_ROOT_LEVEL, direct, 1970 PT32_ROOT_LEVEL, direct,
1985 ACC_ALL, NULL); 1971 ACC_ALL, NULL);
@@ -1988,6 +1974,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1988 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 1974 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1989 } 1975 }
1990 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 1976 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1977 return 0;
1991} 1978}
1992 1979
1993static void mmu_sync_roots(struct kvm_vcpu *vcpu) 1980static void mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2006,7 +1993,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2006 for (i = 0; i < 4; ++i) { 1993 for (i = 0; i < 4; ++i) {
2007 hpa_t root = vcpu->arch.mmu.pae_root[i]; 1994 hpa_t root = vcpu->arch.mmu.pae_root[i];
2008 1995
2009 if (root) { 1996 if (root && VALID_PAGE(root)) {
2010 root &= PT64_BASE_ADDR_MASK; 1997 root &= PT64_BASE_ADDR_MASK;
2011 sp = page_header(root); 1998 sp = page_header(root);
2012 mmu_sync_children(vcpu, sp); 1999 mmu_sync_children(vcpu, sp);
@@ -2014,15 +2001,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2014 } 2001 }
2015} 2002}
2016 2003
2017static void mmu_sync_global(struct kvm_vcpu *vcpu)
2018{
2019 struct kvm *kvm = vcpu->kvm;
2020 struct kvm_mmu_page *sp, *n;
2021
2022 list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
2023 kvm_sync_page(vcpu, sp);
2024}
2025
2026void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2004void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2027{ 2005{
2028 spin_lock(&vcpu->kvm->mmu_lock); 2006 spin_lock(&vcpu->kvm->mmu_lock);
@@ -2030,13 +2008,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2030 spin_unlock(&vcpu->kvm->mmu_lock); 2008 spin_unlock(&vcpu->kvm->mmu_lock);
2031} 2009}
2032 2010
2033void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
2034{
2035 spin_lock(&vcpu->kvm->mmu_lock);
2036 mmu_sync_global(vcpu);
2037 spin_unlock(&vcpu->kvm->mmu_lock);
2038}
2039
2040static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 2011static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
2041{ 2012{
2042 return vaddr; 2013 return vaddr;
@@ -2151,6 +2122,14 @@ static void paging_free(struct kvm_vcpu *vcpu)
2151 nonpaging_free(vcpu); 2122 nonpaging_free(vcpu);
2152} 2123}
2153 2124
2125static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2126{
2127 int bit7;
2128
2129 bit7 = (gpte >> 7) & 1;
2130 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
2131}
2132
2154#define PTTYPE 64 2133#define PTTYPE 64
2155#include "paging_tmpl.h" 2134#include "paging_tmpl.h"
2156#undef PTTYPE 2135#undef PTTYPE
@@ -2159,6 +2138,59 @@ static void paging_free(struct kvm_vcpu *vcpu)
2159#include "paging_tmpl.h" 2138#include "paging_tmpl.h"
2160#undef PTTYPE 2139#undef PTTYPE
2161 2140
2141static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2142{
2143 struct kvm_mmu *context = &vcpu->arch.mmu;
2144 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2145 u64 exb_bit_rsvd = 0;
2146
2147 if (!is_nx(vcpu))
2148 exb_bit_rsvd = rsvd_bits(63, 63);
2149 switch (level) {
2150 case PT32_ROOT_LEVEL:
2151 /* no rsvd bits for 2 level 4K page table entries */
2152 context->rsvd_bits_mask[0][1] = 0;
2153 context->rsvd_bits_mask[0][0] = 0;
2154 if (is_cpuid_PSE36())
2155 /* 36bits PSE 4MB page */
2156 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2157 else
2158 /* 32 bits PSE 4MB page */
2159 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2160 context->rsvd_bits_mask[1][0] = ~0ull;
2161 break;
2162 case PT32E_ROOT_LEVEL:
2163 context->rsvd_bits_mask[0][2] =
2164 rsvd_bits(maxphyaddr, 63) |
2165 rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
2166 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2167 rsvd_bits(maxphyaddr, 62); /* PDE */
2168 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2169 rsvd_bits(maxphyaddr, 62); /* PTE */
2170 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2171 rsvd_bits(maxphyaddr, 62) |
2172 rsvd_bits(13, 20); /* large page */
2173 context->rsvd_bits_mask[1][0] = ~0ull;
2174 break;
2175 case PT64_ROOT_LEVEL:
2176 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
2177 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2178 context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
2179 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2180 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2181 rsvd_bits(maxphyaddr, 51);
2182 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2183 rsvd_bits(maxphyaddr, 51);
2184 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2185 context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
2186 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2187 rsvd_bits(maxphyaddr, 51) |
2188 rsvd_bits(13, 20); /* large page */
2189 context->rsvd_bits_mask[1][0] = ~0ull;
2190 break;
2191 }
2192}
2193
2162static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) 2194static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2163{ 2195{
2164 struct kvm_mmu *context = &vcpu->arch.mmu; 2196 struct kvm_mmu *context = &vcpu->arch.mmu;
@@ -2179,6 +2211,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2179 2211
2180static int paging64_init_context(struct kvm_vcpu *vcpu) 2212static int paging64_init_context(struct kvm_vcpu *vcpu)
2181{ 2213{
2214 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2182 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); 2215 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2183} 2216}
2184 2217
@@ -2186,6 +2219,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2186{ 2219{
2187 struct kvm_mmu *context = &vcpu->arch.mmu; 2220 struct kvm_mmu *context = &vcpu->arch.mmu;
2188 2221
2222 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2189 context->new_cr3 = paging_new_cr3; 2223 context->new_cr3 = paging_new_cr3;
2190 context->page_fault = paging32_page_fault; 2224 context->page_fault = paging32_page_fault;
2191 context->gva_to_gpa = paging32_gva_to_gpa; 2225 context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2201,6 +2235,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2201 2235
2202static int paging32E_init_context(struct kvm_vcpu *vcpu) 2236static int paging32E_init_context(struct kvm_vcpu *vcpu)
2203{ 2237{
2238 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2204 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); 2239 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2205} 2240}
2206 2241
@@ -2221,12 +2256,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2221 context->gva_to_gpa = nonpaging_gva_to_gpa; 2256 context->gva_to_gpa = nonpaging_gva_to_gpa;
2222 context->root_level = 0; 2257 context->root_level = 0;
2223 } else if (is_long_mode(vcpu)) { 2258 } else if (is_long_mode(vcpu)) {
2259 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2224 context->gva_to_gpa = paging64_gva_to_gpa; 2260 context->gva_to_gpa = paging64_gva_to_gpa;
2225 context->root_level = PT64_ROOT_LEVEL; 2261 context->root_level = PT64_ROOT_LEVEL;
2226 } else if (is_pae(vcpu)) { 2262 } else if (is_pae(vcpu)) {
2263 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2227 context->gva_to_gpa = paging64_gva_to_gpa; 2264 context->gva_to_gpa = paging64_gva_to_gpa;
2228 context->root_level = PT32E_ROOT_LEVEL; 2265 context->root_level = PT32E_ROOT_LEVEL;
2229 } else { 2266 } else {
2267 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2230 context->gva_to_gpa = paging32_gva_to_gpa; 2268 context->gva_to_gpa = paging32_gva_to_gpa;
2231 context->root_level = PT32_ROOT_LEVEL; 2269 context->root_level = PT32_ROOT_LEVEL;
2232 } 2270 }
@@ -2290,9 +2328,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2290 goto out; 2328 goto out;
2291 spin_lock(&vcpu->kvm->mmu_lock); 2329 spin_lock(&vcpu->kvm->mmu_lock);
2292 kvm_mmu_free_some_pages(vcpu); 2330 kvm_mmu_free_some_pages(vcpu);
2293 mmu_alloc_roots(vcpu); 2331 r = mmu_alloc_roots(vcpu);
2294 mmu_sync_roots(vcpu); 2332 mmu_sync_roots(vcpu);
2295 spin_unlock(&vcpu->kvm->mmu_lock); 2333 spin_unlock(&vcpu->kvm->mmu_lock);
2334 if (r)
2335 goto out;
2296 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2336 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2297 kvm_mmu_flush_tlb(vcpu); 2337 kvm_mmu_flush_tlb(vcpu);
2298out: 2338out:
@@ -2638,14 +2678,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
2638 2678
2639static void free_mmu_pages(struct kvm_vcpu *vcpu) 2679static void free_mmu_pages(struct kvm_vcpu *vcpu)
2640{ 2680{
2641 struct kvm_mmu_page *sp;
2642
2643 while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2644 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
2645 struct kvm_mmu_page, link);
2646 kvm_mmu_zap_page(vcpu->kvm, sp);
2647 cond_resched();
2648 }
2649 free_page((unsigned long)vcpu->arch.mmu.pae_root); 2681 free_page((unsigned long)vcpu->arch.mmu.pae_root);
2650} 2682}
2651 2683
@@ -2710,7 +2742,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2710{ 2742{
2711 struct kvm_mmu_page *sp; 2743 struct kvm_mmu_page *sp;
2712 2744
2713 spin_lock(&kvm->mmu_lock);
2714 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 2745 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
2715 int i; 2746 int i;
2716 u64 *pt; 2747 u64 *pt;
@@ -2725,7 +2756,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2725 pt[i] &= ~PT_WRITABLE_MASK; 2756 pt[i] &= ~PT_WRITABLE_MASK;
2726 } 2757 }
2727 kvm_flush_remote_tlbs(kvm); 2758 kvm_flush_remote_tlbs(kvm);
2728 spin_unlock(&kvm->mmu_lock);
2729} 2759}
2730 2760
2731void kvm_mmu_zap_all(struct kvm *kvm) 2761void kvm_mmu_zap_all(struct kvm *kvm)
@@ -3007,11 +3037,13 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3007 " in nonleaf level: levels %d gva %lx" 3037 " in nonleaf level: levels %d gva %lx"
3008 " level %d pte %llx\n", audit_msg, 3038 " level %d pte %llx\n", audit_msg,
3009 vcpu->arch.mmu.root_level, va, level, ent); 3039 vcpu->arch.mmu.root_level, va, level, ent);
3010 3040 else
3011 audit_mappings_page(vcpu, ent, va, level - 1); 3041 audit_mappings_page(vcpu, ent, va, level - 1);
3012 } else { 3042 } else {
3013 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); 3043 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
3014 hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT; 3044 gfn_t gfn = gpa >> PAGE_SHIFT;
3045 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3046 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3015 3047
3016 if (is_shadow_present_pte(ent) 3048 if (is_shadow_present_pte(ent)
3017 && (ent & PT64_BASE_ADDR_MASK) != hpa) 3049 && (ent & PT64_BASE_ADDR_MASK) != hpa)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index eaab2145f62b..3494a2fb136e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
75 return vcpu->arch.cr0 & X86_CR0_PG; 75 return vcpu->arch.cr0 & X86_CR0_PG;
76} 76}
77 77
78static inline int is_present_pte(unsigned long pte)
79{
80 return pte & PT_PRESENT_MASK;
81}
82
78#endif 83#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bd70206c561..258e4591e1ca 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
123 gfn_t table_gfn; 123 gfn_t table_gfn;
124 unsigned index, pt_access, pte_access; 124 unsigned index, pt_access, pte_access;
125 gpa_t pte_gpa; 125 gpa_t pte_gpa;
126 int rsvd_fault = 0;
126 127
127 pgprintk("%s: addr %lx\n", __func__, addr); 128 pgprintk("%s: addr %lx\n", __func__, addr);
128walk: 129walk:
@@ -157,6 +158,10 @@ walk:
157 if (!is_present_pte(pte)) 158 if (!is_present_pte(pte))
158 goto not_present; 159 goto not_present;
159 160
161 rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
162 if (rsvd_fault)
163 goto access_error;
164
160 if (write_fault && !is_writeble_pte(pte)) 165 if (write_fault && !is_writeble_pte(pte))
161 if (user_fault || is_write_protection(vcpu)) 166 if (user_fault || is_write_protection(vcpu))
162 goto access_error; 167 goto access_error;
@@ -209,7 +214,6 @@ walk:
209 if (ret) 214 if (ret)
210 goto walk; 215 goto walk;
211 pte |= PT_DIRTY_MASK; 216 pte |= PT_DIRTY_MASK;
212 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
213 walker->ptes[walker->level - 1] = pte; 217 walker->ptes[walker->level - 1] = pte;
214 } 218 }
215 219
@@ -233,6 +237,8 @@ err:
233 walker->error_code |= PFERR_USER_MASK; 237 walker->error_code |= PFERR_USER_MASK;
234 if (fetch_fault) 238 if (fetch_fault)
235 walker->error_code |= PFERR_FETCH_MASK; 239 walker->error_code |= PFERR_FETCH_MASK;
240 if (rsvd_fault)
241 walker->error_code |= PFERR_RSVD_MASK;
236 return 0; 242 return 0;
237} 243}
238 244
@@ -262,8 +268,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
262 kvm_get_pfn(pfn); 268 kvm_get_pfn(pfn);
263 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 269 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
264 gpte & PT_DIRTY_MASK, NULL, largepage, 270 gpte & PT_DIRTY_MASK, NULL, largepage,
265 gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte), 271 gpte_to_gfn(gpte), pfn, true);
266 pfn, true);
267} 272}
268 273
269/* 274/*
@@ -297,7 +302,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
297 user_fault, write_fault, 302 user_fault, write_fault,
298 gw->ptes[gw->level-1] & PT_DIRTY_MASK, 303 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
299 ptwrite, largepage, 304 ptwrite, largepage,
300 gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
301 gw->gfn, pfn, false); 305 gw->gfn, pfn, false);
302 break; 306 break;
303 } 307 }
@@ -380,7 +384,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
380 return r; 384 return r;
381 385
382 /* 386 /*
383 * Look up the shadow pte for the faulting address. 387 * Look up the guest pte for the faulting address.
384 */ 388 */
385 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, 389 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
386 fetch_fault); 390 fetch_fault);
@@ -586,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
586 nr_present++; 590 nr_present++;
587 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 591 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
588 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 592 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
589 is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn, 593 is_dirty_pte(gpte), 0, gfn,
590 spte_to_pfn(sp->spt[i]), true, false); 594 spte_to_pfn(sp->spt[i]), true, false);
591 } 595 }
592 596
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1f8510c51d6e..71510e07e69e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -19,6 +19,7 @@
19#include "irq.h" 19#include "irq.h"
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h" 21#include "kvm_cache_regs.h"
22#include "x86.h"
22 23
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/kernel.h> 25#include <linux/kernel.h>
@@ -69,7 +70,6 @@ module_param(npt, int, S_IRUGO);
69static int nested = 0; 70static int nested = 0;
70module_param(nested, int, S_IRUGO); 71module_param(nested, int, S_IRUGO);
71 72
72static void kvm_reput_irq(struct vcpu_svm *svm);
73static void svm_flush_tlb(struct kvm_vcpu *vcpu); 73static void svm_flush_tlb(struct kvm_vcpu *vcpu);
74 74
75static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); 75static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
@@ -132,24 +132,6 @@ static inline u32 svm_has(u32 feat)
132 return svm_features & feat; 132 return svm_features & feat;
133} 133}
134 134
135static inline u8 pop_irq(struct kvm_vcpu *vcpu)
136{
137 int word_index = __ffs(vcpu->arch.irq_summary);
138 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
139 int irq = word_index * BITS_PER_LONG + bit_index;
140
141 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
142 if (!vcpu->arch.irq_pending[word_index])
143 clear_bit(word_index, &vcpu->arch.irq_summary);
144 return irq;
145}
146
147static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
148{
149 set_bit(irq, vcpu->arch.irq_pending);
150 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
151}
152
153static inline void clgi(void) 135static inline void clgi(void)
154{ 136{
155 asm volatile (__ex(SVM_CLGI)); 137 asm volatile (__ex(SVM_CLGI));
@@ -214,17 +196,31 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
214 svm->vmcb->control.event_inj_err = error_code; 196 svm->vmcb->control.event_inj_err = error_code;
215} 197}
216 198
217static bool svm_exception_injected(struct kvm_vcpu *vcpu) 199static int is_external_interrupt(u32 info)
200{
201 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
202 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
203}
204
205static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
218{ 206{
219 struct vcpu_svm *svm = to_svm(vcpu); 207 struct vcpu_svm *svm = to_svm(vcpu);
208 u32 ret = 0;
220 209
221 return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); 210 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
211 ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
212 return ret & mask;
222} 213}
223 214
224static int is_external_interrupt(u32 info) 215static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
225{ 216{
226 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; 217 struct vcpu_svm *svm = to_svm(vcpu);
227 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); 218
219 if (mask == 0)
220 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
221 else
222 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
223
228} 224}
229 225
230static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 226static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
@@ -232,7 +228,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
232 struct vcpu_svm *svm = to_svm(vcpu); 228 struct vcpu_svm *svm = to_svm(vcpu);
233 229
234 if (!svm->next_rip) { 230 if (!svm->next_rip) {
235 printk(KERN_DEBUG "%s: NOP\n", __func__); 231 if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) !=
232 EMULATE_DONE)
233 printk(KERN_DEBUG "%s: NOP\n", __func__);
236 return; 234 return;
237 } 235 }
238 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) 236 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
@@ -240,9 +238,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
240 __func__, kvm_rip_read(vcpu), svm->next_rip); 238 __func__, kvm_rip_read(vcpu), svm->next_rip);
241 239
242 kvm_rip_write(vcpu, svm->next_rip); 240 kvm_rip_write(vcpu, svm->next_rip);
243 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 241 svm_set_interrupt_shadow(vcpu, 0);
244
245 vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK);
246} 242}
247 243
248static int has_svm(void) 244static int has_svm(void)
@@ -830,6 +826,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
830 if (!var->unusable) 826 if (!var->unusable)
831 var->type |= 0x1; 827 var->type |= 0x1;
832 break; 828 break;
829 case VCPU_SREG_SS:
830 /* On AMD CPUs sometimes the DB bit in the segment
831 * descriptor is left as 1, although the whole segment has
832 * been made unusable. Clear it here to pass an Intel VMX
833 * entry check when cross vendor migrating.
834 */
835 if (var->unusable)
836 var->db = 0;
837 break;
833 } 838 }
834} 839}
835 840
@@ -960,15 +965,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
960 965
961} 966}
962 967
963static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 968static void update_db_intercept(struct kvm_vcpu *vcpu)
964{ 969{
965 int old_debug = vcpu->guest_debug;
966 struct vcpu_svm *svm = to_svm(vcpu); 970 struct vcpu_svm *svm = to_svm(vcpu);
967 971
968 vcpu->guest_debug = dbg->control;
969
970 svm->vmcb->control.intercept_exceptions &= 972 svm->vmcb->control.intercept_exceptions &=
971 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 973 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
974
975 if (vcpu->arch.singlestep)
976 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
977
972 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 978 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
973 if (vcpu->guest_debug & 979 if (vcpu->guest_debug &
974 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 980 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -979,6 +985,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
979 1 << BP_VECTOR; 985 1 << BP_VECTOR;
980 } else 986 } else
981 vcpu->guest_debug = 0; 987 vcpu->guest_debug = 0;
988}
989
990static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
991{
992 int old_debug = vcpu->guest_debug;
993 struct vcpu_svm *svm = to_svm(vcpu);
994
995 vcpu->guest_debug = dbg->control;
996
997 update_db_intercept(vcpu);
982 998
983 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 999 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
984 svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; 1000 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
@@ -993,16 +1009,6 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
993 return 0; 1009 return 0;
994} 1010}
995 1011
996static int svm_get_irq(struct kvm_vcpu *vcpu)
997{
998 struct vcpu_svm *svm = to_svm(vcpu);
999 u32 exit_int_info = svm->vmcb->control.exit_int_info;
1000
1001 if (is_external_interrupt(exit_int_info))
1002 return exit_int_info & SVM_EVTINJ_VEC_MASK;
1003 return -1;
1004}
1005
1006static void load_host_msrs(struct kvm_vcpu *vcpu) 1012static void load_host_msrs(struct kvm_vcpu *vcpu)
1007{ 1013{
1008#ifdef CONFIG_X86_64 1014#ifdef CONFIG_X86_64
@@ -1107,17 +1113,8 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
1107 1113
1108static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1114static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1109{ 1115{
1110 u32 exit_int_info = svm->vmcb->control.exit_int_info;
1111 struct kvm *kvm = svm->vcpu.kvm;
1112 u64 fault_address; 1116 u64 fault_address;
1113 u32 error_code; 1117 u32 error_code;
1114 bool event_injection = false;
1115
1116 if (!irqchip_in_kernel(kvm) &&
1117 is_external_interrupt(exit_int_info)) {
1118 event_injection = true;
1119 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
1120 }
1121 1118
1122 fault_address = svm->vmcb->control.exit_info_2; 1119 fault_address = svm->vmcb->control.exit_info_2;
1123 error_code = svm->vmcb->control.exit_info_1; 1120 error_code = svm->vmcb->control.exit_info_1;
@@ -1137,23 +1134,40 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1137 */ 1134 */
1138 if (npt_enabled) 1135 if (npt_enabled)
1139 svm_flush_tlb(&svm->vcpu); 1136 svm_flush_tlb(&svm->vcpu);
1140 1137 else {
1141 if (!npt_enabled && event_injection) 1138 if (kvm_event_needs_reinjection(&svm->vcpu))
1142 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1139 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1140 }
1143 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1141 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1144} 1142}
1145 1143
1146static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1144static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1147{ 1145{
1148 if (!(svm->vcpu.guest_debug & 1146 if (!(svm->vcpu.guest_debug &
1149 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 1147 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1148 !svm->vcpu.arch.singlestep) {
1150 kvm_queue_exception(&svm->vcpu, DB_VECTOR); 1149 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1151 return 1; 1150 return 1;
1152 } 1151 }
1153 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1152
1154 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1153 if (svm->vcpu.arch.singlestep) {
1155 kvm_run->debug.arch.exception = DB_VECTOR; 1154 svm->vcpu.arch.singlestep = false;
1156 return 0; 1155 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1156 svm->vmcb->save.rflags &=
1157 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1158 update_db_intercept(&svm->vcpu);
1159 }
1160
1161 if (svm->vcpu.guest_debug &
1162 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
1163 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1164 kvm_run->debug.arch.pc =
1165 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1166 kvm_run->debug.arch.exception = DB_VECTOR;
1167 return 0;
1168 }
1169
1170 return 1;
1157} 1171}
1158 1172
1159static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1173static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1842,17 +1856,51 @@ static int task_switch_interception(struct vcpu_svm *svm,
1842 struct kvm_run *kvm_run) 1856 struct kvm_run *kvm_run)
1843{ 1857{
1844 u16 tss_selector; 1858 u16 tss_selector;
1859 int reason;
1860 int int_type = svm->vmcb->control.exit_int_info &
1861 SVM_EXITINTINFO_TYPE_MASK;
1862 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
1863 uint32_t type =
1864 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
1865 uint32_t idt_v =
1866 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
1845 1867
1846 tss_selector = (u16)svm->vmcb->control.exit_info_1; 1868 tss_selector = (u16)svm->vmcb->control.exit_info_1;
1869
1847 if (svm->vmcb->control.exit_info_2 & 1870 if (svm->vmcb->control.exit_info_2 &
1848 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) 1871 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
1849 return kvm_task_switch(&svm->vcpu, tss_selector, 1872 reason = TASK_SWITCH_IRET;
1850 TASK_SWITCH_IRET); 1873 else if (svm->vmcb->control.exit_info_2 &
1851 if (svm->vmcb->control.exit_info_2 & 1874 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
1852 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) 1875 reason = TASK_SWITCH_JMP;
1853 return kvm_task_switch(&svm->vcpu, tss_selector, 1876 else if (idt_v)
1854 TASK_SWITCH_JMP); 1877 reason = TASK_SWITCH_GATE;
1855 return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL); 1878 else
1879 reason = TASK_SWITCH_CALL;
1880
1881 if (reason == TASK_SWITCH_GATE) {
1882 switch (type) {
1883 case SVM_EXITINTINFO_TYPE_NMI:
1884 svm->vcpu.arch.nmi_injected = false;
1885 break;
1886 case SVM_EXITINTINFO_TYPE_EXEPT:
1887 kvm_clear_exception_queue(&svm->vcpu);
1888 break;
1889 case SVM_EXITINTINFO_TYPE_INTR:
1890 kvm_clear_interrupt_queue(&svm->vcpu);
1891 break;
1892 default:
1893 break;
1894 }
1895 }
1896
1897 if (reason != TASK_SWITCH_GATE ||
1898 int_type == SVM_EXITINTINFO_TYPE_SOFT ||
1899 (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
1900 (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
1901 skip_emulated_instruction(&svm->vcpu);
1902
1903 return kvm_task_switch(&svm->vcpu, tss_selector, reason);
1856} 1904}
1857 1905
1858static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1906static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1862,6 +1910,14 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1862 return 1; 1910 return 1;
1863} 1911}
1864 1912
1913static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1914{
1915 ++svm->vcpu.stat.nmi_window_exits;
1916 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
1917 svm->vcpu.arch.hflags |= HF_IRET_MASK;
1918 return 1;
1919}
1920
1865static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1921static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1866{ 1922{
1867 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) 1923 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
@@ -1879,8 +1935,14 @@ static int emulate_on_interception(struct vcpu_svm *svm,
1879 1935
1880static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1936static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1881{ 1937{
1938 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
1939 /* instruction emulation calls kvm_set_cr8() */
1882 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); 1940 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
1883 if (irqchip_in_kernel(svm->vcpu.kvm)) 1941 if (irqchip_in_kernel(svm->vcpu.kvm)) {
1942 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
1943 return 1;
1944 }
1945 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
1884 return 1; 1946 return 1;
1885 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 1947 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1886 return 0; 1948 return 0;
@@ -2090,8 +2152,9 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
2090 * If the user space waits to inject interrupts, exit as soon as 2152 * If the user space waits to inject interrupts, exit as soon as
2091 * possible 2153 * possible
2092 */ 2154 */
2093 if (kvm_run->request_interrupt_window && 2155 if (!irqchip_in_kernel(svm->vcpu.kvm) &&
2094 !svm->vcpu.arch.irq_summary) { 2156 kvm_run->request_interrupt_window &&
2157 !kvm_cpu_has_interrupt(&svm->vcpu)) {
2095 ++svm->vcpu.stat.irq_window_exits; 2158 ++svm->vcpu.stat.irq_window_exits;
2096 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2159 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2097 return 0; 2160 return 0;
@@ -2134,6 +2197,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2134 [SVM_EXIT_VINTR] = interrupt_window_interception, 2197 [SVM_EXIT_VINTR] = interrupt_window_interception,
2135 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ 2198 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
2136 [SVM_EXIT_CPUID] = cpuid_interception, 2199 [SVM_EXIT_CPUID] = cpuid_interception,
2200 [SVM_EXIT_IRET] = iret_interception,
2137 [SVM_EXIT_INVD] = emulate_on_interception, 2201 [SVM_EXIT_INVD] = emulate_on_interception,
2138 [SVM_EXIT_HLT] = halt_interception, 2202 [SVM_EXIT_HLT] = halt_interception,
2139 [SVM_EXIT_INVLPG] = invlpg_interception, 2203 [SVM_EXIT_INVLPG] = invlpg_interception,
@@ -2194,7 +2258,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2194 } 2258 }
2195 } 2259 }
2196 2260
2197 kvm_reput_irq(svm);
2198 2261
2199 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 2262 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2200 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2263 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2205,7 +2268,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2205 2268
2206 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 2269 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
2207 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 2270 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
2208 exit_code != SVM_EXIT_NPF) 2271 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
2209 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " 2272 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
2210 "exit_code 0x%x\n", 2273 "exit_code 0x%x\n",
2211 __func__, svm->vmcb->control.exit_int_info, 2274 __func__, svm->vmcb->control.exit_int_info,
@@ -2242,6 +2305,15 @@ static void pre_svm_run(struct vcpu_svm *svm)
2242 new_asid(svm, svm_data); 2305 new_asid(svm, svm_data);
2243} 2306}
2244 2307
2308static void svm_inject_nmi(struct kvm_vcpu *vcpu)
2309{
2310 struct vcpu_svm *svm = to_svm(vcpu);
2311
2312 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
2313 vcpu->arch.hflags |= HF_NMI_MASK;
2314 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
2315 ++vcpu->stat.nmi_injections;
2316}
2245 2317
2246static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) 2318static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2247{ 2319{
@@ -2257,134 +2329,71 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2257 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 2329 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
2258} 2330}
2259 2331
2260static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) 2332static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
2261{ 2333{
2262 struct vcpu_svm *svm = to_svm(vcpu); 2334 struct vcpu_svm *svm = to_svm(vcpu);
2263 2335
2264 nested_svm_intr(svm); 2336 svm->vmcb->control.event_inj = nr |
2265 2337 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2266 svm_inject_irq(svm, irq);
2267} 2338}
2268 2339
2269static void update_cr8_intercept(struct kvm_vcpu *vcpu) 2340static void svm_set_irq(struct kvm_vcpu *vcpu)
2270{ 2341{
2271 struct vcpu_svm *svm = to_svm(vcpu); 2342 struct vcpu_svm *svm = to_svm(vcpu);
2272 struct vmcb *vmcb = svm->vmcb;
2273 int max_irr, tpr;
2274 2343
2275 if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr) 2344 nested_svm_intr(svm);
2276 return;
2277 2345
2278 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2346 svm_queue_irq(vcpu, vcpu->arch.interrupt.nr);
2347}
2279 2348
2280 max_irr = kvm_lapic_find_highest_irr(vcpu); 2349static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
2281 if (max_irr == -1) 2350{
2282 return; 2351 struct vcpu_svm *svm = to_svm(vcpu);
2283 2352
2284 tpr = kvm_lapic_get_cr8(vcpu) << 4; 2353 if (irr == -1)
2354 return;
2285 2355
2286 if (tpr >= (max_irr & 0xf0)) 2356 if (tpr >= irr)
2287 vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; 2357 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
2288} 2358}
2289 2359
2290static void svm_intr_assist(struct kvm_vcpu *vcpu) 2360static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2291{ 2361{
2292 struct vcpu_svm *svm = to_svm(vcpu); 2362 struct vcpu_svm *svm = to_svm(vcpu);
2293 struct vmcb *vmcb = svm->vmcb; 2363 struct vmcb *vmcb = svm->vmcb;
2294 int intr_vector = -1; 2364 return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2295 2365 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2296 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
2297 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
2298 intr_vector = vmcb->control.exit_int_info &
2299 SVM_EVTINJ_VEC_MASK;
2300 vmcb->control.exit_int_info = 0;
2301 svm_inject_irq(svm, intr_vector);
2302 goto out;
2303 }
2304
2305 if (vmcb->control.int_ctl & V_IRQ_MASK)
2306 goto out;
2307
2308 if (!kvm_cpu_has_interrupt(vcpu))
2309 goto out;
2310
2311 if (nested_svm_intr(svm))
2312 goto out;
2313
2314 if (!(svm->vcpu.arch.hflags & HF_GIF_MASK))
2315 goto out;
2316
2317 if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
2318 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
2319 (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
2320 /* unable to deliver irq, set pending irq */
2321 svm_set_vintr(svm);
2322 svm_inject_irq(svm, 0x0);
2323 goto out;
2324 }
2325 /* Okay, we can deliver the interrupt: grab it and update PIC state. */
2326 intr_vector = kvm_cpu_get_interrupt(vcpu);
2327 svm_inject_irq(svm, intr_vector);
2328out:
2329 update_cr8_intercept(vcpu);
2330} 2366}
2331 2367
2332static void kvm_reput_irq(struct vcpu_svm *svm) 2368static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2333{ 2369{
2334 struct vmcb_control_area *control = &svm->vmcb->control; 2370 struct vcpu_svm *svm = to_svm(vcpu);
2335 2371 struct vmcb *vmcb = svm->vmcb;
2336 if ((control->int_ctl & V_IRQ_MASK) 2372 return (vmcb->save.rflags & X86_EFLAGS_IF) &&
2337 && !irqchip_in_kernel(svm->vcpu.kvm)) { 2373 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2338 control->int_ctl &= ~V_IRQ_MASK; 2374 (svm->vcpu.arch.hflags & HF_GIF_MASK);
2339 push_irq(&svm->vcpu, control->int_vector);
2340 }
2341
2342 svm->vcpu.arch.interrupt_window_open =
2343 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2344 (svm->vcpu.arch.hflags & HF_GIF_MASK);
2345} 2375}
2346 2376
2347static void svm_do_inject_vector(struct vcpu_svm *svm) 2377static void enable_irq_window(struct kvm_vcpu *vcpu)
2348{ 2378{
2349 struct kvm_vcpu *vcpu = &svm->vcpu; 2379 svm_set_vintr(to_svm(vcpu));
2350 int word_index = __ffs(vcpu->arch.irq_summary); 2380 svm_inject_irq(to_svm(vcpu), 0x0);
2351 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
2352 int irq = word_index * BITS_PER_LONG + bit_index;
2353
2354 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
2355 if (!vcpu->arch.irq_pending[word_index])
2356 clear_bit(word_index, &vcpu->arch.irq_summary);
2357 svm_inject_irq(svm, irq);
2358} 2381}
2359 2382
2360static void do_interrupt_requests(struct kvm_vcpu *vcpu, 2383static void enable_nmi_window(struct kvm_vcpu *vcpu)
2361 struct kvm_run *kvm_run)
2362{ 2384{
2363 struct vcpu_svm *svm = to_svm(vcpu); 2385 struct vcpu_svm *svm = to_svm(vcpu);
2364 struct vmcb_control_area *control = &svm->vmcb->control;
2365
2366 if (nested_svm_intr(svm))
2367 return;
2368 2386
2369 svm->vcpu.arch.interrupt_window_open = 2387 if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
2370 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && 2388 == HF_NMI_MASK)
2371 (svm->vmcb->save.rflags & X86_EFLAGS_IF) && 2389 return; /* IRET will cause a vm exit */
2372 (svm->vcpu.arch.hflags & HF_GIF_MASK));
2373 2390
2374 if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) 2391 /* Something prevents NMI from been injected. Single step over
2375 /* 2392 possible problem (IRET or exception injection or interrupt
2376 * If interrupts enabled, and not blocked by sti or mov ss. Good. 2393 shadow) */
2377 */ 2394 vcpu->arch.singlestep = true;
2378 svm_do_inject_vector(svm); 2395 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2379 2396 update_db_intercept(vcpu);
2380 /*
2381 * Interrupts blocked. Wait for unblock.
2382 */
2383 if (!svm->vcpu.arch.interrupt_window_open &&
2384 (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
2385 svm_set_vintr(svm);
2386 else
2387 svm_clear_vintr(svm);
2388} 2397}
2389 2398
2390static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 2399static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2407,7 +2416,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
2407 2416
2408 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 2417 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
2409 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 2418 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
2410 kvm_lapic_set_tpr(vcpu, cr8); 2419 kvm_set_cr8(vcpu, cr8);
2411 } 2420 }
2412} 2421}
2413 2422
@@ -2416,14 +2425,54 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
2416 struct vcpu_svm *svm = to_svm(vcpu); 2425 struct vcpu_svm *svm = to_svm(vcpu);
2417 u64 cr8; 2426 u64 cr8;
2418 2427
2419 if (!irqchip_in_kernel(vcpu->kvm))
2420 return;
2421
2422 cr8 = kvm_get_cr8(vcpu); 2428 cr8 = kvm_get_cr8(vcpu);
2423 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 2429 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
2424 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 2430 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
2425} 2431}
2426 2432
2433static void svm_complete_interrupts(struct vcpu_svm *svm)
2434{
2435 u8 vector;
2436 int type;
2437 u32 exitintinfo = svm->vmcb->control.exit_int_info;
2438
2439 if (svm->vcpu.arch.hflags & HF_IRET_MASK)
2440 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
2441
2442 svm->vcpu.arch.nmi_injected = false;
2443 kvm_clear_exception_queue(&svm->vcpu);
2444 kvm_clear_interrupt_queue(&svm->vcpu);
2445
2446 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
2447 return;
2448
2449 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
2450 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
2451
2452 switch (type) {
2453 case SVM_EXITINTINFO_TYPE_NMI:
2454 svm->vcpu.arch.nmi_injected = true;
2455 break;
2456 case SVM_EXITINTINFO_TYPE_EXEPT:
2457 /* In case of software exception do not reinject an exception
2458 vector, but re-execute and instruction instead */
2459 if (kvm_exception_is_soft(vector))
2460 break;
2461 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
2462 u32 err = svm->vmcb->control.exit_int_info_err;
2463 kvm_queue_exception_e(&svm->vcpu, vector, err);
2464
2465 } else
2466 kvm_queue_exception(&svm->vcpu, vector);
2467 break;
2468 case SVM_EXITINTINFO_TYPE_INTR:
2469 kvm_queue_interrupt(&svm->vcpu, vector, false);
2470 break;
2471 default:
2472 break;
2473 }
2474}
2475
2427#ifdef CONFIG_X86_64 2476#ifdef CONFIG_X86_64
2428#define R "r" 2477#define R "r"
2429#else 2478#else
@@ -2552,6 +2601,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2552 sync_cr8_to_lapic(vcpu); 2601 sync_cr8_to_lapic(vcpu);
2553 2602
2554 svm->next_rip = 0; 2603 svm->next_rip = 0;
2604
2605 svm_complete_interrupts(svm);
2555} 2606}
2556 2607
2557#undef R 2608#undef R
@@ -2617,7 +2668,7 @@ static int get_npt_level(void)
2617#endif 2668#endif
2618} 2669}
2619 2670
2620static int svm_get_mt_mask_shift(void) 2671static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
2621{ 2672{
2622 return 0; 2673 return 0;
2623} 2674}
@@ -2667,17 +2718,21 @@ static struct kvm_x86_ops svm_x86_ops = {
2667 .run = svm_vcpu_run, 2718 .run = svm_vcpu_run,
2668 .handle_exit = handle_exit, 2719 .handle_exit = handle_exit,
2669 .skip_emulated_instruction = skip_emulated_instruction, 2720 .skip_emulated_instruction = skip_emulated_instruction,
2721 .set_interrupt_shadow = svm_set_interrupt_shadow,
2722 .get_interrupt_shadow = svm_get_interrupt_shadow,
2670 .patch_hypercall = svm_patch_hypercall, 2723 .patch_hypercall = svm_patch_hypercall,
2671 .get_irq = svm_get_irq,
2672 .set_irq = svm_set_irq, 2724 .set_irq = svm_set_irq,
2725 .set_nmi = svm_inject_nmi,
2673 .queue_exception = svm_queue_exception, 2726 .queue_exception = svm_queue_exception,
2674 .exception_injected = svm_exception_injected, 2727 .interrupt_allowed = svm_interrupt_allowed,
2675 .inject_pending_irq = svm_intr_assist, 2728 .nmi_allowed = svm_nmi_allowed,
2676 .inject_pending_vectors = do_interrupt_requests, 2729 .enable_nmi_window = enable_nmi_window,
2730 .enable_irq_window = enable_irq_window,
2731 .update_cr8_intercept = update_cr8_intercept,
2677 2732
2678 .set_tss_addr = svm_set_tss_addr, 2733 .set_tss_addr = svm_set_tss_addr,
2679 .get_tdp_level = get_npt_level, 2734 .get_tdp_level = get_npt_level,
2680 .get_mt_mask_shift = svm_get_mt_mask_shift, 2735 .get_mt_mask = svm_get_mt_mask,
2681}; 2736};
2682 2737
2683static int __init svm_init(void) 2738static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
new file mode 100644
index 000000000000..86dbac072d0c
--- /dev/null
+++ b/arch/x86/kvm/timer.c
@@ -0,0 +1,46 @@
1#include <linux/kvm_host.h>
2#include <linux/kvm.h>
3#include <linux/hrtimer.h>
4#include <asm/atomic.h>
5#include "kvm_timer.h"
6
7static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
8{
9 int restart_timer = 0;
10 wait_queue_head_t *q = &vcpu->wq;
11
12 /* FIXME: this code should not know anything about vcpus */
13 if (!atomic_inc_and_test(&ktimer->pending))
14 set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
15
16 if (!ktimer->reinject)
17 atomic_set(&ktimer->pending, 1);
18
19 if (waitqueue_active(q))
20 wake_up_interruptible(q);
21
22 if (ktimer->t_ops->is_periodic(ktimer)) {
23 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
24 restart_timer = 1;
25 }
26
27 return restart_timer;
28}
29
30enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
31{
32 int restart_timer;
33 struct kvm_vcpu *vcpu;
34 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
35
36 vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id];
37 if (!vcpu)
38 return HRTIMER_NORESTART;
39
40 restart_timer = __kvm_timer_fn(vcpu, ktimer);
41 if (restart_timer)
42 return HRTIMER_RESTART;
43 else
44 return HRTIMER_NORESTART;
45}
46
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bb481330716f..e770bf349ec4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -32,26 +32,27 @@
32#include <asm/desc.h> 32#include <asm/desc.h>
33#include <asm/vmx.h> 33#include <asm/vmx.h>
34#include <asm/virtext.h> 34#include <asm/virtext.h>
35#include <asm/mce.h>
35 36
36#define __ex(x) __kvm_handle_fault_on_reboot(x) 37#define __ex(x) __kvm_handle_fault_on_reboot(x)
37 38
38MODULE_AUTHOR("Qumranet"); 39MODULE_AUTHOR("Qumranet");
39MODULE_LICENSE("GPL"); 40MODULE_LICENSE("GPL");
40 41
41static int bypass_guest_pf = 1; 42static int __read_mostly bypass_guest_pf = 1;
42module_param(bypass_guest_pf, bool, 0); 43module_param(bypass_guest_pf, bool, S_IRUGO);
43 44
44static int enable_vpid = 1; 45static int __read_mostly enable_vpid = 1;
45module_param(enable_vpid, bool, 0); 46module_param_named(vpid, enable_vpid, bool, 0444);
46 47
47static int flexpriority_enabled = 1; 48static int __read_mostly flexpriority_enabled = 1;
48module_param(flexpriority_enabled, bool, 0); 49module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
49 50
50static int enable_ept = 1; 51static int __read_mostly enable_ept = 1;
51module_param(enable_ept, bool, 0); 52module_param_named(ept, enable_ept, bool, S_IRUGO);
52 53
53static int emulate_invalid_guest_state = 0; 54static int __read_mostly emulate_invalid_guest_state = 0;
54module_param(emulate_invalid_guest_state, bool, 0); 55module_param(emulate_invalid_guest_state, bool, S_IRUGO);
55 56
56struct vmcs { 57struct vmcs {
57 u32 revision_id; 58 u32 revision_id;
@@ -97,6 +98,7 @@ struct vcpu_vmx {
97 int soft_vnmi_blocked; 98 int soft_vnmi_blocked;
98 ktime_t entry_time; 99 ktime_t entry_time;
99 s64 vnmi_blocked_time; 100 s64 vnmi_blocked_time;
101 u32 exit_reason;
100}; 102};
101 103
102static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 104static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -111,9 +113,10 @@ static DEFINE_PER_CPU(struct vmcs *, vmxarea);
111static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 113static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
112static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 114static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
113 115
114static struct page *vmx_io_bitmap_a; 116static unsigned long *vmx_io_bitmap_a;
115static struct page *vmx_io_bitmap_b; 117static unsigned long *vmx_io_bitmap_b;
116static struct page *vmx_msr_bitmap; 118static unsigned long *vmx_msr_bitmap_legacy;
119static unsigned long *vmx_msr_bitmap_longmode;
117 120
118static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 121static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
119static DEFINE_SPINLOCK(vmx_vpid_lock); 122static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -213,70 +216,78 @@ static inline int is_external_interrupt(u32 intr_info)
213 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 216 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
214} 217}
215 218
219static inline int is_machine_check(u32 intr_info)
220{
221 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
222 INTR_INFO_VALID_MASK)) ==
223 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
224}
225
216static inline int cpu_has_vmx_msr_bitmap(void) 226static inline int cpu_has_vmx_msr_bitmap(void)
217{ 227{
218 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS); 228 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
219} 229}
220 230
221static inline int cpu_has_vmx_tpr_shadow(void) 231static inline int cpu_has_vmx_tpr_shadow(void)
222{ 232{
223 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); 233 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
224} 234}
225 235
226static inline int vm_need_tpr_shadow(struct kvm *kvm) 236static inline int vm_need_tpr_shadow(struct kvm *kvm)
227{ 237{
228 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); 238 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
229} 239}
230 240
231static inline int cpu_has_secondary_exec_ctrls(void) 241static inline int cpu_has_secondary_exec_ctrls(void)
232{ 242{
233 return (vmcs_config.cpu_based_exec_ctrl & 243 return vmcs_config.cpu_based_exec_ctrl &
234 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); 244 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
235} 245}
236 246
237static inline bool cpu_has_vmx_virtualize_apic_accesses(void) 247static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
238{ 248{
239 return flexpriority_enabled 249 return vmcs_config.cpu_based_2nd_exec_ctrl &
240 && (vmcs_config.cpu_based_2nd_exec_ctrl & 250 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
241 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 251}
252
253static inline bool cpu_has_vmx_flexpriority(void)
254{
255 return cpu_has_vmx_tpr_shadow() &&
256 cpu_has_vmx_virtualize_apic_accesses();
242} 257}
243 258
244static inline int cpu_has_vmx_invept_individual_addr(void) 259static inline int cpu_has_vmx_invept_individual_addr(void)
245{ 260{
246 return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT)); 261 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
247} 262}
248 263
249static inline int cpu_has_vmx_invept_context(void) 264static inline int cpu_has_vmx_invept_context(void)
250{ 265{
251 return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT)); 266 return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
252} 267}
253 268
254static inline int cpu_has_vmx_invept_global(void) 269static inline int cpu_has_vmx_invept_global(void)
255{ 270{
256 return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT)); 271 return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
257} 272}
258 273
259static inline int cpu_has_vmx_ept(void) 274static inline int cpu_has_vmx_ept(void)
260{ 275{
261 return (vmcs_config.cpu_based_2nd_exec_ctrl & 276 return vmcs_config.cpu_based_2nd_exec_ctrl &
262 SECONDARY_EXEC_ENABLE_EPT); 277 SECONDARY_EXEC_ENABLE_EPT;
263}
264
265static inline int vm_need_ept(void)
266{
267 return (cpu_has_vmx_ept() && enable_ept);
268} 278}
269 279
270static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 280static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
271{ 281{
272 return ((cpu_has_vmx_virtualize_apic_accesses()) && 282 return flexpriority_enabled &&
273 (irqchip_in_kernel(kvm))); 283 (cpu_has_vmx_virtualize_apic_accesses()) &&
284 (irqchip_in_kernel(kvm));
274} 285}
275 286
276static inline int cpu_has_vmx_vpid(void) 287static inline int cpu_has_vmx_vpid(void)
277{ 288{
278 return (vmcs_config.cpu_based_2nd_exec_ctrl & 289 return vmcs_config.cpu_based_2nd_exec_ctrl &
279 SECONDARY_EXEC_ENABLE_VPID); 290 SECONDARY_EXEC_ENABLE_VPID;
280} 291}
281 292
282static inline int cpu_has_virtual_nmis(void) 293static inline int cpu_has_virtual_nmis(void)
@@ -284,6 +295,11 @@ static inline int cpu_has_virtual_nmis(void)
284 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 295 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
285} 296}
286 297
298static inline bool report_flexpriority(void)
299{
300 return flexpriority_enabled;
301}
302
287static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 303static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
288{ 304{
289 int i; 305 int i;
@@ -381,7 +397,7 @@ static inline void ept_sync_global(void)
381 397
382static inline void ept_sync_context(u64 eptp) 398static inline void ept_sync_context(u64 eptp)
383{ 399{
384 if (vm_need_ept()) { 400 if (enable_ept) {
385 if (cpu_has_vmx_invept_context()) 401 if (cpu_has_vmx_invept_context())
386 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); 402 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
387 else 403 else
@@ -391,7 +407,7 @@ static inline void ept_sync_context(u64 eptp)
391 407
392static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) 408static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
393{ 409{
394 if (vm_need_ept()) { 410 if (enable_ept) {
395 if (cpu_has_vmx_invept_individual_addr()) 411 if (cpu_has_vmx_invept_individual_addr())
396 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, 412 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
397 eptp, gpa); 413 eptp, gpa);
@@ -478,7 +494,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
478{ 494{
479 u32 eb; 495 u32 eb;
480 496
481 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); 497 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
482 if (!vcpu->fpu_active) 498 if (!vcpu->fpu_active)
483 eb |= 1u << NM_VECTOR; 499 eb |= 1u << NM_VECTOR;
484 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 500 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -488,9 +504,9 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
488 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 504 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
489 eb |= 1u << BP_VECTOR; 505 eb |= 1u << BP_VECTOR;
490 } 506 }
491 if (vcpu->arch.rmode.active) 507 if (vcpu->arch.rmode.vm86_active)
492 eb = ~0; 508 eb = ~0;
493 if (vm_need_ept()) 509 if (enable_ept)
494 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 510 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
495 vmcs_write32(EXCEPTION_BITMAP, eb); 511 vmcs_write32(EXCEPTION_BITMAP, eb);
496} 512}
@@ -724,29 +740,50 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
724 740
725static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 741static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
726{ 742{
727 if (vcpu->arch.rmode.active) 743 if (vcpu->arch.rmode.vm86_active)
728 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 744 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
729 vmcs_writel(GUEST_RFLAGS, rflags); 745 vmcs_writel(GUEST_RFLAGS, rflags);
730} 746}
731 747
748static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
749{
750 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
751 int ret = 0;
752
753 if (interruptibility & GUEST_INTR_STATE_STI)
754 ret |= X86_SHADOW_INT_STI;
755 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
756 ret |= X86_SHADOW_INT_MOV_SS;
757
758 return ret & mask;
759}
760
761static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
762{
763 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
764 u32 interruptibility = interruptibility_old;
765
766 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
767
768 if (mask & X86_SHADOW_INT_MOV_SS)
769 interruptibility |= GUEST_INTR_STATE_MOV_SS;
770 if (mask & X86_SHADOW_INT_STI)
771 interruptibility |= GUEST_INTR_STATE_STI;
772
773 if ((interruptibility != interruptibility_old))
774 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
775}
776
732static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 777static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
733{ 778{
734 unsigned long rip; 779 unsigned long rip;
735 u32 interruptibility;
736 780
737 rip = kvm_rip_read(vcpu); 781 rip = kvm_rip_read(vcpu);
738 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 782 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
739 kvm_rip_write(vcpu, rip); 783 kvm_rip_write(vcpu, rip);
740 784
741 /* 785 /* skipping an emulated instruction also counts */
742 * We emulated an instruction, so temporary interrupt blocking 786 vmx_set_interrupt_shadow(vcpu, 0);
743 * should be removed, if set.
744 */
745 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
746 if (interruptibility & 3)
747 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
748 interruptibility & ~3);
749 vcpu->arch.interrupt_window_open = 1;
750} 787}
751 788
752static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 789static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -760,7 +797,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
760 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 797 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
761 } 798 }
762 799
763 if (vcpu->arch.rmode.active) { 800 if (vcpu->arch.rmode.vm86_active) {
764 vmx->rmode.irq.pending = true; 801 vmx->rmode.irq.pending = true;
765 vmx->rmode.irq.vector = nr; 802 vmx->rmode.irq.vector = nr;
766 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 803 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -773,8 +810,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
773 return; 810 return;
774 } 811 }
775 812
776 if (nr == BP_VECTOR || nr == OF_VECTOR) { 813 if (kvm_exception_is_soft(nr)) {
777 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 814 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
815 vmx->vcpu.arch.event_exit_inst_len);
778 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 816 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
779 } else 817 } else
780 intr_info |= INTR_TYPE_HARD_EXCEPTION; 818 intr_info |= INTR_TYPE_HARD_EXCEPTION;
@@ -782,11 +820,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
782 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 820 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
783} 821}
784 822
785static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
786{
787 return false;
788}
789
790/* 823/*
791 * Swap MSR entry in host/guest MSR entry array. 824 * Swap MSR entry in host/guest MSR entry array.
792 */ 825 */
@@ -812,6 +845,7 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
812static void setup_msrs(struct vcpu_vmx *vmx) 845static void setup_msrs(struct vcpu_vmx *vmx)
813{ 846{
814 int save_nmsrs; 847 int save_nmsrs;
848 unsigned long *msr_bitmap;
815 849
816 vmx_load_host_state(vmx); 850 vmx_load_host_state(vmx);
817 save_nmsrs = 0; 851 save_nmsrs = 0;
@@ -847,6 +881,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
847 __find_msr_index(vmx, MSR_KERNEL_GS_BASE); 881 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
848#endif 882#endif
849 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); 883 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
884
885 if (cpu_has_vmx_msr_bitmap()) {
886 if (is_long_mode(&vmx->vcpu))
887 msr_bitmap = vmx_msr_bitmap_longmode;
888 else
889 msr_bitmap = vmx_msr_bitmap_legacy;
890
891 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
892 }
850} 893}
851 894
852/* 895/*
@@ -1034,13 +1077,6 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1034 return 0; 1077 return 0;
1035} 1078}
1036 1079
1037static int vmx_get_irq(struct kvm_vcpu *vcpu)
1038{
1039 if (!vcpu->arch.interrupt.pending)
1040 return -1;
1041 return vcpu->arch.interrupt.nr;
1042}
1043
1044static __init int cpu_has_kvm_support(void) 1080static __init int cpu_has_kvm_support(void)
1045{ 1081{
1046 return cpu_has_vmx(); 1082 return cpu_has_vmx();
@@ -1241,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
1241 struct page *pages; 1277 struct page *pages;
1242 struct vmcs *vmcs; 1278 struct vmcs *vmcs;
1243 1279
1244 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); 1280 pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
1245 if (!pages) 1281 if (!pages)
1246 return NULL; 1282 return NULL;
1247 vmcs = page_address(pages); 1283 vmcs = page_address(pages);
@@ -1294,6 +1330,18 @@ static __init int hardware_setup(void)
1294 if (boot_cpu_has(X86_FEATURE_NX)) 1330 if (boot_cpu_has(X86_FEATURE_NX))
1295 kvm_enable_efer_bits(EFER_NX); 1331 kvm_enable_efer_bits(EFER_NX);
1296 1332
1333 if (!cpu_has_vmx_vpid())
1334 enable_vpid = 0;
1335
1336 if (!cpu_has_vmx_ept())
1337 enable_ept = 0;
1338
1339 if (!cpu_has_vmx_flexpriority())
1340 flexpriority_enabled = 0;
1341
1342 if (!cpu_has_vmx_tpr_shadow())
1343 kvm_x86_ops->update_cr8_intercept = NULL;
1344
1297 return alloc_kvm_area(); 1345 return alloc_kvm_area();
1298} 1346}
1299 1347
@@ -1324,7 +1372,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1324 struct vcpu_vmx *vmx = to_vmx(vcpu); 1372 struct vcpu_vmx *vmx = to_vmx(vcpu);
1325 1373
1326 vmx->emulation_required = 1; 1374 vmx->emulation_required = 1;
1327 vcpu->arch.rmode.active = 0; 1375 vcpu->arch.rmode.vm86_active = 0;
1328 1376
1329 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); 1377 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1330 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); 1378 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
@@ -1386,7 +1434,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1386 struct vcpu_vmx *vmx = to_vmx(vcpu); 1434 struct vcpu_vmx *vmx = to_vmx(vcpu);
1387 1435
1388 vmx->emulation_required = 1; 1436 vmx->emulation_required = 1;
1389 vcpu->arch.rmode.active = 1; 1437 vcpu->arch.rmode.vm86_active = 1;
1390 1438
1391 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1439 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1392 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1440 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1485,7 +1533,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1485static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1533static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1486{ 1534{
1487 vpid_sync_vcpu_all(to_vmx(vcpu)); 1535 vpid_sync_vcpu_all(to_vmx(vcpu));
1488 if (vm_need_ept()) 1536 if (enable_ept)
1489 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1537 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1490} 1538}
1491 1539
@@ -1555,10 +1603,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1555 1603
1556 vmx_fpu_deactivate(vcpu); 1604 vmx_fpu_deactivate(vcpu);
1557 1605
1558 if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) 1606 if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
1559 enter_pmode(vcpu); 1607 enter_pmode(vcpu);
1560 1608
1561 if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) 1609 if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
1562 enter_rmode(vcpu); 1610 enter_rmode(vcpu);
1563 1611
1564#ifdef CONFIG_X86_64 1612#ifdef CONFIG_X86_64
@@ -1570,7 +1618,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1570 } 1618 }
1571#endif 1619#endif
1572 1620
1573 if (vm_need_ept()) 1621 if (enable_ept)
1574 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 1622 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1575 1623
1576 vmcs_writel(CR0_READ_SHADOW, cr0); 1624 vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -1599,7 +1647,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1599 u64 eptp; 1647 u64 eptp;
1600 1648
1601 guest_cr3 = cr3; 1649 guest_cr3 = cr3;
1602 if (vm_need_ept()) { 1650 if (enable_ept) {
1603 eptp = construct_eptp(cr3); 1651 eptp = construct_eptp(cr3);
1604 vmcs_write64(EPT_POINTER, eptp); 1652 vmcs_write64(EPT_POINTER, eptp);
1605 ept_sync_context(eptp); 1653 ept_sync_context(eptp);
@@ -1616,11 +1664,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1616 1664
1617static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1665static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1618{ 1666{
1619 unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ? 1667 unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
1620 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 1668 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1621 1669
1622 vcpu->arch.cr4 = cr4; 1670 vcpu->arch.cr4 = cr4;
1623 if (vm_need_ept()) 1671 if (enable_ept)
1624 ept_update_paging_mode_cr4(&hw_cr4, vcpu); 1672 ept_update_paging_mode_cr4(&hw_cr4, vcpu);
1625 1673
1626 vmcs_writel(CR4_READ_SHADOW, cr4); 1674 vmcs_writel(CR4_READ_SHADOW, cr4);
@@ -1699,7 +1747,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
1699 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 1747 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1700 u32 ar; 1748 u32 ar;
1701 1749
1702 if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { 1750 if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
1703 vcpu->arch.rmode.tr.selector = var->selector; 1751 vcpu->arch.rmode.tr.selector = var->selector;
1704 vcpu->arch.rmode.tr.base = var->base; 1752 vcpu->arch.rmode.tr.base = var->base;
1705 vcpu->arch.rmode.tr.limit = var->limit; 1753 vcpu->arch.rmode.tr.limit = var->limit;
@@ -1709,7 +1757,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
1709 vmcs_writel(sf->base, var->base); 1757 vmcs_writel(sf->base, var->base);
1710 vmcs_write32(sf->limit, var->limit); 1758 vmcs_write32(sf->limit, var->limit);
1711 vmcs_write16(sf->selector, var->selector); 1759 vmcs_write16(sf->selector, var->selector);
1712 if (vcpu->arch.rmode.active && var->s) { 1760 if (vcpu->arch.rmode.vm86_active && var->s) {
1713 /* 1761 /*
1714 * Hack real-mode segments into vm86 compatibility. 1762 * Hack real-mode segments into vm86 compatibility.
1715 */ 1763 */
@@ -1982,7 +2030,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
1982 pfn_t identity_map_pfn; 2030 pfn_t identity_map_pfn;
1983 u32 tmp; 2031 u32 tmp;
1984 2032
1985 if (!vm_need_ept()) 2033 if (!enable_ept)
1986 return 1; 2034 return 1;
1987 if (unlikely(!kvm->arch.ept_identity_pagetable)) { 2035 if (unlikely(!kvm->arch.ept_identity_pagetable)) {
1988 printk(KERN_ERR "EPT: identity-mapping pagetable " 2036 printk(KERN_ERR "EPT: identity-mapping pagetable "
@@ -2071,7 +2119,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
2071 int vpid; 2119 int vpid;
2072 2120
2073 vmx->vpid = 0; 2121 vmx->vpid = 0;
2074 if (!enable_vpid || !cpu_has_vmx_vpid()) 2122 if (!enable_vpid)
2075 return; 2123 return;
2076 spin_lock(&vmx_vpid_lock); 2124 spin_lock(&vmx_vpid_lock);
2077 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 2125 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
@@ -2082,9 +2130,9 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
2082 spin_unlock(&vmx_vpid_lock); 2130 spin_unlock(&vmx_vpid_lock);
2083} 2131}
2084 2132
2085static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) 2133static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
2086{ 2134{
2087 void *va; 2135 int f = sizeof(unsigned long);
2088 2136
2089 if (!cpu_has_vmx_msr_bitmap()) 2137 if (!cpu_has_vmx_msr_bitmap())
2090 return; 2138 return;
@@ -2094,16 +2142,21 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
2094 * have the write-low and read-high bitmap offsets the wrong way round. 2142 * have the write-low and read-high bitmap offsets the wrong way round.
2095 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 2143 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
2096 */ 2144 */
2097 va = kmap(msr_bitmap);
2098 if (msr <= 0x1fff) { 2145 if (msr <= 0x1fff) {
2099 __clear_bit(msr, va + 0x000); /* read-low */ 2146 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
2100 __clear_bit(msr, va + 0x800); /* write-low */ 2147 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
2101 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 2148 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2102 msr &= 0x1fff; 2149 msr &= 0x1fff;
2103 __clear_bit(msr, va + 0x400); /* read-high */ 2150 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
2104 __clear_bit(msr, va + 0xc00); /* write-high */ 2151 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
2105 } 2152 }
2106 kunmap(msr_bitmap); 2153}
2154
2155static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2156{
2157 if (!longmode_only)
2158 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
2159 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
2107} 2160}
2108 2161
2109/* 2162/*
@@ -2121,11 +2174,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2121 u32 exec_control; 2174 u32 exec_control;
2122 2175
2123 /* I/O */ 2176 /* I/O */
2124 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); 2177 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
2125 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); 2178 vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
2126 2179
2127 if (cpu_has_vmx_msr_bitmap()) 2180 if (cpu_has_vmx_msr_bitmap())
2128 vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap)); 2181 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
2129 2182
2130 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 2183 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
2131 2184
@@ -2141,7 +2194,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2141 CPU_BASED_CR8_LOAD_EXITING; 2194 CPU_BASED_CR8_LOAD_EXITING;
2142#endif 2195#endif
2143 } 2196 }
2144 if (!vm_need_ept()) 2197 if (!enable_ept)
2145 exec_control |= CPU_BASED_CR3_STORE_EXITING | 2198 exec_control |= CPU_BASED_CR3_STORE_EXITING |
2146 CPU_BASED_CR3_LOAD_EXITING | 2199 CPU_BASED_CR3_LOAD_EXITING |
2147 CPU_BASED_INVLPG_EXITING; 2200 CPU_BASED_INVLPG_EXITING;
@@ -2154,7 +2207,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2154 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2207 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2155 if (vmx->vpid == 0) 2208 if (vmx->vpid == 0)
2156 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 2209 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2157 if (!vm_need_ept()) 2210 if (!enable_ept)
2158 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 2211 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2159 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2212 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2160 } 2213 }
@@ -2273,7 +2326,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2273 goto out; 2326 goto out;
2274 } 2327 }
2275 2328
2276 vmx->vcpu.arch.rmode.active = 0; 2329 vmx->vcpu.arch.rmode.vm86_active = 0;
2277 2330
2278 vmx->soft_vnmi_blocked = 0; 2331 vmx->soft_vnmi_blocked = 0;
2279 2332
@@ -2402,14 +2455,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2402 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2455 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2403} 2456}
2404 2457
2405static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) 2458static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2406{ 2459{
2407 struct vcpu_vmx *vmx = to_vmx(vcpu); 2460 struct vcpu_vmx *vmx = to_vmx(vcpu);
2461 uint32_t intr;
2462 int irq = vcpu->arch.interrupt.nr;
2408 2463
2409 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); 2464 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
2410 2465
2411 ++vcpu->stat.irq_injections; 2466 ++vcpu->stat.irq_injections;
2412 if (vcpu->arch.rmode.active) { 2467 if (vcpu->arch.rmode.vm86_active) {
2413 vmx->rmode.irq.pending = true; 2468 vmx->rmode.irq.pending = true;
2414 vmx->rmode.irq.vector = irq; 2469 vmx->rmode.irq.vector = irq;
2415 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 2470 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2419,8 +2474,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2419 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); 2474 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2420 return; 2475 return;
2421 } 2476 }
2422 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2477 intr = irq | INTR_INFO_VALID_MASK;
2423 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 2478 if (vcpu->arch.interrupt.soft) {
2479 intr |= INTR_TYPE_SOFT_INTR;
2480 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2481 vmx->vcpu.arch.event_exit_inst_len);
2482 } else
2483 intr |= INTR_TYPE_EXT_INTR;
2484 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
2424} 2485}
2425 2486
2426static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 2487static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2441,7 +2502,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2441 } 2502 }
2442 2503
2443 ++vcpu->stat.nmi_injections; 2504 ++vcpu->stat.nmi_injections;
2444 if (vcpu->arch.rmode.active) { 2505 if (vcpu->arch.rmode.vm86_active) {
2445 vmx->rmode.irq.pending = true; 2506 vmx->rmode.irq.pending = true;
2446 vmx->rmode.irq.vector = NMI_VECTOR; 2507 vmx->rmode.irq.vector = NMI_VECTOR;
2447 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 2508 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2456,76 +2517,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2456 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2517 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2457} 2518}
2458 2519
2459static void vmx_update_window_states(struct kvm_vcpu *vcpu) 2520static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2460{ 2521{
2461 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2462
2463 vcpu->arch.nmi_window_open =
2464 !(guest_intr & (GUEST_INTR_STATE_STI |
2465 GUEST_INTR_STATE_MOV_SS |
2466 GUEST_INTR_STATE_NMI));
2467 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) 2522 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
2468 vcpu->arch.nmi_window_open = 0; 2523 return 0;
2469
2470 vcpu->arch.interrupt_window_open =
2471 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2472 !(guest_intr & (GUEST_INTR_STATE_STI |
2473 GUEST_INTR_STATE_MOV_SS)));
2474}
2475
2476static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2477{
2478 int word_index = __ffs(vcpu->arch.irq_summary);
2479 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
2480 int irq = word_index * BITS_PER_LONG + bit_index;
2481 2524
2482 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); 2525 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2483 if (!vcpu->arch.irq_pending[word_index]) 2526 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
2484 clear_bit(word_index, &vcpu->arch.irq_summary); 2527 GUEST_INTR_STATE_NMI));
2485 kvm_queue_interrupt(vcpu, irq);
2486} 2528}
2487 2529
2488static void do_interrupt_requests(struct kvm_vcpu *vcpu, 2530static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
2489 struct kvm_run *kvm_run)
2490{ 2531{
2491 vmx_update_window_states(vcpu); 2532 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2492 2533 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2493 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 2534 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
2494 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2495 GUEST_INTR_STATE_STI |
2496 GUEST_INTR_STATE_MOV_SS);
2497
2498 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
2499 if (vcpu->arch.interrupt.pending) {
2500 enable_nmi_window(vcpu);
2501 } else if (vcpu->arch.nmi_window_open) {
2502 vcpu->arch.nmi_pending = false;
2503 vcpu->arch.nmi_injected = true;
2504 } else {
2505 enable_nmi_window(vcpu);
2506 return;
2507 }
2508 }
2509 if (vcpu->arch.nmi_injected) {
2510 vmx_inject_nmi(vcpu);
2511 if (vcpu->arch.nmi_pending)
2512 enable_nmi_window(vcpu);
2513 else if (vcpu->arch.irq_summary
2514 || kvm_run->request_interrupt_window)
2515 enable_irq_window(vcpu);
2516 return;
2517 }
2518
2519 if (vcpu->arch.interrupt_window_open) {
2520 if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
2521 kvm_do_inject_irq(vcpu);
2522
2523 if (vcpu->arch.interrupt.pending)
2524 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2525 }
2526 if (!vcpu->arch.interrupt_window_open &&
2527 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
2528 enable_irq_window(vcpu);
2529} 2535}
2530 2536
2531static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 2537static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2585,6 +2591,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2585 return 0; 2591 return 0;
2586} 2592}
2587 2593
2594/*
2595 * Trigger machine check on the host. We assume all the MSRs are already set up
2596 * by the CPU and that we still run on the same CPU as the MCE occurred on.
2597 * We pass a fake environment to the machine check handler because we want
2598 * the guest to be always treated like user space, no matter what context
2599 * it used internally.
2600 */
2601static void kvm_machine_check(void)
2602{
2603#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
2604 struct pt_regs regs = {
2605 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
2606 .flags = X86_EFLAGS_IF,
2607 };
2608
2609 do_machine_check(&regs, 0);
2610#endif
2611}
2612
2613static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2614{
2615 /* already handled by vcpu_run */
2616 return 1;
2617}
2618
2588static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2619static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2589{ 2620{
2590 struct vcpu_vmx *vmx = to_vmx(vcpu); 2621 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2596,17 +2627,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2596 vect_info = vmx->idt_vectoring_info; 2627 vect_info = vmx->idt_vectoring_info;
2597 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2628 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2598 2629
2630 if (is_machine_check(intr_info))
2631 return handle_machine_check(vcpu, kvm_run);
2632
2599 if ((vect_info & VECTORING_INFO_VALID_MASK) && 2633 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2600 !is_page_fault(intr_info)) 2634 !is_page_fault(intr_info))
2601 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 2635 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
2602 "intr info 0x%x\n", __func__, vect_info, intr_info); 2636 "intr info 0x%x\n", __func__, vect_info, intr_info);
2603 2637
2604 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
2605 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2606 set_bit(irq, vcpu->arch.irq_pending);
2607 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
2608 }
2609
2610 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 2638 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2611 return 1; /* already handled by vmx_vcpu_run() */ 2639 return 1; /* already handled by vmx_vcpu_run() */
2612 2640
@@ -2628,17 +2656,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2628 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 2656 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2629 if (is_page_fault(intr_info)) { 2657 if (is_page_fault(intr_info)) {
2630 /* EPT won't cause page fault directly */ 2658 /* EPT won't cause page fault directly */
2631 if (vm_need_ept()) 2659 if (enable_ept)
2632 BUG(); 2660 BUG();
2633 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2661 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2634 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2662 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2635 (u32)((u64)cr2 >> 32), handler); 2663 (u32)((u64)cr2 >> 32), handler);
2636 if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) 2664 if (kvm_event_needs_reinjection(vcpu))
2637 kvm_mmu_unprotect_page_virt(vcpu, cr2); 2665 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2638 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2666 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2639 } 2667 }
2640 2668
2641 if (vcpu->arch.rmode.active && 2669 if (vcpu->arch.rmode.vm86_active &&
2642 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, 2670 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2643 error_code)) { 2671 error_code)) {
2644 if (vcpu->arch.halt_request) { 2672 if (vcpu->arch.halt_request) {
@@ -2753,13 +2781,18 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2753 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); 2781 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
2754 skip_emulated_instruction(vcpu); 2782 skip_emulated_instruction(vcpu);
2755 return 1; 2783 return 1;
2756 case 8: 2784 case 8: {
2757 kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); 2785 u8 cr8_prev = kvm_get_cr8(vcpu);
2758 skip_emulated_instruction(vcpu); 2786 u8 cr8 = kvm_register_read(vcpu, reg);
2759 if (irqchip_in_kernel(vcpu->kvm)) 2787 kvm_set_cr8(vcpu, cr8);
2760 return 1; 2788 skip_emulated_instruction(vcpu);
2761 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2789 if (irqchip_in_kernel(vcpu->kvm))
2762 return 0; 2790 return 1;
2791 if (cr8_prev <= cr8)
2792 return 1;
2793 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2794 return 0;
2795 }
2763 }; 2796 };
2764 break; 2797 break;
2765 case 2: /* clts */ 2798 case 2: /* clts */
@@ -2957,8 +2990,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2957 * If the user space waits to inject interrupts, exit as soon as 2990 * If the user space waits to inject interrupts, exit as soon as
2958 * possible 2991 * possible
2959 */ 2992 */
2960 if (kvm_run->request_interrupt_window && 2993 if (!irqchip_in_kernel(vcpu->kvm) &&
2961 !vcpu->arch.irq_summary) { 2994 kvm_run->request_interrupt_window &&
2995 !kvm_cpu_has_interrupt(vcpu)) {
2962 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2996 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2963 return 0; 2997 return 0;
2964 } 2998 }
@@ -2980,7 +3014,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2980 3014
2981static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3015static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2982{ 3016{
2983 u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 3017 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2984 3018
2985 kvm_mmu_invlpg(vcpu, exit_qualification); 3019 kvm_mmu_invlpg(vcpu, exit_qualification);
2986 skip_emulated_instruction(vcpu); 3020 skip_emulated_instruction(vcpu);
@@ -2996,11 +3030,11 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2996 3030
2997static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3031static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2998{ 3032{
2999 u64 exit_qualification; 3033 unsigned long exit_qualification;
3000 enum emulation_result er; 3034 enum emulation_result er;
3001 unsigned long offset; 3035 unsigned long offset;
3002 3036
3003 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 3037 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3004 offset = exit_qualification & 0xffful; 3038 offset = exit_qualification & 0xffful;
3005 3039
3006 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3040 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3019,22 +3053,41 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3019 struct vcpu_vmx *vmx = to_vmx(vcpu); 3053 struct vcpu_vmx *vmx = to_vmx(vcpu);
3020 unsigned long exit_qualification; 3054 unsigned long exit_qualification;
3021 u16 tss_selector; 3055 u16 tss_selector;
3022 int reason; 3056 int reason, type, idt_v;
3057
3058 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
3059 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
3023 3060
3024 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3061 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3025 3062
3026 reason = (u32)exit_qualification >> 30; 3063 reason = (u32)exit_qualification >> 30;
3027 if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && 3064 if (reason == TASK_SWITCH_GATE && idt_v) {
3028 (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 3065 switch (type) {
3029 (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) 3066 case INTR_TYPE_NMI_INTR:
3030 == INTR_TYPE_NMI_INTR) { 3067 vcpu->arch.nmi_injected = false;
3031 vcpu->arch.nmi_injected = false; 3068 if (cpu_has_virtual_nmis())
3032 if (cpu_has_virtual_nmis()) 3069 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3033 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3070 GUEST_INTR_STATE_NMI);
3034 GUEST_INTR_STATE_NMI); 3071 break;
3072 case INTR_TYPE_EXT_INTR:
3073 case INTR_TYPE_SOFT_INTR:
3074 kvm_clear_interrupt_queue(vcpu);
3075 break;
3076 case INTR_TYPE_HARD_EXCEPTION:
3077 case INTR_TYPE_SOFT_EXCEPTION:
3078 kvm_clear_exception_queue(vcpu);
3079 break;
3080 default:
3081 break;
3082 }
3035 } 3083 }
3036 tss_selector = exit_qualification; 3084 tss_selector = exit_qualification;
3037 3085
3086 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
3087 type != INTR_TYPE_EXT_INTR &&
3088 type != INTR_TYPE_NMI_INTR))
3089 skip_emulated_instruction(vcpu);
3090
3038 if (!kvm_task_switch(vcpu, tss_selector, reason)) 3091 if (!kvm_task_switch(vcpu, tss_selector, reason))
3039 return 0; 3092 return 0;
3040 3093
@@ -3051,11 +3104,11 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3051 3104
3052static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3105static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3053{ 3106{
3054 u64 exit_qualification; 3107 unsigned long exit_qualification;
3055 gpa_t gpa; 3108 gpa_t gpa;
3056 int gla_validity; 3109 int gla_validity;
3057 3110
3058 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 3111 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3059 3112
3060 if (exit_qualification & (1 << 6)) { 3113 if (exit_qualification & (1 << 6)) {
3061 printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); 3114 printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
@@ -3067,7 +3120,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3067 printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); 3120 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
3068 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", 3121 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
3069 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), 3122 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
3070 (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); 3123 vmcs_readl(GUEST_LINEAR_ADDRESS));
3071 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 3124 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3072 (long unsigned int)exit_qualification); 3125 (long unsigned int)exit_qualification);
3073 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3126 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -3150,6 +3203,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3150 [EXIT_REASON_WBINVD] = handle_wbinvd, 3203 [EXIT_REASON_WBINVD] = handle_wbinvd,
3151 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 3204 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
3152 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3205 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3206 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3153}; 3207};
3154 3208
3155static const int kvm_vmx_max_exit_handlers = 3209static const int kvm_vmx_max_exit_handlers =
@@ -3159,10 +3213,10 @@ static const int kvm_vmx_max_exit_handlers =
3159 * The guest has exited. See if we can fix it or if we need userspace 3213 * The guest has exited. See if we can fix it or if we need userspace
3160 * assistance. 3214 * assistance.
3161 */ 3215 */
3162static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3216static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3163{ 3217{
3164 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
3165 struct vcpu_vmx *vmx = to_vmx(vcpu); 3218 struct vcpu_vmx *vmx = to_vmx(vcpu);
3219 u32 exit_reason = vmx->exit_reason;
3166 u32 vectoring_info = vmx->idt_vectoring_info; 3220 u32 vectoring_info = vmx->idt_vectoring_info;
3167 3221
3168 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), 3222 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
@@ -3178,7 +3232,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3178 3232
3179 /* Access CR3 don't cause VMExit in paging mode, so we need 3233 /* Access CR3 don't cause VMExit in paging mode, so we need
3180 * to sync with guest real CR3. */ 3234 * to sync with guest real CR3. */
3181 if (vm_need_ept() && is_paging(vcpu)) { 3235 if (enable_ept && is_paging(vcpu)) {
3182 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3236 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3183 ept_load_pdptrs(vcpu); 3237 ept_load_pdptrs(vcpu);
3184 } 3238 }
@@ -3199,9 +3253,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3199 __func__, vectoring_info, exit_reason); 3253 __func__, vectoring_info, exit_reason);
3200 3254
3201 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { 3255 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
3202 if (vcpu->arch.interrupt_window_open) { 3256 if (vmx_interrupt_allowed(vcpu)) {
3203 vmx->soft_vnmi_blocked = 0; 3257 vmx->soft_vnmi_blocked = 0;
3204 vcpu->arch.nmi_window_open = 1;
3205 } else if (vmx->vnmi_blocked_time > 1000000000LL && 3258 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
3206 vcpu->arch.nmi_pending) { 3259 vcpu->arch.nmi_pending) {
3207 /* 3260 /*
@@ -3214,7 +3267,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3214 "state on VCPU %d after 1 s timeout\n", 3267 "state on VCPU %d after 1 s timeout\n",
3215 __func__, vcpu->vcpu_id); 3268 __func__, vcpu->vcpu_id);
3216 vmx->soft_vnmi_blocked = 0; 3269 vmx->soft_vnmi_blocked = 0;
3217 vmx->vcpu.arch.nmi_window_open = 1;
3218 } 3270 }
3219 } 3271 }
3220 3272
@@ -3228,122 +3280,107 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3228 return 0; 3280 return 0;
3229} 3281}
3230 3282
3231static void update_tpr_threshold(struct kvm_vcpu *vcpu) 3283static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3232{ 3284{
3233 int max_irr, tpr; 3285 if (irr == -1 || tpr < irr) {
3234
3235 if (!vm_need_tpr_shadow(vcpu->kvm))
3236 return;
3237
3238 if (!kvm_lapic_enabled(vcpu) ||
3239 ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
3240 vmcs_write32(TPR_THRESHOLD, 0); 3286 vmcs_write32(TPR_THRESHOLD, 0);
3241 return; 3287 return;
3242 } 3288 }
3243 3289
3244 tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; 3290 vmcs_write32(TPR_THRESHOLD, irr);
3245 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
3246} 3291}
3247 3292
3248static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 3293static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3249{ 3294{
3250 u32 exit_intr_info; 3295 u32 exit_intr_info;
3251 u32 idt_vectoring_info; 3296 u32 idt_vectoring_info = vmx->idt_vectoring_info;
3252 bool unblock_nmi; 3297 bool unblock_nmi;
3253 u8 vector; 3298 u8 vector;
3254 int type; 3299 int type;
3255 bool idtv_info_valid; 3300 bool idtv_info_valid;
3256 u32 error;
3257 3301
3258 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3302 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3303
3304 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
3305
3306 /* Handle machine checks before interrupts are enabled */
3307 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
3308 || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
3309 && is_machine_check(exit_intr_info)))
3310 kvm_machine_check();
3311
3312 /* We need to handle NMIs before interrupts are enabled */
3313 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3314 (exit_intr_info & INTR_INFO_VALID_MASK)) {
3315 KVMTRACE_0D(NMI, &vmx->vcpu, handler);
3316 asm("int $2");
3317 }
3318
3319 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3320
3259 if (cpu_has_virtual_nmis()) { 3321 if (cpu_has_virtual_nmis()) {
3260 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 3322 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3261 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 3323 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3262 /* 3324 /*
3263 * SDM 3: 25.7.1.2 3325 * SDM 3: 27.7.1.2 (September 2008)
3264 * Re-set bit "block by NMI" before VM entry if vmexit caused by 3326 * Re-set bit "block by NMI" before VM entry if vmexit caused by
3265 * a guest IRET fault. 3327 * a guest IRET fault.
3328 * SDM 3: 23.2.2 (September 2008)
3329 * Bit 12 is undefined in any of the following cases:
3330 * If the VM exit sets the valid bit in the IDT-vectoring
3331 * information field.
3332 * If the VM exit is due to a double fault.
3266 */ 3333 */
3267 if (unblock_nmi && vector != DF_VECTOR) 3334 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
3335 vector != DF_VECTOR && !idtv_info_valid)
3268 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3336 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3269 GUEST_INTR_STATE_NMI); 3337 GUEST_INTR_STATE_NMI);
3270 } else if (unlikely(vmx->soft_vnmi_blocked)) 3338 } else if (unlikely(vmx->soft_vnmi_blocked))
3271 vmx->vnmi_blocked_time += 3339 vmx->vnmi_blocked_time +=
3272 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 3340 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
3273 3341
3274 idt_vectoring_info = vmx->idt_vectoring_info; 3342 vmx->vcpu.arch.nmi_injected = false;
3275 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3343 kvm_clear_exception_queue(&vmx->vcpu);
3344 kvm_clear_interrupt_queue(&vmx->vcpu);
3345
3346 if (!idtv_info_valid)
3347 return;
3348
3276 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 3349 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3277 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 3350 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3278 if (vmx->vcpu.arch.nmi_injected) { 3351
3352 switch (type) {
3353 case INTR_TYPE_NMI_INTR:
3354 vmx->vcpu.arch.nmi_injected = true;
3279 /* 3355 /*
3280 * SDM 3: 25.7.1.2 3356 * SDM 3: 27.7.1.2 (September 2008)
3281 * Clear bit "block by NMI" before VM entry if a NMI delivery 3357 * Clear bit "block by NMI" before VM entry if a NMI
3282 * faulted. 3358 * delivery faulted.
3283 */ 3359 */
3284 if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) 3360 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3285 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 3361 GUEST_INTR_STATE_NMI);
3286 GUEST_INTR_STATE_NMI); 3362 break;
3287 else 3363 case INTR_TYPE_SOFT_EXCEPTION:
3288 vmx->vcpu.arch.nmi_injected = false; 3364 vmx->vcpu.arch.event_exit_inst_len =
3289 } 3365 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3290 kvm_clear_exception_queue(&vmx->vcpu); 3366 /* fall through */
3291 if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION || 3367 case INTR_TYPE_HARD_EXCEPTION:
3292 type == INTR_TYPE_SOFT_EXCEPTION)) {
3293 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 3368 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3294 error = vmcs_read32(IDT_VECTORING_ERROR_CODE); 3369 u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
3295 kvm_queue_exception_e(&vmx->vcpu, vector, error); 3370 kvm_queue_exception_e(&vmx->vcpu, vector, err);
3296 } else 3371 } else
3297 kvm_queue_exception(&vmx->vcpu, vector); 3372 kvm_queue_exception(&vmx->vcpu, vector);
3298 vmx->idt_vectoring_info = 0; 3373 break;
3299 } 3374 case INTR_TYPE_SOFT_INTR:
3300 kvm_clear_interrupt_queue(&vmx->vcpu); 3375 vmx->vcpu.arch.event_exit_inst_len =
3301 if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { 3376 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3302 kvm_queue_interrupt(&vmx->vcpu, vector); 3377 /* fall through */
3303 vmx->idt_vectoring_info = 0; 3378 case INTR_TYPE_EXT_INTR:
3304 } 3379 kvm_queue_interrupt(&vmx->vcpu, vector,
3305} 3380 type == INTR_TYPE_SOFT_INTR);
3306 3381 break;
3307static void vmx_intr_assist(struct kvm_vcpu *vcpu) 3382 default:
3308{ 3383 break;
3309 update_tpr_threshold(vcpu);
3310
3311 vmx_update_window_states(vcpu);
3312
3313 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3314 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3315 GUEST_INTR_STATE_STI |
3316 GUEST_INTR_STATE_MOV_SS);
3317
3318 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
3319 if (vcpu->arch.interrupt.pending) {
3320 enable_nmi_window(vcpu);
3321 } else if (vcpu->arch.nmi_window_open) {
3322 vcpu->arch.nmi_pending = false;
3323 vcpu->arch.nmi_injected = true;
3324 } else {
3325 enable_nmi_window(vcpu);
3326 return;
3327 }
3328 }
3329 if (vcpu->arch.nmi_injected) {
3330 vmx_inject_nmi(vcpu);
3331 if (vcpu->arch.nmi_pending)
3332 enable_nmi_window(vcpu);
3333 else if (kvm_cpu_has_interrupt(vcpu))
3334 enable_irq_window(vcpu);
3335 return;
3336 }
3337 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
3338 if (vcpu->arch.interrupt_window_open)
3339 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
3340 else
3341 enable_irq_window(vcpu);
3342 }
3343 if (vcpu->arch.interrupt.pending) {
3344 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
3345 if (kvm_cpu_has_interrupt(vcpu))
3346 enable_irq_window(vcpu);
3347 } 3384 }
3348} 3385}
3349 3386
@@ -3381,7 +3418,6 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3381static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3418static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3382{ 3419{
3383 struct vcpu_vmx *vmx = to_vmx(vcpu); 3420 struct vcpu_vmx *vmx = to_vmx(vcpu);
3384 u32 intr_info;
3385 3421
3386 /* Record the guest's net vcpu time for enforced NMI injections. */ 3422 /* Record the guest's net vcpu time for enforced NMI injections. */
3387 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 3423 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
@@ -3505,20 +3541,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3505 if (vmx->rmode.irq.pending) 3541 if (vmx->rmode.irq.pending)
3506 fixup_rmode_irq(vmx); 3542 fixup_rmode_irq(vmx);
3507 3543
3508 vmx_update_window_states(vcpu);
3509
3510 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 3544 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3511 vmx->launched = 1; 3545 vmx->launched = 1;
3512 3546
3513 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3514
3515 /* We need to handle NMIs before interrupts are enabled */
3516 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3517 (intr_info & INTR_INFO_VALID_MASK)) {
3518 KVMTRACE_0D(NMI, vcpu, handler);
3519 asm("int $2");
3520 }
3521
3522 vmx_complete_interrupts(vmx); 3547 vmx_complete_interrupts(vmx);
3523} 3548}
3524 3549
@@ -3593,7 +3618,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3593 if (alloc_apic_access_page(kvm) != 0) 3618 if (alloc_apic_access_page(kvm) != 0)
3594 goto free_vmcs; 3619 goto free_vmcs;
3595 3620
3596 if (vm_need_ept()) 3621 if (enable_ept)
3597 if (alloc_identity_pagetable(kvm) != 0) 3622 if (alloc_identity_pagetable(kvm) != 0)
3598 goto free_vmcs; 3623 goto free_vmcs;
3599 3624
@@ -3631,9 +3656,32 @@ static int get_ept_level(void)
3631 return VMX_EPT_DEFAULT_GAW + 1; 3656 return VMX_EPT_DEFAULT_GAW + 1;
3632} 3657}
3633 3658
3634static int vmx_get_mt_mask_shift(void) 3659static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3635{ 3660{
3636 return VMX_EPT_MT_EPTE_SHIFT; 3661 u64 ret;
3662
3663 /* For VT-d and EPT combination
3664 * 1. MMIO: always map as UC
3665 * 2. EPT with VT-d:
3666 * a. VT-d without snooping control feature: can't guarantee the
3667 * result, try to trust guest.
3668 * b. VT-d with snooping control feature: snooping control feature of
3669 * VT-d engine can guarantee the cache correctness. Just set it
3670 * to WB to keep consistent with host. So the same as item 3.
3671 * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep
3672 * consistent with host MTRR
3673 */
3674 if (is_mmio)
3675 ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
3676 else if (vcpu->kvm->arch.iommu_domain &&
3677 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
3678 ret = kvm_get_guest_memory_type(vcpu, gfn) <<
3679 VMX_EPT_MT_EPTE_SHIFT;
3680 else
3681 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
3682 | VMX_EPT_IGMT_BIT;
3683
3684 return ret;
3637} 3685}
3638 3686
3639static struct kvm_x86_ops vmx_x86_ops = { 3687static struct kvm_x86_ops vmx_x86_ops = {
@@ -3644,7 +3692,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3644 .check_processor_compatibility = vmx_check_processor_compat, 3692 .check_processor_compatibility = vmx_check_processor_compat,
3645 .hardware_enable = hardware_enable, 3693 .hardware_enable = hardware_enable,
3646 .hardware_disable = hardware_disable, 3694 .hardware_disable = hardware_disable,
3647 .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, 3695 .cpu_has_accelerated_tpr = report_flexpriority,
3648 3696
3649 .vcpu_create = vmx_create_vcpu, 3697 .vcpu_create = vmx_create_vcpu,
3650 .vcpu_free = vmx_free_vcpu, 3698 .vcpu_free = vmx_free_vcpu,
@@ -3678,78 +3726,82 @@ static struct kvm_x86_ops vmx_x86_ops = {
3678 .tlb_flush = vmx_flush_tlb, 3726 .tlb_flush = vmx_flush_tlb,
3679 3727
3680 .run = vmx_vcpu_run, 3728 .run = vmx_vcpu_run,
3681 .handle_exit = kvm_handle_exit, 3729 .handle_exit = vmx_handle_exit,
3682 .skip_emulated_instruction = skip_emulated_instruction, 3730 .skip_emulated_instruction = skip_emulated_instruction,
3731 .set_interrupt_shadow = vmx_set_interrupt_shadow,
3732 .get_interrupt_shadow = vmx_get_interrupt_shadow,
3683 .patch_hypercall = vmx_patch_hypercall, 3733 .patch_hypercall = vmx_patch_hypercall,
3684 .get_irq = vmx_get_irq,
3685 .set_irq = vmx_inject_irq, 3734 .set_irq = vmx_inject_irq,
3735 .set_nmi = vmx_inject_nmi,
3686 .queue_exception = vmx_queue_exception, 3736 .queue_exception = vmx_queue_exception,
3687 .exception_injected = vmx_exception_injected, 3737 .interrupt_allowed = vmx_interrupt_allowed,
3688 .inject_pending_irq = vmx_intr_assist, 3738 .nmi_allowed = vmx_nmi_allowed,
3689 .inject_pending_vectors = do_interrupt_requests, 3739 .enable_nmi_window = enable_nmi_window,
3740 .enable_irq_window = enable_irq_window,
3741 .update_cr8_intercept = update_cr8_intercept,
3690 3742
3691 .set_tss_addr = vmx_set_tss_addr, 3743 .set_tss_addr = vmx_set_tss_addr,
3692 .get_tdp_level = get_ept_level, 3744 .get_tdp_level = get_ept_level,
3693 .get_mt_mask_shift = vmx_get_mt_mask_shift, 3745 .get_mt_mask = vmx_get_mt_mask,
3694}; 3746};
3695 3747
3696static int __init vmx_init(void) 3748static int __init vmx_init(void)
3697{ 3749{
3698 void *va;
3699 int r; 3750 int r;
3700 3751
3701 vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 3752 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
3702 if (!vmx_io_bitmap_a) 3753 if (!vmx_io_bitmap_a)
3703 return -ENOMEM; 3754 return -ENOMEM;
3704 3755
3705 vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 3756 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
3706 if (!vmx_io_bitmap_b) { 3757 if (!vmx_io_bitmap_b) {
3707 r = -ENOMEM; 3758 r = -ENOMEM;
3708 goto out; 3759 goto out;
3709 } 3760 }
3710 3761
3711 vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 3762 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
3712 if (!vmx_msr_bitmap) { 3763 if (!vmx_msr_bitmap_legacy) {
3713 r = -ENOMEM; 3764 r = -ENOMEM;
3714 goto out1; 3765 goto out1;
3715 } 3766 }
3716 3767
3768 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
3769 if (!vmx_msr_bitmap_longmode) {
3770 r = -ENOMEM;
3771 goto out2;
3772 }
3773
3717 /* 3774 /*
3718 * Allow direct access to the PC debug port (it is often used for I/O 3775 * Allow direct access to the PC debug port (it is often used for I/O
3719 * delays, but the vmexits simply slow things down). 3776 * delays, but the vmexits simply slow things down).
3720 */ 3777 */
3721 va = kmap(vmx_io_bitmap_a); 3778 memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
3722 memset(va, 0xff, PAGE_SIZE); 3779 clear_bit(0x80, vmx_io_bitmap_a);
3723 clear_bit(0x80, va);
3724 kunmap(vmx_io_bitmap_a);
3725 3780
3726 va = kmap(vmx_io_bitmap_b); 3781 memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
3727 memset(va, 0xff, PAGE_SIZE);
3728 kunmap(vmx_io_bitmap_b);
3729 3782
3730 va = kmap(vmx_msr_bitmap); 3783 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
3731 memset(va, 0xff, PAGE_SIZE); 3784 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
3732 kunmap(vmx_msr_bitmap);
3733 3785
3734 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 3786 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
3735 3787
3736 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); 3788 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
3737 if (r) 3789 if (r)
3738 goto out2; 3790 goto out3;
3739 3791
3740 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE); 3792 vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
3741 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE); 3793 vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
3742 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS); 3794 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
3743 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); 3795 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
3744 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); 3796 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
3797 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
3745 3798
3746 if (vm_need_ept()) { 3799 if (enable_ept) {
3747 bypass_guest_pf = 0; 3800 bypass_guest_pf = 0;
3748 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | 3801 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3749 VMX_EPT_WRITABLE_MASK); 3802 VMX_EPT_WRITABLE_MASK);
3750 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 3803 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3751 VMX_EPT_EXECUTABLE_MASK, 3804 VMX_EPT_EXECUTABLE_MASK);
3752 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3753 kvm_enable_tdp(); 3805 kvm_enable_tdp();
3754 } else 3806 } else
3755 kvm_disable_tdp(); 3807 kvm_disable_tdp();
@@ -3761,20 +3813,23 @@ static int __init vmx_init(void)
3761 3813
3762 return 0; 3814 return 0;
3763 3815
3816out3:
3817 free_page((unsigned long)vmx_msr_bitmap_longmode);
3764out2: 3818out2:
3765 __free_page(vmx_msr_bitmap); 3819 free_page((unsigned long)vmx_msr_bitmap_legacy);
3766out1: 3820out1:
3767 __free_page(vmx_io_bitmap_b); 3821 free_page((unsigned long)vmx_io_bitmap_b);
3768out: 3822out:
3769 __free_page(vmx_io_bitmap_a); 3823 free_page((unsigned long)vmx_io_bitmap_a);
3770 return r; 3824 return r;
3771} 3825}
3772 3826
3773static void __exit vmx_exit(void) 3827static void __exit vmx_exit(void)
3774{ 3828{
3775 __free_page(vmx_msr_bitmap); 3829 free_page((unsigned long)vmx_msr_bitmap_legacy);
3776 __free_page(vmx_io_bitmap_b); 3830 free_page((unsigned long)vmx_msr_bitmap_longmode);
3777 __free_page(vmx_io_bitmap_a); 3831 free_page((unsigned long)vmx_io_bitmap_b);
3832 free_page((unsigned long)vmx_io_bitmap_a);
3778 3833
3779 kvm_exit(); 3834 kvm_exit();
3780} 3835}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3944e917e794..249540f98513 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -91,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
91 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 91 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
92 { "hypercalls", VCPU_STAT(hypercalls) }, 92 { "hypercalls", VCPU_STAT(hypercalls) },
93 { "request_irq", VCPU_STAT(request_irq_exits) }, 93 { "request_irq", VCPU_STAT(request_irq_exits) },
94 { "request_nmi", VCPU_STAT(request_nmi_exits) },
95 { "irq_exits", VCPU_STAT(irq_exits) }, 94 { "irq_exits", VCPU_STAT(irq_exits) },
96 { "host_state_reload", VCPU_STAT(host_state_reload) }, 95 { "host_state_reload", VCPU_STAT(host_state_reload) },
97 { "efer_reload", VCPU_STAT(efer_reload) }, 96 { "efer_reload", VCPU_STAT(efer_reload) },
@@ -108,7 +107,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
108 { "mmu_recycled", VM_STAT(mmu_recycled) }, 107 { "mmu_recycled", VM_STAT(mmu_recycled) },
109 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 108 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
110 { "mmu_unsync", VM_STAT(mmu_unsync) }, 109 { "mmu_unsync", VM_STAT(mmu_unsync) },
111 { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
112 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 110 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
113 { "largepages", VM_STAT(lpages) }, 111 { "largepages", VM_STAT(lpages) },
114 { NULL } 112 { NULL }
@@ -234,7 +232,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
234 goto out; 232 goto out;
235 } 233 }
236 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 234 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
237 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { 235 if (is_present_pte(pdpte[i]) &&
236 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
238 ret = 0; 237 ret = 0;
239 goto out; 238 goto out;
240 } 239 }
@@ -321,7 +320,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
321 kvm_x86_ops->set_cr0(vcpu, cr0); 320 kvm_x86_ops->set_cr0(vcpu, cr0);
322 vcpu->arch.cr0 = cr0; 321 vcpu->arch.cr0 = cr0;
323 322
324 kvm_mmu_sync_global(vcpu);
325 kvm_mmu_reset_context(vcpu); 323 kvm_mmu_reset_context(vcpu);
326 return; 324 return;
327} 325}
@@ -370,7 +368,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
370 kvm_x86_ops->set_cr4(vcpu, cr4); 368 kvm_x86_ops->set_cr4(vcpu, cr4);
371 vcpu->arch.cr4 = cr4; 369 vcpu->arch.cr4 = cr4;
372 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; 370 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
373 kvm_mmu_sync_global(vcpu);
374 kvm_mmu_reset_context(vcpu); 371 kvm_mmu_reset_context(vcpu);
375} 372}
376EXPORT_SYMBOL_GPL(kvm_set_cr4); 373EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -523,6 +520,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
523 efer |= vcpu->arch.shadow_efer & EFER_LMA; 520 efer |= vcpu->arch.shadow_efer & EFER_LMA;
524 521
525 vcpu->arch.shadow_efer = efer; 522 vcpu->arch.shadow_efer = efer;
523
524 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
525 kvm_mmu_reset_context(vcpu);
526} 526}
527 527
528void kvm_enable_efer_bits(u64 mask) 528void kvm_enable_efer_bits(u64 mask)
@@ -630,14 +630,17 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
630 unsigned long flags; 630 unsigned long flags;
631 struct kvm_vcpu_arch *vcpu = &v->arch; 631 struct kvm_vcpu_arch *vcpu = &v->arch;
632 void *shared_kaddr; 632 void *shared_kaddr;
633 unsigned long this_tsc_khz;
633 634
634 if ((!vcpu->time_page)) 635 if ((!vcpu->time_page))
635 return; 636 return;
636 637
637 if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) { 638 this_tsc_khz = get_cpu_var(cpu_tsc_khz);
638 kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock); 639 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
639 vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz); 640 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
641 vcpu->hv_clock_tsc_khz = this_tsc_khz;
640 } 642 }
643 put_cpu_var(cpu_tsc_khz);
641 644
642 /* Keep irq disabled to prevent changes to the clock */ 645 /* Keep irq disabled to prevent changes to the clock */
643 local_irq_save(flags); 646 local_irq_save(flags);
@@ -893,6 +896,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
893 case MSR_IA32_LASTINTFROMIP: 896 case MSR_IA32_LASTINTFROMIP:
894 case MSR_IA32_LASTINTTOIP: 897 case MSR_IA32_LASTINTTOIP:
895 case MSR_VM_HSAVE_PA: 898 case MSR_VM_HSAVE_PA:
899 case MSR_P6_EVNTSEL0:
900 case MSR_P6_EVNTSEL1:
896 data = 0; 901 data = 0;
897 break; 902 break;
898 case MSR_MTRRcap: 903 case MSR_MTRRcap:
@@ -1024,6 +1029,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1024 case KVM_CAP_SYNC_MMU: 1029 case KVM_CAP_SYNC_MMU:
1025 case KVM_CAP_REINJECT_CONTROL: 1030 case KVM_CAP_REINJECT_CONTROL:
1026 case KVM_CAP_IRQ_INJECT_STATUS: 1031 case KVM_CAP_IRQ_INJECT_STATUS:
1032 case KVM_CAP_ASSIGN_DEV_IRQ:
1027 r = 1; 1033 r = 1;
1028 break; 1034 break;
1029 case KVM_CAP_COALESCED_MMIO: 1035 case KVM_CAP_COALESCED_MMIO:
@@ -1241,41 +1247,53 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1241 entry->flags = 0; 1247 entry->flags = 0;
1242} 1248}
1243 1249
1250#define F(x) bit(X86_FEATURE_##x)
1251
1244static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1252static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1245 u32 index, int *nent, int maxnent) 1253 u32 index, int *nent, int maxnent)
1246{ 1254{
1247 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | 1255 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1248 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1249 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1250 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1251 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1252 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
1253 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1254 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
1255 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
1256 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
1257 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
1258 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1259 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1260 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1261 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1262 bit(X86_FEATURE_PGE) |
1263 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1264 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
1265 bit(X86_FEATURE_SYSCALL) |
1266 (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
1267#ifdef CONFIG_X86_64 1256#ifdef CONFIG_X86_64
1268 bit(X86_FEATURE_LM) | 1257 unsigned f_lm = F(LM);
1258#else
1259 unsigned f_lm = 0;
1269#endif 1260#endif
1270 bit(X86_FEATURE_FXSR_OPT) | 1261
1271 bit(X86_FEATURE_MMXEXT) | 1262 /* cpuid 1.edx */
1272 bit(X86_FEATURE_3DNOWEXT) | 1263 const u32 kvm_supported_word0_x86_features =
1273 bit(X86_FEATURE_3DNOW); 1264 F(FPU) | F(VME) | F(DE) | F(PSE) |
1274 const u32 kvm_supported_word3_x86_features = 1265 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1275 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); 1266 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
1267 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1268 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
1269 0 /* Reserved, DS, ACPI */ | F(MMX) |
1270 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
1271 0 /* HTT, TM, Reserved, PBE */;
1272 /* cpuid 0x80000001.edx */
1273 const u32 kvm_supported_word1_x86_features =
1274 F(FPU) | F(VME) | F(DE) | F(PSE) |
1275 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1276 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
1277 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1278 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1279 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1280 F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
1281 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1282 /* cpuid 1.ecx */
1283 const u32 kvm_supported_word4_x86_features =
1284 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
1285 0 /* DS-CPL, VMX, SMX, EST */ |
1286 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1287 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1288 0 /* Reserved, DCA */ | F(XMM4_1) |
1289 F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
1290 0 /* Reserved, XSAVE, OSXSAVE */;
1291 /* cpuid 0x80000001.ecx */
1276 const u32 kvm_supported_word6_x86_features = 1292 const u32 kvm_supported_word6_x86_features =
1277 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | 1293 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1278 bit(X86_FEATURE_SVM); 1294 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1295 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
1296 0 /* SKINIT */ | 0 /* WDT */;
1279 1297
1280 /* all calls to cpuid_count() should be made on the same cpu */ 1298 /* all calls to cpuid_count() should be made on the same cpu */
1281 get_cpu(); 1299 get_cpu();
@@ -1288,7 +1306,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1288 break; 1306 break;
1289 case 1: 1307 case 1:
1290 entry->edx &= kvm_supported_word0_x86_features; 1308 entry->edx &= kvm_supported_word0_x86_features;
1291 entry->ecx &= kvm_supported_word3_x86_features; 1309 entry->ecx &= kvm_supported_word4_x86_features;
1292 break; 1310 break;
1293 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1311 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1294 * may return different values. This forces us to get_cpu() before 1312 * may return different values. This forces us to get_cpu() before
@@ -1350,6 +1368,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1350 put_cpu(); 1368 put_cpu();
1351} 1369}
1352 1370
1371#undef F
1372
1353static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1373static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1354 struct kvm_cpuid_entry2 __user *entries) 1374 struct kvm_cpuid_entry2 __user *entries)
1355{ 1375{
@@ -1421,8 +1441,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1421 return -ENXIO; 1441 return -ENXIO;
1422 vcpu_load(vcpu); 1442 vcpu_load(vcpu);
1423 1443
1424 set_bit(irq->irq, vcpu->arch.irq_pending); 1444 kvm_queue_interrupt(vcpu, irq->irq, false);
1425 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1426 1445
1427 vcpu_put(vcpu); 1446 vcpu_put(vcpu);
1428 1447
@@ -1584,8 +1603,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1584 r = -EINVAL; 1603 r = -EINVAL;
1585 } 1604 }
1586out: 1605out:
1587 if (lapic) 1606 kfree(lapic);
1588 kfree(lapic);
1589 return r; 1607 return r;
1590} 1608}
1591 1609
@@ -1606,10 +1624,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1606 return -EINVAL; 1624 return -EINVAL;
1607 1625
1608 down_write(&kvm->slots_lock); 1626 down_write(&kvm->slots_lock);
1627 spin_lock(&kvm->mmu_lock);
1609 1628
1610 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1629 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1611 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1630 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1612 1631
1632 spin_unlock(&kvm->mmu_lock);
1613 up_write(&kvm->slots_lock); 1633 up_write(&kvm->slots_lock);
1614 return 0; 1634 return 0;
1615} 1635}
@@ -1785,7 +1805,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1785 1805
1786 /* If nothing is dirty, don't bother messing with page tables. */ 1806 /* If nothing is dirty, don't bother messing with page tables. */
1787 if (is_dirty) { 1807 if (is_dirty) {
1808 spin_lock(&kvm->mmu_lock);
1788 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1809 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1810 spin_unlock(&kvm->mmu_lock);
1789 kvm_flush_remote_tlbs(kvm); 1811 kvm_flush_remote_tlbs(kvm);
1790 memslot = &kvm->memslots[log->slot]; 1812 memslot = &kvm->memslots[log->slot];
1791 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1813 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
@@ -2360,7 +2382,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2360 u16 error_code, 2382 u16 error_code,
2361 int emulation_type) 2383 int emulation_type)
2362{ 2384{
2363 int r; 2385 int r, shadow_mask;
2364 struct decode_cache *c; 2386 struct decode_cache *c;
2365 2387
2366 kvm_clear_exception_queue(vcpu); 2388 kvm_clear_exception_queue(vcpu);
@@ -2408,7 +2430,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2408 } 2430 }
2409 } 2431 }
2410 2432
2433 if (emulation_type & EMULTYPE_SKIP) {
2434 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
2435 return EMULATE_DONE;
2436 }
2437
2411 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2438 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2439 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
2440
2441 if (r == 0)
2442 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
2412 2443
2413 if (vcpu->arch.pio.string) 2444 if (vcpu->arch.pio.string)
2414 return EMULATE_DO_MMIO; 2445 return EMULATE_DO_MMIO;
@@ -2761,7 +2792,7 @@ int kvm_arch_init(void *opaque)
2761 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2792 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2762 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2793 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2763 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2794 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2764 PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); 2795 PT_DIRTY_MASK, PT64_NX_MASK, 0);
2765 2796
2766 for_each_possible_cpu(cpu) 2797 for_each_possible_cpu(cpu)
2767 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 2798 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
@@ -3012,6 +3043,16 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3012 return best; 3043 return best;
3013} 3044}
3014 3045
3046int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3047{
3048 struct kvm_cpuid_entry2 *best;
3049
3050 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
3051 if (best)
3052 return best->eax & 0xff;
3053 return 36;
3054}
3055
3015void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 3056void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3016{ 3057{
3017 u32 function, index; 3058 u32 function, index;
@@ -3048,10 +3089,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3048static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3089static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
3049 struct kvm_run *kvm_run) 3090 struct kvm_run *kvm_run)
3050{ 3091{
3051 return (!vcpu->arch.irq_summary && 3092 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3052 kvm_run->request_interrupt_window && 3093 kvm_run->request_interrupt_window &&
3053 vcpu->arch.interrupt_window_open && 3094 kvm_arch_interrupt_allowed(vcpu));
3054 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
3055} 3095}
3056 3096
3057static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3097static void post_kvm_run_save(struct kvm_vcpu *vcpu,
@@ -3064,8 +3104,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
3064 kvm_run->ready_for_interrupt_injection = 1; 3104 kvm_run->ready_for_interrupt_injection = 1;
3065 else 3105 else
3066 kvm_run->ready_for_interrupt_injection = 3106 kvm_run->ready_for_interrupt_injection =
3067 (vcpu->arch.interrupt_window_open && 3107 kvm_arch_interrupt_allowed(vcpu) &&
3068 vcpu->arch.irq_summary == 0); 3108 !kvm_cpu_has_interrupt(vcpu) &&
3109 !kvm_event_needs_reinjection(vcpu);
3069} 3110}
3070 3111
3071static void vapic_enter(struct kvm_vcpu *vcpu) 3112static void vapic_enter(struct kvm_vcpu *vcpu)
@@ -3094,9 +3135,63 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
3094 up_read(&vcpu->kvm->slots_lock); 3135 up_read(&vcpu->kvm->slots_lock);
3095} 3136}
3096 3137
3138static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3139{
3140 int max_irr, tpr;
3141
3142 if (!kvm_x86_ops->update_cr8_intercept)
3143 return;
3144
3145 if (!vcpu->arch.apic->vapic_addr)
3146 max_irr = kvm_lapic_find_highest_irr(vcpu);
3147 else
3148 max_irr = -1;
3149
3150 if (max_irr != -1)
3151 max_irr >>= 4;
3152
3153 tpr = kvm_lapic_get_cr8(vcpu);
3154
3155 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3156}
3157
3158static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3159{
3160 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3161 kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
3162
3163 /* try to reinject previous events if any */
3164 if (vcpu->arch.nmi_injected) {
3165 kvm_x86_ops->set_nmi(vcpu);
3166 return;
3167 }
3168
3169 if (vcpu->arch.interrupt.pending) {
3170 kvm_x86_ops->set_irq(vcpu);
3171 return;
3172 }
3173
3174 /* try to inject new event if pending */
3175 if (vcpu->arch.nmi_pending) {
3176 if (kvm_x86_ops->nmi_allowed(vcpu)) {
3177 vcpu->arch.nmi_pending = false;
3178 vcpu->arch.nmi_injected = true;
3179 kvm_x86_ops->set_nmi(vcpu);
3180 }
3181 } else if (kvm_cpu_has_interrupt(vcpu)) {
3182 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
3183 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
3184 false);
3185 kvm_x86_ops->set_irq(vcpu);
3186 }
3187 }
3188}
3189
3097static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3190static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3098{ 3191{
3099 int r; 3192 int r;
3193 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3194 kvm_run->request_interrupt_window;
3100 3195
3101 if (vcpu->requests) 3196 if (vcpu->requests)
3102 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3197 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3128,9 +3223,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3128 } 3223 }
3129 } 3224 }
3130 3225
3131 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3132 kvm_inject_pending_timer_irqs(vcpu);
3133
3134 preempt_disable(); 3226 preempt_disable();
3135 3227
3136 kvm_x86_ops->prepare_guest_switch(vcpu); 3228 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -3138,6 +3230,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3138 3230
3139 local_irq_disable(); 3231 local_irq_disable();
3140 3232
3233 clear_bit(KVM_REQ_KICK, &vcpu->requests);
3234 smp_mb__after_clear_bit();
3235
3141 if (vcpu->requests || need_resched() || signal_pending(current)) { 3236 if (vcpu->requests || need_resched() || signal_pending(current)) {
3142 local_irq_enable(); 3237 local_irq_enable();
3143 preempt_enable(); 3238 preempt_enable();
@@ -3145,21 +3240,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3145 goto out; 3240 goto out;
3146 } 3241 }
3147 3242
3148 vcpu->guest_mode = 1;
3149 /*
3150 * Make sure that guest_mode assignment won't happen after
3151 * testing the pending IRQ vector bitmap.
3152 */
3153 smp_wmb();
3154
3155 if (vcpu->arch.exception.pending) 3243 if (vcpu->arch.exception.pending)
3156 __queue_exception(vcpu); 3244 __queue_exception(vcpu);
3157 else if (irqchip_in_kernel(vcpu->kvm))
3158 kvm_x86_ops->inject_pending_irq(vcpu);
3159 else 3245 else
3160 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); 3246 inject_pending_irq(vcpu, kvm_run);
3161 3247
3162 kvm_lapic_sync_to_vapic(vcpu); 3248 /* enable NMI/IRQ window open exits if needed */
3249 if (vcpu->arch.nmi_pending)
3250 kvm_x86_ops->enable_nmi_window(vcpu);
3251 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
3252 kvm_x86_ops->enable_irq_window(vcpu);
3253
3254 if (kvm_lapic_enabled(vcpu)) {
3255 update_cr8_intercept(vcpu);
3256 kvm_lapic_sync_to_vapic(vcpu);
3257 }
3163 3258
3164 up_read(&vcpu->kvm->slots_lock); 3259 up_read(&vcpu->kvm->slots_lock);
3165 3260
@@ -3193,7 +3288,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3193 set_debugreg(vcpu->arch.host_dr6, 6); 3288 set_debugreg(vcpu->arch.host_dr6, 6);
3194 set_debugreg(vcpu->arch.host_dr7, 7); 3289 set_debugreg(vcpu->arch.host_dr7, 7);
3195 3290
3196 vcpu->guest_mode = 0; 3291 set_bit(KVM_REQ_KICK, &vcpu->requests);
3197 local_irq_enable(); 3292 local_irq_enable();
3198 3293
3199 ++vcpu->stat.exits; 3294 ++vcpu->stat.exits;
@@ -3220,8 +3315,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3220 profile_hit(KVM_PROFILING, (void *)rip); 3315 profile_hit(KVM_PROFILING, (void *)rip);
3221 } 3316 }
3222 3317
3223 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
3224 vcpu->arch.exception.pending = false;
3225 3318
3226 kvm_lapic_sync_from_vapic(vcpu); 3319 kvm_lapic_sync_from_vapic(vcpu);
3227 3320
@@ -3230,6 +3323,7 @@ out:
3230 return r; 3323 return r;
3231} 3324}
3232 3325
3326
3233static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3327static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3234{ 3328{
3235 int r; 3329 int r;
@@ -3256,29 +3350,42 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3256 kvm_vcpu_block(vcpu); 3350 kvm_vcpu_block(vcpu);
3257 down_read(&vcpu->kvm->slots_lock); 3351 down_read(&vcpu->kvm->slots_lock);
3258 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 3352 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3259 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3353 {
3354 switch(vcpu->arch.mp_state) {
3355 case KVM_MP_STATE_HALTED:
3260 vcpu->arch.mp_state = 3356 vcpu->arch.mp_state =
3261 KVM_MP_STATE_RUNNABLE; 3357 KVM_MP_STATE_RUNNABLE;
3262 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) 3358 case KVM_MP_STATE_RUNNABLE:
3263 r = -EINTR; 3359 break;
3360 case KVM_MP_STATE_SIPI_RECEIVED:
3361 default:
3362 r = -EINTR;
3363 break;
3364 }
3365 }
3264 } 3366 }
3265 3367
3266 if (r > 0) { 3368 if (r <= 0)
3267 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 3369 break;
3268 r = -EINTR; 3370
3269 kvm_run->exit_reason = KVM_EXIT_INTR; 3371 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3270 ++vcpu->stat.request_irq_exits; 3372 if (kvm_cpu_has_pending_timer(vcpu))
3271 } 3373 kvm_inject_pending_timer_irqs(vcpu);
3272 if (signal_pending(current)) { 3374
3273 r = -EINTR; 3375 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3274 kvm_run->exit_reason = KVM_EXIT_INTR; 3376 r = -EINTR;
3275 ++vcpu->stat.signal_exits; 3377 kvm_run->exit_reason = KVM_EXIT_INTR;
3276 } 3378 ++vcpu->stat.request_irq_exits;
3277 if (need_resched()) { 3379 }
3278 up_read(&vcpu->kvm->slots_lock); 3380 if (signal_pending(current)) {
3279 kvm_resched(vcpu); 3381 r = -EINTR;
3280 down_read(&vcpu->kvm->slots_lock); 3382 kvm_run->exit_reason = KVM_EXIT_INTR;
3281 } 3383 ++vcpu->stat.signal_exits;
3384 }
3385 if (need_resched()) {
3386 up_read(&vcpu->kvm->slots_lock);
3387 kvm_resched(vcpu);
3388 down_read(&vcpu->kvm->slots_lock);
3282 } 3389 }
3283 } 3390 }
3284 3391
@@ -3442,7 +3549,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3442 struct kvm_sregs *sregs) 3549 struct kvm_sregs *sregs)
3443{ 3550{
3444 struct descriptor_table dt; 3551 struct descriptor_table dt;
3445 int pending_vec;
3446 3552
3447 vcpu_load(vcpu); 3553 vcpu_load(vcpu);
3448 3554
@@ -3472,16 +3578,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3472 sregs->efer = vcpu->arch.shadow_efer; 3578 sregs->efer = vcpu->arch.shadow_efer;
3473 sregs->apic_base = kvm_get_apic_base(vcpu); 3579 sregs->apic_base = kvm_get_apic_base(vcpu);
3474 3580
3475 if (irqchip_in_kernel(vcpu->kvm)) { 3581 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
3476 memset(sregs->interrupt_bitmap, 0, 3582
3477 sizeof sregs->interrupt_bitmap); 3583 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
3478 pending_vec = kvm_x86_ops->get_irq(vcpu); 3584 set_bit(vcpu->arch.interrupt.nr,
3479 if (pending_vec >= 0) 3585 (unsigned long *)sregs->interrupt_bitmap);
3480 set_bit(pending_vec,
3481 (unsigned long *)sregs->interrupt_bitmap);
3482 } else
3483 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
3484 sizeof sregs->interrupt_bitmap);
3485 3586
3486 vcpu_put(vcpu); 3587 vcpu_put(vcpu);
3487 3588
@@ -3688,7 +3789,6 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3688 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 3789 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3689 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 3790 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3690 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3791 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3691 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3692} 3792}
3693 3793
3694static int load_state_from_tss32(struct kvm_vcpu *vcpu, 3794static int load_state_from_tss32(struct kvm_vcpu *vcpu,
@@ -3785,8 +3885,8 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3785} 3885}
3786 3886
3787static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3887static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3788 u32 old_tss_base, 3888 u16 old_tss_sel, u32 old_tss_base,
3789 struct desc_struct *nseg_desc) 3889 struct desc_struct *nseg_desc)
3790{ 3890{
3791 struct tss_segment_16 tss_segment_16; 3891 struct tss_segment_16 tss_segment_16;
3792 int ret = 0; 3892 int ret = 0;
@@ -3805,6 +3905,16 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3805 &tss_segment_16, sizeof tss_segment_16)) 3905 &tss_segment_16, sizeof tss_segment_16))
3806 goto out; 3906 goto out;
3807 3907
3908 if (old_tss_sel != 0xffff) {
3909 tss_segment_16.prev_task_link = old_tss_sel;
3910
3911 if (kvm_write_guest(vcpu->kvm,
3912 get_tss_base_addr(vcpu, nseg_desc),
3913 &tss_segment_16.prev_task_link,
3914 sizeof tss_segment_16.prev_task_link))
3915 goto out;
3916 }
3917
3808 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3918 if (load_state_from_tss16(vcpu, &tss_segment_16))
3809 goto out; 3919 goto out;
3810 3920
@@ -3814,7 +3924,7 @@ out:
3814} 3924}
3815 3925
3816static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3926static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3817 u32 old_tss_base, 3927 u16 old_tss_sel, u32 old_tss_base,
3818 struct desc_struct *nseg_desc) 3928 struct desc_struct *nseg_desc)
3819{ 3929{
3820 struct tss_segment_32 tss_segment_32; 3930 struct tss_segment_32 tss_segment_32;
@@ -3834,6 +3944,16 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3834 &tss_segment_32, sizeof tss_segment_32)) 3944 &tss_segment_32, sizeof tss_segment_32))
3835 goto out; 3945 goto out;
3836 3946
3947 if (old_tss_sel != 0xffff) {
3948 tss_segment_32.prev_task_link = old_tss_sel;
3949
3950 if (kvm_write_guest(vcpu->kvm,
3951 get_tss_base_addr(vcpu, nseg_desc),
3952 &tss_segment_32.prev_task_link,
3953 sizeof tss_segment_32.prev_task_link))
3954 goto out;
3955 }
3956
3837 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3957 if (load_state_from_tss32(vcpu, &tss_segment_32))
3838 goto out; 3958 goto out;
3839 3959
@@ -3887,14 +4007,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3887 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4007 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
3888 } 4008 }
3889 4009
3890 kvm_x86_ops->skip_emulated_instruction(vcpu); 4010 /* set back link to prev task only if NT bit is set in eflags
4011 note that old_tss_sel is not used afetr this point */
4012 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4013 old_tss_sel = 0xffff;
4014
4015 /* set back link to prev task only if NT bit is set in eflags
4016 note that old_tss_sel is not used afetr this point */
4017 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4018 old_tss_sel = 0xffff;
3891 4019
3892 if (nseg_desc.type & 8) 4020 if (nseg_desc.type & 8)
3893 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, 4021 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
3894 &nseg_desc); 4022 old_tss_base, &nseg_desc);
3895 else 4023 else
3896 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, 4024 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
3897 &nseg_desc); 4025 old_tss_base, &nseg_desc);
3898 4026
3899 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4027 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
3900 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4028 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
@@ -3920,7 +4048,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3920 struct kvm_sregs *sregs) 4048 struct kvm_sregs *sregs)
3921{ 4049{
3922 int mmu_reset_needed = 0; 4050 int mmu_reset_needed = 0;
3923 int i, pending_vec, max_bits; 4051 int pending_vec, max_bits;
3924 struct descriptor_table dt; 4052 struct descriptor_table dt;
3925 4053
3926 vcpu_load(vcpu); 4054 vcpu_load(vcpu);
@@ -3934,7 +4062,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3934 4062
3935 vcpu->arch.cr2 = sregs->cr2; 4063 vcpu->arch.cr2 = sregs->cr2;
3936 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 4064 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
3937 vcpu->arch.cr3 = sregs->cr3; 4065
4066 down_read(&vcpu->kvm->slots_lock);
4067 if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
4068 vcpu->arch.cr3 = sregs->cr3;
4069 else
4070 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
4071 up_read(&vcpu->kvm->slots_lock);
3938 4072
3939 kvm_set_cr8(vcpu, sregs->cr8); 4073 kvm_set_cr8(vcpu, sregs->cr8);
3940 4074
@@ -3956,25 +4090,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3956 if (mmu_reset_needed) 4090 if (mmu_reset_needed)
3957 kvm_mmu_reset_context(vcpu); 4091 kvm_mmu_reset_context(vcpu);
3958 4092
3959 if (!irqchip_in_kernel(vcpu->kvm)) { 4093 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3960 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, 4094 pending_vec = find_first_bit(
3961 sizeof vcpu->arch.irq_pending); 4095 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
3962 vcpu->arch.irq_summary = 0; 4096 if (pending_vec < max_bits) {
3963 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) 4097 kvm_queue_interrupt(vcpu, pending_vec, false);
3964 if (vcpu->arch.irq_pending[i]) 4098 pr_debug("Set back pending irq %d\n", pending_vec);
3965 __set_bit(i, &vcpu->arch.irq_summary); 4099 if (irqchip_in_kernel(vcpu->kvm))
3966 } else { 4100 kvm_pic_clear_isr_ack(vcpu->kvm);
3967 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3968 pending_vec = find_first_bit(
3969 (const unsigned long *)sregs->interrupt_bitmap,
3970 max_bits);
3971 /* Only pending external irq is handled here */
3972 if (pending_vec < max_bits) {
3973 kvm_x86_ops->set_irq(vcpu, pending_vec);
3974 pr_debug("Set back pending irq %d\n",
3975 pending_vec);
3976 }
3977 kvm_pic_clear_isr_ack(vcpu->kvm);
3978 } 4101 }
3979 4102
3980 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 4103 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -4308,7 +4431,6 @@ struct kvm *kvm_arch_create_vm(void)
4308 return ERR_PTR(-ENOMEM); 4431 return ERR_PTR(-ENOMEM);
4309 4432
4310 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4433 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4311 INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
4312 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 4434 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4313 4435
4314 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 4436 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4411,12 +4533,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4411 } 4533 }
4412 } 4534 }
4413 4535
4536 spin_lock(&kvm->mmu_lock);
4414 if (!kvm->arch.n_requested_mmu_pages) { 4537 if (!kvm->arch.n_requested_mmu_pages) {
4415 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 4538 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4416 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 4539 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4417 } 4540 }
4418 4541
4419 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 4542 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4543 spin_unlock(&kvm->mmu_lock);
4420 kvm_flush_remote_tlbs(kvm); 4544 kvm_flush_remote_tlbs(kvm);
4421 4545
4422 return 0; 4546 return 0;
@@ -4425,6 +4549,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4425void kvm_arch_flush_shadow(struct kvm *kvm) 4549void kvm_arch_flush_shadow(struct kvm *kvm)
4426{ 4550{
4427 kvm_mmu_zap_all(kvm); 4551 kvm_mmu_zap_all(kvm);
4552 kvm_reload_remote_mmus(kvm);
4428} 4553}
4429 4554
4430int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4555int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
@@ -4434,28 +4559,24 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4434 || vcpu->arch.nmi_pending; 4559 || vcpu->arch.nmi_pending;
4435} 4560}
4436 4561
4437static void vcpu_kick_intr(void *info)
4438{
4439#ifdef DEBUG
4440 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
4441 printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
4442#endif
4443}
4444
4445void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 4562void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4446{ 4563{
4447 int ipi_pcpu = vcpu->cpu; 4564 int me;
4448 int cpu = get_cpu(); 4565 int cpu = vcpu->cpu;
4449 4566
4450 if (waitqueue_active(&vcpu->wq)) { 4567 if (waitqueue_active(&vcpu->wq)) {
4451 wake_up_interruptible(&vcpu->wq); 4568 wake_up_interruptible(&vcpu->wq);
4452 ++vcpu->stat.halt_wakeup; 4569 ++vcpu->stat.halt_wakeup;
4453 } 4570 }
4454 /* 4571
4455 * We may be called synchronously with irqs disabled in guest mode, 4572 me = get_cpu();
4456 * So need not to call smp_call_function_single() in that case. 4573 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
4457 */ 4574 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
4458 if (vcpu->guest_mode && vcpu->cpu != cpu) 4575 smp_send_reschedule(cpu);
4459 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
4460 put_cpu(); 4576 put_cpu();
4461} 4577}
4578
4579int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4580{
4581 return kvm_x86_ops->interrupt_allowed(vcpu);
4582}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6a4be78a7384..4c8e10af78e8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,9 +8,11 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
8 vcpu->arch.exception.pending = false; 8 vcpu->arch.exception.pending = false;
9} 9}
10 10
11static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) 11static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
12 bool soft)
12{ 13{
13 vcpu->arch.interrupt.pending = true; 14 vcpu->arch.interrupt.pending = true;
15 vcpu->arch.interrupt.soft = soft;
14 vcpu->arch.interrupt.nr = vector; 16 vcpu->arch.interrupt.nr = vector;
15} 17}
16 18
@@ -19,4 +21,14 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
19 vcpu->arch.interrupt.pending = false; 21 vcpu->arch.interrupt.pending = false;
20} 22}
21 23
24static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
25{
26 return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
27 vcpu->arch.nmi_injected;
28}
29
30static inline bool kvm_exception_is_soft(unsigned int nr)
31{
32 return (nr == BP_VECTOR) || (nr == OF_VECTOR);
33}
22#endif 34#endif
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ca91749d2083..c1b6c232e02b 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -59,13 +59,14 @@
59#define SrcImm (5<<4) /* Immediate operand. */ 59#define SrcImm (5<<4) /* Immediate operand. */
60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ 60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
61#define SrcOne (7<<4) /* Implied '1' */ 61#define SrcOne (7<<4) /* Implied '1' */
62#define SrcMask (7<<4) 62#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
63#define SrcMask (0xf<<4)
63/* Generic ModRM decode. */ 64/* Generic ModRM decode. */
64#define ModRM (1<<7) 65#define ModRM (1<<8)
65/* Destination is only written; never read. */ 66/* Destination is only written; never read. */
66#define Mov (1<<8) 67#define Mov (1<<9)
67#define BitOp (1<<9) 68#define BitOp (1<<10)
68#define MemAbs (1<<10) /* Memory operand is absolute displacement */ 69#define MemAbs (1<<11) /* Memory operand is absolute displacement */
69#define String (1<<12) /* String instruction (rep capable) */ 70#define String (1<<12) /* String instruction (rep capable) */
70#define Stack (1<<13) /* Stack instruction (push/pop) */ 71#define Stack (1<<13) /* Stack instruction (push/pop) */
71#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 72#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
@@ -76,6 +77,7 @@
76#define Src2CL (1<<29) 77#define Src2CL (1<<29)
77#define Src2ImmByte (2<<29) 78#define Src2ImmByte (2<<29)
78#define Src2One (3<<29) 79#define Src2One (3<<29)
80#define Src2Imm16 (4<<29)
79#define Src2Mask (7<<29) 81#define Src2Mask (7<<29)
80 82
81enum { 83enum {
@@ -135,11 +137,11 @@ static u32 opcode_table[256] = {
135 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 137 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
136 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 138 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
137 /* 0x70 - 0x77 */ 139 /* 0x70 - 0x77 */
138 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 140 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
139 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 141 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
140 /* 0x78 - 0x7F */ 142 /* 0x78 - 0x7F */
141 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 143 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
142 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 144 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
143 /* 0x80 - 0x87 */ 145 /* 0x80 - 0x87 */
144 Group | Group1_80, Group | Group1_81, 146 Group | Group1_80, Group | Group1_81,
145 Group | Group1_82, Group | Group1_83, 147 Group | Group1_82, Group | Group1_83,
@@ -153,7 +155,8 @@ static u32 opcode_table[256] = {
153 /* 0x90 - 0x97 */ 155 /* 0x90 - 0x97 */
154 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 156 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
155 /* 0x98 - 0x9F */ 157 /* 0x98 - 0x9F */
156 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 158 0, 0, SrcImm | Src2Imm16, 0,
159 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
157 /* 0xA0 - 0xA7 */ 160 /* 0xA0 - 0xA7 */
158 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 161 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
159 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 162 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
@@ -178,7 +181,8 @@ static u32 opcode_table[256] = {
178 0, ImplicitOps | Stack, 0, 0, 181 0, ImplicitOps | Stack, 0, 0,
179 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, 182 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
180 /* 0xC8 - 0xCF */ 183 /* 0xC8 - 0xCF */
181 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0, 184 0, 0, 0, ImplicitOps | Stack,
185 ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps,
182 /* 0xD0 - 0xD7 */ 186 /* 0xD0 - 0xD7 */
183 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 187 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
184 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 188 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -187,11 +191,11 @@ static u32 opcode_table[256] = {
187 0, 0, 0, 0, 0, 0, 0, 0, 191 0, 0, 0, 0, 0, 0, 0, 0,
188 /* 0xE0 - 0xE7 */ 192 /* 0xE0 - 0xE7 */
189 0, 0, 0, 0, 193 0, 0, 0, 0,
190 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 194 ByteOp | SrcImmUByte, SrcImmUByte,
191 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 195 ByteOp | SrcImmUByte, SrcImmUByte,
192 /* 0xE8 - 0xEF */ 196 /* 0xE8 - 0xEF */
193 ImplicitOps | Stack, SrcImm | ImplicitOps, 197 SrcImm | Stack, SrcImm | ImplicitOps,
194 ImplicitOps, SrcImmByte | ImplicitOps, 198 SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
195 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 199 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
196 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 200 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
197 /* 0xF0 - 0xF7 */ 201 /* 0xF0 - 0xF7 */
@@ -230,10 +234,8 @@ static u32 twobyte_table[256] = {
230 /* 0x70 - 0x7F */ 234 /* 0x70 - 0x7F */
231 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 235 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
232 /* 0x80 - 0x8F */ 236 /* 0x80 - 0x8F */
233 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 237 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
234 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 238 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
235 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
236 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
237 /* 0x90 - 0x9F */ 239 /* 0x90 - 0x9F */
238 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 240 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
239 /* 0xA0 - 0xA7 */ 241 /* 0xA0 - 0xA7 */
@@ -1044,10 +1046,14 @@ done_prefixes:
1044 } 1046 }
1045 break; 1047 break;
1046 case SrcImmByte: 1048 case SrcImmByte:
1049 case SrcImmUByte:
1047 c->src.type = OP_IMM; 1050 c->src.type = OP_IMM;
1048 c->src.ptr = (unsigned long *)c->eip; 1051 c->src.ptr = (unsigned long *)c->eip;
1049 c->src.bytes = 1; 1052 c->src.bytes = 1;
1050 c->src.val = insn_fetch(s8, 1, c->eip); 1053 if ((c->d & SrcMask) == SrcImmByte)
1054 c->src.val = insn_fetch(s8, 1, c->eip);
1055 else
1056 c->src.val = insn_fetch(u8, 1, c->eip);
1051 break; 1057 break;
1052 case SrcOne: 1058 case SrcOne:
1053 c->src.bytes = 1; 1059 c->src.bytes = 1;
@@ -1072,6 +1078,12 @@ done_prefixes:
1072 c->src2.bytes = 1; 1078 c->src2.bytes = 1;
1073 c->src2.val = insn_fetch(u8, 1, c->eip); 1079 c->src2.val = insn_fetch(u8, 1, c->eip);
1074 break; 1080 break;
1081 case Src2Imm16:
1082 c->src2.type = OP_IMM;
1083 c->src2.ptr = (unsigned long *)c->eip;
1084 c->src2.bytes = 2;
1085 c->src2.val = insn_fetch(u16, 2, c->eip);
1086 break;
1075 case Src2One: 1087 case Src2One:
1076 c->src2.bytes = 1; 1088 c->src2.bytes = 1;
1077 c->src2.val = 1; 1089 c->src2.val = 1;
@@ -1349,6 +1361,20 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1349 return 0; 1361 return 0;
1350} 1362}
1351 1363
1364void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
1365{
1366 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
1367 /*
1368 * an sti; sti; sequence only disable interrupts for the first
1369 * instruction. So, if the last instruction, be it emulated or
1370 * not, left the system with the INT_STI flag enabled, it
1371 * means that the last instruction is an sti. We should not
1372 * leave the flag on in this case. The same goes for mov ss
1373 */
1374 if (!(int_shadow & mask))
1375 ctxt->interruptibility = mask;
1376}
1377
1352int 1378int
1353x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 1379x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1354{ 1380{
@@ -1360,6 +1386,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1360 int io_dir_in; 1386 int io_dir_in;
1361 int rc = 0; 1387 int rc = 0;
1362 1388
1389 ctxt->interruptibility = 0;
1390
1363 /* Shadow copy of register state. Committed on successful emulation. 1391 /* Shadow copy of register state. Committed on successful emulation.
1364 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't 1392 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
1365 * modify them. 1393 * modify them.
@@ -1531,13 +1559,10 @@ special_insn:
1531 return -1; 1559 return -1;
1532 } 1560 }
1533 return 0; 1561 return 0;
1534 case 0x70 ... 0x7f: /* jcc (short) */ { 1562 case 0x70 ... 0x7f: /* jcc (short) */
1535 int rel = insn_fetch(s8, 1, c->eip);
1536
1537 if (test_cc(c->b, ctxt->eflags)) 1563 if (test_cc(c->b, ctxt->eflags))
1538 jmp_rel(c, rel); 1564 jmp_rel(c, c->src.val);
1539 break; 1565 break;
1540 }
1541 case 0x80 ... 0x83: /* Grp1 */ 1566 case 0x80 ... 0x83: /* Grp1 */
1542 switch (c->modrm_reg) { 1567 switch (c->modrm_reg) {
1543 case 0: 1568 case 0:
@@ -1609,6 +1634,9 @@ special_insn:
1609 int err; 1634 int err;
1610 1635
1611 sel = c->src.val; 1636 sel = c->src.val;
1637 if (c->modrm_reg == VCPU_SREG_SS)
1638 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
1639
1612 if (c->modrm_reg <= 5) { 1640 if (c->modrm_reg <= 5) {
1613 type_bits = (c->modrm_reg == 1) ? 9 : 1; 1641 type_bits = (c->modrm_reg == 1) ? 9 : 1;
1614 err = kvm_load_segment_descriptor(ctxt->vcpu, sel, 1642 err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
@@ -1769,59 +1797,32 @@ special_insn:
1769 break; 1797 break;
1770 case 0xe4: /* inb */ 1798 case 0xe4: /* inb */
1771 case 0xe5: /* in */ 1799 case 0xe5: /* in */
1772 port = insn_fetch(u8, 1, c->eip); 1800 port = c->src.val;
1773 io_dir_in = 1; 1801 io_dir_in = 1;
1774 goto do_io; 1802 goto do_io;
1775 case 0xe6: /* outb */ 1803 case 0xe6: /* outb */
1776 case 0xe7: /* out */ 1804 case 0xe7: /* out */
1777 port = insn_fetch(u8, 1, c->eip); 1805 port = c->src.val;
1778 io_dir_in = 0; 1806 io_dir_in = 0;
1779 goto do_io; 1807 goto do_io;
1780 case 0xe8: /* call (near) */ { 1808 case 0xe8: /* call (near) */ {
1781 long int rel; 1809 long int rel = c->src.val;
1782 switch (c->op_bytes) {
1783 case 2:
1784 rel = insn_fetch(s16, 2, c->eip);
1785 break;
1786 case 4:
1787 rel = insn_fetch(s32, 4, c->eip);
1788 break;
1789 default:
1790 DPRINTF("Call: Invalid op_bytes\n");
1791 goto cannot_emulate;
1792 }
1793 c->src.val = (unsigned long) c->eip; 1810 c->src.val = (unsigned long) c->eip;
1794 jmp_rel(c, rel); 1811 jmp_rel(c, rel);
1795 c->op_bytes = c->ad_bytes;
1796 emulate_push(ctxt); 1812 emulate_push(ctxt);
1797 break; 1813 break;
1798 } 1814 }
1799 case 0xe9: /* jmp rel */ 1815 case 0xe9: /* jmp rel */
1800 goto jmp; 1816 goto jmp;
1801 case 0xea: /* jmp far */ { 1817 case 0xea: /* jmp far */
1802 uint32_t eip; 1818 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9,
1803 uint16_t sel; 1819 VCPU_SREG_CS) < 0) {
1804
1805 switch (c->op_bytes) {
1806 case 2:
1807 eip = insn_fetch(u16, 2, c->eip);
1808 break;
1809 case 4:
1810 eip = insn_fetch(u32, 4, c->eip);
1811 break;
1812 default:
1813 DPRINTF("jmp far: Invalid op_bytes\n");
1814 goto cannot_emulate;
1815 }
1816 sel = insn_fetch(u16, 2, c->eip);
1817 if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
1818 DPRINTF("jmp far: Failed to load CS descriptor\n"); 1820 DPRINTF("jmp far: Failed to load CS descriptor\n");
1819 goto cannot_emulate; 1821 goto cannot_emulate;
1820 } 1822 }
1821 1823
1822 c->eip = eip; 1824 c->eip = c->src.val;
1823 break; 1825 break;
1824 }
1825 case 0xeb: 1826 case 0xeb:
1826 jmp: /* jmp rel short */ 1827 jmp: /* jmp rel short */
1827 jmp_rel(c, c->src.val); 1828 jmp_rel(c, c->src.val);
@@ -1865,6 +1866,7 @@ special_insn:
1865 c->dst.type = OP_NONE; /* Disable writeback. */ 1866 c->dst.type = OP_NONE; /* Disable writeback. */
1866 break; 1867 break;
1867 case 0xfb: /* sti */ 1868 case 0xfb: /* sti */
1869 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
1868 ctxt->eflags |= X86_EFLAGS_IF; 1870 ctxt->eflags |= X86_EFLAGS_IF;
1869 c->dst.type = OP_NONE; /* Disable writeback. */ 1871 c->dst.type = OP_NONE; /* Disable writeback. */
1870 break; 1872 break;
@@ -2039,28 +2041,11 @@ twobyte_insn:
2039 if (!test_cc(c->b, ctxt->eflags)) 2041 if (!test_cc(c->b, ctxt->eflags))
2040 c->dst.type = OP_NONE; /* no writeback */ 2042 c->dst.type = OP_NONE; /* no writeback */
2041 break; 2043 break;
2042 case 0x80 ... 0x8f: /* jnz rel, etc*/ { 2044 case 0x80 ... 0x8f: /* jnz rel, etc*/
2043 long int rel;
2044
2045 switch (c->op_bytes) {
2046 case 2:
2047 rel = insn_fetch(s16, 2, c->eip);
2048 break;
2049 case 4:
2050 rel = insn_fetch(s32, 4, c->eip);
2051 break;
2052 case 8:
2053 rel = insn_fetch(s64, 8, c->eip);
2054 break;
2055 default:
2056 DPRINTF("jnz: Invalid op_bytes\n");
2057 goto cannot_emulate;
2058 }
2059 if (test_cc(c->b, ctxt->eflags)) 2045 if (test_cc(c->b, ctxt->eflags))
2060 jmp_rel(c, rel); 2046 jmp_rel(c, c->src.val);
2061 c->dst.type = OP_NONE; 2047 c->dst.type = OP_NONE;
2062 break; 2048 break;
2063 }
2064 case 0xa3: 2049 case 0xa3:
2065 bt: /* bt */ 2050 bt: /* bt */
2066 c->dst.type = OP_NONE; 2051 c->dst.type = OP_NONE;
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 8dab8f7844d3..38718041efc3 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@ config LGUEST_GUEST
2 bool "Lguest guest support" 2 bool "Lguest guest support"
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32 4 depends on X86_32
5 depends on !X86_PAE
6 select VIRTIO 5 select VIRTIO
7 select VIRTIO_RING 6 select VIRTIO_RING
8 select VIRTIO_CONSOLE 7 select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 33a93b417396..7bc65f0f62c4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -87,7 +87,7 @@ struct lguest_data lguest_data = {
87 87
88/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a 88/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a
89 * ring buffer of stored hypercalls which the Host will run though next time we 89 * ring buffer of stored hypercalls which the Host will run though next time we
90 * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall 90 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
91 * arguments, and a "hcall_status" word which is 0 if the call is ready to go, 91 * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
92 * and 255 once the Host has finished with it. 92 * and 255 once the Host has finished with it.
93 * 93 *
@@ -96,7 +96,8 @@ struct lguest_data lguest_data = {
96 * effect of causing the Host to run all the stored calls in the ring buffer 96 * effect of causing the Host to run all the stored calls in the ring buffer
97 * which empties it for next time! */ 97 * which empties it for next time! */
98static void async_hcall(unsigned long call, unsigned long arg1, 98static void async_hcall(unsigned long call, unsigned long arg1,
99 unsigned long arg2, unsigned long arg3) 99 unsigned long arg2, unsigned long arg3,
100 unsigned long arg4)
100{ 101{
101 /* Note: This code assumes we're uniprocessor. */ 102 /* Note: This code assumes we're uniprocessor. */
102 static unsigned int next_call; 103 static unsigned int next_call;
@@ -108,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1,
108 local_irq_save(flags); 109 local_irq_save(flags);
109 if (lguest_data.hcall_status[next_call] != 0xFF) { 110 if (lguest_data.hcall_status[next_call] != 0xFF) {
110 /* Table full, so do normal hcall which will flush table. */ 111 /* Table full, so do normal hcall which will flush table. */
111 kvm_hypercall3(call, arg1, arg2, arg3); 112 kvm_hypercall4(call, arg1, arg2, arg3, arg4);
112 } else { 113 } else {
113 lguest_data.hcalls[next_call].arg0 = call; 114 lguest_data.hcalls[next_call].arg0 = call;
114 lguest_data.hcalls[next_call].arg1 = arg1; 115 lguest_data.hcalls[next_call].arg1 = arg1;
115 lguest_data.hcalls[next_call].arg2 = arg2; 116 lguest_data.hcalls[next_call].arg2 = arg2;
116 lguest_data.hcalls[next_call].arg3 = arg3; 117 lguest_data.hcalls[next_call].arg3 = arg3;
118 lguest_data.hcalls[next_call].arg4 = arg4;
117 /* Arguments must all be written before we mark it to go */ 119 /* Arguments must all be written before we mark it to go */
118 wmb(); 120 wmb();
119 lguest_data.hcall_status[next_call] = 0; 121 lguest_data.hcall_status[next_call] = 0;
@@ -141,7 +143,7 @@ static void lazy_hcall1(unsigned long call,
141 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 143 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
142 kvm_hypercall1(call, arg1); 144 kvm_hypercall1(call, arg1);
143 else 145 else
144 async_hcall(call, arg1, 0, 0); 146 async_hcall(call, arg1, 0, 0, 0);
145} 147}
146 148
147static void lazy_hcall2(unsigned long call, 149static void lazy_hcall2(unsigned long call,
@@ -151,7 +153,7 @@ static void lazy_hcall2(unsigned long call,
151 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 153 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
152 kvm_hypercall2(call, arg1, arg2); 154 kvm_hypercall2(call, arg1, arg2);
153 else 155 else
154 async_hcall(call, arg1, arg2, 0); 156 async_hcall(call, arg1, arg2, 0, 0);
155} 157}
156 158
157static void lazy_hcall3(unsigned long call, 159static void lazy_hcall3(unsigned long call,
@@ -162,18 +164,38 @@ static void lazy_hcall3(unsigned long call,
162 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 164 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
163 kvm_hypercall3(call, arg1, arg2, arg3); 165 kvm_hypercall3(call, arg1, arg2, arg3);
164 else 166 else
165 async_hcall(call, arg1, arg2, arg3); 167 async_hcall(call, arg1, arg2, arg3, 0);
168}
169
170#ifdef CONFIG_X86_PAE
171static void lazy_hcall4(unsigned long call,
172 unsigned long arg1,
173 unsigned long arg2,
174 unsigned long arg3,
175 unsigned long arg4)
176{
177 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
178 kvm_hypercall4(call, arg1, arg2, arg3, arg4);
179 else
180 async_hcall(call, arg1, arg2, arg3, arg4);
166} 181}
182#endif
167 183
168/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 184/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
169 * issue the do-nothing hypercall to flush any stored calls. */ 185 * issue the do-nothing hypercall to flush any stored calls. */
170static void lguest_leave_lazy_mode(void) 186static void lguest_leave_lazy_mmu_mode(void)
187{
188 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
189 paravirt_leave_lazy_mmu();
190}
191
192static void lguest_end_context_switch(struct task_struct *next)
171{ 193{
172 paravirt_leave_lazy(paravirt_get_lazy_mode());
173 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 194 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
195 paravirt_end_context_switch(next);
174} 196}
175 197
176/*G:033 198/*G:032
177 * After that diversion we return to our first native-instruction 199 * After that diversion we return to our first native-instruction
178 * replacements: four functions for interrupt control. 200 * replacements: four functions for interrupt control.
179 * 201 *
@@ -193,30 +215,28 @@ static unsigned long save_fl(void)
193{ 215{
194 return lguest_data.irq_enabled; 216 return lguest_data.irq_enabled;
195} 217}
196PV_CALLEE_SAVE_REGS_THUNK(save_fl);
197
198/* restore_flags() just sets the flags back to the value given. */
199static void restore_fl(unsigned long flags)
200{
201 lguest_data.irq_enabled = flags;
202}
203PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
204 218
205/* Interrupts go off... */ 219/* Interrupts go off... */
206static void irq_disable(void) 220static void irq_disable(void)
207{ 221{
208 lguest_data.irq_enabled = 0; 222 lguest_data.irq_enabled = 0;
209} 223}
224
225/* Let's pause a moment. Remember how I said these are called so often?
226 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
227 * break some rules. In particular, these functions are assumed to save their
228 * own registers if they need to: normal C functions assume they can trash the
229 * eax register. To use normal C functions, we use
230 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
231 * C function, then restores it. */
232PV_CALLEE_SAVE_REGS_THUNK(save_fl);
210PV_CALLEE_SAVE_REGS_THUNK(irq_disable); 233PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
234/*:*/
211 235
212/* Interrupts go on... */ 236/* These are in i386_head.S */
213static void irq_enable(void) 237extern void lg_irq_enable(void);
214{ 238extern void lg_restore_fl(unsigned long flags);
215 lguest_data.irq_enabled = X86_EFLAGS_IF;
216}
217PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
218 239
219/*:*/
220/*M:003 Note that we don't check for outstanding interrupts when we re-enable 240/*M:003 Note that we don't check for outstanding interrupts when we re-enable
221 * them (or when we unmask an interrupt). This seems to work for the moment, 241 * them (or when we unmask an interrupt). This seems to work for the moment,
222 * since interrupts are rare and we'll just get the interrupt on the next timer 242 * since interrupts are rare and we'll just get the interrupt on the next timer
@@ -362,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
362 case 1: /* Basic feature request. */ 382 case 1: /* Basic feature request. */
363 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 383 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
364 *cx &= 0x00002201; 384 *cx &= 0x00002201;
365 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ 385 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
366 *dx &= 0x07808111; 386 *dx &= 0x07808151;
367 /* The Host can do a nice optimization if it knows that the 387 /* The Host can do a nice optimization if it knows that the
368 * kernel mappings (addresses above 0xC0000000 or whatever 388 * kernel mappings (addresses above 0xC0000000 or whatever
369 * PAGE_OFFSET is set to) haven't changed. But Linux calls 389 * PAGE_OFFSET is set to) haven't changed. But Linux calls
@@ -382,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
382 if (*ax > 0x80000008) 402 if (*ax > 0x80000008)
383 *ax = 0x80000008; 403 *ax = 0x80000008;
384 break; 404 break;
405 case 0x80000001:
406 /* Here we should fix nx cap depending on host. */
407 /* For this version of PAE, we just clear NX bit. */
408 *dx &= ~(1 << 20);
409 break;
385 } 410 }
386} 411}
387 412
@@ -515,25 +540,52 @@ static void lguest_write_cr4(unsigned long val)
515static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 540static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
516 pte_t *ptep) 541 pte_t *ptep)
517{ 542{
543#ifdef CONFIG_X86_PAE
544 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
545 ptep->pte_low, ptep->pte_high);
546#else
518 lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); 547 lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
548#endif
519} 549}
520 550
521static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 551static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
522 pte_t *ptep, pte_t pteval) 552 pte_t *ptep, pte_t pteval)
523{ 553{
524 *ptep = pteval; 554 native_set_pte(ptep, pteval);
525 lguest_pte_update(mm, addr, ptep); 555 lguest_pte_update(mm, addr, ptep);
526} 556}
527 557
528/* The Guest calls this to set a top-level entry. Again, we set the entry then 558/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
529 * tell the Host which top-level page we changed, and the index of the entry we 559 * to set a middle-level entry when PAE is activated.
530 * changed. */ 560 * Again, we set the entry then tell the Host which page we changed,
561 * and the index of the entry we changed. */
562#ifdef CONFIG_X86_PAE
563static void lguest_set_pud(pud_t *pudp, pud_t pudval)
564{
565 native_set_pud(pudp, pudval);
566
567 /* 32 bytes aligned pdpt address and the index. */
568 lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
569 (__pa(pudp) & 0x1F) / sizeof(pud_t));
570}
571
531static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 572static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
532{ 573{
533 *pmdp = pmdval; 574 native_set_pmd(pmdp, pmdval);
534 lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, 575 lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
535 (__pa(pmdp) & (PAGE_SIZE - 1)) / 4); 576 (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
577}
578#else
579
580/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
581 * activated. */
582static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
583{
584 native_set_pmd(pmdp, pmdval);
585 lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
586 (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
536} 587}
588#endif
537 589
538/* There are a couple of legacy places where the kernel sets a PTE, but we 590/* There are a couple of legacy places where the kernel sets a PTE, but we
539 * don't know the top level any more. This is useless for us, since we don't 591 * don't know the top level any more. This is useless for us, since we don't
@@ -546,11 +598,31 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
546 * which brings boot back to 0.25 seconds. */ 598 * which brings boot back to 0.25 seconds. */
547static void lguest_set_pte(pte_t *ptep, pte_t pteval) 599static void lguest_set_pte(pte_t *ptep, pte_t pteval)
548{ 600{
549 *ptep = pteval; 601 native_set_pte(ptep, pteval);
602 if (cr3_changed)
603 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
604}
605
606#ifdef CONFIG_X86_PAE
607static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
608{
609 native_set_pte_atomic(ptep, pte);
550 if (cr3_changed) 610 if (cr3_changed)
551 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 611 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
552} 612}
553 613
614void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
615{
616 native_pte_clear(mm, addr, ptep);
617 lguest_pte_update(mm, addr, ptep);
618}
619
620void lguest_pmd_clear(pmd_t *pmdp)
621{
622 lguest_set_pmd(pmdp, __pmd(0));
623}
624#endif
625
554/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 626/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
555 * native page table operations. On native hardware you can set a new page 627 * native page table operations. On native hardware you can set a new page
556 * table entry whenever you want, but if you want to remove one you have to do 628 * table entry whenever you want, but if you want to remove one you have to do
@@ -622,13 +694,12 @@ static void __init lguest_init_IRQ(void)
622{ 694{
623 unsigned int i; 695 unsigned int i;
624 696
625 for (i = 0; i < LGUEST_IRQS; i++) { 697 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
626 int vector = FIRST_EXTERNAL_VECTOR + i;
627 /* Some systems map "vectors" to interrupts weirdly. Lguest has 698 /* Some systems map "vectors" to interrupts weirdly. Lguest has
628 * a straightforward 1 to 1 mapping, so force that here. */ 699 * a straightforward 1 to 1 mapping, so force that here. */
629 __get_cpu_var(vector_irq)[vector] = i; 700 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
630 if (vector != SYSCALL_VECTOR) 701 if (i != SYSCALL_VECTOR)
631 set_intr_gate(vector, interrupt[i]); 702 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
632 } 703 }
633 /* This call is required to set up for 4k stacks, where we have 704 /* This call is required to set up for 4k stacks, where we have
634 * separate stacks for hard and soft interrupts. */ 705 * separate stacks for hard and soft interrupts. */
@@ -637,7 +708,7 @@ static void __init lguest_init_IRQ(void)
637 708
638void lguest_setup_irq(unsigned int irq) 709void lguest_setup_irq(unsigned int irq)
639{ 710{
640 irq_to_desc_alloc_cpu(irq, 0); 711 irq_to_desc_alloc_node(irq, 0);
641 set_irq_chip_and_handler_name(irq, &lguest_irq_controller, 712 set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
642 handle_level_irq, "level"); 713 handle_level_irq, "level");
643} 714}
@@ -967,10 +1038,10 @@ static void lguest_restart(char *reason)
967 * 1038 *
968 * Our current solution is to allow the paravirt back end to optionally patch 1039 * Our current solution is to allow the paravirt back end to optionally patch
969 * over the indirect calls to replace them with something more efficient. We 1040 * over the indirect calls to replace them with something more efficient. We
970 * patch the four most commonly called functions: disable interrupts, enable 1041 * patch two of the simplest of the most commonly called functions: disable
971 * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 1042 * interrupts and save interrupts. We usually have 6 or 10 bytes to patch
972 * bytes to patch into: the Guest versions of these operations are small enough 1043 * into: the Guest versions of these operations are small enough that we can
973 * that we can fit comfortably. 1044 * fit comfortably.
974 * 1045 *
975 * First we need assembly templates of each of the patchable Guest operations, 1046 * First we need assembly templates of each of the patchable Guest operations,
976 * and these are in i386_head.S. */ 1047 * and these are in i386_head.S. */
@@ -981,8 +1052,6 @@ static const struct lguest_insns
981 const char *start, *end; 1052 const char *start, *end;
982} lguest_insns[] = { 1053} lguest_insns[] = {
983 [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, 1054 [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
984 [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
985 [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
986 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, 1055 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
987}; 1056};
988 1057
@@ -1020,6 +1089,7 @@ __init void lguest_init(void)
1020 pv_info.name = "lguest"; 1089 pv_info.name = "lguest";
1021 pv_info.paravirt_enabled = 1; 1090 pv_info.paravirt_enabled = 1;
1022 pv_info.kernel_rpl = 1; 1091 pv_info.kernel_rpl = 1;
1092 pv_info.shared_kernel_pmd = 1;
1023 1093
1024 /* We set up all the lguest overrides for sensitive operations. These 1094 /* We set up all the lguest overrides for sensitive operations. These
1025 * are detailed with the operations themselves. */ 1095 * are detailed with the operations themselves. */
@@ -1027,9 +1097,9 @@ __init void lguest_init(void)
1027 /* interrupt-related operations */ 1097 /* interrupt-related operations */
1028 pv_irq_ops.init_IRQ = lguest_init_IRQ; 1098 pv_irq_ops.init_IRQ = lguest_init_IRQ;
1029 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1099 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
1030 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl); 1100 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
1031 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); 1101 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
1032 pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable); 1102 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
1033 pv_irq_ops.safe_halt = lguest_safe_halt; 1103 pv_irq_ops.safe_halt = lguest_safe_halt;
1034 1104
1035 /* init-time operations */ 1105 /* init-time operations */
@@ -1054,8 +1124,8 @@ __init void lguest_init(void)
1054 pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; 1124 pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
1055 pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; 1125 pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
1056 pv_cpu_ops.wbinvd = lguest_wbinvd; 1126 pv_cpu_ops.wbinvd = lguest_wbinvd;
1057 pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; 1127 pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
1058 pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode; 1128 pv_cpu_ops.end_context_switch = lguest_end_context_switch;
1059 1129
1060 /* pagetable management */ 1130 /* pagetable management */
1061 pv_mmu_ops.write_cr3 = lguest_write_cr3; 1131 pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@ -1065,10 +1135,16 @@ __init void lguest_init(void)
1065 pv_mmu_ops.set_pte = lguest_set_pte; 1135 pv_mmu_ops.set_pte = lguest_set_pte;
1066 pv_mmu_ops.set_pte_at = lguest_set_pte_at; 1136 pv_mmu_ops.set_pte_at = lguest_set_pte_at;
1067 pv_mmu_ops.set_pmd = lguest_set_pmd; 1137 pv_mmu_ops.set_pmd = lguest_set_pmd;
1138#ifdef CONFIG_X86_PAE
1139 pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
1140 pv_mmu_ops.pte_clear = lguest_pte_clear;
1141 pv_mmu_ops.pmd_clear = lguest_pmd_clear;
1142 pv_mmu_ops.set_pud = lguest_set_pud;
1143#endif
1068 pv_mmu_ops.read_cr2 = lguest_read_cr2; 1144 pv_mmu_ops.read_cr2 = lguest_read_cr2;
1069 pv_mmu_ops.read_cr3 = lguest_read_cr3; 1145 pv_mmu_ops.read_cr3 = lguest_read_cr3;
1070 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; 1146 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
1071 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode; 1147 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
1072 pv_mmu_ops.pte_update = lguest_pte_update; 1148 pv_mmu_ops.pte_update = lguest_pte_update;
1073 pv_mmu_ops.pte_update_defer = lguest_pte_update; 1149 pv_mmu_ops.pte_update_defer = lguest_pte_update;
1074 1150
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index f79541989471..a9c8cfe61cd4 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -46,10 +46,64 @@ ENTRY(lguest_entry)
46 .globl lgstart_##name; .globl lgend_##name 46 .globl lgstart_##name; .globl lgend_##name
47 47
48LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) 48LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
49LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
50LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
51LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 49LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
52/*:*/ 50
51/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
52 * matter for save_fl and irq_disable later). If we write our routines
53 * carefully in assembler, we can avoid clobbering any registers and avoid
54 * jumping through the wrapper functions.
55 *
56 * I skipped over our first piece of assembler, but this one is worth studying
57 * in a bit more detail so I'll describe in easy stages. First, the routine
58 * to enable interrupts: */
59ENTRY(lg_irq_enable)
60 /* The reverse of irq_disable, this sets lguest_data.irq_enabled to
61 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
62 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
63 /* But now we need to check if the Host wants to know: there might have
64 * been interrupts waiting to be delivered, in which case it will have
65 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
66 * jump to send_interrupts, otherwise we're done. */
67 testl $0, lguest_data+LGUEST_DATA_irq_pending
68 jnz send_interrupts
69 /* One cool thing about x86 is that you can do many things without using
70 * a register. In this case, the normal path hasn't needed to save or
71 * restore any registers at all! */
72 ret
73send_interrupts:
74 /* OK, now we need a register: eax is used for the hypercall number,
75 * which is LHCALL_SEND_INTERRUPTS.
76 *
77 * We used not to bother with this pending detection at all, which was
78 * much simpler. Sooner or later the Host would realize it had to
79 * send us an interrupt. But that turns out to make performance 7
80 * times worse on a simple tcp benchmark. So now we do this the hard
81 * way. */
82 pushl %eax
83 movl $LHCALL_SEND_INTERRUPTS, %eax
84 /* This is a vmcall instruction (same thing that KVM uses). Older
85 * assembler versions might not know the "vmcall" instruction, so we
86 * create one manually here. */
87 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
88 popl %eax
89 ret
90
91/* Finally, the "popf" or "restore flags" routine. The %eax register holds the
92 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
93 * enabling interrupts again, if it's 0 we're leaving them off. */
94ENTRY(lg_restore_fl)
95 /* This is just "lguest_data.irq_enabled = flags;" */
96 movl %eax, lguest_data+LGUEST_DATA_irq_enabled
97 /* Now, if the %eax value has enabled interrupts and
98 * lguest_data.irq_pending is set, we want to tell the Host so it can
99 * deliver any outstanding interrupts. Fortunately, both values will
100 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
101 * instruction will AND them together for us. If both are set, we
102 * jump to send_interrupts. */
103 testl lguest_data+LGUEST_DATA_irq_pending, %eax
104 jnz send_interrupts
105 /* Again, the normal path has used no extra registers. Clever, huh? */
106 ret
53 107
54/* These demark the EIP range where host should never deliver interrupts. */ 108/* These demark the EIP range where host should never deliver interrupts. */
55.global lguest_noirq_start 109.global lguest_noirq_start
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 55e11aa6d66c..f9d35632666b 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,7 +2,7 @@
2# Makefile for x86 specific library files. 2# Makefile for x86 specific library files.
3# 3#
4 4
5obj-$(CONFIG_SMP) := msr-on-cpu.o 5obj-$(CONFIG_SMP) := msr.o
6 6
7lib-y := delay.o 7lib-y := delay.o
8lib-y += thunk_$(BITS).o 8lib-y += thunk_$(BITS).o
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
deleted file mode 100644
index 321cf720dbb6..000000000000
--- a/arch/x86/lib/msr-on-cpu.c
+++ /dev/null
@@ -1,97 +0,0 @@
1#include <linux/module.h>
2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h>
5
6struct msr_info {
7 u32 msr_no;
8 u32 l, h;
9 int err;
10};
11
12static void __rdmsr_on_cpu(void *info)
13{
14 struct msr_info *rv = info;
15
16 rdmsr(rv->msr_no, rv->l, rv->h);
17}
18
19static void __wrmsr_on_cpu(void *info)
20{
21 struct msr_info *rv = info;
22
23 wrmsr(rv->msr_no, rv->l, rv->h);
24}
25
26int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
27{
28 int err;
29 struct msr_info rv;
30
31 rv.msr_no = msr_no;
32 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
33 *l = rv.l;
34 *h = rv.h;
35
36 return err;
37}
38
39int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
40{
41 int err;
42 struct msr_info rv;
43
44 rv.msr_no = msr_no;
45 rv.l = l;
46 rv.h = h;
47 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
48
49 return err;
50}
51
52/* These "safe" variants are slower and should be used when the target MSR
53 may not actually exist. */
54static void __rdmsr_safe_on_cpu(void *info)
55{
56 struct msr_info *rv = info;
57
58 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
59}
60
61static void __wrmsr_safe_on_cpu(void *info)
62{
63 struct msr_info *rv = info;
64
65 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
66}
67
68int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
69{
70 int err;
71 struct msr_info rv;
72
73 rv.msr_no = msr_no;
74 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
75 *l = rv.l;
76 *h = rv.h;
77
78 return err ? err : rv.err;
79}
80
81int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
82{
83 int err;
84 struct msr_info rv;
85
86 rv.msr_no = msr_no;
87 rv.l = l;
88 rv.h = h;
89 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
90
91 return err ? err : rv.err;
92}
93
94EXPORT_SYMBOL(rdmsr_on_cpu);
95EXPORT_SYMBOL(wrmsr_on_cpu);
96EXPORT_SYMBOL(rdmsr_safe_on_cpu);
97EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
new file mode 100644
index 000000000000..1440b9c0547e
--- /dev/null
+++ b/arch/x86/lib/msr.c
@@ -0,0 +1,183 @@
1#include <linux/module.h>
2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h>
5
6struct msr_info {
7 u32 msr_no;
8 struct msr reg;
9 struct msr *msrs;
10 int off;
11 int err;
12};
13
14static void __rdmsr_on_cpu(void *info)
15{
16 struct msr_info *rv = info;
17 struct msr *reg;
18 int this_cpu = raw_smp_processor_id();
19
20 if (rv->msrs)
21 reg = &rv->msrs[this_cpu - rv->off];
22 else
23 reg = &rv->reg;
24
25 rdmsr(rv->msr_no, reg->l, reg->h);
26}
27
28static void __wrmsr_on_cpu(void *info)
29{
30 struct msr_info *rv = info;
31 struct msr *reg;
32 int this_cpu = raw_smp_processor_id();
33
34 if (rv->msrs)
35 reg = &rv->msrs[this_cpu - rv->off];
36 else
37 reg = &rv->reg;
38
39 wrmsr(rv->msr_no, reg->l, reg->h);
40}
41
42int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
43{
44 int err;
45 struct msr_info rv;
46
47 memset(&rv, 0, sizeof(rv));
48
49 rv.msr_no = msr_no;
50 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
51 *l = rv.reg.l;
52 *h = rv.reg.h;
53
54 return err;
55}
56EXPORT_SYMBOL(rdmsr_on_cpu);
57
58int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
59{
60 int err;
61 struct msr_info rv;
62
63 memset(&rv, 0, sizeof(rv));
64
65 rv.msr_no = msr_no;
66 rv.reg.l = l;
67 rv.reg.h = h;
68 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
69
70 return err;
71}
72EXPORT_SYMBOL(wrmsr_on_cpu);
73
74/* rdmsr on a bunch of CPUs
75 *
76 * @mask: which CPUs
77 * @msr_no: which MSR
78 * @msrs: array of MSR values
79 *
80 */
81void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
82{
83 struct msr_info rv;
84 int this_cpu;
85
86 memset(&rv, 0, sizeof(rv));
87
88 rv.off = cpumask_first(mask);
89 rv.msrs = msrs;
90 rv.msr_no = msr_no;
91
92 preempt_disable();
93 /*
94 * FIXME: handle the CPU we're executing on separately for now until
95 * smp_call_function_many has been fixed to not skip it.
96 */
97 this_cpu = raw_smp_processor_id();
98 smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1);
99
100 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
101 preempt_enable();
102}
103EXPORT_SYMBOL(rdmsr_on_cpus);
104
105/*
106 * wrmsr on a bunch of CPUs
107 *
108 * @mask: which CPUs
109 * @msr_no: which MSR
110 * @msrs: array of MSR values
111 *
112 */
113void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
114{
115 struct msr_info rv;
116 int this_cpu;
117
118 memset(&rv, 0, sizeof(rv));
119
120 rv.off = cpumask_first(mask);
121 rv.msrs = msrs;
122 rv.msr_no = msr_no;
123
124 preempt_disable();
125 /*
126 * FIXME: handle the CPU we're executing on separately for now until
127 * smp_call_function_many has been fixed to not skip it.
128 */
129 this_cpu = raw_smp_processor_id();
130 smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1);
131
132 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
133 preempt_enable();
134}
135EXPORT_SYMBOL(wrmsr_on_cpus);
136
137/* These "safe" variants are slower and should be used when the target MSR
138 may not actually exist. */
139static void __rdmsr_safe_on_cpu(void *info)
140{
141 struct msr_info *rv = info;
142
143 rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
144}
145
146static void __wrmsr_safe_on_cpu(void *info)
147{
148 struct msr_info *rv = info;
149
150 rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
151}
152
153int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
154{
155 int err;
156 struct msr_info rv;
157
158 memset(&rv, 0, sizeof(rv));
159
160 rv.msr_no = msr_no;
161 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
162 *l = rv.reg.l;
163 *h = rv.reg.h;
164
165 return err ? err : rv.err;
166}
167EXPORT_SYMBOL(rdmsr_safe_on_cpu);
168
169int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
170{
171 int err;
172 struct msr_info rv;
173
174 memset(&rv, 0, sizeof(rv));
175
176 rv.msr_no = msr_no;
177 rv.reg.l = l;
178 rv.reg.h = h;
179 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
180
181 return err ? err : rv.err;
182}
183EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index ec13cb5f17ed..b7c2849ffb66 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -127,7 +127,7 @@ EXPORT_SYMBOL(__strnlen_user);
127 127
128long strnlen_user(const char __user *s, long n) 128long strnlen_user(const char __user *s, long n)
129{ 129{
130 if (!access_ok(VERIFY_READ, s, n)) 130 if (!access_ok(VERIFY_READ, s, 1))
131 return 0; 131 return 0;
132 return __strnlen_user(s, n); 132 return __strnlen_user(s, n);
133} 133}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fdd30d08ab52..eefdeee8a871 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
10 10
11obj-$(CONFIG_HIGHMEM) += highmem_32.o 11obj-$(CONFIG_HIGHMEM) += highmem_32.o
12 12
13obj-$(CONFIG_KMEMCHECK) += kmemcheck/
14
13obj-$(CONFIG_MMIOTRACE) += mmiotrace.o 15obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
14mmiotrace-y := kmmio.o pf_in.o mmio-mod.o 16mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
15obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 17obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e7277cbcfb40..a725b7f760ae 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st,
161 st->current_address >= st->marker[1].start_address) { 161 st->current_address >= st->marker[1].start_address) {
162 const char *unit = units; 162 const char *unit = units;
163 unsigned long delta; 163 unsigned long delta;
164 int width = sizeof(unsigned long) * 2;
164 165
165 /* 166 /*
166 * Now print the actual finished series 167 * Now print the actual finished series
167 */ 168 */
168 seq_printf(m, "0x%p-0x%p ", 169 seq_printf(m, "0x%0*lx-0x%0*lx ",
169 (void *)st->start_address, 170 width, st->start_address,
170 (void *)st->current_address); 171 width, st->current_address);
171 172
172 delta = (st->current_address - st->start_address) >> 10; 173 delta = (st->current_address - st->start_address) >> 10;
173 while (!(delta & 1023) && unit[1]) { 174 while (!(delta & 1023) && unit[1]) {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa0..78a5fff857be 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -3,40 +3,18 @@
3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. 3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar 4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
5 */ 5 */
6#include <linux/interrupt.h> 6#include <linux/magic.h> /* STACK_END_MAGIC */
7#include <linux/mmiotrace.h> 7#include <linux/sched.h> /* test_thread_flag(), ... */
8#include <linux/bootmem.h> 8#include <linux/kdebug.h> /* oops_begin/end, ... */
9#include <linux/compiler.h> 9#include <linux/module.h> /* search_exception_table */
10#include <linux/highmem.h> 10#include <linux/bootmem.h> /* max_low_pfn */
11#include <linux/kprobes.h> 11#include <linux/kprobes.h> /* __kprobes, ... */
12#include <linux/uaccess.h> 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/vmalloc.h> 13#include <linux/perf_counter.h> /* perf_swcounter_event */
14#include <linux/vt_kern.h> 14
15#include <linux/signal.h> 15#include <asm/traps.h> /* dotraplinkage, ... */
16#include <linux/kernel.h> 16#include <asm/pgalloc.h> /* pgd_*(), ... */
17#include <linux/ptrace.h> 17#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
18#include <linux/string.h>
19#include <linux/module.h>
20#include <linux/kdebug.h>
21#include <linux/errno.h>
22#include <linux/magic.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/init.h>
26#include <linux/mman.h>
27#include <linux/tty.h>
28#include <linux/smp.h>
29#include <linux/mm.h>
30
31#include <asm-generic/sections.h>
32
33#include <asm/tlbflush.h>
34#include <asm/pgalloc.h>
35#include <asm/segment.h>
36#include <asm/system.h>
37#include <asm/proto.h>
38#include <asm/traps.h>
39#include <asm/desc.h>
40 18
41/* 19/*
42 * Page fault error code bits: 20 * Page fault error code bits:
@@ -225,12 +203,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
225 if (!pmd_present(*pmd_k)) 203 if (!pmd_present(*pmd_k))
226 return NULL; 204 return NULL;
227 205
228 if (!pmd_present(*pmd)) { 206 if (!pmd_present(*pmd))
229 set_pmd(pmd, *pmd_k); 207 set_pmd(pmd, *pmd_k);
230 arch_flush_lazy_mmu_mode(); 208 else
231 } else {
232 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); 209 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
233 }
234 210
235 return pmd_k; 211 return pmd_k;
236} 212}
@@ -538,8 +514,6 @@ bad:
538static int is_errata93(struct pt_regs *regs, unsigned long address) 514static int is_errata93(struct pt_regs *regs, unsigned long address)
539{ 515{
540#ifdef CONFIG_X86_64 516#ifdef CONFIG_X86_64
541 static int once;
542
543 if (address != regs->ip) 517 if (address != regs->ip)
544 return 0; 518 return 0;
545 519
@@ -549,10 +523,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
549 address |= 0xffffffffUL << 32; 523 address |= 0xffffffffUL << 32;
550 if ((address >= (u64)_stext && address <= (u64)_etext) || 524 if ((address >= (u64)_stext && address <= (u64)_etext) ||
551 (address >= MODULES_VADDR && address <= MODULES_END)) { 525 (address >= MODULES_VADDR && address <= MODULES_END)) {
552 if (!once) { 526 printk_once(errata93_warning);
553 printk(errata93_warning);
554 once = 1;
555 }
556 regs->ip = address; 527 regs->ip = address;
557 return 1; 528 return 1;
558 } 529 }
@@ -981,11 +952,17 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
981 tsk = current; 952 tsk = current;
982 mm = tsk->mm; 953 mm = tsk->mm;
983 954
984 prefetchw(&mm->mmap_sem);
985
986 /* Get the faulting address: */ 955 /* Get the faulting address: */
987 address = read_cr2(); 956 address = read_cr2();
988 957
958 /*
959 * Detect and handle instructions that would cause a page fault for
960 * both a tracked kernel page and a userspace page.
961 */
962 if (kmemcheck_active(regs))
963 kmemcheck_hide(regs);
964 prefetchw(&mm->mmap_sem);
965
989 if (unlikely(kmmio_fault(regs, address))) 966 if (unlikely(kmmio_fault(regs, address)))
990 return; 967 return;
991 968
@@ -1003,9 +980,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1003 * protection error (error_code & 9) == 0. 980 * protection error (error_code & 9) == 0.
1004 */ 981 */
1005 if (unlikely(fault_in_kernel_space(address))) { 982 if (unlikely(fault_in_kernel_space(address))) {
1006 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && 983 if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
1007 vmalloc_fault(address) >= 0) 984 if (vmalloc_fault(address) >= 0)
1008 return; 985 return;
986
987 if (kmemcheck_fault(regs, address, error_code))
988 return;
989 }
1009 990
1010 /* Can handle a stale RO->RW TLB: */ 991 /* Can handle a stale RO->RW TLB: */
1011 if (spurious_fault(error_code, address)) 992 if (spurious_fault(error_code, address))
@@ -1044,6 +1025,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1044 if (unlikely(error_code & PF_RSVD)) 1025 if (unlikely(error_code & PF_RSVD))
1045 pgtable_bad(regs, error_code, address); 1026 pgtable_bad(regs, error_code, address);
1046 1027
1028 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
1029
1047 /* 1030 /*
1048 * If we're in an interrupt, have no user context or are running 1031 * If we're in an interrupt, have no user context or are running
1049 * in an atomic region then we must not take the fault: 1032 * in an atomic region then we must not take the fault:
@@ -1130,17 +1113,22 @@ good_area:
1130 * make sure we exit gracefully rather than endlessly redo 1113 * make sure we exit gracefully rather than endlessly redo
1131 * the fault: 1114 * the fault:
1132 */ 1115 */
1133 fault = handle_mm_fault(mm, vma, address, write); 1116 fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
1134 1117
1135 if (unlikely(fault & VM_FAULT_ERROR)) { 1118 if (unlikely(fault & VM_FAULT_ERROR)) {
1136 mm_fault_error(regs, error_code, address, fault); 1119 mm_fault_error(regs, error_code, address, fault);
1137 return; 1120 return;
1138 } 1121 }
1139 1122
1140 if (fault & VM_FAULT_MAJOR) 1123 if (fault & VM_FAULT_MAJOR) {
1141 tsk->maj_flt++; 1124 tsk->maj_flt++;
1142 else 1125 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1126 regs, address);
1127 } else {
1143 tsk->min_flt++; 1128 tsk->min_flt++;
1129 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1130 regs, address);
1131 }
1144 1132
1145 check_v8086_mode(regs, address, tsk); 1133 check_v8086_mode(regs, address, tsk);
1146 1134
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 6340cef6798a..71da1bca13cb 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -14,7 +14,7 @@
14static inline pte_t gup_get_pte(pte_t *ptep) 14static inline pte_t gup_get_pte(pte_t *ptep)
15{ 15{
16#ifndef CONFIG_X86_PAE 16#ifndef CONFIG_X86_PAE
17 return *ptep; 17 return ACCESS_ONCE(*ptep);
18#else 18#else
19 /* 19 /*
20 * With get_user_pages_fast, we walk down the pagetables without taking 20 * With get_user_pages_fast, we walk down the pagetables without taking
@@ -219,6 +219,62 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
219 return 1; 219 return 1;
220} 220}
221 221
222/*
223 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
224 * back to the regular GUP.
225 */
226int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
227 struct page **pages)
228{
229 struct mm_struct *mm = current->mm;
230 unsigned long addr, len, end;
231 unsigned long next;
232 unsigned long flags;
233 pgd_t *pgdp;
234 int nr = 0;
235
236 start &= PAGE_MASK;
237 addr = start;
238 len = (unsigned long) nr_pages << PAGE_SHIFT;
239 end = start + len;
240 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
241 (void __user *)start, len)))
242 return 0;
243
244 /*
245 * XXX: batch / limit 'nr', to avoid large irq off latency
246 * needs some instrumenting to determine the common sizes used by
247 * important workloads (eg. DB2), and whether limiting the batch size
248 * will decrease performance.
249 *
250 * It seems like we're in the clear for the moment. Direct-IO is
251 * the main guy that batches up lots of get_user_pages, and even
252 * they are limited to 64-at-a-time which is not so many.
253 */
254 /*
255 * This doesn't prevent pagetable teardown, but does prevent
256 * the pagetables and pages from being freed on x86.
257 *
258 * So long as we atomically load page table pointers versus teardown
259 * (which we do on x86, with the above PAE exception), we can follow the
260 * address down to the the page and take a ref on it.
261 */
262 local_irq_save(flags);
263 pgdp = pgd_offset(mm, addr);
264 do {
265 pgd_t pgd = *pgdp;
266
267 next = pgd_addr_end(addr, end);
268 if (pgd_none(pgd))
269 break;
270 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
271 break;
272 } while (pgdp++, addr = next, addr != end);
273 local_irq_restore(flags);
274
275 return nr;
276}
277
222/** 278/**
223 * get_user_pages_fast() - pin user pages in memory 279 * get_user_pages_fast() - pin user pages in memory
224 * @start: starting user address 280 * @start: starting user address
@@ -247,11 +303,16 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
247 start &= PAGE_MASK; 303 start &= PAGE_MASK;
248 addr = start; 304 addr = start;
249 len = (unsigned long) nr_pages << PAGE_SHIFT; 305 len = (unsigned long) nr_pages << PAGE_SHIFT;
306
250 end = start + len; 307 end = start + len;
251 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, 308 if (end < start)
252 (void __user *)start, len)))
253 goto slow_irqon; 309 goto slow_irqon;
254 310
311#ifdef CONFIG_X86_64
312 if (end >> __VIRTUAL_MASK_SHIFT)
313 goto slow_irqon;
314#endif
315
255 /* 316 /*
256 * XXX: batch / limit 'nr', to avoid large irq off latency 317 * XXX: batch / limit 'nr', to avoid large irq off latency
257 * needs some instrumenting to determine the common sizes used by 318 * needs some instrumenting to determine the common sizes used by
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 8126e8d1a2a4..58f621e81919 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -44,7 +44,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
45 BUG_ON(!pte_none(*(kmap_pte-idx))); 45 BUG_ON(!pte_none(*(kmap_pte-idx)));
46 set_pte(kmap_pte-idx, mk_pte(page, prot)); 46 set_pte(kmap_pte-idx, mk_pte(page, prot));
47 arch_flush_lazy_mmu_mode();
48 47
49 return (void *)vaddr; 48 return (void *)vaddr;
50} 49}
@@ -74,7 +73,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
74#endif 73#endif
75 } 74 }
76 75
77 arch_flush_lazy_mmu_mode();
78 pagefault_enable(); 76 pagefault_enable();
79} 77}
80 78
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ae4f7b5d7104..f53b57e4086f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,3 +1,4 @@
1#include <linux/initrd.h>
1#include <linux/ioport.h> 2#include <linux/ioport.h>
2#include <linux/swap.h> 3#include <linux/swap.h>
3 4
@@ -10,6 +11,9 @@
10#include <asm/setup.h> 11#include <asm/setup.h>
11#include <asm/system.h> 12#include <asm/system.h>
12#include <asm/tlbflush.h> 13#include <asm/tlbflush.h>
14#include <asm/tlb.h>
15
16DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
13 17
14unsigned long __initdata e820_table_start; 18unsigned long __initdata e820_table_start;
15unsigned long __meminitdata e820_table_end; 19unsigned long __meminitdata e820_table_end;
@@ -23,6 +27,69 @@ int direct_gbpages
23#endif 27#endif
24; 28;
25 29
30int nx_enabled;
31
32#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
33static int disable_nx __cpuinitdata;
34
35/*
36 * noexec = on|off
37 *
38 * Control non-executable mappings for processes.
39 *
40 * on Enable
41 * off Disable
42 */
43static int __init noexec_setup(char *str)
44{
45 if (!str)
46 return -EINVAL;
47 if (!strncmp(str, "on", 2)) {
48 __supported_pte_mask |= _PAGE_NX;
49 disable_nx = 0;
50 } else if (!strncmp(str, "off", 3)) {
51 disable_nx = 1;
52 __supported_pte_mask &= ~_PAGE_NX;
53 }
54 return 0;
55}
56early_param("noexec", noexec_setup);
57#endif
58
59#ifdef CONFIG_X86_PAE
60static void __init set_nx(void)
61{
62 unsigned int v[4], l, h;
63
64 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
65 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
66
67 if ((v[3] & (1 << 20)) && !disable_nx) {
68 rdmsr(MSR_EFER, l, h);
69 l |= EFER_NX;
70 wrmsr(MSR_EFER, l, h);
71 nx_enabled = 1;
72 __supported_pte_mask |= _PAGE_NX;
73 }
74 }
75}
76#else
77static inline void set_nx(void)
78{
79}
80#endif
81
82#ifdef CONFIG_X86_64
83void __cpuinit check_efer(void)
84{
85 unsigned long efer;
86
87 rdmsrl(MSR_EFER, efer);
88 if (!(efer & EFER_NX) || disable_nx)
89 __supported_pte_mask &= ~_PAGE_NX;
90}
91#endif
92
26static void __init find_early_table_space(unsigned long end, int use_pse, 93static void __init find_early_table_space(unsigned long end, int use_pse,
27 int use_gbpages) 94 int use_gbpages)
28{ 95{
@@ -66,12 +133,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
66 */ 133 */
67#ifdef CONFIG_X86_32 134#ifdef CONFIG_X86_32
68 start = 0x7000; 135 start = 0x7000;
69 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, 136#else
70 tables, PAGE_SIZE);
71#else /* CONFIG_X86_64 */
72 start = 0x8000; 137 start = 0x8000;
73 e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
74#endif 138#endif
139 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
140 tables, PAGE_SIZE);
75 if (e820_table_start == -1UL) 141 if (e820_table_start == -1UL)
76 panic("Cannot find space for the kernel page tables"); 142 panic("Cannot find space for the kernel page tables");
77 143
@@ -147,7 +213,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
147 if (!after_bootmem) 213 if (!after_bootmem)
148 init_gbpages(); 214 init_gbpages();
149 215
150#ifdef CONFIG_DEBUG_PAGEALLOC 216#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
151 /* 217 /*
152 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 218 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
153 * This will simplify cpa(), which otherwise needs to support splitting 219 * This will simplify cpa(), which otherwise needs to support splitting
@@ -159,12 +225,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
159 use_gbpages = direct_gbpages; 225 use_gbpages = direct_gbpages;
160#endif 226#endif
161 227
162#ifdef CONFIG_X86_32
163#ifdef CONFIG_X86_PAE
164 set_nx(); 228 set_nx();
165 if (nx_enabled) 229 if (nx_enabled)
166 printk(KERN_INFO "NX (Execute Disable) protection: active\n"); 230 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
167#endif
168 231
169 /* Enable PSE if available */ 232 /* Enable PSE if available */
170 if (cpu_has_pse) 233 if (cpu_has_pse)
@@ -175,7 +238,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
175 set_in_cr4(X86_CR4_PGE); 238 set_in_cr4(X86_CR4_PGE);
176 __supported_pte_mask |= _PAGE_GLOBAL; 239 __supported_pte_mask |= _PAGE_GLOBAL;
177 } 240 }
178#endif
179 241
180 if (use_gbpages) 242 if (use_gbpages)
181 page_size_mask |= 1 << PG_LEVEL_1G; 243 page_size_mask |= 1 << PG_LEVEL_1G;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 749559ed80f5..3cd7711bb949 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,12 +49,9 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/page_types.h>
52#include <asm/init.h> 53#include <asm/init.h>
53 54
54unsigned long max_low_pfn_mapped;
55unsigned long max_pfn_mapped;
56
57DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
58unsigned long highstart_pfn, highend_pfn; 55unsigned long highstart_pfn, highend_pfn;
59 56
60static noinline int do_test_wp_bit(void); 57static noinline int do_test_wp_bit(void);
@@ -114,7 +111,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
114 pte_t *page_table = NULL; 111 pte_t *page_table = NULL;
115 112
116 if (after_bootmem) { 113 if (after_bootmem) {
117#ifdef CONFIG_DEBUG_PAGEALLOC 114#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
118 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 115 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
119#endif 116#endif
120 if (!page_table) 117 if (!page_table)
@@ -567,7 +564,7 @@ static inline void save_pg_dir(void)
567} 564}
568#endif /* !CONFIG_ACPI_SLEEP */ 565#endif /* !CONFIG_ACPI_SLEEP */
569 566
570void zap_low_mappings(void) 567void zap_low_mappings(bool early)
571{ 568{
572 int i; 569 int i;
573 570
@@ -584,64 +581,16 @@ void zap_low_mappings(void)
584 set_pgd(swapper_pg_dir+i, __pgd(0)); 581 set_pgd(swapper_pg_dir+i, __pgd(0));
585#endif 582#endif
586 } 583 }
587 flush_tlb_all();
588}
589 584
590int nx_enabled; 585 if (early)
586 __flush_tlb();
587 else
588 flush_tlb_all();
589}
591 590
592pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); 591pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
593EXPORT_SYMBOL_GPL(__supported_pte_mask); 592EXPORT_SYMBOL_GPL(__supported_pte_mask);
594 593
595#ifdef CONFIG_X86_PAE
596
597static int disable_nx __initdata;
598
599/*
600 * noexec = on|off
601 *
602 * Control non executable mappings.
603 *
604 * on Enable
605 * off Disable
606 */
607static int __init noexec_setup(char *str)
608{
609 if (!str || !strcmp(str, "on")) {
610 if (cpu_has_nx) {
611 __supported_pte_mask |= _PAGE_NX;
612 disable_nx = 0;
613 }
614 } else {
615 if (!strcmp(str, "off")) {
616 disable_nx = 1;
617 __supported_pte_mask &= ~_PAGE_NX;
618 } else {
619 return -EINVAL;
620 }
621 }
622
623 return 0;
624}
625early_param("noexec", noexec_setup);
626
627void __init set_nx(void)
628{
629 unsigned int v[4], l, h;
630
631 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
632 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
633
634 if ((v[3] & (1 << 20)) && !disable_nx) {
635 rdmsr(MSR_EFER, l, h);
636 l |= EFER_NX;
637 wrmsr(MSR_EFER, l, h);
638 nx_enabled = 1;
639 __supported_pte_mask |= _PAGE_NX;
640 }
641 }
642}
643#endif
644
645/* user-defined highmem size */ 594/* user-defined highmem size */
646static unsigned int highmem_pages = -1; 595static unsigned int highmem_pages = -1;
647 596
@@ -761,15 +710,15 @@ void __init initmem_init(unsigned long start_pfn,
761 highstart_pfn = highend_pfn = max_pfn; 710 highstart_pfn = highend_pfn = max_pfn;
762 if (max_pfn > max_low_pfn) 711 if (max_pfn > max_low_pfn)
763 highstart_pfn = max_low_pfn; 712 highstart_pfn = max_low_pfn;
764 memory_present(0, 0, highend_pfn);
765 e820_register_active_regions(0, 0, highend_pfn); 713 e820_register_active_regions(0, 0, highend_pfn);
714 sparse_memory_present_with_active_regions(0);
766 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 715 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
767 pages_to_mb(highend_pfn - highstart_pfn)); 716 pages_to_mb(highend_pfn - highstart_pfn));
768 num_physpages = highend_pfn; 717 num_physpages = highend_pfn;
769 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 718 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
770#else 719#else
771 memory_present(0, 0, max_low_pfn);
772 e820_register_active_regions(0, 0, max_low_pfn); 720 e820_register_active_regions(0, 0, max_low_pfn);
721 sparse_memory_present_with_active_regions(0);
773 num_physpages = max_low_pfn; 722 num_physpages = max_low_pfn;
774 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 723 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
775#endif 724#endif
@@ -1011,7 +960,7 @@ void __init mem_init(void)
1011 test_wp_bit(); 960 test_wp_bit();
1012 961
1013 save_pg_dir(); 962 save_pg_dir();
1014 zap_low_mappings(); 963 zap_low_mappings(true);
1015} 964}
1016 965
1017#ifdef CONFIG_MEMORY_HOTPLUG 966#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1753e8020df6..c4378f4fd4a5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -50,18 +50,8 @@
50#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/init.h> 51#include <asm/init.h>
52 52
53/*
54 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
55 * The direct mapping extends to max_pfn_mapped, so that we can directly access
56 * apertures, ACPI and other tables without having to play with fixmaps.
57 */
58unsigned long max_low_pfn_mapped;
59unsigned long max_pfn_mapped;
60
61static unsigned long dma_reserve __initdata; 53static unsigned long dma_reserve __initdata;
62 54
63DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
64
65static int __init parse_direct_gbpages_off(char *arg) 55static int __init parse_direct_gbpages_off(char *arg)
66{ 56{
67 direct_gbpages = 0; 57 direct_gbpages = 0;
@@ -85,39 +75,6 @@ early_param("gbpages", parse_direct_gbpages_on);
85pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; 75pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
86EXPORT_SYMBOL_GPL(__supported_pte_mask); 76EXPORT_SYMBOL_GPL(__supported_pte_mask);
87 77
88static int disable_nx __cpuinitdata;
89
90/*
91 * noexec=on|off
92 * Control non-executable mappings for 64-bit processes.
93 *
94 * on Enable (default)
95 * off Disable
96 */
97static int __init nonx_setup(char *str)
98{
99 if (!str)
100 return -EINVAL;
101 if (!strncmp(str, "on", 2)) {
102 __supported_pte_mask |= _PAGE_NX;
103 disable_nx = 0;
104 } else if (!strncmp(str, "off", 3)) {
105 disable_nx = 1;
106 __supported_pte_mask &= ~_PAGE_NX;
107 }
108 return 0;
109}
110early_param("noexec", nonx_setup);
111
112void __cpuinit check_efer(void)
113{
114 unsigned long efer;
115
116 rdmsrl(MSR_EFER, efer);
117 if (!(efer & EFER_NX) || disable_nx)
118 __supported_pte_mask &= ~_PAGE_NX;
119}
120
121int force_personality32; 78int force_personality32;
122 79
123/* 80/*
@@ -147,7 +104,7 @@ static __ref void *spp_getpage(void)
147 void *ptr; 104 void *ptr;
148 105
149 if (after_bootmem) 106 if (after_bootmem)
150 ptr = (void *) get_zeroed_page(GFP_ATOMIC); 107 ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
151 else 108 else
152 ptr = alloc_bootmem_pages(PAGE_SIZE); 109 ptr = alloc_bootmem_pages(PAGE_SIZE);
153 110
@@ -324,7 +281,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
324 void *adr; 281 void *adr;
325 282
326 if (after_bootmem) { 283 if (after_bootmem) {
327 adr = (void *)get_zeroed_page(GFP_ATOMIC); 284 adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
328 *phys = __pa(adr); 285 *phys = __pa(adr);
329 286
330 return adr; 287 return adr;
@@ -570,7 +527,7 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
570 return phys_pud_init(pud, addr, end, page_size_mask); 527 return phys_pud_init(pud, addr, end, page_size_mask);
571} 528}
572 529
573unsigned long __init 530unsigned long __meminit
574kernel_physical_mapping_init(unsigned long start, 531kernel_physical_mapping_init(unsigned long start,
575 unsigned long end, 532 unsigned long end,
576 unsigned long page_size_mask) 533 unsigned long page_size_mask)
@@ -628,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
628 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); 585 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
629 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); 586 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
630} 587}
588#endif
631 589
632void __init paging_init(void) 590void __init paging_init(void)
633{ 591{
@@ -638,11 +596,10 @@ void __init paging_init(void)
638 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 596 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
639 max_zone_pfns[ZONE_NORMAL] = max_pfn; 597 max_zone_pfns[ZONE_NORMAL] = max_pfn;
640 598
641 memory_present(0, 0, max_pfn); 599 sparse_memory_present_with_active_regions(MAX_NUMNODES);
642 sparse_init(); 600 sparse_init();
643 free_area_init_nodes(max_zone_pfns); 601 free_area_init_nodes(max_zone_pfns);
644} 602}
645#endif
646 603
647/* 604/*
648 * Memory hotplug specific functions 605 * Memory hotplug specific functions
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 8056545e2d39..fe6f84ca121e 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -82,7 +82,6 @@ iounmap_atomic(void *kvaddr, enum km_type type)
82 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 82 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
83 kpte_clear_flush(kmap_pte-idx, vaddr); 83 kpte_clear_flush(kmap_pte-idx, vaddr);
84 84
85 arch_flush_lazy_mmu_mode();
86 pagefault_enable(); 85 pagefault_enable();
87} 86}
88EXPORT_SYMBOL_GPL(iounmap_atomic); 87EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
new file mode 100644
index 000000000000..520b3bce4095
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/Makefile
@@ -0,0 +1 @@
obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
new file mode 100644
index 000000000000..4901d0dafda6
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -0,0 +1,228 @@
1#include <linux/interrupt.h>
2#include <linux/kdebug.h>
3#include <linux/kmemcheck.h>
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/ptrace.h>
7#include <linux/stacktrace.h>
8#include <linux/string.h>
9
10#include "error.h"
11#include "shadow.h"
12
/* The two classes of event a queued report can describe. */
enum kmemcheck_error_type {
	KMEMCHECK_ERROR_INVALID_ACCESS,
	KMEMCHECK_ERROR_BUG,
};

/* Number of memory/shadow bytes captured around the bad access. */
#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)

/* One queued error report, filled at fault time and printed later. */
struct kmemcheck_error {
	enum kmemcheck_error_type type;

	union {
		/* KMEMCHECK_ERROR_INVALID_ACCESS */
		struct {
			/* Kind of access that caused the error */
			enum kmemcheck_shadow state;
			/* Address and size of the erroneous read */
			unsigned long address;
			unsigned int size;
		};
	};

	/* Register state and stack trace captured at the fault. */
	struct pt_regs regs;
	struct stack_trace trace;
	unsigned long trace_entries[32];

	/* We compress it to a char. */
	unsigned char shadow_copy[SHADOW_COPY_SIZE];
	unsigned char memory_copy[SHADOW_COPY_SIZE];
};
42
43/*
44 * Create a ring queue of errors to output. We can't call printk() directly
45 * from the kmemcheck traps, since this may call the console drivers and
46 * result in a recursive fault.
47 */
48static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
49static unsigned int error_count;
50static unsigned int error_rd;
51static unsigned int error_wr;
52static unsigned int error_missed_count;
53
54static struct kmemcheck_error *error_next_wr(void)
55{
56 struct kmemcheck_error *e;
57
58 if (error_count == ARRAY_SIZE(error_fifo)) {
59 ++error_missed_count;
60 return NULL;
61 }
62
63 e = &error_fifo[error_wr];
64 if (++error_wr == ARRAY_SIZE(error_fifo))
65 error_wr = 0;
66 ++error_count;
67 return e;
68}
69
70static struct kmemcheck_error *error_next_rd(void)
71{
72 struct kmemcheck_error *e;
73
74 if (error_count == 0)
75 return NULL;
76
77 e = &error_fifo[error_rd];
78 if (++error_rd == ARRAY_SIZE(error_fifo))
79 error_rd = 0;
80 --error_count;
81 return e;
82}
83
/*
 * Dequeue one saved report and print it: the kind of bad access, a hex
 * dump of the memory around it, the per-byte shadow state, plus the
 * registers and stack trace captured at fault time.
 */
void kmemcheck_error_recall(void)
{
	static const char *desc[] = {
		[KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
		[KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
		[KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
		[KMEMCHECK_SHADOW_FREED] = "freed",
	};

	/* One-letter codes for the per-byte shadow line printed below. */
	static const char short_desc[] = {
		[KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
		[KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
		[KMEMCHECK_SHADOW_INITIALIZED] = 'i',
		[KMEMCHECK_SHADOW_FREED] = 'f',
	};

	struct kmemcheck_error *e;
	unsigned int i;

	e = error_next_rd();
	if (!e)
		return;

	switch (e->type) {
	case KMEMCHECK_ERROR_INVALID_ACCESS:
		printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read "
			"from %s memory (%p)\n",
			8 * e->size, e->state < ARRAY_SIZE(desc) ?
			desc[e->state] : "(invalid shadow state)",
			(void *) e->address);

		/* Open a KERN_INFO line; the hex bytes are appended bare. */
		printk(KERN_INFO);
		for (i = 0; i < SHADOW_COPY_SIZE; ++i)
			printk("%02x", e->memory_copy[i]);
		printk("\n");

		/* Matching per-byte shadow state line. */
		printk(KERN_INFO);
		for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
			if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
				printk(" %c", short_desc[e->shadow_copy[i]]);
			else
				printk(" ?");
		}
		printk("\n");
		/* Caret pointing at the offending byte within the dump. */
		printk(KERN_INFO "%*c\n", 2 + 2
			* (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
		break;
	case KMEMCHECK_ERROR_BUG:
		printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
		break;
	}

	__show_regs(&e->regs, 1);
	print_stack_trace(&e->trace, 0);
}
139
140static void do_wakeup(unsigned long data)
141{
142 while (error_count > 0)
143 kmemcheck_error_recall();
144
145 if (error_missed_count > 0) {
146 printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
147 "the queue was too small\n", error_missed_count);
148 error_missed_count = 0;
149 }
150}
151
152static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
153
/*
 * Save the context of an error report: record the access, the register
 * state, a stack trace, and a snapshot of the memory and shadow bytes
 * around the faulting address. Printing is deferred to a tasklet since
 * calling printk() from the kmemcheck trap could recurse.
 */
void kmemcheck_error_save(enum kmemcheck_shadow state,
	unsigned long address, unsigned int size, struct pt_regs *regs)
{
	static unsigned long prev_ip;

	struct kmemcheck_error *e;
	void *shadow_copy;
	void *memory_copy;

	/* Don't report several adjacent errors from the same EIP. */
	if (regs->ip == prev_ip)
		return;
	prev_ip = regs->ip;

	e = error_next_wr();
	if (!e)
		return;

	e->type = KMEMCHECK_ERROR_INVALID_ACCESS;

	e->state = state;
	e->address = address;
	e->size = size;

	/* Save regs */
	memcpy(&e->regs, regs, sizeof(*regs));

	/* Save stack trace */
	e->trace.nr_entries = 0;
	e->trace.entries = e->trace_entries;
	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
	e->trace.skip = 0;
	save_stack_trace_bp(&e->trace, regs->bp);

	/* Round address down to nearest 16 bytes */
	shadow_copy = kmemcheck_shadow_lookup(address
		& ~(SHADOW_COPY_SIZE - 1));
	BUG_ON(!shadow_copy);

	memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);

	/* The page must be made present before its bytes can be copied. */
	kmemcheck_show_addr(address);
	memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
	memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
	kmemcheck_hide_addr(address);

	tasklet_hi_schedule_first(&kmemcheck_tasklet);
}
205
206/*
207 * Save the context of a kmemcheck bug.
208 */
209void kmemcheck_error_save_bug(struct pt_regs *regs)
210{
211 struct kmemcheck_error *e;
212
213 e = error_next_wr();
214 if (!e)
215 return;
216
217 e->type = KMEMCHECK_ERROR_BUG;
218
219 memcpy(&e->regs, regs, sizeof(*regs));
220
221 e->trace.nr_entries = 0;
222 e->trace.entries = e->trace_entries;
223 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
224 e->trace.skip = 1;
225 save_stack_trace(&e->trace);
226
227 tasklet_hi_schedule_first(&kmemcheck_tasklet);
228}
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
new file mode 100644
index 000000000000..0efc2e8d0a20
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -0,0 +1,15 @@
#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
#define ARCH__X86__MM__KMEMCHECK__ERROR_H

#include <linux/ptrace.h>

#include "shadow.h"

/* Queue a report for an invalid access (printed later by a tasklet). */
void kmemcheck_error_save(enum kmemcheck_shadow state,
	unsigned long address, unsigned int size, struct pt_regs *regs);

/* Queue a report for an internal kmemcheck inconsistency. */
void kmemcheck_error_save_bug(struct pt_regs *regs);

/* Dequeue and print one saved report. */
void kmemcheck_error_recall(void);

#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
new file mode 100644
index 000000000000..2c55ed098654
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -0,0 +1,640 @@
1/**
2 * kmemcheck - a heavyweight memory checker for the linux kernel
3 * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
4 * (With a lot of help from Ingo Molnar and Pekka Enberg.)
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2) as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/init.h>
12#include <linux/interrupt.h>
13#include <linux/kallsyms.h>
14#include <linux/kernel.h>
15#include <linux/kmemcheck.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <linux/page-flags.h>
19#include <linux/percpu.h>
20#include <linux/ptrace.h>
21#include <linux/string.h>
22#include <linux/types.h>
23
24#include <asm/cacheflush.h>
25#include <asm/kmemcheck.h>
26#include <asm/pgtable.h>
27#include <asm/tlbflush.h>
28
29#include "error.h"
30#include "opcode.h"
31#include "pte.h"
32#include "selftest.h"
33#include "shadow.h"
34
35
36#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
37# define KMEMCHECK_ENABLED 0
38#endif
39
40#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
41# define KMEMCHECK_ENABLED 1
42#endif
43
44#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
45# define KMEMCHECK_ENABLED 2
46#endif
47
48int kmemcheck_enabled = KMEMCHECK_ENABLED;
49
/*
 * One-time kmemcheck initialisation, run as an early initcall: cap the
 * system at one CPU and run the self-tests before enabling anything.
 */
int __init kmemcheck_init(void)
{
#ifdef CONFIG_SMP
	/*
	 * Limit SMP to use a single CPU. We rely on the fact that this code
	 * runs before SMP is set up.
	 */
	if (setup_max_cpus > 1) {
		printk(KERN_INFO
			"kmemcheck: Limiting number of CPUs to 1.\n");
		setup_max_cpus = 1;
	}
#endif

	/* If the self-test fails, disable kmemcheck entirely. */
	if (!kmemcheck_selftest()) {
		printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
		kmemcheck_enabled = 0;
		return -EINVAL;
	}

	printk(KERN_INFO "kmemcheck: Initialized\n");
	return 0;
}
73
74early_initcall(kmemcheck_init);
75
76/*
77 * We need to parse the kmemcheck= option before any memory is allocated.
78 */
79static int __init param_kmemcheck(char *str)
80{
81 if (!str)
82 return -EINVAL;
83
84 sscanf(str, "%d", &kmemcheck_enabled);
85 return 0;
86}
87
88early_param("kmemcheck", param_kmemcheck);
89
90int kmemcheck_show_addr(unsigned long address)
91{
92 pte_t *pte;
93
94 pte = kmemcheck_pte_lookup(address);
95 if (!pte)
96 return 0;
97
98 set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
99 __flush_tlb_one(address);
100 return 1;
101}
102
103int kmemcheck_hide_addr(unsigned long address)
104{
105 pte_t *pte;
106
107 pte = kmemcheck_pte_lookup(address);
108 if (!pte)
109 return 0;
110
111 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
112 __flush_tlb_one(address);
113 return 1;
114}
115
/* Per-CPU state for the fault currently being handled/single-stepped. */
struct kmemcheck_context {
	/* Set while kmemcheck_access() runs; used to detect recursion. */
	bool busy;
	/* Number of "show" operations not yet paired with a "hide". */
	int balance;

	/*
	 * There can be at most two memory operands to an instruction, but
	 * each address can cross a page boundary -- so we may need up to
	 * four addresses that must be hidden/revealed for each fault.
	 */
	unsigned long addr[4];
	unsigned long n_addrs;
	/* Saved flags of the faulting context (TF/IF restored on hide). */
	unsigned long flags;

	/* Data size of the instruction that caused a fault. */
	unsigned int size;
};

static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
134
135bool kmemcheck_active(struct pt_regs *regs)
136{
137 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
138
139 return data->balance > 0;
140}
141
142/* Save an address that needs to be shown/hidden */
143static void kmemcheck_save_addr(unsigned long addr)
144{
145 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
146
147 BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
148 data->addr[data->n_addrs++] = addr;
149}
150
151static unsigned int kmemcheck_show_all(void)
152{
153 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
154 unsigned int i;
155 unsigned int n;
156
157 n = 0;
158 for (i = 0; i < data->n_addrs; ++i)
159 n += kmemcheck_show_addr(data->addr[i]);
160
161 return n;
162}
163
164static unsigned int kmemcheck_hide_all(void)
165{
166 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
167 unsigned int i;
168 unsigned int n;
169
170 n = 0;
171 for (i = 0; i < data->n_addrs; ++i)
172 n += kmemcheck_hide_addr(data->addr[i]);
173
174 return n;
175}
176
/*
 * Called from the #PF handler: reveal the pages the faulting
 * instruction touches and arrange to single-step it (TF set, IF
 * cleared) so kmemcheck_hide() can re-hide them from the #DB handler.
 */
void kmemcheck_show(struct pt_regs *regs)
{
	struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);

	BUG_ON(!irqs_disabled());

	/*
	 * A non-zero balance means a previous show was never paired with
	 * a hide -- report the inconsistency and reset.
	 */
	if (unlikely(data->balance != 0)) {
		kmemcheck_show_all();
		kmemcheck_error_save_bug(regs);
		data->balance = 0;
		return;
	}

	/*
	 * None of the addresses actually belonged to kmemcheck. Note that
	 * this is not an error.
	 */
	if (kmemcheck_show_all() == 0)
		return;

	++data->balance;

	/*
	 * The IF needs to be cleared as well, so that the faulting
	 * instruction can run "uninterrupted". Otherwise, we might take
	 * an interrupt and start executing that before we've had a chance
	 * to hide the page again.
	 *
	 * NOTE: In the rare case of multiple faults, we must not override
	 * the original flags:
	 */
	if (!(regs->flags & X86_EFLAGS_TF))
		data->flags = regs->flags;

	regs->flags |= X86_EFLAGS_TF;
	regs->flags &= ~X86_EFLAGS_IF;
}
217
/*
 * Called from the #DB handler: re-hide the pages revealed by
 * kmemcheck_show() and restore the TF/IF flags saved there.
 */
void kmemcheck_hide(struct pt_regs *regs)
{
	struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
	int n;

	BUG_ON(!irqs_disabled());

	/* Nothing was shown for this instruction; not our trap. */
	if (data->balance == 0)
		return;

	/*
	 * More than one outstanding show is an internal inconsistency:
	 * report it, drop the saved addresses and restore the flags.
	 */
	if (unlikely(data->balance != 1)) {
		kmemcheck_show_all();
		kmemcheck_error_save_bug(regs);
		data->n_addrs = 0;
		data->balance = 0;

		if (!(data->flags & X86_EFLAGS_TF))
			regs->flags &= ~X86_EFLAGS_TF;
		if (data->flags & X86_EFLAGS_IF)
			regs->flags |= X86_EFLAGS_IF;
		return;
	}

	/*
	 * If kmemcheck was disabled at runtime, leave the pages visible
	 * (show) rather than re-hiding them.
	 */
	if (kmemcheck_enabled)
		n = kmemcheck_hide_all();
	else
		n = kmemcheck_show_all();

	if (n == 0)
		return;

	--data->balance;

	data->n_addrs = 0;

	/* Restore TF/IF to what the faulting context had. */
	if (!(data->flags & X86_EFLAGS_TF))
		regs->flags &= ~X86_EFLAGS_TF;
	if (data->flags & X86_EFLAGS_IF)
		regs->flags |= X86_EFLAGS_IF;
}
261
262void kmemcheck_show_pages(struct page *p, unsigned int n)
263{
264 unsigned int i;
265
266 for (i = 0; i < n; ++i) {
267 unsigned long address;
268 pte_t *pte;
269 unsigned int level;
270
271 address = (unsigned long) page_address(&p[i]);
272 pte = lookup_address(address, &level);
273 BUG_ON(!pte);
274 BUG_ON(level != PG_LEVEL_4K);
275
276 set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
277 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
278 __flush_tlb_one(address);
279 }
280}
281
282bool kmemcheck_page_is_tracked(struct page *p)
283{
284 /* This will also check the "hidden" flag of the PTE. */
285 return kmemcheck_pte_lookup((unsigned long) page_address(p));
286}
287
288void kmemcheck_hide_pages(struct page *p, unsigned int n)
289{
290 unsigned int i;
291
292 for (i = 0; i < n; ++i) {
293 unsigned long address;
294 pte_t *pte;
295 unsigned int level;
296
297 address = (unsigned long) page_address(&p[i]);
298 pte = lookup_address(address, &level);
299 BUG_ON(!pte);
300 BUG_ON(level != PG_LEVEL_4K);
301
302 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
303 set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
304 __flush_tlb_one(address);
305 }
306}
307
/* Access may NOT cross page boundary */
static void kmemcheck_read_strict(struct pt_regs *regs,
	unsigned long addr, unsigned int size)
{
	void *shadow;
	enum kmemcheck_shadow status;

	/* Untracked memory: nothing to check. */
	shadow = kmemcheck_shadow_lookup(addr);
	if (!shadow)
		return;

	kmemcheck_save_addr(addr);
	status = kmemcheck_shadow_test(shadow, size);
	if (status == KMEMCHECK_SHADOW_INITIALIZED)
		return;

	if (kmemcheck_enabled)
		kmemcheck_error_save(status, addr, size, regs);

	/* Mode 2 is one-shot: disable after the first report. */
	if (kmemcheck_enabled == 2)
		kmemcheck_enabled = 0;

	/* Don't warn about it again. */
	kmemcheck_shadow_set(shadow, size);
}
333
334/* Access may cross page boundary */
335static void kmemcheck_read(struct pt_regs *regs,
336 unsigned long addr, unsigned int size)
337{
338 unsigned long page = addr & PAGE_MASK;
339 unsigned long next_addr = addr + size - 1;
340 unsigned long next_page = next_addr & PAGE_MASK;
341
342 if (likely(page == next_page)) {
343 kmemcheck_read_strict(regs, addr, size);
344 return;
345 }
346
347 /*
348 * What we do is basically to split the access across the
349 * two pages and handle each part separately. Yes, this means
350 * that we may now see reads that are 3 + 5 bytes, for
351 * example (and if both are uninitialized, there will be two
352 * reports), but it makes the code a lot simpler.
353 */
354 kmemcheck_read_strict(regs, addr, next_page - addr);
355 kmemcheck_read_strict(regs, next_page, next_addr - next_page);
356}
357
358static void kmemcheck_write_strict(struct pt_regs *regs,
359 unsigned long addr, unsigned int size)
360{
361 void *shadow;
362
363 shadow = kmemcheck_shadow_lookup(addr);
364 if (!shadow)
365 return;
366
367 kmemcheck_save_addr(addr);
368 kmemcheck_shadow_set(shadow, size);
369}
370
371static void kmemcheck_write(struct pt_regs *regs,
372 unsigned long addr, unsigned int size)
373{
374 unsigned long page = addr & PAGE_MASK;
375 unsigned long next_addr = addr + size - 1;
376 unsigned long next_page = next_addr & PAGE_MASK;
377
378 if (likely(page == next_page)) {
379 kmemcheck_write_strict(regs, addr, size);
380 return;
381 }
382
383 /* See comment in kmemcheck_read(). */
384 kmemcheck_write_strict(regs, addr, next_page - addr);
385 kmemcheck_write_strict(regs, next_page, next_addr - next_page);
386}
387
/*
 * Copying is hard. We have two addresses, each of which may be split across
 * a page (and each page will have different shadow addresses).
 *
 * Three phases: (1) gather the shadow state of the source bytes into a
 * local buffer, (2) propagate that state to the destination's shadow
 * (marking the destination initialized as we go), (3) report if any of
 * the copied bytes were not initialized.
 */
static void kmemcheck_copy(struct pt_regs *regs,
	unsigned long src_addr, unsigned long dst_addr, unsigned int size)
{
	uint8_t shadow[8];
	enum kmemcheck_shadow status;

	unsigned long page;
	unsigned long next_addr;
	unsigned long next_page;

	uint8_t *x;
	unsigned int i;
	unsigned int n;

	BUG_ON(size > sizeof(shadow));

	/* Phase 1: collect the source's shadow state. */
	page = src_addr & PAGE_MASK;
	next_addr = src_addr + size - 1;
	next_page = next_addr & PAGE_MASK;

	if (likely(page == next_page)) {
		/* Same page */
		x = kmemcheck_shadow_lookup(src_addr);
		if (x) {
			kmemcheck_save_addr(src_addr);
			for (i = 0; i < size; ++i)
				shadow[i] = x[i];
		} else {
			/* Untracked source bytes count as initialized. */
			for (i = 0; i < size; ++i)
				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
		}
	} else {
		/* n = bytes of the access that lie in the first page. */
		n = next_page - src_addr;
		BUG_ON(n > sizeof(shadow));

		/* First page */
		x = kmemcheck_shadow_lookup(src_addr);
		if (x) {
			kmemcheck_save_addr(src_addr);
			for (i = 0; i < n; ++i)
				shadow[i] = x[i];
		} else {
			/* Not tracked */
			for (i = 0; i < n; ++i)
				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
		}

		/* Second page */
		x = kmemcheck_shadow_lookup(next_page);
		if (x) {
			kmemcheck_save_addr(next_page);
			for (i = n; i < size; ++i)
				shadow[i] = x[i - n];
		} else {
			/* Not tracked */
			for (i = n; i < size; ++i)
				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
		}
	}

	/* Phase 2: write the state into the destination's shadow. */
	page = dst_addr & PAGE_MASK;
	next_addr = dst_addr + size - 1;
	next_page = next_addr & PAGE_MASK;

	if (likely(page == next_page)) {
		/* Same page */
		x = kmemcheck_shadow_lookup(dst_addr);
		if (x) {
			kmemcheck_save_addr(dst_addr);
			for (i = 0; i < size; ++i) {
				x[i] = shadow[i];
				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
			}
		}
	} else {
		n = next_page - dst_addr;
		BUG_ON(n > sizeof(shadow));

		/* First page */
		x = kmemcheck_shadow_lookup(dst_addr);
		if (x) {
			kmemcheck_save_addr(dst_addr);
			for (i = 0; i < n; ++i) {
				x[i] = shadow[i];
				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
			}
		}

		/* Second page */
		x = kmemcheck_shadow_lookup(next_page);
		if (x) {
			kmemcheck_save_addr(next_page);
			for (i = n; i < size; ++i) {
				x[i - n] = shadow[i];
				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
			}
		}
	}

	/* Phase 3: warn if any copied byte was not initialized. */
	status = kmemcheck_shadow_test(shadow, size);
	if (status == KMEMCHECK_SHADOW_INITIALIZED)
		return;

	if (kmemcheck_enabled)
		kmemcheck_error_save(status, src_addr, size, regs);

	/* One-shot mode: disable after the first report. */
	if (kmemcheck_enabled == 2)
		kmemcheck_enabled = 0;
}
501
/* How the page fault handler classified the faulting access. */
enum kmemcheck_method {
	KMEMCHECK_READ,
	KMEMCHECK_WRITE,
};
506
/*
 * Decode the instruction at regs->ip and run the appropriate shadow
 * checks for the memory it touches. @fallback_address and
 * @fallback_method come from the page fault and are used whenever the
 * opcode needs no special handling.
 */
static void kmemcheck_access(struct pt_regs *regs,
	unsigned long fallback_address, enum kmemcheck_method fallback_method)
{
	const uint8_t *insn;
	const uint8_t *insn_primary;
	unsigned int size;

	struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);

	/* Recursive fault -- ouch. */
	if (data->busy) {
		kmemcheck_show_addr(fallback_address);
		kmemcheck_error_save_bug(regs);
		return;
	}

	data->busy = true;

	insn = (const uint8_t *) regs->ip;
	insn_primary = kmemcheck_opcode_get_primary(insn);

	kmemcheck_opcode_decode(insn, &size);

	switch (insn_primary[0]) {
#ifdef CONFIG_KMEMCHECK_BITOPS_OK
	/* AND, OR, XOR */
	/*
	 * Unfortunately, these instructions have to be excluded from
	 * our regular checking since they access only some (and not
	 * all) bits. This clears out "bogus" bitfield-access warnings.
	 */
	case 0x80:
	case 0x81:
	case 0x82:
	case 0x83:
		switch ((insn_primary[1] >> 3) & 7) {
			/* OR */
		case 1:
			/* AND */
		case 4:
			/* XOR */
		case 6:
			kmemcheck_write(regs, fallback_address, size);
			goto out;

			/* ADD */
		case 0:
			/* ADC */
		case 2:
			/* SBB */
		case 3:
			/* SUB */
		case 5:
			/* CMP */
		case 7:
			break;
		}
		break;
#endif

		/* MOVS, MOVSB, MOVSW, MOVSD */
	case 0xa4:
	case 0xa5:
		/*
		 * These instructions are special because they take two
		 * addresses, but we only get one page fault.
		 */
		kmemcheck_copy(regs, regs->si, regs->di, size);
		goto out;

		/* CMPS, CMPSB, CMPSW, CMPSD */
	case 0xa6:
	case 0xa7:
		kmemcheck_read(regs, regs->si, size);
		kmemcheck_read(regs, regs->di, size);
		goto out;
	}

	/*
	 * If the opcode isn't special in any way, we use the data from the
	 * page fault handler to determine the address and type of memory
	 * access.
	 */
	switch (fallback_method) {
	case KMEMCHECK_READ:
		kmemcheck_read(regs, fallback_address, size);
		goto out;
	case KMEMCHECK_WRITE:
		kmemcheck_write(regs, fallback_address, size);
		goto out;
	}

out:
	data->busy = false;
}
602
/*
 * Page fault hook. Returns true if the fault hit one of kmemcheck's
 * hidden pages and has been handled here.
 */
bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
	unsigned long error_code)
{
	pte_t *pte;

	/*
	 * XXX: Is it safe to assume that memory accesses from virtual 86
	 * mode or non-kernel code segments will _never_ access kernel
	 * memory (e.g. tracked pages)? For now, we need this to avoid
	 * invoking kmemcheck for PnP BIOS calls.
	 */
	if (regs->flags & X86_VM_MASK)
		return false;
	if (regs->cs != __KERNEL_CS)
		return false;

	/* Not one of our pages: let the normal fault path handle it. */
	pte = kmemcheck_pte_lookup(address);
	if (!pte)
		return false;

	/* Bit 1 of the page fault error code is set for write accesses. */
	if (error_code & 2)
		kmemcheck_access(regs, address, KMEMCHECK_WRITE);
	else
		kmemcheck_access(regs, address, KMEMCHECK_READ);

	/* Reveal the page(s) and single-step the faulting instruction. */
	kmemcheck_show(regs);
	return true;
}
631
632bool kmemcheck_trap(struct pt_regs *regs)
633{
634 if (!kmemcheck_active(regs))
635 return false;
636
637 /* We're done. */
638 kmemcheck_hide(regs);
639 return true;
640}
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
new file mode 100644
index 000000000000..63c19e27aa6f
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -0,0 +1,106 @@
1#include <linux/types.h>
2
3#include "opcode.h"
4
/*
 * Returns true if @b is one of the legacy instruction prefixes
 * (lock/repeat, segment override, operand-size, address-size).
 *
 * The original open-coded chain listed 0x2e and 0x3e twice: the
 * branch-hint prefixes reuse the CS/DS segment-override encodings,
 * so each byte only needs to appear once.
 */
static bool opcode_is_prefix(uint8_t b)
{
	switch (b) {
	/* Group 1: lock and repeat prefixes */
	case 0xf0: case 0xf2: case 0xf3:
	/* Group 2: segment overrides (0x2e/0x3e double as branch hints) */
	case 0x2e: case 0x36: case 0x3e: case 0x26:
	case 0x64: case 0x65:
	/* Group 3: operand-size override */
	case 0x66:
	/* Group 4: address-size override */
	case 0x67:
		return true;
	default:
		return false;
	}
}

#ifdef CONFIG_X86_64
/* REX prefixes occupy 0x40-0x4f, but only exist in 64-bit mode. */
static bool opcode_is_rex_prefix(uint8_t b)
{
	return (b & 0xf0) == 0x40;
}
#else
static bool opcode_is_rex_prefix(uint8_t b)
{
	return false;
}
#endif

/* REX.W bit: promotes the operand size to 64 bits. */
#define REX_W (1 << 3)

/*
 * This is a VERY crude opcode decoder: all it has to recover is the
 * size of the memory access performed by the load/store that caused
 * our #PF. On return, *size is the access size in bytes (1, 2, 4, 8).
 */
void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
{
	/* Default operand size; an 0x66 prefix shrinks it to 2 bytes. */
	int operand_size_override = 4;

	/* Skip legacy prefixes, remembering an operand-size override. */
	for (; opcode_is_prefix(*op); ++op) {
		if (*op == 0x66)
			operand_size_override = 2;
	}

	/* REX prefix (follows the legacy prefixes, 64-bit only). */
	if (opcode_is_rex_prefix(*op)) {
		uint8_t rex = *op;

		++op;
		if (rex & REX_W) {
			switch (*op) {
			case 0x63:
				/* MOVSXD reads a 32-bit source. */
				*size = 4;
				return;
			case 0x0f:
				++op;

				switch (*op) {
				case 0xb6:
				case 0xbe:
					/* MOVZX/MOVSX, 8-bit source */
					*size = 1;
					return;
				case 0xb7:
				case 0xbf:
					/* MOVZX/MOVSX, 16-bit source */
					*size = 2;
					return;
				}

				break;
			}

			/* REX.W: full 64-bit operand. */
			*size = 8;
			return;
		}
	}

	/* Two-byte (0x0f-escaped) opcodes. */
	if (*op == 0x0f) {
		++op;

		/*
		 * MOVZX/MOVSX with a 16-bit source; the 8-bit variants
		 * (0xb6/0xbe) fall out of the generic rule below.
		 */
		if (*op == 0xb7 || *op == 0xbf)
			operand_size_override = 2;
	}

	/* Bit 0 of the primary opcode selects byte vs. full operand. */
	*size = (*op & 1) ? operand_size_override : 1;
}

/*
 * Return a pointer to the primary (first non-prefix, non-REX) opcode
 * byte of the instruction at @op.
 */
const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
{
	/* skip legacy prefixes */
	while (opcode_is_prefix(*op))
		++op;
	/* skip a REX prefix, if any */
	if (opcode_is_rex_prefix(*op))
		++op;
	return op;
}
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
new file mode 100644
index 000000000000..6956aad66b5b
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -0,0 +1,9 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
2#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
3
4#include <linux/types.h>
5
6void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
7const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
8
9#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
new file mode 100644
index 000000000000..4ead26eeaf96
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -0,0 +1,22 @@
1#include <linux/mm.h>
2
3#include <asm/pgtable.h>
4
5#include "pte.h"
6
7pte_t *kmemcheck_pte_lookup(unsigned long address)
8{
9 pte_t *pte;
10 unsigned int level;
11
12 pte = lookup_address(address, &level);
13 if (!pte)
14 return NULL;
15 if (level != PG_LEVEL_4K)
16 return NULL;
17 if (!pte_hidden(*pte))
18 return NULL;
19
20 return pte;
21}
22
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
new file mode 100644
index 000000000000..9f5966456492
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -0,0 +1,10 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
2#define ARCH__X86__MM__KMEMCHECK__PTE_H
3
4#include <linux/mm.h>
5
6#include <asm/pgtable.h>
7
8pte_t *kmemcheck_pte_lookup(unsigned long address);
9
10#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
new file mode 100644
index 000000000000..036efbea8b28
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -0,0 +1,69 @@
1#include <linux/kernel.h>
2
3#include "opcode.h"
4#include "selftest.h"
5
/*
 * One decoder test vector: the raw instruction bytes and the access
 * size we expect kmemcheck_opcode_decode() to report for them.
 */
struct selftest_opcode {
	unsigned int expected_size;	/* decoded operand size, bytes */
	const uint8_t *insn;		/* raw instruction encoding */
	const char *desc;		/* human-readable description */
};

static const struct selftest_opcode selftest_opcodes[] = {
	/* string moves with a REP prefix */
	{1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
	{4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},

	/* zero-extending loads (two-byte opcodes) */
	{1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
	{1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},

	/* sign-extending loads (two-byte opcodes) */
	{1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
	{1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},

#ifdef CONFIG_X86_64
	/* zero-extending loads with a REX.W prefix */
	{1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
	{2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},

	/* sign-extending loads with a REX.W prefix */
	{1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
	{2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
	{4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
#endif
};
36
37static bool selftest_opcode_one(const struct selftest_opcode *op)
38{
39 unsigned size;
40
41 kmemcheck_opcode_decode(op->insn, &size);
42
43 if (size == op->expected_size)
44 return true;
45
46 printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
47 op->desc, op->expected_size, size);
48 return false;
49}
50
51static bool selftest_opcodes_all(void)
52{
53 bool pass = true;
54 unsigned int i;
55
56 for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
57 pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
58
59 return pass;
60}
61
/*
 * Entry point for the kmemcheck self-tests. Currently the opcode
 * decoder check is the only test; returns true when it passes.
 */
bool kmemcheck_selftest(void)
{
	return selftest_opcodes_all();
}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
new file mode 100644
index 000000000000..8fed4fe11f95
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -0,0 +1,6 @@
1#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
2#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
3
4bool kmemcheck_selftest(void);
5
6#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
new file mode 100644
index 000000000000..e773b6bd0079
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -0,0 +1,162 @@
1#include <linux/kmemcheck.h>
2#include <linux/module.h>
3#include <linux/mm.h>
4#include <linux/module.h>
5
6#include <asm/page.h>
7#include <asm/pgtable.h>
8
9#include "pte.h"
10#include "shadow.h"
11
12/*
13 * Return the shadow address for the given address. Returns NULL if the
14 * address is not tracked.
15 *
16 * We need to be extremely careful not to follow any invalid pointers,
17 * because this function can be called for *any* possible address.
18 */
19void *kmemcheck_shadow_lookup(unsigned long address)
20{
21 pte_t *pte;
22 struct page *page;
23
24 if (!virt_addr_valid(address))
25 return NULL;
26
27 pte = kmemcheck_pte_lookup(address);
28 if (!pte)
29 return NULL;
30
31 page = virt_to_page(address);
32 if (!page->shadow)
33 return NULL;
34 return page->shadow + (address & (PAGE_SIZE - 1));
35}
36
37static void mark_shadow(void *address, unsigned int n,
38 enum kmemcheck_shadow status)
39{
40 unsigned long addr = (unsigned long) address;
41 unsigned long last_addr = addr + n - 1;
42 unsigned long page = addr & PAGE_MASK;
43 unsigned long last_page = last_addr & PAGE_MASK;
44 unsigned int first_n;
45 void *shadow;
46
47 /* If the memory range crosses a page boundary, stop there. */
48 if (page == last_page)
49 first_n = n;
50 else
51 first_n = page + PAGE_SIZE - addr;
52
53 shadow = kmemcheck_shadow_lookup(addr);
54 if (shadow)
55 memset(shadow, status, first_n);
56
57 addr += first_n;
58 n -= first_n;
59
60 /* Do full-page memset()s. */
61 while (n >= PAGE_SIZE) {
62 shadow = kmemcheck_shadow_lookup(addr);
63 if (shadow)
64 memset(shadow, status, PAGE_SIZE);
65
66 addr += PAGE_SIZE;
67 n -= PAGE_SIZE;
68 }
69
70 /* Do the remaining page, if any. */
71 if (n > 0) {
72 shadow = kmemcheck_shadow_lookup(addr);
73 if (shadow)
74 memset(shadow, status, n);
75 }
76}
77
78void kmemcheck_mark_unallocated(void *address, unsigned int n)
79{
80 mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
81}
82
83void kmemcheck_mark_uninitialized(void *address, unsigned int n)
84{
85 mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
86}
87
88/*
89 * Fill the shadow memory of the given address such that the memory at that
90 * address is marked as being initialized.
91 */
92void kmemcheck_mark_initialized(void *address, unsigned int n)
93{
94 mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
95}
96EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
97
98void kmemcheck_mark_freed(void *address, unsigned int n)
99{
100 mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
101}
102
103void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
104{
105 unsigned int i;
106
107 for (i = 0; i < n; ++i)
108 kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
109}
110
111void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
112{
113 unsigned int i;
114
115 for (i = 0; i < n; ++i)
116 kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
117}
118
119void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
120{
121 unsigned int i;
122
123 for (i = 0; i < n; ++i)
124 kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
125}
126
127enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
128{
129 uint8_t *x;
130 unsigned int i;
131
132 x = shadow;
133
134#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
135 /*
136 * Make sure _some_ bytes are initialized. Gcc frequently generates
137 * code to access neighboring bytes.
138 */
139 for (i = 0; i < size; ++i) {
140 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
141 return x[i];
142 }
143#else
144 /* All bytes must be initialized. */
145 for (i = 0; i < size; ++i) {
146 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
147 return x[i];
148 }
149#endif
150
151 return x[0];
152}
153
154void kmemcheck_shadow_set(void *shadow, unsigned int size)
155{
156 uint8_t *x;
157 unsigned int i;
158
159 x = shadow;
160 for (i = 0; i < size; ++i)
161 x[i] = KMEMCHECK_SHADOW_INITIALIZED;
162}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
new file mode 100644
index 000000000000..af46d9ab9d86
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -0,0 +1,16 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
2#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
3
4enum kmemcheck_shadow {
5 KMEMCHECK_SHADOW_UNALLOCATED,
6 KMEMCHECK_SHADOW_UNINITIALIZED,
7 KMEMCHECK_SHADOW_INITIALIZED,
8 KMEMCHECK_SHADOW_FREED,
9};
10
11void *kmemcheck_shadow_lookup(unsigned long address);
12
13enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
14void kmemcheck_shadow_set(void *shadow, unsigned int size);
15
16#endif
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 50dc802a1c46..16ccbd77917f 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -32,7 +32,7 @@ struct kmmio_fault_page {
32 struct list_head list; 32 struct list_head list;
33 struct kmmio_fault_page *release_next; 33 struct kmmio_fault_page *release_next;
34 unsigned long page; /* location of the fault page */ 34 unsigned long page; /* location of the fault page */
35 bool old_presence; /* page presence prior to arming */ 35 pteval_t old_presence; /* page presence prior to arming */
36 bool armed; 36 bool armed;
37 37
38 /* 38 /*
@@ -97,60 +97,62 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
97static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) 97static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
98{ 98{
99 struct list_head *head; 99 struct list_head *head;
100 struct kmmio_fault_page *p; 100 struct kmmio_fault_page *f;
101 101
102 page &= PAGE_MASK; 102 page &= PAGE_MASK;
103 head = kmmio_page_list(page); 103 head = kmmio_page_list(page);
104 list_for_each_entry_rcu(p, head, list) { 104 list_for_each_entry_rcu(f, head, list) {
105 if (p->page == page) 105 if (f->page == page)
106 return p; 106 return f;
107 } 107 }
108 return NULL; 108 return NULL;
109} 109}
110 110
111static void set_pmd_presence(pmd_t *pmd, bool present, bool *old) 111static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
112{ 112{
113 pmdval_t v = pmd_val(*pmd); 113 pmdval_t v = pmd_val(*pmd);
114 *old = !!(v & _PAGE_PRESENT); 114 if (clear) {
115 v &= ~_PAGE_PRESENT; 115 *old = v & _PAGE_PRESENT;
116 if (present) 116 v &= ~_PAGE_PRESENT;
117 v |= _PAGE_PRESENT; 117 } else /* presume this has been called with clear==true previously */
118 v |= *old;
118 set_pmd(pmd, __pmd(v)); 119 set_pmd(pmd, __pmd(v));
119} 120}
120 121
121static void set_pte_presence(pte_t *pte, bool present, bool *old) 122static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
122{ 123{
123 pteval_t v = pte_val(*pte); 124 pteval_t v = pte_val(*pte);
124 *old = !!(v & _PAGE_PRESENT); 125 if (clear) {
125 v &= ~_PAGE_PRESENT; 126 *old = v & _PAGE_PRESENT;
126 if (present) 127 v &= ~_PAGE_PRESENT;
127 v |= _PAGE_PRESENT; 128 } else /* presume this has been called with clear==true previously */
129 v |= *old;
128 set_pte_atomic(pte, __pte(v)); 130 set_pte_atomic(pte, __pte(v));
129} 131}
130 132
131static int set_page_presence(unsigned long addr, bool present, bool *old) 133static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
132{ 134{
133 unsigned int level; 135 unsigned int level;
134 pte_t *pte = lookup_address(addr, &level); 136 pte_t *pte = lookup_address(f->page, &level);
135 137
136 if (!pte) { 138 if (!pte) {
137 pr_err("kmmio: no pte for page 0x%08lx\n", addr); 139 pr_err("kmmio: no pte for page 0x%08lx\n", f->page);
138 return -1; 140 return -1;
139 } 141 }
140 142
141 switch (level) { 143 switch (level) {
142 case PG_LEVEL_2M: 144 case PG_LEVEL_2M:
143 set_pmd_presence((pmd_t *)pte, present, old); 145 clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
144 break; 146 break;
145 case PG_LEVEL_4K: 147 case PG_LEVEL_4K:
146 set_pte_presence(pte, present, old); 148 clear_pte_presence(pte, clear, &f->old_presence);
147 break; 149 break;
148 default: 150 default:
149 pr_err("kmmio: unexpected page level 0x%x.\n", level); 151 pr_err("kmmio: unexpected page level 0x%x.\n", level);
150 return -1; 152 return -1;
151 } 153 }
152 154
153 __flush_tlb_one(addr); 155 __flush_tlb_one(f->page);
154 return 0; 156 return 0;
155} 157}
156 158
@@ -171,9 +173,9 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
171 WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); 173 WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
172 if (f->armed) { 174 if (f->armed) {
173 pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", 175 pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
174 f->page, f->count, f->old_presence); 176 f->page, f->count, !!f->old_presence);
175 } 177 }
176 ret = set_page_presence(f->page, false, &f->old_presence); 178 ret = clear_page_presence(f, true);
177 WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); 179 WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
178 f->armed = true; 180 f->armed = true;
179 return ret; 181 return ret;
@@ -182,8 +184,7 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
182/** Restore the given page to saved presence state. */ 184/** Restore the given page to saved presence state. */
183static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) 185static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
184{ 186{
185 bool tmp; 187 int ret = clear_page_presence(f, false);
186 int ret = set_page_presence(f->page, f->old_presence, &tmp);
187 WARN_ONCE(ret < 0, 188 WARN_ONCE(ret < 0,
188 KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); 189 KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
189 f->armed = false; 190 f->armed = false;
@@ -310,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
310 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); 311 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
311 312
312 if (!ctx->active) { 313 if (!ctx->active) {
313 pr_debug("kmmio: spurious debug trap on CPU %d.\n", 314 /*
315 * debug traps without an active context are due to either
316 * something external causing them (f.e. using a debugger while
317 * mmio tracing enabled), or erroneous behaviour
318 */
319 pr_warning("kmmio: unexpected debug trap on CPU %d.\n",
314 smp_processor_id()); 320 smp_processor_id());
315 goto out; 321 goto out;
316 } 322 }
@@ -439,12 +445,12 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
439 head, 445 head,
440 struct kmmio_delayed_release, 446 struct kmmio_delayed_release,
441 rcu); 447 rcu);
442 struct kmmio_fault_page *p = dr->release_list; 448 struct kmmio_fault_page *f = dr->release_list;
443 while (p) { 449 while (f) {
444 struct kmmio_fault_page *next = p->release_next; 450 struct kmmio_fault_page *next = f->release_next;
445 BUG_ON(p->count); 451 BUG_ON(f->count);
446 kfree(p); 452 kfree(f);
447 p = next; 453 f = next;
448 } 454 }
449 kfree(dr); 455 kfree(dr);
450} 456}
@@ -453,19 +459,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
453{ 459{
454 struct kmmio_delayed_release *dr = 460 struct kmmio_delayed_release *dr =
455 container_of(head, struct kmmio_delayed_release, rcu); 461 container_of(head, struct kmmio_delayed_release, rcu);
456 struct kmmio_fault_page *p = dr->release_list; 462 struct kmmio_fault_page *f = dr->release_list;
457 struct kmmio_fault_page **prevp = &dr->release_list; 463 struct kmmio_fault_page **prevp = &dr->release_list;
458 unsigned long flags; 464 unsigned long flags;
459 465
460 spin_lock_irqsave(&kmmio_lock, flags); 466 spin_lock_irqsave(&kmmio_lock, flags);
461 while (p) { 467 while (f) {
462 if (!p->count) { 468 if (!f->count) {
463 list_del_rcu(&p->list); 469 list_del_rcu(&f->list);
464 prevp = &p->release_next; 470 prevp = &f->release_next;
465 } else { 471 } else {
466 *prevp = p->release_next; 472 *prevp = f->release_next;
467 } 473 }
468 p = p->release_next; 474 f = f->release_next;
469 } 475 }
470 spin_unlock_irqrestore(&kmmio_lock, flags); 476 spin_unlock_irqrestore(&kmmio_lock, flags);
471 477
@@ -528,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
528} 534}
529EXPORT_SYMBOL(unregister_kmmio_probe); 535EXPORT_SYMBOL(unregister_kmmio_probe);
530 536
531static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, 537static int
532 void *args) 538kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
533{ 539{
534 struct die_args *arg = args; 540 struct die_args *arg = args;
535 541
@@ -544,11 +550,23 @@ static struct notifier_block nb_die = {
544 .notifier_call = kmmio_die_notifier 550 .notifier_call = kmmio_die_notifier
545}; 551};
546 552
547static int __init init_kmmio(void) 553int kmmio_init(void)
548{ 554{
549 int i; 555 int i;
556
550 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) 557 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
551 INIT_LIST_HEAD(&kmmio_page_table[i]); 558 INIT_LIST_HEAD(&kmmio_page_table[i]);
559
552 return register_die_notifier(&nb_die); 560 return register_die_notifier(&nb_die);
553} 561}
554fs_initcall(init_kmmio); /* should be before device_initcall() */ 562
563void kmmio_cleanup(void)
564{
565 int i;
566
567 unregister_die_notifier(&nb_die);
568 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
569 WARN_ONCE(!list_empty(&kmmio_page_table[i]),
570 KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
571 }
572}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 605c8be06217..18d244f70205 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,23 +40,22 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
40 40
41static void __init memtest(u64 pattern, u64 start_phys, u64 size) 41static void __init memtest(u64 pattern, u64 start_phys, u64 size)
42{ 42{
43 u64 i, count; 43 u64 *p, *start, *end;
44 u64 *start;
45 u64 start_bad, last_bad; 44 u64 start_bad, last_bad;
46 u64 start_phys_aligned; 45 u64 start_phys_aligned;
47 size_t incr; 46 const size_t incr = sizeof(pattern);
48 47
49 incr = sizeof(pattern);
50 start_phys_aligned = ALIGN(start_phys, incr); 48 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned); 49 start = __va(start_phys_aligned);
50 end = start + (size - (start_phys_aligned - start_phys)) / incr;
53 start_bad = 0; 51 start_bad = 0;
54 last_bad = 0; 52 last_bad = 0;
55 53
56 for (i = 0; i < count; i++) 54 for (p = start; p < end; p++)
57 start[i] = pattern; 55 *p = pattern;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { 56
59 if (*start == pattern) 57 for (p = start; p < end; p++, start_phys_aligned += incr) {
58 if (*p == pattern)
60 continue; 59 continue;
61 if (start_phys_aligned == last_bad + incr) { 60 if (start_phys_aligned == last_bad + incr) {
62 last_bad += incr; 61 last_bad += incr;
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index c9342ed8b402..132772a8ec57 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -451,6 +451,7 @@ void enable_mmiotrace(void)
451 451
452 if (nommiotrace) 452 if (nommiotrace)
453 pr_info(NAME "MMIO tracing disabled.\n"); 453 pr_info(NAME "MMIO tracing disabled.\n");
454 kmmio_init();
454 enter_uniprocessor(); 455 enter_uniprocessor();
455 spin_lock_irq(&trace_lock); 456 spin_lock_irq(&trace_lock);
456 atomic_inc(&mmiotrace_enabled); 457 atomic_inc(&mmiotrace_enabled);
@@ -473,6 +474,7 @@ void disable_mmiotrace(void)
473 474
474 clear_trace_list(); /* guarantees: no more kmmio callbacks */ 475 clear_trace_list(); /* guarantees: no more kmmio callbacks */
475 leave_uniprocessor(); 476 leave_uniprocessor();
477 kmmio_cleanup();
476 pr_info(NAME "disabled.\n"); 478 pr_info(NAME "disabled.\n");
477out: 479out:
478 mutex_unlock(&mmiotrace_mutex); 480 mutex_unlock(&mmiotrace_mutex);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 2d05a12029dc..459913beac71 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
179} 179}
180 180
181/* Initialize bootmem allocator for a node */ 181/* Initialize bootmem allocator for a node */
182void __init setup_node_bootmem(int nodeid, unsigned long start, 182void __init
183 unsigned long end) 183setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
184{ 184{
185 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; 185 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
186 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
186 unsigned long bootmap_start, nodedata_phys; 187 unsigned long bootmap_start, nodedata_phys;
187 void *bootmap; 188 void *bootmap;
188 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
189 int nid; 189 int nid;
190 190
191 if (!end) 191 if (!end)
192 return; 192 return;
193 193
194 /*
195 * Don't confuse VM with a node that doesn't have the
196 * minimum amount of memory:
197 */
198 if (end && (end - start) < NODE_MIN_SIZE)
199 return;
200
194 start = roundup(start, ZONE_ALIGN); 201 start = roundup(start, ZONE_ALIGN);
195 202
196 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 203 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
@@ -272,9 +279,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
272 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, 279 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
273 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); 280 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
274 281
275#ifdef CONFIG_ACPI_NUMA
276 srat_reserve_add_area(nodeid);
277#endif
278 node_set_online(nodeid); 282 node_set_online(nodeid);
279} 283}
280 284
@@ -578,21 +582,6 @@ unsigned long __init numa_free_all_bootmem(void)
578 return pages; 582 return pages;
579} 583}
580 584
581void __init paging_init(void)
582{
583 unsigned long max_zone_pfns[MAX_NR_ZONES];
584
585 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
586 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
587 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
588 max_zone_pfns[ZONE_NORMAL] = max_pfn;
589
590 sparse_memory_present_with_active_regions(MAX_NUMNODES);
591 sparse_init();
592
593 free_area_init_nodes(max_zone_pfns);
594}
595
596static __init int numa_setup(char *opt) 585static __init int numa_setup(char *opt)
597{ 586{
598 if (!opt) 587 if (!opt)
@@ -606,8 +595,6 @@ static __init int numa_setup(char *opt)
606#ifdef CONFIG_ACPI_NUMA 595#ifdef CONFIG_ACPI_NUMA
607 if (!strncmp(opt, "noacpi", 6)) 596 if (!strncmp(opt, "noacpi", 6))
608 acpi_numa = -1; 597 acpi_numa = -1;
609 if (!strncmp(opt, "hotadd=", 7))
610 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
611#endif 598#endif
612 return 0; 599 return 0;
613} 600}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e17efed088c5..3cfe9ced8a4c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -470,7 +470,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
470 470
471 if (!debug_pagealloc) 471 if (!debug_pagealloc)
472 spin_unlock(&cpa_lock); 472 spin_unlock(&cpa_lock);
473 base = alloc_pages(GFP_KERNEL, 0); 473 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
474 if (!debug_pagealloc) 474 if (!debug_pagealloc)
475 spin_lock(&cpa_lock); 475 spin_lock(&cpa_lock);
476 if (!base) 476 if (!base)
@@ -839,13 +839,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
839 839
840 vm_unmap_aliases(); 840 vm_unmap_aliases();
841 841
842 /*
843 * If we're called with lazy mmu updates enabled, the
844 * in-memory pte state may be stale. Flush pending updates to
845 * bring them up to date.
846 */
847 arch_flush_lazy_mmu_mode();
848
849 cpa.vaddr = addr; 842 cpa.vaddr = addr;
850 cpa.pages = pages; 843 cpa.pages = pages;
851 cpa.numpages = numpages; 844 cpa.numpages = numpages;
@@ -890,13 +883,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
890 } else 883 } else
891 cpa_flush_all(cache); 884 cpa_flush_all(cache);
892 885
893 /*
894 * If we've been called with lazy mmu updates enabled, then
895 * make sure that everything gets flushed out before we
896 * return.
897 */
898 arch_flush_lazy_mmu_mode();
899
900out: 886out:
901 return ret; 887 return ret;
902} 888}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7aa03a5389f5..8e43bdd45456 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,9 +4,11 @@
4#include <asm/tlb.h> 4#include <asm/tlb.h>
5#include <asm/fixmap.h> 5#include <asm/fixmap.h>
6 6
7#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
8
7pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 9pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
8{ 10{
9 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); 11 return (pte_t *)__get_free_page(PGALLOC_GFP);
10} 12}
11 13
12pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) 14pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -14,9 +16,9 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
14 struct page *pte; 16 struct page *pte;
15 17
16#ifdef CONFIG_HIGHPTE 18#ifdef CONFIG_HIGHPTE
17 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); 19 pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
18#else 20#else
19 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); 21 pte = alloc_pages(PGALLOC_GFP, 0);
20#endif 22#endif
21 if (pte) 23 if (pte)
22 pgtable_page_ctor(pte); 24 pgtable_page_ctor(pte);
@@ -161,7 +163,7 @@ static int preallocate_pmds(pmd_t *pmds[])
161 bool failed = false; 163 bool failed = false;
162 164
163 for(i = 0; i < PREALLOCATED_PMDS; i++) { 165 for(i = 0; i < PREALLOCATED_PMDS; i++) {
164 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); 166 pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
165 if (pmd == NULL) 167 if (pmd == NULL)
166 failed = true; 168 failed = true;
167 pmds[i] = pmd; 169 pmds[i] = pmd;
@@ -228,7 +230,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
228 pmd_t *pmds[PREALLOCATED_PMDS]; 230 pmd_t *pmds[PREALLOCATED_PMDS];
229 unsigned long flags; 231 unsigned long flags;
230 232
231 pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 233 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
232 234
233 if (pgd == NULL) 235 if (pgd == NULL)
234 goto out; 236 goto out;
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 01765955baaf..2dfcbf9df2ae 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -31,17 +31,11 @@ static nodemask_t nodes_parsed __initdata;
31static nodemask_t cpu_nodes_parsed __initdata; 31static nodemask_t cpu_nodes_parsed __initdata;
32static struct bootnode nodes[MAX_NUMNODES] __initdata; 32static struct bootnode nodes[MAX_NUMNODES] __initdata;
33static struct bootnode nodes_add[MAX_NUMNODES]; 33static struct bootnode nodes_add[MAX_NUMNODES];
34static int found_add_area __initdata;
35int hotadd_percent __initdata = 0;
36 34
37static int num_node_memblks __initdata; 35static int num_node_memblks __initdata;
38static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; 36static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
39static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; 37static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
40 38
41/* Too small nodes confuse the VM badly. Usually they result
42 from BIOS bugs. */
43#define NODE_MIN_SIZE (4*1024*1024)
44
45static __init int setup_node(int pxm) 39static __init int setup_node(int pxm)
46{ 40{
47 return acpi_map_pxm_to_node(pxm); 41 return acpi_map_pxm_to_node(pxm);
@@ -66,9 +60,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
66{ 60{
67 struct bootnode *nd = &nodes[i]; 61 struct bootnode *nd = &nodes[i];
68 62
69 if (found_add_area)
70 return;
71
72 if (nd->start < start) { 63 if (nd->start < start) {
73 nd->start = start; 64 nd->start = start;
74 if (nd->end < nd->start) 65 if (nd->end < nd->start)
@@ -86,7 +77,6 @@ static __init void bad_srat(void)
86 int i; 77 int i;
87 printk(KERN_ERR "SRAT: SRAT not used.\n"); 78 printk(KERN_ERR "SRAT: SRAT not used.\n");
88 acpi_numa = -1; 79 acpi_numa = -1;
89 found_add_area = 0;
90 for (i = 0; i < MAX_LOCAL_APIC; i++) 80 for (i = 0; i < MAX_LOCAL_APIC; i++)
91 apicid_to_node[i] = NUMA_NO_NODE; 81 apicid_to_node[i] = NUMA_NO_NODE;
92 for (i = 0; i < MAX_NUMNODES; i++) 82 for (i = 0; i < MAX_NUMNODES; i++)
@@ -182,24 +172,21 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
182 pxm, apic_id, node); 172 pxm, apic_id, node);
183} 173}
184 174
185static int update_end_of_memory(unsigned long end) {return -1;}
186static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
187#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 175#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
188static inline int save_add_info(void) {return 1;} 176static inline int save_add_info(void) {return 1;}
189#else 177#else
190static inline int save_add_info(void) {return 0;} 178static inline int save_add_info(void) {return 0;}
191#endif 179#endif
192/* 180/*
193 * Update nodes_add and decide if to include add are in the zone. 181 * Update nodes_add[]
194 * Both SPARSE and RESERVE need nodes_add information. 182 * This code supports one contiguous hot add area per node
195 * This code supports one contiguous hot add area per node.
196 */ 183 */
197static int __init 184static void __init
198reserve_hotadd(int node, unsigned long start, unsigned long end) 185update_nodes_add(int node, unsigned long start, unsigned long end)
199{ 186{
200 unsigned long s_pfn = start >> PAGE_SHIFT; 187 unsigned long s_pfn = start >> PAGE_SHIFT;
201 unsigned long e_pfn = end >> PAGE_SHIFT; 188 unsigned long e_pfn = end >> PAGE_SHIFT;
202 int ret = 0, changed = 0; 189 int changed = 0;
203 struct bootnode *nd = &nodes_add[node]; 190 struct bootnode *nd = &nodes_add[node];
204 191
205 /* I had some trouble with strange memory hotadd regions breaking 192 /* I had some trouble with strange memory hotadd regions breaking
@@ -210,7 +197,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
210 mistakes */ 197 mistakes */
211 if ((signed long)(end - start) < NODE_MIN_SIZE) { 198 if ((signed long)(end - start) < NODE_MIN_SIZE) {
212 printk(KERN_ERR "SRAT: Hotplug area too small\n"); 199 printk(KERN_ERR "SRAT: Hotplug area too small\n");
213 return -1; 200 return;
214 } 201 }
215 202
216 /* This check might be a bit too strict, but I'm keeping it for now. */ 203 /* This check might be a bit too strict, but I'm keeping it for now. */
@@ -218,12 +205,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
218 printk(KERN_ERR 205 printk(KERN_ERR
219 "SRAT: Hotplug area %lu -> %lu has existing memory\n", 206 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
220 s_pfn, e_pfn); 207 s_pfn, e_pfn);
221 return -1; 208 return;
222 }
223
224 if (!hotadd_enough_memory(&nodes_add[node])) {
225 printk(KERN_ERR "SRAT: Hotplug area too large\n");
226 return -1;
227 } 209 }
228 210
229 /* Looks good */ 211 /* Looks good */
@@ -245,11 +227,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
245 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); 227 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
246 } 228 }
247 229
248 ret = update_end_of_memory(nd->end);
249
250 if (changed) 230 if (changed)
251 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); 231 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
252 return ret; 232 nd->start, nd->end);
253} 233}
254 234
255/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 235/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -310,13 +290,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
310 start, end); 290 start, end);
311 e820_register_active_regions(node, start >> PAGE_SHIFT, 291 e820_register_active_regions(node, start >> PAGE_SHIFT,
312 end >> PAGE_SHIFT); 292 end >> PAGE_SHIFT);
313 push_node_boundaries(node, nd->start >> PAGE_SHIFT,
314 nd->end >> PAGE_SHIFT);
315 293
316 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && 294 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
317 (reserve_hotadd(node, start, end) < 0)) { 295 update_nodes_add(node, start, end);
318 /* Ignore hotadd region. Undo damage */ 296 /* restore nodes[node] */
319 printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
320 *nd = oldnode; 297 *nd = oldnode;
321 if ((nd->start | nd->end) == 0) 298 if ((nd->start | nd->end) == 0)
322 node_clear(node, nodes_parsed); 299 node_clear(node, nodes_parsed);
@@ -345,9 +322,9 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
345 pxmram = 0; 322 pxmram = 0;
346 } 323 }
347 324
348 e820ram = max_pfn - absent_pages_in_range(0, max_pfn); 325 e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
349 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 326 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
350 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 327 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
351 printk(KERN_ERR 328 printk(KERN_ERR
352 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", 329 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
353 (pxmram << PAGE_SHIFT) >> 20, 330 (pxmram << PAGE_SHIFT) >> 20,
@@ -357,17 +334,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
357 return 1; 334 return 1;
358} 335}
359 336
360static void __init unparse_node(int node)
361{
362 int i;
363 node_clear(node, nodes_parsed);
364 node_clear(node, cpu_nodes_parsed);
365 for (i = 0; i < MAX_LOCAL_APIC; i++) {
366 if (apicid_to_node[i] == node)
367 apicid_to_node[i] = NUMA_NO_NODE;
368 }
369}
370
371void __init acpi_numa_arch_fixup(void) {} 337void __init acpi_numa_arch_fixup(void) {}
372 338
373/* Use the information discovered above to actually set up the nodes. */ 339/* Use the information discovered above to actually set up the nodes. */
@@ -379,18 +345,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
379 return -1; 345 return -1;
380 346
381 /* First clean up the node list */ 347 /* First clean up the node list */
382 for (i = 0; i < MAX_NUMNODES; i++) { 348 for (i = 0; i < MAX_NUMNODES; i++)
383 cutoff_node(i, start, end); 349 cutoff_node(i, start, end);
384 /*
385 * don't confuse VM with a node that doesn't have the
386 * minimum memory.
387 */
388 if (nodes[i].end &&
389 (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
390 unparse_node(i);
391 node_set_offline(i);
392 }
393 }
394 350
395 if (!nodes_cover_memory(nodes)) { 351 if (!nodes_cover_memory(nodes)) {
396 bad_srat(); 352 bad_srat();
@@ -423,7 +379,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
423 379
424 if (node == NUMA_NO_NODE) 380 if (node == NUMA_NO_NODE)
425 continue; 381 continue;
426 if (!node_isset(node, node_possible_map)) 382 if (!node_online(node))
427 numa_clear_node(i); 383 numa_clear_node(i);
428 } 384 }
429 numa_init_array(); 385 numa_init_array();
@@ -510,26 +466,6 @@ static int null_slit_node_compare(int a, int b)
510} 466}
511#endif /* CONFIG_NUMA_EMU */ 467#endif /* CONFIG_NUMA_EMU */
512 468
513void __init srat_reserve_add_area(int nodeid)
514{
515 if (found_add_area && nodes_add[nodeid].end) {
516 u64 total_mb;
517
518 printk(KERN_INFO "SRAT: Reserving hot-add memory space "
519 "for node %d at %Lx-%Lx\n",
520 nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
521 total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
522 >> PAGE_SHIFT;
523 total_mb *= sizeof(struct page);
524 total_mb >>= 20;
525 printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
526 "pre-allocated memory.\n", (unsigned long long)total_mb);
527 reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
528 nodes_add[nodeid].end - nodes_add[nodeid].start,
529 BOOTMEM_DEFAULT);
530 }
531}
532
533int __node_distance(int a, int b) 469int __node_distance(int a, int b)
534{ 470{
535 int index; 471 int index;
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864ad49a7..b07dd8d0b321 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
40 40
41 switch (val) { 41 switch (val) {
42 case DIE_NMI: 42 case DIE_NMI:
43 if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) 43 case DIE_NMI_IPI:
44 ret = NOTIFY_STOP; 44 model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
45 ret = NOTIFY_STOP;
45 break; 46 break;
46 default: 47 default:
47 break; 48 break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
134static struct notifier_block profile_exceptions_nb = { 135static struct notifier_block profile_exceptions_nb = {
135 .notifier_call = profile_exceptions_notify, 136 .notifier_call = profile_exceptions_notify,
136 .next = NULL, 137 .next = NULL,
137 .priority = 0 138 .priority = 2
138}; 139};
139 140
140static int nmi_setup(void) 141static int nmi_setup(void)
@@ -356,14 +357,11 @@ static void exit_sysfs(void)
356#define exit_sysfs() do { } while (0) 357#define exit_sysfs() do { } while (0)
357#endif /* CONFIG_PM */ 358#endif /* CONFIG_PM */
358 359
359static int p4force;
360module_param(p4force, int, 0);
361
362static int __init p4_init(char **cpu_type) 360static int __init p4_init(char **cpu_type)
363{ 361{
364 __u8 cpu_model = boot_cpu_data.x86_model; 362 __u8 cpu_model = boot_cpu_data.x86_model;
365 363
366 if (!p4force && (cpu_model > 6 || cpu_model == 5)) 364 if (cpu_model > 6 || cpu_model == 5)
367 return 0; 365 return 0;
368 366
369#ifndef CONFIG_SMP 367#ifndef CONFIG_SMP
@@ -389,10 +387,25 @@ static int __init p4_init(char **cpu_type)
389 return 0; 387 return 0;
390} 388}
391 389
390static int force_arch_perfmon;
391static int force_cpu_type(const char *str, struct kernel_param *kp)
392{
393 if (!strcmp(str, "archperfmon")) {
394 force_arch_perfmon = 1;
395 printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
396 }
397
398 return 0;
399}
400module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0);
401
392static int __init ppro_init(char **cpu_type) 402static int __init ppro_init(char **cpu_type)
393{ 403{
394 __u8 cpu_model = boot_cpu_data.x86_model; 404 __u8 cpu_model = boot_cpu_data.x86_model;
395 405
406 if (force_arch_perfmon && cpu_has_arch_perfmon)
407 return 0;
408
396 switch (cpu_model) { 409 switch (cpu_model) {
397 case 0 ... 2: 410 case 0 ... 2:
398 *cpu_type = "i386/ppro"; 411 *cpu_type = "i386/ppro";
@@ -414,6 +427,13 @@ static int __init ppro_init(char **cpu_type)
414 case 15: case 23: 427 case 15: case 23:
415 *cpu_type = "i386/core_2"; 428 *cpu_type = "i386/core_2";
416 break; 429 break;
430 case 26:
431 arch_perfmon_setup_counters();
432 *cpu_type = "i386/core_i7";
433 break;
434 case 28:
435 *cpu_type = "i386/atom";
436 break;
417 default: 437 default:
418 /* Unknown */ 438 /* Unknown */
419 return 0; 439 return 0;
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 10131fbdaada..4da7230b3d17 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/apic.h> 19#include <asm/apic.h>
20#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h> 21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
@@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
136 u64 val; 136 u64 val;
137 int i; 137 int i;
138 138
139 /*
140 * This can happen if perf counters are in use when
141 * we steal the die notifier NMI.
142 */
143 if (unlikely(!reset_value))
144 goto out;
145
139 for (i = 0 ; i < num_counters; ++i) { 146 for (i = 0 ; i < num_counters; ++i) {
140 if (!reset_value[i]) 147 if (!reset_value[i])
141 continue; 148 continue;
@@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
146 } 153 }
147 } 154 }
148 155
156out:
149 /* Only P6 based Pentium M need to re-unmask the apic vector but it 157 /* Only P6 based Pentium M need to re-unmask the apic vector but it
150 * doesn't hurt other P6 variant */ 158 * doesn't hurt other P6 variant */
151 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); 159 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index c0ecf250fe51..b26626dc517c 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -38,15 +38,26 @@ count_resource(struct acpi_resource *acpi_res, void *data)
38 struct acpi_resource_address64 addr; 38 struct acpi_resource_address64 addr;
39 acpi_status status; 39 acpi_status status;
40 40
41 if (info->res_num >= PCI_BUS_NUM_RESOURCES)
42 return AE_OK;
43
44 status = resource_to_addr(acpi_res, &addr); 41 status = resource_to_addr(acpi_res, &addr);
45 if (ACPI_SUCCESS(status)) 42 if (ACPI_SUCCESS(status))
46 info->res_num++; 43 info->res_num++;
47 return AE_OK; 44 return AE_OK;
48} 45}
49 46
47static int
48bus_has_transparent_bridge(struct pci_bus *bus)
49{
50 struct pci_dev *dev;
51
52 list_for_each_entry(dev, &bus->devices, bus_list) {
53 u16 class = dev->class >> 8;
54
55 if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent)
56 return true;
57 }
58 return false;
59}
60
50static acpi_status 61static acpi_status
51setup_resource(struct acpi_resource *acpi_res, void *data) 62setup_resource(struct acpi_resource *acpi_res, void *data)
52{ 63{
@@ -56,9 +67,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
56 acpi_status status; 67 acpi_status status;
57 unsigned long flags; 68 unsigned long flags;
58 struct resource *root; 69 struct resource *root;
59 70 int max_root_bus_resources = PCI_BUS_NUM_RESOURCES;
60 if (info->res_num >= PCI_BUS_NUM_RESOURCES)
61 return AE_OK;
62 71
63 status = resource_to_addr(acpi_res, &addr); 72 status = resource_to_addr(acpi_res, &addr);
64 if (!ACPI_SUCCESS(status)) 73 if (!ACPI_SUCCESS(status))
@@ -82,6 +91,18 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
82 res->end = res->start + addr.address_length - 1; 91 res->end = res->start + addr.address_length - 1;
83 res->child = NULL; 92 res->child = NULL;
84 93
94 if (bus_has_transparent_bridge(info->bus))
95 max_root_bus_resources -= 3;
96 if (info->res_num >= max_root_bus_resources) {
97 printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx "
98 "from %s for %s due to _CRS returning more than "
99 "%d resource descriptors\n", (unsigned long) res->start,
100 (unsigned long) res->end, root->name, info->name,
101 max_root_bus_resources);
102 info->res_num++;
103 return AE_OK;
104 }
105
85 if (insert_resource(root, res)) { 106 if (insert_resource(root, res)) {
86 printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " 107 printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx "
87 "from %s for %s\n", (unsigned long) res->start, 108 "from %s for %s\n", (unsigned long) res->start,
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index a85bef20a3b9..0fb56db16d18 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -116,7 +116,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
116 struct pci_bus *bus; 116 struct pci_bus *bus;
117 struct pci_dev *dev; 117 struct pci_dev *dev;
118 int idx; 118 int idx;
119 struct resource *r, *pr; 119 struct resource *r;
120 120
121 /* Depth-First Search on bus tree */ 121 /* Depth-First Search on bus tree */
122 list_for_each_entry(bus, bus_list, node) { 122 list_for_each_entry(bus, bus_list, node) {
@@ -126,9 +126,8 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
126 r = &dev->resource[idx]; 126 r = &dev->resource[idx];
127 if (!r->flags) 127 if (!r->flags)
128 continue; 128 continue;
129 pr = pci_find_parent_resource(dev, r); 129 if (!r->start ||
130 if (!r->start || !pr || 130 pci_claim_resource(dev, idx) < 0) {
131 request_resource(pr, r) < 0) {
132 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); 131 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx);
133 /* 132 /*
134 * Something is wrong with the region. 133 * Something is wrong with the region.
@@ -149,7 +148,7 @@ static void __init pcibios_allocate_resources(int pass)
149 struct pci_dev *dev = NULL; 148 struct pci_dev *dev = NULL;
150 int idx, disabled; 149 int idx, disabled;
151 u16 command; 150 u16 command;
152 struct resource *r, *pr; 151 struct resource *r;
153 152
154 for_each_pci_dev(dev) { 153 for_each_pci_dev(dev) {
155 pci_read_config_word(dev, PCI_COMMAND, &command); 154 pci_read_config_word(dev, PCI_COMMAND, &command);
@@ -168,8 +167,7 @@ static void __init pcibios_allocate_resources(int pass)
168 (unsigned long long) r->start, 167 (unsigned long long) r->start,
169 (unsigned long long) r->end, 168 (unsigned long long) r->end,
170 r->flags, disabled, pass); 169 r->flags, disabled, pass);
171 pr = pci_find_parent_resource(dev, r); 170 if (pci_claim_resource(dev, idx) < 0) {
172 if (!pr || request_resource(pr, r) < 0) {
173 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); 171 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx);
174 /* We'll assign a new address later */ 172 /* We'll assign a new address later */
175 r->end -= r->start; 173 r->end -= r->start;
@@ -197,7 +195,7 @@ static void __init pcibios_allocate_resources(int pass)
197static int __init pcibios_assign_resources(void) 195static int __init pcibios_assign_resources(void)
198{ 196{
199 struct pci_dev *dev = NULL; 197 struct pci_dev *dev = NULL;
200 struct resource *r, *pr; 198 struct resource *r;
201 199
202 if (!(pci_probe & PCI_ASSIGN_ROMS)) { 200 if (!(pci_probe & PCI_ASSIGN_ROMS)) {
203 /* 201 /*
@@ -209,8 +207,7 @@ static int __init pcibios_assign_resources(void)
209 r = &dev->resource[PCI_ROM_RESOURCE]; 207 r = &dev->resource[PCI_ROM_RESOURCE];
210 if (!r->flags || !r->start) 208 if (!r->flags || !r->start)
211 continue; 209 continue;
212 pr = pci_find_parent_resource(dev, r); 210 if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) {
213 if (!pr || request_resource(pr, r) < 0) {
214 r->end -= r->start; 211 r->end -= r->start;
215 r->start = 0; 212 r->start = 0;
216 } 213 }
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index fecbce6e7d7c..0696d506c4ad 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -889,6 +889,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
889 return 0; 889 return 0;
890 } 890 }
891 891
892 if (io_apic_assign_pci_irqs)
893 return 0;
894
892 /* Find IRQ routing entry */ 895 /* Find IRQ routing entry */
893 896
894 if (!pirq_table) 897 if (!pirq_table)
@@ -1039,56 +1042,15 @@ static void __init pcibios_fixup_irqs(void)
1039 pirq_penalty[dev->irq]++; 1042 pirq_penalty[dev->irq]++;
1040 } 1043 }
1041 1044
1045 if (io_apic_assign_pci_irqs)
1046 return;
1047
1042 dev = NULL; 1048 dev = NULL;
1043 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1049 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
1044 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 1050 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
1045 if (!pin) 1051 if (!pin)
1046 continue; 1052 continue;
1047 1053
1048#ifdef CONFIG_X86_IO_APIC
1049 /*
1050 * Recalculate IRQ numbers if we use the I/O APIC.
1051 */
1052 if (io_apic_assign_pci_irqs) {
1053 int irq;
1054
1055 /*
1056 * interrupt pins are numbered starting from 1
1057 */
1058 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
1059 PCI_SLOT(dev->devfn), pin - 1);
1060 /*
1061 * Busses behind bridges are typically not listed in the
1062 * MP-table. In this case we have to look up the IRQ
1063 * based on the parent bus, parent slot, and pin number.
1064 * The SMP code detects such bridged busses itself so we
1065 * should get into this branch reliably.
1066 */
1067 if (irq < 0 && dev->bus->parent) {
1068 /* go back to the bridge */
1069 struct pci_dev *bridge = dev->bus->self;
1070 int bus;
1071
1072 pin = pci_swizzle_interrupt_pin(dev, pin);
1073 bus = bridge->bus->number;
1074 irq = IO_APIC_get_PCI_irq_vector(bus,
1075 PCI_SLOT(bridge->devfn), pin - 1);
1076 if (irq >= 0)
1077 dev_warn(&dev->dev,
1078 "using bridge %s INT %c to "
1079 "get IRQ %d\n",
1080 pci_name(bridge),
1081 'A' + pin - 1, irq);
1082 }
1083 if (irq >= 0) {
1084 dev_info(&dev->dev,
1085 "PCI->APIC IRQ transform: INT %c "
1086 "-> IRQ %d\n",
1087 'A' + pin - 1, irq);
1088 dev->irq = irq;
1089 }
1090 }
1091#endif
1092 /* 1054 /*
1093 * Still no IRQ? Try to lookup one... 1055 * Still no IRQ? Try to lookup one...
1094 */ 1056 */
@@ -1183,6 +1145,19 @@ int __init pcibios_irq_init(void)
1183 pcibios_enable_irq = pirq_enable_irq; 1145 pcibios_enable_irq = pirq_enable_irq;
1184 1146
1185 pcibios_fixup_irqs(); 1147 pcibios_fixup_irqs();
1148
1149 if (io_apic_assign_pci_irqs && pci_routeirq) {
1150 struct pci_dev *dev = NULL;
1151 /*
1152 * PCI IRQ routing is set up by pci_enable_device(), but we
1153 * also do it here in case there are still broken drivers that
1154 * don't use pci_enable_device().
1155 */
1156 printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
1157 for_each_pci_dev(dev)
1158 pirq_enable_irq(dev);
1159 }
1160
1186 return 0; 1161 return 0;
1187} 1162}
1188 1163
@@ -1213,16 +1188,23 @@ void pcibios_penalize_isa_irq(int irq, int active)
1213static int pirq_enable_irq(struct pci_dev *dev) 1188static int pirq_enable_irq(struct pci_dev *dev)
1214{ 1189{
1215 u8 pin; 1190 u8 pin;
1216 struct pci_dev *temp_dev;
1217 1191
1218 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 1192 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
1219 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { 1193 if (pin && !pcibios_lookup_irq(dev, 1)) {
1220 char *msg = ""; 1194 char *msg = "";
1221 1195
1196 if (!io_apic_assign_pci_irqs && dev->irq)
1197 return 0;
1198
1222 if (io_apic_assign_pci_irqs) { 1199 if (io_apic_assign_pci_irqs) {
1200#ifdef CONFIG_X86_IO_APIC
1201 struct pci_dev *temp_dev;
1223 int irq; 1202 int irq;
1203 struct io_apic_irq_attr irq_attr;
1224 1204
1225 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1); 1205 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
1206 PCI_SLOT(dev->devfn),
1207 pin - 1, &irq_attr);
1226 /* 1208 /*
1227 * Busses behind bridges are typically not listed in the MP-table. 1209 * Busses behind bridges are typically not listed in the MP-table.
1228 * In this case we have to look up the IRQ based on the parent bus, 1210 * In this case we have to look up the IRQ based on the parent bus,
@@ -1235,7 +1217,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
1235 1217
1236 pin = pci_swizzle_interrupt_pin(dev, pin); 1218 pin = pci_swizzle_interrupt_pin(dev, pin);
1237 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1219 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1238 PCI_SLOT(bridge->devfn), pin - 1); 1220 PCI_SLOT(bridge->devfn),
1221 pin - 1, &irq_attr);
1239 if (irq >= 0) 1222 if (irq >= 0)
1240 dev_warn(&dev->dev, "using bridge %s " 1223 dev_warn(&dev->dev, "using bridge %s "
1241 "INT %c to get IRQ %d\n", 1224 "INT %c to get IRQ %d\n",
@@ -1245,12 +1228,15 @@ static int pirq_enable_irq(struct pci_dev *dev)
1245 } 1228 }
1246 dev = temp_dev; 1229 dev = temp_dev;
1247 if (irq >= 0) { 1230 if (irq >= 0) {
1231 io_apic_set_pci_routing(&dev->dev, irq,
1232 &irq_attr);
1233 dev->irq = irq;
1248 dev_info(&dev->dev, "PCI->APIC IRQ transform: " 1234 dev_info(&dev->dev, "PCI->APIC IRQ transform: "
1249 "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); 1235 "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
1250 dev->irq = irq;
1251 return 0; 1236 return 0;
1252 } else 1237 } else
1253 msg = "; probably buggy MP table"; 1238 msg = "; probably buggy MP table";
1239#endif
1254 } else if (pci_probe & PCI_BIOS_IRQ_SCAN) 1240 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
1255 msg = ""; 1241 msg = "";
1256 else 1242 else
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 8766b0e216c5..712443ec6d43 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -523,6 +523,69 @@ reject:
523 523
524static int __initdata known_bridge; 524static int __initdata known_bridge;
525 525
526static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
527
528/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
529struct acpi_mcfg_allocation *pci_mmcfg_config;
530int pci_mmcfg_config_num;
531
532static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
533{
534 if (!strcmp(mcfg->header.oem_id, "SGI"))
535 acpi_mcfg_64bit_base_addr = TRUE;
536
537 return 0;
538}
539
540static int __init pci_parse_mcfg(struct acpi_table_header *header)
541{
542 struct acpi_table_mcfg *mcfg;
543 unsigned long i;
544 int config_size;
545
546 if (!header)
547 return -EINVAL;
548
549 mcfg = (struct acpi_table_mcfg *)header;
550
551 /* how many config structures do we have */
552 pci_mmcfg_config_num = 0;
553 i = header->length - sizeof(struct acpi_table_mcfg);
554 while (i >= sizeof(struct acpi_mcfg_allocation)) {
555 ++pci_mmcfg_config_num;
556 i -= sizeof(struct acpi_mcfg_allocation);
557 };
558 if (pci_mmcfg_config_num == 0) {
559 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
560 return -ENODEV;
561 }
562
563 config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
564 pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
565 if (!pci_mmcfg_config) {
566 printk(KERN_WARNING PREFIX
567 "No memory for MCFG config tables\n");
568 return -ENOMEM;
569 }
570
571 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
572
573 acpi_mcfg_oem_check(mcfg);
574
575 for (i = 0; i < pci_mmcfg_config_num; ++i) {
576 if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
577 !acpi_mcfg_64bit_base_addr) {
578 printk(KERN_ERR PREFIX
579 "MMCONFIG not in low 4GB of memory\n");
580 kfree(pci_mmcfg_config);
581 pci_mmcfg_config_num = 0;
582 return -ENODEV;
583 }
584 }
585
586 return 0;
587}
588
526static void __init __pci_mmcfg_init(int early) 589static void __init __pci_mmcfg_init(int early)
527{ 590{
528 /* MMCONFIG disabled */ 591 /* MMCONFIG disabled */
@@ -543,7 +606,7 @@ static void __init __pci_mmcfg_init(int early)
543 } 606 }
544 607
545 if (!known_bridge) 608 if (!known_bridge)
546 acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); 609 acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
547 610
548 pci_mmcfg_reject_broken(early); 611 pci_mmcfg_reject_broken(early);
549 612
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index 58b32db33125..de2abbd07544 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -3,5 +3,5 @@
3nostackp := $(call cc-option, -fno-stack-protector) 3nostackp := $(call cc-option, -fno-stack-protector)
4CFLAGS_cpu_$(BITS).o := $(nostackp) 4CFLAGS_cpu_$(BITS).o := $(nostackp)
5 5
6obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o 6obj-$(CONFIG_PM_SLEEP) += cpu.o
7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o 7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu.c
index 5343540f2607..d277ef1eea51 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Suspend and hibernation support for x86-64 2 * Suspend support specific for i386/x86-64.
3 * 3 *
4 * Distribute under GPLv2 4 * Distribute under GPLv2
5 * 5 *
@@ -8,18 +8,28 @@
8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> 8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
9 */ 9 */
10 10
11#include <linux/smp.h>
12#include <linux/suspend.h> 11#include <linux/suspend.h>
13#include <asm/proto.h> 12#include <linux/smp.h>
14#include <asm/page.h> 13
15#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include <asm/proto.h>
16#include <asm/mtrr.h> 16#include <asm/mtrr.h>
17#include <asm/page.h>
18#include <asm/mce.h>
17#include <asm/xcr.h> 19#include <asm/xcr.h>
18#include <asm/suspend.h> 20#include <asm/suspend.h>
19 21
20static void fix_processor_context(void); 22#ifdef CONFIG_X86_32
23static struct saved_context saved_context;
21 24
25unsigned long saved_context_ebx;
26unsigned long saved_context_esp, saved_context_ebp;
27unsigned long saved_context_esi, saved_context_edi;
28unsigned long saved_context_eflags;
29#else
30/* CONFIG_X86_64 */
22struct saved_context saved_context; 31struct saved_context saved_context;
32#endif
23 33
24/** 34/**
25 * __save_processor_state - save CPU registers before creating a 35 * __save_processor_state - save CPU registers before creating a
@@ -38,19 +48,35 @@ struct saved_context saved_context;
38 */ 48 */
39static void __save_processor_state(struct saved_context *ctxt) 49static void __save_processor_state(struct saved_context *ctxt)
40{ 50{
51#ifdef CONFIG_X86_32
52 mtrr_save_fixed_ranges(NULL);
53#endif
41 kernel_fpu_begin(); 54 kernel_fpu_begin();
42 55
43 /* 56 /*
44 * descriptor tables 57 * descriptor tables
45 */ 58 */
59#ifdef CONFIG_X86_32
60 store_gdt(&ctxt->gdt);
61 store_idt(&ctxt->idt);
62#else
63/* CONFIG_X86_64 */
46 store_gdt((struct desc_ptr *)&ctxt->gdt_limit); 64 store_gdt((struct desc_ptr *)&ctxt->gdt_limit);
47 store_idt((struct desc_ptr *)&ctxt->idt_limit); 65 store_idt((struct desc_ptr *)&ctxt->idt_limit);
66#endif
48 store_tr(ctxt->tr); 67 store_tr(ctxt->tr);
49 68
50 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ 69 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
51 /* 70 /*
52 * segment registers 71 * segment registers
53 */ 72 */
73#ifdef CONFIG_X86_32
74 savesegment(es, ctxt->es);
75 savesegment(fs, ctxt->fs);
76 savesegment(gs, ctxt->gs);
77 savesegment(ss, ctxt->ss);
78#else
79/* CONFIG_X86_64 */
54 asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); 80 asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
55 asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); 81 asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
56 asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); 82 asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
@@ -62,30 +88,87 @@ static void __save_processor_state(struct saved_context *ctxt)
62 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); 88 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
63 mtrr_save_fixed_ranges(NULL); 89 mtrr_save_fixed_ranges(NULL);
64 90
91 rdmsrl(MSR_EFER, ctxt->efer);
92#endif
93
65 /* 94 /*
66 * control registers 95 * control registers
67 */ 96 */
68 rdmsrl(MSR_EFER, ctxt->efer);
69 ctxt->cr0 = read_cr0(); 97 ctxt->cr0 = read_cr0();
70 ctxt->cr2 = read_cr2(); 98 ctxt->cr2 = read_cr2();
71 ctxt->cr3 = read_cr3(); 99 ctxt->cr3 = read_cr3();
100#ifdef CONFIG_X86_32
101 ctxt->cr4 = read_cr4_safe();
102#else
103/* CONFIG_X86_64 */
72 ctxt->cr4 = read_cr4(); 104 ctxt->cr4 = read_cr4();
73 ctxt->cr8 = read_cr8(); 105 ctxt->cr8 = read_cr8();
106#endif
74} 107}
75 108
109/* Needed by apm.c */
76void save_processor_state(void) 110void save_processor_state(void)
77{ 111{
78 __save_processor_state(&saved_context); 112 __save_processor_state(&saved_context);
79} 113}
114#ifdef CONFIG_X86_32
115EXPORT_SYMBOL(save_processor_state);
116#endif
80 117
81static void do_fpu_end(void) 118static void do_fpu_end(void)
82{ 119{
83 /* 120 /*
84 * Restore FPU regs if necessary 121 * Restore FPU regs if necessary.
85 */ 122 */
86 kernel_fpu_end(); 123 kernel_fpu_end();
87} 124}
88 125
126static void fix_processor_context(void)
127{
128 int cpu = smp_processor_id();
129 struct tss_struct *t = &per_cpu(init_tss, cpu);
130
131 set_tss_desc(cpu, t); /*
132 * This just modifies memory; should not be
133 * necessary. But... This is necessary, because
134 * 386 hardware has concept of busy TSS or some
135 * similar stupidity.
136 */
137
138#ifdef CONFIG_X86_64
139 get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
140
141 syscall_init(); /* This sets MSR_*STAR and related */
142#endif
143 load_TR_desc(); /* This does ltr */
144 load_LDT(&current->active_mm->context); /* This does lldt */
145
146 /*
147 * Now maybe reload the debug registers
148 */
149 if (current->thread.debugreg7) {
150#ifdef CONFIG_X86_32
151 set_debugreg(current->thread.debugreg0, 0);
152 set_debugreg(current->thread.debugreg1, 1);
153 set_debugreg(current->thread.debugreg2, 2);
154 set_debugreg(current->thread.debugreg3, 3);
155 /* no 4 and 5 */
156 set_debugreg(current->thread.debugreg6, 6);
157 set_debugreg(current->thread.debugreg7, 7);
158#else
159 /* CONFIG_X86_64 */
160 loaddebug(&current->thread, 0);
161 loaddebug(&current->thread, 1);
162 loaddebug(&current->thread, 2);
163 loaddebug(&current->thread, 3);
164 /* no 4 and 5 */
165 loaddebug(&current->thread, 6);
166 loaddebug(&current->thread, 7);
167#endif
168 }
169
170}
171
89/** 172/**
90 * __restore_processor_state - restore the contents of CPU registers saved 173 * __restore_processor_state - restore the contents of CPU registers saved
91 * by __save_processor_state() 174 * by __save_processor_state()
@@ -96,9 +179,16 @@ static void __restore_processor_state(struct saved_context *ctxt)
96 /* 179 /*
97 * control registers 180 * control registers
98 */ 181 */
182 /* cr4 was introduced in the Pentium CPU */
183#ifdef CONFIG_X86_32
184 if (ctxt->cr4)
185 write_cr4(ctxt->cr4);
186#else
187/* CONFIG X86_64 */
99 wrmsrl(MSR_EFER, ctxt->efer); 188 wrmsrl(MSR_EFER, ctxt->efer);
100 write_cr8(ctxt->cr8); 189 write_cr8(ctxt->cr8);
101 write_cr4(ctxt->cr4); 190 write_cr4(ctxt->cr4);
191#endif
102 write_cr3(ctxt->cr3); 192 write_cr3(ctxt->cr3);
103 write_cr2(ctxt->cr2); 193 write_cr2(ctxt->cr2);
104 write_cr0(ctxt->cr0); 194 write_cr0(ctxt->cr0);
@@ -107,13 +197,31 @@ static void __restore_processor_state(struct saved_context *ctxt)
107 * now restore the descriptor tables to their proper values 197 * now restore the descriptor tables to their proper values
108 * ltr is done in fix_processor_context(). 198 * ltr is done in fix_processor_context().
109 */ 199 */
200#ifdef CONFIG_X86_32
201 load_gdt(&ctxt->gdt);
202 load_idt(&ctxt->idt);
203#else
204/* CONFIG_X86_64 */
110 load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); 205 load_gdt((const struct desc_ptr *)&ctxt->gdt_limit);
111 load_idt((const struct desc_ptr *)&ctxt->idt_limit); 206 load_idt((const struct desc_ptr *)&ctxt->idt_limit);
112 207#endif
113 208
114 /* 209 /*
115 * segment registers 210 * segment registers
116 */ 211 */
212#ifdef CONFIG_X86_32
213 loadsegment(es, ctxt->es);
214 loadsegment(fs, ctxt->fs);
215 loadsegment(gs, ctxt->gs);
216 loadsegment(ss, ctxt->ss);
217
218 /*
219 * sysenter MSRs
220 */
221 if (boot_cpu_has(X86_FEATURE_SEP))
222 enable_sep_cpu();
223#else
224/* CONFIG_X86_64 */
117 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); 225 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
118 asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); 226 asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
119 asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); 227 asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
@@ -123,6 +231,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
123 wrmsrl(MSR_FS_BASE, ctxt->fs_base); 231 wrmsrl(MSR_FS_BASE, ctxt->fs_base);
124 wrmsrl(MSR_GS_BASE, ctxt->gs_base); 232 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
125 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); 233 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
234#endif
126 235
127 /* 236 /*
128 * restore XCR0 for xsave capable cpu's. 237 * restore XCR0 for xsave capable cpu's.
@@ -134,41 +243,17 @@ static void __restore_processor_state(struct saved_context *ctxt)
134 243
135 do_fpu_end(); 244 do_fpu_end();
136 mtrr_ap_init(); 245 mtrr_ap_init();
246
247#ifdef CONFIG_X86_32
248 mcheck_init(&boot_cpu_data);
249#endif
137} 250}
138 251
252/* Needed by apm.c */
139void restore_processor_state(void) 253void restore_processor_state(void)
140{ 254{
141 __restore_processor_state(&saved_context); 255 __restore_processor_state(&saved_context);
142} 256}
143 257#ifdef CONFIG_X86_32
144static void fix_processor_context(void) 258EXPORT_SYMBOL(restore_processor_state);
145{ 259#endif
146 int cpu = smp_processor_id();
147 struct tss_struct *t = &per_cpu(init_tss, cpu);
148
149 /*
150 * This just modifies memory; should not be necessary. But... This
151 * is necessary, because 386 hardware has concept of busy TSS or some
152 * similar stupidity.
153 */
154 set_tss_desc(cpu, t);
155
156 get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
157
158 syscall_init(); /* This sets MSR_*STAR and related */
159 load_TR_desc(); /* This does ltr */
160 load_LDT(&current->active_mm->context); /* This does lldt */
161
162 /*
163 * Now maybe reload the debug registers
164 */
165 if (current->thread.debugreg7){
166 loaddebug(&current->thread, 0);
167 loaddebug(&current->thread, 1);
168 loaddebug(&current->thread, 2);
169 loaddebug(&current->thread, 3);
170 /* no 4 and 5 */
171 loaddebug(&current->thread, 6);
172 loaddebug(&current->thread, 7);
173 }
174}
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
deleted file mode 100644
index ce702c5b3a2c..000000000000
--- a/arch/x86/power/cpu_32.c
+++ /dev/null
@@ -1,148 +0,0 @@
1/*
2 * Suspend support specific for i386.
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
8 */
9
10#include <linux/module.h>
11#include <linux/suspend.h>
12#include <asm/mtrr.h>
13#include <asm/mce.h>
14#include <asm/xcr.h>
15#include <asm/suspend.h>
16
17static struct saved_context saved_context;
18
19unsigned long saved_context_ebx;
20unsigned long saved_context_esp, saved_context_ebp;
21unsigned long saved_context_esi, saved_context_edi;
22unsigned long saved_context_eflags;
23
24static void __save_processor_state(struct saved_context *ctxt)
25{
26 mtrr_save_fixed_ranges(NULL);
27 kernel_fpu_begin();
28
29 /*
30 * descriptor tables
31 */
32 store_gdt(&ctxt->gdt);
33 store_idt(&ctxt->idt);
34 store_tr(ctxt->tr);
35
36 /*
37 * segment registers
38 */
39 savesegment(es, ctxt->es);
40 savesegment(fs, ctxt->fs);
41 savesegment(gs, ctxt->gs);
42 savesegment(ss, ctxt->ss);
43
44 /*
45 * control registers
46 */
47 ctxt->cr0 = read_cr0();
48 ctxt->cr2 = read_cr2();
49 ctxt->cr3 = read_cr3();
50 ctxt->cr4 = read_cr4_safe();
51}
52
53/* Needed by apm.c */
54void save_processor_state(void)
55{
56 __save_processor_state(&saved_context);
57}
58EXPORT_SYMBOL(save_processor_state);
59
60static void do_fpu_end(void)
61{
62 /*
63 * Restore FPU regs if necessary.
64 */
65 kernel_fpu_end();
66}
67
68static void fix_processor_context(void)
69{
70 int cpu = smp_processor_id();
71 struct tss_struct *t = &per_cpu(init_tss, cpu);
72
73 set_tss_desc(cpu, t); /*
74 * This just modifies memory; should not be
75 * necessary. But... This is necessary, because
76 * 386 hardware has concept of busy TSS or some
77 * similar stupidity.
78 */
79
80 load_TR_desc(); /* This does ltr */
81 load_LDT(&current->active_mm->context); /* This does lldt */
82
83 /*
84 * Now maybe reload the debug registers
85 */
86 if (current->thread.debugreg7) {
87 set_debugreg(current->thread.debugreg0, 0);
88 set_debugreg(current->thread.debugreg1, 1);
89 set_debugreg(current->thread.debugreg2, 2);
90 set_debugreg(current->thread.debugreg3, 3);
91 /* no 4 and 5 */
92 set_debugreg(current->thread.debugreg6, 6);
93 set_debugreg(current->thread.debugreg7, 7);
94 }
95
96}
97
98static void __restore_processor_state(struct saved_context *ctxt)
99{
100 /*
101 * control registers
102 */
103 /* cr4 was introduced in the Pentium CPU */
104 if (ctxt->cr4)
105 write_cr4(ctxt->cr4);
106 write_cr3(ctxt->cr3);
107 write_cr2(ctxt->cr2);
108 write_cr0(ctxt->cr0);
109
110 /*
111 * now restore the descriptor tables to their proper values
112 * ltr is done in fix_processor_context().
113 */
114 load_gdt(&ctxt->gdt);
115 load_idt(&ctxt->idt);
116
117 /*
118 * segment registers
119 */
120 loadsegment(es, ctxt->es);
121 loadsegment(fs, ctxt->fs);
122 loadsegment(gs, ctxt->gs);
123 loadsegment(ss, ctxt->ss);
124
125 /*
126 * sysenter MSRs
127 */
128 if (boot_cpu_has(X86_FEATURE_SEP))
129 enable_sep_cpu();
130
131 /*
132 * restore XCR0 for xsave capable cpu's.
133 */
134 if (cpu_has_xsave)
135 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
136
137 fix_processor_context();
138 do_fpu_end();
139 mtrr_ap_init();
140 mcheck_init(&boot_cpu_data);
141}
142
143/* Needed by apm.c */
144void restore_processor_state(void)
145{
146 __restore_processor_state(&saved_context);
147}
148EXPORT_SYMBOL(restore_processor_state);
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 16a9020c8f11..88112b49f02c 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -123,6 +123,7 @@ quiet_cmd_vdso = VDSO $@
123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) 123 -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
124 124
125VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv) 125VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv)
126GCOV_PROFILE := n
126 127
127# 128#
128# Install the unstripped copy of vdso*.so listed in $(vdso-install-y). 129# Install the unstripped copy of vdso*.so listed in $(vdso-install-y).
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 1241f118ab56..58bc00f68b12 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
338 } 338 }
339 } 339 }
340 340
341 current->mm->context.vdso = (void *)addr;
342
341 if (compat_uses_vma || !compat) { 343 if (compat_uses_vma || !compat) {
342 /* 344 /*
343 * MAYWRITE to allow gdb to COW and set breakpoints 345 * MAYWRITE to allow gdb to COW and set breakpoints
@@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
358 goto up_fail; 360 goto up_fail;
359 } 361 }
360 362
361 current->mm->context.vdso = (void *)addr;
362 current_thread_info()->sysenter_return = 363 current_thread_info()->sysenter_return =
363 VDSO32_SYMBOL(addr, SYSENTER_RETURN); 364 VDSO32_SYMBOL(addr, SYSENTER_RETURN);
364 365
365 up_fail: 366 up_fail:
367 if (ret)
368 current->mm->context.vdso = NULL;
369
366 up_write(&mm->mmap_sem); 370 up_write(&mm->mmap_sem);
367 371
368 return ret; 372 return ret;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7133cdf9098b..21e1aeb9f3ea 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -8,6 +8,7 @@
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/random.h> 10#include <linux/random.h>
11#include <linux/elf.h>
11#include <asm/vsyscall.h> 12#include <asm/vsyscall.h>
12#include <asm/vgtod.h> 13#include <asm/vgtod.h>
13#include <asm/proto.h> 14#include <asm/proto.h>
@@ -115,15 +116,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
115 goto up_fail; 116 goto up_fail;
116 } 117 }
117 118
119 current->mm->context.vdso = (void *)addr;
120
118 ret = install_special_mapping(mm, addr, vdso_size, 121 ret = install_special_mapping(mm, addr, vdso_size,
119 VM_READ|VM_EXEC| 122 VM_READ|VM_EXEC|
120 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 123 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
121 VM_ALWAYSDUMP, 124 VM_ALWAYSDUMP,
122 vdso_pages); 125 vdso_pages);
123 if (ret) 126 if (ret) {
127 current->mm->context.vdso = NULL;
124 goto up_fail; 128 goto up_fail;
129 }
125 130
126 current->mm->context.vdso = (void *)addr;
127up_fail: 131up_fail:
128 up_write(&mm->mmap_sem); 132 up_write(&mm->mmap_sem);
129 return ret; 133 return ret;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09e8c36ee80..0a1700a2be9c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -20,6 +20,7 @@
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/start_kernel.h> 21#include <linux/start_kernel.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/kprobes.h>
23#include <linux/bootmem.h> 24#include <linux/bootmem.h>
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/mm.h> 26#include <linux/mm.h>
@@ -44,6 +45,7 @@
44#include <asm/processor.h> 45#include <asm/processor.h>
45#include <asm/proto.h> 46#include <asm/proto.h>
46#include <asm/msr-index.h> 47#include <asm/msr-index.h>
48#include <asm/traps.h>
47#include <asm/setup.h> 49#include <asm/setup.h>
48#include <asm/desc.h> 50#include <asm/desc.h>
49#include <asm/pgtable.h> 51#include <asm/pgtable.h>
@@ -240,10 +242,10 @@ static unsigned long xen_get_debugreg(int reg)
240 return HYPERVISOR_get_debugreg(reg); 242 return HYPERVISOR_get_debugreg(reg);
241} 243}
242 244
243void xen_leave_lazy(void) 245static void xen_end_context_switch(struct task_struct *next)
244{ 246{
245 paravirt_leave_lazy(paravirt_get_lazy_mode());
246 xen_mc_flush(); 247 xen_mc_flush();
248 paravirt_end_context_switch(next);
247} 249}
248 250
249static unsigned long xen_store_tr(void) 251static unsigned long xen_store_tr(void)
@@ -428,11 +430,44 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
428static int cvt_gate_to_trap(int vector, const gate_desc *val, 430static int cvt_gate_to_trap(int vector, const gate_desc *val,
429 struct trap_info *info) 431 struct trap_info *info)
430{ 432{
433 unsigned long addr;
434
431 if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) 435 if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
432 return 0; 436 return 0;
433 437
434 info->vector = vector; 438 info->vector = vector;
435 info->address = gate_offset(*val); 439
440 addr = gate_offset(*val);
441#ifdef CONFIG_X86_64
442 /*
443 * Look for known traps using IST, and substitute them
444 * appropriately. The debugger ones are the only ones we care
445 * about. Xen will handle faults like double_fault and
446 * machine_check, so we should never see them. Warn if
447 * there's an unexpected IST-using fault handler.
448 */
449 if (addr == (unsigned long)debug)
450 addr = (unsigned long)xen_debug;
451 else if (addr == (unsigned long)int3)
452 addr = (unsigned long)xen_int3;
453 else if (addr == (unsigned long)stack_segment)
454 addr = (unsigned long)xen_stack_segment;
455 else if (addr == (unsigned long)double_fault ||
456 addr == (unsigned long)nmi) {
457 /* Don't need to handle these */
458 return 0;
459#ifdef CONFIG_X86_MCE
460 } else if (addr == (unsigned long)machine_check) {
461 return 0;
462#endif
463 } else {
464 /* Some other trap using IST? */
465 if (WARN_ON(val->ist != 0))
466 return 0;
467 }
468#endif /* CONFIG_X86_64 */
469 info->address = addr;
470
436 info->cs = gate_segment(*val); 471 info->cs = gate_segment(*val);
437 info->flags = val->dpl; 472 info->flags = val->dpl;
438 /* interrupt gates clear IF */ 473 /* interrupt gates clear IF */
@@ -623,10 +658,26 @@ static void xen_clts(void)
623 xen_mc_issue(PARAVIRT_LAZY_CPU); 658 xen_mc_issue(PARAVIRT_LAZY_CPU);
624} 659}
625 660
661static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
662
663static unsigned long xen_read_cr0(void)
664{
665 unsigned long cr0 = percpu_read(xen_cr0_value);
666
667 if (unlikely(cr0 == 0)) {
668 cr0 = native_read_cr0();
669 percpu_write(xen_cr0_value, cr0);
670 }
671
672 return cr0;
673}
674
626static void xen_write_cr0(unsigned long cr0) 675static void xen_write_cr0(unsigned long cr0)
627{ 676{
628 struct multicall_space mcs; 677 struct multicall_space mcs;
629 678
679 percpu_write(xen_cr0_value, cr0);
680
630 /* Only pay attention to cr0.TS; everything else is 681 /* Only pay attention to cr0.TS; everything else is
631 ignored. */ 682 ignored. */
632 mcs = xen_mc_entry(0); 683 mcs = xen_mc_entry(0);
@@ -812,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
812 863
813 .clts = xen_clts, 864 .clts = xen_clts,
814 865
815 .read_cr0 = native_read_cr0, 866 .read_cr0 = xen_read_cr0,
816 .write_cr0 = xen_write_cr0, 867 .write_cr0 = xen_write_cr0,
817 868
818 .read_cr4 = native_read_cr4, 869 .read_cr4 = native_read_cr4,
@@ -860,10 +911,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
860 /* Xen takes care of %gs when switching to usermode for us */ 911 /* Xen takes care of %gs when switching to usermode for us */
861 .swapgs = paravirt_nop, 912 .swapgs = paravirt_nop,
862 913
863 .lazy_mode = { 914 .start_context_switch = paravirt_start_context_switch,
864 .enter = paravirt_enter_lazy_cpu, 915 .end_context_switch = xen_end_context_switch,
865 .leave = xen_leave_lazy,
866 },
867}; 916};
868 917
869static const struct pv_apic_ops xen_apic_ops __initdata = { 918static const struct pv_apic_ops xen_apic_ops __initdata = {
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index fba55b1a4021..4ceb28581652 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -452,10 +452,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
452void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 452void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
453 pte_t *ptep, pte_t pteval) 453 pte_t *ptep, pte_t pteval)
454{ 454{
455 /* updates to init_mm may be done without lock */
456 if (mm == &init_mm)
457 preempt_disable();
458
459 ADD_STATS(set_pte_at, 1); 455 ADD_STATS(set_pte_at, 1);
460// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); 456// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
461 ADD_STATS(set_pte_at_current, mm == current->mm); 457 ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -476,9 +472,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
476 } 472 }
477 xen_set_pte(ptep, pteval); 473 xen_set_pte(ptep, pteval);
478 474
479out: 475out: return;
480 if (mm == &init_mm)
481 preempt_enable();
482} 476}
483 477
484pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 478pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
@@ -1152,10 +1146,8 @@ static void drop_other_mm_ref(void *info)
1152 1146
1153 /* If this cpu still has a stale cr3 reference, then make sure 1147 /* If this cpu still has a stale cr3 reference, then make sure
1154 it has been flushed. */ 1148 it has been flushed. */
1155 if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) { 1149 if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1156 load_cr3(swapper_pg_dir); 1150 load_cr3(swapper_pg_dir);
1157 arch_flush_lazy_cpu_mode();
1158 }
1159} 1151}
1160 1152
1161static void xen_drop_mm_ref(struct mm_struct *mm) 1153static void xen_drop_mm_ref(struct mm_struct *mm)
@@ -1168,7 +1160,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1168 load_cr3(swapper_pg_dir); 1160 load_cr3(swapper_pg_dir);
1169 else 1161 else
1170 leave_mm(smp_processor_id()); 1162 leave_mm(smp_processor_id());
1171 arch_flush_lazy_cpu_mode();
1172 } 1163 }
1173 1164
1174 /* Get the "official" set of cpus referring to our pagetable. */ 1165 /* Get the "official" set of cpus referring to our pagetable. */
@@ -1876,6 +1867,14 @@ __init void xen_post_allocator_init(void)
1876 xen_mark_init_mm_pinned(); 1867 xen_mark_init_mm_pinned();
1877} 1868}
1878 1869
1870static void xen_leave_lazy_mmu(void)
1871{
1872 preempt_disable();
1873 xen_mc_flush();
1874 paravirt_leave_lazy_mmu();
1875 preempt_enable();
1876}
1877
1879const struct pv_mmu_ops xen_mmu_ops __initdata = { 1878const struct pv_mmu_ops xen_mmu_ops __initdata = {
1880 .pagetable_setup_start = xen_pagetable_setup_start, 1879 .pagetable_setup_start = xen_pagetable_setup_start,
1881 .pagetable_setup_done = xen_pagetable_setup_done, 1880 .pagetable_setup_done = xen_pagetable_setup_done,
@@ -1949,7 +1948,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
1949 1948
1950 .lazy_mode = { 1949 .lazy_mode = {
1951 .enter = paravirt_enter_lazy_mmu, 1950 .enter = paravirt_enter_lazy_mmu,
1952 .leave = xen_leave_lazy, 1951 .leave = xen_leave_lazy_mmu,
1953 }, 1952 },
1954 1953
1955 .set_fixmap = xen_set_fixmap, 1954 .set_fixmap = xen_set_fixmap,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 15c6c68db6a2..ad0047f47cd4 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -61,9 +61,9 @@ char * __init xen_memory_setup(void)
61 * - xen_start_info 61 * - xen_start_info
62 * See comment above "struct start_info" in <xen/interface/xen.h> 62 * See comment above "struct start_info" in <xen/interface/xen.h>
63 */ 63 */
64 e820_add_region(__pa(xen_start_info->mfn_list), 64 reserve_early(__pa(xen_start_info->mfn_list),
65 xen_start_info->pt_base - xen_start_info->mfn_list, 65 __pa(xen_start_info->pt_base),
66 E820_RESERVED); 66 "XEN START INFO");
67 67
68 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 68 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
69 69
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index ca6596b05d53..22494fd4c9b5 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void); 30void xen_ident_map_ISA(void);
31void xen_reserve_top(void); 31void xen_reserve_top(void);
32 32
33void xen_leave_lazy(void);
34void xen_post_allocator_init(void); 33void xen_post_allocator_init(void);
35 34
36char * __init xen_memory_setup(void); 35char * __init xen_memory_setup(void);