author:    Ingo Molnar <mingo@elte.hu>  2009-06-17 06:52:15 -0400
committer: Ingo Molnar <mingo@elte.hu>  2009-06-17 06:56:49 -0400
commit:    eadb8a091b27a840de7450f84ecff5ef13476424
tree:      58c3782d40def63baa8167f3d31e3048cb4c7660 /arch/x86
parent:    73874005cd8800440be4299bd095387fff4b90ac
parent:    65795efbd380a832ae508b04dba8f8e53f0b84d9

Merge branch 'linus' into tracing/hw-breakpoints

Conflicts:
	arch/x86/Kconfig
	arch/x86/kernel/traps.c
	arch/x86/power/cpu.c
	arch/x86/power/cpu_32.c
	kernel/Makefile

Semantic conflict:
	arch/x86/kernel/hw_breakpoint.c

Merge reason: Resolve the conflicts, move from put_cpu_no_sched() to
put_cpu() in arch/x86/kernel/hw_breakpoint.c.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
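[Editor's note] The semantic conflict above comes down to the hw-breakpoints branch calling a helper that no longer exists after the merge. A minimal sketch of the change, assuming the usual get_cpu()/put_cpu() pairing from <linux/smp.h>; the function names and bodies are hypothetical paraphrases, not quotes from hw_breakpoint.c:

#include <linux/smp.h>	/* get_cpu() / put_cpu() */

/* Pattern on the tracing/hw-breakpoints branch before the merge: */
static void update_debug_regs_old(void)
{
	int cpu = get_cpu();		/* disables preemption */

	(void)cpu;			/* ... program debug registers ... */
	put_cpu_no_sched();		/* helper dropped upstream */
}

/* Pattern after the resolution carried by this merge: */
static void update_debug_regs_new(void)
{
	int cpu = get_cpu();

	(void)cpu;			/* ... program debug registers ... */
	put_cpu();			/* plain preempt_enable() path */
}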
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild16
-rw-r--r--arch/x86/Kconfig113
-rw-r--r--arch/x86/Kconfig.cpu1
-rw-r--r--arch/x86/Kconfig.debug11
-rw-r--r--arch/x86/Makefile24
-rw-r--r--arch/x86/boot/.gitignore2
-rw-r--r--arch/x86/boot/Makefile29
-rw-r--r--arch/x86/boot/a20.c9
-rw-r--r--arch/x86/boot/apm.c76
-rw-r--r--arch/x86/boot/bioscall.S82
-rw-r--r--arch/x86/boot/boot.h48
-rw-r--r--arch/x86/boot/compressed/.gitignore3
-rw-r--r--arch/x86/boot/compressed/Makefile54
-rw-r--r--arch/x86/boot/compressed/head_32.S194
-rw-r--r--arch/x86/boot/compressed/head_64.S169
-rw-r--r--arch/x86/boot/compressed/misc.c12
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c97
-rw-r--r--arch/x86/boot/compressed/relocs.c7
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S (renamed from arch/x86/boot/compressed/vmlinux_64.lds)29
-rw-r--r--arch/x86/boot/compressed/vmlinux.scr10
-rw-r--r--arch/x86/boot/compressed/vmlinux_32.lds43
-rw-r--r--arch/x86/boot/edd.c71
-rw-r--r--arch/x86/boot/header.S30
-rw-r--r--arch/x86/boot/main.c39
-rw-r--r--arch/x86/boot/mca.c27
-rw-r--r--arch/x86/boot/memory.c108
-rw-r--r--arch/x86/boot/regs.c29
-rw-r--r--arch/x86/boot/setup.ld6
-rw-r--r--arch/x86/boot/tty.c52
-rw-r--r--arch/x86/boot/video-bios.c27
-rw-r--r--arch/x86/boot/video-vesa.c137
-rw-r--r--arch/x86/boot/video-vga.c95
-rw-r--r--arch/x86/boot/video.c42
-rw-r--r--arch/x86/boot/video.h14
-rw-r--r--arch/x86/configs/i386_defconfig148
-rw-r--r--arch/x86/configs/x86_64_defconfig151
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c267
-rw-r--r--arch/x86/crypto/fpu.c166
-rw-r--r--arch/x86/ia32/ia32entry.S4
-rw-r--r--arch/x86/include/asm/alternative.h59
-rw-r--r--arch/x86/include/asm/amd_iommu.h2
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h55
-rw-r--r--arch/x86/include/asm/apic.h33
-rw-r--r--arch/x86/include/asm/apicdef.h8
-rw-r--r--arch/x86/include/asm/atomic_32.h238
-rw-r--r--arch/x86/include/asm/atomic_64.h2
-rw-r--r--arch/x86/include/asm/bitsperlong.h13
-rw-r--r--arch/x86/include/asm/boot.h15
-rw-r--r--arch/x86/include/asm/bootparam.h3
-rw-r--r--arch/x86/include/asm/cpu_debug.h101
-rw-r--r--arch/x86/include/asm/cpufeature.h9
-rw-r--r--arch/x86/include/asm/dma-mapping.h7
-rw-r--r--arch/x86/include/asm/entry_arch.h13
-rw-r--r--arch/x86/include/asm/hardirq.h4
-rw-r--r--arch/x86/include/asm/hw_irq.h29
-rw-r--r--arch/x86/include/asm/i387.h43
-rw-r--r--arch/x86/include/asm/i8259.h4
-rw-r--r--arch/x86/include/asm/intel_arch_perfmon.h31
-rw-r--r--arch/x86/include/asm/io_apic.h9
-rw-r--r--arch/x86/include/asm/iomap.h5
-rw-r--r--arch/x86/include/asm/irq_remapping.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h26
-rw-r--r--arch/x86/include/asm/k8.h13
-rw-r--r--arch/x86/include/asm/kmap_types.h23
-rw-r--r--arch/x86/include/asm/kmemcheck.h42
-rw-r--r--arch/x86/include/asm/kvm.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h45
-rw-r--r--arch/x86/include/asm/kvm_x86_emulate.h6
-rw-r--r--arch/x86/include/asm/lguest.h7
-rw-r--r--arch/x86/include/asm/lguest_hcall.h15
-rw-r--r--arch/x86/include/asm/mce.h88
-rw-r--r--arch/x86/include/asm/microcode.h25
-rw-r--r--arch/x86/include/asm/mman.h2
-rw-r--r--arch/x86/include/asm/mpspec.h15
-rw-r--r--arch/x86/include/asm/msr-index.h8
-rw-r--r--arch/x86/include/asm/msr.h23
-rw-r--r--arch/x86/include/asm/nmi.h2
-rw-r--r--arch/x86/include/asm/numa_64.h10
-rw-r--r--arch/x86/include/asm/page.h2
-rw-r--r--arch/x86/include/asm/page_32_types.h4
-rw-r--r--arch/x86/include/asm/page_64_types.h22
-rw-r--r--arch/x86/include/asm/page_types.h6
-rw-r--r--arch/x86/include/asm/paravirt.h24
-rw-r--r--arch/x86/include/asm/percpu.h10
-rw-r--r--arch/x86/include/asm/perf_counter.h100
-rw-r--r--arch/x86/include/asm/pgtable.h9
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h4
-rw-r--r--arch/x86/include/asm/pgtable_64.h6
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h8
-rw-r--r--arch/x86/include/asm/pgtable_types.h10
-rw-r--r--arch/x86/include/asm/processor.h12
-rw-r--r--arch/x86/include/asm/ptrace.h7
-rw-r--r--arch/x86/include/asm/required-features.h8
-rw-r--r--arch/x86/include/asm/setup.h1
-rw-r--r--arch/x86/include/asm/signal.h2
-rw-r--r--arch/x86/include/asm/smp.h2
-rw-r--r--arch/x86/include/asm/sparsemem.h2
-rw-r--r--arch/x86/include/asm/spinlock.h4
-rw-r--r--arch/x86/include/asm/string_32.h8
-rw-r--r--arch/x86/include/asm/string_64.h8
-rw-r--r--arch/x86/include/asm/svm.h1
-rw-r--r--arch/x86/include/asm/syscalls.h45
-rw-r--r--arch/x86/include/asm/termios.h1
-rw-r--r--arch/x86/include/asm/thread_info.h8
-rw-r--r--arch/x86/include/asm/timex.h4
-rw-r--r--arch/x86/include/asm/tlbflush.h2
-rw-r--r--arch/x86/include/asm/topology.h3
-rw-r--r--arch/x86/include/asm/traps.h5
-rw-r--r--arch/x86/include/asm/types.h6
-rw-r--r--arch/x86/include/asm/unistd_32.h2
-rw-r--r--arch/x86/include/asm/unistd_64.h5
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h2
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h6
-rw-r--r--arch/x86/include/asm/vmx.h1
-rw-r--r--arch/x86/include/asm/xor.h5
-rw-r--r--arch/x86/kernel/Makefile7
-rw-r--r--arch/x86/kernel/acpi/boot.c156
-rw-r--r--arch/x86/kernel/acpi/realmode/Makefile2
-rw-r--r--arch/x86/kernel/acpi/realmode/bioscall.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/regs.c1
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/amd_iommu.c500
-rw-r--r--arch/x86/kernel/amd_iommu_init.c273
-rw-r--r--arch/x86/kernel/apic/apic.c318
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/es7000_32.c10
-rw-r--r--arch/x86/kernel/apic/io_apic.c908
-rw-r--r--arch/x86/kernel/apic/nmi.c4
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/probe_64.c2
-rw-r--r--arch/x86/kernel/apic/summit_32.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c22
-rw-r--r--arch/x86/kernel/apm_32.c14
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/amd.c12
-rw-r--r--arch/x86/kernel/cpu/common.c43
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c431
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c12
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c57
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c29
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c153
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile10
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c42
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c127
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c218
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c1964
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.h26
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c76
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c1187
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c203
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c74
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c66
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c57
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c86
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c48
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c26
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c73
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c17
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c30
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h15
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c6
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c1711
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/cpuid.c6
-rw-r--r--arch/x86/kernel/dumpstack.h1
-rw-r--r--arch/x86/kernel/e820.c46
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/entry_64.S21
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/head_32.S7
-rw-r--r--arch/x86/kernel/hw_breakpoint.c4
-rw-r--r--arch/x86/kernel/i8253.c1
-rw-r--r--arch/x86/kernel/init_task.c1
-rw-r--r--arch/x86/kernel/irq.c47
-rw-r--r--arch/x86/kernel/irqinit.c (renamed from arch/x86/kernel/irqinit_32.c)155
-rw-r--r--arch/x86/kernel/irqinit_64.c177
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/microcode_amd.c70
-rw-r--r--arch/x86/kernel/microcode_core.c330
-rw-r--r--arch/x86/kernel/microcode_intel.c90
-rw-r--r--arch/x86/kernel/module.c (renamed from arch/x86/kernel/module_64.c)82
-rw-r--r--arch/x86/kernel/module_32.c152
-rw-r--r--arch/x86/kernel/mpparse.c34
-rw-r--r--arch/x86/kernel/msr.c6
-rw-r--r--arch/x86/kernel/paravirt.c58
-rw-r--r--arch/x86/kernel/pci-calgary_64.c54
-rw-r--r--arch/x86/kernel/pci-gart_64.c55
-rw-r--r--arch/x86/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86/kernel/process.c17
-rw-r--r--arch/x86/kernel/process_32.c17
-rw-r--r--arch/x86/kernel/process_64.c17
-rw-r--r--arch/x86/kernel/quirks.c37
-rw-r--r--arch/x86/kernel/reboot.c17
-rw-r--r--arch/x86/kernel/setup.c55
-rw-r--r--arch/x86/kernel/setup_percpu.c12
-rw-r--r--arch/x86/kernel/signal.c7
-rw-r--r--arch/x86/kernel/smp.c51
-rw-r--r--arch/x86/kernel/smpboot.c24
-rw-r--r--arch/x86/kernel/stacktrace.c7
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/tlb_uv.c17
-rw-r--r--arch/x86/kernel/traps.c31
-rw-r--r--arch/x86/kernel/tsc.c20
-rw-r--r--arch/x86/kernel/tsc_sync.c14
-rw-r--r--arch/x86/kernel/vm86_32.c13
-rw-r--r--arch/x86/kernel/vmi_32.c20
-rw-r--r--arch/x86/kernel/vmlinux.lds.S432
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S229
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S298
-rw-r--r--arch/x86/kernel/vsyscall_64.c8
-rw-r--r--arch/x86/kvm/Kconfig6
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/i8254.c109
-rw-r--r--arch/x86/kvm/i8254.h12
-rw-r--r--arch/x86/kvm/irq.c7
-rw-r--r--arch/x86/kvm/kvm_timer.h18
-rw-r--r--arch/x86/kvm/lapic.c251
-rw-r--r--arch/x86/kvm/lapic.h12
-rw-r--r--arch/x86/kvm/mmu.c197
-rw-r--r--arch/x86/kvm/mmu.h5
-rw-r--r--arch/x86/kvm/paging_tmpl.h16
-rw-r--r--arch/x86/kvm/svm.c415
-rw-r--r--arch/x86/kvm/timer.c46
-rw-r--r--arch/x86/kvm/vmx.c723
-rw-r--r--arch/x86/kvm/x86.c415
-rw-r--r--arch/x86/kvm/x86.h14
-rw-r--r--arch/x86/kvm/x86_emulate.c141
-rw-r--r--arch/x86/lguest/Kconfig1
-rw-r--r--arch/x86/lguest/Makefile1
-rw-r--r--arch/x86/lguest/boot.c193
-rw-r--r--arch/x86/lguest/i386_head.S60
-rw-r--r--arch/x86/lib/Makefile2
-rw-r--r--arch/x86/lib/msr-on-cpu.c97
-rw-r--r--arch/x86/lib/msr.c183
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/dump_pagetables.c7
-rw-r--r--arch/x86/mm/fault.c87
-rw-r--r--arch/x86/mm/highmem_32.c2
-rw-r--r--arch/x86/mm/hugetlbpage.c6
-rw-r--r--arch/x86/mm/init.c80
-rw-r--r--arch/x86/mm/init_32.c73
-rw-r--r--arch/x86/mm/init_64.c51
-rw-r--r--arch/x86/mm/iomap_32.c1
-rw-r--r--arch/x86/mm/kmemcheck/Makefile1
-rw-r--r--arch/x86/mm/kmemcheck/error.c228
-rw-r--r--arch/x86/mm/kmemcheck/error.h15
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c640
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c106
-rw-r--r--arch/x86/mm/kmemcheck/opcode.h9
-rw-r--r--arch/x86/mm/kmemcheck/pte.c22
-rw-r--r--arch/x86/mm/kmemcheck/pte.h10
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c69
-rw-r--r--arch/x86/mm/kmemcheck/selftest.h6
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c162
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h16
-rw-r--r--arch/x86/mm/memtest.c17
-rw-r--r--arch/x86/mm/numa_64.c33
-rw-r--r--arch/x86/mm/pageattr.c29
-rw-r--r--arch/x86/mm/pgtable.c12
-rw-r--r--arch/x86/mm/srat_64.c98
-rw-r--r--arch/x86/oprofile/backtrace.c2
-rw-r--r--arch/x86/oprofile/nmi_int.c34
-rw-r--r--arch/x86/oprofile/op_model_ppro.c10
-rw-r--r--arch/x86/pci/irq.c84
-rw-r--r--arch/x86/pci/mmconfig-shared.c6
-rw-r--r--arch/x86/power/Makefile2
-rw-r--r--arch/x86/power/cpu.c (renamed from arch/x86/power/cpu_64.c)137
-rw-r--r--arch/x86/power/cpu_32.c141
-rw-r--r--arch/x86/vdso/vdso32-setup.c6
-rw-r--r--arch/x86/vdso/vma.c8
-rw-r--r--arch/x86/xen/Makefile5
-rw-r--r--arch/x86/xen/enlighten.c65
-rw-r--r--arch/x86/xen/mmu.c24
-rw-r--r--arch/x86/xen/setup.c6
-rw-r--r--arch/x86/xen/xen-ops.h20
288 files changed, 13361 insertions, 7579 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
new file mode 100644
index 00000000000..ad8ec356fb3
--- /dev/null
+++ b/arch/x86/Kbuild
@@ -0,0 +1,16 @@
+
+obj-$(CONFIG_KVM) += kvm/
+
+# Xen paravirtualization support
+obj-$(CONFIG_XEN) += xen/
+
+# lguest paravirtualization support
+obj-$(CONFIG_LGUEST_GUEST) += lguest/
+
+obj-y += kernel/
+obj-y += mm/
+
+obj-y += crypto/
+obj-y += vdso/
+obj-$(CONFIG_IA32_EMULATION) += ia32/
+
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3033375ed6b..52421d52f21 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -47,6 +47,12 @@ config X86
47 select HAVE_KERNEL_BZIP2 47 select HAVE_KERNEL_BZIP2
48 select HAVE_KERNEL_LZMA 48 select HAVE_KERNEL_LZMA
49 select HAVE_HW_BREAKPOINT 49 select HAVE_HW_BREAKPOINT
50 select HAVE_ARCH_KMEMCHECK
51
52config OUTPUT_FORMAT
53 string
54 default "elf32-i386" if X86_32
55 default "elf64-x86-64" if X86_64
50 56
51config ARCH_DEFCONFIG 57config ARCH_DEFCONFIG
52 string 58 string
@@ -275,15 +281,9 @@ config SPARSE_IRQ
275 281
276 If you don't know what to do here, say N. 282 If you don't know what to do here, say N.
277 283
278config NUMA_MIGRATE_IRQ_DESC 284config NUMA_IRQ_DESC
279 bool "Move irq desc when changing irq smp_affinity" 285 def_bool y
280 depends on SPARSE_IRQ && NUMA 286 depends on SPARSE_IRQ && NUMA
281 depends on BROKEN
282 default n
283 ---help---
284 This enables moving irq_desc to cpu/node that irq will use handled.
285
286 If you don't know what to do here, say N.
287 287
288config X86_MPPARSE 288config X86_MPPARSE
289 bool "Enable MPS table" if ACPI 289 bool "Enable MPS table" if ACPI
@@ -356,7 +356,7 @@ config X86_UV
356 depends on X86_64 356 depends on X86_64
357 depends on X86_EXTENDED_PLATFORM 357 depends on X86_EXTENDED_PLATFORM
358 depends on NUMA 358 depends on NUMA
359 select X86_X2APIC 359 depends on X86_X2APIC
360 ---help--- 360 ---help---
361 This option is needed in order to support SGI Ultraviolet systems. 361 This option is needed in order to support SGI Ultraviolet systems.
362 If you don't have one of these, you should say N here. 362 If you don't have one of these, you should say N here.
@@ -499,6 +499,19 @@ config PARAVIRT
499 over full virtualization. However, when run without a hypervisor 499 over full virtualization. However, when run without a hypervisor
500 the kernel is theoretically slower and slightly larger. 500 the kernel is theoretically slower and slightly larger.
501 501
502config PARAVIRT_SPINLOCKS
503 bool "Paravirtualization layer for spinlocks"
504 depends on PARAVIRT && SMP && EXPERIMENTAL
505 ---help---
506 Paravirtualized spinlocks allow a pvops backend to replace the
507 spinlock implementation with something virtualization-friendly
508 (for example, block the virtual CPU rather than spinning).
509
510 Unfortunately the downside is an up to 5% performance hit on
511 native kernels, with various workloads.
512
513 If you are unsure how to answer this question, answer N.
514
502config PARAVIRT_CLOCK 515config PARAVIRT_CLOCK
503 bool 516 bool
504 default n 517 default n
@@ -728,6 +741,7 @@ config X86_UP_IOAPIC
728config X86_LOCAL_APIC 741config X86_LOCAL_APIC
729 def_bool y 742 def_bool y
730 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC 743 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
744 select HAVE_PERF_COUNTERS if (!M386 && !M486)
731 745
732config X86_IO_APIC 746config X86_IO_APIC
733 def_bool y 747 def_bool y
@@ -777,10 +791,26 @@ config X86_MCE
777 to disable it. MCE support simply ignores non-MCE processors like 791 to disable it. MCE support simply ignores non-MCE processors like
778 the 386 and 486, so nearly everyone can say Y here. 792 the 386 and 486, so nearly everyone can say Y here.
779 793
794config X86_OLD_MCE
795 depends on X86_32 && X86_MCE
796 bool "Use legacy machine check code (will go away)"
797 default n
798 select X86_ANCIENT_MCE
799 ---help---
800 Use the old i386 machine check code. This is merely intended for
801 testing in a transition period. Try this if you run into any machine
802 check related software problems, but report the problem to
803 linux-kernel. When in doubt say no.
804
805config X86_NEW_MCE
806 depends on X86_MCE
807 bool
808 default y if (!X86_OLD_MCE && X86_32) || X86_64
809
780config X86_MCE_INTEL 810config X86_MCE_INTEL
781 def_bool y 811 def_bool y
782 prompt "Intel MCE features" 812 prompt "Intel MCE features"
783 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 813 depends on X86_NEW_MCE && X86_LOCAL_APIC
784 ---help--- 814 ---help---
785 Additional support for intel specific MCE features such as 815 Additional support for intel specific MCE features such as
786 the thermal monitor. 816 the thermal monitor.
@@ -788,19 +818,36 @@ config X86_MCE_INTEL
788config X86_MCE_AMD 818config X86_MCE_AMD
789 def_bool y 819 def_bool y
790 prompt "AMD MCE features" 820 prompt "AMD MCE features"
791 depends on X86_64 && X86_MCE && X86_LOCAL_APIC 821 depends on X86_NEW_MCE && X86_LOCAL_APIC
792 ---help--- 822 ---help---
793 Additional support for AMD specific MCE features such as 823 Additional support for AMD specific MCE features such as
794 the DRAM Error Threshold. 824 the DRAM Error Threshold.
795 825
826config X86_ANCIENT_MCE
827 def_bool n
828 depends on X86_32
829 prompt "Support for old Pentium 5 / WinChip machine checks"
830 ---help---
831 Include support for machine check handling on old Pentium 5 or WinChip
832 systems. These typically need to be enabled explicitely on the command
833 line.
834
796config X86_MCE_THRESHOLD 835config X86_MCE_THRESHOLD
797 depends on X86_MCE_AMD || X86_MCE_INTEL 836 depends on X86_MCE_AMD || X86_MCE_INTEL
798 bool 837 bool
799 default y 838 default y
800 839
840config X86_MCE_INJECT
841 depends on X86_NEW_MCE
842 tristate "Machine check injector support"
843 ---help---
844 Provide support for injecting machine checks for testing purposes.
845 If you don't know what a machine check is and you don't do kernel
846 QA it is safe to say n.
847
801config X86_MCE_NONFATAL 848config X86_MCE_NONFATAL
802 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" 849 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
803 depends on X86_32 && X86_MCE 850 depends on X86_OLD_MCE
804 ---help--- 851 ---help---
805 Enabling this feature starts a timer that triggers every 5 seconds which 852 Enabling this feature starts a timer that triggers every 5 seconds which
806 will look at the machine check registers to see if anything happened. 853 will look at the machine check registers to see if anything happened.
@@ -813,11 +860,15 @@ config X86_MCE_NONFATAL
813 860
814config X86_MCE_P4THERMAL 861config X86_MCE_P4THERMAL
815 bool "check for P4 thermal throttling interrupt." 862 bool "check for P4 thermal throttling interrupt."
816 depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) 863 depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP)
817 ---help--- 864 ---help---
818 Enabling this feature will cause a message to be printed when the P4 865 Enabling this feature will cause a message to be printed when the P4
819 enters thermal throttling. 866 enters thermal throttling.
820 867
868config X86_THERMAL_VECTOR
869 def_bool y
870 depends on X86_MCE_P4THERMAL || X86_MCE_INTEL
871
821config VM86 872config VM86
822 bool "Enable VM86 support" if EMBEDDED 873 bool "Enable VM86 support" if EMBEDDED
823 default y 874 default y
@@ -1454,9 +1505,7 @@ config KEXEC_JUMP
1454 1505
1455config PHYSICAL_START 1506config PHYSICAL_START
1456 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 1507 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
1457 default "0x1000000" if X86_NUMAQ 1508 default "0x1000000"
1458 default "0x200000" if X86_64
1459 default "0x100000"
1460 ---help--- 1509 ---help---
1461 This gives the physical address where the kernel is loaded. 1510 This gives the physical address where the kernel is loaded.
1462 1511
@@ -1475,15 +1524,15 @@ config PHYSICAL_START
1475 to be specifically compiled to run from a specific memory area 1524 to be specifically compiled to run from a specific memory area
1476 (normally a reserved region) and this option comes handy. 1525 (normally a reserved region) and this option comes handy.
1477 1526
1478 So if you are using bzImage for capturing the crash dump, leave 1527 So if you are using bzImage for capturing the crash dump,
1479 the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y. 1528 leave the value here unchanged to 0x1000000 and set
1480 Otherwise if you plan to use vmlinux for capturing the crash dump 1529 CONFIG_RELOCATABLE=y. Otherwise if you plan to use vmlinux
1481 change this value to start of the reserved region (Typically 16MB 1530 for capturing the crash dump change this value to start of
1482 0x1000000). In other words, it can be set based on the "X" value as 1531 the reserved region. In other words, it can be set based on
1483 specified in the "crashkernel=YM@XM" command line boot parameter 1532 the "X" value as specified in the "crashkernel=YM@XM"
1484 passed to the panic-ed kernel. Typically this parameter is set as 1533 command line boot parameter passed to the panic-ed
1485 crashkernel=64M@16M. Please take a look at 1534 kernel. Please take a look at Documentation/kdump/kdump.txt
1486 Documentation/kdump/kdump.txt for more details about crash dumps. 1535 for more details about crash dumps.
1487 1536
1488 Usage of bzImage for capturing the crash dump is recommended as 1537 Usage of bzImage for capturing the crash dump is recommended as
1489 one does not have to build two kernels. Same kernel can be used 1538 one does not have to build two kernels. Same kernel can be used
@@ -1496,8 +1545,8 @@ config PHYSICAL_START
1496 Don't change this unless you know what you are doing. 1545 Don't change this unless you know what you are doing.
1497 1546
1498config RELOCATABLE 1547config RELOCATABLE
1499 bool "Build a relocatable kernel (EXPERIMENTAL)" 1548 bool "Build a relocatable kernel"
1500 depends on EXPERIMENTAL 1549 default y
1501 ---help--- 1550 ---help---
1502 This builds a kernel image that retains relocation information 1551 This builds a kernel image that retains relocation information
1503 so it can be loaded someplace besides the default 1MB. 1552 so it can be loaded someplace besides the default 1MB.
@@ -1512,12 +1561,16 @@ config RELOCATABLE
1512 it has been loaded at and the compile time physical address 1561 it has been loaded at and the compile time physical address
1513 (CONFIG_PHYSICAL_START) is ignored. 1562 (CONFIG_PHYSICAL_START) is ignored.
1514 1563
1564# Relocation on x86-32 needs some additional build support
1565config X86_NEED_RELOCS
1566 def_bool y
1567 depends on X86_32 && RELOCATABLE
1568
1515config PHYSICAL_ALIGN 1569config PHYSICAL_ALIGN
1516 hex 1570 hex
1517 prompt "Alignment value to which kernel should be aligned" if X86_32 1571 prompt "Alignment value to which kernel should be aligned" if X86_32
1518 default "0x100000" if X86_32 1572 default "0x1000000"
1519 default "0x200000" if X86_64 1573 range 0x2000 0x1000000
1520 range 0x2000 0x400000
1521 ---help--- 1574 ---help---
1522 This value puts the alignment restrictions on physical address 1575 This value puts the alignment restrictions on physical address
1523 where kernel is loaded and run from. Kernel is compiled for an 1576 where kernel is loaded and run from. Kernel is compiled for an
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 924e156a85a..8130334329c 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -506,6 +506,7 @@ config X86_PTRACE_BTS
 	bool "Branch Trace Store"
 	default y
 	depends on X86_DEBUGCTLMSR
+	depends on BROKEN
 	---help---
 	  This adds a ptrace interface to the hardware's branch trace store.
 
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 22b752e0948..d105f29bb6b 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -159,10 +159,17 @@ config IOMMU_DEBUG
 	  options. See Documentation/x86_64/boot-options.txt for more
 	  details.
 
+config IOMMU_STRESS
+	bool "Enable IOMMU stress-test mode"
+	---help---
+	  This option disables various optimizations in IOMMU related
+	  code to do real stress testing of the IOMMU code. This option
+	  will cause a performance drop and should only be enabled for
+	  testing.
+
 config IOMMU_LEAK
 	bool "IOMMU leak tracing"
-	depends on DEBUG_KERNEL
-	depends on IOMMU_DEBUG
+	depends on IOMMU_DEBUG && DMA_API_DEBUG
 	---help---
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 8c86b72afdc..1b68659c41b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,8 +7,6 @@ else
7 KBUILD_DEFCONFIG := $(ARCH)_defconfig 7 KBUILD_DEFCONFIG := $(ARCH)_defconfig
8endif 8endif
9 9
10core-$(CONFIG_KVM) += arch/x86/kvm/
11
12# BITS is used as extension for files which are available in a 32 bit 10# BITS is used as extension for files which are available in a 32 bit
13# and a 64 bit version to simplify shared Makefiles. 11# and a 64 bit version to simplify shared Makefiles.
14# e.g.: obj-y += foo_$(BITS).o 12# e.g.: obj-y += foo_$(BITS).o
@@ -83,6 +81,11 @@ ifdef CONFIG_CC_STACKPROTECTOR
83 endif 81 endif
84endif 82endif
85 83
84# Don't unroll struct assignments with kmemcheck enabled
85ifeq ($(CONFIG_KMEMCHECK),y)
86 KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
87endif
88
86# Stackpointer is addressed different for 32 bit and 64 bit x86 89# Stackpointer is addressed different for 32 bit and 64 bit x86
87sp-$(CONFIG_X86_32) := esp 90sp-$(CONFIG_X86_32) := esp
88sp-$(CONFIG_X86_64) := rsp 91sp-$(CONFIG_X86_64) := rsp
@@ -118,21 +121,8 @@ head-y += arch/x86/kernel/init_task.o
118 121
119libs-y += arch/x86/lib/ 122libs-y += arch/x86/lib/
120 123
121# Sub architecture files that needs linking first 124# See arch/x86/Kbuild for content of core part of the kernel
122core-y += $(fcore-y) 125core-y += arch/x86/
123
124# Xen paravirtualization support
125core-$(CONFIG_XEN) += arch/x86/xen/
126
127# lguest paravirtualization support
128core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
129
130core-y += arch/x86/kernel/
131core-y += arch/x86/mm/
132
133core-y += arch/x86/crypto/
134core-y += arch/x86/vdso/
135core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
136 126
137# drivers-y are linked after core-y 127# drivers-y are linked after core-y
138drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ 128drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index 172cf8a98bd..851fe936d24 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -3,6 +3,8 @@ bzImage
 cpustr.h
 mkcpustr
 offsets.h
+voffset.h
+zoffset.h
 setup
 setup.bin
 setup.elf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 6633b6e7505..8d16ada2504 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,9 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage
26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf 26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
27subdir- := compressed 27subdir- := compressed
28 28
29setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o 29setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o
30setup-y += header.o main.o mca.o memory.o pm.o pmjump.o 30setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
31setup-y += printf.o string.o tty.o video.o video-mode.o version.o 31setup-y += printf.o regs.o string.o tty.o video.o video-mode.o
32setup-y += version.o
32setup-$(CONFIG_X86_APM_BOOT) += apm.o 33setup-$(CONFIG_X86_APM_BOOT) += apm.o
33 34
34# The link order of the video-*.o modules can matter. In particular, 35# The link order of the video-*.o modules can matter. In particular,
@@ -86,19 +87,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
86 87
87SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) 88SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
88 89
89sed-offsets := -e 's/^00*/0/' \ 90sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
90 -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p'
91 91
92quiet_cmd_offsets = OFFSETS $@ 92quiet_cmd_voffset = VOFFSET $@
93 cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@ 93 cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
94 94
95$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE 95targets += voffset.h
96 $(call if_changed,offsets) 96$(obj)/voffset.h: vmlinux FORCE
97 $(call if_changed,voffset)
98
99sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
100
101quiet_cmd_zoffset = ZOFFSET $@
102 cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
103
104targets += zoffset.h
105$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
106 $(call if_changed,zoffset)
97 107
98targets += offsets.h
99 108
100AFLAGS_header.o += -I$(obj) 109AFLAGS_header.o += -I$(obj)
101$(obj)/header.o: $(obj)/offsets.h 110$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
102 111
103LDFLAGS_setup.elf := -T 112LDFLAGS_setup.elf := -T
104$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE 113$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 7c19ce8c244..64a31a6d751 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -2,7 +2,7 @@
  *
  * Copyright (C) 1991, 1992 Linus Torvalds
  * Copyright 2007-2008 rPath, Inc. - All Rights Reserved
- * Copyright 2009 Intel Corporation
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  * This file is part of the Linux kernel, and is made available under
  * the terms of the GNU General Public License version 2.
@@ -90,8 +90,11 @@ static int a20_test_long(void)
 
 static void enable_a20_bios(void)
 {
-	asm volatile("pushfl; int $0x15; popfl"
-		     : : "a" ((u16)0x2401));
+	struct biosregs ireg;
+
+	initregs(&ireg);
+	ireg.ax = 0x2401;
+	intcall(0x15, &ireg, NULL);
 }
 
 static void enable_a20_kbc(void)
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
index 7aa6033001f..ee274834ea8 100644
--- a/arch/x86/boot/apm.c
+++ b/arch/x86/boot/apm.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * Original APM BIOS checking by Stephen Rothwell, May 1994 7 * Original APM BIOS checking by Stephen Rothwell, May 1994
7 * (sfr@canb.auug.org.au) 8 * (sfr@canb.auug.org.au)
@@ -19,75 +20,56 @@
19 20
20int query_apm_bios(void) 21int query_apm_bios(void)
21{ 22{
22 u16 ax, bx, cx, dx, di; 23 struct biosregs ireg, oreg;
23 u32 ebx, esi;
24 u8 err;
25 24
26 /* APM BIOS installation check */ 25 /* APM BIOS installation check */
27 ax = 0x5300; 26 initregs(&ireg);
28 bx = cx = 0; 27 ireg.ah = 0x53;
29 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" 28 intcall(0x15, &ireg, &oreg);
30 : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
31 : : "esi", "edi");
32 29
33 if (err) 30 if (oreg.flags & X86_EFLAGS_CF)
34 return -1; /* No APM BIOS */ 31 return -1; /* No APM BIOS */
35 32
36 if (bx != 0x504d) /* "PM" signature */ 33 if (oreg.bx != 0x504d) /* "PM" signature */
37 return -1; 34 return -1;
38 35
39 if (!(cx & 0x02)) /* 32 bits supported? */ 36 if (!(oreg.cx & 0x02)) /* 32 bits supported? */
40 return -1; 37 return -1;
41 38
42 /* Disconnect first, just in case */ 39 /* Disconnect first, just in case */
43 ax = 0x5304; 40 ireg.al = 0x04;
44 bx = 0; 41 intcall(0x15, &ireg, NULL);
45 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
46 : "+a" (ax), "+b" (bx)
47 : : "ecx", "edx", "esi", "edi");
48
49 /* Paranoia */
50 ebx = esi = 0;
51 cx = dx = di = 0;
52 42
53 /* 32-bit connect */ 43 /* 32-bit connect */
54 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6" 44 ireg.al = 0x03;
55 : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx), 45 intcall(0x15, &ireg, &oreg);
56 "+S" (esi), "+D" (di), "=m" (err) 46
57 : "a" (0x5303)); 47 boot_params.apm_bios_info.cseg = oreg.ax;
58 48 boot_params.apm_bios_info.offset = oreg.ebx;
59 boot_params.apm_bios_info.cseg = ax; 49 boot_params.apm_bios_info.cseg_16 = oreg.cx;
60 boot_params.apm_bios_info.offset = ebx; 50 boot_params.apm_bios_info.dseg = oreg.dx;
61 boot_params.apm_bios_info.cseg_16 = cx; 51 boot_params.apm_bios_info.cseg_len = oreg.si;
62 boot_params.apm_bios_info.dseg = dx; 52 boot_params.apm_bios_info.cseg_16_len = oreg.hsi;
63 boot_params.apm_bios_info.cseg_len = (u16)esi; 53 boot_params.apm_bios_info.dseg_len = oreg.di;
64 boot_params.apm_bios_info.cseg_16_len = esi >> 16; 54
65 boot_params.apm_bios_info.dseg_len = di; 55 if (oreg.flags & X86_EFLAGS_CF)
66
67 if (err)
68 return -1; 56 return -1;
69 57
70 /* Redo the installation check as the 32-bit connect; 58 /* Redo the installation check as the 32-bit connect;
71 some BIOSes return different flags this way... */ 59 some BIOSes return different flags this way... */
72 60
73 ax = 0x5300; 61 ireg.al = 0x00;
74 bx = cx = 0; 62 intcall(0x15, &ireg, &oreg);
75 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
76 : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
77 : : "esi", "edi");
78 63
79 if (err || bx != 0x504d) { 64 if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) {
80 /* Failure with 32-bit connect, try to disconect and ignore */ 65 /* Failure with 32-bit connect, try to disconect and ignore */
81 ax = 0x5304; 66 ireg.al = 0x04;
82 bx = 0; 67 intcall(0x15, &ireg, NULL);
83 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
84 : "+a" (ax), "+b" (bx)
85 : : "ecx", "edx", "esi", "edi");
86 return -1; 68 return -1;
87 } 69 }
88 70
89 boot_params.apm_bios_info.version = ax; 71 boot_params.apm_bios_info.version = oreg.ax;
90 boot_params.apm_bios_info.flags = cx; 72 boot_params.apm_bios_info.flags = oreg.cx;
91 return 0; 73 return 0;
92} 74}
93 75
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
new file mode 100644
index 00000000000..507793739ea
--- /dev/null
+++ b/arch/x86/boot/bioscall.S
@@ -0,0 +1,82 @@
1/* -----------------------------------------------------------------------
2 *
3 * Copyright 2009 Intel Corporation; author H. Peter Anvin
4 *
5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2 or (at your
7 * option) any later version; incorporated herein by reference.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * "Glove box" for BIOS calls. Avoids the constant problems with BIOSes
13 * touching registers they shouldn't be.
14 */
15
16 .code16
17 .text
18 .globl intcall
19 .type intcall, @function
20intcall:
21 /* Self-modify the INT instruction. Ugly, but works. */
22 cmpb %al, 3f
23 je 1f
24 movb %al, 3f
25 jmp 1f /* Synchronize pipeline */
261:
27 /* Save state */
28 pushfl
29 pushw %fs
30 pushw %gs
31 pushal
32
33 /* Copy input state to stack frame */
34 subw $44, %sp
35 movw %dx, %si
36 movw %sp, %di
37 movw $11, %cx
38 rep; movsd
39
40 /* Pop full state from the stack */
41 popal
42 popw %gs
43 popw %fs
44 popw %es
45 popw %ds
46 popfl
47
48 /* Actual INT */
49 .byte 0xcd /* INT opcode */
503: .byte 0
51
52 /* Push full state to the stack */
53 pushfl
54 pushw %ds
55 pushw %es
56 pushw %fs
57 pushw %gs
58 pushal
59
60 /* Re-establish C environment invariants */
61 cld
62 movzwl %sp, %esp
63 movw %cs, %ax
64 movw %ax, %ds
65 movw %ax, %es
66
67 /* Copy output state from stack frame */
68 movw 68(%esp), %di /* Original %cx == 3rd argument */
69 andw %di, %di
70 jz 4f
71 movw %sp, %si
72 movw $11, %cx
73 rep; movsd
744: addw $44, %sp
75
76 /* Restore state and return */
77 popal
78 popw %gs
79 popw %fs
80 popfl
81 retl
82 .size intcall, .-intcall
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 7b2692e897e..98239d2658f 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -26,6 +27,7 @@
26#include <asm/setup.h> 27#include <asm/setup.h>
27#include "bitops.h" 28#include "bitops.h"
28#include <asm/cpufeature.h> 29#include <asm/cpufeature.h>
30#include <asm/processor-flags.h>
29 31
30/* Useful macros */ 32/* Useful macros */
31#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) 33#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -241,6 +243,49 @@ int enable_a20(void);
241/* apm.c */ 243/* apm.c */
242int query_apm_bios(void); 244int query_apm_bios(void);
243 245
246/* bioscall.c */
247struct biosregs {
248 union {
249 struct {
250 u32 edi;
251 u32 esi;
252 u32 ebp;
253 u32 _esp;
254 u32 ebx;
255 u32 edx;
256 u32 ecx;
257 u32 eax;
258 u32 _fsgs;
259 u32 _dses;
260 u32 eflags;
261 };
262 struct {
263 u16 di, hdi;
264 u16 si, hsi;
265 u16 bp, hbp;
266 u16 _sp, _hsp;
267 u16 bx, hbx;
268 u16 dx, hdx;
269 u16 cx, hcx;
270 u16 ax, hax;
271 u16 gs, fs;
272 u16 es, ds;
273 u16 flags, hflags;
274 };
275 struct {
276 u8 dil, dih, edi2, edi3;
277 u8 sil, sih, esi2, esi3;
278 u8 bpl, bph, ebp2, ebp3;
279 u8 _spl, _sph, _esp2, _esp3;
280 u8 bl, bh, ebx2, ebx3;
281 u8 dl, dh, edx2, edx3;
282 u8 cl, ch, ecx2, ecx3;
283 u8 al, ah, eax2, eax3;
284 };
285 };
286};
287void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
288
244/* cmdline.c */ 289/* cmdline.c */
245int cmdline_find_option(const char *option, char *buffer, int bufsize); 290int cmdline_find_option(const char *option, char *buffer, int bufsize);
246int cmdline_find_option_bool(const char *option); 291int cmdline_find_option_bool(const char *option);
@@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...);
279int vsprintf(char *buf, const char *fmt, va_list args); 324int vsprintf(char *buf, const char *fmt, va_list args);
280int printf(const char *fmt, ...); 325int printf(const char *fmt, ...);
281 326
327/* regs.c */
328void initregs(struct biosregs *regs);
329
282/* string.c */ 330/* string.c */
283int strcmp(const char *str1, const char *str2); 331int strcmp(const char *str1, const char *str2);
284size_t strnlen(const char *s, size_t maxlen); 332size_t strnlen(const char *s, size_t maxlen);
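[Editor's note] The biosregs/intcall pair declared above replaces the open-coded int-instruction asm scattered through the setup code; the a20.c and apm.c hunks in this same patch show the conversion. A minimal sketch of the calling convention, modelled on those hunks; the helper itself is hypothetical and not part of the patch:

#include "boot.h"	/* struct biosregs, initregs(), intcall() */

/*
 * Hypothetical helper: read the base-memory size in KiB via INT 0x12,
 * using the same pattern the patch applies in enable_a20_bios() and
 * query_apm_bios().
 */
static unsigned int example_low_memory_kib(void)
{
	struct biosregs ireg, oreg;

	initregs(&ireg);		/* initialise the register image */
	intcall(0x12, &ireg, &oreg);	/* "glove box" wrapper around the INT */

	return oreg.ax;			/* INT 0x12 reports base memory (KiB) in AX */
}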
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
index 63eff3b04d0..4a46fab7162 100644
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1,3 +1,6 @@
 relocs
 vmlinux.bin.all
 vmlinux.relocs
+vmlinux.lds
+mkpiggy
+piggy.S
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 65551c9f857..49c8a4c37d7 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,7 +19,9 @@ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
19LDFLAGS := -m elf_$(UTS_MACHINE) 19LDFLAGS := -m elf_$(UTS_MACHINE)
20LDFLAGS_vmlinux := -T 20LDFLAGS_vmlinux := -T
21 21
22$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE 22hostprogs-y := mkpiggy
23
24$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
23 $(call if_changed,ld) 25 $(call if_changed,ld)
24 @: 26 @:
25 27
@@ -29,7 +31,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
29 31
30 32
31targets += vmlinux.bin.all vmlinux.relocs relocs 33targets += vmlinux.bin.all vmlinux.relocs relocs
32hostprogs-$(CONFIG_X86_32) += relocs 34hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs
33 35
34quiet_cmd_relocs = RELOCS $@ 36quiet_cmd_relocs = RELOCS $@
35 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< 37 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -37,46 +39,22 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
37 $(call if_changed,relocs) 39 $(call if_changed,relocs)
38 40
39vmlinux.bin.all-y := $(obj)/vmlinux.bin 41vmlinux.bin.all-y := $(obj)/vmlinux.bin
40vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs 42vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
41quiet_cmd_relocbin = BUILD $@
42 cmd_relocbin = cat $(filter-out FORCE,$^) > $@
43$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
44 $(call if_changed,relocbin)
45
46ifeq ($(CONFIG_X86_32),y)
47 43
48ifdef CONFIG_RELOCATABLE 44$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
49$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
50 $(call if_changed,gzip)
51$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE
52 $(call if_changed,bzip2)
53$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE
54 $(call if_changed,lzma)
55else
56$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
57 $(call if_changed,gzip) 45 $(call if_changed,gzip)
58$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE 46$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
59 $(call if_changed,bzip2) 47 $(call if_changed,bzip2)
60$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE 48$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
61 $(call if_changed,lzma) 49 $(call if_changed,lzma)
62endif
63LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
64 50
65else 51suffix-$(CONFIG_KERNEL_GZIP) := gz
52suffix-$(CONFIG_KERNEL_BZIP2) := bz2
53suffix-$(CONFIG_KERNEL_LZMA) := lzma
66 54
67$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE 55quiet_cmd_mkpiggy = MKPIGGY $@
68 $(call if_changed,gzip) 56 cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
69$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
70 $(call if_changed,bzip2)
71$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
72 $(call if_changed,lzma)
73
74LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
75endif
76 57
77suffix_$(CONFIG_KERNEL_GZIP) = gz 58targets += piggy.S
78suffix_$(CONFIG_KERNEL_BZIP2) = bz2 59$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
79suffix_$(CONFIG_KERNEL_LZMA) = lzma 60 $(call if_changed,mkpiggy)
80
81$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
82 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 3a8a866fb2e..75e4f001e70 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -12,16 +12,16 @@
12 * the page directory. [According to comments etc elsewhere on a compressed 12 * the page directory. [According to comments etc elsewhere on a compressed
13 * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC] 13 * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
14 * 14 *
15 * Page 0 is deliberately kept safe, since System Management Mode code in 15 * Page 0 is deliberately kept safe, since System Management Mode code in
16 * laptops may need to access the BIOS data stored there. This is also 16 * laptops may need to access the BIOS data stored there. This is also
17 * useful for future device drivers that either access the BIOS via VM86 17 * useful for future device drivers that either access the BIOS via VM86
18 * mode. 18 * mode.
19 */ 19 */
20 20
21/* 21/*
22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
23 */ 23 */
24.text 24 .text
25 25
26#include <linux/linkage.h> 26#include <linux/linkage.h>
27#include <asm/segment.h> 27#include <asm/segment.h>
@@ -29,161 +29,151 @@
29#include <asm/boot.h> 29#include <asm/boot.h>
30#include <asm/asm-offsets.h> 30#include <asm/asm-offsets.h>
31 31
32.section ".text.head","ax",@progbits 32 .section ".text.head","ax",@progbits
33ENTRY(startup_32) 33ENTRY(startup_32)
34 cld 34 cld
35 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 35 /*
36 * us to not reload segments */ 36 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
37 testb $(1<<6), BP_loadflags(%esi) 37 * us to not reload segments
38 jnz 1f 38 */
39 testb $(1<<6), BP_loadflags(%esi)
40 jnz 1f
39 41
40 cli 42 cli
41 movl $(__BOOT_DS),%eax 43 movl $__BOOT_DS, %eax
42 movl %eax,%ds 44 movl %eax, %ds
43 movl %eax,%es 45 movl %eax, %es
44 movl %eax,%fs 46 movl %eax, %fs
45 movl %eax,%gs 47 movl %eax, %gs
46 movl %eax,%ss 48 movl %eax, %ss
471: 491:
48 50
49/* Calculate the delta between where we were compiled to run 51/*
52 * Calculate the delta between where we were compiled to run
50 * at and where we were actually loaded at. This can only be done 53 * at and where we were actually loaded at. This can only be done
51 * with a short local call on x86. Nothing else will tell us what 54 * with a short local call on x86. Nothing else will tell us what
52 * address we are running at. The reserved chunk of the real-mode 55 * address we are running at. The reserved chunk of the real-mode
53 * data at 0x1e4 (defined as a scratch field) are used as the stack 56 * data at 0x1e4 (defined as a scratch field) are used as the stack
54 * for this calculation. Only 4 bytes are needed. 57 * for this calculation. Only 4 bytes are needed.
55 */ 58 */
56 leal (0x1e4+4)(%esi), %esp 59 leal (BP_scratch+4)(%esi), %esp
57 call 1f 60 call 1f
581: popl %ebp 611: popl %ebp
59 subl $1b, %ebp 62 subl $1b, %ebp
60 63
61/* %ebp contains the address we are loaded at by the boot loader and %ebx 64/*
65 * %ebp contains the address we are loaded at by the boot loader and %ebx
62 * contains the address where we should move the kernel image temporarily 66 * contains the address where we should move the kernel image temporarily
63 * for safe in-place decompression. 67 * for safe in-place decompression.
64 */ 68 */
65 69
66#ifdef CONFIG_RELOCATABLE 70#ifdef CONFIG_RELOCATABLE
67 movl %ebp, %ebx 71 movl %ebp, %ebx
68 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx 72 movl BP_kernel_alignment(%esi), %eax
69 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx 73 decl %eax
74 addl %eax, %ebx
75 notl %eax
76 andl %eax, %ebx
70#else 77#else
71 movl $LOAD_PHYSICAL_ADDR, %ebx 78 movl $LOAD_PHYSICAL_ADDR, %ebx
72#endif 79#endif
73 80
74 /* Replace the compressed data size with the uncompressed size */ 81 /* Target address to relocate to for decompression */
75 subl input_len(%ebp), %ebx 82 addl $z_extract_offset, %ebx
76 movl output_len(%ebp), %eax 83
77 addl %eax, %ebx 84 /* Set up the stack */
78 /* Add 8 bytes for every 32K input block */ 85 leal boot_stack_end(%ebx), %esp
79 shrl $12, %eax 86
80 addl %eax, %ebx 87 /* Zero EFLAGS */
81 /* Add 32K + 18 bytes of extra slack */ 88 pushl $0
82 addl $(32768 + 18), %ebx 89 popfl
83 /* Align on a 4K boundary */ 90
84 addl $4095, %ebx 91/*
85 andl $~4095, %ebx 92 * Copy the compressed kernel to the end of our buffer
86
87/* Copy the compressed kernel to the end of our buffer
88 * where decompression in place becomes safe. 93 * where decompression in place becomes safe.
89 */ 94 */
90 pushl %esi 95 pushl %esi
91 leal _end(%ebp), %esi 96 leal (_bss-4)(%ebp), %esi
92 leal _end(%ebx), %edi 97 leal (_bss-4)(%ebx), %edi
93 movl $(_end - startup_32), %ecx 98 movl $(_bss - startup_32), %ecx
99 shrl $2, %ecx
94 std 100 std
95 rep 101 rep movsl
96 movsb
97 cld 102 cld
98 popl %esi 103 popl %esi
99
100/* Compute the kernel start address.
101 */
102#ifdef CONFIG_RELOCATABLE
103 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
104 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
105#else
106 movl $LOAD_PHYSICAL_ADDR, %ebp
107#endif
108 104
109/* 105/*
110 * Jump to the relocated address. 106 * Jump to the relocated address.
111 */ 107 */
112 leal relocated(%ebx), %eax 108 leal relocated(%ebx), %eax
113 jmp *%eax 109 jmp *%eax
114ENDPROC(startup_32) 110ENDPROC(startup_32)
115 111
116.section ".text" 112 .text
117relocated: 113relocated:
118 114
119/* 115/*
120 * Clear BSS 116 * Clear BSS (stack is currently empty)
121 */
122 xorl %eax,%eax
123 leal _edata(%ebx),%edi
124 leal _end(%ebx), %ecx
125 subl %edi,%ecx
126 cld
127 rep
128 stosb
129
130/*
131 * Setup the stack for the decompressor
132 */ 117 */
133 leal boot_stack_end(%ebx), %esp 118 xorl %eax, %eax
119 leal _bss(%ebx), %edi
120 leal _ebss(%ebx), %ecx
121 subl %edi, %ecx
122 shrl $2, %ecx
123 rep stosl
134 124
135/* 125/*
136 * Do the decompression, and jump to the new kernel.. 126 * Do the decompression, and jump to the new kernel..
137 */ 127 */
138 movl output_len(%ebx), %eax 128 leal z_extract_offset_negative(%ebx), %ebp
139 pushl %eax 129 /* push arguments for decompress_kernel: */
140 # push arguments for decompress_kernel: 130 pushl %ebp /* output address */
141 pushl %ebp # output address 131 pushl $z_input_len /* input_len */
142 movl input_len(%ebx), %eax 132 leal input_data(%ebx), %eax
143 pushl %eax # input_len 133 pushl %eax /* input_data */
144 leal input_data(%ebx), %eax 134 leal boot_heap(%ebx), %eax
145 pushl %eax # input_data 135 pushl %eax /* heap area */
146 leal boot_heap(%ebx), %eax 136 pushl %esi /* real mode pointer */
147 pushl %eax # heap area 137 call decompress_kernel
148 pushl %esi # real mode pointer 138 addl $20, %esp
149 call decompress_kernel
150 addl $20, %esp
151 popl %ecx
152 139
153#if CONFIG_RELOCATABLE 140#if CONFIG_RELOCATABLE
154/* Find the address of the relocations. 141/*
142 * Find the address of the relocations.
155 */ 143 */
156 movl %ebp, %edi 144 leal z_output_len(%ebp), %edi
157 addl %ecx, %edi
158 145
159/* Calculate the delta between where vmlinux was compiled to run 146/*
147 * Calculate the delta between where vmlinux was compiled to run
160 * and where it was actually loaded. 148 * and where it was actually loaded.
161 */ 149 */
162 movl %ebp, %ebx 150 movl %ebp, %ebx
163 subl $LOAD_PHYSICAL_ADDR, %ebx 151 subl $LOAD_PHYSICAL_ADDR, %ebx
164 jz 2f /* Nothing to be done if loaded at compiled addr. */ 152 jz 2f /* Nothing to be done if loaded at compiled addr. */
165/* 153/*
166 * Process relocations. 154 * Process relocations.
167 */ 155 */
168 156
1691: subl $4, %edi 1571: subl $4, %edi
170 movl 0(%edi), %ecx 158 movl (%edi), %ecx
171 testl %ecx, %ecx 159 testl %ecx, %ecx
172 jz 2f 160 jz 2f
173 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) 161 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
174 jmp 1b 162 jmp 1b
1752: 1632:
176#endif 164#endif
177 165
178/* 166/*
179 * Jump to the decompressed kernel. 167 * Jump to the decompressed kernel.
180 */ 168 */
181 xorl %ebx,%ebx 169 xorl %ebx, %ebx
182 jmp *%ebp 170 jmp *%ebp
183 171
184.bss 172/*
185/* Stack and heap for uncompression */ 173 * Stack and heap for uncompression
186.balign 4 174 */
175 .bss
176 .balign 4
187boot_heap: 177boot_heap:
188 .fill BOOT_HEAP_SIZE, 1, 0 178 .fill BOOT_HEAP_SIZE, 1, 0
189boot_stack: 179boot_stack:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index ed4a8294800..f62c284db9e 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -21,8 +21,8 @@
21/* 21/*
22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
23 */ 23 */
24.code32 24 .code32
25.text 25 .text
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <asm/segment.h> 28#include <asm/segment.h>
@@ -33,12 +33,14 @@
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34#include <asm/asm-offsets.h> 34#include <asm/asm-offsets.h>
35 35
36.section ".text.head" 36 .section ".text.head"
37 .code32 37 .code32
38ENTRY(startup_32) 38ENTRY(startup_32)
39 cld 39 cld
40 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 40 /*
41 * us to not reload segments */ 41 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
42 * us to not reload segments
43 */
42 testb $(1<<6), BP_loadflags(%esi) 44 testb $(1<<6), BP_loadflags(%esi)
43 jnz 1f 45 jnz 1f
44 46
@@ -49,14 +51,15 @@ ENTRY(startup_32)
49 movl %eax, %ss 51 movl %eax, %ss
501: 521:
51 53
52/* Calculate the delta between where we were compiled to run 54/*
55 * Calculate the delta between where we were compiled to run
53 * at and where we were actually loaded at. This can only be done 56 * at and where we were actually loaded at. This can only be done
54 * with a short local call on x86. Nothing else will tell us what 57 * with a short local call on x86. Nothing else will tell us what
55 * address we are running at. The reserved chunk of the real-mode 58 * address we are running at. The reserved chunk of the real-mode
56 * data at 0x1e4 (defined as a scratch field) are used as the stack 59 * data at 0x1e4 (defined as a scratch field) are used as the stack
57 * for this calculation. Only 4 bytes are needed. 60 * for this calculation. Only 4 bytes are needed.
58 */ 61 */
59 leal (0x1e4+4)(%esi), %esp 62 leal (BP_scratch+4)(%esi), %esp
60 call 1f 63 call 1f
611: popl %ebp 641: popl %ebp
62 subl $1b, %ebp 65 subl $1b, %ebp
@@ -70,32 +73,28 @@ ENTRY(startup_32)
70 testl %eax, %eax 73 testl %eax, %eax
71 jnz no_longmode 74 jnz no_longmode
72 75
73/* Compute the delta between where we were compiled to run at 76/*
77 * Compute the delta between where we were compiled to run at
74 * and where the code will actually run at. 78 * and where the code will actually run at.
75 */ 79 *
76/* %ebp contains the address we are loaded at by the boot loader and %ebx 80 * %ebp contains the address we are loaded at by the boot loader and %ebx
77 * contains the address where we should move the kernel image temporarily 81 * contains the address where we should move the kernel image temporarily
78 * for safe in-place decompression. 82 * for safe in-place decompression.
79 */ 83 */
80 84
81#ifdef CONFIG_RELOCATABLE 85#ifdef CONFIG_RELOCATABLE
82 movl %ebp, %ebx 86 movl %ebp, %ebx
83 addl $(PMD_PAGE_SIZE -1), %ebx 87 movl BP_kernel_alignment(%esi), %eax
84 andl $PMD_PAGE_MASK, %ebx 88 decl %eax
89 addl %eax, %ebx
90 notl %eax
91 andl %eax, %ebx
85#else 92#else
86 movl $CONFIG_PHYSICAL_START, %ebx 93 movl $LOAD_PHYSICAL_ADDR, %ebx
87#endif 94#endif
88 95
89 /* Replace the compressed data size with the uncompressed size */ 96 /* Target address to relocate to for decompression */
90 subl input_len(%ebp), %ebx 97 addl $z_extract_offset, %ebx
91 movl output_len(%ebp), %eax
92 addl %eax, %ebx
93 /* Add 8 bytes for every 32K input block */
94 shrl $12, %eax
95 addl %eax, %ebx
96 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
97 addl $(32768 + 18 + 4095), %ebx
98 andl $~4095, %ebx
99 98
100/* 99/*
101 * Prepare for entering 64 bit mode 100 * Prepare for entering 64 bit mode
@@ -114,7 +113,7 @@ ENTRY(startup_32)
114 /* 113 /*
115 * Build early 4G boot pagetable 114 * Build early 4G boot pagetable
116 */ 115 */
117 /* Initialize Page tables to 0*/ 116 /* Initialize Page tables to 0 */
118 leal pgtable(%ebx), %edi 117 leal pgtable(%ebx), %edi
119 xorl %eax, %eax 118 xorl %eax, %eax
120 movl $((4096*6)/4), %ecx 119 movl $((4096*6)/4), %ecx
@@ -155,7 +154,8 @@ ENTRY(startup_32)
155 btsl $_EFER_LME, %eax 154 btsl $_EFER_LME, %eax
156 wrmsr 155 wrmsr
157 156
158 /* Setup for the jump to 64bit mode 157 /*
158 * Setup for the jump to 64bit mode
159 * 159 *
160 * When the jump is performed we will be in long mode but 160 * When the jump is performed we will be in long mode but
161 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 161 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
@@ -184,7 +184,8 @@ no_longmode:
184 184
185#include "../../kernel/verify_cpu_64.S" 185#include "../../kernel/verify_cpu_64.S"
186 186
187 /* Be careful here startup_64 needs to be at a predictable 187 /*
188 * Be careful here startup_64 needs to be at a predictable
188 * address so I can export it in an ELF header. Bootloaders 189 * address so I can export it in an ELF header. Bootloaders
189 * should look at the ELF header to find this address, as 190 * should look at the ELF header to find this address, as
190 * it may change in the future. 191 * it may change in the future.
@@ -192,7 +193,8 @@ no_longmode:
192 .code64 193 .code64
193 .org 0x200 194 .org 0x200
194ENTRY(startup_64) 195ENTRY(startup_64)
195 /* We come here either from startup_32 or directly from a 196 /*
197 * We come here either from startup_32 or directly from a
196 * 64bit bootloader. If we come here from a bootloader we depend on 198 * 64bit bootloader. If we come here from a bootloader we depend on
197 * an identity mapped page table being provided that maps our 199 * an identity mapped page table being provided that maps our
198 * entire text+data+bss and hopefully all of memory. 200 * entire text+data+bss and hopefully all of memory.
@@ -209,50 +211,54 @@ ENTRY(startup_64)
209 movl $0x20, %eax 211 movl $0x20, %eax
210 ltr %ax 212 ltr %ax
211 213
212 /* Compute the decompressed kernel start address. It is where 214 /*
215 * Compute the decompressed kernel start address. It is where
213 * we were loaded at aligned to a 2M boundary. %rbp contains the 216 * we were loaded at aligned to a 2M boundary. %rbp contains the
214 * decompressed kernel start address. 217 * decompressed kernel start address.
215 * 218 *
216 * If it is a relocatable kernel then decompress and run the kernel 219 * If it is a relocatable kernel then decompress and run the kernel
217 * from load address aligned to 2MB addr, otherwise decompress and 220 * from load address aligned to 2MB addr, otherwise decompress and
218 * run the kernel from CONFIG_PHYSICAL_START 221 * run the kernel from LOAD_PHYSICAL_ADDR
222 *
223 * We cannot rely on the calculation done in 32-bit mode, since we
224 * may have been invoked via the 64-bit entry point.
219 */ 225 */
220 226
221 /* Start with the delta to where the kernel will run at. */ 227 /* Start with the delta to where the kernel will run at. */
222#ifdef CONFIG_RELOCATABLE 228#ifdef CONFIG_RELOCATABLE
223 leaq startup_32(%rip) /* - $startup_32 */, %rbp 229 leaq startup_32(%rip) /* - $startup_32 */, %rbp
224 addq $(PMD_PAGE_SIZE - 1), %rbp 230 movl BP_kernel_alignment(%rsi), %eax
225 andq $PMD_PAGE_MASK, %rbp 231 decl %eax
226 movq %rbp, %rbx 232 addq %rax, %rbp
233 notq %rax
234 andq %rax, %rbp
227#else 235#else
228 movq $CONFIG_PHYSICAL_START, %rbp 236 movq $LOAD_PHYSICAL_ADDR, %rbp
229 movq %rbp, %rbx
230#endif 237#endif
231 238
232 /* Replace the compressed data size with the uncompressed size */ 239 /* Target address to relocate to for decompression */
233 movl input_len(%rip), %eax 240 leaq z_extract_offset(%rbp), %rbx
234 subq %rax, %rbx 241
235 movl output_len(%rip), %eax 242 /* Set up the stack */
236 addq %rax, %rbx 243 leaq boot_stack_end(%rbx), %rsp
237 /* Add 8 bytes for every 32K input block */ 244
238 shrq $12, %rax 245 /* Zero EFLAGS */
239 addq %rax, %rbx 246 pushq $0
240 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ 247 popfq
241 addq $(32768 + 18 + 4095), %rbx 248
242 andq $~4095, %rbx 249/*
243 250 * Copy the compressed kernel to the end of our buffer
244/* Copy the compressed kernel to the end of our buffer
245 * where decompression in place becomes safe. 251 * where decompression in place becomes safe.
246 */ 252 */
247 leaq _end_before_pgt(%rip), %r8 253 pushq %rsi
248 leaq _end_before_pgt(%rbx), %r9 254 leaq (_bss-8)(%rip), %rsi
249 movq $_end_before_pgt /* - $startup_32 */, %rcx 255 leaq (_bss-8)(%rbx), %rdi
2501: subq $8, %r8 256 movq $_bss /* - $startup_32 */, %rcx
251 subq $8, %r9 257 shrq $3, %rcx
252 movq 0(%r8), %rax 258 std
253 movq %rax, 0(%r9) 259 rep movsq
254 subq $8, %rcx 260 cld
255 jnz 1b 261 popq %rsi
256 262
257/* 263/*
258 * Jump to the relocated address. 264 * Jump to the relocated address.
@@ -260,37 +266,28 @@ ENTRY(startup_64)
260 leaq relocated(%rbx), %rax 266 leaq relocated(%rbx), %rax
261 jmp *%rax 267 jmp *%rax
262 268
263.section ".text" 269 .text
264relocated: 270relocated:
265 271
266/* 272/*
267 * Clear BSS 273 * Clear BSS (stack is currently empty)
268 */ 274 */
269 xorq %rax, %rax 275 xorl %eax, %eax
270 leaq _edata(%rbx), %rdi 276 leaq _bss(%rip), %rdi
271 leaq _end_before_pgt(%rbx), %rcx 277 leaq _ebss(%rip), %rcx
272 subq %rdi, %rcx 278 subq %rdi, %rcx
273 cld 279 shrq $3, %rcx
274 rep 280 rep stosq
275 stosb
276
277 /* Setup the stack */
278 leaq boot_stack_end(%rip), %rsp
279
280 /* zero EFLAGS after setting rsp */
281 pushq $0
282 popfq
283 281
284/* 282/*
285 * Do the decompression, and jump to the new kernel. 283 * Do the decompression, and jump to the new kernel.
286 */ 284 */
287 pushq %rsi # Save the real mode argument 285 pushq %rsi /* Save the real mode argument */
288 movq %rsi, %rdi # real mode address 286 movq %rsi, %rdi /* real mode address */
289 leaq boot_heap(%rip), %rsi # malloc area for uncompression 287 leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
290 leaq input_data(%rip), %rdx # input_data 288 leaq input_data(%rip), %rdx /* input_data */
291 movl input_len(%rip), %eax 289 movl $z_input_len, %ecx /* input_len */
292 movq %rax, %rcx # input_len 290 movq %rbp, %r8 /* output target address */
293 movq %rbp, %r8 # output
294 call decompress_kernel 291 call decompress_kernel
295 popq %rsi 292 popq %rsi
296 293
@@ -311,11 +308,21 @@ gdt:
311 .quad 0x0000000000000000 /* TS continued */ 308 .quad 0x0000000000000000 /* TS continued */
312gdt_end: 309gdt_end:
313 310
314.bss 311/*
315/* Stack and heap for uncompression */ 312 * Stack and heap for uncompression
316.balign 4 313 */
314 .bss
315 .balign 4
317boot_heap: 316boot_heap:
318 .fill BOOT_HEAP_SIZE, 1, 0 317 .fill BOOT_HEAP_SIZE, 1, 0
319boot_stack: 318boot_stack:
320 .fill BOOT_STACK_SIZE, 1, 0 319 .fill BOOT_STACK_SIZE, 1, 0
321boot_stack_end: 320boot_stack_end:
321
322/*
323 * Space for page tables (not in .bss so not zeroed)
324 */
325 .section ".pgtable","a",@nobits
326 .balign 4096
327pgtable:
328 .fill 6*4096, 1, 0
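The decl/addl/notl/andl (and decl/addq/notq/andq) sequences in the relocatable paths above replace the old PMD_PAGE_SIZE rounding: they round the load address up to the kernel_alignment advertised in boot_params. A minimal C sketch of the same arithmetic, assuming "align" stands in for the BP_kernel_alignment field and is a power of two:

	/*
	 * Sketch only, not part of the patch; "align" must be a power
	 * of two, as the boot protocol requires for kernel_alignment.
	 */
	static unsigned long round_up_to_alignment(unsigned long addr,
						   unsigned long align)
	{
		unsigned long mask = align - 1;	/* decl %eax            */

		addr += mask;			/* addq %rax, %rbp      */
		return addr & ~mask;		/* notq %rax; andq %rax */
	}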
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index e45be73684f..842b2a36174 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -325,21 +325,19 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
325 free_mem_ptr = heap; /* Heap */ 325 free_mem_ptr = heap; /* Heap */
326 free_mem_end_ptr = heap + BOOT_HEAP_SIZE; 326 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
327 327
328 if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
329 error("Destination address inappropriately aligned");
328#ifdef CONFIG_X86_64 330#ifdef CONFIG_X86_64
329 if ((unsigned long)output & (__KERNEL_ALIGN - 1)) 331 if (heap > 0x3fffffffffffUL)
330 error("Destination address not 2M aligned");
331 if ((unsigned long)output >= 0xffffffffffUL)
332 error("Destination address too large"); 332 error("Destination address too large");
333#else 333#else
334 if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1))
335 error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
336 if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff)) 334 if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
337 error("Destination address too large"); 335 error("Destination address too large");
336#endif
338#ifndef CONFIG_RELOCATABLE 337#ifndef CONFIG_RELOCATABLE
339 if ((u32)output != LOAD_PHYSICAL_ADDR) 338 if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
340 error("Wrong destination address"); 339 error("Wrong destination address");
341#endif 340#endif
342#endif
343 341
344 if (!quiet) 342 if (!quiet)
345 putstr("\nDecompressing Linux... "); 343 putstr("\nDecompressing Linux... ");
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
new file mode 100644
index 00000000000..bcbd36c4143
--- /dev/null
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -0,0 +1,97 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright (C) 2009 Intel Corporation. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License version
7 * 2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301, USA.
18 *
19 * H. Peter Anvin <hpa@linux.intel.com>
20 *
21 * ----------------------------------------------------------------------- */
22
23/*
24 * Compute the desired load offset from a compressed program; outputs
25 * a small assembly wrapper with the appropriate symbols defined.
26 */
27
28#include <stdlib.h>
29#include <stdio.h>
30#include <string.h>
31#include <inttypes.h>
32
33static uint32_t getle32(const void *p)
34{
35 const uint8_t *cp = p;
36
37 return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) +
38 ((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24);
39}
40
41int main(int argc, char *argv[])
42{
43 uint32_t olen;
44 long ilen;
45 unsigned long offs;
46 FILE *f;
47
48 if (argc < 2) {
49 fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
50 return 1;
51 }
52
53 /* Get the information for the compressed kernel image first */
54
55 f = fopen(argv[1], "r");
56 if (!f) {
57 perror(argv[1]);
58 return 1;
59 }
60
61
62 if (fseek(f, -4L, SEEK_END)) {
63 perror(argv[1]);
64 }
65 fread(&olen, sizeof olen, 1, f);
66 ilen = ftell(f);
67 olen = getle32(&olen);
68 fclose(f);
69
70 /*
71 * Now we have the input (compressed) and output (uncompressed)
72 * sizes, compute the necessary decompression offset...
73 */
74
75 offs = (olen > ilen) ? olen - ilen : 0;
76 offs += olen >> 12; /* Add 8 bytes for each 32K block */
77 offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */
78 offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
79
80 printf(".section \".rodata.compressed\",\"a\",@progbits\n");
81 printf(".globl z_input_len\n");
82 printf("z_input_len = %lu\n", ilen);
83 printf(".globl z_output_len\n");
84 printf("z_output_len = %lu\n", (unsigned long)olen);
85 printf(".globl z_extract_offset\n");
86 printf("z_extract_offset = 0x%lx\n", offs);
87 /* z_extract_offset_negative allows simplification of head_32.S */
88 printf(".globl z_extract_offset_negative\n");
89 printf("z_extract_offset_negative = -0x%lx\n", offs);
90
91 printf(".globl input_data, input_data_end\n");
92 printf("input_data:\n");
93 printf(".incbin \"%s\"\n", argv[1]);
94 printf("input_data_end:\n");
95
96 return 0;
97}
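A hedged restatement of the z_extract_offset formula above as a standalone helper, with a made-up worked example (the sizes are illustrative, not from any real build): olen = 16 MiB and ilen = 4 MiB give 12,582,912 + 4,096 + 32,786, rounded up to 12,623,872.

	/*
	 * Sketch only: ilen is the compressed size, olen the
	 * uncompressed size, exactly as in mkpiggy.c above.
	 */
	static unsigned long extract_offset(unsigned long ilen, unsigned long olen)
	{
		unsigned long offs = (olen > ilen) ? olen - ilen : 0;

		offs += olen >> 12;		/* 8 bytes per 32K output block   */
		offs += 32*1024 + 18;		/* gzip window plus trailer slack */
		return (offs + 4095) & ~4095UL;	/* round to a 4K boundary         */
	}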
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index 857e492c571..bbeb0c3fbd9 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -504,8 +504,11 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
504 if (sym->st_shndx == SHN_ABS) { 504 if (sym->st_shndx == SHN_ABS) {
505 continue; 505 continue;
506 } 506 }
507 if (r_type == R_386_PC32) { 507 if (r_type == R_386_NONE || r_type == R_386_PC32) {
508 /* PC relative relocations don't need to be adjusted */ 508 /*
 509 * NONE can be ignored and PC relative
510 * relocations don't need to be adjusted.
511 */
509 } 512 }
510 else if (r_type == R_386_32) { 513 else if (r_type == R_386_32) {
511 /* Visit relocations that need to be adjusted */ 514 /* Visit relocations that need to be adjusted */
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux.lds.S
index bef1ac891bc..cc353e1b3ff 100644
--- a/arch/x86/boot/compressed/vmlinux_64.lds
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,6 +1,17 @@
1OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") 1OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
2
3#undef i386
4
5#include <asm/page_types.h>
6
7#ifdef CONFIG_X86_64
2OUTPUT_ARCH(i386:x86-64) 8OUTPUT_ARCH(i386:x86-64)
3ENTRY(startup_64) 9ENTRY(startup_64)
10#else
11OUTPUT_ARCH(i386)
12ENTRY(startup_32)
13#endif
14
4SECTIONS 15SECTIONS
5{ 16{
6 /* Be careful parts of head_64.S assume startup_32 is at 17 /* Be careful parts of head_64.S assume startup_32 is at
@@ -33,16 +44,22 @@ SECTIONS
33 *(.data.*) 44 *(.data.*)
34 _edata = . ; 45 _edata = . ;
35 } 46 }
47 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
36 .bss : { 48 .bss : {
37 _bss = . ; 49 _bss = . ;
38 *(.bss) 50 *(.bss)
39 *(.bss.*) 51 *(.bss.*)
40 *(COMMON) 52 *(COMMON)
41 . = ALIGN(8); 53 . = ALIGN(8); /* For convenience during zeroing */
42 _end_before_pgt = . ;
43 . = ALIGN(4096);
44 pgtable = . ;
45 . = . + 4096 * 6;
46 _ebss = .; 54 _ebss = .;
47 } 55 }
56#ifdef CONFIG_X86_64
57 . = ALIGN(PAGE_SIZE);
58 .pgtable : {
59 _pgtable = . ;
60 *(.pgtable)
61 _epgtable = . ;
62 }
63#endif
64 _end = .;
48} 65}
diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr
deleted file mode 100644
index f02382ae5c4..00000000000
--- a/arch/x86/boot/compressed/vmlinux.scr
+++ /dev/null
@@ -1,10 +0,0 @@
1SECTIONS
2{
3 .rodata.compressed : {
4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .;
6 *(.data)
7 output_len = . - 4;
8 input_data_end = .;
9 }
10}
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
deleted file mode 100644
index bb3c48379c4..00000000000
--- a/arch/x86/boot/compressed/vmlinux_32.lds
+++ /dev/null
@@ -1,43 +0,0 @@
1OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
2OUTPUT_ARCH(i386)
3ENTRY(startup_32)
4SECTIONS
5{
6 /* Be careful parts of head_32.S assume startup_32 is at
7 * address 0.
8 */
9 . = 0;
10 .text.head : {
11 _head = . ;
12 *(.text.head)
13 _ehead = . ;
14 }
15 .rodata.compressed : {
16 *(.rodata.compressed)
17 }
18 .text : {
19 _text = .; /* Text */
20 *(.text)
21 *(.text.*)
22 _etext = . ;
23 }
24 .rodata : {
25 _rodata = . ;
26 *(.rodata) /* read-only data */
27 *(.rodata.*)
28 _erodata = . ;
29 }
30 .data : {
31 _data = . ;
32 *(.data)
33 *(.data.*)
34 _edata = . ;
35 }
36 .bss : {
37 _bss = . ;
38 *(.bss)
39 *(.bss.*)
40 *(COMMON)
41 _end = . ;
42 }
43}
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 1aae8f3e5ca..c501a5b466f 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -22,17 +23,17 @@
22 */ 23 */
23static int read_mbr(u8 devno, void *buf) 24static int read_mbr(u8 devno, void *buf)
24{ 25{
25 u16 ax, bx, cx, dx; 26 struct biosregs ireg, oreg;
26 27
27 ax = 0x0201; /* Legacy Read, one sector */ 28 initregs(&ireg);
28 cx = 0x0001; /* Sector 0-0-1 */ 29 ireg.ax = 0x0201; /* Legacy Read, one sector */
29 dx = devno; 30 ireg.cx = 0x0001; /* Sector 0-0-1 */
30 bx = (size_t)buf; 31 ireg.dl = devno;
31 asm volatile("pushfl; stc; int $0x13; setc %%al; popfl" 32 ireg.bx = (size_t)buf;
32 : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx)
33 : : "esi", "edi", "memory");
34 33
35 return -(u8)ax; /* 0 or -1 */ 34 intcall(0x13, &ireg, &oreg);
35
36 return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
36} 37}
37 38
38static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) 39static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
@@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
72 73
73static int get_edd_info(u8 devno, struct edd_info *ei) 74static int get_edd_info(u8 devno, struct edd_info *ei)
74{ 75{
75 u16 ax, bx, cx, dx, di; 76 struct biosregs ireg, oreg;
76 77
77 memset(ei, 0, sizeof *ei); 78 memset(ei, 0, sizeof *ei);
78 79
79 /* Check Extensions Present */ 80 /* Check Extensions Present */
80 81
81 ax = 0x4100; 82 initregs(&ireg);
82 bx = EDDMAGIC1; 83 ireg.ah = 0x41;
83 dx = devno; 84 ireg.bx = EDDMAGIC1;
84 asm("pushfl; stc; int $0x13; setc %%al; popfl" 85 ireg.dl = devno;
85 : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx) 86 intcall(0x13, &ireg, &oreg);
86 : : "esi", "edi");
87 87
88 if ((u8)ax) 88 if (oreg.eflags & X86_EFLAGS_CF)
89 return -1; /* No extended information */ 89 return -1; /* No extended information */
90 90
91 if (bx != EDDMAGIC2) 91 if (oreg.bx != EDDMAGIC2)
92 return -1; 92 return -1;
93 93
94 ei->device = devno; 94 ei->device = devno;
95 ei->version = ax >> 8; /* EDD version number */ 95 ei->version = oreg.ah; /* EDD version number */
96 ei->interface_support = cx; /* EDD functionality subsets */ 96 ei->interface_support = oreg.cx; /* EDD functionality subsets */
97 97
98 /* Extended Get Device Parameters */ 98 /* Extended Get Device Parameters */
99 99
100 ei->params.length = sizeof(ei->params); 100 ei->params.length = sizeof(ei->params);
101 ax = 0x4800; 101 ireg.ah = 0x48;
102 dx = devno; 102 ireg.si = (size_t)&ei->params;
103 asm("pushfl; int $0x13; popfl" 103 intcall(0x13, &ireg, &oreg);
104 : "+a" (ax), "+d" (dx), "=m" (ei->params)
105 : "S" (&ei->params)
106 : "ebx", "ecx", "edi");
107 104
108 /* Get legacy CHS parameters */ 105 /* Get legacy CHS parameters */
109 106
110 /* Ralf Brown recommends setting ES:DI to 0:0 */ 107 /* Ralf Brown recommends setting ES:DI to 0:0 */
111 ax = 0x0800; 108 ireg.ah = 0x08;
112 dx = devno; 109 ireg.es = 0;
113 di = 0; 110 intcall(0x13, &ireg, &oreg);
114 asm("pushw %%es; " 111
115 "movw %%di,%%es; " 112 if (!(oreg.eflags & X86_EFLAGS_CF)) {
116 "pushfl; stc; int $0x13; setc %%al; popfl; " 113 ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2);
117 "popw %%es" 114 ei->legacy_max_head = oreg.dh;
118 : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di) 115 ei->legacy_sectors_per_track = oreg.cl & 0x3f;
119 : : "esi");
120
121 if ((u8)ax == 0) {
122 ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2);
123 ei->legacy_max_head = dx >> 8;
124 ei->legacy_sectors_per_track = cx & 0x3f;
125 } 116 }
126 117
127 return 0; 118 return 0;
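The legacy geometry decode above unpacks the packed CHS values returned by INT 13h, AH=08h. A hedged sketch of the same bit manipulation, using plain parameter names rather than the biosregs fields:

	/*
	 * Sketch only: CH holds the low 8 bits of the maximum cylinder,
	 * bits 6-7 of CL its top two bits, and bits 0-5 of CL the
	 * sectors-per-track count.
	 */
	struct chs { unsigned int max_cylinder, max_head, sectors_per_track; };

	static struct chs decode_chs(unsigned char ch, unsigned char cl,
				     unsigned char dh)
	{
		struct chs g;

		g.max_cylinder      = ch + ((cl & 0xc0) << 2);
		g.max_head          = dh;
		g.sectors_per_track = cl & 0x3f;
		return g;
	}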
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 5d84d1c74e4..b31cc54b464 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -22,7 +22,8 @@
22#include <asm/page_types.h> 22#include <asm/page_types.h>
23#include <asm/setup.h> 23#include <asm/setup.h>
24#include "boot.h" 24#include "boot.h"
25#include "offsets.h" 25#include "voffset.h"
26#include "zoffset.h"
26 27
27BOOTSEG = 0x07C0 /* original address of boot-sector */ 28BOOTSEG = 0x07C0 /* original address of boot-sector */
28SYSSEG = 0x1000 /* historical load address >> 4 */ 29SYSSEG = 0x1000 /* historical load address >> 4 */
@@ -115,7 +116,7 @@ _start:
115 # Part 2 of the header, from the old setup.S 116 # Part 2 of the header, from the old setup.S
116 117
117 .ascii "HdrS" # header signature 118 .ascii "HdrS" # header signature
118 .word 0x0209 # header version number (>= 0x0105) 119 .word 0x020a # header version number (>= 0x0105)
119 # or else old loadlin-1.5 will fail) 120 # or else old loadlin-1.5 will fail)
120 .globl realmode_swtch 121 .globl realmode_swtch
121realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 122realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -168,7 +169,11 @@ heap_end_ptr: .word _end+STACK_SIZE-512
168 # end of setup code can be used by setup 169 # end of setup code can be used by setup
169 # for local heap purposes. 170 # for local heap purposes.
170 171
171pad1: .word 0 172ext_loader_ver:
173 .byte 0 # Extended boot loader version
174ext_loader_type:
175 .byte 0 # Extended boot loader type
176
172cmd_line_ptr: .long 0 # (Header version 0x0202 or later) 177cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
173 # If nonzero, a 32-bit pointer 178 # If nonzero, a 32-bit pointer
174 # to the kernel command line. 179 # to the kernel command line.
@@ -200,7 +205,7 @@ relocatable_kernel: .byte 1
200#else 205#else
201relocatable_kernel: .byte 0 206relocatable_kernel: .byte 0
202#endif 207#endif
203pad2: .byte 0 208min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment
204pad3: .word 0 209pad3: .word 0
205 210
206cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, 211cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
@@ -212,16 +217,27 @@ hardware_subarch: .long 0 # subarchitecture, added with 2.07
212 217
213hardware_subarch_data: .quad 0 218hardware_subarch_data: .quad 0
214 219
215payload_offset: .long input_data 220payload_offset: .long ZO_input_data
216payload_length: .long input_data_end-input_data 221payload_length: .long ZO_z_input_len
217 222
218setup_data: .quad 0 # 64-bit physical pointer to 223setup_data: .quad 0 # 64-bit physical pointer to
219 # single linked list of 224 # single linked list of
220 # struct setup_data 225 # struct setup_data
221 226
227pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
228
229#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset)
230#define VO_INIT_SIZE (VO__end - VO__text)
231#if ZO_INIT_SIZE > VO_INIT_SIZE
232#define INIT_SIZE ZO_INIT_SIZE
233#else
234#define INIT_SIZE VO_INIT_SIZE
235#endif
236init_size: .long INIT_SIZE # kernel initialization size
237
222# End of setup header ##################################################### 238# End of setup header #####################################################
223 239
224 .section ".inittext", "ax" 240 .section ".entrytext", "ax"
225start_of_setup: 241start_of_setup:
226#ifdef SAFE_RESET_DISK_CONTROLLER 242#ifdef SAFE_RESET_DISK_CONTROLLER
227# Reset the disk controller. 243# Reset the disk controller.
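The new init_size field above tells the boot loader how much memory to reserve at the load address: whichever is larger, the compressed image plus its extraction slack (the ZO_ symbols from zoffset.h) or the uncompressed kernel (the VO_ symbols from voffset.h). A hedged C sketch of the same selection, with the link-time symbol differences passed in as plain integers:

	/* Sketch of the INIT_SIZE choice; not part of the patch. */
	static long init_size(long zo_end, long zo_startup_32,
			      long zo_z_extract_offset,
			      long vo_end, long vo_text)
	{
		long zo_init_size = zo_end - zo_startup_32 + zo_z_extract_offset;
		long vo_init_size = vo_end - vo_text;

		return zo_init_size > vo_init_size ? zo_init_size : vo_init_size;
	}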
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 58f0415d3ae..140172b895b 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -61,11 +62,10 @@ static void copy_boot_params(void)
61 */ 62 */
62static void keyboard_set_repeat(void) 63static void keyboard_set_repeat(void)
63{ 64{
64 u16 ax = 0x0305; 65 struct biosregs ireg;
65 u16 bx = 0; 66 initregs(&ireg);
66 asm volatile("int $0x16" 67 ireg.ax = 0x0305;
67 : "+a" (ax), "+b" (bx) 68 intcall(0x16, &ireg, NULL);
68 : : "ecx", "edx", "esi", "edi");
69} 69}
70 70
71/* 71/*
@@ -73,18 +73,22 @@ static void keyboard_set_repeat(void)
73 */ 73 */
74static void query_ist(void) 74static void query_ist(void)
75{ 75{
76 struct biosregs ireg, oreg;
77
76 /* Some older BIOSes apparently crash on this call, so filter 78 /* Some older BIOSes apparently crash on this call, so filter
77 it from machines too old to have SpeedStep at all. */ 79 it from machines too old to have SpeedStep at all. */
78 if (cpu.level < 6) 80 if (cpu.level < 6)
79 return; 81 return;
80 82
81 asm("int $0x15" 83 initregs(&ireg);
82 : "=a" (boot_params.ist_info.signature), 84 ireg.ax = 0xe980; /* IST Support */
83 "=b" (boot_params.ist_info.command), 85 ireg.edx = 0x47534943; /* Request value */
84 "=c" (boot_params.ist_info.event), 86 intcall(0x15, &ireg, &oreg);
85 "=d" (boot_params.ist_info.perf_level) 87
86 : "a" (0x0000e980), /* IST Support */ 88 boot_params.ist_info.signature = oreg.eax;
87 "d" (0x47534943)); /* Request value */ 89 boot_params.ist_info.command = oreg.ebx;
90 boot_params.ist_info.event = oreg.ecx;
91 boot_params.ist_info.perf_level = oreg.edx;
88} 92}
89 93
90/* 94/*
@@ -93,13 +97,12 @@ static void query_ist(void)
93static void set_bios_mode(void) 97static void set_bios_mode(void)
94{ 98{
95#ifdef CONFIG_X86_64 99#ifdef CONFIG_X86_64
96 u32 eax, ebx; 100 struct biosregs ireg;
97 101
98 eax = 0xec00; 102 initregs(&ireg);
99 ebx = 2; 103 ireg.ax = 0xec00;
100 asm volatile("int $0x15" 104 ireg.bx = 2;
101 : "+a" (eax), "+b" (ebx) 105 intcall(0x15, &ireg, NULL);
102 : : "ecx", "edx", "esi", "edi");
103#endif 106#endif
104} 107}
105 108
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
index 911eaae5d69..a95a531148e 100644
--- a/arch/x86/boot/mca.c
+++ b/arch/x86/boot/mca.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -16,26 +17,22 @@
16 17
17int query_mca(void) 18int query_mca(void)
18{ 19{
19 u8 err; 20 struct biosregs ireg, oreg;
20 u16 es, bx, len; 21 u16 len;
21 22
22 asm("pushw %%es ; " 23 initregs(&ireg);
23 "int $0x15 ; " 24 ireg.ah = 0xc0;
24 "setc %0 ; " 25 intcall(0x15, &ireg, &oreg);
25 "movw %%es, %1 ; " 26
26 "popw %%es" 27 if (oreg.eflags & X86_EFLAGS_CF)
27 : "=acd" (err), "=acdSD" (es), "=b" (bx)
28 : "a" (0xc000));
29
30 if (err)
31 return -1; /* No MCA present */ 28 return -1; /* No MCA present */
32 29
33 set_fs(es); 30 set_fs(oreg.es);
34 len = rdfs16(bx); 31 len = rdfs16(oreg.bx);
35 32
36 if (len > sizeof(boot_params.sys_desc_table)) 33 if (len > sizeof(boot_params.sys_desc_table))
37 len = sizeof(boot_params.sys_desc_table); 34 len = sizeof(boot_params.sys_desc_table);
38 35
39 copy_from_fs(&boot_params.sys_desc_table, bx, len); 36 copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
40 return 0; 37 return 0;
41} 38}
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 5054c2ddd1a..cae3feb1035 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -17,43 +17,41 @@
17 17
18#define SMAP 0x534d4150 /* ASCII "SMAP" */ 18#define SMAP 0x534d4150 /* ASCII "SMAP" */
19 19
20struct e820_ext_entry {
21 struct e820entry std;
22 u32 ext_flags;
23} __attribute__((packed));
24
25static int detect_memory_e820(void) 20static int detect_memory_e820(void)
26{ 21{
27 int count = 0; 22 int count = 0;
28 u32 next = 0; 23 struct biosregs ireg, oreg;
29 u32 size, id, edi;
30 u8 err;
31 struct e820entry *desc = boot_params.e820_map; 24 struct e820entry *desc = boot_params.e820_map;
32 static struct e820_ext_entry buf; /* static so it is zeroed */ 25 static struct e820entry buf; /* static so it is zeroed */
26
27 initregs(&ireg);
28 ireg.ax = 0xe820;
29 ireg.cx = sizeof buf;
30 ireg.edx = SMAP;
31 ireg.di = (size_t)&buf;
33 32
34 /* 33 /*
35 * Set this here so that if the BIOS doesn't change this field 34 * Note: at least one BIOS is known which assumes that the
36 * but still doesn't change %ecx, we're still okay... 35 * buffer pointed to by one e820 call is the same one as
36 * the previous call, and only changes modified fields. Therefore,
37 * we use a temporary buffer and copy the results entry by entry.
38 *
39 * This routine deliberately does not try to account for
40 * ACPI 3+ extended attributes. This is because there are
41 * BIOSes in the field which report zero for the valid bit for
42 * all ranges, and we don't currently make any use of the
43 * other attribute bits. Revisit this if we see the extended
44 * attribute bits deployed in a meaningful way in the future.
37 */ 45 */
38 buf.ext_flags = 1;
39 46
40 do { 47 do {
41 size = sizeof buf; 48 intcall(0x15, &ireg, &oreg);
42 49 ireg.ebx = oreg.ebx; /* for next iteration... */
43 /* Important: %edx and %esi are clobbered by some BIOSes,
44 so they must be either used for the error output
45 or explicitly marked clobbered. Given that, assume there
46 is something out there clobbering %ebp and %edi, too. */
47 asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
48 : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
49 "=D" (edi), "+m" (buf)
50 : "D" (&buf), "d" (SMAP), "a" (0xe820)
51 : "esi");
52 50
53 /* BIOSes which terminate the chain with CF = 1 as opposed 51 /* BIOSes which terminate the chain with CF = 1 as opposed
54 to %ebx = 0 don't always report the SMAP signature on 52 to %ebx = 0 don't always report the SMAP signature on
55 the final, failing, probe. */ 53 the final, failing, probe. */
56 if (err) 54 if (oreg.eflags & X86_EFLAGS_CF)
57 break; 55 break;
58 56
59 /* Some BIOSes stop returning SMAP in the middle of 57 /* Some BIOSes stop returning SMAP in the middle of
@@ -61,66 +59,64 @@ static int detect_memory_e820(void)
61 screwed up the map at that point, we might have a 59 screwed up the map at that point, we might have a
62 partial map, the full map, or complete garbage, so 60 partial map, the full map, or complete garbage, so
63 just return failure. */ 61 just return failure. */
64 if (id != SMAP) { 62 if (oreg.eax != SMAP) {
65 count = 0; 63 count = 0;
66 break; 64 break;
67 } 65 }
68 66
69 /* ACPI 3.0 added the extended flags support. If bit 0 67 *desc++ = buf;
70 in the extended flags is zero, we're supposed to simply
71 ignore the entry -- a backwards incompatible change! */
72 if (size > 20 && !(buf.ext_flags & 1))
73 continue;
74
75 *desc++ = buf.std;
76 count++; 68 count++;
77 } while (next && count < ARRAY_SIZE(boot_params.e820_map)); 69 } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));
78 70
79 return boot_params.e820_entries = count; 71 return boot_params.e820_entries = count;
80} 72}
81 73
82static int detect_memory_e801(void) 74static int detect_memory_e801(void)
83{ 75{
84 u16 ax, bx, cx, dx; 76 struct biosregs ireg, oreg;
85 u8 err;
86 77
87 bx = cx = dx = 0; 78 initregs(&ireg);
88 ax = 0xe801; 79 ireg.ax = 0xe801;
89 asm("stc; int $0x15; setc %0" 80 intcall(0x15, &ireg, &oreg);
90 : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx));
91 81
92 if (err) 82 if (oreg.eflags & X86_EFLAGS_CF)
93 return -1; 83 return -1;
94 84
95 /* Do we really need to do this? */ 85 /* Do we really need to do this? */
96 if (cx || dx) { 86 if (oreg.cx || oreg.dx) {
97 ax = cx; 87 oreg.ax = oreg.cx;
98 bx = dx; 88 oreg.bx = oreg.dx;
99 } 89 }
100 90
101 if (ax > 15*1024) 91 if (oreg.ax > 15*1024) {
102 return -1; /* Bogus! */ 92 return -1; /* Bogus! */
103 93 } else if (oreg.ax == 15*1024) {
104 /* This ignores memory above 16MB if we have a memory hole 94 boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
105 there. If someone actually finds a machine with a memory 95 } else {
106 hole at 16MB and no support for 0E820h they should probably 96 /*
107 generate a fake e820 map. */ 97 * This ignores memory above 16MB if we have a memory
108 boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax; 98 * hole there. If someone actually finds a machine
99 * with a memory hole at 16MB and no support for
100 * 0E820h they should probably generate a fake e820
101 * map.
102 */
103 boot_params.alt_mem_k = oreg.ax;
104 }
109 105
110 return 0; 106 return 0;
111} 107}
112 108
113static int detect_memory_88(void) 109static int detect_memory_88(void)
114{ 110{
115 u16 ax; 111 struct biosregs ireg, oreg;
116 u8 err;
117 112
118 ax = 0x8800; 113 initregs(&ireg);
119 asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax)); 114 ireg.ah = 0x88;
115 intcall(0x15, &ireg, &oreg);
120 116
121 boot_params.screen_info.ext_mem_k = ax; 117 boot_params.screen_info.ext_mem_k = oreg.ax;
122 118
123 return -err; 119 return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
124} 120}
125 121
126int detect_memory(void) 122int detect_memory(void)
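The e801 branch above converts the two BIOS counts into a single KiB figure: AX counts 1 KiB blocks between 1 MiB and 16 MiB (capped at 15*1024), DX counts 64 KiB blocks above 16 MiB, so shifting DX left by 6 converts it to KiB. A hedged sketch with hypothetical register values:

	/*
	 * Illustration only; the caller has already rejected ax > 15*1024
	 * as bogus.  Example (made-up values): ax = 15*1024, dx = 2048
	 * (128 MiB above the 16 MiB line) gives 2048*64 + 15360 = 146432 KiB.
	 */
	static unsigned int e801_alt_mem_k(unsigned int ax, unsigned int dx)
	{
		if (ax == 15*1024)
			return (dx << 6) + ax;	/* memory continues past 16 MiB */
		return ax;			/* memory hole at 16 MiB        */
	}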
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c
new file mode 100644
index 00000000000..958019b1cfa
--- /dev/null
+++ b/arch/x86/boot/regs.c
@@ -0,0 +1,29 @@
1/* -----------------------------------------------------------------------
2 *
3 * Copyright 2009 Intel Corporation; author H. Peter Anvin
4 *
5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2 or (at your
7 * option) any later version; incorporated herein by reference.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * Simple helper function for initializing a register set.
13 *
14 * Note that this sets EFLAGS_CF in the input register set; this
15 * makes it easier to catch functions which do nothing but don't
16 * explicitly set CF.
17 */
18
19#include "boot.h"
20
21void initregs(struct biosregs *reg)
22{
23 memset(reg, 0, sizeof *reg);
24 reg->eflags |= X86_EFLAGS_CF;
25 reg->ds = ds();
26 reg->es = ds();
27 reg->fs = fs();
28 reg->gs = gs();
29}
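Every conversion in this patch follows the calling convention this helper enables: initregs() zeroes the register image, seeds the segment registers, and sets CF (so a BIOS service that never touches CF reads back as failure); the caller fills in the inputs, invokes intcall(), then reads outputs and flags from oreg. A hedged usage sketch, using a BIOS service not touched by this patch:

	/*
	 * Sketch only: INT 16h, AH=02h returns the keyboard shift flags
	 * in AL; this is an example of the pattern, not part of the patch.
	 */
	static u8 keyboard_shift_flags(void)
	{
		struct biosregs ireg, oreg;

		initregs(&ireg);
		ireg.ah = 0x02;			/* Get shift flags */
		intcall(0x16, &ireg, &oreg);
		return oreg.al;
	}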
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index bb8dc2de796..0f6ec455a2b 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -15,8 +15,11 @@ SECTIONS
15 15
16 . = 497; 16 . = 497;
17 .header : { *(.header) } 17 .header : { *(.header) }
18 .entrytext : { *(.entrytext) }
18 .inittext : { *(.inittext) } 19 .inittext : { *(.inittext) }
19 .initdata : { *(.initdata) } 20 .initdata : { *(.initdata) }
21 __end_init = .;
22
20 .text : { *(.text) } 23 .text : { *(.text) }
21 .text32 : { *(.text32) } 24 .text32 : { *(.text32) }
22 25
@@ -52,4 +55,7 @@ SECTIONS
52 55
53 . = ASSERT(_end <= 0x8000, "Setup too big!"); 56 . = ASSERT(_end <= 0x8000, "Setup too big!");
54 . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); 57 . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
58 /* Necessary for the very-old-loader check to work... */
59 . = ASSERT(__end_init <= 5*512, "init sections too big!");
60
55} 61}
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 7e8e8b25f5f..01ec69c901c 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -22,24 +23,23 @@
22 23
23void __attribute__((section(".inittext"))) putchar(int ch) 24void __attribute__((section(".inittext"))) putchar(int ch)
24{ 25{
25 unsigned char c = ch; 26 struct biosregs ireg;
26 27
27 if (c == '\n') 28 if (ch == '\n')
28 putchar('\r'); /* \n -> \r\n */ 29 putchar('\r'); /* \n -> \r\n */
29 30
30 /* int $0x10 is known to have bugs involving touching registers 31 initregs(&ireg);
31 it shouldn't. Be extra conservative... */ 32 ireg.bx = 0x0007;
32 asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal" 33 ireg.cx = 0x0001;
33 : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch)); 34 ireg.ah = 0x0e;
35 ireg.al = ch;
36 intcall(0x10, &ireg, NULL);
34} 37}
35 38
36void __attribute__((section(".inittext"))) puts(const char *str) 39void __attribute__((section(".inittext"))) puts(const char *str)
37{ 40{
38 int n = 0; 41 while (*str)
39 while (*str) {
40 putchar(*str++); 42 putchar(*str++);
41 n++;
42 }
43} 43}
44 44
45/* 45/*
@@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str)
49 49
50static u8 gettime(void) 50static u8 gettime(void)
51{ 51{
52 u16 ax = 0x0200; 52 struct biosregs ireg, oreg;
53 u16 cx, dx;
54 53
55 asm volatile("int $0x1a" 54 initregs(&ireg);
56 : "+a" (ax), "=c" (cx), "=d" (dx) 55 ireg.ah = 0x02;
57 : : "ebx", "esi", "edi"); 56 intcall(0x1a, &ireg, &oreg);
58 57
59 return dx >> 8; 58 return oreg.dh;
60} 59}
61 60
62/* 61/*
@@ -64,19 +63,24 @@ static u8 gettime(void)
64 */ 63 */
65int getchar(void) 64int getchar(void)
66{ 65{
67 u16 ax = 0; 66 struct biosregs ireg, oreg;
68 asm volatile("int $0x16" : "+a" (ax)); 67
68 initregs(&ireg);
69 /* ireg.ah = 0x00; */
70 intcall(0x16, &ireg, &oreg);
69 71
70 return ax & 0xff; 72 return oreg.al;
71} 73}
72 74
73static int kbd_pending(void) 75static int kbd_pending(void)
74{ 76{
75 u8 pending; 77 struct biosregs ireg, oreg;
76 asm volatile("int $0x16; setnz %0" 78
77 : "=qm" (pending) 79 initregs(&ireg);
78 : "a" (0x0100)); 80 ireg.ah = 0x01;
79 return pending; 81 intcall(0x16, &ireg, &oreg);
82
83 return !(oreg.eflags & X86_EFLAGS_ZF);
80} 84}
81 85
82void kbd_flush(void) 86void kbd_flush(void)
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index 3fa979c9c36..d660be49236 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -29,21 +30,21 @@ static int bios_set_mode(struct mode_info *mi)
29 30
30static int set_bios_mode(u8 mode) 31static int set_bios_mode(u8 mode)
31{ 32{
32 u16 ax; 33 struct biosregs ireg, oreg;
33 u8 new_mode; 34 u8 new_mode;
34 35
35 ax = mode; /* AH=0x00 Set Video Mode */ 36 initregs(&ireg);
36 asm volatile(INT10 37 ireg.al = mode; /* AH=0x00 Set Video Mode */
37 : "+a" (ax) 38 intcall(0x10, &ireg, NULL);
38 : : "ebx", "ecx", "edx", "esi", "edi");
39 39
40 ax = 0x0f00; /* Get Current Video Mode */ 40
41 asm volatile(INT10 41 ireg.ah = 0x0f; /* Get Current Video Mode */
42 : "+a" (ax) 42 intcall(0x10, &ireg, &oreg);
43 : : "ebx", "ecx", "edx", "esi", "edi");
44 43
45 do_restore = 1; /* Assume video contents were lost */ 44 do_restore = 1; /* Assume video contents were lost */
46 new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */ 45
46 /* Not all BIOSes are clean with the top bit */
47 new_mode = ireg.al & 0x7f;
47 48
48 if (new_mode == mode) 49 if (new_mode == mode)
49 return 0; /* Mode change OK */ 50 return 0; /* Mode change OK */
@@ -53,10 +54,8 @@ static int set_bios_mode(u8 mode)
53 /* Mode setting failed, but we didn't end up where we 54 /* Mode setting failed, but we didn't end up where we
54 started. That's bad. Try to revert to the original 55 started. That's bad. Try to revert to the original
55 video mode. */ 56 video mode. */
56 ax = boot_params.screen_info.orig_video_mode; 57 ireg.ax = boot_params.screen_info.orig_video_mode;
57 asm volatile(INT10 58 intcall(0x10, &ireg, NULL);
58 : "+a" (ax)
59 : : "ebx", "ecx", "edx", "esi", "edi");
60 } 59 }
61#endif 60#endif
62 return -1; 61 return -1;
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 4a58c8ce3f6..c700147d6ff 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {}
31static int vesa_probe(void) 32static int vesa_probe(void)
32{ 33{
33#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) 34#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
34 u16 ax, cx, di; 35 struct biosregs ireg, oreg;
35 u16 mode; 36 u16 mode;
36 addr_t mode_ptr; 37 addr_t mode_ptr;
37 struct mode_info *mi; 38 struct mode_info *mi;
@@ -39,13 +40,12 @@ static int vesa_probe(void)
39 40
40 video_vesa.modes = GET_HEAP(struct mode_info, 0); 41 video_vesa.modes = GET_HEAP(struct mode_info, 0);
41 42
42 ax = 0x4f00; 43 initregs(&ireg);
43 di = (size_t)&vginfo; 44 ireg.ax = 0x4f00;
44 asm(INT10 45 ireg.di = (size_t)&vginfo;
45 : "+a" (ax), "+D" (di), "=m" (vginfo) 46 intcall(0x10, &ireg, &oreg);
46 : : "ebx", "ecx", "edx", "esi");
47 47
48 if (ax != 0x004f || 48 if (ireg.ax != 0x004f ||
49 vginfo.signature != VESA_MAGIC || 49 vginfo.signature != VESA_MAGIC ||
50 vginfo.version < 0x0102) 50 vginfo.version < 0x0102)
51 return 0; /* Not present */ 51 return 0; /* Not present */
@@ -65,14 +65,12 @@ static int vesa_probe(void)
65 65
66 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ 66 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
67 67
68 ax = 0x4f01; 68 ireg.ax = 0x4f01;
69 cx = mode; 69 ireg.cx = mode;
70 di = (size_t)&vminfo; 70 ireg.di = (size_t)&vminfo;
71 asm(INT10 71 intcall(0x10, &ireg, &oreg);
72 : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
73 : : "ebx", "edx", "esi");
74 72
75 if (ax != 0x004f) 73 if (ireg.ax != 0x004f)
76 continue; 74 continue;
77 75
78 if ((vminfo.mode_attr & 0x15) == 0x05) { 76 if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -111,20 +109,19 @@ static int vesa_probe(void)
111 109
112static int vesa_set_mode(struct mode_info *mode) 110static int vesa_set_mode(struct mode_info *mode)
113{ 111{
114 u16 ax, bx, cx, di; 112 struct biosregs ireg, oreg;
115 int is_graphic; 113 int is_graphic;
116 u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA; 114 u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
117 115
118 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ 116 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
119 117
120 ax = 0x4f01; 118 initregs(&ireg);
121 cx = vesa_mode; 119 ireg.ax = 0x4f01;
122 di = (size_t)&vminfo; 120 ireg.cx = vesa_mode;
123 asm(INT10 121 ireg.di = (size_t)&vminfo;
124 : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo) 122 intcall(0x10, &ireg, &oreg);
125 : : "ebx", "edx", "esi");
126 123
127 if (ax != 0x004f) 124 if (oreg.ax != 0x004f)
128 return -1; 125 return -1;
129 126
130 if ((vminfo.mode_attr & 0x15) == 0x05) { 127 if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode)
141 } 138 }
142 139
143 140
144 ax = 0x4f02; 141 initregs(&ireg);
145 bx = vesa_mode; 142 ireg.ax = 0x4f02;
146 di = 0; 143 ireg.bx = vesa_mode;
147 asm volatile(INT10 144 intcall(0x10, &ireg, &oreg);
148 : "+a" (ax), "+b" (bx), "+D" (di)
149 : : "ecx", "edx", "esi");
150 145
151 if (ax != 0x004f) 146 if (oreg.ax != 0x004f)
152 return -1; 147 return -1;
153 148
154 graphic_mode = is_graphic; 149 graphic_mode = is_graphic;
@@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode)
171/* Switch DAC to 8-bit mode */ 166/* Switch DAC to 8-bit mode */
172static void vesa_dac_set_8bits(void) 167static void vesa_dac_set_8bits(void)
173{ 168{
169 struct biosregs ireg, oreg;
174 u8 dac_size = 6; 170 u8 dac_size = 6;
175 171
176 /* If possible, switch the DAC to 8-bit mode */ 172 /* If possible, switch the DAC to 8-bit mode */
177 if (vginfo.capabilities & 1) { 173 if (vginfo.capabilities & 1) {
178 u16 ax, bx; 174 initregs(&ireg);
179 175 ireg.ax = 0x4f08;
180 ax = 0x4f08; 176 ireg.bh = 0x08;
181 bx = 0x0800; 177 intcall(0x10, &ireg, &oreg);
182 asm volatile(INT10 178 if (oreg.ax == 0x004f)
183 : "+a" (ax), "+b" (bx) 179 dac_size = oreg.bh;
184 : : "ecx", "edx", "esi", "edi");
185
186 if (ax == 0x004f)
187 dac_size = bx >> 8;
188 } 180 }
189 181
190 /* Set the color sizes to the DAC size, and offsets to 0 */ 182 /* Set the color sizes to the DAC size, and offsets to 0 */
191 boot_params.screen_info.red_size = dac_size; 183 boot_params.screen_info.red_size = dac_size;
192 boot_params.screen_info.green_size = dac_size; 184 boot_params.screen_info.green_size = dac_size;
193 boot_params.screen_info.blue_size = dac_size; 185 boot_params.screen_info.blue_size = dac_size;
194 boot_params.screen_info.rsvd_size = dac_size; 186 boot_params.screen_info.rsvd_size = dac_size;
195 187
196 boot_params.screen_info.red_pos = 0; 188 boot_params.screen_info.red_pos = 0;
197 boot_params.screen_info.green_pos = 0; 189 boot_params.screen_info.green_pos = 0;
198 boot_params.screen_info.blue_pos = 0; 190 boot_params.screen_info.blue_pos = 0;
199 boot_params.screen_info.rsvd_pos = 0; 191 boot_params.screen_info.rsvd_pos = 0;
200} 192}
201 193
202/* Save the VESA protected mode info */ 194/* Save the VESA protected mode info */
203static void vesa_store_pm_info(void) 195static void vesa_store_pm_info(void)
204{ 196{
205 u16 ax, bx, di, es; 197 struct biosregs ireg, oreg;
206 198
207 ax = 0x4f0a; 199 initregs(&ireg);
208 bx = di = 0; 200 ireg.ax = 0x4f0a;
209 asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es" 201 intcall(0x10, &ireg, &oreg);
210 : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di)
211 : : "ecx", "esi");
212 202
213 if (ax != 0x004f) 203 if (oreg.ax != 0x004f)
214 return; 204 return;
215 205
216 boot_params.screen_info.vesapm_seg = es; 206 boot_params.screen_info.vesapm_seg = oreg.es;
217 boot_params.screen_info.vesapm_off = di; 207 boot_params.screen_info.vesapm_off = oreg.di;
218} 208}
219 209
220/* 210/*
@@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void)
252void vesa_store_edid(void) 242void vesa_store_edid(void)
253{ 243{
254#ifdef CONFIG_FIRMWARE_EDID 244#ifdef CONFIG_FIRMWARE_EDID
255 u16 ax, bx, cx, dx, di; 245 struct biosregs ireg, oreg;
256 246
257 /* Apparently used as a nonsense token... */ 247 /* Apparently used as a nonsense token... */
258 memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info); 248 memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
@@ -260,33 +250,26 @@ void vesa_store_edid(void)
260 if (vginfo.version < 0x0200) 250 if (vginfo.version < 0x0200)
261 return; /* EDID requires VBE 2.0+ */ 251 return; /* EDID requires VBE 2.0+ */
262 252
263 ax = 0x4f15; /* VBE DDC */ 253 initregs(&ireg);
264 bx = 0x0000; /* Report DDC capabilities */ 254 ireg.ax = 0x4f15; /* VBE DDC */
265 cx = 0; /* Controller 0 */ 255 /* ireg.bx = 0x0000; */ /* Report DDC capabilities */
266 di = 0; /* ES:DI must be 0 by spec */ 256 /* ireg.cx = 0; */ /* Controller 0 */
267 257 ireg.es = 0; /* ES:DI must be 0 by spec */
268 /* Note: The VBE DDC spec is different from the main VESA spec; 258 intcall(0x10, &ireg, &oreg);
269 we genuinely have to assume all registers are destroyed here. */
270
271 asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
272 : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di)
273 : : "esi", "edx");
274 259
275 if (ax != 0x004f) 260 if (oreg.ax != 0x004f)
276 return; /* No EDID */ 261 return; /* No EDID */
277 262
278 /* BH = time in seconds to transfer EDD information */ 263 /* BH = time in seconds to transfer EDD information */
279 /* BL = DDC level supported */ 264 /* BL = DDC level supported */
280 265
281 ax = 0x4f15; /* VBE DDC */ 266 ireg.ax = 0x4f15; /* VBE DDC */
282 bx = 0x0001; /* Read EDID */ 267 ireg.bx = 0x0001; /* Read EDID */
283 cx = 0; /* Controller 0 */ 268 /* ireg.cx = 0; */ /* Controller 0 */
284 dx = 0; /* EDID block number */ 269 /* ireg.dx = 0; */ /* EDID block number */
285 di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */ 270 ireg.es = ds();
286 asm(INT10 271 ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */
287 : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info), 272 intcall(0x10, &ireg, &oreg);
288 "+c" (cx), "+D" (di)
289 : : "esi");
290#endif /* CONFIG_FIRMWARE_EDID */ 273#endif /* CONFIG_FIRMWARE_EDID */
291} 274}
292 275
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 9e0587a3776..8f8d827e254 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -39,30 +40,30 @@ static __videocard video_vga;
39/* Set basic 80x25 mode */ 40/* Set basic 80x25 mode */
40static u8 vga_set_basic_mode(void) 41static u8 vga_set_basic_mode(void)
41{ 42{
43 struct biosregs ireg, oreg;
42 u16 ax; 44 u16 ax;
43 u8 rows; 45 u8 rows;
44 u8 mode; 46 u8 mode;
45 47
48 initregs(&ireg);
49
46#ifdef CONFIG_VIDEO_400_HACK 50#ifdef CONFIG_VIDEO_400_HACK
47 if (adapter >= ADAPTER_VGA) { 51 if (adapter >= ADAPTER_VGA) {
48 asm volatile(INT10 52 ireg.ax = 0x1202;
49 : : "a" (0x1202), "b" (0x0030) 53 ireg.bx = 0x0030;
50 : "ecx", "edx", "esi", "edi"); 54 intcall(0x10, &ireg, NULL);
51 } 55 }
52#endif 56#endif
53 57
54 ax = 0x0f00; 58 ax = 0x0f00;
55 asm volatile(INT10 59 intcall(0x10, &ireg, &oreg);
56 : "+a" (ax) 60 mode = oreg.al;
57 : : "ebx", "ecx", "edx", "esi", "edi");
58
59 mode = (u8)ax;
60 61
61 set_fs(0); 62 set_fs(0);
62 rows = rdfs8(0x484); /* rows minus one */ 63 rows = rdfs8(0x484); /* rows minus one */
63 64
64#ifndef CONFIG_VIDEO_400_HACK 65#ifndef CONFIG_VIDEO_400_HACK
65 if ((ax == 0x5003 || ax == 0x5007) && 66 if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
66 (rows == 0 || rows == 24)) 67 (rows == 0 || rows == 24))
67 return mode; 68 return mode;
68#endif 69#endif
@@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void)
71 mode = 3; 72 mode = 3;
72 73
73 /* Set the mode */ 74 /* Set the mode */
74 ax = mode; 75 ireg.ax = mode; /* AH=0: set mode */
75 asm volatile(INT10 76 intcall(0x10, &ireg, NULL);
76 : "+a" (ax)
77 : : "ebx", "ecx", "edx", "esi", "edi");
78 do_restore = 1; 77 do_restore = 1;
79 return mode; 78 return mode;
80} 79}
@@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void)
82static void vga_set_8font(void) 81static void vga_set_8font(void)
83{ 82{
84 /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */ 83 /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
84 struct biosregs ireg;
85
86 initregs(&ireg);
85 87
86 /* Set 8x8 font */ 88 /* Set 8x8 font */
87 asm volatile(INT10 : : "a" (0x1112), "b" (0)); 89 ireg.ax = 0x1112;
90 /* ireg.bl = 0; */
91 intcall(0x10, &ireg, NULL);
88 92
89 /* Use alternate print screen */ 93 /* Use alternate print screen */
90 asm volatile(INT10 : : "a" (0x1200), "b" (0x20)); 94 ireg.ax = 0x1200;
95 ireg.bl = 0x20;
96 intcall(0x10, &ireg, NULL);
91 97
92 /* Turn off cursor emulation */ 98 /* Turn off cursor emulation */
93 asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); 99 ireg.ax = 0x1201;
100 ireg.bl = 0x34;
101 intcall(0x10, &ireg, NULL);
94 102
95 /* Cursor is scan lines 6-7 */ 103 /* Cursor is scan lines 6-7 */
96 asm volatile(INT10 : : "a" (0x0100), "c" (0x0607)); 104 ireg.ax = 0x0100;
105 ireg.cx = 0x0607;
106 intcall(0x10, &ireg, NULL);
97} 107}
98 108
99static void vga_set_14font(void) 109static void vga_set_14font(void)
100{ 110{
101 /* Set 9x14 font - 80x28 on VGA */ 111 /* Set 9x14 font - 80x28 on VGA */
112 struct biosregs ireg;
113
114 initregs(&ireg);
102 115
103 /* Set 9x14 font */ 116 /* Set 9x14 font */
104 asm volatile(INT10 : : "a" (0x1111), "b" (0)); 117 ireg.ax = 0x1111;
118 /* ireg.bl = 0; */
119 intcall(0x10, &ireg, NULL);
105 120
106 /* Turn off cursor emulation */ 121 /* Turn off cursor emulation */
107 asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); 122 ireg.ax = 0x1201;
123 ireg.bl = 0x34;
124 intcall(0x10, &ireg, NULL);
108 125
109 /* Cursor is scan lines 11-12 */ 126 /* Cursor is scan lines 11-12 */
110 asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c)); 127 ireg.ax = 0x0100;
128 ireg.cx = 0x0b0c;
129 intcall(0x10, &ireg, NULL);
111} 130}
112 131
113static void vga_set_80x43(void) 132static void vga_set_80x43(void)
114{ 133{
115 /* Set 80x43 mode on VGA (not EGA) */ 134 /* Set 80x43 mode on VGA (not EGA) */
135 struct biosregs ireg;
136
137 initregs(&ireg);
116 138
117 /* Set 350 scans */ 139 /* Set 350 scans */
118 asm volatile(INT10 : : "a" (0x1201), "b" (0x30)); 140 ireg.ax = 0x1201;
141 ireg.bl = 0x30;
142 intcall(0x10, &ireg, NULL);
119 143
120 /* Reset video mode */ 144 /* Reset video mode */
121 asm volatile(INT10 : : "a" (0x0003)); 145 ireg.ax = 0x0003;
146 intcall(0x10, &ireg, NULL);
122 147
123 vga_set_8font(); 148 vga_set_8font();
124} 149}
@@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode)
225 */ 250 */
226static int vga_probe(void) 251static int vga_probe(void)
227{ 252{
228 u16 ega_bx;
229
230 static const char *card_name[] = { 253 static const char *card_name[] = {
231 "CGA/MDA/HGC", "EGA", "VGA" 254 "CGA/MDA/HGC", "EGA", "VGA"
232 }; 255 };
@@ -240,26 +263,26 @@ static int vga_probe(void)
240 sizeof(ega_modes)/sizeof(struct mode_info), 263 sizeof(ega_modes)/sizeof(struct mode_info),
241 sizeof(vga_modes)/sizeof(struct mode_info), 264 sizeof(vga_modes)/sizeof(struct mode_info),
242 }; 265 };
243 u8 vga_flag;
244 266
245 asm(INT10 267 struct biosregs ireg, oreg;
246 : "=b" (ega_bx) 268
247 : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */ 269 initregs(&ireg);
248 : "ecx", "edx", "esi", "edi"); 270
271 ireg.ax = 0x1200;
272 ireg.bl = 0x10; /* Check EGA/VGA */
273 intcall(0x10, &ireg, &oreg);
249 274
250#ifndef _WAKEUP 275#ifndef _WAKEUP
251 boot_params.screen_info.orig_video_ega_bx = ega_bx; 276 boot_params.screen_info.orig_video_ega_bx = oreg.bx;
252#endif 277#endif
253 278
254 /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */ 279 /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
255 if ((u8)ega_bx != 0x10) { 280 if (oreg.bl != 0x10) {
256 /* EGA/VGA */ 281 /* EGA/VGA */
257 asm(INT10 282 ireg.ax = 0x1a00;
258 : "=a" (vga_flag) 283 intcall(0x10, &ireg, &oreg);
259 : "a" (0x1a00)
260 : "ebx", "ecx", "edx", "esi", "edi");
261 284
262 if (vga_flag == 0x1a) { 285 if (oreg.al == 0x1a) {
263 adapter = ADAPTER_VGA; 286 adapter = ADAPTER_VGA;
264#ifndef _WAKEUP 287#ifndef _WAKEUP
265 boot_params.screen_info.orig_video_isVGA = 1; 288 boot_params.screen_info.orig_video_isVGA = 1;
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 3bef2c1febe..bad728b76fc 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -18,33 +19,29 @@
18 19
19static void store_cursor_position(void) 20static void store_cursor_position(void)
20{ 21{
21 u16 curpos; 22 struct biosregs ireg, oreg;
22 u16 ax, bx;
23 23
24 ax = 0x0300; 24 initregs(&ireg);
25 bx = 0; 25 ireg.ah = 0x03;
26 asm(INT10 26 intcall(0x10, &ireg, &oreg);
27 : "=d" (curpos), "+a" (ax), "+b" (bx)
28 : : "ecx", "esi", "edi");
29 27
30 boot_params.screen_info.orig_x = curpos; 28 boot_params.screen_info.orig_x = oreg.dl;
31 boot_params.screen_info.orig_y = curpos >> 8; 29 boot_params.screen_info.orig_y = oreg.dh;
32} 30}
33 31
34static void store_video_mode(void) 32static void store_video_mode(void)
35{ 33{
36 u16 ax, page; 34 struct biosregs ireg, oreg;
37 35
38 /* N.B.: the saving of the video page here is a bit silly, 36 /* N.B.: the saving of the video page here is a bit silly,
39 since we pretty much assume page 0 everywhere. */ 37 since we pretty much assume page 0 everywhere. */
40 ax = 0x0f00; 38 initregs(&ireg);
41 asm(INT10 39 ireg.ah = 0x0f;
42 : "+a" (ax), "=b" (page) 40 intcall(0x10, &ireg, &oreg);
43 : : "ecx", "edx", "esi", "edi");
44 41
45 /* Not all BIOSes are clean with respect to the top bit */ 42 /* Not all BIOSes are clean with respect to the top bit */
46 boot_params.screen_info.orig_video_mode = ax & 0x7f; 43 boot_params.screen_info.orig_video_mode = oreg.al & 0x7f;
47 boot_params.screen_info.orig_video_page = page >> 8; 44 boot_params.screen_info.orig_video_page = oreg.bh;
48} 45}
49 46
50/* 47/*
@@ -257,7 +254,7 @@ static void restore_screen(void)
257 int y; 254 int y;
258 addr_t dst = 0; 255 addr_t dst = 0;
259 u16 *src = saved.data; 256 u16 *src = saved.data;
260 u16 ax, bx, dx; 257 struct biosregs ireg;
261 258
262 if (graphic_mode) 259 if (graphic_mode)
263 return; /* Can't restore onto a graphic mode */ 260 return; /* Can't restore onto a graphic mode */
@@ -296,12 +293,11 @@ static void restore_screen(void)
296 } 293 }
297 294
298 /* Restore cursor position */ 295 /* Restore cursor position */
299 ax = 0x0200; /* Set cursor position */ 296 initregs(&ireg);
300 bx = 0; /* Page number (<< 8) */ 297 ireg.ah = 0x02; /* Set cursor position */
301 dx = (saved.cury << 8)+saved.curx; 298 ireg.dh = saved.cury;
302 asm volatile(INT10 299 ireg.dl = saved.curx;
303 : "+a" (ax), "+b" (bx), "+d" (dx) 300 intcall(0x10, &ireg, NULL);
304 : : "ecx", "esi", "edi");
305} 301}
306#else 302#else
307#define save_screen() ((void)0) 303#define save_screen() ((void)0)
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index ee63f5d1446..5bb174a997f 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -112,20 +112,6 @@ extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
112extern int do_restore; /* Restore screen contents */ 112extern int do_restore; /* Restore screen contents */
113extern int graphic_mode; /* Graphics mode with linear frame buffer */ 113extern int graphic_mode; /* Graphics mode with linear frame buffer */
114 114
115/*
116 * int $0x10 is notorious for touching registers it shouldn't.
117 * gcc doesn't like %ebp being clobbered, so define it as a push/pop
118 * sequence here.
119 *
120 * A number of systems, including the original PC can clobber %bp in
121 * certain circumstances, like when scrolling. There exists at least
122 * one Trident video card which could clobber DS under a set of
123 * circumstances that we are unlikely to encounter (scrolling when
124 * using an extended graphics mode of more than 800x600 pixels), but
125 * it's cheap insurance to deal with that here.
126 */
127#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
128
129/* Accessing VGA indexed registers */ 115/* Accessing VGA indexed registers */
130static inline u8 in_idx(u16 port, u8 index) 116static inline u8 in_idx(u16 port, u8 index)
131{ 117{
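For reference, a minimal sketch of the biosregs/intcall() calling pattern that the hunks above adopt in place of the removed INT10 inline-asm macro, mirroring the converted store_cursor_position(); it assumes the initregs()/intcall()/struct biosregs declarations added to arch/x86/boot/boot.h and bioscall.S in this same series, and the helper name is illustrative only, not part of the patch:

#include "boot.h"	/* struct biosregs, initregs(), intcall() -- boot code only */

/* Illustrative helper: fetch the BIOS cursor position via INT 10h, AH=03h */
static void read_cursor_position_example(u8 *row, u8 *col)
{
	struct biosregs ireg, oreg;

	initregs(&ireg);		/* zero the register image, preset ds/es/fs/gs */
	ireg.ah = 0x03;			/* INT 10h, AH=03h: read cursor position       */
	intcall(0x10, &ireg, &oreg);	/* oreg may be NULL when output is not needed  */

	*row = oreg.dh;			/* BIOS returns the row in DH...               */
	*col = oreg.dl;			/* ...and the column in DL                     */
}

The same three-step shape (initregs, fill in the input registers, intcall) appears in every conversion in this patch, so the per-call clobber lists and the %ebp/%ds push/pop workaround documented in the deleted video.h comment are no longer needed at each call site.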
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 235b81d0f6f..edb992ebef9 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,12 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.29-rc4 3# Linux kernel version: 2.6.30-rc2
4# Tue Feb 24 15:50:58 2009 4# Mon May 11 16:21:55 2009
5# 5#
6# CONFIG_64BIT is not set 6# CONFIG_64BIT is not set
7CONFIG_X86_32=y 7CONFIG_X86_32=y
8# CONFIG_X86_64 is not set 8# CONFIG_X86_64 is not set
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_OUTPUT_FORMAT="elf32-i386"
10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig" 11CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
11CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
12CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
@@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
33CONFIG_ARCH_HAS_DEFAULT_IDLE=y 34CONFIG_ARCH_HAS_DEFAULT_IDLE=y
34CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y 35CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
35CONFIG_HAVE_SETUP_PER_CPU_AREA=y 36CONFIG_HAVE_SETUP_PER_CPU_AREA=y
37CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
36# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set 38# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
37CONFIG_ARCH_HIBERNATION_POSSIBLE=y 39CONFIG_ARCH_HIBERNATION_POSSIBLE=y
38CONFIG_ARCH_SUSPEND_POSSIBLE=y 40CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
40CONFIG_ARCH_POPULATES_NODE_MAP=y 42CONFIG_ARCH_POPULATES_NODE_MAP=y
41# CONFIG_AUDIT_ARCH is not set 43# CONFIG_AUDIT_ARCH is not set
42CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y 44CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
45CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
43CONFIG_GENERIC_HARDIRQS=y 46CONFIG_GENERIC_HARDIRQS=y
47CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
44CONFIG_GENERIC_IRQ_PROBE=y 48CONFIG_GENERIC_IRQ_PROBE=y
45CONFIG_GENERIC_PENDING_IRQ=y 49CONFIG_GENERIC_PENDING_IRQ=y
46CONFIG_X86_SMP=y
47CONFIG_USE_GENERIC_SMP_HELPERS=y 50CONFIG_USE_GENERIC_SMP_HELPERS=y
48CONFIG_X86_32_SMP=y 51CONFIG_X86_32_SMP=y
49CONFIG_X86_HT=y 52CONFIG_X86_HT=y
50CONFIG_X86_BIOS_REBOOT=y
51CONFIG_X86_TRAMPOLINE=y 53CONFIG_X86_TRAMPOLINE=y
54CONFIG_X86_32_LAZY_GS=y
52CONFIG_KTIME_SCALAR=y 55CONFIG_KTIME_SCALAR=y
53CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
54 57
@@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
60CONFIG_INIT_ENV_ARG_LIMIT=32 63CONFIG_INIT_ENV_ARG_LIMIT=32
61CONFIG_LOCALVERSION="" 64CONFIG_LOCALVERSION=""
62# CONFIG_LOCALVERSION_AUTO is not set 65# CONFIG_LOCALVERSION_AUTO is not set
66CONFIG_HAVE_KERNEL_GZIP=y
67CONFIG_HAVE_KERNEL_BZIP2=y
68CONFIG_HAVE_KERNEL_LZMA=y
69CONFIG_KERNEL_GZIP=y
70# CONFIG_KERNEL_BZIP2 is not set
71# CONFIG_KERNEL_LZMA is not set
63CONFIG_SWAP=y 72CONFIG_SWAP=y
64CONFIG_SYSVIPC=y 73CONFIG_SYSVIPC=y
65CONFIG_SYSVIPC_SYSCTL=y 74CONFIG_SYSVIPC_SYSCTL=y
66CONFIG_POSIX_MQUEUE=y 75CONFIG_POSIX_MQUEUE=y
76CONFIG_POSIX_MQUEUE_SYSCTL=y
67CONFIG_BSD_PROCESS_ACCT=y 77CONFIG_BSD_PROCESS_ACCT=y
68# CONFIG_BSD_PROCESS_ACCT_V3 is not set 78# CONFIG_BSD_PROCESS_ACCT_V3 is not set
69CONFIG_TASKSTATS=y 79CONFIG_TASKSTATS=y
@@ -113,23 +123,26 @@ CONFIG_PID_NS=y
113CONFIG_NET_NS=y 123CONFIG_NET_NS=y
114CONFIG_BLK_DEV_INITRD=y 124CONFIG_BLK_DEV_INITRD=y
115CONFIG_INITRAMFS_SOURCE="" 125CONFIG_INITRAMFS_SOURCE=""
126CONFIG_RD_GZIP=y
127CONFIG_RD_BZIP2=y
128CONFIG_RD_LZMA=y
116CONFIG_CC_OPTIMIZE_FOR_SIZE=y 129CONFIG_CC_OPTIMIZE_FOR_SIZE=y
117CONFIG_SYSCTL=y 130CONFIG_SYSCTL=y
131CONFIG_ANON_INODES=y
118# CONFIG_EMBEDDED is not set 132# CONFIG_EMBEDDED is not set
119CONFIG_UID16=y 133CONFIG_UID16=y
120CONFIG_SYSCTL_SYSCALL=y 134CONFIG_SYSCTL_SYSCALL=y
121CONFIG_KALLSYMS=y 135CONFIG_KALLSYMS=y
122CONFIG_KALLSYMS_ALL=y 136CONFIG_KALLSYMS_ALL=y
123CONFIG_KALLSYMS_EXTRA_PASS=y 137CONFIG_KALLSYMS_EXTRA_PASS=y
138# CONFIG_STRIP_ASM_SYMS is not set
124CONFIG_HOTPLUG=y 139CONFIG_HOTPLUG=y
125CONFIG_PRINTK=y 140CONFIG_PRINTK=y
126CONFIG_BUG=y 141CONFIG_BUG=y
127CONFIG_ELF_CORE=y 142CONFIG_ELF_CORE=y
128CONFIG_PCSPKR_PLATFORM=y 143CONFIG_PCSPKR_PLATFORM=y
129# CONFIG_COMPAT_BRK is not set
130CONFIG_BASE_FULL=y 144CONFIG_BASE_FULL=y
131CONFIG_FUTEX=y 145CONFIG_FUTEX=y
132CONFIG_ANON_INODES=y
133CONFIG_EPOLL=y 146CONFIG_EPOLL=y
134CONFIG_SIGNALFD=y 147CONFIG_SIGNALFD=y
135CONFIG_TIMERFD=y 148CONFIG_TIMERFD=y
@@ -139,6 +152,7 @@ CONFIG_AIO=y
139CONFIG_VM_EVENT_COUNTERS=y 152CONFIG_VM_EVENT_COUNTERS=y
140CONFIG_PCI_QUIRKS=y 153CONFIG_PCI_QUIRKS=y
141CONFIG_SLUB_DEBUG=y 154CONFIG_SLUB_DEBUG=y
155# CONFIG_COMPAT_BRK is not set
142# CONFIG_SLAB is not set 156# CONFIG_SLAB is not set
143CONFIG_SLUB=y 157CONFIG_SLUB=y
144# CONFIG_SLOB is not set 158# CONFIG_SLOB is not set
@@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
154CONFIG_HAVE_KPROBES=y 168CONFIG_HAVE_KPROBES=y
155CONFIG_HAVE_KRETPROBES=y 169CONFIG_HAVE_KRETPROBES=y
156CONFIG_HAVE_ARCH_TRACEHOOK=y 170CONFIG_HAVE_ARCH_TRACEHOOK=y
171CONFIG_HAVE_DMA_API_DEBUG=y
172# CONFIG_SLOW_WORK is not set
157CONFIG_HAVE_GENERIC_DMA_COHERENT=y 173CONFIG_HAVE_GENERIC_DMA_COHERENT=y
158CONFIG_SLABINFO=y 174CONFIG_SLABINFO=y
159CONFIG_RT_MUTEXES=y 175CONFIG_RT_MUTEXES=y
@@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
167CONFIG_STOP_MACHINE=y 183CONFIG_STOP_MACHINE=y
168CONFIG_BLOCK=y 184CONFIG_BLOCK=y
169# CONFIG_LBD is not set 185# CONFIG_LBD is not set
170CONFIG_BLK_DEV_IO_TRACE=y
171CONFIG_BLK_DEV_BSG=y 186CONFIG_BLK_DEV_BSG=y
172# CONFIG_BLK_DEV_INTEGRITY is not set 187# CONFIG_BLK_DEV_INTEGRITY is not set
173 188
@@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y
194CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 209CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
195CONFIG_SMP=y 210CONFIG_SMP=y
196CONFIG_SPARSE_IRQ=y 211CONFIG_SPARSE_IRQ=y
197CONFIG_X86_FIND_SMP_CONFIG=y
198CONFIG_X86_MPPARSE=y 212CONFIG_X86_MPPARSE=y
213# CONFIG_X86_BIGSMP is not set
214CONFIG_X86_EXTENDED_PLATFORM=y
199# CONFIG_X86_ELAN is not set 215# CONFIG_X86_ELAN is not set
200# CONFIG_X86_GENERICARCH is not set
201# CONFIG_X86_VSMP is not set
202# CONFIG_X86_RDC321X is not set 216# CONFIG_X86_RDC321X is not set
217# CONFIG_X86_32_NON_STANDARD is not set
203CONFIG_SCHED_OMIT_FRAME_POINTER=y 218CONFIG_SCHED_OMIT_FRAME_POINTER=y
204# CONFIG_PARAVIRT_GUEST is not set 219# CONFIG_PARAVIRT_GUEST is not set
205# CONFIG_MEMTEST is not set 220# CONFIG_MEMTEST is not set
@@ -230,8 +245,10 @@ CONFIG_M686=y
230# CONFIG_GENERIC_CPU is not set 245# CONFIG_GENERIC_CPU is not set
231CONFIG_X86_GENERIC=y 246CONFIG_X86_GENERIC=y
232CONFIG_X86_CPU=y 247CONFIG_X86_CPU=y
248CONFIG_X86_L1_CACHE_BYTES=64
249CONFIG_X86_INTERNODE_CACHE_BYTES=64
233CONFIG_X86_CMPXCHG=y 250CONFIG_X86_CMPXCHG=y
234CONFIG_X86_L1_CACHE_SHIFT=7 251CONFIG_X86_L1_CACHE_SHIFT=5
235CONFIG_X86_XADD=y 252CONFIG_X86_XADD=y
236# CONFIG_X86_PPRO_FENCE is not set 253# CONFIG_X86_PPRO_FENCE is not set
237CONFIG_X86_WP_WORKS_OK=y 254CONFIG_X86_WP_WORKS_OK=y
@@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y
247CONFIG_CPU_SUP_INTEL=y 264CONFIG_CPU_SUP_INTEL=y
248CONFIG_CPU_SUP_CYRIX_32=y 265CONFIG_CPU_SUP_CYRIX_32=y
249CONFIG_CPU_SUP_AMD=y 266CONFIG_CPU_SUP_AMD=y
250CONFIG_CPU_SUP_CENTAUR_32=y 267CONFIG_CPU_SUP_CENTAUR=y
251CONFIG_CPU_SUP_TRANSMETA_32=y 268CONFIG_CPU_SUP_TRANSMETA_32=y
252CONFIG_CPU_SUP_UMC_32=y 269CONFIG_CPU_SUP_UMC_32=y
253CONFIG_X86_DS=y 270CONFIG_X86_DS=y
@@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y
279CONFIG_MICROCODE_OLD_INTERFACE=y 296CONFIG_MICROCODE_OLD_INTERFACE=y
280CONFIG_X86_MSR=y 297CONFIG_X86_MSR=y
281CONFIG_X86_CPUID=y 298CONFIG_X86_CPUID=y
299# CONFIG_X86_CPU_DEBUG is not set
282# CONFIG_NOHIGHMEM is not set 300# CONFIG_NOHIGHMEM is not set
283CONFIG_HIGHMEM4G=y 301CONFIG_HIGHMEM4G=y
284# CONFIG_HIGHMEM64G is not set 302# CONFIG_HIGHMEM64G is not set
@@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1
302CONFIG_BOUNCE=y 320CONFIG_BOUNCE=y
303CONFIG_VIRT_TO_BUS=y 321CONFIG_VIRT_TO_BUS=y
304CONFIG_UNEVICTABLE_LRU=y 322CONFIG_UNEVICTABLE_LRU=y
323CONFIG_HAVE_MLOCK=y
324CONFIG_HAVE_MLOCKED_PAGE_BIT=y
305CONFIG_HIGHPTE=y 325CONFIG_HIGHPTE=y
306CONFIG_X86_CHECK_BIOS_CORRUPTION=y 326CONFIG_X86_CHECK_BIOS_CORRUPTION=y
307CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y 327CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
@@ -312,6 +332,7 @@ CONFIG_MTRR=y
312CONFIG_X86_PAT=y 332CONFIG_X86_PAT=y
313CONFIG_EFI=y 333CONFIG_EFI=y
314CONFIG_SECCOMP=y 334CONFIG_SECCOMP=y
335# CONFIG_CC_STACKPROTECTOR is not set
315# CONFIG_HZ_100 is not set 336# CONFIG_HZ_100 is not set
316# CONFIG_HZ_250 is not set 337# CONFIG_HZ_250 is not set
317# CONFIG_HZ_300 is not set 338# CONFIG_HZ_300 is not set
@@ -322,8 +343,9 @@ CONFIG_KEXEC=y
322CONFIG_CRASH_DUMP=y 343CONFIG_CRASH_DUMP=y
323# CONFIG_KEXEC_JUMP is not set 344# CONFIG_KEXEC_JUMP is not set
324CONFIG_PHYSICAL_START=0x1000000 345CONFIG_PHYSICAL_START=0x1000000
325# CONFIG_RELOCATABLE is not set 346CONFIG_RELOCATABLE=y
326CONFIG_PHYSICAL_ALIGN=0x200000 347CONFIG_X86_NEED_RELOCS=y
348CONFIG_PHYSICAL_ALIGN=0x1000000
327CONFIG_HOTPLUG_CPU=y 349CONFIG_HOTPLUG_CPU=y
328# CONFIG_COMPAT_VDSO is not set 350# CONFIG_COMPAT_VDSO is not set
329# CONFIG_CMDLINE_BOOL is not set 351# CONFIG_CMDLINE_BOOL is not set
@@ -363,7 +385,6 @@ CONFIG_ACPI_THERMAL=y
363CONFIG_ACPI_BLACKLIST_YEAR=0 385CONFIG_ACPI_BLACKLIST_YEAR=0
364# CONFIG_ACPI_DEBUG is not set 386# CONFIG_ACPI_DEBUG is not set
365# CONFIG_ACPI_PCI_SLOT is not set 387# CONFIG_ACPI_PCI_SLOT is not set
366CONFIG_ACPI_SYSTEM=y
367CONFIG_X86_PM_TIMER=y 388CONFIG_X86_PM_TIMER=y
368CONFIG_ACPI_CONTAINER=y 389CONFIG_ACPI_CONTAINER=y
369# CONFIG_ACPI_SBS is not set 390# CONFIG_ACPI_SBS is not set
@@ -425,6 +446,7 @@ CONFIG_PCI_BIOS=y
425CONFIG_PCI_DIRECT=y 446CONFIG_PCI_DIRECT=y
426CONFIG_PCI_MMCONFIG=y 447CONFIG_PCI_MMCONFIG=y
427CONFIG_PCI_DOMAINS=y 448CONFIG_PCI_DOMAINS=y
449# CONFIG_DMAR is not set
428CONFIG_PCIEPORTBUS=y 450CONFIG_PCIEPORTBUS=y
429# CONFIG_HOTPLUG_PCI_PCIE is not set 451# CONFIG_HOTPLUG_PCI_PCIE is not set
430CONFIG_PCIEAER=y 452CONFIG_PCIEAER=y
@@ -435,6 +457,7 @@ CONFIG_PCI_MSI=y
435# CONFIG_PCI_DEBUG is not set 457# CONFIG_PCI_DEBUG is not set
436# CONFIG_PCI_STUB is not set 458# CONFIG_PCI_STUB is not set
437CONFIG_HT_IRQ=y 459CONFIG_HT_IRQ=y
460# CONFIG_PCI_IOV is not set
438CONFIG_ISA_DMA_API=y 461CONFIG_ISA_DMA_API=y
439# CONFIG_ISA is not set 462# CONFIG_ISA is not set
440# CONFIG_MCA is not set 463# CONFIG_MCA is not set
@@ -481,7 +504,6 @@ CONFIG_NET=y
481# 504#
482# Networking options 505# Networking options
483# 506#
484CONFIG_COMPAT_NET_DEV_OPS=y
485CONFIG_PACKET=y 507CONFIG_PACKET=y
486CONFIG_PACKET_MMAP=y 508CONFIG_PACKET_MMAP=y
487CONFIG_UNIX=y 509CONFIG_UNIX=y
@@ -639,6 +661,7 @@ CONFIG_LLC=y
639# CONFIG_LAPB is not set 661# CONFIG_LAPB is not set
640# CONFIG_ECONET is not set 662# CONFIG_ECONET is not set
641# CONFIG_WAN_ROUTER is not set 663# CONFIG_WAN_ROUTER is not set
664# CONFIG_PHONET is not set
642CONFIG_NET_SCHED=y 665CONFIG_NET_SCHED=y
643 666
644# 667#
@@ -696,6 +719,7 @@ CONFIG_NET_SCH_FIFO=y
696# 719#
697# CONFIG_NET_PKTGEN is not set 720# CONFIG_NET_PKTGEN is not set
698# CONFIG_NET_TCPPROBE is not set 721# CONFIG_NET_TCPPROBE is not set
722# CONFIG_NET_DROP_MONITOR is not set
699CONFIG_HAMRADIO=y 723CONFIG_HAMRADIO=y
700 724
701# 725#
@@ -706,12 +730,10 @@ CONFIG_HAMRADIO=y
706# CONFIG_IRDA is not set 730# CONFIG_IRDA is not set
707# CONFIG_BT is not set 731# CONFIG_BT is not set
708# CONFIG_AF_RXRPC is not set 732# CONFIG_AF_RXRPC is not set
709# CONFIG_PHONET is not set
710CONFIG_FIB_RULES=y 733CONFIG_FIB_RULES=y
711CONFIG_WIRELESS=y 734CONFIG_WIRELESS=y
712CONFIG_CFG80211=y 735CONFIG_CFG80211=y
713# CONFIG_CFG80211_REG_DEBUG is not set 736# CONFIG_CFG80211_REG_DEBUG is not set
714CONFIG_NL80211=y
715CONFIG_WIRELESS_OLD_REGULATORY=y 737CONFIG_WIRELESS_OLD_REGULATORY=y
716CONFIG_WIRELESS_EXT=y 738CONFIG_WIRELESS_EXT=y
717CONFIG_WIRELESS_EXT_SYSFS=y 739CONFIG_WIRELESS_EXT_SYSFS=y
@@ -789,6 +811,7 @@ CONFIG_MISC_DEVICES=y
789# CONFIG_ICS932S401 is not set 811# CONFIG_ICS932S401 is not set
790# CONFIG_ENCLOSURE_SERVICES is not set 812# CONFIG_ENCLOSURE_SERVICES is not set
791# CONFIG_HP_ILO is not set 813# CONFIG_HP_ILO is not set
814# CONFIG_ISL29003 is not set
792# CONFIG_C2PORT is not set 815# CONFIG_C2PORT is not set
793 816
794# 817#
@@ -842,6 +865,7 @@ CONFIG_SCSI_SPI_ATTRS=y
842# CONFIG_SCSI_LOWLEVEL is not set 865# CONFIG_SCSI_LOWLEVEL is not set
843# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set 866# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
844# CONFIG_SCSI_DH is not set 867# CONFIG_SCSI_DH is not set
868# CONFIG_SCSI_OSD_INITIATOR is not set
845CONFIG_ATA=y 869CONFIG_ATA=y
846# CONFIG_ATA_NONSTANDARD is not set 870# CONFIG_ATA_NONSTANDARD is not set
847CONFIG_ATA_ACPI=y 871CONFIG_ATA_ACPI=y
@@ -940,6 +964,7 @@ CONFIG_DM_ZERO=y
940CONFIG_MACINTOSH_DRIVERS=y 964CONFIG_MACINTOSH_DRIVERS=y
941CONFIG_MAC_EMUMOUSEBTN=y 965CONFIG_MAC_EMUMOUSEBTN=y
942CONFIG_NETDEVICES=y 966CONFIG_NETDEVICES=y
967CONFIG_COMPAT_NET_DEV_OPS=y
943# CONFIG_IFB is not set 968# CONFIG_IFB is not set
944# CONFIG_DUMMY is not set 969# CONFIG_DUMMY is not set
945# CONFIG_BONDING is not set 970# CONFIG_BONDING is not set
@@ -977,6 +1002,8 @@ CONFIG_MII=y
977CONFIG_NET_VENDOR_3COM=y 1002CONFIG_NET_VENDOR_3COM=y
978# CONFIG_VORTEX is not set 1003# CONFIG_VORTEX is not set
979# CONFIG_TYPHOON is not set 1004# CONFIG_TYPHOON is not set
1005# CONFIG_ETHOC is not set
1006# CONFIG_DNET is not set
980CONFIG_NET_TULIP=y 1007CONFIG_NET_TULIP=y
981# CONFIG_DE2104X is not set 1008# CONFIG_DE2104X is not set
982# CONFIG_TULIP is not set 1009# CONFIG_TULIP is not set
@@ -1026,6 +1053,7 @@ CONFIG_E1000=y
1026CONFIG_E1000E=y 1053CONFIG_E1000E=y
1027# CONFIG_IP1000 is not set 1054# CONFIG_IP1000 is not set
1028# CONFIG_IGB is not set 1055# CONFIG_IGB is not set
1056# CONFIG_IGBVF is not set
1029# CONFIG_NS83820 is not set 1057# CONFIG_NS83820 is not set
1030# CONFIG_HAMACHI is not set 1058# CONFIG_HAMACHI is not set
1031# CONFIG_YELLOWFIN is not set 1059# CONFIG_YELLOWFIN is not set
@@ -1040,6 +1068,7 @@ CONFIG_BNX2=y
1040# CONFIG_QLA3XXX is not set 1068# CONFIG_QLA3XXX is not set
1041# CONFIG_ATL1 is not set 1069# CONFIG_ATL1 is not set
1042# CONFIG_ATL1E is not set 1070# CONFIG_ATL1E is not set
1071# CONFIG_ATL1C is not set
1043# CONFIG_JME is not set 1072# CONFIG_JME is not set
1044CONFIG_NETDEV_10000=y 1073CONFIG_NETDEV_10000=y
1045# CONFIG_CHELSIO_T1 is not set 1074# CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1078,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1049# CONFIG_IXGBE is not set 1078# CONFIG_IXGBE is not set
1050# CONFIG_IXGB is not set 1079# CONFIG_IXGB is not set
1051# CONFIG_S2IO is not set 1080# CONFIG_S2IO is not set
1081# CONFIG_VXGE is not set
1052# CONFIG_MYRI10GE is not set 1082# CONFIG_MYRI10GE is not set
1053# CONFIG_NETXEN_NIC is not set 1083# CONFIG_NETXEN_NIC is not set
1054# CONFIG_NIU is not set 1084# CONFIG_NIU is not set
@@ -1058,6 +1088,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1058# CONFIG_BNX2X is not set 1088# CONFIG_BNX2X is not set
1059# CONFIG_QLGE is not set 1089# CONFIG_QLGE is not set
1060# CONFIG_SFC is not set 1090# CONFIG_SFC is not set
1091# CONFIG_BE2NET is not set
1061CONFIG_TR=y 1092CONFIG_TR=y
1062# CONFIG_IBMOL is not set 1093# CONFIG_IBMOL is not set
1063# CONFIG_IBMLS is not set 1094# CONFIG_IBMLS is not set
@@ -1073,8 +1104,8 @@ CONFIG_WLAN_80211=y
1073# CONFIG_LIBERTAS is not set 1104# CONFIG_LIBERTAS is not set
1074# CONFIG_LIBERTAS_THINFIRM is not set 1105# CONFIG_LIBERTAS_THINFIRM is not set
1075# CONFIG_AIRO is not set 1106# CONFIG_AIRO is not set
1076# CONFIG_HERMES is not set
1077# CONFIG_ATMEL is not set 1107# CONFIG_ATMEL is not set
1108# CONFIG_AT76C50X_USB is not set
1078# CONFIG_AIRO_CS is not set 1109# CONFIG_AIRO_CS is not set
1079# CONFIG_PCMCIA_WL3501 is not set 1110# CONFIG_PCMCIA_WL3501 is not set
1080# CONFIG_PRISM54 is not set 1111# CONFIG_PRISM54 is not set
@@ -1084,21 +1115,21 @@ CONFIG_WLAN_80211=y
1084# CONFIG_RTL8187 is not set 1115# CONFIG_RTL8187 is not set
1085# CONFIG_ADM8211 is not set 1116# CONFIG_ADM8211 is not set
1086# CONFIG_MAC80211_HWSIM is not set 1117# CONFIG_MAC80211_HWSIM is not set
1118# CONFIG_MWL8K is not set
1087# CONFIG_P54_COMMON is not set 1119# CONFIG_P54_COMMON is not set
1088CONFIG_ATH5K=y 1120CONFIG_ATH5K=y
1089# CONFIG_ATH5K_DEBUG is not set 1121# CONFIG_ATH5K_DEBUG is not set
1090# CONFIG_ATH9K is not set 1122# CONFIG_ATH9K is not set
1123# CONFIG_AR9170_USB is not set
1091# CONFIG_IPW2100 is not set 1124# CONFIG_IPW2100 is not set
1092# CONFIG_IPW2200 is not set 1125# CONFIG_IPW2200 is not set
1093# CONFIG_IWLCORE is not set 1126# CONFIG_IWLWIFI is not set
1094# CONFIG_IWLWIFI_LEDS is not set
1095# CONFIG_IWLAGN is not set
1096# CONFIG_IWL3945 is not set
1097# CONFIG_HOSTAP is not set 1127# CONFIG_HOSTAP is not set
1098# CONFIG_B43 is not set 1128# CONFIG_B43 is not set
1099# CONFIG_B43LEGACY is not set 1129# CONFIG_B43LEGACY is not set
1100# CONFIG_ZD1211RW is not set 1130# CONFIG_ZD1211RW is not set
1101# CONFIG_RT2X00 is not set 1131# CONFIG_RT2X00 is not set
1132# CONFIG_HERMES is not set
1102 1133
1103# 1134#
1104# Enable WiMAX (Networking options) to see the WiMAX drivers 1135# Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1209,6 +1240,8 @@ CONFIG_INPUT_TABLET=y
1209# CONFIG_TABLET_USB_KBTAB is not set 1240# CONFIG_TABLET_USB_KBTAB is not set
1210# CONFIG_TABLET_USB_WACOM is not set 1241# CONFIG_TABLET_USB_WACOM is not set
1211CONFIG_INPUT_TOUCHSCREEN=y 1242CONFIG_INPUT_TOUCHSCREEN=y
1243# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
1244# CONFIG_TOUCHSCREEN_AD7879 is not set
1212# CONFIG_TOUCHSCREEN_FUJITSU is not set 1245# CONFIG_TOUCHSCREEN_FUJITSU is not set
1213# CONFIG_TOUCHSCREEN_GUNZE is not set 1246# CONFIG_TOUCHSCREEN_GUNZE is not set
1214# CONFIG_TOUCHSCREEN_ELO is not set 1247# CONFIG_TOUCHSCREEN_ELO is not set
@@ -1303,6 +1336,7 @@ CONFIG_UNIX98_PTYS=y
1303# CONFIG_LEGACY_PTYS is not set 1336# CONFIG_LEGACY_PTYS is not set
1304# CONFIG_IPMI_HANDLER is not set 1337# CONFIG_IPMI_HANDLER is not set
1305CONFIG_HW_RANDOM=y 1338CONFIG_HW_RANDOM=y
1339# CONFIG_HW_RANDOM_TIMERIOMEM is not set
1306CONFIG_HW_RANDOM_INTEL=y 1340CONFIG_HW_RANDOM_INTEL=y
1307CONFIG_HW_RANDOM_AMD=y 1341CONFIG_HW_RANDOM_AMD=y
1308CONFIG_HW_RANDOM_GEODE=y 1342CONFIG_HW_RANDOM_GEODE=y
@@ -1390,7 +1424,6 @@ CONFIG_I2C_I801=y
1390# CONFIG_SENSORS_PCF8574 is not set 1424# CONFIG_SENSORS_PCF8574 is not set
1391# CONFIG_PCF8575 is not set 1425# CONFIG_PCF8575 is not set
1392# CONFIG_SENSORS_PCA9539 is not set 1426# CONFIG_SENSORS_PCA9539 is not set
1393# CONFIG_SENSORS_PCF8591 is not set
1394# CONFIG_SENSORS_MAX6875 is not set 1427# CONFIG_SENSORS_MAX6875 is not set
1395# CONFIG_SENSORS_TSL2550 is not set 1428# CONFIG_SENSORS_TSL2550 is not set
1396# CONFIG_I2C_DEBUG_CORE is not set 1429# CONFIG_I2C_DEBUG_CORE is not set
@@ -1424,6 +1457,7 @@ CONFIG_HWMON=y
1424# CONFIG_SENSORS_ADT7475 is not set 1457# CONFIG_SENSORS_ADT7475 is not set
1425# CONFIG_SENSORS_K8TEMP is not set 1458# CONFIG_SENSORS_K8TEMP is not set
1426# CONFIG_SENSORS_ASB100 is not set 1459# CONFIG_SENSORS_ASB100 is not set
1460# CONFIG_SENSORS_ATK0110 is not set
1427# CONFIG_SENSORS_ATXP1 is not set 1461# CONFIG_SENSORS_ATXP1 is not set
1428# CONFIG_SENSORS_DS1621 is not set 1462# CONFIG_SENSORS_DS1621 is not set
1429# CONFIG_SENSORS_I5K_AMB is not set 1463# CONFIG_SENSORS_I5K_AMB is not set
@@ -1433,6 +1467,7 @@ CONFIG_HWMON=y
1433# CONFIG_SENSORS_FSCHER is not set 1467# CONFIG_SENSORS_FSCHER is not set
1434# CONFIG_SENSORS_FSCPOS is not set 1468# CONFIG_SENSORS_FSCPOS is not set
1435# CONFIG_SENSORS_FSCHMD is not set 1469# CONFIG_SENSORS_FSCHMD is not set
1470# CONFIG_SENSORS_G760A is not set
1436# CONFIG_SENSORS_GL518SM is not set 1471# CONFIG_SENSORS_GL518SM is not set
1437# CONFIG_SENSORS_GL520SM is not set 1472# CONFIG_SENSORS_GL520SM is not set
1438# CONFIG_SENSORS_CORETEMP is not set 1473# CONFIG_SENSORS_CORETEMP is not set
@@ -1448,11 +1483,14 @@ CONFIG_HWMON=y
1448# CONFIG_SENSORS_LM90 is not set 1483# CONFIG_SENSORS_LM90 is not set
1449# CONFIG_SENSORS_LM92 is not set 1484# CONFIG_SENSORS_LM92 is not set
1450# CONFIG_SENSORS_LM93 is not set 1485# CONFIG_SENSORS_LM93 is not set
1486# CONFIG_SENSORS_LTC4215 is not set
1451# CONFIG_SENSORS_LTC4245 is not set 1487# CONFIG_SENSORS_LTC4245 is not set
1488# CONFIG_SENSORS_LM95241 is not set
1452# CONFIG_SENSORS_MAX1619 is not set 1489# CONFIG_SENSORS_MAX1619 is not set
1453# CONFIG_SENSORS_MAX6650 is not set 1490# CONFIG_SENSORS_MAX6650 is not set
1454# CONFIG_SENSORS_PC87360 is not set 1491# CONFIG_SENSORS_PC87360 is not set
1455# CONFIG_SENSORS_PC87427 is not set 1492# CONFIG_SENSORS_PC87427 is not set
1493# CONFIG_SENSORS_PCF8591 is not set
1456# CONFIG_SENSORS_SIS5595 is not set 1494# CONFIG_SENSORS_SIS5595 is not set
1457# CONFIG_SENSORS_DME1737 is not set 1495# CONFIG_SENSORS_DME1737 is not set
1458# CONFIG_SENSORS_SMSC47M1 is not set 1496# CONFIG_SENSORS_SMSC47M1 is not set
@@ -1643,7 +1681,6 @@ CONFIG_FB_EFI=y
1643# CONFIG_FB_3DFX is not set 1681# CONFIG_FB_3DFX is not set
1644# CONFIG_FB_VOODOO1 is not set 1682# CONFIG_FB_VOODOO1 is not set
1645# CONFIG_FB_VT8623 is not set 1683# CONFIG_FB_VT8623 is not set
1646# CONFIG_FB_CYBLA is not set
1647# CONFIG_FB_TRIDENT is not set 1684# CONFIG_FB_TRIDENT is not set
1648# CONFIG_FB_ARK is not set 1685# CONFIG_FB_ARK is not set
1649# CONFIG_FB_PM3 is not set 1686# CONFIG_FB_PM3 is not set
@@ -1652,6 +1689,7 @@ CONFIG_FB_EFI=y
1652# CONFIG_FB_VIRTUAL is not set 1689# CONFIG_FB_VIRTUAL is not set
1653# CONFIG_FB_METRONOME is not set 1690# CONFIG_FB_METRONOME is not set
1654# CONFIG_FB_MB862XX is not set 1691# CONFIG_FB_MB862XX is not set
1692# CONFIG_FB_BROADSHEET is not set
1655CONFIG_BACKLIGHT_LCD_SUPPORT=y 1693CONFIG_BACKLIGHT_LCD_SUPPORT=y
1656# CONFIG_LCD_CLASS_DEVICE is not set 1694# CONFIG_LCD_CLASS_DEVICE is not set
1657CONFIG_BACKLIGHT_CLASS_DEVICE=y 1695CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1738,6 +1776,8 @@ CONFIG_SND_PCI=y
1738# CONFIG_SND_INDIGO is not set 1776# CONFIG_SND_INDIGO is not set
1739# CONFIG_SND_INDIGOIO is not set 1777# CONFIG_SND_INDIGOIO is not set
1740# CONFIG_SND_INDIGODJ is not set 1778# CONFIG_SND_INDIGODJ is not set
1779# CONFIG_SND_INDIGOIOX is not set
1780# CONFIG_SND_INDIGODJX is not set
1741# CONFIG_SND_EMU10K1 is not set 1781# CONFIG_SND_EMU10K1 is not set
1742# CONFIG_SND_EMU10K1X is not set 1782# CONFIG_SND_EMU10K1X is not set
1743# CONFIG_SND_ENS1370 is not set 1783# CONFIG_SND_ENS1370 is not set
@@ -1811,15 +1851,17 @@ CONFIG_USB_HIDDEV=y
1811# 1851#
1812# Special HID drivers 1852# Special HID drivers
1813# 1853#
1814CONFIG_HID_COMPAT=y
1815CONFIG_HID_A4TECH=y 1854CONFIG_HID_A4TECH=y
1816CONFIG_HID_APPLE=y 1855CONFIG_HID_APPLE=y
1817CONFIG_HID_BELKIN=y 1856CONFIG_HID_BELKIN=y
1818CONFIG_HID_CHERRY=y 1857CONFIG_HID_CHERRY=y
1819CONFIG_HID_CHICONY=y 1858CONFIG_HID_CHICONY=y
1820CONFIG_HID_CYPRESS=y 1859CONFIG_HID_CYPRESS=y
1860# CONFIG_DRAGONRISE_FF is not set
1821CONFIG_HID_EZKEY=y 1861CONFIG_HID_EZKEY=y
1862CONFIG_HID_KYE=y
1822CONFIG_HID_GYRATION=y 1863CONFIG_HID_GYRATION=y
1864CONFIG_HID_KENSINGTON=y
1823CONFIG_HID_LOGITECH=y 1865CONFIG_HID_LOGITECH=y
1824CONFIG_LOGITECH_FF=y 1866CONFIG_LOGITECH_FF=y
1825# CONFIG_LOGIRUMBLEPAD2_FF is not set 1867# CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1885,11 +1927,11 @@ CONFIG_USB_PRINTER=y
1885# CONFIG_USB_TMC is not set 1927# CONFIG_USB_TMC is not set
1886 1928
1887# 1929#
1888# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; 1930# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
1889# 1931#
1890 1932
1891# 1933#
1892# see USB_STORAGE Help for more information 1934# also be needed; see USB_STORAGE Help for more info
1893# 1935#
1894CONFIG_USB_STORAGE=y 1936CONFIG_USB_STORAGE=y
1895# CONFIG_USB_STORAGE_DEBUG is not set 1937# CONFIG_USB_STORAGE_DEBUG is not set
@@ -1931,7 +1973,6 @@ CONFIG_USB_LIBUSUAL=y
1931# CONFIG_USB_LED is not set 1973# CONFIG_USB_LED is not set
1932# CONFIG_USB_CYPRESS_CY7C63 is not set 1974# CONFIG_USB_CYPRESS_CY7C63 is not set
1933# CONFIG_USB_CYTHERM is not set 1975# CONFIG_USB_CYTHERM is not set
1934# CONFIG_USB_PHIDGET is not set
1935# CONFIG_USB_IDMOUSE is not set 1976# CONFIG_USB_IDMOUSE is not set
1936# CONFIG_USB_FTDI_ELAN is not set 1977# CONFIG_USB_FTDI_ELAN is not set
1937# CONFIG_USB_APPLEDISPLAY is not set 1978# CONFIG_USB_APPLEDISPLAY is not set
@@ -1947,6 +1988,7 @@ CONFIG_USB_LIBUSUAL=y
1947# 1988#
1948# OTG and related infrastructure 1989# OTG and related infrastructure
1949# 1990#
1991# CONFIG_NOP_USB_XCEIV is not set
1950# CONFIG_UWB is not set 1992# CONFIG_UWB is not set
1951# CONFIG_MMC is not set 1993# CONFIG_MMC is not set
1952# CONFIG_MEMSTICK is not set 1994# CONFIG_MEMSTICK is not set
@@ -1958,8 +2000,10 @@ CONFIG_LEDS_CLASS=y
1958# 2000#
1959# CONFIG_LEDS_ALIX2 is not set 2001# CONFIG_LEDS_ALIX2 is not set
1960# CONFIG_LEDS_PCA9532 is not set 2002# CONFIG_LEDS_PCA9532 is not set
2003# CONFIG_LEDS_LP5521 is not set
1961# CONFIG_LEDS_CLEVO_MAIL is not set 2004# CONFIG_LEDS_CLEVO_MAIL is not set
1962# CONFIG_LEDS_PCA955X is not set 2005# CONFIG_LEDS_PCA955X is not set
2006# CONFIG_LEDS_BD2802 is not set
1963 2007
1964# 2008#
1965# LED Triggers 2009# LED Triggers
@@ -1969,6 +2013,10 @@ CONFIG_LEDS_TRIGGERS=y
1969# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set 2013# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1970# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set 2014# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
1971# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set 2015# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
2016
2017#
2018# iptables trigger is under Netfilter config (LED target)
2019#
1972# CONFIG_ACCESSIBILITY is not set 2020# CONFIG_ACCESSIBILITY is not set
1973# CONFIG_INFINIBAND is not set 2021# CONFIG_INFINIBAND is not set
1974CONFIG_EDAC=y 2022CONFIG_EDAC=y
@@ -2037,6 +2085,7 @@ CONFIG_DMADEVICES=y
2037# DMA Devices 2085# DMA Devices
2038# 2086#
2039# CONFIG_INTEL_IOATDMA is not set 2087# CONFIG_INTEL_IOATDMA is not set
2088# CONFIG_AUXDISPLAY is not set
2040# CONFIG_UIO is not set 2089# CONFIG_UIO is not set
2041# CONFIG_STAGING is not set 2090# CONFIG_STAGING is not set
2042CONFIG_X86_PLATFORM_DEVICES=y 2091CONFIG_X86_PLATFORM_DEVICES=y
@@ -2071,6 +2120,7 @@ CONFIG_DMIID=y
2071# 2120#
2072# CONFIG_EXT2_FS is not set 2121# CONFIG_EXT2_FS is not set
2073CONFIG_EXT3_FS=y 2122CONFIG_EXT3_FS=y
2123# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
2074CONFIG_EXT3_FS_XATTR=y 2124CONFIG_EXT3_FS_XATTR=y
2075CONFIG_EXT3_FS_POSIX_ACL=y 2125CONFIG_EXT3_FS_POSIX_ACL=y
2076CONFIG_EXT3_FS_SECURITY=y 2126CONFIG_EXT3_FS_SECURITY=y
@@ -2101,6 +2151,11 @@ CONFIG_AUTOFS4_FS=y
2101CONFIG_GENERIC_ACL=y 2151CONFIG_GENERIC_ACL=y
2102 2152
2103# 2153#
2154# Caches
2155#
2156# CONFIG_FSCACHE is not set
2157
2158#
2104# CD-ROM/DVD Filesystems 2159# CD-ROM/DVD Filesystems
2105# 2160#
2106CONFIG_ISO9660_FS=y 2161CONFIG_ISO9660_FS=y
@@ -2151,6 +2206,7 @@ CONFIG_MISC_FILESYSTEMS=y
2151# CONFIG_ROMFS_FS is not set 2206# CONFIG_ROMFS_FS is not set
2152# CONFIG_SYSV_FS is not set 2207# CONFIG_SYSV_FS is not set
2153# CONFIG_UFS_FS is not set 2208# CONFIG_UFS_FS is not set
2209# CONFIG_NILFS2_FS is not set
2154CONFIG_NETWORK_FILESYSTEMS=y 2210CONFIG_NETWORK_FILESYSTEMS=y
2155CONFIG_NFS_FS=y 2211CONFIG_NFS_FS=y
2156CONFIG_NFS_V3=y 2212CONFIG_NFS_V3=y
@@ -2164,7 +2220,6 @@ CONFIG_NFS_ACL_SUPPORT=y
2164CONFIG_NFS_COMMON=y 2220CONFIG_NFS_COMMON=y
2165CONFIG_SUNRPC=y 2221CONFIG_SUNRPC=y
2166CONFIG_SUNRPC_GSS=y 2222CONFIG_SUNRPC_GSS=y
2167# CONFIG_SUNRPC_REGISTER_V4 is not set
2168CONFIG_RPCSEC_GSS_KRB5=y 2223CONFIG_RPCSEC_GSS_KRB5=y
2169# CONFIG_RPCSEC_GSS_SPKM3 is not set 2224# CONFIG_RPCSEC_GSS_SPKM3 is not set
2170# CONFIG_SMB_FS is not set 2225# CONFIG_SMB_FS is not set
@@ -2251,6 +2306,7 @@ CONFIG_DEBUG_FS=y
2251CONFIG_DEBUG_KERNEL=y 2306CONFIG_DEBUG_KERNEL=y
2252# CONFIG_DEBUG_SHIRQ is not set 2307# CONFIG_DEBUG_SHIRQ is not set
2253# CONFIG_DETECT_SOFTLOCKUP is not set 2308# CONFIG_DETECT_SOFTLOCKUP is not set
2309# CONFIG_DETECT_HUNG_TASK is not set
2254# CONFIG_SCHED_DEBUG is not set 2310# CONFIG_SCHED_DEBUG is not set
2255CONFIG_SCHEDSTATS=y 2311CONFIG_SCHEDSTATS=y
2256CONFIG_TIMER_STATS=y 2312CONFIG_TIMER_STATS=y
@@ -2266,6 +2322,7 @@ CONFIG_TIMER_STATS=y
2266# CONFIG_LOCK_STAT is not set 2322# CONFIG_LOCK_STAT is not set
2267# CONFIG_DEBUG_SPINLOCK_SLEEP is not set 2323# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
2268# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set 2324# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
2325CONFIG_STACKTRACE=y
2269# CONFIG_DEBUG_KOBJECT is not set 2326# CONFIG_DEBUG_KOBJECT is not set
2270# CONFIG_DEBUG_HIGHMEM is not set 2327# CONFIG_DEBUG_HIGHMEM is not set
2271CONFIG_DEBUG_BUGVERBOSE=y 2328CONFIG_DEBUG_BUGVERBOSE=y
@@ -2289,13 +2346,19 @@ CONFIG_FRAME_POINTER=y
2289# CONFIG_FAULT_INJECTION is not set 2346# CONFIG_FAULT_INJECTION is not set
2290# CONFIG_LATENCYTOP is not set 2347# CONFIG_LATENCYTOP is not set
2291CONFIG_SYSCTL_SYSCALL_CHECK=y 2348CONFIG_SYSCTL_SYSCALL_CHECK=y
2349# CONFIG_DEBUG_PAGEALLOC is not set
2292CONFIG_USER_STACKTRACE_SUPPORT=y 2350CONFIG_USER_STACKTRACE_SUPPORT=y
2351CONFIG_NOP_TRACER=y
2293CONFIG_HAVE_FUNCTION_TRACER=y 2352CONFIG_HAVE_FUNCTION_TRACER=y
2294CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y 2353CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
2295CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y 2354CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2296CONFIG_HAVE_DYNAMIC_FTRACE=y 2355CONFIG_HAVE_DYNAMIC_FTRACE=y
2297CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2356CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2298CONFIG_HAVE_HW_BRANCH_TRACER=y 2357CONFIG_HAVE_HW_BRANCH_TRACER=y
2358CONFIG_HAVE_FTRACE_SYSCALLS=y
2359CONFIG_RING_BUFFER=y
2360CONFIG_TRACING=y
2361CONFIG_TRACING_SUPPORT=y
2299 2362
2300# 2363#
2301# Tracers 2364# Tracers
@@ -2305,13 +2368,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
2305# CONFIG_SYSPROF_TRACER is not set 2368# CONFIG_SYSPROF_TRACER is not set
2306# CONFIG_SCHED_TRACER is not set 2369# CONFIG_SCHED_TRACER is not set
2307# CONFIG_CONTEXT_SWITCH_TRACER is not set 2370# CONFIG_CONTEXT_SWITCH_TRACER is not set
2371# CONFIG_EVENT_TRACER is not set
2372# CONFIG_FTRACE_SYSCALLS is not set
2308# CONFIG_BOOT_TRACER is not set 2373# CONFIG_BOOT_TRACER is not set
2309# CONFIG_TRACE_BRANCH_PROFILING is not set 2374# CONFIG_TRACE_BRANCH_PROFILING is not set
2310# CONFIG_POWER_TRACER is not set 2375# CONFIG_POWER_TRACER is not set
2311# CONFIG_STACK_TRACER is not set 2376# CONFIG_STACK_TRACER is not set
2312# CONFIG_HW_BRANCH_TRACER is not set 2377# CONFIG_HW_BRANCH_TRACER is not set
2378# CONFIG_KMEMTRACE is not set
2379# CONFIG_WORKQUEUE_TRACER is not set
2380CONFIG_BLK_DEV_IO_TRACE=y
2381# CONFIG_FTRACE_STARTUP_TEST is not set
2382# CONFIG_MMIOTRACE is not set
2313CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2383CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2314# CONFIG_DYNAMIC_PRINTK_DEBUG is not set 2384# CONFIG_DYNAMIC_DEBUG is not set
2385# CONFIG_DMA_API_DEBUG is not set
2315# CONFIG_SAMPLES is not set 2386# CONFIG_SAMPLES is not set
2316CONFIG_HAVE_ARCH_KGDB=y 2387CONFIG_HAVE_ARCH_KGDB=y
2317# CONFIG_KGDB is not set 2388# CONFIG_KGDB is not set
@@ -2321,7 +2392,6 @@ CONFIG_EARLY_PRINTK=y
2321CONFIG_EARLY_PRINTK_DBGP=y 2392CONFIG_EARLY_PRINTK_DBGP=y
2322CONFIG_DEBUG_STACKOVERFLOW=y 2393CONFIG_DEBUG_STACKOVERFLOW=y
2323CONFIG_DEBUG_STACK_USAGE=y 2394CONFIG_DEBUG_STACK_USAGE=y
2324# CONFIG_DEBUG_PAGEALLOC is not set
2325# CONFIG_DEBUG_PER_CPU_MAPS is not set 2395# CONFIG_DEBUG_PER_CPU_MAPS is not set
2326# CONFIG_X86_PTDUMP is not set 2396# CONFIG_X86_PTDUMP is not set
2327CONFIG_DEBUG_RODATA=y 2397CONFIG_DEBUG_RODATA=y
@@ -2329,7 +2399,7 @@ CONFIG_DEBUG_RODATA=y
2329CONFIG_DEBUG_NX_TEST=m 2399CONFIG_DEBUG_NX_TEST=m
2330# CONFIG_4KSTACKS is not set 2400# CONFIG_4KSTACKS is not set
2331CONFIG_DOUBLEFAULT=y 2401CONFIG_DOUBLEFAULT=y
2332# CONFIG_MMIOTRACE is not set 2402CONFIG_HAVE_MMIOTRACE_SUPPORT=y
2333CONFIG_IO_DELAY_TYPE_0X80=0 2403CONFIG_IO_DELAY_TYPE_0X80=0
2334CONFIG_IO_DELAY_TYPE_0XED=1 2404CONFIG_IO_DELAY_TYPE_0XED=1
2335CONFIG_IO_DELAY_TYPE_UDELAY=2 2405CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2365,6 +2435,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
2365CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 2435CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2366# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set 2436# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2367# CONFIG_SECURITY_SMACK is not set 2437# CONFIG_SECURITY_SMACK is not set
2438# CONFIG_SECURITY_TOMOYO is not set
2439# CONFIG_IMA is not set
2368CONFIG_CRYPTO=y 2440CONFIG_CRYPTO=y
2369 2441
2370# 2442#
@@ -2380,10 +2452,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
2380CONFIG_CRYPTO_HASH=y 2452CONFIG_CRYPTO_HASH=y
2381CONFIG_CRYPTO_HASH2=y 2453CONFIG_CRYPTO_HASH2=y
2382CONFIG_CRYPTO_RNG2=y 2454CONFIG_CRYPTO_RNG2=y
2455CONFIG_CRYPTO_PCOMP=y
2383CONFIG_CRYPTO_MANAGER=y 2456CONFIG_CRYPTO_MANAGER=y
2384CONFIG_CRYPTO_MANAGER2=y 2457CONFIG_CRYPTO_MANAGER2=y
2385# CONFIG_CRYPTO_GF128MUL is not set 2458# CONFIG_CRYPTO_GF128MUL is not set
2386# CONFIG_CRYPTO_NULL is not set 2459# CONFIG_CRYPTO_NULL is not set
2460CONFIG_CRYPTO_WORKQUEUE=y
2387# CONFIG_CRYPTO_CRYPTD is not set 2461# CONFIG_CRYPTO_CRYPTD is not set
2388CONFIG_CRYPTO_AUTHENC=y 2462CONFIG_CRYPTO_AUTHENC=y
2389# CONFIG_CRYPTO_TEST is not set 2463# CONFIG_CRYPTO_TEST is not set
@@ -2456,6 +2530,7 @@ CONFIG_CRYPTO_DES=y
2456# Compression 2530# Compression
2457# 2531#
2458# CONFIG_CRYPTO_DEFLATE is not set 2532# CONFIG_CRYPTO_DEFLATE is not set
2533# CONFIG_CRYPTO_ZLIB is not set
2459# CONFIG_CRYPTO_LZO is not set 2534# CONFIG_CRYPTO_LZO is not set
2460 2535
2461# 2536#
@@ -2467,11 +2542,13 @@ CONFIG_CRYPTO_HW=y
2467# CONFIG_CRYPTO_DEV_GEODE is not set 2542# CONFIG_CRYPTO_DEV_GEODE is not set
2468# CONFIG_CRYPTO_DEV_HIFN_795X is not set 2543# CONFIG_CRYPTO_DEV_HIFN_795X is not set
2469CONFIG_HAVE_KVM=y 2544CONFIG_HAVE_KVM=y
2545CONFIG_HAVE_KVM_IRQCHIP=y
2470CONFIG_VIRTUALIZATION=y 2546CONFIG_VIRTUALIZATION=y
2471# CONFIG_KVM is not set 2547# CONFIG_KVM is not set
2472# CONFIG_LGUEST is not set 2548# CONFIG_LGUEST is not set
2473# CONFIG_VIRTIO_PCI is not set 2549# CONFIG_VIRTIO_PCI is not set
2474# CONFIG_VIRTIO_BALLOON is not set 2550# CONFIG_VIRTIO_BALLOON is not set
2551CONFIG_BINARY_PRINTF=y
2475 2552
2476# 2553#
2477# Library routines 2554# Library routines
@@ -2489,7 +2566,10 @@ CONFIG_CRC32=y
2489# CONFIG_LIBCRC32C is not set 2566# CONFIG_LIBCRC32C is not set
2490CONFIG_AUDIT_GENERIC=y 2567CONFIG_AUDIT_GENERIC=y
2491CONFIG_ZLIB_INFLATE=y 2568CONFIG_ZLIB_INFLATE=y
2492CONFIG_PLIST=y 2569CONFIG_DECOMPRESS_GZIP=y
2570CONFIG_DECOMPRESS_BZIP2=y
2571CONFIG_DECOMPRESS_LZMA=y
2493CONFIG_HAS_IOMEM=y 2572CONFIG_HAS_IOMEM=y
2494CONFIG_HAS_IOPORT=y 2573CONFIG_HAS_IOPORT=y
2495CONFIG_HAS_DMA=y 2574CONFIG_HAS_DMA=y
2575CONFIG_NLATTR=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 9fe5d212ab4..cee1dd2e69b 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,12 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.29-rc4 3# Linux kernel version: 2.6.30-rc2
4# Tue Feb 24 15:44:16 2009 4# Mon May 11 16:22:00 2009
5# 5#
6CONFIG_64BIT=y 6CONFIG_64BIT=y
7# CONFIG_X86_32 is not set 7# CONFIG_X86_32 is not set
8CONFIG_X86_64=y 8CONFIG_X86_64=y
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_OUTPUT_FORMAT="elf64-x86-64"
10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" 11CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
11CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
12CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
@@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
34CONFIG_ARCH_HAS_DEFAULT_IDLE=y 35CONFIG_ARCH_HAS_DEFAULT_IDLE=y
35CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y 36CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
36CONFIG_HAVE_SETUP_PER_CPU_AREA=y 37CONFIG_HAVE_SETUP_PER_CPU_AREA=y
38CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
37CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y 39CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
38CONFIG_ARCH_HIBERNATION_POSSIBLE=y 40CONFIG_ARCH_HIBERNATION_POSSIBLE=y
39CONFIG_ARCH_SUSPEND_POSSIBLE=y 41CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -41,14 +43,14 @@ CONFIG_ZONE_DMA32=y
41CONFIG_ARCH_POPULATES_NODE_MAP=y 43CONFIG_ARCH_POPULATES_NODE_MAP=y
42CONFIG_AUDIT_ARCH=y 44CONFIG_AUDIT_ARCH=y
43CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y 45CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
46CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
44CONFIG_GENERIC_HARDIRQS=y 47CONFIG_GENERIC_HARDIRQS=y
48CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
45CONFIG_GENERIC_IRQ_PROBE=y 49CONFIG_GENERIC_IRQ_PROBE=y
46CONFIG_GENERIC_PENDING_IRQ=y 50CONFIG_GENERIC_PENDING_IRQ=y
47CONFIG_X86_SMP=y
48CONFIG_USE_GENERIC_SMP_HELPERS=y 51CONFIG_USE_GENERIC_SMP_HELPERS=y
49CONFIG_X86_64_SMP=y 52CONFIG_X86_64_SMP=y
50CONFIG_X86_HT=y 53CONFIG_X86_HT=y
51CONFIG_X86_BIOS_REBOOT=y
52CONFIG_X86_TRAMPOLINE=y 54CONFIG_X86_TRAMPOLINE=y
53# CONFIG_KTIME_SCALAR is not set 55# CONFIG_KTIME_SCALAR is not set
54CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
@@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
61CONFIG_INIT_ENV_ARG_LIMIT=32 63CONFIG_INIT_ENV_ARG_LIMIT=32
62CONFIG_LOCALVERSION="" 64CONFIG_LOCALVERSION=""
63# CONFIG_LOCALVERSION_AUTO is not set 65# CONFIG_LOCALVERSION_AUTO is not set
66CONFIG_HAVE_KERNEL_GZIP=y
67CONFIG_HAVE_KERNEL_BZIP2=y
68CONFIG_HAVE_KERNEL_LZMA=y
69CONFIG_KERNEL_GZIP=y
70# CONFIG_KERNEL_BZIP2 is not set
71# CONFIG_KERNEL_LZMA is not set
64CONFIG_SWAP=y 72CONFIG_SWAP=y
65CONFIG_SYSVIPC=y 73CONFIG_SYSVIPC=y
66CONFIG_SYSVIPC_SYSCTL=y 74CONFIG_SYSVIPC_SYSCTL=y
67CONFIG_POSIX_MQUEUE=y 75CONFIG_POSIX_MQUEUE=y
76CONFIG_POSIX_MQUEUE_SYSCTL=y
68CONFIG_BSD_PROCESS_ACCT=y 77CONFIG_BSD_PROCESS_ACCT=y
69# CONFIG_BSD_PROCESS_ACCT_V3 is not set 78# CONFIG_BSD_PROCESS_ACCT_V3 is not set
70CONFIG_TASKSTATS=y 79CONFIG_TASKSTATS=y
@@ -114,23 +123,26 @@ CONFIG_PID_NS=y
114CONFIG_NET_NS=y 123CONFIG_NET_NS=y
115CONFIG_BLK_DEV_INITRD=y 124CONFIG_BLK_DEV_INITRD=y
116CONFIG_INITRAMFS_SOURCE="" 125CONFIG_INITRAMFS_SOURCE=""
126CONFIG_RD_GZIP=y
127CONFIG_RD_BZIP2=y
128CONFIG_RD_LZMA=y
117CONFIG_CC_OPTIMIZE_FOR_SIZE=y 129CONFIG_CC_OPTIMIZE_FOR_SIZE=y
118CONFIG_SYSCTL=y 130CONFIG_SYSCTL=y
131CONFIG_ANON_INODES=y
119# CONFIG_EMBEDDED is not set 132# CONFIG_EMBEDDED is not set
120CONFIG_UID16=y 133CONFIG_UID16=y
121CONFIG_SYSCTL_SYSCALL=y 134CONFIG_SYSCTL_SYSCALL=y
122CONFIG_KALLSYMS=y 135CONFIG_KALLSYMS=y
123CONFIG_KALLSYMS_ALL=y 136CONFIG_KALLSYMS_ALL=y
124CONFIG_KALLSYMS_EXTRA_PASS=y 137CONFIG_KALLSYMS_EXTRA_PASS=y
138# CONFIG_STRIP_ASM_SYMS is not set
125CONFIG_HOTPLUG=y 139CONFIG_HOTPLUG=y
126CONFIG_PRINTK=y 140CONFIG_PRINTK=y
127CONFIG_BUG=y 141CONFIG_BUG=y
128CONFIG_ELF_CORE=y 142CONFIG_ELF_CORE=y
129CONFIG_PCSPKR_PLATFORM=y 143CONFIG_PCSPKR_PLATFORM=y
130# CONFIG_COMPAT_BRK is not set
131CONFIG_BASE_FULL=y 144CONFIG_BASE_FULL=y
132CONFIG_FUTEX=y 145CONFIG_FUTEX=y
133CONFIG_ANON_INODES=y
134CONFIG_EPOLL=y 146CONFIG_EPOLL=y
135CONFIG_SIGNALFD=y 147CONFIG_SIGNALFD=y
136CONFIG_TIMERFD=y 148CONFIG_TIMERFD=y
@@ -140,6 +152,7 @@ CONFIG_AIO=y
140CONFIG_VM_EVENT_COUNTERS=y 152CONFIG_VM_EVENT_COUNTERS=y
141CONFIG_PCI_QUIRKS=y 153CONFIG_PCI_QUIRKS=y
142CONFIG_SLUB_DEBUG=y 154CONFIG_SLUB_DEBUG=y
155# CONFIG_COMPAT_BRK is not set
143# CONFIG_SLAB is not set 156# CONFIG_SLAB is not set
144CONFIG_SLUB=y 157CONFIG_SLUB=y
145# CONFIG_SLOB is not set 158# CONFIG_SLOB is not set
@@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
155CONFIG_HAVE_KPROBES=y 168CONFIG_HAVE_KPROBES=y
156CONFIG_HAVE_KRETPROBES=y 169CONFIG_HAVE_KRETPROBES=y
157CONFIG_HAVE_ARCH_TRACEHOOK=y 170CONFIG_HAVE_ARCH_TRACEHOOK=y
171CONFIG_HAVE_DMA_API_DEBUG=y
172# CONFIG_SLOW_WORK is not set
158# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set 173# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
159CONFIG_SLABINFO=y 174CONFIG_SLABINFO=y
160CONFIG_RT_MUTEXES=y 175CONFIG_RT_MUTEXES=y
@@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
167# CONFIG_MODULE_SRCVERSION_ALL is not set 182# CONFIG_MODULE_SRCVERSION_ALL is not set
168CONFIG_STOP_MACHINE=y 183CONFIG_STOP_MACHINE=y
169CONFIG_BLOCK=y 184CONFIG_BLOCK=y
170CONFIG_BLK_DEV_IO_TRACE=y
171CONFIG_BLK_DEV_BSG=y 185CONFIG_BLK_DEV_BSG=y
172# CONFIG_BLK_DEV_INTEGRITY is not set 186# CONFIG_BLK_DEV_INTEGRITY is not set
173CONFIG_BLOCK_COMPAT=y 187CONFIG_BLOCK_COMPAT=y
@@ -195,12 +209,10 @@ CONFIG_HIGH_RES_TIMERS=y
195CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 209CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
196CONFIG_SMP=y 210CONFIG_SMP=y
197CONFIG_SPARSE_IRQ=y 211CONFIG_SPARSE_IRQ=y
198# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set
199CONFIG_X86_FIND_SMP_CONFIG=y
200CONFIG_X86_MPPARSE=y 212CONFIG_X86_MPPARSE=y
201# CONFIG_X86_ELAN is not set 213CONFIG_X86_EXTENDED_PLATFORM=y
202# CONFIG_X86_GENERICARCH is not set
203# CONFIG_X86_VSMP is not set 214# CONFIG_X86_VSMP is not set
215# CONFIG_X86_UV is not set
204CONFIG_SCHED_OMIT_FRAME_POINTER=y 216CONFIG_SCHED_OMIT_FRAME_POINTER=y
205# CONFIG_PARAVIRT_GUEST is not set 217# CONFIG_PARAVIRT_GUEST is not set
206# CONFIG_MEMTEST is not set 218# CONFIG_MEMTEST is not set
@@ -230,10 +242,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y
230# CONFIG_MCORE2 is not set 242# CONFIG_MCORE2 is not set
231CONFIG_GENERIC_CPU=y 243CONFIG_GENERIC_CPU=y
232CONFIG_X86_CPU=y 244CONFIG_X86_CPU=y
233CONFIG_X86_L1_CACHE_BYTES=128 245CONFIG_X86_L1_CACHE_BYTES=64
234CONFIG_X86_INTERNODE_CACHE_BYTES=128 246CONFIG_X86_INTERNODE_CACHE_BYTES=64
235CONFIG_X86_CMPXCHG=y 247CONFIG_X86_CMPXCHG=y
236CONFIG_X86_L1_CACHE_SHIFT=7 248CONFIG_X86_L1_CACHE_SHIFT=6
237CONFIG_X86_WP_WORKS_OK=y 249CONFIG_X86_WP_WORKS_OK=y
238CONFIG_X86_TSC=y 250CONFIG_X86_TSC=y
239CONFIG_X86_CMPXCHG64=y 251CONFIG_X86_CMPXCHG64=y
@@ -242,7 +254,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64
242CONFIG_X86_DEBUGCTLMSR=y 254CONFIG_X86_DEBUGCTLMSR=y
243CONFIG_CPU_SUP_INTEL=y 255CONFIG_CPU_SUP_INTEL=y
244CONFIG_CPU_SUP_AMD=y 256CONFIG_CPU_SUP_AMD=y
245CONFIG_CPU_SUP_CENTAUR_64=y 257CONFIG_CPU_SUP_CENTAUR=y
246CONFIG_X86_DS=y 258CONFIG_X86_DS=y
247CONFIG_X86_PTRACE_BTS=y 259CONFIG_X86_PTRACE_BTS=y
248CONFIG_HPET_TIMER=y 260CONFIG_HPET_TIMER=y
@@ -269,6 +281,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
269CONFIG_X86_MCE=y 281CONFIG_X86_MCE=y
270CONFIG_X86_MCE_INTEL=y 282CONFIG_X86_MCE_INTEL=y
271CONFIG_X86_MCE_AMD=y 283CONFIG_X86_MCE_AMD=y
284CONFIG_X86_MCE_THRESHOLD=y
272# CONFIG_I8K is not set 285# CONFIG_I8K is not set
273CONFIG_MICROCODE=y 286CONFIG_MICROCODE=y
274CONFIG_MICROCODE_INTEL=y 287CONFIG_MICROCODE_INTEL=y
@@ -276,6 +289,7 @@ CONFIG_MICROCODE_AMD=y
276CONFIG_MICROCODE_OLD_INTERFACE=y 289CONFIG_MICROCODE_OLD_INTERFACE=y
277CONFIG_X86_MSR=y 290CONFIG_X86_MSR=y
278CONFIG_X86_CPUID=y 291CONFIG_X86_CPUID=y
292# CONFIG_X86_CPU_DEBUG is not set
279CONFIG_ARCH_PHYS_ADDR_T_64BIT=y 293CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
280CONFIG_DIRECT_GBPAGES=y 294CONFIG_DIRECT_GBPAGES=y
281CONFIG_NUMA=y 295CONFIG_NUMA=y
@@ -309,6 +323,8 @@ CONFIG_ZONE_DMA_FLAG=1
309CONFIG_BOUNCE=y 323CONFIG_BOUNCE=y
310CONFIG_VIRT_TO_BUS=y 324CONFIG_VIRT_TO_BUS=y
311CONFIG_UNEVICTABLE_LRU=y 325CONFIG_UNEVICTABLE_LRU=y
326CONFIG_HAVE_MLOCK=y
327CONFIG_HAVE_MLOCKED_PAGE_BIT=y
312CONFIG_X86_CHECK_BIOS_CORRUPTION=y 328CONFIG_X86_CHECK_BIOS_CORRUPTION=y
313CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y 329CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
314CONFIG_X86_RESERVE_LOW_64K=y 330CONFIG_X86_RESERVE_LOW_64K=y
@@ -317,6 +333,7 @@ CONFIG_MTRR=y
317CONFIG_X86_PAT=y 333CONFIG_X86_PAT=y
318CONFIG_EFI=y 334CONFIG_EFI=y
319CONFIG_SECCOMP=y 335CONFIG_SECCOMP=y
336# CONFIG_CC_STACKPROTECTOR is not set
320# CONFIG_HZ_100 is not set 337# CONFIG_HZ_100 is not set
321# CONFIG_HZ_250 is not set 338# CONFIG_HZ_250 is not set
322# CONFIG_HZ_300 is not set 339# CONFIG_HZ_300 is not set
@@ -325,9 +342,10 @@ CONFIG_HZ=1000
325CONFIG_SCHED_HRTICK=y 342CONFIG_SCHED_HRTICK=y
326CONFIG_KEXEC=y 343CONFIG_KEXEC=y
327CONFIG_CRASH_DUMP=y 344CONFIG_CRASH_DUMP=y
345# CONFIG_KEXEC_JUMP is not set
328CONFIG_PHYSICAL_START=0x1000000 346CONFIG_PHYSICAL_START=0x1000000
329# CONFIG_RELOCATABLE is not set 347CONFIG_RELOCATABLE=y
330CONFIG_PHYSICAL_ALIGN=0x200000 348CONFIG_PHYSICAL_ALIGN=0x1000000
331CONFIG_HOTPLUG_CPU=y 349CONFIG_HOTPLUG_CPU=y
332# CONFIG_COMPAT_VDSO is not set 350# CONFIG_COMPAT_VDSO is not set
333# CONFIG_CMDLINE_BOOL is not set 351# CONFIG_CMDLINE_BOOL is not set
@@ -370,7 +388,6 @@ CONFIG_ACPI_NUMA=y
370CONFIG_ACPI_BLACKLIST_YEAR=0 388CONFIG_ACPI_BLACKLIST_YEAR=0
371# CONFIG_ACPI_DEBUG is not set 389# CONFIG_ACPI_DEBUG is not set
372# CONFIG_ACPI_PCI_SLOT is not set 390# CONFIG_ACPI_PCI_SLOT is not set
373CONFIG_ACPI_SYSTEM=y
374CONFIG_X86_PM_TIMER=y 391CONFIG_X86_PM_TIMER=y
375CONFIG_ACPI_CONTAINER=y 392CONFIG_ACPI_CONTAINER=y
376# CONFIG_ACPI_SBS is not set 393# CONFIG_ACPI_SBS is not set
@@ -436,6 +453,7 @@ CONFIG_PCI_MSI=y
436# CONFIG_PCI_DEBUG is not set 453# CONFIG_PCI_DEBUG is not set
437# CONFIG_PCI_STUB is not set 454# CONFIG_PCI_STUB is not set
438CONFIG_HT_IRQ=y 455CONFIG_HT_IRQ=y
456# CONFIG_PCI_IOV is not set
439CONFIG_ISA_DMA_API=y 457CONFIG_ISA_DMA_API=y
440CONFIG_K8_NB=y 458CONFIG_K8_NB=y
441CONFIG_PCCARD=y 459CONFIG_PCCARD=y
@@ -481,7 +499,6 @@ CONFIG_NET=y
481# 499#
482# Networking options 500# Networking options
483# 501#
484CONFIG_COMPAT_NET_DEV_OPS=y
485CONFIG_PACKET=y 502CONFIG_PACKET=y
486CONFIG_PACKET_MMAP=y 503CONFIG_PACKET_MMAP=y
487CONFIG_UNIX=y 504CONFIG_UNIX=y
@@ -639,6 +656,7 @@ CONFIG_LLC=y
639# CONFIG_LAPB is not set 656# CONFIG_LAPB is not set
640# CONFIG_ECONET is not set 657# CONFIG_ECONET is not set
641# CONFIG_WAN_ROUTER is not set 658# CONFIG_WAN_ROUTER is not set
659# CONFIG_PHONET is not set
642CONFIG_NET_SCHED=y 660CONFIG_NET_SCHED=y
643 661
644# 662#
@@ -696,6 +714,7 @@ CONFIG_NET_SCH_FIFO=y
696# 714#
697# CONFIG_NET_PKTGEN is not set 715# CONFIG_NET_PKTGEN is not set
698# CONFIG_NET_TCPPROBE is not set 716# CONFIG_NET_TCPPROBE is not set
717# CONFIG_NET_DROP_MONITOR is not set
699CONFIG_HAMRADIO=y 718CONFIG_HAMRADIO=y
700 719
701# 720#
@@ -706,12 +725,10 @@ CONFIG_HAMRADIO=y
706# CONFIG_IRDA is not set 725# CONFIG_IRDA is not set
707# CONFIG_BT is not set 726# CONFIG_BT is not set
708# CONFIG_AF_RXRPC is not set 727# CONFIG_AF_RXRPC is not set
709# CONFIG_PHONET is not set
710CONFIG_FIB_RULES=y 728CONFIG_FIB_RULES=y
711CONFIG_WIRELESS=y 729CONFIG_WIRELESS=y
712CONFIG_CFG80211=y 730CONFIG_CFG80211=y
713# CONFIG_CFG80211_REG_DEBUG is not set 731# CONFIG_CFG80211_REG_DEBUG is not set
714CONFIG_NL80211=y
715CONFIG_WIRELESS_OLD_REGULATORY=y 732CONFIG_WIRELESS_OLD_REGULATORY=y
716CONFIG_WIRELESS_EXT=y 733CONFIG_WIRELESS_EXT=y
717CONFIG_WIRELESS_EXT_SYSFS=y 734CONFIG_WIRELESS_EXT_SYSFS=y
@@ -788,9 +805,8 @@ CONFIG_MISC_DEVICES=y
788# CONFIG_TIFM_CORE is not set 805# CONFIG_TIFM_CORE is not set
789# CONFIG_ICS932S401 is not set 806# CONFIG_ICS932S401 is not set
790# CONFIG_ENCLOSURE_SERVICES is not set 807# CONFIG_ENCLOSURE_SERVICES is not set
791# CONFIG_SGI_XP is not set
792# CONFIG_HP_ILO is not set 808# CONFIG_HP_ILO is not set
793# CONFIG_SGI_GRU is not set 809# CONFIG_ISL29003 is not set
794# CONFIG_C2PORT is not set 810# CONFIG_C2PORT is not set
795 811
796# 812#
@@ -844,6 +860,7 @@ CONFIG_SCSI_SPI_ATTRS=y
844# CONFIG_SCSI_LOWLEVEL is not set 860# CONFIG_SCSI_LOWLEVEL is not set
845# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set 861# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
846# CONFIG_SCSI_DH is not set 862# CONFIG_SCSI_DH is not set
863# CONFIG_SCSI_OSD_INITIATOR is not set
847CONFIG_ATA=y 864CONFIG_ATA=y
848# CONFIG_ATA_NONSTANDARD is not set 865# CONFIG_ATA_NONSTANDARD is not set
849CONFIG_ATA_ACPI=y 866CONFIG_ATA_ACPI=y
@@ -940,6 +957,7 @@ CONFIG_DM_ZERO=y
940CONFIG_MACINTOSH_DRIVERS=y 957CONFIG_MACINTOSH_DRIVERS=y
941CONFIG_MAC_EMUMOUSEBTN=y 958CONFIG_MAC_EMUMOUSEBTN=y
942CONFIG_NETDEVICES=y 959CONFIG_NETDEVICES=y
960CONFIG_COMPAT_NET_DEV_OPS=y
943# CONFIG_IFB is not set 961# CONFIG_IFB is not set
944# CONFIG_DUMMY is not set 962# CONFIG_DUMMY is not set
945# CONFIG_BONDING is not set 963# CONFIG_BONDING is not set
@@ -977,6 +995,8 @@ CONFIG_MII=y
977CONFIG_NET_VENDOR_3COM=y 995CONFIG_NET_VENDOR_3COM=y
978# CONFIG_VORTEX is not set 996# CONFIG_VORTEX is not set
979# CONFIG_TYPHOON is not set 997# CONFIG_TYPHOON is not set
998# CONFIG_ETHOC is not set
999# CONFIG_DNET is not set
980CONFIG_NET_TULIP=y 1000CONFIG_NET_TULIP=y
981# CONFIG_DE2104X is not set 1001# CONFIG_DE2104X is not set
982# CONFIG_TULIP is not set 1002# CONFIG_TULIP is not set
@@ -1026,6 +1046,7 @@ CONFIG_E1000=y
1026# CONFIG_E1000E is not set 1046# CONFIG_E1000E is not set
1027# CONFIG_IP1000 is not set 1047# CONFIG_IP1000 is not set
1028# CONFIG_IGB is not set 1048# CONFIG_IGB is not set
1049# CONFIG_IGBVF is not set
1029# CONFIG_NS83820 is not set 1050# CONFIG_NS83820 is not set
1030# CONFIG_HAMACHI is not set 1051# CONFIG_HAMACHI is not set
1031# CONFIG_YELLOWFIN is not set 1052# CONFIG_YELLOWFIN is not set
@@ -1040,6 +1061,7 @@ CONFIG_TIGON3=y
1040# CONFIG_QLA3XXX is not set 1061# CONFIG_QLA3XXX is not set
1041# CONFIG_ATL1 is not set 1062# CONFIG_ATL1 is not set
1042# CONFIG_ATL1E is not set 1063# CONFIG_ATL1E is not set
1064# CONFIG_ATL1C is not set
1043# CONFIG_JME is not set 1065# CONFIG_JME is not set
1044CONFIG_NETDEV_10000=y 1066CONFIG_NETDEV_10000=y
1045# CONFIG_CHELSIO_T1 is not set 1067# CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1071,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1049# CONFIG_IXGBE is not set 1071# CONFIG_IXGBE is not set
1050# CONFIG_IXGB is not set 1072# CONFIG_IXGB is not set
1051# CONFIG_S2IO is not set 1073# CONFIG_S2IO is not set
1074# CONFIG_VXGE is not set
1052# CONFIG_MYRI10GE is not set 1075# CONFIG_MYRI10GE is not set
1053# CONFIG_NETXEN_NIC is not set 1076# CONFIG_NETXEN_NIC is not set
1054# CONFIG_NIU is not set 1077# CONFIG_NIU is not set
@@ -1058,6 +1081,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1058# CONFIG_BNX2X is not set 1081# CONFIG_BNX2X is not set
1059# CONFIG_QLGE is not set 1082# CONFIG_QLGE is not set
1060# CONFIG_SFC is not set 1083# CONFIG_SFC is not set
1084# CONFIG_BE2NET is not set
1061CONFIG_TR=y 1085CONFIG_TR=y
1062# CONFIG_IBMOL is not set 1086# CONFIG_IBMOL is not set
1063# CONFIG_3C359 is not set 1087# CONFIG_3C359 is not set
@@ -1072,8 +1096,8 @@ CONFIG_WLAN_80211=y
1072# CONFIG_LIBERTAS is not set 1096# CONFIG_LIBERTAS is not set
1073# CONFIG_LIBERTAS_THINFIRM is not set 1097# CONFIG_LIBERTAS_THINFIRM is not set
1074# CONFIG_AIRO is not set 1098# CONFIG_AIRO is not set
1075# CONFIG_HERMES is not set
1076# CONFIG_ATMEL is not set 1099# CONFIG_ATMEL is not set
1100# CONFIG_AT76C50X_USB is not set
1077# CONFIG_AIRO_CS is not set 1101# CONFIG_AIRO_CS is not set
1078# CONFIG_PCMCIA_WL3501 is not set 1102# CONFIG_PCMCIA_WL3501 is not set
1079# CONFIG_PRISM54 is not set 1103# CONFIG_PRISM54 is not set
@@ -1083,21 +1107,21 @@ CONFIG_WLAN_80211=y
1083# CONFIG_RTL8187 is not set 1107# CONFIG_RTL8187 is not set
1084# CONFIG_ADM8211 is not set 1108# CONFIG_ADM8211 is not set
1085# CONFIG_MAC80211_HWSIM is not set 1109# CONFIG_MAC80211_HWSIM is not set
1110# CONFIG_MWL8K is not set
1086# CONFIG_P54_COMMON is not set 1111# CONFIG_P54_COMMON is not set
1087CONFIG_ATH5K=y 1112CONFIG_ATH5K=y
1088# CONFIG_ATH5K_DEBUG is not set 1113# CONFIG_ATH5K_DEBUG is not set
1089# CONFIG_ATH9K is not set 1114# CONFIG_ATH9K is not set
1115# CONFIG_AR9170_USB is not set
1090# CONFIG_IPW2100 is not set 1116# CONFIG_IPW2100 is not set
1091# CONFIG_IPW2200 is not set 1117# CONFIG_IPW2200 is not set
1092# CONFIG_IWLCORE is not set 1118# CONFIG_IWLWIFI is not set
1093# CONFIG_IWLWIFI_LEDS is not set
1094# CONFIG_IWLAGN is not set
1095# CONFIG_IWL3945 is not set
1096# CONFIG_HOSTAP is not set 1119# CONFIG_HOSTAP is not set
1097# CONFIG_B43 is not set 1120# CONFIG_B43 is not set
1098# CONFIG_B43LEGACY is not set 1121# CONFIG_B43LEGACY is not set
1099# CONFIG_ZD1211RW is not set 1122# CONFIG_ZD1211RW is not set
1100# CONFIG_RT2X00 is not set 1123# CONFIG_RT2X00 is not set
1124# CONFIG_HERMES is not set
1101 1125
1102# 1126#
1103# Enable WiMAX (Networking options) to see the WiMAX drivers 1127# Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1208,6 +1232,8 @@ CONFIG_INPUT_TABLET=y
1208# CONFIG_TABLET_USB_KBTAB is not set 1232# CONFIG_TABLET_USB_KBTAB is not set
1209# CONFIG_TABLET_USB_WACOM is not set 1233# CONFIG_TABLET_USB_WACOM is not set
1210CONFIG_INPUT_TOUCHSCREEN=y 1234CONFIG_INPUT_TOUCHSCREEN=y
1235# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
1236# CONFIG_TOUCHSCREEN_AD7879 is not set
1211# CONFIG_TOUCHSCREEN_FUJITSU is not set 1237# CONFIG_TOUCHSCREEN_FUJITSU is not set
1212# CONFIG_TOUCHSCREEN_GUNZE is not set 1238# CONFIG_TOUCHSCREEN_GUNZE is not set
1213# CONFIG_TOUCHSCREEN_ELO is not set 1239# CONFIG_TOUCHSCREEN_ELO is not set
@@ -1301,6 +1327,7 @@ CONFIG_UNIX98_PTYS=y
1301# CONFIG_LEGACY_PTYS is not set 1327# CONFIG_LEGACY_PTYS is not set
1302# CONFIG_IPMI_HANDLER is not set 1328# CONFIG_IPMI_HANDLER is not set
1303CONFIG_HW_RANDOM=y 1329CONFIG_HW_RANDOM=y
1330# CONFIG_HW_RANDOM_TIMERIOMEM is not set
1304# CONFIG_HW_RANDOM_INTEL is not set 1331# CONFIG_HW_RANDOM_INTEL is not set
1305# CONFIG_HW_RANDOM_AMD is not set 1332# CONFIG_HW_RANDOM_AMD is not set
1306CONFIG_NVRAM=y 1333CONFIG_NVRAM=y
@@ -1382,7 +1409,6 @@ CONFIG_I2C_I801=y
1382# CONFIG_SENSORS_PCF8574 is not set 1409# CONFIG_SENSORS_PCF8574 is not set
1383# CONFIG_PCF8575 is not set 1410# CONFIG_PCF8575 is not set
1384# CONFIG_SENSORS_PCA9539 is not set 1411# CONFIG_SENSORS_PCA9539 is not set
1385# CONFIG_SENSORS_PCF8591 is not set
1386# CONFIG_SENSORS_MAX6875 is not set 1412# CONFIG_SENSORS_MAX6875 is not set
1387# CONFIG_SENSORS_TSL2550 is not set 1413# CONFIG_SENSORS_TSL2550 is not set
1388# CONFIG_I2C_DEBUG_CORE is not set 1414# CONFIG_I2C_DEBUG_CORE is not set
@@ -1416,6 +1442,7 @@ CONFIG_HWMON=y
1416# CONFIG_SENSORS_ADT7475 is not set 1442# CONFIG_SENSORS_ADT7475 is not set
1417# CONFIG_SENSORS_K8TEMP is not set 1443# CONFIG_SENSORS_K8TEMP is not set
1418# CONFIG_SENSORS_ASB100 is not set 1444# CONFIG_SENSORS_ASB100 is not set
1445# CONFIG_SENSORS_ATK0110 is not set
1419# CONFIG_SENSORS_ATXP1 is not set 1446# CONFIG_SENSORS_ATXP1 is not set
1420# CONFIG_SENSORS_DS1621 is not set 1447# CONFIG_SENSORS_DS1621 is not set
1421# CONFIG_SENSORS_I5K_AMB is not set 1448# CONFIG_SENSORS_I5K_AMB is not set
@@ -1425,6 +1452,7 @@ CONFIG_HWMON=y
1425# CONFIG_SENSORS_FSCHER is not set 1452# CONFIG_SENSORS_FSCHER is not set
1426# CONFIG_SENSORS_FSCPOS is not set 1453# CONFIG_SENSORS_FSCPOS is not set
1427# CONFIG_SENSORS_FSCHMD is not set 1454# CONFIG_SENSORS_FSCHMD is not set
1455# CONFIG_SENSORS_G760A is not set
1428# CONFIG_SENSORS_GL518SM is not set 1456# CONFIG_SENSORS_GL518SM is not set
1429# CONFIG_SENSORS_GL520SM is not set 1457# CONFIG_SENSORS_GL520SM is not set
1430# CONFIG_SENSORS_CORETEMP is not set 1458# CONFIG_SENSORS_CORETEMP is not set
@@ -1440,11 +1468,14 @@ CONFIG_HWMON=y
1440# CONFIG_SENSORS_LM90 is not set 1468# CONFIG_SENSORS_LM90 is not set
1441# CONFIG_SENSORS_LM92 is not set 1469# CONFIG_SENSORS_LM92 is not set
1442# CONFIG_SENSORS_LM93 is not set 1470# CONFIG_SENSORS_LM93 is not set
1471# CONFIG_SENSORS_LTC4215 is not set
1443# CONFIG_SENSORS_LTC4245 is not set 1472# CONFIG_SENSORS_LTC4245 is not set
1473# CONFIG_SENSORS_LM95241 is not set
1444# CONFIG_SENSORS_MAX1619 is not set 1474# CONFIG_SENSORS_MAX1619 is not set
1445# CONFIG_SENSORS_MAX6650 is not set 1475# CONFIG_SENSORS_MAX6650 is not set
1446# CONFIG_SENSORS_PC87360 is not set 1476# CONFIG_SENSORS_PC87360 is not set
1447# CONFIG_SENSORS_PC87427 is not set 1477# CONFIG_SENSORS_PC87427 is not set
1478# CONFIG_SENSORS_PCF8591 is not set
1448# CONFIG_SENSORS_SIS5595 is not set 1479# CONFIG_SENSORS_SIS5595 is not set
1449# CONFIG_SENSORS_DME1737 is not set 1480# CONFIG_SENSORS_DME1737 is not set
1450# CONFIG_SENSORS_SMSC47M1 is not set 1481# CONFIG_SENSORS_SMSC47M1 is not set
@@ -1635,6 +1666,7 @@ CONFIG_FB_EFI=y
1635# CONFIG_FB_VIRTUAL is not set 1666# CONFIG_FB_VIRTUAL is not set
1636# CONFIG_FB_METRONOME is not set 1667# CONFIG_FB_METRONOME is not set
1637# CONFIG_FB_MB862XX is not set 1668# CONFIG_FB_MB862XX is not set
1669# CONFIG_FB_BROADSHEET is not set
1638CONFIG_BACKLIGHT_LCD_SUPPORT=y 1670CONFIG_BACKLIGHT_LCD_SUPPORT=y
1639# CONFIG_LCD_CLASS_DEVICE is not set 1671# CONFIG_LCD_CLASS_DEVICE is not set
1640CONFIG_BACKLIGHT_CLASS_DEVICE=y 1672CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1720,6 +1752,8 @@ CONFIG_SND_PCI=y
1720# CONFIG_SND_INDIGO is not set 1752# CONFIG_SND_INDIGO is not set
1721# CONFIG_SND_INDIGOIO is not set 1753# CONFIG_SND_INDIGOIO is not set
1722# CONFIG_SND_INDIGODJ is not set 1754# CONFIG_SND_INDIGODJ is not set
1755# CONFIG_SND_INDIGOIOX is not set
1756# CONFIG_SND_INDIGODJX is not set
1723# CONFIG_SND_EMU10K1 is not set 1757# CONFIG_SND_EMU10K1 is not set
1724# CONFIG_SND_EMU10K1X is not set 1758# CONFIG_SND_EMU10K1X is not set
1725# CONFIG_SND_ENS1370 is not set 1759# CONFIG_SND_ENS1370 is not set
@@ -1792,15 +1826,17 @@ CONFIG_USB_HIDDEV=y
1792# 1826#
1793# Special HID drivers 1827# Special HID drivers
1794# 1828#
1795CONFIG_HID_COMPAT=y
1796CONFIG_HID_A4TECH=y 1829CONFIG_HID_A4TECH=y
1797CONFIG_HID_APPLE=y 1830CONFIG_HID_APPLE=y
1798CONFIG_HID_BELKIN=y 1831CONFIG_HID_BELKIN=y
1799CONFIG_HID_CHERRY=y 1832CONFIG_HID_CHERRY=y
1800CONFIG_HID_CHICONY=y 1833CONFIG_HID_CHICONY=y
1801CONFIG_HID_CYPRESS=y 1834CONFIG_HID_CYPRESS=y
1835# CONFIG_DRAGONRISE_FF is not set
1802CONFIG_HID_EZKEY=y 1836CONFIG_HID_EZKEY=y
1837CONFIG_HID_KYE=y
1803CONFIG_HID_GYRATION=y 1838CONFIG_HID_GYRATION=y
1839CONFIG_HID_KENSINGTON=y
1804CONFIG_HID_LOGITECH=y 1840CONFIG_HID_LOGITECH=y
1805CONFIG_LOGITECH_FF=y 1841CONFIG_LOGITECH_FF=y
1806# CONFIG_LOGIRUMBLEPAD2_FF is not set 1842# CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1866,11 +1902,11 @@ CONFIG_USB_PRINTER=y
1866# CONFIG_USB_TMC is not set 1902# CONFIG_USB_TMC is not set
1867 1903
1868# 1904#
1869# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; 1905# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
1870# 1906#
1871 1907
1872# 1908#
1873# see USB_STORAGE Help for more information 1909# also be needed; see USB_STORAGE Help for more info
1874# 1910#
1875CONFIG_USB_STORAGE=y 1911CONFIG_USB_STORAGE=y
1876# CONFIG_USB_STORAGE_DEBUG is not set 1912# CONFIG_USB_STORAGE_DEBUG is not set
@@ -1912,7 +1948,6 @@ CONFIG_USB_LIBUSUAL=y
1912# CONFIG_USB_LED is not set 1948# CONFIG_USB_LED is not set
1913# CONFIG_USB_CYPRESS_CY7C63 is not set 1949# CONFIG_USB_CYPRESS_CY7C63 is not set
1914# CONFIG_USB_CYTHERM is not set 1950# CONFIG_USB_CYTHERM is not set
1915# CONFIG_USB_PHIDGET is not set
1916# CONFIG_USB_IDMOUSE is not set 1951# CONFIG_USB_IDMOUSE is not set
1917# CONFIG_USB_FTDI_ELAN is not set 1952# CONFIG_USB_FTDI_ELAN is not set
1918# CONFIG_USB_APPLEDISPLAY is not set 1953# CONFIG_USB_APPLEDISPLAY is not set
@@ -1928,6 +1963,7 @@ CONFIG_USB_LIBUSUAL=y
1928# 1963#
1929# OTG and related infrastructure 1964# OTG and related infrastructure
1930# 1965#
1966# CONFIG_NOP_USB_XCEIV is not set
1931# CONFIG_UWB is not set 1967# CONFIG_UWB is not set
1932# CONFIG_MMC is not set 1968# CONFIG_MMC is not set
1933# CONFIG_MEMSTICK is not set 1969# CONFIG_MEMSTICK is not set
@@ -1939,8 +1975,10 @@ CONFIG_LEDS_CLASS=y
1939# 1975#
1940# CONFIG_LEDS_ALIX2 is not set 1976# CONFIG_LEDS_ALIX2 is not set
1941# CONFIG_LEDS_PCA9532 is not set 1977# CONFIG_LEDS_PCA9532 is not set
1978# CONFIG_LEDS_LP5521 is not set
1942# CONFIG_LEDS_CLEVO_MAIL is not set 1979# CONFIG_LEDS_CLEVO_MAIL is not set
1943# CONFIG_LEDS_PCA955X is not set 1980# CONFIG_LEDS_PCA955X is not set
1981# CONFIG_LEDS_BD2802 is not set
1944 1982
1945# 1983#
1946# LED Triggers 1984# LED Triggers
@@ -1950,6 +1988,10 @@ CONFIG_LEDS_TRIGGERS=y
1950# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set 1988# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1951# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set 1989# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
1952# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set 1990# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
1991
1992#
1993# iptables trigger is under Netfilter config (LED target)
1994#
1953# CONFIG_ACCESSIBILITY is not set 1995# CONFIG_ACCESSIBILITY is not set
1954# CONFIG_INFINIBAND is not set 1996# CONFIG_INFINIBAND is not set
1955CONFIG_EDAC=y 1997CONFIG_EDAC=y
@@ -2018,6 +2060,7 @@ CONFIG_DMADEVICES=y
2018# DMA Devices 2060# DMA Devices
2019# 2061#
2020# CONFIG_INTEL_IOATDMA is not set 2062# CONFIG_INTEL_IOATDMA is not set
2063# CONFIG_AUXDISPLAY is not set
2021# CONFIG_UIO is not set 2064# CONFIG_UIO is not set
2022# CONFIG_STAGING is not set 2065# CONFIG_STAGING is not set
2023CONFIG_X86_PLATFORM_DEVICES=y 2066CONFIG_X86_PLATFORM_DEVICES=y
@@ -2051,6 +2094,7 @@ CONFIG_DMIID=y
2051# 2094#
2052# CONFIG_EXT2_FS is not set 2095# CONFIG_EXT2_FS is not set
2053CONFIG_EXT3_FS=y 2096CONFIG_EXT3_FS=y
2097# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
2054CONFIG_EXT3_FS_XATTR=y 2098CONFIG_EXT3_FS_XATTR=y
2055CONFIG_EXT3_FS_POSIX_ACL=y 2099CONFIG_EXT3_FS_POSIX_ACL=y
2056CONFIG_EXT3_FS_SECURITY=y 2100CONFIG_EXT3_FS_SECURITY=y
@@ -2082,6 +2126,11 @@ CONFIG_AUTOFS4_FS=y
2082CONFIG_GENERIC_ACL=y 2126CONFIG_GENERIC_ACL=y
2083 2127
2084# 2128#
2129# Caches
2130#
2131# CONFIG_FSCACHE is not set
2132
2133#
2085# CD-ROM/DVD Filesystems 2134# CD-ROM/DVD Filesystems
2086# 2135#
2087CONFIG_ISO9660_FS=y 2136CONFIG_ISO9660_FS=y
@@ -2132,6 +2181,7 @@ CONFIG_MISC_FILESYSTEMS=y
2132# CONFIG_ROMFS_FS is not set 2181# CONFIG_ROMFS_FS is not set
2133# CONFIG_SYSV_FS is not set 2182# CONFIG_SYSV_FS is not set
2134# CONFIG_UFS_FS is not set 2183# CONFIG_UFS_FS is not set
2184# CONFIG_NILFS2_FS is not set
2135CONFIG_NETWORK_FILESYSTEMS=y 2185CONFIG_NETWORK_FILESYSTEMS=y
2136CONFIG_NFS_FS=y 2186CONFIG_NFS_FS=y
2137CONFIG_NFS_V3=y 2187CONFIG_NFS_V3=y
@@ -2145,7 +2195,6 @@ CONFIG_NFS_ACL_SUPPORT=y
2145CONFIG_NFS_COMMON=y 2195CONFIG_NFS_COMMON=y
2146CONFIG_SUNRPC=y 2196CONFIG_SUNRPC=y
2147CONFIG_SUNRPC_GSS=y 2197CONFIG_SUNRPC_GSS=y
2148# CONFIG_SUNRPC_REGISTER_V4 is not set
2149CONFIG_RPCSEC_GSS_KRB5=y 2198CONFIG_RPCSEC_GSS_KRB5=y
2150# CONFIG_RPCSEC_GSS_SPKM3 is not set 2199# CONFIG_RPCSEC_GSS_SPKM3 is not set
2151# CONFIG_SMB_FS is not set 2200# CONFIG_SMB_FS is not set
@@ -2232,6 +2281,7 @@ CONFIG_DEBUG_FS=y
2232CONFIG_DEBUG_KERNEL=y 2281CONFIG_DEBUG_KERNEL=y
2233# CONFIG_DEBUG_SHIRQ is not set 2282# CONFIG_DEBUG_SHIRQ is not set
2234# CONFIG_DETECT_SOFTLOCKUP is not set 2283# CONFIG_DETECT_SOFTLOCKUP is not set
2284# CONFIG_DETECT_HUNG_TASK is not set
2235# CONFIG_SCHED_DEBUG is not set 2285# CONFIG_SCHED_DEBUG is not set
2236CONFIG_SCHEDSTATS=y 2286CONFIG_SCHEDSTATS=y
2237CONFIG_TIMER_STATS=y 2287CONFIG_TIMER_STATS=y
@@ -2247,6 +2297,7 @@ CONFIG_TIMER_STATS=y
2247# CONFIG_LOCK_STAT is not set 2297# CONFIG_LOCK_STAT is not set
2248# CONFIG_DEBUG_SPINLOCK_SLEEP is not set 2298# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
2249# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set 2299# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
2300CONFIG_STACKTRACE=y
2250# CONFIG_DEBUG_KOBJECT is not set 2301# CONFIG_DEBUG_KOBJECT is not set
2251CONFIG_DEBUG_BUGVERBOSE=y 2302CONFIG_DEBUG_BUGVERBOSE=y
2252# CONFIG_DEBUG_INFO is not set 2303# CONFIG_DEBUG_INFO is not set
@@ -2269,13 +2320,19 @@ CONFIG_FRAME_POINTER=y
2269# CONFIG_FAULT_INJECTION is not set 2320# CONFIG_FAULT_INJECTION is not set
2270# CONFIG_LATENCYTOP is not set 2321# CONFIG_LATENCYTOP is not set
2271CONFIG_SYSCTL_SYSCALL_CHECK=y 2322CONFIG_SYSCTL_SYSCALL_CHECK=y
2323# CONFIG_DEBUG_PAGEALLOC is not set
2272CONFIG_USER_STACKTRACE_SUPPORT=y 2324CONFIG_USER_STACKTRACE_SUPPORT=y
2325CONFIG_NOP_TRACER=y
2273CONFIG_HAVE_FUNCTION_TRACER=y 2326CONFIG_HAVE_FUNCTION_TRACER=y
2274CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y 2327CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
2275CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y 2328CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2276CONFIG_HAVE_DYNAMIC_FTRACE=y 2329CONFIG_HAVE_DYNAMIC_FTRACE=y
2277CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2330CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2278CONFIG_HAVE_HW_BRANCH_TRACER=y 2331CONFIG_HAVE_HW_BRANCH_TRACER=y
2332CONFIG_HAVE_FTRACE_SYSCALLS=y
2333CONFIG_RING_BUFFER=y
2334CONFIG_TRACING=y
2335CONFIG_TRACING_SUPPORT=y
2279 2336
2280# 2337#
2281# Tracers 2338# Tracers
@@ -2285,13 +2342,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
2285# CONFIG_SYSPROF_TRACER is not set 2342# CONFIG_SYSPROF_TRACER is not set
2286# CONFIG_SCHED_TRACER is not set 2343# CONFIG_SCHED_TRACER is not set
2287# CONFIG_CONTEXT_SWITCH_TRACER is not set 2344# CONFIG_CONTEXT_SWITCH_TRACER is not set
2345# CONFIG_EVENT_TRACER is not set
2346# CONFIG_FTRACE_SYSCALLS is not set
2288# CONFIG_BOOT_TRACER is not set 2347# CONFIG_BOOT_TRACER is not set
2289# CONFIG_TRACE_BRANCH_PROFILING is not set 2348# CONFIG_TRACE_BRANCH_PROFILING is not set
2290# CONFIG_POWER_TRACER is not set 2349# CONFIG_POWER_TRACER is not set
2291# CONFIG_STACK_TRACER is not set 2350# CONFIG_STACK_TRACER is not set
2292# CONFIG_HW_BRANCH_TRACER is not set 2351# CONFIG_HW_BRANCH_TRACER is not set
2352# CONFIG_KMEMTRACE is not set
2353# CONFIG_WORKQUEUE_TRACER is not set
2354CONFIG_BLK_DEV_IO_TRACE=y
2355# CONFIG_FTRACE_STARTUP_TEST is not set
2356# CONFIG_MMIOTRACE is not set
2293CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2357CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2294# CONFIG_DYNAMIC_PRINTK_DEBUG is not set 2358# CONFIG_DYNAMIC_DEBUG is not set
2359# CONFIG_DMA_API_DEBUG is not set
2295# CONFIG_SAMPLES is not set 2360# CONFIG_SAMPLES is not set
2296CONFIG_HAVE_ARCH_KGDB=y 2361CONFIG_HAVE_ARCH_KGDB=y
2297# CONFIG_KGDB is not set 2362# CONFIG_KGDB is not set
@@ -2301,14 +2366,13 @@ CONFIG_EARLY_PRINTK=y
2301CONFIG_EARLY_PRINTK_DBGP=y 2366CONFIG_EARLY_PRINTK_DBGP=y
2302CONFIG_DEBUG_STACKOVERFLOW=y 2367CONFIG_DEBUG_STACKOVERFLOW=y
2303CONFIG_DEBUG_STACK_USAGE=y 2368CONFIG_DEBUG_STACK_USAGE=y
2304# CONFIG_DEBUG_PAGEALLOC is not set
2305# CONFIG_DEBUG_PER_CPU_MAPS is not set 2369# CONFIG_DEBUG_PER_CPU_MAPS is not set
2306# CONFIG_X86_PTDUMP is not set 2370# CONFIG_X86_PTDUMP is not set
2307CONFIG_DEBUG_RODATA=y 2371CONFIG_DEBUG_RODATA=y
2308# CONFIG_DEBUG_RODATA_TEST is not set 2372# CONFIG_DEBUG_RODATA_TEST is not set
2309CONFIG_DEBUG_NX_TEST=m 2373CONFIG_DEBUG_NX_TEST=m
2310# CONFIG_IOMMU_DEBUG is not set 2374# CONFIG_IOMMU_DEBUG is not set
2311# CONFIG_MMIOTRACE is not set 2375CONFIG_HAVE_MMIOTRACE_SUPPORT=y
2312CONFIG_IO_DELAY_TYPE_0X80=0 2376CONFIG_IO_DELAY_TYPE_0X80=0
2313CONFIG_IO_DELAY_TYPE_0XED=1 2377CONFIG_IO_DELAY_TYPE_0XED=1
2314CONFIG_IO_DELAY_TYPE_UDELAY=2 2378CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2344,6 +2408,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
2344CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 2408CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2345# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set 2409# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2346# CONFIG_SECURITY_SMACK is not set 2410# CONFIG_SECURITY_SMACK is not set
2411# CONFIG_SECURITY_TOMOYO is not set
2412# CONFIG_IMA is not set
2347CONFIG_CRYPTO=y 2413CONFIG_CRYPTO=y
2348 2414
2349# 2415#
@@ -2359,10 +2425,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
2359CONFIG_CRYPTO_HASH=y 2425CONFIG_CRYPTO_HASH=y
2360CONFIG_CRYPTO_HASH2=y 2426CONFIG_CRYPTO_HASH2=y
2361CONFIG_CRYPTO_RNG2=y 2427CONFIG_CRYPTO_RNG2=y
2428CONFIG_CRYPTO_PCOMP=y
2362CONFIG_CRYPTO_MANAGER=y 2429CONFIG_CRYPTO_MANAGER=y
2363CONFIG_CRYPTO_MANAGER2=y 2430CONFIG_CRYPTO_MANAGER2=y
2364# CONFIG_CRYPTO_GF128MUL is not set 2431# CONFIG_CRYPTO_GF128MUL is not set
2365# CONFIG_CRYPTO_NULL is not set 2432# CONFIG_CRYPTO_NULL is not set
2433CONFIG_CRYPTO_WORKQUEUE=y
2366# CONFIG_CRYPTO_CRYPTD is not set 2434# CONFIG_CRYPTO_CRYPTD is not set
2367CONFIG_CRYPTO_AUTHENC=y 2435CONFIG_CRYPTO_AUTHENC=y
2368# CONFIG_CRYPTO_TEST is not set 2436# CONFIG_CRYPTO_TEST is not set
@@ -2414,6 +2482,7 @@ CONFIG_CRYPTO_SHA1=y
2414# 2482#
2415CONFIG_CRYPTO_AES=y 2483CONFIG_CRYPTO_AES=y
2416# CONFIG_CRYPTO_AES_X86_64 is not set 2484# CONFIG_CRYPTO_AES_X86_64 is not set
2485# CONFIG_CRYPTO_AES_NI_INTEL is not set
2417# CONFIG_CRYPTO_ANUBIS is not set 2486# CONFIG_CRYPTO_ANUBIS is not set
2418CONFIG_CRYPTO_ARC4=y 2487CONFIG_CRYPTO_ARC4=y
2419# CONFIG_CRYPTO_BLOWFISH is not set 2488# CONFIG_CRYPTO_BLOWFISH is not set
@@ -2435,6 +2504,7 @@ CONFIG_CRYPTO_DES=y
2435# Compression 2504# Compression
2436# 2505#
2437# CONFIG_CRYPTO_DEFLATE is not set 2506# CONFIG_CRYPTO_DEFLATE is not set
2507# CONFIG_CRYPTO_ZLIB is not set
2438# CONFIG_CRYPTO_LZO is not set 2508# CONFIG_CRYPTO_LZO is not set
2439 2509
2440# 2510#
@@ -2444,10 +2514,12 @@ CONFIG_CRYPTO_DES=y
2444CONFIG_CRYPTO_HW=y 2514CONFIG_CRYPTO_HW=y
2445# CONFIG_CRYPTO_DEV_HIFN_795X is not set 2515# CONFIG_CRYPTO_DEV_HIFN_795X is not set
2446CONFIG_HAVE_KVM=y 2516CONFIG_HAVE_KVM=y
2517CONFIG_HAVE_KVM_IRQCHIP=y
2447CONFIG_VIRTUALIZATION=y 2518CONFIG_VIRTUALIZATION=y
2448# CONFIG_KVM is not set 2519# CONFIG_KVM is not set
2449# CONFIG_VIRTIO_PCI is not set 2520# CONFIG_VIRTIO_PCI is not set
2450# CONFIG_VIRTIO_BALLOON is not set 2521# CONFIG_VIRTIO_BALLOON is not set
2522CONFIG_BINARY_PRINTF=y
2451 2523
2452# 2524#
2453# Library routines 2525# Library routines
@@ -2464,7 +2536,10 @@ CONFIG_CRC32=y
2464# CONFIG_CRC7 is not set 2536# CONFIG_CRC7 is not set
2465# CONFIG_LIBCRC32C is not set 2537# CONFIG_LIBCRC32C is not set
2466CONFIG_ZLIB_INFLATE=y 2538CONFIG_ZLIB_INFLATE=y
2467CONFIG_PLIST=y 2539CONFIG_DECOMPRESS_GZIP=y
2540CONFIG_DECOMPRESS_BZIP2=y
2541CONFIG_DECOMPRESS_LZMA=y
2468CONFIG_HAS_IOMEM=y 2542CONFIG_HAS_IOMEM=y
2469CONFIG_HAS_IOPORT=y 2543CONFIG_HAS_IOPORT=y
2470CONFIG_HAS_DMA=y 2544CONFIG_HAS_DMA=y
2545CONFIG_NLATTR=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index ebe7deedd5b..cfb0010fa94 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,8 @@
2# Arch-specific CryptoAPI modules. 2# Arch-specific CryptoAPI modules.
3# 3#
4 4
5obj-$(CONFIG_CRYPTO_FPU) += fpu.o
6
5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o 7obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o 8obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o 9obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 02af0af6549..4e663398f77 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -21,6 +21,22 @@
21#include <asm/i387.h> 21#include <asm/i387.h>
22#include <asm/aes.h> 22#include <asm/aes.h>
23 23
24#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
25#define HAS_CTR
26#endif
27
28#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
29#define HAS_LRW
30#endif
31
32#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
33#define HAS_PCBC
34#endif
35
36#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
37#define HAS_XTS
38#endif
39
24struct async_aes_ctx { 40struct async_aes_ctx {
25 struct cryptd_ablkcipher *cryptd_tfm; 41 struct cryptd_ablkcipher *cryptd_tfm;
26}; 42};
@@ -137,6 +153,41 @@ static struct crypto_alg aesni_alg = {
137 } 153 }
138}; 154};
139 155
156static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
157{
158 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
159
160 aesni_enc(ctx, dst, src);
161}
162
163static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
164{
165 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
166
167 aesni_dec(ctx, dst, src);
168}
169
170static struct crypto_alg __aesni_alg = {
171 .cra_name = "__aes-aesni",
172 .cra_driver_name = "__driver-aes-aesni",
173 .cra_priority = 0,
174 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
175 .cra_blocksize = AES_BLOCK_SIZE,
176 .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
177 .cra_alignmask = 0,
178 .cra_module = THIS_MODULE,
179 .cra_list = LIST_HEAD_INIT(__aesni_alg.cra_list),
180 .cra_u = {
181 .cipher = {
182 .cia_min_keysize = AES_MIN_KEY_SIZE,
183 .cia_max_keysize = AES_MAX_KEY_SIZE,
184 .cia_setkey = aes_set_key,
185 .cia_encrypt = __aes_encrypt,
186 .cia_decrypt = __aes_decrypt
187 }
188 }
189};
190
140static int ecb_encrypt(struct blkcipher_desc *desc, 191static int ecb_encrypt(struct blkcipher_desc *desc,
141 struct scatterlist *dst, struct scatterlist *src, 192 struct scatterlist *dst, struct scatterlist *src,
142 unsigned int nbytes) 193 unsigned int nbytes)
@@ -277,8 +328,16 @@ static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
277 unsigned int key_len) 328 unsigned int key_len)
278{ 329{
279 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); 330 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
331 struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
332 int err;
280 333
281 return crypto_ablkcipher_setkey(&ctx->cryptd_tfm->base, key, key_len); 334 crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
335 crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
336 & CRYPTO_TFM_REQ_MASK);
337 err = crypto_ablkcipher_setkey(child, key, key_len);
338 crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
339 & CRYPTO_TFM_RES_MASK);
340 return err;
282} 341}
283 342
284static int ablk_encrypt(struct ablkcipher_request *req) 343static int ablk_encrypt(struct ablkcipher_request *req)
@@ -411,6 +470,163 @@ static struct crypto_alg ablk_cbc_alg = {
411 }, 470 },
412}; 471};
413 472
473#ifdef HAS_CTR
474static int ablk_ctr_init(struct crypto_tfm *tfm)
475{
476 struct cryptd_ablkcipher *cryptd_tfm;
477
478 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))",
479 0, 0);
480 if (IS_ERR(cryptd_tfm))
481 return PTR_ERR(cryptd_tfm);
482 ablk_init_common(tfm, cryptd_tfm);
483 return 0;
484}
485
486static struct crypto_alg ablk_ctr_alg = {
487 .cra_name = "ctr(aes)",
488 .cra_driver_name = "ctr-aes-aesni",
489 .cra_priority = 400,
490 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
491 .cra_blocksize = 1,
492 .cra_ctxsize = sizeof(struct async_aes_ctx),
493 .cra_alignmask = 0,
494 .cra_type = &crypto_ablkcipher_type,
495 .cra_module = THIS_MODULE,
496 .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
497 .cra_init = ablk_ctr_init,
498 .cra_exit = ablk_exit,
499 .cra_u = {
500 .ablkcipher = {
501 .min_keysize = AES_MIN_KEY_SIZE,
502 .max_keysize = AES_MAX_KEY_SIZE,
503 .ivsize = AES_BLOCK_SIZE,
504 .setkey = ablk_set_key,
505 .encrypt = ablk_encrypt,
506 .decrypt = ablk_decrypt,
507 .geniv = "chainiv",
508 },
509 },
510};
511#endif
512
513#ifdef HAS_LRW
514static int ablk_lrw_init(struct crypto_tfm *tfm)
515{
516 struct cryptd_ablkcipher *cryptd_tfm;
517
518 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))",
519 0, 0);
520 if (IS_ERR(cryptd_tfm))
521 return PTR_ERR(cryptd_tfm);
522 ablk_init_common(tfm, cryptd_tfm);
523 return 0;
524}
525
526static struct crypto_alg ablk_lrw_alg = {
527 .cra_name = "lrw(aes)",
528 .cra_driver_name = "lrw-aes-aesni",
529 .cra_priority = 400,
530 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
531 .cra_blocksize = AES_BLOCK_SIZE,
532 .cra_ctxsize = sizeof(struct async_aes_ctx),
533 .cra_alignmask = 0,
534 .cra_type = &crypto_ablkcipher_type,
535 .cra_module = THIS_MODULE,
536 .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
537 .cra_init = ablk_lrw_init,
538 .cra_exit = ablk_exit,
539 .cra_u = {
540 .ablkcipher = {
541 .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
542 .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
543 .ivsize = AES_BLOCK_SIZE,
544 .setkey = ablk_set_key,
545 .encrypt = ablk_encrypt,
546 .decrypt = ablk_decrypt,
547 },
548 },
549};
550#endif
551
552#ifdef HAS_PCBC
553static int ablk_pcbc_init(struct crypto_tfm *tfm)
554{
555 struct cryptd_ablkcipher *cryptd_tfm;
556
557 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))",
558 0, 0);
559 if (IS_ERR(cryptd_tfm))
560 return PTR_ERR(cryptd_tfm);
561 ablk_init_common(tfm, cryptd_tfm);
562 return 0;
563}
564
565static struct crypto_alg ablk_pcbc_alg = {
566 .cra_name = "pcbc(aes)",
567 .cra_driver_name = "pcbc-aes-aesni",
568 .cra_priority = 400,
569 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
570 .cra_blocksize = AES_BLOCK_SIZE,
571 .cra_ctxsize = sizeof(struct async_aes_ctx),
572 .cra_alignmask = 0,
573 .cra_type = &crypto_ablkcipher_type,
574 .cra_module = THIS_MODULE,
575 .cra_list = LIST_HEAD_INIT(ablk_pcbc_alg.cra_list),
576 .cra_init = ablk_pcbc_init,
577 .cra_exit = ablk_exit,
578 .cra_u = {
579 .ablkcipher = {
580 .min_keysize = AES_MIN_KEY_SIZE,
581 .max_keysize = AES_MAX_KEY_SIZE,
582 .ivsize = AES_BLOCK_SIZE,
583 .setkey = ablk_set_key,
584 .encrypt = ablk_encrypt,
585 .decrypt = ablk_decrypt,
586 },
587 },
588};
589#endif
590
591#ifdef HAS_XTS
592static int ablk_xts_init(struct crypto_tfm *tfm)
593{
594 struct cryptd_ablkcipher *cryptd_tfm;
595
596 cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))",
597 0, 0);
598 if (IS_ERR(cryptd_tfm))
599 return PTR_ERR(cryptd_tfm);
600 ablk_init_common(tfm, cryptd_tfm);
601 return 0;
602}
603
604static struct crypto_alg ablk_xts_alg = {
605 .cra_name = "xts(aes)",
606 .cra_driver_name = "xts-aes-aesni",
607 .cra_priority = 400,
608 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
609 .cra_blocksize = AES_BLOCK_SIZE,
610 .cra_ctxsize = sizeof(struct async_aes_ctx),
611 .cra_alignmask = 0,
612 .cra_type = &crypto_ablkcipher_type,
613 .cra_module = THIS_MODULE,
614 .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list),
615 .cra_init = ablk_xts_init,
616 .cra_exit = ablk_exit,
617 .cra_u = {
618 .ablkcipher = {
619 .min_keysize = 2 * AES_MIN_KEY_SIZE,
620 .max_keysize = 2 * AES_MAX_KEY_SIZE,
621 .ivsize = AES_BLOCK_SIZE,
622 .setkey = ablk_set_key,
623 .encrypt = ablk_encrypt,
624 .decrypt = ablk_decrypt,
625 },
626 },
627};
628#endif
629
414static int __init aesni_init(void) 630static int __init aesni_init(void)
415{ 631{
416 int err; 632 int err;
@@ -421,6 +637,8 @@ static int __init aesni_init(void)
421 } 637 }
422 if ((err = crypto_register_alg(&aesni_alg))) 638 if ((err = crypto_register_alg(&aesni_alg)))
423 goto aes_err; 639 goto aes_err;
640 if ((err = crypto_register_alg(&__aesni_alg)))
641 goto __aes_err;
424 if ((err = crypto_register_alg(&blk_ecb_alg))) 642 if ((err = crypto_register_alg(&blk_ecb_alg)))
425 goto blk_ecb_err; 643 goto blk_ecb_err;
426 if ((err = crypto_register_alg(&blk_cbc_alg))) 644 if ((err = crypto_register_alg(&blk_cbc_alg)))
@@ -429,9 +647,41 @@ static int __init aesni_init(void)
429 goto ablk_ecb_err; 647 goto ablk_ecb_err;
430 if ((err = crypto_register_alg(&ablk_cbc_alg))) 648 if ((err = crypto_register_alg(&ablk_cbc_alg)))
431 goto ablk_cbc_err; 649 goto ablk_cbc_err;
650#ifdef HAS_CTR
651 if ((err = crypto_register_alg(&ablk_ctr_alg)))
652 goto ablk_ctr_err;
653#endif
654#ifdef HAS_LRW
655 if ((err = crypto_register_alg(&ablk_lrw_alg)))
656 goto ablk_lrw_err;
657#endif
658#ifdef HAS_PCBC
659 if ((err = crypto_register_alg(&ablk_pcbc_alg)))
660 goto ablk_pcbc_err;
661#endif
662#ifdef HAS_XTS
663 if ((err = crypto_register_alg(&ablk_xts_alg)))
664 goto ablk_xts_err;
665#endif
432 666
433 return err; 667 return err;
434 668
669#ifdef HAS_XTS
670ablk_xts_err:
671#endif
672#ifdef HAS_PCBC
673 crypto_unregister_alg(&ablk_pcbc_alg);
674ablk_pcbc_err:
675#endif
676#ifdef HAS_LRW
677 crypto_unregister_alg(&ablk_lrw_alg);
678ablk_lrw_err:
679#endif
680#ifdef HAS_CTR
681 crypto_unregister_alg(&ablk_ctr_alg);
682ablk_ctr_err:
683#endif
684 crypto_unregister_alg(&ablk_cbc_alg);
435ablk_cbc_err: 685ablk_cbc_err:
436 crypto_unregister_alg(&ablk_ecb_alg); 686 crypto_unregister_alg(&ablk_ecb_alg);
437ablk_ecb_err: 687ablk_ecb_err:
@@ -439,6 +689,8 @@ ablk_ecb_err:
439blk_cbc_err: 689blk_cbc_err:
440 crypto_unregister_alg(&blk_ecb_alg); 690 crypto_unregister_alg(&blk_ecb_alg);
441blk_ecb_err: 691blk_ecb_err:
692 crypto_unregister_alg(&__aesni_alg);
693__aes_err:
442 crypto_unregister_alg(&aesni_alg); 694 crypto_unregister_alg(&aesni_alg);
443aes_err: 695aes_err:
444 return err; 696 return err;
@@ -446,10 +698,23 @@ aes_err:
446 698
447static void __exit aesni_exit(void) 699static void __exit aesni_exit(void)
448{ 700{
701#ifdef HAS_XTS
702 crypto_unregister_alg(&ablk_xts_alg);
703#endif
704#ifdef HAS_PCBC
705 crypto_unregister_alg(&ablk_pcbc_alg);
706#endif
707#ifdef HAS_LRW
708 crypto_unregister_alg(&ablk_lrw_alg);
709#endif
710#ifdef HAS_CTR
711 crypto_unregister_alg(&ablk_ctr_alg);
712#endif
449 crypto_unregister_alg(&ablk_cbc_alg); 713 crypto_unregister_alg(&ablk_cbc_alg);
450 crypto_unregister_alg(&ablk_ecb_alg); 714 crypto_unregister_alg(&ablk_ecb_alg);
451 crypto_unregister_alg(&blk_cbc_alg); 715 crypto_unregister_alg(&blk_cbc_alg);
452 crypto_unregister_alg(&blk_ecb_alg); 716 crypto_unregister_alg(&blk_ecb_alg);
717 crypto_unregister_alg(&__aesni_alg);
453 crypto_unregister_alg(&aesni_alg); 718 crypto_unregister_alg(&aesni_alg);
454} 719}
455 720
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
new file mode 100644
index 00000000000..5f9781a3815
--- /dev/null
+++ b/arch/x86/crypto/fpu.c
@@ -0,0 +1,166 @@
1/*
2 * FPU: Wrapper for blkcipher touching fpu
3 *
4 * Copyright (c) Intel Corp.
5 * Author: Huang Ying <ying.huang@intel.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published by the Free
9 * Software Foundation; either version 2 of the License, or (at your option)
10 * any later version.
11 *
12 */
13
14#include <crypto/algapi.h>
15#include <linux/err.h>
16#include <linux/init.h>
17#include <linux/kernel.h>
18#include <linux/module.h>
19#include <asm/i387.h>
20
21struct crypto_fpu_ctx {
22 struct crypto_blkcipher *child;
23};
24
25static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key,
26 unsigned int keylen)
27{
28 struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent);
29 struct crypto_blkcipher *child = ctx->child;
30 int err;
31
32 crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
33 crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) &
34 CRYPTO_TFM_REQ_MASK);
35 err = crypto_blkcipher_setkey(child, key, keylen);
36 crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) &
37 CRYPTO_TFM_RES_MASK);
38 return err;
39}
40
41static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in,
42 struct scatterlist *dst, struct scatterlist *src,
43 unsigned int nbytes)
44{
45 int err;
46 struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
47 struct crypto_blkcipher *child = ctx->child;
48 struct blkcipher_desc desc = {
49 .tfm = child,
50 .info = desc_in->info,
51 .flags = desc_in->flags,
52 };
53
54 kernel_fpu_begin();
55 err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes);
56 kernel_fpu_end();
57 return err;
58}
59
60static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in,
61 struct scatterlist *dst, struct scatterlist *src,
62 unsigned int nbytes)
63{
64 int err;
65 struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
66 struct crypto_blkcipher *child = ctx->child;
67 struct blkcipher_desc desc = {
68 .tfm = child,
69 .info = desc_in->info,
70 .flags = desc_in->flags,
71 };
72
73 kernel_fpu_begin();
74 err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes);
75 kernel_fpu_end();
76 return err;
77}
78
79static int crypto_fpu_init_tfm(struct crypto_tfm *tfm)
80{
81 struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
82 struct crypto_spawn *spawn = crypto_instance_ctx(inst);
83 struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
84 struct crypto_blkcipher *cipher;
85
86 cipher = crypto_spawn_blkcipher(spawn);
87 if (IS_ERR(cipher))
88 return PTR_ERR(cipher);
89
90 ctx->child = cipher;
91 return 0;
92}
93
94static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm)
95{
96 struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
97 crypto_free_blkcipher(ctx->child);
98}
99
100static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb)
101{
102 struct crypto_instance *inst;
103 struct crypto_alg *alg;
104 int err;
105
106 err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER);
107 if (err)
108 return ERR_PTR(err);
109
110 alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER,
111 CRYPTO_ALG_TYPE_MASK);
112 if (IS_ERR(alg))
113 return ERR_CAST(alg);
114
115 inst = crypto_alloc_instance("fpu", alg);
116 if (IS_ERR(inst))
117 goto out_put_alg;
118
119 inst->alg.cra_flags = alg->cra_flags;
120 inst->alg.cra_priority = alg->cra_priority;
121 inst->alg.cra_blocksize = alg->cra_blocksize;
122 inst->alg.cra_alignmask = alg->cra_alignmask;
123 inst->alg.cra_type = alg->cra_type;
124 inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize;
125 inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
126 inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize;
127 inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx);
128 inst->alg.cra_init = crypto_fpu_init_tfm;
129 inst->alg.cra_exit = crypto_fpu_exit_tfm;
130 inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey;
131 inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt;
132 inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt;
133
134out_put_alg:
135 crypto_mod_put(alg);
136 return inst;
137}
138
139static void crypto_fpu_free(struct crypto_instance *inst)
140{
141 crypto_drop_spawn(crypto_instance_ctx(inst));
142 kfree(inst);
143}
144
145static struct crypto_template crypto_fpu_tmpl = {
146 .name = "fpu",
147 .alloc = crypto_fpu_alloc,
148 .free = crypto_fpu_free,
149 .module = THIS_MODULE,
150};
151
152static int __init crypto_fpu_module_init(void)
153{
154 return crypto_register_template(&crypto_fpu_tmpl);
155}
156
157static void __exit crypto_fpu_module_exit(void)
158{
159 crypto_unregister_template(&crypto_fpu_tmpl);
160}
161
162module_init(crypto_fpu_module_init);
163module_exit(crypto_fpu_module_exit);
164
165MODULE_LICENSE("GPL");
166MODULE_DESCRIPTION("FPU block cipher wrapper");
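
The fpu template registered above wraps an existing blkcipher and re-exposes it with its encrypt/decrypt paths bracketed by kernel_fpu_begin()/kernel_fpu_end(); aesni-intel_glue.c then instantiates it as "fpu(ctr(__driver-aes-aesni))" and similar names through cryptd. A minimal usage sketch follows — it is not part of this patch, and the algorithm name, function and buffers are illustrative only — showing how a caller could allocate a blkcipher through the template:

#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/err.h>

static int fpu_wrapped_encrypt_demo(const u8 *key, unsigned int keylen,
				    u8 *buf, unsigned int len)
{
	/* hypothetical: wrap a plain ecb(aes) blkcipher in the fpu template */
	struct crypto_blkcipher *tfm;
	struct blkcipher_desc desc;
	struct scatterlist sg;
	int err;

	tfm = crypto_alloc_blkcipher("fpu(ecb(aes))", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	desc.tfm = tfm;
	desc.flags = 0;

	err = crypto_blkcipher_setkey(tfm, key, keylen);
	if (!err) {
		sg_init_one(&sg, buf, len);
		/* the inner cipher runs with FPU/SSE state saved around it */
		err = crypto_blkcipher_encrypt(&desc, &sg, &sg, len);
	}

	crypto_free_blkcipher(tfm);
	return err;
}
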
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a505202086e..e590261ba05 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -825,9 +825,11 @@ ia32_sys_call_table:
825 .quad compat_sys_signalfd4 825 .quad compat_sys_signalfd4
826 .quad sys_eventfd2 826 .quad sys_eventfd2
827 .quad sys_epoll_create1 827 .quad sys_epoll_create1
828 .quad sys_dup3 /* 330 */ 828 .quad sys_dup3 /* 330 */
829 .quad sys_pipe2 829 .quad sys_pipe2
830 .quad sys_inotify_init1 830 .quad sys_inotify_init1
831 .quad compat_sys_preadv 831 .quad compat_sys_preadv
832 .quad compat_sys_pwritev 832 .quad compat_sys_pwritev
833 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
834 .quad sys_perf_counter_open
833ia32_syscall_end: 835ia32_syscall_end:
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index f6aa18eadf7..1a37bcdc860 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -3,6 +3,7 @@
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/stddef.h> 5#include <linux/stddef.h>
6#include <linux/stringify.h>
6#include <asm/asm.h> 7#include <asm/asm.h>
7 8
8/* 9/*
@@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {}
74 75
75const unsigned char *const *find_nop_table(void); 76const unsigned char *const *find_nop_table(void);
76 77
78/* alternative assembly primitive: */
79#define ALTERNATIVE(oldinstr, newinstr, feature) \
80 \
81 "661:\n\t" oldinstr "\n662:\n" \
82 ".section .altinstructions,\"a\"\n" \
83 _ASM_ALIGN "\n" \
84 _ASM_PTR "661b\n" /* label */ \
85 _ASM_PTR "663f\n" /* new instruction */ \
86 " .byte " __stringify(feature) "\n" /* feature bit */ \
87 " .byte 662b-661b\n" /* sourcelen */ \
88 " .byte 664f-663f\n" /* replacementlen */ \
89 ".previous\n" \
90 ".section .altinstr_replacement, \"ax\"\n" \
91 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
92 ".previous"
93
77/* 94/*
78 * Alternative instructions for different CPU types or capabilities. 95 * Alternative instructions for different CPU types or capabilities.
79 * 96 *
@@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void);
87 * without volatile and memory clobber. 104 * without volatile and memory clobber.
88 */ 105 */
89#define alternative(oldinstr, newinstr, feature) \ 106#define alternative(oldinstr, newinstr, feature) \
90 asm volatile ("661:\n\t" oldinstr "\n662:\n" \ 107 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
91 ".section .altinstructions,\"a\"\n" \
92 _ASM_ALIGN "\n" \
93 _ASM_PTR "661b\n" /* label */ \
94 _ASM_PTR "663f\n" /* new instruction */ \
95 " .byte %c0\n" /* feature bit */ \
96 " .byte 662b-661b\n" /* sourcelen */ \
97 " .byte 664f-663f\n" /* replacementlen */ \
98 ".previous\n" \
99 ".section .altinstr_replacement,\"ax\"\n" \
100 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
101 ".previous" :: "i" (feature) : "memory")
102 108
103/* 109/*
104 * Alternative inline assembly with input. 110 * Alternative inline assembly with input.
@@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void);
109 * Best is to use constraints that are fixed size (like (%1) ... "r") 115 * Best is to use constraints that are fixed size (like (%1) ... "r")
110 * If you use variable sized constraints like "m" or "g" in the 116 * If you use variable sized constraints like "m" or "g" in the
111 * replacement make sure to pad to the worst case length. 117 * replacement make sure to pad to the worst case length.
118 * Leaving an unused argument 0 to keep API compatibility.
112 */ 119 */
113#define alternative_input(oldinstr, newinstr, feature, input...) \ 120#define alternative_input(oldinstr, newinstr, feature, input...) \
114 asm volatile ("661:\n\t" oldinstr "\n662:\n" \ 121 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
115 ".section .altinstructions,\"a\"\n" \ 122 : : "i" (0), ## input)
116 _ASM_ALIGN "\n" \
117 _ASM_PTR "661b\n" /* label */ \
118 _ASM_PTR "663f\n" /* new instruction */ \
119 " .byte %c0\n" /* feature bit */ \
120 " .byte 662b-661b\n" /* sourcelen */ \
121 " .byte 664f-663f\n" /* replacementlen */ \
122 ".previous\n" \
123 ".section .altinstr_replacement,\"ax\"\n" \
124 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
125 ".previous" :: "i" (feature), ##input)
126 123
127/* Like alternative_input, but with a single output argument */ 124/* Like alternative_input, but with a single output argument */
128#define alternative_io(oldinstr, newinstr, feature, output, input...) \ 125#define alternative_io(oldinstr, newinstr, feature, output, input...) \
129 asm volatile ("661:\n\t" oldinstr "\n662:\n" \ 126 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
130 ".section .altinstructions,\"a\"\n" \ 127 : output : "i" (0), ## input)
131 _ASM_ALIGN "\n" \
132 _ASM_PTR "661b\n" /* label */ \
133 _ASM_PTR "663f\n" /* new instruction */ \
134 " .byte %c[feat]\n" /* feature bit */ \
135 " .byte 662b-661b\n" /* sourcelen */ \
136 " .byte 664f-663f\n" /* replacementlen */ \
137 ".previous\n" \
138 ".section .altinstr_replacement,\"ax\"\n" \
139 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
140 ".previous" : output : [feat] "i" (feature), ##input)
141 128
142/* 129/*
143 * use this macro(s) if you need more than one output parameter 130 * use this macro(s) if you need more than one output parameter
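
The refactor above factors the open-coded .altinstructions boilerplate into a single string-producing ALTERNATIVE() macro that alternative(), alternative_input() and alternative_io() now share. A hedged sketch of how the string form composes directly with asm volatile — modeled on the existing alternative() wrapper; the barrier instruction chosen is only an example:

#include <asm/alternative.h>
#include <asm/cpufeature.h>

/* emit "lock; addl" on old CPUs, patch in MFENCE where SSE2 is present */
static inline void demo_mb(void)
{
	asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence",
				 X86_FEATURE_XMM2)
		     : : : "memory");
}
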
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index f712344329b..262e0282004 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -27,6 +27,8 @@ extern int amd_iommu_init(void);
27extern int amd_iommu_init_dma_ops(void); 27extern int amd_iommu_init_dma_ops(void);
28extern void amd_iommu_detect(void); 28extern void amd_iommu_detect(void);
29extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 29extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
30extern void amd_iommu_flush_all_domains(void);
31extern void amd_iommu_flush_all_devices(void);
30#else 32#else
31static inline int amd_iommu_init(void) { return -ENODEV; } 33static inline int amd_iommu_init(void) { return -ENODEV; }
32static inline void amd_iommu_detect(void) { } 34static inline void amd_iommu_detect(void) { }
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 95c8cd9d22b..0c878caaa0a 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -194,6 +194,27 @@
194#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ 194#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
195#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops 195#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
196 domain for an IOMMU */ 196 domain for an IOMMU */
197extern bool amd_iommu_dump;
198#define DUMP_printk(format, arg...) \
199 do { \
200 if (amd_iommu_dump) \
201 printk(KERN_INFO "AMD IOMMU: " format, ## arg); \
202 } while(0);
203
204/*
205 * Make iterating over all IOMMUs easier
206 */
207#define for_each_iommu(iommu) \
208 list_for_each_entry((iommu), &amd_iommu_list, list)
209#define for_each_iommu_safe(iommu, next) \
210 list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
211
212#define APERTURE_RANGE_SHIFT 27 /* 128 MB */
213#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT)
214#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
215#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */
216#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
217#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
197 218
198/* 219/*
199 * This structure contains generic data for IOMMU protection domains 220 * This structure contains generic data for IOMMU protection domains
@@ -210,6 +231,26 @@ struct protection_domain {
210}; 231};
211 232
212/* 233/*
234 * For dynamic growth the aperture size is split into ranges of 128MB of
235 * DMA address space each. This struct represents one such range.
236 */
237struct aperture_range {
238
239 /* address allocation bitmap */
240 unsigned long *bitmap;
241
242 /*
243 * Array of PTE pages for the aperture. In this array we save all the
244 * leaf pages of the domain page table used for the aperture. This way
245 * we don't need to walk the page table to find a specific PTE. We can
246 * just calculate its address in constant time.
247 */
248 u64 *pte_pages[64];
249
250 unsigned long offset;
251};
252
253/*
213 * Data container for a dma_ops specific protection domain 254 * Data container for a dma_ops specific protection domain
214 */ 255 */
215struct dma_ops_domain { 256struct dma_ops_domain {
@@ -222,18 +263,10 @@ struct dma_ops_domain {
222 unsigned long aperture_size; 263 unsigned long aperture_size;
223 264
224 /* address we start to search for free addresses */ 265 /* address we start to search for free addresses */
225 unsigned long next_bit; 266 unsigned long next_address;
226
227 /* address allocation bitmap */
228 unsigned long *bitmap;
229 267
230 /* 268 /* address space relevant data */
231 * Array of PTE pages for the aperture. In this array we save all the 269 struct aperture_range *aperture[APERTURE_MAX_RANGES];
232 * leaf pages of the domain page table used for the aperture. This way
233 * we don't need to walk the page table to find a specific PTE. We can
234 * just calculate its address in constant time.
235 */
236 u64 **pte_pages;
237 270
238 /* This will be set to true when TLB needs to be flushed */ 271 /* This will be set to true when TLB needs to be flushed */
239 bool need_flush; 272 bool need_flush;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 42f2f837742..bb7d4792584 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -107,8 +107,7 @@ extern u32 native_safe_apic_wait_icr_idle(void);
107extern void native_apic_icr_write(u32 low, u32 id); 107extern void native_apic_icr_write(u32 low, u32 id);
108extern u64 native_apic_icr_read(void); 108extern u64 native_apic_icr_read(void);
109 109
110#define EIM_8BIT_APIC_ID 0 110extern int x2apic_mode;
111#define EIM_32BIT_APIC_ID 1
112 111
113#ifdef CONFIG_X86_X2APIC 112#ifdef CONFIG_X86_X2APIC
114/* 113/*
@@ -166,10 +165,9 @@ static inline u64 native_x2apic_icr_read(void)
166 return val; 165 return val;
167} 166}
168 167
169extern int x2apic, x2apic_phys; 168extern int x2apic_phys;
170extern void check_x2apic(void); 169extern void check_x2apic(void);
171extern void enable_x2apic(void); 170extern void enable_x2apic(void);
172extern void enable_IR_x2apic(void);
173extern void x2apic_icr_write(u32 low, u32 id); 171extern void x2apic_icr_write(u32 low, u32 id);
174static inline int x2apic_enabled(void) 172static inline int x2apic_enabled(void)
175{ 173{
@@ -183,6 +181,8 @@ static inline int x2apic_enabled(void)
183 return 1; 181 return 1;
184 return 0; 182 return 0;
185} 183}
184
185#define x2apic_supported() (cpu_has_x2apic)
186#else 186#else
187static inline void check_x2apic(void) 187static inline void check_x2apic(void)
188{ 188{
@@ -190,28 +190,20 @@ static inline void check_x2apic(void)
190static inline void enable_x2apic(void) 190static inline void enable_x2apic(void)
191{ 191{
192} 192}
193static inline void enable_IR_x2apic(void)
194{
195}
196static inline int x2apic_enabled(void) 193static inline int x2apic_enabled(void)
197{ 194{
198 return 0; 195 return 0;
199} 196}
200 197
201#define x2apic 0 198#define x2apic_preenabled 0
202 199#define x2apic_supported() 0
203#endif 200#endif
204 201
205extern int get_physical_broadcast(void); 202extern void enable_IR_x2apic(void);
206 203
207#ifdef CONFIG_X86_X2APIC 204extern int get_physical_broadcast(void);
208static inline void ack_x2APIC_irq(void)
209{
210 /* Docs say use 0 for future compatibility */
211 native_apic_msr_write(APIC_EOI, 0);
212}
213#endif
214 205
206extern void apic_disable(void);
215extern int lapic_get_maxlvt(void); 207extern int lapic_get_maxlvt(void);
216extern void clear_local_APIC(void); 208extern void clear_local_APIC(void);
217extern void connect_bsp_APIC(void); 209extern void connect_bsp_APIC(void);
@@ -252,7 +244,7 @@ static inline void lapic_shutdown(void) { }
252#define local_apic_timer_c2_ok 1 244#define local_apic_timer_c2_ok 1
253static inline void init_apic_mappings(void) { } 245static inline void init_apic_mappings(void) { }
254static inline void disable_local_APIC(void) { } 246static inline void disable_local_APIC(void) { }
255 247static inline void apic_disable(void) { }
256#endif /* !CONFIG_X86_LOCAL_APIC */ 248#endif /* !CONFIG_X86_LOCAL_APIC */
257 249
258#ifdef CONFIG_X86_64 250#ifdef CONFIG_X86_64
@@ -410,7 +402,7 @@ static inline unsigned default_get_apic_id(unsigned long x)
410{ 402{
411 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); 403 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
412 404
413 if (APIC_XAPIC(ver)) 405 if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
414 return (x >> 24) & 0xFF; 406 return (x >> 24) & 0xFF;
415 else 407 else
416 return (x >> 24) & 0x0F; 408 return (x >> 24) & 0x0F;
@@ -478,6 +470,9 @@ static inline unsigned int read_apic_id(void)
478extern void default_setup_apic_routing(void); 470extern void default_setup_apic_routing(void);
479 471
480#ifdef CONFIG_X86_32 472#ifdef CONFIG_X86_32
473
474extern struct apic apic_default;
475
481/* 476/*
482 * Set up the logical destination ID. 477 * Set up the logical destination ID.
483 * 478 *
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index bc9514fb3b1..7ddb36ab933 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -22,6 +22,7 @@
22# define APIC_INTEGRATED(x) (1) 22# define APIC_INTEGRATED(x) (1)
23#endif 23#endif
24#define APIC_XAPIC(x) ((x) >= 0x14) 24#define APIC_XAPIC(x) ((x) >= 0x14)
25#define APIC_EXT_SPACE(x) ((x) & 0x80000000)
25#define APIC_TASKPRI 0x80 26#define APIC_TASKPRI 0x80
26#define APIC_TPRI_MASK 0xFFu 27#define APIC_TPRI_MASK 0xFFu
27#define APIC_ARBPRI 0x90 28#define APIC_ARBPRI 0x90
@@ -116,7 +117,9 @@
116#define APIC_TDR_DIV_32 0x8 117#define APIC_TDR_DIV_32 0x8
117#define APIC_TDR_DIV_64 0x9 118#define APIC_TDR_DIV_64 0x9
118#define APIC_TDR_DIV_128 0xA 119#define APIC_TDR_DIV_128 0xA
119#define APIC_EILVT0 0x500 120#define APIC_EFEAT 0x400
121#define APIC_ECTRL 0x410
122#define APIC_EILVTn(n) (0x500 + 0x10 * n)
120#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ 123#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
121#define APIC_EILVT_NR_AMD_10H 4 124#define APIC_EILVT_NR_AMD_10H 4
122#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) 125#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
@@ -125,9 +128,6 @@
125#define APIC_EILVT_MSG_NMI 0x4 128#define APIC_EILVT_MSG_NMI 0x4
126#define APIC_EILVT_MSG_EXT 0x7 129#define APIC_EILVT_MSG_EXT 0x7
127#define APIC_EILVT_MASKED (1 << 16) 130#define APIC_EILVT_MASKED (1 << 16)
128#define APIC_EILVT1 0x510
129#define APIC_EILVT2 0x520
130#define APIC_EILVT3 0x530
131 131
132#define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) 132#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
133#define APIC_BASE_MSR 0x800 133#define APIC_BASE_MSR 0x800
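
The fixed APIC_EILVT0..3 constants are replaced by the parameterised APIC_EILVTn(n), which scales with APIC_EILVT_NR_AMD_10H instead of hard-coding four registers. An illustrative sketch only — the helper name is made up; apic_write() and APIC_EILVT_MASKED are the existing primitives:

#include <asm/apic.h>
#include <asm/apicdef.h>

/* mask every AMD extended LVT entry of a family-0x10 CPU */
static void demo_mask_ext_lvt_entries(void)
{
	unsigned int i;

	for (i = 0; i < APIC_EILVT_NR_AMD_10H; i++)
		apic_write(APIC_EILVTn(i), APIC_EILVT_MASKED);
}
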
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba422..8cb9c814e12 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,241 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
247#define smp_mb__before_atomic_inc() barrier() 247#define smp_mb__before_atomic_inc() barrier()
248#define smp_mb__after_atomic_inc() barrier() 248#define smp_mb__after_atomic_inc() barrier()
249 249
250#include <asm-generic/atomic.h> 250/* An 64bit atomic type */
251
252typedef struct {
253 unsigned long long counter;
254} atomic64_t;
255
256#define ATOMIC64_INIT(val) { (val) }
257
258/**
259 * atomic64_read - read atomic64 variable
260 * @v: pointer of type atomic64_t
261 *
262 * Atomically reads the value of @v.
263 * Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292
293/**
294 * atomic64_xchg - xchg atomic64 variable
295 * @ptr: pointer to type atomic64_t
296 * @new_val: value to assign
297 * @old_val: old value that was there
298 *
299 * Atomically xchgs the value of @ptr to @new_val and returns
300 * the old value.
301 */
302
303static inline unsigned long long
304atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
305{
306 unsigned long long old_val;
307
308 do {
309 old_val = atomic_read(ptr);
310 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
311
312 return old_val;
313}
314
315/**
316 * atomic64_set - set atomic64 variable
317 * @ptr: pointer to type atomic64_t
318 * @new_val: value to assign
319 *
320 * Atomically sets the value of @ptr to @new_val.
321 */
322static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
323{
324 atomic64_xchg(ptr, new_val);
325}
326
327/**
328 * atomic64_read - read atomic64 variable
329 * @ptr: pointer to type atomic64_t
330 *
331 * Atomically reads the value of @ptr and returns it.
332 */
333static inline unsigned long long atomic64_read(atomic64_t *ptr)
334{
335 unsigned long long curr_val;
336
337 do {
338 curr_val = __atomic64_read(ptr);
339 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
340
341 return curr_val;
342}
343
344/**
345 * atomic64_add_return - add and return
346 * @delta: integer value to add
347 * @ptr: pointer to type atomic64_t
348 *
349 * Atomically adds @delta to @ptr and returns @delta + *@ptr
350 */
351static inline unsigned long long
352atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
353{
354 unsigned long long old_val, new_val;
355
356 do {
357 old_val = atomic_read(ptr);
358 new_val = old_val + delta;
359
360 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
361
362 return new_val;
363}
364
365static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
366{
367 return atomic64_add_return(-delta, ptr);
368}
369
370static inline long atomic64_inc_return(atomic64_t *ptr)
371{
372 return atomic64_add_return(1, ptr);
373}
374
375static inline long atomic64_dec_return(atomic64_t *ptr)
376{
377 return atomic64_sub_return(1, ptr);
378}
379
380/**
381 * atomic64_add - add integer to atomic64 variable
382 * @delta: integer value to add
383 * @ptr: pointer to type atomic64_t
384 *
385 * Atomically adds @delta to @ptr.
386 */
387static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
388{
389 atomic64_add_return(delta, ptr);
390}
391
392/**
393 * atomic64_sub - subtract the atomic64 variable
394 * @delta: integer value to subtract
395 * @ptr: pointer to type atomic64_t
396 *
397 * Atomically subtracts @delta from @ptr.
398 */
399static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
400{
401 atomic64_add(-delta, ptr);
402}
403
404/**
405 * atomic64_sub_and_test - subtract value from variable and test result
406 * @delta: integer value to subtract
407 * @ptr: pointer to type atomic64_t
408 *
409 * Atomically subtracts @delta from @ptr and returns
410 * true if the result is zero, or false for all
411 * other cases.
412 */
413static inline int
414atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
415{
416 unsigned long long old_val = atomic64_sub_return(delta, ptr);
417
418 return old_val == 0;
419}
420
421/**
422 * atomic64_inc - increment atomic64 variable
423 * @ptr: pointer to type atomic64_t
424 *
425 * Atomically increments @ptr by 1.
426 */
427static inline void atomic64_inc(atomic64_t *ptr)
428{
429 atomic64_add(1, ptr);
430}
431
432/**
433 * atomic64_dec - decrement atomic64 variable
434 * @ptr: pointer to type atomic64_t
435 *
436 * Atomically decrements @ptr by 1.
437 */
438static inline void atomic64_dec(atomic64_t *ptr)
439{
440 atomic64_sub(1, ptr);
441}
442
443/**
444 * atomic64_dec_and_test - decrement and test
445 * @ptr: pointer to type atomic64_t
446 *
447 * Atomically decrements @ptr by 1 and
448 * returns true if the result is 0, or false for all other
449 * cases.
450 */
451static inline int atomic64_dec_and_test(atomic64_t *ptr)
452{
453 return atomic64_sub_and_test(1, ptr);
454}
455
456/**
457 * atomic64_inc_and_test - increment and test
458 * @ptr: pointer to type atomic64_t
459 *
460 * Atomically increments @ptr by 1
461 * and returns true if the result is zero, or false for all
462 * other cases.
463 */
464static inline int atomic64_inc_and_test(atomic64_t *ptr)
465{
466 return atomic64_sub_and_test(-1, ptr);
467}
468
469/**
470 * atomic64_add_negative - add and test if negative
471 * @delta: integer value to add
472 * @ptr: pointer to type atomic64_t
473 *
474 * Atomically adds @delta to @ptr and returns true
475 * if the result is negative, or false when
476 * result is greater than or equal to zero.
477 */
478static inline int
479atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
480{
481 long long old_val = atomic64_add_return(delta, ptr);
482
483 return old_val < 0;
484}
485
486#include <asm-generic/atomic-long.h>
251#endif /* _ASM_X86_ATOMIC_32_H */ 487#endif /* _ASM_X86_ATOMIC_32_H */
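
The new 32-bit atomic64_t is built entirely on LOCK CMPXCHG8B: each operation loops in atomic64_cmpxchg() until the 8-byte compare-exchange succeeds, so reads and updates are tear-free even though the counter spans two 32-bit words. A short usage sketch — the counter and function are illustrative, not from the patch:

#include <asm/atomic.h>

static atomic64_t demo_bytes = ATOMIC64_INIT(0);

static void demo_account(unsigned long long len)
{
	atomic64_add(len, &demo_bytes);

	/* atomic64_read() is itself a cmpxchg8b loop, so the 64-bit
	 * value is read consistently on 32-bit CPUs */
	if (atomic64_read(&demo_bytes) >= (1ULL << 32))
		atomic64_set(&demo_bytes, 0);
}
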
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
index 8c21731984d..0d636022000 100644
--- a/arch/x86/include/asm/atomic_64.h
+++ b/arch/x86/include/asm/atomic_64.h
@@ -455,5 +455,5 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
455#define smp_mb__before_atomic_inc() barrier() 455#define smp_mb__before_atomic_inc() barrier()
456#define smp_mb__after_atomic_inc() barrier() 456#define smp_mb__after_atomic_inc() barrier()
457 457
458#include <asm-generic/atomic.h> 458#include <asm-generic/atomic-long.h>
459#endif /* _ASM_X86_ATOMIC_64_H */ 459#endif /* _ASM_X86_ATOMIC_64_H */
diff --git a/arch/x86/include/asm/bitsperlong.h b/arch/x86/include/asm/bitsperlong.h
new file mode 100644
index 00000000000..b0ae1c4dc79
--- /dev/null
+++ b/arch/x86/include/asm/bitsperlong.h
@@ -0,0 +1,13 @@
1#ifndef __ASM_X86_BITSPERLONG_H
2#define __ASM_X86_BITSPERLONG_H
3
4#ifdef __x86_64__
5# define __BITS_PER_LONG 64
6#else
7# define __BITS_PER_LONG 32
8#endif
9
10#include <asm-generic/bitsperlong.h>
11
12#endif /* __ASM_X86_BITSPERLONG_H */
13
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 6ba23dd9fc9..418e632d4a8 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -8,11 +8,26 @@
8 8
9#ifdef __KERNEL__ 9#ifdef __KERNEL__
10 10
11#include <asm/page_types.h>
12
11/* Physical address where kernel should be loaded. */ 13/* Physical address where kernel should be loaded. */
12#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ 14#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
13 + (CONFIG_PHYSICAL_ALIGN - 1)) \ 15 + (CONFIG_PHYSICAL_ALIGN - 1)) \
14 & ~(CONFIG_PHYSICAL_ALIGN - 1)) 16 & ~(CONFIG_PHYSICAL_ALIGN - 1))
15 17
18/* Minimum kernel alignment, as a power of two */
19#ifdef CONFIG_x86_64
20#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
21#else
22#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1)
23#endif
24#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
25
26#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
27 (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
28#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
29#endif
30
16#ifdef CONFIG_KERNEL_BZIP2 31#ifdef CONFIG_KERNEL_BZIP2
17#define BOOT_HEAP_SIZE 0x400000 32#define BOOT_HEAP_SIZE 0x400000
18#else /* !CONFIG_KERNEL_BZIP2 */ 33#else /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 433adaebf9b..1724e8de317 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -50,7 +50,8 @@ struct setup_header {
50 __u32 ramdisk_size; 50 __u32 ramdisk_size;
51 __u32 bootsect_kludge; 51 __u32 bootsect_kludge;
52 __u16 heap_end_ptr; 52 __u16 heap_end_ptr;
53 __u16 _pad1; 53 __u8 ext_loader_ver;
54 __u8 ext_loader_type;
54 __u32 cmd_line_ptr; 55 __u32 cmd_line_ptr;
55 __u32 initrd_addr_max; 56 __u32 initrd_addr_max;
56 __u32 kernel_alignment; 57 __u32 kernel_alignment;
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
index 222802029fa..d96c1ee3a95 100644
--- a/arch/x86/include/asm/cpu_debug.h
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -86,105 +86,7 @@ enum cpu_file_bit {
86 CPU_VALUE_BIT, /* value */ 86 CPU_VALUE_BIT, /* value */
87}; 87};
88 88
89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) 89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
90
91/*
92 * DisplayFamily_DisplayModel Processor Families/Processor Number Series
93 * -------------------------- ------------------------------------------
94 * 05_01, 05_02, 05_04 Pentium, Pentium with MMX
95 *
96 * 06_01 Pentium Pro
97 * 06_03, 06_05 Pentium II Xeon, Pentium II
 98 * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentium III
99 *
100 * 06_09, 060D Pentium M
101 *
102 * 06_0E Core Duo, Core Solo
103 *
104 * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series,
105 * Core 2 Quad, Core 2 Extreme, Core 2 Duo,
106 * Pentium dual-core
107 * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650
108 *
109 * 06_1C Atom
110 *
111 * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4
112 * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D
113 *
114 * 0F_06 Xeon 7100, 5000 Series, Xeon MP,
115 * Pentium 4, Pentium D
116 */
117
118/* Register processors bits */
119enum cpu_processor_bit {
120 CPU_NONE,
121/* Intel */
122 CPU_INTEL_PENTIUM_BIT,
123 CPU_INTEL_P6_BIT,
124 CPU_INTEL_PENTIUM_M_BIT,
125 CPU_INTEL_CORE_BIT,
126 CPU_INTEL_CORE2_BIT,
127 CPU_INTEL_ATOM_BIT,
128 CPU_INTEL_XEON_P4_BIT,
129 CPU_INTEL_XEON_MP_BIT,
130/* AMD */
131 CPU_AMD_K6_BIT,
132 CPU_AMD_K7_BIT,
133 CPU_AMD_K8_BIT,
134 CPU_AMD_0F_BIT,
135 CPU_AMD_10_BIT,
136 CPU_AMD_11_BIT,
137};
138
139#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT)
140#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT)
141#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT)
142#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT)
143#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT)
144#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT)
145#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT)
146#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT)
147
148#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
149#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2)
150#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
151#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM)
152#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
153#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM)
154#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON)
155#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON)
156#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
157#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON)
158#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON)
159#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT)
160#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX)
161#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE)
162#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
163#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
164#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
165#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT)
166#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE)
167
168/* Select all supported Intel CPUs */
169#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
170
171#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT)
172#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT)
173#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT)
174#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT)
175#define CPU_AMD_10 (1 << CPU_AMD_10_BIT)
176#define CPU_AMD_11 (1 << CPU_AMD_11_BIT)
177
178#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11)
179#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS)
180#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS)
181#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS)
182
183/* Select all supported AMD CPUs */
184#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS)
185
186/* Select all supported CPUs */
187#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL)
188 90
189#define MAX_CPU_FILES 512 91#define MAX_CPU_FILES 512
190 92
@@ -220,7 +122,6 @@ struct cpu_debug_range {
220 unsigned min; /* Register range min */ 122 unsigned min; /* Register range min */
221 unsigned max; /* Register range max */ 123 unsigned max; /* Register range max */
222 unsigned flag; /* Supported flags */ 124 unsigned flag; /* Supported flags */
223 unsigned model; /* Supported models */
224}; 125};
225 126
226#endif /* _ASM_X86_CPU_DEBUG_H */ 127#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb83b1c397a..4a28d22d479 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -22,7 +22,7 @@
22#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */ 22#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
23#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */ 23#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */
24#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */ 24#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
25#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */ 25#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */
26#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */ 26#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
27#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */ 27#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
28#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */ 28#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
@@ -94,6 +94,7 @@
94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ 94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ 95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ 96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
97#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
97 98
98/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ 99/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
99#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ 100#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
@@ -115,6 +116,8 @@
115#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ 116#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
116#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ 117#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
117#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ 118#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
119#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
120#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
118#define X86_FEATURE_AES (4*32+25) /* AES instructions */ 121#define X86_FEATURE_AES (4*32+25) /* AES instructions */
119#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ 122#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
120#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ 123#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
@@ -192,11 +195,11 @@ extern const char * const x86_power_flags[32];
192#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) 195#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
193#define setup_clear_cpu_cap(bit) do { \ 196#define setup_clear_cpu_cap(bit) do { \
194 clear_cpu_cap(&boot_cpu_data, bit); \ 197 clear_cpu_cap(&boot_cpu_data, bit); \
195 set_bit(bit, (unsigned long *)cleared_cpu_caps); \ 198 set_bit(bit, (unsigned long *)cpu_caps_cleared); \
196} while (0) 199} while (0)
197#define setup_force_cpu_cap(bit) do { \ 200#define setup_force_cpu_cap(bit) do { \
198 set_cpu_cap(&boot_cpu_data, bit); \ 201 set_cpu_cap(&boot_cpu_data, bit); \
199 clear_bit(bit, (unsigned long *)cleared_cpu_caps); \ 202 set_bit(bit, (unsigned long *)cpu_caps_set); \
200} while (0) 203} while (0)
201 204
202#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) 205#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index f82fdc412c6..b93405b228b 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -6,6 +6,7 @@
6 * Documentation/DMA-API.txt for documentation. 6 * Documentation/DMA-API.txt for documentation.
7 */ 7 */
8 8
9#include <linux/kmemcheck.h>
9#include <linux/scatterlist.h> 10#include <linux/scatterlist.h>
10#include <linux/dma-debug.h> 11#include <linux/dma-debug.h>
11#include <linux/dma-attrs.h> 12#include <linux/dma-attrs.h>
@@ -60,6 +61,7 @@ dma_map_single(struct device *hwdev, void *ptr, size_t size,
60 dma_addr_t addr; 61 dma_addr_t addr;
61 62
62 BUG_ON(!valid_dma_direction(dir)); 63 BUG_ON(!valid_dma_direction(dir));
64 kmemcheck_mark_initialized(ptr, size);
63 addr = ops->map_page(hwdev, virt_to_page(ptr), 65 addr = ops->map_page(hwdev, virt_to_page(ptr),
64 (unsigned long)ptr & ~PAGE_MASK, size, 66 (unsigned long)ptr & ~PAGE_MASK, size,
65 dir, NULL); 67 dir, NULL);
@@ -87,8 +89,12 @@ dma_map_sg(struct device *hwdev, struct scatterlist *sg,
87{ 89{
88 struct dma_map_ops *ops = get_dma_ops(hwdev); 90 struct dma_map_ops *ops = get_dma_ops(hwdev);
89 int ents; 91 int ents;
92 struct scatterlist *s;
93 int i;
90 94
91 BUG_ON(!valid_dma_direction(dir)); 95 BUG_ON(!valid_dma_direction(dir));
96 for_each_sg(sg, s, nents, i)
97 kmemcheck_mark_initialized(sg_virt(s), s->length);
92 ents = ops->map_sg(hwdev, sg, nents, dir, NULL); 98 ents = ops->map_sg(hwdev, sg, nents, dir, NULL);
93 debug_dma_map_sg(hwdev, sg, nents, ents, dir); 99 debug_dma_map_sg(hwdev, sg, nents, ents, dir);
94 100
@@ -200,6 +206,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
200 dma_addr_t addr; 206 dma_addr_t addr;
201 207
202 BUG_ON(!valid_dma_direction(dir)); 208 BUG_ON(!valid_dma_direction(dir));
209 kmemcheck_mark_initialized(page_address(page) + offset, size);
203 addr = ops->map_page(dev, page, offset, size, dir, NULL); 210 addr = ops->map_page(dev, page, offset, size, dir, NULL);
204 debug_dma_map_page(dev, page, offset, size, dir, addr, false); 211 debug_dma_map_page(dev, page, offset, size, dir, addr, false);
205 212
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bedaf25..ff8cbfa0785 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -14,6 +14,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
14BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) 14BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) 15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
17 18
18BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0, 19BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
19 smp_invalidate_interrupt) 20 smp_invalidate_interrupt)
@@ -49,11 +50,19 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
49BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 50BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
50 51
51#ifdef CONFIG_PERF_COUNTERS 52#ifdef CONFIG_PERF_COUNTERS
52BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) 53BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
53#endif 54#endif
54 55
55#ifdef CONFIG_X86_MCE_P4THERMAL 56#ifdef CONFIG_X86_THERMAL_VECTOR
56BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) 57BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
57#endif 58#endif
58 59
60#ifdef CONFIG_X86_MCE_THRESHOLD
61BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
62#endif
63
64#ifdef CONFIG_X86_NEW_MCE
65BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
66#endif
67
59#endif 68#endif
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 37555e52f98..82e3e8f0104 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */ 15 unsigned int generic_irqs; /* arch dependent */
16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs;
16#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
17 unsigned int irq_resched_count; 19 unsigned int irq_resched_count;
18 unsigned int irq_call_count; 20 unsigned int irq_call_count;
@@ -20,7 +22,7 @@ typedef struct {
20#endif 22#endif
21#ifdef CONFIG_X86_MCE 23#ifdef CONFIG_X86_MCE
22 unsigned int irq_thermal_count; 24 unsigned int irq_thermal_count;
23# ifdef CONFIG_X86_64 25# ifdef CONFIG_X86_MCE_THRESHOLD
24 unsigned int irq_threshold_count; 26 unsigned int irq_threshold_count;
25# endif 27# endif
26#endif 28#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b762ea49bd7..ba180d93b08 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,9 +29,12 @@
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void); 30extern void generic_interrupt(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_pending_interrupt(void);
33
32extern void spurious_interrupt(void); 34extern void spurious_interrupt(void);
33extern void thermal_interrupt(void); 35extern void thermal_interrupt(void);
34extern void reschedule_interrupt(void); 36extern void reschedule_interrupt(void);
37extern void mce_self_interrupt(void);
35 38
36extern void invalidate_interrupt(void); 39extern void invalidate_interrupt(void);
37extern void invalidate_interrupt0(void); 40extern void invalidate_interrupt0(void);
@@ -44,6 +47,7 @@ extern void invalidate_interrupt6(void);
44extern void invalidate_interrupt7(void); 47extern void invalidate_interrupt7(void);
45 48
46extern void irq_move_cleanup_interrupt(void); 49extern void irq_move_cleanup_interrupt(void);
50extern void reboot_interrupt(void);
47extern void threshold_interrupt(void); 51extern void threshold_interrupt(void);
48 52
49extern void call_function_interrupt(void); 53extern void call_function_interrupt(void);
@@ -63,7 +67,26 @@ extern unsigned long io_apic_irqs;
63extern void init_VISWS_APIC_irqs(void); 67extern void init_VISWS_APIC_irqs(void);
64extern void setup_IO_APIC(void); 68extern void setup_IO_APIC(void);
65extern void disable_IO_APIC(void); 69extern void disable_IO_APIC(void);
66extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); 70
71struct io_apic_irq_attr {
72 int ioapic;
73 int ioapic_pin;
74 int trigger;
75 int polarity;
76};
77
78static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
79 int ioapic, int ioapic_pin,
80 int trigger, int polarity)
81{
82 irq_attr->ioapic = ioapic;
83 irq_attr->ioapic_pin = ioapic_pin;
84 irq_attr->trigger = trigger;
85 irq_attr->polarity = polarity;
86}
87
88extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
89 struct io_apic_irq_attr *irq_attr);
67extern void setup_ioapic_dest(void); 90extern void setup_ioapic_dest(void);
68 91
69extern void enable_IO_APIC(void); 92extern void enable_IO_APIC(void);
@@ -78,7 +101,11 @@ extern void eisa_set_level_irq(unsigned int irq);
78/* SMP */ 101/* SMP */
79extern void smp_apic_timer_interrupt(struct pt_regs *); 102extern void smp_apic_timer_interrupt(struct pt_regs *);
80extern void smp_spurious_interrupt(struct pt_regs *); 103extern void smp_spurious_interrupt(struct pt_regs *);
104extern void smp_generic_interrupt(struct pt_regs *);
81extern void smp_error_interrupt(struct pt_regs *); 105extern void smp_error_interrupt(struct pt_regs *);
106#ifdef CONFIG_X86_IO_APIC
107extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
108#endif
82#ifdef CONFIG_SMP 109#ifdef CONFIG_SMP
83extern void smp_reschedule_interrupt(struct pt_regs *); 110extern void smp_reschedule_interrupt(struct pt_regs *);
84extern void smp_call_function_interrupt(struct pt_regs *); 111extern void smp_call_function_interrupt(struct pt_regs *);
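struct io_apic_irq_attr bundles the pin description (ioapic, pin, trigger, polarity) that used to travel as separate edge/level and active-high/low arguments. A hedged sketch of the intended flow, using the new IO_APIC_get_PCI_irq_vector() signature here and io_apic_set_pci_routing() from the io_apic.h hunk further down; the helper name and the pci_dev plumbing are hypothetical:

#include <linux/pci.h>
#include <asm/hw_irq.h>
#include <asm/io_apic.h>

/* Hypothetical helper: route one PCI interrupt pin through the IO-APIC. */
static int example_route_pci_pin(struct pci_dev *pdev, int pin)
{
	struct io_apic_irq_attr attr;
	int irq;

	/* The lookup fills in ioapic, ioapic_pin, trigger and polarity. */
	irq = IO_APIC_get_PCI_irq_vector(pdev->bus->number, pdev->devfn,
					 pin, &attr);
	if (irq < 0)
		return irq;

	/* The same attribute block is handed back when the entry is programmed. */
	return io_apic_set_pci_routing(&pdev->dev, irq, &attr);
}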
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 71c9e518398..175adf58dd4 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
67 ".previous\n" 67 ".previous\n"
68 _ASM_EXTABLE(1b, 3b) 68 _ASM_EXTABLE(1b, 3b)
69 : [err] "=r" (err) 69 : [err] "=r" (err)
70#if 0 /* See comment in __save_init_fpu() below. */ 70#if 0 /* See comment in fxsave() below. */
71 : [fx] "r" (fx), "m" (*fx), "0" (0)); 71 : [fx] "r" (fx), "m" (*fx), "0" (0));
72#else 72#else
73 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); 73 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
@@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
75 return err; 75 return err;
76} 76}
77 77
78static inline int restore_fpu_checking(struct task_struct *tsk)
79{
80 if (task_thread_info(tsk)->status & TS_XSAVE)
81 return xrstor_checking(&tsk->thread.xstate->xsave);
82 else
83 return fxrstor_checking(&tsk->thread.xstate->fxsave);
84}
85
86/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception 78/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
87 is pending. Clear the x87 state here by setting it to fixed 79 is pending. Clear the x87 state here by setting it to fixed
88 values. The kernel data segment can be sometimes 0 and sometimes 80 values. The kernel data segment can be sometimes 0 and sometimes
@@ -120,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
120 ".previous\n" 112 ".previous\n"
121 _ASM_EXTABLE(1b, 3b) 113 _ASM_EXTABLE(1b, 3b)
122 : [err] "=r" (err), "=m" (*fx) 114 : [err] "=r" (err), "=m" (*fx)
123#if 0 /* See comment in __fxsave_clear() below. */ 115#if 0 /* See comment in fxsave() below. */
124 : [fx] "r" (fx), "0" (0)); 116 : [fx] "r" (fx), "0" (0));
125#else 117#else
126 : [fx] "cdaSDb" (fx), "0" (0)); 118 : [fx] "cdaSDb" (fx), "0" (0));
@@ -185,12 +177,9 @@ static inline void tolerant_fwait(void)
185 asm volatile("fnclex ; fwait"); 177 asm volatile("fnclex ; fwait");
186} 178}
187 179
188static inline void restore_fpu(struct task_struct *tsk) 180/* perform fxrstor iff the processor has extended states, otherwise frstor */
181static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
189{ 182{
190 if (task_thread_info(tsk)->status & TS_XSAVE) {
191 xrstor_checking(&tsk->thread.xstate->xsave);
192 return;
193 }
194 /* 183 /*
195 * The "nop" is needed to make the instructions the same 184 * The "nop" is needed to make the instructions the same
196 * length. 185 * length.
@@ -199,7 +188,9 @@ static inline void restore_fpu(struct task_struct *tsk)
199 "nop ; frstor %1", 188 "nop ; frstor %1",
200 "fxrstor %1", 189 "fxrstor %1",
201 X86_FEATURE_FXSR, 190 X86_FEATURE_FXSR,
202 "m" (tsk->thread.xstate->fxsave)); 191 "m" (*fx));
192
193 return 0;
203} 194}
204 195
205/* We need a safe address that is cheap to find and that is already 196/* We need a safe address that is cheap to find and that is already
@@ -262,6 +253,14 @@ end:
262 253
263#endif /* CONFIG_X86_64 */ 254#endif /* CONFIG_X86_64 */
264 255
256static inline int restore_fpu_checking(struct task_struct *tsk)
257{
258 if (task_thread_info(tsk)->status & TS_XSAVE)
259 return xrstor_checking(&tsk->thread.xstate->xsave);
260 else
261 return fxrstor_checking(&tsk->thread.xstate->fxsave);
262}
263
265/* 264/*
266 * Signal frame handlers... 265 * Signal frame handlers...
267 */ 266 */
@@ -305,18 +304,18 @@ static inline void kernel_fpu_end(void)
305/* 304/*
306 * Some instructions like VIA's padlock instructions generate a spurious 305 * Some instructions like VIA's padlock instructions generate a spurious
307 * DNA fault but don't modify SSE registers. And these instructions 306 * DNA fault but don't modify SSE registers. And these instructions
308 * get used from interrupt context aswell. To prevent these kernel instructions 307 * get used from interrupt context as well. To prevent these kernel instructions
309 * in interrupt context interact wrongly with other user/kernel fpu usage, we 308 * in interrupt context interacting wrongly with other user/kernel fpu usage, we
310 * should use them only in the context of irq_ts_save/restore() 309 * should use them only in the context of irq_ts_save/restore()
311 */ 310 */
312static inline int irq_ts_save(void) 311static inline int irq_ts_save(void)
313{ 312{
314 /* 313 /*
315 * If we are in process context, we are ok to take a spurious DNA fault. 314 * If in process context and not atomic, we can take a spurious DNA fault.
316 * Otherwise, doing clts() in process context require pre-emption to 315 * Otherwise, doing clts() in process context requires disabling preemption
317 * be disabled or some heavy lifting like kernel_fpu_begin() 316 * or some heavy lifting like kernel_fpu_begin()
318 */ 317 */
319 if (!in_interrupt()) 318 if (!in_atomic())
320 return 0; 319 return 0;
321 320
322 if (read_cr0() & X86_CR0_TS) { 321 if (read_cr0() & X86_CR0_TS) {
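The reworded comment spells out when the TS save/restore pair is required. A minimal sketch of that pattern; the worker is a made-up placeholder for a padlock-style SSE-touching instruction, and irq_ts_restore() is the companion helper that follows irq_ts_save() in this header:

#include <asm/i387.h>

/* Hypothetical SSE-touching primitive (e.g. a VIA padlock instruction). */
static void example_padlock_op(void) { }

static void example_use_from_atomic_context(void)
{
	/* Non-zero (and CR0.TS cleared) only when it actually had to act. */
	int ts_state = irq_ts_save();

	example_padlock_op();

	irq_ts_restore(ts_state);
}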
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 1a99e6c092a..58d7091eeb1 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -60,8 +60,4 @@ extern struct irq_chip i8259A_chip;
60extern void mask_8259A(void); 60extern void mask_8259A(void);
61extern void unmask_8259A(void); 61extern void unmask_8259A(void);
62 62
63#ifdef CONFIG_X86_32
64extern void init_ISA_irqs(void);
65#endif
66
67#endif /* _ASM_X86_I8259_H */ 63#endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2..00000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9d826e43601..daf866ed061 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -154,22 +154,19 @@ extern int timer_through_8259;
154extern int io_apic_get_unique_id(int ioapic, int apic_id); 154extern int io_apic_get_unique_id(int ioapic, int apic_id);
155extern int io_apic_get_version(int ioapic); 155extern int io_apic_get_version(int ioapic);
156extern int io_apic_get_redir_entries(int ioapic); 156extern int io_apic_get_redir_entries(int ioapic);
157extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
158 int edge_level, int active_high_low);
159#endif /* CONFIG_ACPI */ 157#endif /* CONFIG_ACPI */
160 158
159struct io_apic_irq_attr;
160extern int io_apic_set_pci_routing(struct device *dev, int irq,
161 struct io_apic_irq_attr *irq_attr);
161extern int (*ioapic_renumber_irq)(int ioapic, int irq); 162extern int (*ioapic_renumber_irq)(int ioapic, int irq);
162extern void ioapic_init_mappings(void); 163extern void ioapic_init_mappings(void);
163 164
164#ifdef CONFIG_X86_64
165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); 166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
167extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 167extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
168extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 168extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
169extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 169extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
170extern void reinit_intr_remapped_IO_APIC(int intr_remapping,
171 struct IO_APIC_route_entry **ioapic_entries);
172#endif
173 170
174extern void probe_nr_irqs_gsi(void); 171extern void probe_nr_irqs_gsi(void);
175 172
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 86af26091d6..0e9fe1d9d97 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -1,3 +1,6 @@
1#ifndef _ASM_X86_IOMAP_H
2#define _ASM_X86_IOMAP_H
3
1/* 4/*
2 * Copyright © 2008 Ingo Molnar 5 * Copyright © 2008 Ingo Molnar
3 * 6 *
@@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
31 34
32void 35void
33iounmap_atomic(void *kvaddr, enum km_type type); 36iounmap_atomic(void *kvaddr, enum km_type type);
37
38#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 0396760fccb..f275e224450 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,6 +1,6 @@
1#ifndef _ASM_X86_IRQ_REMAPPING_H 1#ifndef _ASM_X86_IRQ_REMAPPING_H
2#define _ASM_X86_IRQ_REMAPPING_H 2#define _ASM_X86_IRQ_REMAPPING_H
3 3
4#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8) 4#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
5 5
6#endif /* _ASM_X86_IRQ_REMAPPING_H */ 6#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79bbb47..5b21f0ec3df 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#define NMI_VECTOR 0x02 27#define NMI_VECTOR 0x02
28#define MCE_VECTOR 0x12
28 29
29/* 30/*
30 * IDT vectors usable for external interrupt sources start 31 * IDT vectors usable for external interrupt sources start
@@ -34,6 +35,7 @@
34 35
35#ifdef CONFIG_X86_32 36#ifdef CONFIG_X86_32
36# define SYSCALL_VECTOR 0x80 37# define SYSCALL_VECTOR 0x80
38# define IA32_SYSCALL_VECTOR 0x80
37#else 39#else
38# define IA32_SYSCALL_VECTOR 0x80 40# define IA32_SYSCALL_VECTOR 0x80
39#endif 41#endif
@@ -86,13 +88,8 @@
86#define CALL_FUNCTION_VECTOR 0xfc 88#define CALL_FUNCTION_VECTOR 0xfc
87#define CALL_FUNCTION_SINGLE_VECTOR 0xfb 89#define CALL_FUNCTION_SINGLE_VECTOR 0xfb
88#define THERMAL_APIC_VECTOR 0xfa 90#define THERMAL_APIC_VECTOR 0xfa
89 91#define THRESHOLD_APIC_VECTOR 0xf9
90#ifdef CONFIG_X86_32 92#define REBOOT_VECTOR 0xf8
91/* 0xf8 - 0xf9 : free */
92#else
93# define THRESHOLD_APIC_VECTOR 0xf9
94# define UV_BAU_MESSAGE 0xf8
95#endif
96 93
97/* f0-f7 used for spreading out TLB flushes: */ 94/* f0-f7 used for spreading out TLB flushes: */
98#define INVALIDATE_TLB_VECTOR_END 0xf7 95#define INVALIDATE_TLB_VECTOR_END 0xf7
@@ -107,14 +104,21 @@
107#define LOCAL_TIMER_VECTOR 0xef 104#define LOCAL_TIMER_VECTOR 0xef
108 105
109/* 106/*
110 * Performance monitoring interrupt vector: 107 * Generic system vector for platform specific use
111 */ 108 */
112#define LOCAL_PERF_VECTOR 0xee 109#define GENERIC_INTERRUPT_VECTOR 0xed
113 110
114/* 111/*
115 * Generic system vector for platform specific use 112 * Performance monitoring pending work vector:
116 */ 113 */
117#define GENERIC_INTERRUPT_VECTOR 0xed 114#define LOCAL_PENDING_VECTOR 0xec
115
116#define UV_BAU_MESSAGE 0xec
117
118/*
119 * Self IPI vector for machine checks
120 */
121#define MCE_SELF_VECTOR 0xeb
118 122
119/* 123/*
120 * First APIC vector available to drivers: (vectors 0x30-0xee) we 124 * First APIC vector available to drivers: (vectors 0x30-0xee) we
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index 54c8cc53b24..c2d1f3b58e5 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -12,4 +12,17 @@ extern int cache_k8_northbridges(void);
12extern void k8_flush_garts(void); 12extern void k8_flush_garts(void);
13extern int k8_scan_nodes(unsigned long start, unsigned long end); 13extern int k8_scan_nodes(unsigned long start, unsigned long end);
14 14
15#ifdef CONFIG_K8_NB
16static inline struct pci_dev *node_to_k8_nb_misc(int node)
17{
18 return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
19}
20#else
21static inline struct pci_dev *node_to_k8_nb_misc(int node)
22{
23 return NULL;
24}
25#endif
26
27
15#endif /* _ASM_X86_K8_H */ 28#endif /* _ASM_X86_K8_H */
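node_to_k8_nb_misc() returns NULL both for an out-of-range node and when CONFIG_K8_NB is off, so callers get by with a single code path. A hedged sketch; the function and register offset are hypothetical:

#include <linux/pci.h>
#include <asm/k8.h>

/* Hypothetical: read one config-space dword from a node's NB misc function. */
static u32 example_read_nb_reg(int node, int reg)
{
	struct pci_dev *misc = node_to_k8_nb_misc(node);
	u32 val = 0;

	if (misc)
		pci_read_config_dword(misc, reg, &val);

	return val;
}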
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
index 5759c165a5c..9e00a731a7f 100644
--- a/arch/x86/include/asm/kmap_types.h
+++ b/arch/x86/include/asm/kmap_types.h
@@ -2,28 +2,11 @@
2#define _ASM_X86_KMAP_TYPES_H 2#define _ASM_X86_KMAP_TYPES_H
3 3
4#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) 4#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
5# define D(n) __KM_FENCE_##n , 5#define __WITH_KM_FENCE
6#else
7# define D(n)
8#endif 6#endif
9 7
10enum km_type { 8#include <asm-generic/kmap_types.h>
11D(0) KM_BOUNCE_READ,
12D(1) KM_SKB_SUNRPC_DATA,
13D(2) KM_SKB_DATA_SOFTIRQ,
14D(3) KM_USER0,
15D(4) KM_USER1,
16D(5) KM_BIO_SRC_IRQ,
17D(6) KM_BIO_DST_IRQ,
18D(7) KM_PTE0,
19D(8) KM_PTE1,
20D(9) KM_IRQ0,
21D(10) KM_IRQ1,
22D(11) KM_SOFTIRQ0,
23D(12) KM_SOFTIRQ1,
24D(13) KM_TYPE_NR
25};
26 9
27#undef D 10#undef __WITH_KM_FENCE
28 11
29#endif /* _ASM_X86_KMAP_TYPES_H */ 12#endif /* _ASM_X86_KMAP_TYPES_H */
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
new file mode 100644
index 00000000000..ed01518f297
--- /dev/null
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -0,0 +1,42 @@
1#ifndef ASM_X86_KMEMCHECK_H
2#define ASM_X86_KMEMCHECK_H
3
4#include <linux/types.h>
5#include <asm/ptrace.h>
6
7#ifdef CONFIG_KMEMCHECK
8bool kmemcheck_active(struct pt_regs *regs);
9
10void kmemcheck_show(struct pt_regs *regs);
11void kmemcheck_hide(struct pt_regs *regs);
12
13bool kmemcheck_fault(struct pt_regs *regs,
14 unsigned long address, unsigned long error_code);
15bool kmemcheck_trap(struct pt_regs *regs);
16#else
17static inline bool kmemcheck_active(struct pt_regs *regs)
18{
19 return false;
20}
21
22static inline void kmemcheck_show(struct pt_regs *regs)
23{
24}
25
26static inline void kmemcheck_hide(struct pt_regs *regs)
27{
28}
29
30static inline bool kmemcheck_fault(struct pt_regs *regs,
31 unsigned long address, unsigned long error_code)
32{
33 return false;
34}
35
36static inline bool kmemcheck_trap(struct pt_regs *regs)
37{
38 return false;
39}
40#endif /* CONFIG_KMEMCHECK */
41
42#endif
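Because the !CONFIG_KMEMCHECK branch supplies do-nothing/false stubs, call sites need no #ifdefs of their own. A rough sketch of the kind of caller this enables; the fault-handler framing is illustrative, not the actual x86 fault path:

#include <asm/kmemcheck.h>
#include <asm/ptrace.h>

/* Illustrative hook: consult kmemcheck unconditionally; with kmemcheck
 * disabled this compiles down to almost nothing. */
static int example_fault_hook(struct pt_regs *regs, unsigned long address,
			      unsigned long error_code)
{
	if (kmemcheck_active(regs))
		kmemcheck_hide(regs);

	if (kmemcheck_fault(regs, address, error_code))
		return 1;	/* handled as a kmemcheck-tracked access */

	return 0;		/* not ours, fall through to normal handling */
}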
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index dc3f6cf1170..125be8b1956 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -16,6 +16,7 @@
16#define __KVM_HAVE_MSI 16#define __KVM_HAVE_MSI
17#define __KVM_HAVE_USER_NMI 17#define __KVM_HAVE_USER_NMI
18#define __KVM_HAVE_GUEST_DEBUG 18#define __KVM_HAVE_GUEST_DEBUG
19#define __KVM_HAVE_MSIX
19 20
20/* Architectural interrupt line count. */ 21/* Architectural interrupt line count. */
21#define KVM_NR_INTERRUPTS 256 22#define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f0faf58044f..eabdc1cfab5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -185,6 +185,7 @@ union kvm_mmu_page_role {
185 unsigned access:3; 185 unsigned access:3;
186 unsigned invalid:1; 186 unsigned invalid:1;
187 unsigned cr4_pge:1; 187 unsigned cr4_pge:1;
188 unsigned nxe:1;
188 }; 189 };
189}; 190};
190 191
@@ -212,7 +213,6 @@ struct kvm_mmu_page {
212 int multimapped; /* More than one parent_pte? */ 213 int multimapped; /* More than one parent_pte? */
213 int root_count; /* Currently serving as active root */ 214 int root_count; /* Currently serving as active root */
214 bool unsync; 215 bool unsync;
215 bool global;
216 unsigned int unsync_children; 216 unsigned int unsync_children;
217 union { 217 union {
218 u64 *parent_pte; /* !multimapped */ 218 u64 *parent_pte; /* !multimapped */
@@ -261,13 +261,11 @@ struct kvm_mmu {
261 union kvm_mmu_page_role base_role; 261 union kvm_mmu_page_role base_role;
262 262
263 u64 *pae_root; 263 u64 *pae_root;
264 u64 rsvd_bits_mask[2][4];
264}; 265};
265 266
266struct kvm_vcpu_arch { 267struct kvm_vcpu_arch {
267 u64 host_tsc; 268 u64 host_tsc;
268 int interrupt_window_open;
269 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
270 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
271 /* 269 /*
272 * rip and regs accesses must go through 270 * rip and regs accesses must go through
273 * kvm_{register,rip}_{read,write} functions. 271 * kvm_{register,rip}_{read,write} functions.
@@ -286,6 +284,7 @@ struct kvm_vcpu_arch {
286 u64 shadow_efer; 284 u64 shadow_efer;
287 u64 apic_base; 285 u64 apic_base;
288 struct kvm_lapic *apic; /* kernel irqchip context */ 286 struct kvm_lapic *apic; /* kernel irqchip context */
287 int32_t apic_arb_prio;
289 int mp_state; 288 int mp_state;
290 int sipi_vector; 289 int sipi_vector;
291 u64 ia32_misc_enable_msr; 290 u64 ia32_misc_enable_msr;
@@ -320,6 +319,8 @@ struct kvm_vcpu_arch {
320 struct kvm_pio_request pio; 319 struct kvm_pio_request pio;
321 void *pio_data; 320 void *pio_data;
322 321
322 u8 event_exit_inst_len;
323
323 struct kvm_queued_exception { 324 struct kvm_queued_exception {
324 bool pending; 325 bool pending;
325 bool has_error_code; 326 bool has_error_code;
@@ -329,11 +330,12 @@ struct kvm_vcpu_arch {
329 330
330 struct kvm_queued_interrupt { 331 struct kvm_queued_interrupt {
331 bool pending; 332 bool pending;
333 bool soft;
332 u8 nr; 334 u8 nr;
333 } interrupt; 335 } interrupt;
334 336
335 struct { 337 struct {
336 int active; 338 int vm86_active;
337 u8 save_iopl; 339 u8 save_iopl;
338 struct kvm_save_segment { 340 struct kvm_save_segment {
339 u16 selector; 341 u16 selector;
@@ -356,9 +358,9 @@ struct kvm_vcpu_arch {
356 unsigned int time_offset; 358 unsigned int time_offset;
357 struct page *time_page; 359 struct page *time_page;
358 360
361 bool singlestep; /* guest is single stepped by KVM */
359 bool nmi_pending; 362 bool nmi_pending;
360 bool nmi_injected; 363 bool nmi_injected;
361 bool nmi_window_open;
362 364
363 struct mtrr_state_type mtrr_state; 365 struct mtrr_state_type mtrr_state;
364 u32 pat; 366 u32 pat;
@@ -392,15 +394,14 @@ struct kvm_arch{
392 */ 394 */
393 struct list_head active_mmu_pages; 395 struct list_head active_mmu_pages;
394 struct list_head assigned_dev_head; 396 struct list_head assigned_dev_head;
395 struct list_head oos_global_pages;
396 struct iommu_domain *iommu_domain; 397 struct iommu_domain *iommu_domain;
398 int iommu_flags;
397 struct kvm_pic *vpic; 399 struct kvm_pic *vpic;
398 struct kvm_ioapic *vioapic; 400 struct kvm_ioapic *vioapic;
399 struct kvm_pit *vpit; 401 struct kvm_pit *vpit;
400 struct hlist_head irq_ack_notifier_list; 402 struct hlist_head irq_ack_notifier_list;
401 int vapics_in_nmi_mode; 403 int vapics_in_nmi_mode;
402 404
403 int round_robin_prev_vcpu;
404 unsigned int tss_addr; 405 unsigned int tss_addr;
405 struct page *apic_access_page; 406 struct page *apic_access_page;
406 407
@@ -423,7 +424,6 @@ struct kvm_vm_stat {
423 u32 mmu_recycled; 424 u32 mmu_recycled;
424 u32 mmu_cache_miss; 425 u32 mmu_cache_miss;
425 u32 mmu_unsync; 426 u32 mmu_unsync;
426 u32 mmu_unsync_global;
427 u32 remote_tlb_flush; 427 u32 remote_tlb_flush;
428 u32 lpages; 428 u32 lpages;
429}; 429};
@@ -443,7 +443,6 @@ struct kvm_vcpu_stat {
443 u32 halt_exits; 443 u32 halt_exits;
444 u32 halt_wakeup; 444 u32 halt_wakeup;
445 u32 request_irq_exits; 445 u32 request_irq_exits;
446 u32 request_nmi_exits;
447 u32 irq_exits; 446 u32 irq_exits;
448 u32 host_state_reload; 447 u32 host_state_reload;
449 u32 efer_reload; 448 u32 efer_reload;
@@ -511,20 +510,22 @@ struct kvm_x86_ops {
511 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 510 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
512 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); 511 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
513 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 512 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
513 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
514 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
514 void (*patch_hypercall)(struct kvm_vcpu *vcpu, 515 void (*patch_hypercall)(struct kvm_vcpu *vcpu,
515 unsigned char *hypercall_addr); 516 unsigned char *hypercall_addr);
516 int (*get_irq)(struct kvm_vcpu *vcpu); 517 void (*set_irq)(struct kvm_vcpu *vcpu);
517 void (*set_irq)(struct kvm_vcpu *vcpu, int vec); 518 void (*set_nmi)(struct kvm_vcpu *vcpu);
518 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, 519 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
519 bool has_error_code, u32 error_code); 520 bool has_error_code, u32 error_code);
520 bool (*exception_injected)(struct kvm_vcpu *vcpu); 521 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
521 void (*inject_pending_irq)(struct kvm_vcpu *vcpu); 522 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
522 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, 523 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
523 struct kvm_run *run); 524 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
524 525 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
525 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 526 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
526 int (*get_tdp_level)(void); 527 int (*get_tdp_level)(void);
527 int (*get_mt_mask_shift)(void); 528 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
528}; 529};
529 530
530extern struct kvm_x86_ops *kvm_x86_ops; 531extern struct kvm_x86_ops *kvm_x86_ops;
@@ -538,7 +539,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
538void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); 539void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
539void kvm_mmu_set_base_ptes(u64 base_pte); 540void kvm_mmu_set_base_ptes(u64 base_pte);
540void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 541void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
541 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask); 542 u64 dirty_mask, u64 nx_mask, u64 x_mask);
542 543
543int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 544int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
544void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 545void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -552,6 +553,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
552 const void *val, int bytes); 553 const void *val, int bytes);
553int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, 554int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
554 gpa_t addr, unsigned long *ret); 555 gpa_t addr, unsigned long *ret);
556u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
555 557
556extern bool tdp_enabled; 558extern bool tdp_enabled;
557 559
@@ -563,6 +565,7 @@ enum emulation_result {
563 565
564#define EMULTYPE_NO_DECODE (1 << 0) 566#define EMULTYPE_NO_DECODE (1 << 0)
565#define EMULTYPE_TRAP_UD (1 << 1) 567#define EMULTYPE_TRAP_UD (1 << 1)
568#define EMULTYPE_SKIP (1 << 2)
566int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 569int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
567 unsigned long cr2, u16 error_code, int emulation_type); 570 unsigned long cr2, u16 error_code, int emulation_type);
568void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 571void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
@@ -638,7 +641,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
638int kvm_mmu_load(struct kvm_vcpu *vcpu); 641int kvm_mmu_load(struct kvm_vcpu *vcpu);
639void kvm_mmu_unload(struct kvm_vcpu *vcpu); 642void kvm_mmu_unload(struct kvm_vcpu *vcpu);
640void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 643void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
641void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
642 644
643int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 645int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
644 646
@@ -769,6 +771,8 @@ enum {
769#define HF_GIF_MASK (1 << 0) 771#define HF_GIF_MASK (1 << 0)
770#define HF_HIF_MASK (1 << 1) 772#define HF_HIF_MASK (1 << 1)
771#define HF_VINTR_MASK (1 << 2) 773#define HF_VINTR_MASK (1 << 2)
774#define HF_NMI_MASK (1 << 3)
775#define HF_IRET_MASK (1 << 4)
772 776
773/* 777/*
774 * Hardware virtualization extension instructions may fault if a 778 * Hardware virtualization extension instructions may fault if a
@@ -791,5 +795,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
791#define KVM_ARCH_WANT_MMU_NOTIFIER 795#define KVM_ARCH_WANT_MMU_NOTIFIER
792int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 796int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
793int kvm_age_hva(struct kvm *kvm, unsigned long hva); 797int kvm_age_hva(struct kvm *kvm, unsigned long hva);
798int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
794 799
795#endif /* _ASM_X86_KVM_HOST_H */ 800#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 6a159732881..b7ed2c42311 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -143,6 +143,9 @@ struct decode_cache {
143 struct fetch_cache fetch; 143 struct fetch_cache fetch;
144}; 144};
145 145
146#define X86_SHADOW_INT_MOV_SS 1
147#define X86_SHADOW_INT_STI 2
148
146struct x86_emulate_ctxt { 149struct x86_emulate_ctxt {
147 /* Register state before/after emulation. */ 150 /* Register state before/after emulation. */
148 struct kvm_vcpu *vcpu; 151 struct kvm_vcpu *vcpu;
@@ -152,6 +155,9 @@ struct x86_emulate_ctxt {
152 int mode; 155 int mode;
153 u32 cs_base; 156 u32 cs_base;
154 157
158 /* interruptibility state, as a result of execution of STI or MOV SS */
159 int interruptibility;
160
155 /* decode cache */ 161 /* decode cache */
156 struct decode_cache decode; 162 struct decode_cache decode;
157}; 163};
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 1caf57628b9..313389cd50d 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,13 @@
17/* Pages for switcher itself, then two pages per cpu */ 17/* Pages for switcher itself, then two pages per cpu */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) 18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
19 19
20/* We map at -4M for ease of mapping into the guest (one PTE page). */ 20/* We map at -4M (-2M when PAE is activated) for ease of mapping
21 * into the guest (one PTE page). */
22#ifdef CONFIG_X86_PAE
23#define SWITCHER_ADDR 0xFFE00000
24#else
21#define SWITCHER_ADDR 0xFFC00000 25#define SWITCHER_ADDR 0xFFC00000
26#endif
22 27
23/* Found in switcher.S */ 28/* Found in switcher.S */
24extern unsigned long default_idt_entries[]; 29extern unsigned long default_idt_entries[];
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index faae1996487..d31c4a68407 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -12,11 +12,13 @@
12#define LHCALL_TS 8 12#define LHCALL_TS 8
13#define LHCALL_SET_CLOCKEVENT 9 13#define LHCALL_SET_CLOCKEVENT 9
14#define LHCALL_HALT 10 14#define LHCALL_HALT 10
15#define LHCALL_SET_PMD 13
15#define LHCALL_SET_PTE 14 16#define LHCALL_SET_PTE 14
16#define LHCALL_SET_PMD 15 17#define LHCALL_SET_PGD 15
17#define LHCALL_LOAD_TLS 16 18#define LHCALL_LOAD_TLS 16
18#define LHCALL_NOTIFY 17 19#define LHCALL_NOTIFY 17
19#define LHCALL_LOAD_GDT_ENTRY 18 20#define LHCALL_LOAD_GDT_ENTRY 18
21#define LHCALL_SEND_INTERRUPTS 19
20 22
21#define LGUEST_TRAP_ENTRY 0x1F 23#define LGUEST_TRAP_ENTRY 0x1F
22 24
@@ -32,10 +34,10 @@
32 * operations? There are two ways: the direct way is to make a "hypercall", 34 * operations? There are two ways: the direct way is to make a "hypercall",
33 * to make requests of the Host Itself. 35 * to make requests of the Host Itself.
34 * 36 *
35 * We use the KVM hypercall mechanism. Eighteen hypercalls are 37 * We use the KVM hypercall mechanism. Seventeen hypercalls are
36 * available: the hypercall number is put in the %eax register, and the 38 * available: the hypercall number is put in the %eax register, and the
37 * arguments (when required) are placed in %ebx, %ecx and %edx. If a return 39 * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
38 * value makes sense, it's returned in %eax. 40 * If a return value makes sense, it's returned in %eax.
39 * 41 *
40 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 42 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
41 * Host, rather than returning failure. This reflects Winston Churchill's 43 * Host, rather than returning failure. This reflects Winston Churchill's
@@ -47,8 +49,9 @@
47 49
48#define LHCALL_RING_SIZE 64 50#define LHCALL_RING_SIZE 64
49struct hcall_args { 51struct hcall_args {
50 /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ 52 /* These map directly onto eax, ebx, ecx, edx and esi
51 unsigned long arg0, arg1, arg2, arg3; 53 * in struct lguest_regs */
54 unsigned long arg0, arg1, arg2, arg3, arg4;
52}; 55};
53 56
54#endif /* !__ASSEMBLY__ */ 57#endif /* !__ASSEMBLY__ */
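The updated comment gives the register convention: hypercall number in %eax, arguments in %ebx, %ecx, %edx and %esi, return value back in %eax. A sketch of inline asm that marshals a call this way; the trapping instruction is only a placeholder, since the real entry is whatever the KVM hypercall patching installs:

/* Illustrative only: argument marshalling per the convention above;
 * "vmcall" stands in for the patched hypercall instruction. */
static inline unsigned long
example_hcall(unsigned long call, unsigned long arg1, unsigned long arg2,
	      unsigned long arg3, unsigned long arg4)
{
	asm volatile("vmcall"
		     : "=a" (call)
		     : "a" (call), "b" (arg1), "c" (arg2), "d" (arg3), "S" (arg4)
		     : "memory");
	return call;
}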
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4f8c199584e..540a466e50f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -1,8 +1,6 @@
1#ifndef _ASM_X86_MCE_H 1#ifndef _ASM_X86_MCE_H
2#define _ASM_X86_MCE_H 2#define _ASM_X86_MCE_H
3 3
4#ifdef __x86_64__
5
6#include <linux/types.h> 4#include <linux/types.h>
7#include <asm/ioctls.h> 5#include <asm/ioctls.h>
8 6
@@ -10,21 +8,35 @@
10 * Machine Check support for x86 8 * Machine Check support for x86
11 */ 9 */
12 10
13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ 11#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
14#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ 12#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */
15#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ 13#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
16 14#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
17#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ 15#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */
18#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ 16#define MCG_EXT_CNT_SHIFT 16
19#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */ 17#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
20 18#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
21#define MCI_STATUS_VAL (1UL<<63) /* valid error */ 19
22#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */ 20#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
23#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */ 21#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
24#define MCI_STATUS_EN (1UL<<60) /* error enabled */ 22#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
25#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */ 23
26#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. valid */ 24#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
27#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */ 25#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
26#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */
27#define MCI_STATUS_EN (1ULL<<60) /* error enabled */
28#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
29#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
30#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
31#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
32#define MCI_STATUS_AR (1ULL<<55) /* Action required */
33
34/* MISC register defines */
35#define MCM_ADDR_SEGOFF 0 /* segment offset */
36#define MCM_ADDR_LINEAR 1 /* linear address */
37#define MCM_ADDR_PHYS 2 /* physical address */
38#define MCM_ADDR_MEM 3 /* memory address */
39#define MCM_ADDR_GENERIC 7 /* generic */
28 40
29/* Fields are zero when not available */ 41/* Fields are zero when not available */
30struct mce { 42struct mce {
@@ -34,13 +46,19 @@ struct mce {
34 __u64 mcgstatus; 46 __u64 mcgstatus;
35 __u64 ip; 47 __u64 ip;
36 __u64 tsc; /* cpu time stamp counter */ 48 __u64 tsc; /* cpu time stamp counter */
37 __u64 res1; /* for future extension */ 49 __u64 time; /* wall time_t when error was detected */
38 __u64 res2; /* dito. */ 50 __u8 cpuvendor; /* cpu vendor as encoded in system.h */
51 __u8 pad1;
52 __u16 pad2;
53 __u32 cpuid; /* CPUID 1 EAX */
39 __u8 cs; /* code segment */ 54 __u8 cs; /* code segment */
40 __u8 bank; /* machine check bank */ 55 __u8 bank; /* machine check bank */
41 __u8 cpu; /* cpu that raised the error */ 56 __u8 cpu; /* cpu number; obsolete; use extcpu now */
42 __u8 finished; /* entry is valid */ 57 __u8 finished; /* entry is valid */
43 __u32 pad; 58 __u32 extcpu; /* linux cpu number that detected the error */
59 __u32 socketid; /* CPU socket ID */
60 __u32 apicid; /* CPU initial apic ID */
61 __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
44}; 62};
45 63
46/* 64/*
@@ -57,7 +75,7 @@ struct mce_log {
57 unsigned len; /* = MCE_LOG_LEN */ 75 unsigned len; /* = MCE_LOG_LEN */
58 unsigned next; 76 unsigned next;
59 unsigned flags; 77 unsigned flags;
60 unsigned pad0; 78 unsigned recordlen; /* length of struct mce */
61 struct mce entry[MCE_LOG_LEN]; 79 struct mce entry[MCE_LOG_LEN];
62}; 80};
63 81
@@ -82,19 +100,16 @@ struct mce_log {
82#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) 100#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
83#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) 101#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)
84 102
85#endif /* __x86_64__ */
86
87#ifdef __KERNEL__ 103#ifdef __KERNEL__
88 104
89#ifdef CONFIG_X86_32
90extern int mce_disabled; 105extern int mce_disabled;
91#else /* CONFIG_X86_32 */
92 106
93#include <asm/atomic.h> 107#include <asm/atomic.h>
108#include <linux/percpu.h>
94 109
95void mce_setup(struct mce *m); 110void mce_setup(struct mce *m);
96void mce_log(struct mce *m); 111void mce_log(struct mce *m);
97DECLARE_PER_CPU(struct sys_device, device_mce); 112DECLARE_PER_CPU(struct sys_device, mce_dev);
98extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 113extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
99 114
100/* 115/*
@@ -104,6 +119,8 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
104#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) 119#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
105 120
106#ifdef CONFIG_X86_MCE_INTEL 121#ifdef CONFIG_X86_MCE_INTEL
122extern int mce_cmci_disabled;
123extern int mce_ignore_ce;
107void mce_intel_feature_init(struct cpuinfo_x86 *c); 124void mce_intel_feature_init(struct cpuinfo_x86 *c);
108void cmci_clear(void); 125void cmci_clear(void);
109void cmci_reenable(void); 126void cmci_reenable(void);
@@ -123,13 +140,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
123static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } 140static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
124#endif 141#endif
125 142
126extern int mce_available(struct cpuinfo_x86 *c); 143int mce_available(struct cpuinfo_x86 *c);
144
145DECLARE_PER_CPU(unsigned, mce_exception_count);
146DECLARE_PER_CPU(unsigned, mce_poll_count);
127 147
128void mce_log_therm_throt_event(__u64 status); 148void mce_log_therm_throt_event(__u64 status);
129 149
130extern atomic_t mce_entry; 150extern atomic_t mce_entry;
131 151
132extern void do_machine_check(struct pt_regs *, long); 152void do_machine_check(struct pt_regs *, long);
133 153
134typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); 154typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
135DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); 155DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
@@ -139,14 +159,16 @@ enum mcp_flags {
139 MCP_UC = (1 << 1), /* log uncorrected errors */ 159 MCP_UC = (1 << 1), /* log uncorrected errors */
140 MCP_DONTLOG = (1 << 2), /* only clear, don't log */ 160 MCP_DONTLOG = (1 << 2), /* only clear, don't log */
141}; 161};
142extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); 162void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
143 163
144extern int mce_notify_user(void); 164int mce_notify_irq(void);
165void mce_notify_process(void);
145 166
146#endif /* !CONFIG_X86_32 */ 167DECLARE_PER_CPU(struct mce, injectm);
168extern struct file_operations mce_chrdev_ops;
147 169
148#ifdef CONFIG_X86_MCE 170#ifdef CONFIG_X86_MCE
149extern void mcheck_init(struct cpuinfo_x86 *c); 171void mcheck_init(struct cpuinfo_x86 *c);
150#else 172#else
151#define mcheck_init(c) do { } while (0) 173#define mcheck_init(c) do { } while (0)
152#endif 174#endif
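The widened MCG_*/MCI_* definitions now expose the bank count and the extended-register count directly. A small hedged sketch of decoding them from an MCG_CAP value; the function itself is illustrative:

#include <linux/kernel.h>
#include <asm/mce.h>

/* Illustrative only: pull the fields defined above out of an MCG_CAP value. */
static void example_decode_mcg_cap(u64 cap)
{
	unsigned int banks = cap & MCG_BANKCNT_MASK;
	unsigned int ext   = MCG_EXT_CNT(cap);

	pr_info("MCE: %u banks, %u extended registers, CMCI %ssupported\n",
		banks, ext, (cap & MCG_CMCI_P) ? "" : "not ");
}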
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index c882664716c..ef51b501e22 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -9,20 +9,31 @@ struct cpu_signature {
9 9
10struct device; 10struct device;
11 11
12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
13
12struct microcode_ops { 14struct microcode_ops {
13 int (*request_microcode_user) (int cpu, const void __user *buf, size_t size); 15 enum ucode_state (*request_microcode_user) (int cpu,
14 int (*request_microcode_fw) (int cpu, struct device *device); 16 const void __user *buf, size_t size);
15 17
16 void (*apply_microcode) (int cpu); 18 enum ucode_state (*request_microcode_fw) (int cpu,
19 struct device *device);
17 20
18 int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
19 void (*microcode_fini_cpu) (int cpu); 21 void (*microcode_fini_cpu) (int cpu);
22
23 /*
24 * The generic 'microcode_core' part guarantees that
25 * the callbacks below run on a target cpu when they
26 * are being called.
27 * See also the "Synchronization" section in microcode_core.c.
28 */
29 int (*apply_microcode) (int cpu);
30 int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
20}; 31};
21 32
22struct ucode_cpu_info { 33struct ucode_cpu_info {
23 struct cpu_signature cpu_sig; 34 struct cpu_signature cpu_sig;
24 int valid; 35 int valid;
25 void *mc; 36 void *mc;
26}; 37};
27extern struct ucode_cpu_info ucode_cpu_info[]; 38extern struct ucode_cpu_info ucode_cpu_info[];
28 39
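request_microcode_user/fw now report a tri-state enum ucode_state, and apply_microcode/collect_cpu_info are documented to run on the target CPU. A rough skeleton of a backend filling in the reworked table; every name below is hypothetical:

#include <linux/device.h>
#include <asm/microcode.h>

/* Hypothetical backend: fetch an update image for one CPU via firmware. */
static enum ucode_state example_request_fw(int cpu, struct device *dev)
{
	/* ... locate and cache an image for 'cpu' ... */
	return UCODE_NFOUND;		/* or UCODE_OK / UCODE_ERROR */
}

static struct microcode_ops example_microcode_ops = {
	.request_microcode_fw	= example_request_fw,
	/* .request_microcode_user, .collect_cpu_info, .apply_microcode and
	 * .microcode_fini_cpu would be wired up the same way; the last two
	 * run on the target CPU, as guaranteed by microcode_core. */
};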
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
index 90bc4108a4f..751af2550ed 100644
--- a/arch/x86/include/asm/mman.h
+++ b/arch/x86/include/asm/mman.h
@@ -1,7 +1,7 @@
1#ifndef _ASM_X86_MMAN_H 1#ifndef _ASM_X86_MMAN_H
2#define _ASM_X86_MMAN_H 2#define _ASM_X86_MMAN_H
3 3
4#include <asm-generic/mman.h> 4#include <asm-generic/mman-common.h>
5 5
6#define MAP_32BIT 0x40 /* only give out 32bit addresses */ 6#define MAP_32BIT 0x40 /* only give out 32bit addresses */
7 7
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 642fc7fc8cd..e2a1bb6d71e 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -61,9 +61,11 @@ extern void get_smp_config(void);
61#ifdef CONFIG_X86_MPPARSE 61#ifdef CONFIG_X86_MPPARSE
62extern void find_smp_config(void); 62extern void find_smp_config(void);
63extern void early_reserve_e820_mpc_new(void); 63extern void early_reserve_e820_mpc_new(void);
64extern int enable_update_mptable;
64#else 65#else
65static inline void find_smp_config(void) { } 66static inline void find_smp_config(void) { }
66static inline void early_reserve_e820_mpc_new(void) { } 67static inline void early_reserve_e820_mpc_new(void) { }
68#define enable_update_mptable 0
67#endif 69#endif
68 70
69void __cpuinit generic_processor_info(int apicid, int version); 71void __cpuinit generic_processor_info(int apicid, int version);
@@ -72,20 +74,13 @@ extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
72extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, 74extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
73 u32 gsi); 75 u32 gsi);
74extern void mp_config_acpi_legacy_irqs(void); 76extern void mp_config_acpi_legacy_irqs(void);
75extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); 77struct device;
78extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
79 int active_high_low);
76extern int acpi_probe_gsi(void); 80extern int acpi_probe_gsi(void);
77#ifdef CONFIG_X86_IO_APIC 81#ifdef CONFIG_X86_IO_APIC
78extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
79 u32 gsi, int triggering, int polarity);
80extern int mp_find_ioapic(int gsi); 82extern int mp_find_ioapic(int gsi);
81extern int mp_find_ioapic_pin(int ioapic, int gsi); 83extern int mp_find_ioapic_pin(int ioapic, int gsi);
82#else
83static inline int
84mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
85 u32 gsi, int triggering, int polarity)
86{
87 return 0;
88}
89#endif 84#endif
90#else /* !CONFIG_ACPI: */ 85#else /* !CONFIG_ACPI: */
91static inline int acpi_probe_gsi(void) 86static inline int acpi_probe_gsi(void)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ec41fc16c16..1692fb5050e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -121,7 +121,6 @@
121#define MSR_K8_TOP_MEM1 0xc001001a 121#define MSR_K8_TOP_MEM1 0xc001001a
122#define MSR_K8_TOP_MEM2 0xc001001d 122#define MSR_K8_TOP_MEM2 0xc001001d
123#define MSR_K8_SYSCFG 0xc0010010 123#define MSR_K8_SYSCFG 0xc0010010
124#define MSR_K8_HWCR 0xc0010015
125#define MSR_K8_INT_PENDING_MSG 0xc0010055 124#define MSR_K8_INT_PENDING_MSG 0xc0010055
126/* C1E active bits in int pending message */ 125/* C1E active bits in int pending message */
127#define K8_INTP_C1E_ACTIVE_MASK 0x18000000 126#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
@@ -208,7 +207,14 @@
208 207
209#define MSR_IA32_THERM_CONTROL 0x0000019a 208#define MSR_IA32_THERM_CONTROL 0x0000019a
210#define MSR_IA32_THERM_INTERRUPT 0x0000019b 209#define MSR_IA32_THERM_INTERRUPT 0x0000019b
210
211#define THERM_INT_LOW_ENABLE (1 << 0)
212#define THERM_INT_HIGH_ENABLE (1 << 1)
213
211#define MSR_IA32_THERM_STATUS 0x0000019c 214#define MSR_IA32_THERM_STATUS 0x0000019c
215
216#define THERM_STATUS_PROCHOT (1 << 0)
217
212#define MSR_IA32_MISC_ENABLE 0x000001a0 218#define MSR_IA32_MISC_ENABLE 0x000001a0
213 219
214/* MISC_ENABLE bits: architectural */ 220/* MISC_ENABLE bits: architectural */
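[Editor's illustration, not part of the commit: the THERM_INT_LOW_ENABLE/THERM_INT_HIGH_ENABLE and THERM_STATUS_PROCHOT definitions added above are plain bit masks inside MSR_IA32_THERM_INTERRUPT (0x19b) and MSR_IA32_THERM_STATUS (0x19c). Assuming an ordinary Linux system with the "msr" character-device driver loaded and root access (the /dev/cpu/0/msr path and that setup are assumptions, not something this patch introduces), the PROCHOT status bit can be inspected from user space roughly like this:]

/* Sketch only: read MSR_IA32_THERM_STATUS (0x19c) through the Linux msr
 * character device and test THERM_STATUS_PROCHOT (bit 0). The msr device
 * interprets the file offset as the MSR number. Requires root. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}
	if (pread(fd, &val, sizeof(val), 0x19c) != sizeof(val)) {
		perror("pread MSR_IA32_THERM_STATUS");
		close(fd);
		return 1;
	}
	printf("THERM_STATUS=%#llx PROCHOT=%d\n",
	       (unsigned long long)val, (int)(val & 1));
	close(fd);
	return 0;
}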
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 638bf624180..22603764e7d 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -12,6 +12,17 @@
12 12
13#include <asm/asm.h> 13#include <asm/asm.h>
14#include <asm/errno.h> 14#include <asm/errno.h>
15#include <asm/cpumask.h>
16
17struct msr {
18 union {
19 struct {
20 u32 l;
21 u32 h;
22 };
23 u64 q;
24 };
25};
15 26
16static inline unsigned long long native_read_tscp(unsigned int *aux) 27static inline unsigned long long native_read_tscp(unsigned int *aux)
17{ 28{
@@ -216,6 +227,8 @@ do { \
216#ifdef CONFIG_SMP 227#ifdef CONFIG_SMP
217int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 228int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
218int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 229int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
230void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
231void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
219int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 232int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
220int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 233int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
221#else /* CONFIG_SMP */ 234#else /* CONFIG_SMP */
@@ -229,6 +242,16 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
229 wrmsr(msr_no, l, h); 242 wrmsr(msr_no, l, h);
230 return 0; 243 return 0;
231} 244}
245static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no,
246 struct msr *msrs)
247{
248 rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
249}
250static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no,
251 struct msr *msrs)
252{
253 wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
254}
232static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, 255static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
233 u32 *l, u32 *h) 256 u32 *l, u32 *h)
234{ 257{
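[Editor's illustration, not part of the commit: the new 'struct msr' above uses an anonymous union so one 64-bit MSR value can be viewed either as a whole quadword (q) or as its low/high 32-bit halves (l/h); rdmsr_on_cpus()/wrmsr_on_cpus() fill one such element per CPU in the mask. A stand-alone sketch of that layout, with made-up values, using C11/GNU anonymous structs:]

/* Sketch: same union layout as the new 'struct msr'; on little-endian
 * x86, l is the low half and h the high half of q. */
#include <stdint.h>
#include <stdio.h>

struct msr {
	union {
		struct {
			uint32_t l;
			uint32_t h;
		};
		uint64_t q;
	};
};

int main(void)
{
	struct msr m = { .q = 0x1122334455667788ULL };	/* made-up value */

	printf("q=%#llx l=%#x h=%#x\n",
	       (unsigned long long)m.q, m.l, m.h);
	return 0;
}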
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c45a0a568df..c9726440993 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -64,7 +64,7 @@ static inline int nmi_watchdog_active(void)
64 * but since they are power of two we could use a 64 * but since they are power of two we could use a
65 * cheaper way --cvg 65 * cheaper way --cvg
66 */ 66 */
67 return nmi_watchdog & 0x3; 67 return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
68} 68}
69#endif 69#endif
70 70
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 064ed6df4cb..c4ae822e415 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
17extern void numa_init_array(void); 17extern void numa_init_array(void);
18extern int numa_off; 18extern int numa_off;
19 19
20extern void srat_reserve_add_area(int nodeid);
21extern int hotadd_percent;
22
23extern s16 apicid_to_node[MAX_LOCAL_APIC]; 20extern s16 apicid_to_node[MAX_LOCAL_APIC];
24 21
25extern unsigned long numa_free_all_bootmem(void); 22extern unsigned long numa_free_all_bootmem(void);
@@ -27,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
27 unsigned long end); 24 unsigned long end);
28 25
29#ifdef CONFIG_NUMA 26#ifdef CONFIG_NUMA
27/*
28 * Too small node sizes may confuse the VM badly. Usually they
29 * result from BIOS bugs. So dont recognize nodes as standalone
30 * NUMA entities that have less than this amount of RAM listed:
31 */
32#define NODE_MIN_SIZE (4*1024*1024)
33
30extern void __init init_cpu_to_node(void); 34extern void __init init_cpu_to_node(void);
31extern void __cpuinit numa_set_node(int cpu, int node); 35extern void __cpuinit numa_set_node(int cpu, int node);
32extern void __cpuinit numa_clear_node(int cpu); 36extern void __cpuinit numa_clear_node(int cpu);
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 89ed9d70b0a..625c3f0e741 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -56,7 +56,7 @@ extern bool __virt_addr_valid(unsigned long kaddr);
56#endif /* __ASSEMBLY__ */ 56#endif /* __ASSEMBLY__ */
57 57
58#include <asm-generic/memory_model.h> 58#include <asm-generic/memory_model.h>
59#include <asm-generic/page.h> 59#include <asm-generic/getorder.h>
60 60
61#define __HAVE_ARCH_GATE_AREA 1 61#define __HAVE_ARCH_GATE_AREA 1
62 62
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 0f915ae649a..6f1b7331313 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE;
54extern int sysctl_legacy_va_layout; 54extern int sysctl_legacy_va_layout;
55 55
56extern void find_low_pfn_range(void); 56extern void find_low_pfn_range(void);
57extern unsigned long init_memory_mapping(unsigned long start,
58 unsigned long end);
59extern void initmem_init(unsigned long, unsigned long);
60extern void free_initmem(void);
61extern void setup_bootmem_allocator(void); 57extern void setup_bootmem_allocator(void);
62 58
63#endif /* !__ASSEMBLY__ */ 59#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index d38c91b7024..8d382d3abf3 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -32,22 +32,14 @@
32 */ 32 */
33#define __PAGE_OFFSET _AC(0xffff880000000000, UL) 33#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
34 34
35#define __PHYSICAL_START CONFIG_PHYSICAL_START 35#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \
36#define __KERNEL_ALIGN 0x200000 36 (CONFIG_PHYSICAL_ALIGN - 1)) & \
37 37 ~(CONFIG_PHYSICAL_ALIGN - 1))
38/*
39 * Make sure kernel is aligned to 2MB address. Catching it at compile
40 * time is better. Change your config file and compile the kernel
41 * for a 2MB aligned address (CONFIG_PHYSICAL_START)
42 */
43#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
44#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
45#endif
46 38
47#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) 39#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
48#define __START_KERNEL_map _AC(0xffffffff80000000, UL) 40#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
49 41
50/* See Documentation/x86_64/mm.txt for a description of the memory map. */ 42/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
51#define __PHYSICAL_MASK_SHIFT 46 43#define __PHYSICAL_MASK_SHIFT 46
52#define __VIRTUAL_MASK_SHIFT 48 44#define __VIRTUAL_MASK_SHIFT 48
53 45
@@ -71,12 +63,6 @@ extern unsigned long __phys_addr(unsigned long);
71 63
72#define vmemmap ((struct page *)VMEMMAP_START) 64#define vmemmap ((struct page *)VMEMMAP_START)
73 65
74extern unsigned long init_memory_mapping(unsigned long start,
75 unsigned long end);
76
77extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
78extern void free_initmem(void);
79
80extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); 66extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
81extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); 67extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
82 68
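[Editor's illustration, not part of the commit: the new __PHYSICAL_START definition above replaces the old compile-time #error with the usual round-up-to-power-of-two-alignment idiom, (x + (align - 1)) & ~(align - 1). The CONFIG_PHYSICAL_START/CONFIG_PHYSICAL_ALIGN values in this sketch are assumptions for demonstration only:]

/* Sketch of the rounding idiom used by the new __PHYSICAL_START. */
#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + ((a) - 1)) & ~((a) - 1))

int main(void)
{
	unsigned long phys_start = 0x1000000UL;	/* assumed CONFIG_PHYSICAL_START */
	unsigned long phys_align = 0x200000UL;	/* assumed CONFIG_PHYSICAL_ALIGN (2 MB) */

	printf("%#lx rounds up to %#lx\n",
	       phys_start + 0x1234, ALIGN_UP(phys_start + 0x1234, phys_align));
	printf("%#lx stays %#lx (already aligned)\n",
	       phys_start, ALIGN_UP(phys_start, phys_align));
	return 0;
}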
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 826ad37006a..6473f5ccff8 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr);
46extern unsigned long max_low_pfn_mapped; 46extern unsigned long max_low_pfn_mapped;
47extern unsigned long max_pfn_mapped; 47extern unsigned long max_pfn_mapped;
48 48
49extern unsigned long init_memory_mapping(unsigned long start,
50 unsigned long end);
51
52extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
53extern void free_initmem(void);
54
49#endif /* !__ASSEMBLY__ */ 55#endif /* !__ASSEMBLY__ */
50 56
51#endif /* _ASM_X86_PAGE_DEFS_H */ 57#endif /* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 378e3691c08..4fb37c8a083 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -56,6 +56,7 @@ struct desc_ptr;
56struct tss_struct; 56struct tss_struct;
57struct mm_struct; 57struct mm_struct;
58struct desc_struct; 58struct desc_struct;
59struct task_struct;
59 60
60/* 61/*
61 * Wrapper type for pointers to code which uses the non-standard 62 * Wrapper type for pointers to code which uses the non-standard
@@ -203,7 +204,8 @@ struct pv_cpu_ops {
203 204
204 void (*swapgs)(void); 205 void (*swapgs)(void);
205 206
206 struct pv_lazy_ops lazy_mode; 207 void (*start_context_switch)(struct task_struct *prev);
208 void (*end_context_switch)(struct task_struct *next);
207}; 209};
208 210
209struct pv_irq_ops { 211struct pv_irq_ops {
@@ -1399,25 +1401,23 @@ enum paravirt_lazy_mode {
1399}; 1401};
1400 1402
1401enum paravirt_lazy_mode paravirt_get_lazy_mode(void); 1403enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
1402void paravirt_enter_lazy_cpu(void); 1404void paravirt_start_context_switch(struct task_struct *prev);
1403void paravirt_leave_lazy_cpu(void); 1405void paravirt_end_context_switch(struct task_struct *next);
1406
1404void paravirt_enter_lazy_mmu(void); 1407void paravirt_enter_lazy_mmu(void);
1405void paravirt_leave_lazy_mmu(void); 1408void paravirt_leave_lazy_mmu(void);
1406void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
1407 1409
1408#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE 1410#define __HAVE_ARCH_START_CONTEXT_SWITCH
1409static inline void arch_enter_lazy_cpu_mode(void) 1411static inline void arch_start_context_switch(struct task_struct *prev)
1410{ 1412{
1411 PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter); 1413 PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
1412} 1414}
1413 1415
1414static inline void arch_leave_lazy_cpu_mode(void) 1416static inline void arch_end_context_switch(struct task_struct *next)
1415{ 1417{
1416 PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); 1418 PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
1417} 1419}
1418 1420
1419void arch_flush_lazy_cpu_mode(void);
1420
1421#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE 1421#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
1422static inline void arch_enter_lazy_mmu_mode(void) 1422static inline void arch_enter_lazy_mmu_mode(void)
1423{ 1423{
@@ -1443,7 +1443,7 @@ u64 _paravirt_ident_64(u64);
1443 1443
1444#define paravirt_nop ((void *)_paravirt_nop) 1444#define paravirt_nop ((void *)_paravirt_nop)
1445 1445
1446#ifdef CONFIG_SMP 1446#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
1447 1447
1448static inline int __raw_spin_is_locked(struct raw_spinlock *lock) 1448static inline int __raw_spin_is_locked(struct raw_spinlock *lock)
1449{ 1449{
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index aee103b26d0..02ecb30982a 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -82,22 +82,22 @@ do { \
82 case 1: \ 82 case 1: \
83 asm(op "b %1,"__percpu_arg(0) \ 83 asm(op "b %1,"__percpu_arg(0) \
84 : "+m" (var) \ 84 : "+m" (var) \
85 : "ri" ((T__)val)); \ 85 : "qi" ((T__)(val))); \
86 break; \ 86 break; \
87 case 2: \ 87 case 2: \
88 asm(op "w %1,"__percpu_arg(0) \ 88 asm(op "w %1,"__percpu_arg(0) \
89 : "+m" (var) \ 89 : "+m" (var) \
90 : "ri" ((T__)val)); \ 90 : "ri" ((T__)(val))); \
91 break; \ 91 break; \
92 case 4: \ 92 case 4: \
93 asm(op "l %1,"__percpu_arg(0) \ 93 asm(op "l %1,"__percpu_arg(0) \
94 : "+m" (var) \ 94 : "+m" (var) \
95 : "ri" ((T__)val)); \ 95 : "ri" ((T__)(val))); \
96 break; \ 96 break; \
97 case 8: \ 97 case 8: \
98 asm(op "q %1,"__percpu_arg(0) \ 98 asm(op "q %1,"__percpu_arg(0) \
99 : "+m" (var) \ 99 : "+m" (var) \
100 : "re" ((T__)val)); \ 100 : "re" ((T__)(val))); \
101 break; \ 101 break; \
102 default: __bad_percpu_size(); \ 102 default: __bad_percpu_size(); \
103 } \ 103 } \
@@ -109,7 +109,7 @@ do { \
109 switch (sizeof(var)) { \ 109 switch (sizeof(var)) { \
110 case 1: \ 110 case 1: \
111 asm(op "b "__percpu_arg(1)",%0" \ 111 asm(op "b "__percpu_arg(1)",%0" \
112 : "=r" (ret__) \ 112 : "=q" (ret__) \
113 : "m" (var)); \ 113 : "m" (var)); \
114 break; \ 114 break; \
115 case 2: \ 115 case 2: \
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 00000000000..876ed97147b
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,100 @@
1#ifndef _ASM_X86_PERF_COUNTER_H
2#define _ASM_X86_PERF_COUNTER_H
3
4/*
5 * Performance counter hw details:
6 */
7
8#define X86_PMC_MAX_GENERIC 8
9#define X86_PMC_MAX_FIXED 3
10
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64
14
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
16#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
17
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
25
26/*
27 * Includes eventsel and unit mask as well:
28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff
30
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36
37#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
38
39/*
40 * Intel "Architectural Performance Monitoring" CPUID
41 * detection/enumeration details:
42 */
43union cpuid10_eax {
44 struct {
45 unsigned int version_id:8;
46 unsigned int num_counters:8;
47 unsigned int bit_width:8;
48 unsigned int mask_length:8;
49 } split;
50 unsigned int full;
51};
52
53union cpuid10_edx {
54 struct {
55 unsigned int num_counters_fixed:4;
56 unsigned int reserved:28;
57 } split;
58 unsigned int full;
59};
60
61
62/*
63 * Fixed-purpose performance counters:
64 */
65
66/*
67 * All 3 fixed-mode PMCs are configured via this single MSR:
68 */
69#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
70
71/*
72 * The counts are available in three separate MSRs:
73 */
74
75/* Instr_Retired.Any: */
76#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
77#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
78
79/* CPU_CLK_Unhalted.Core: */
80#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
81#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
82
83/* CPU_CLK_Unhalted.Ref: */
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86
87extern void set_perf_counter_pending(void);
88
89#define clear_perf_counter_pending() do { } while (0)
90#define test_perf_counter_pending() (0)
91
92#ifdef CONFIG_PERF_COUNTERS
93extern void init_hw_perf_counters(void);
94extern void perf_counters_lapic_init(void);
95#else
96static inline void init_hw_perf_counters(void) { }
97static inline void perf_counters_lapic_init(void) { }
98#endif
99
100#endif /* _ASM_X86_PERF_COUNTER_H */
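[Editor's illustration, not part of the commit: the cpuid10_eax/cpuid10_edx unions in the new header mirror CPUID leaf 0xA, which enumerates Intel architectural performance monitoring. A user-space probe could decode that leaf as sketched below; GCC's <cpuid.h> helper __get_cpuid is an assumption about the build environment, not something this header provides, and the output is only meaningful on CPUs implementing architectural perfmon:]

/* Sketch: decode CPUID leaf 0xA with the same bitfield layout as the
 * unions defined in the new perf_counter.h. */
#include <cpuid.h>
#include <stdio.h>

union cpuid10_eax {
	struct {
		unsigned int version_id:8;
		unsigned int num_counters:8;
		unsigned int bit_width:8;
		unsigned int mask_length:8;
	} split;
	unsigned int full;
};

union cpuid10_edx {
	struct {
		unsigned int num_counters_fixed:4;
		unsigned int reserved:28;
	} split;
	unsigned int full;
};

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	union cpuid10_eax ax;
	union cpuid10_edx dx;

	if (!__get_cpuid(0x0a, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0xA not available");
		return 1;
	}
	ax.full = eax;
	dx.full = edx;
	printf("perfmon version %u, %u generic counters (%u bits), %u fixed\n",
	       ax.split.version_id, ax.split.num_counters,
	       ax.split.bit_width, dx.split.num_counters_fixed);
	return 0;
}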
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 29d96d168bc..3cc06e3fceb 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -81,6 +81,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
81#define pte_val(x) native_pte_val(x) 81#define pte_val(x) native_pte_val(x)
82#define __pte(x) native_make_pte(x) 82#define __pte(x) native_make_pte(x)
83 83
84#define arch_end_context_switch(prev) do {} while(0)
85
84#endif /* CONFIG_PARAVIRT */ 86#endif /* CONFIG_PARAVIRT */
85 87
86/* 88/*
@@ -315,6 +317,11 @@ static inline int pte_present(pte_t a)
315 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); 317 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
316} 318}
317 319
320static inline int pte_hidden(pte_t pte)
321{
322 return pte_flags(pte) & _PAGE_HIDDEN;
323}
324
318static inline int pmd_present(pmd_t pmd) 325static inline int pmd_present(pmd_t pmd)
319{ 326{
320 return pmd_flags(pmd) & _PAGE_PRESENT; 327 return pmd_flags(pmd) & _PAGE_PRESENT;
@@ -503,6 +510,8 @@ static inline int pgd_none(pgd_t pgd)
503 510
504#ifndef __ASSEMBLY__ 511#ifndef __ASSEMBLY__
505 512
513extern int direct_gbpages;
514
506/* local pte updates need not use xchg for locking */ 515/* local pte updates need not use xchg for locking */
507static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) 516static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
508{ 517{
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index 2733fad45f9..5e67c153231 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -46,6 +46,10 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
46# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) 46# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
47#endif 47#endif
48 48
49#define MODULES_VADDR VMALLOC_START
50#define MODULES_END VMALLOC_END
51#define MODULES_LEN (MODULES_VADDR - MODULES_END)
52
49#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) 53#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
50 54
51#endif /* _ASM_X86_PGTABLE_32_DEFS_H */ 55#endif /* _ASM_X86_PGTABLE_32_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 6b87bc6d501..abde308fdb0 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[];
25 25
26extern void paging_init(void); 26extern void paging_init(void);
27 27
28#endif /* !__ASSEMBLY__ */
29
30#ifndef __ASSEMBLY__
31
32#define pte_ERROR(e) \ 28#define pte_ERROR(e) \
33 printk("%s:%d: bad pte %p(%016lx).\n", \ 29 printk("%s:%d: bad pte %p(%016lx).\n", \
34 __FILE__, __LINE__, &(e), pte_val(e)) 30 __FILE__, __LINE__, &(e), pte_val(e))
@@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
135 131
136#define update_mmu_cache(vma, address, pte) do { } while (0) 132#define update_mmu_cache(vma, address, pte) do { } while (0)
137 133
138extern int direct_gbpages;
139
140/* Encode and de-code a swap entry */ 134/* Encode and de-code a swap entry */
141#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE 135#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
142#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) 136#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index fbf42b8e038..766ea16fbbb 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t;
51#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) 51#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
52#define PGDIR_MASK (~(PGDIR_SIZE - 1)) 52#define PGDIR_MASK (~(PGDIR_SIZE - 1))
53 53
54 54/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
55#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) 55#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
56#define VMALLOC_START _AC(0xffffc20000000000, UL) 56#define VMALLOC_START _AC(0xffffc90000000000, UL)
57#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) 57#define VMALLOC_END _AC(0xffffe8ffffffffff, UL)
58#define VMEMMAP_START _AC(0xffffe20000000000, UL) 58#define VMEMMAP_START _AC(0xffffea0000000000, UL)
59#define MODULES_VADDR _AC(0xffffffffa0000000, UL) 59#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
60#define MODULES_END _AC(0xffffffffff000000, UL) 60#define MODULES_END _AC(0xffffffffff000000, UL)
61#define MODULES_LEN (MODULES_END - MODULES_VADDR) 61#define MODULES_LEN (MODULES_END - MODULES_VADDR)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b8238dc8786..54cb697f490 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -18,7 +18,7 @@
18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ 18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
19#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ 19#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
20#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ 20#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
21#define _PAGE_BIT_UNUSED3 11 21#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
@@ -41,13 +41,18 @@
41#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) 41#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
42#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) 42#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
43#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) 43#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
44#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
45#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) 44#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 45#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 46#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 47#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define __HAVE_ARCH_PTE_SPECIAL 48#define __HAVE_ARCH_PTE_SPECIAL
50 49
50#ifdef CONFIG_KMEMCHECK
51#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
52#else
53#define _PAGE_HIDDEN (_AT(pteval_t, 0))
54#endif
55
51#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 56#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
52#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) 57#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
53#else 58#else
@@ -273,7 +278,6 @@ typedef struct page *pgtable_t;
273 278
274extern pteval_t __supported_pte_mask; 279extern pteval_t __supported_pte_mask;
275extern int nx_enabled; 280extern int nx_enabled;
276extern void set_nx(void);
277 281
278#define pgprot_writecombine pgprot_writecombine 282#define pgprot_writecombine pgprot_writecombine
279extern pgprot_t pgprot_writecombine(pgprot_t prot); 283extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 448b34a8e39..2b03f700d3f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -136,7 +136,8 @@ extern struct cpuinfo_x86 boot_cpu_data;
136extern struct cpuinfo_x86 new_cpu_data; 136extern struct cpuinfo_x86 new_cpu_data;
137 137
138extern struct tss_struct doublefault_tss; 138extern struct tss_struct doublefault_tss;
139extern __u32 cleared_cpu_caps[NCAPINTS]; 139extern __u32 cpu_caps_cleared[NCAPINTS];
140extern __u32 cpu_caps_set[NCAPINTS];
140 141
141#ifdef CONFIG_SMP 142#ifdef CONFIG_SMP
142DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 143DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@ -410,9 +411,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary);
410extern unsigned int xstate_size; 411extern unsigned int xstate_size;
411extern void free_thread_xstate(struct task_struct *); 412extern void free_thread_xstate(struct task_struct *);
412extern struct kmem_cache *task_xstate_cachep; 413extern struct kmem_cache *task_xstate_cachep;
413extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
414extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
415extern unsigned short num_cache_leaves;
416 414
417struct thread_struct { 415struct thread_struct {
418 /* Cached TLS descriptors: */ 416 /* Cached TLS descriptors: */
@@ -428,8 +426,12 @@ struct thread_struct {
428 unsigned short fsindex; 426 unsigned short fsindex;
429 unsigned short gsindex; 427 unsigned short gsindex;
430#endif 428#endif
429#ifdef CONFIG_X86_32
431 unsigned long ip; 430 unsigned long ip;
431#endif
432#ifdef CONFIG_X86_64
432 unsigned long fs; 433 unsigned long fs;
434#endif
433 unsigned long gs; 435 unsigned long gs;
434 /* Hardware debugging registers: */ 436 /* Hardware debugging registers: */
435 unsigned long debugreg[HBP_NUM]; 437 unsigned long debugreg[HBP_NUM];
@@ -835,6 +837,7 @@ extern unsigned int BIOS_revision;
835 837
836/* Boot loader type from the setup header: */ 838/* Boot loader type from the setup header: */
837extern int bootloader_type; 839extern int bootloader_type;
840extern int bootloader_version;
838 841
839extern char ignore_fpu_irq; 842extern char ignore_fpu_irq;
840 843
@@ -895,7 +898,6 @@ static inline void spin_lock_prefetch(const void *x)
895 .vm86_info = NULL, \ 898 .vm86_info = NULL, \
896 .sysenter_cs = __KERNEL_CS, \ 899 .sysenter_cs = __KERNEL_CS, \
897 .io_bitmap_ptr = NULL, \ 900 .io_bitmap_ptr = NULL, \
898 .fs = __KERNEL_PERCPU, \
899} 901}
900 902
901/* 903/*
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5cdd19f20b5..0f0d908349a 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -187,14 +187,15 @@ static inline int v8086_mode(struct pt_regs *regs)
187 187
188/* 188/*
189 * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode 189 * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
190 * when it traps. So regs will be the current sp. 190 * when it traps. The previous stack will be directly underneath the saved
191 * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
191 * 192 *
192 * This is valid only for kernel mode traps. 193 * This is valid only for kernel mode traps.
193 */ 194 */
194static inline unsigned long kernel_trap_sp(struct pt_regs *regs) 195static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
195{ 196{
196#ifdef CONFIG_X86_32 197#ifdef CONFIG_X86_32
197 return (unsigned long)regs; 198 return (unsigned long)(&regs->sp);
198#else 199#else
199 return regs->sp; 200 return regs->sp;
200#endif 201#endif
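[Editor's illustration, not part of the commit: the renamed kernel_stack_pointer() above relies on the fact that a same-privilege 32-bit trap pushes no sp/ss, so the interrupted stack begins exactly where those fields would sit, hence '&regs->sp'. A toy model of that pointer arithmetic follows; the struct layout is invented for illustration and is not the real pt_regs:]

/* Toy model: the address of the (unsaved) sp field marks the start of
 * the previous stack for a 32-bit kernel-mode trap. */
#include <stdio.h>

struct fake_regs {
	unsigned long bx, cx, dx, si, di, bp, ax;	/* saved by entry code */
	unsigned long ip, cs, flags;			/* pushed by the CPU   */
	unsigned long sp, ss;	/* not pushed for same-privilege traps */
};

static unsigned long fake_kernel_stack_pointer(struct fake_regs *regs)
{
	return (unsigned long)&regs->sp;	/* previous stack starts here */
}

int main(void)
{
	struct fake_regs regs;

	printf("regs at %p, previous stack would start at %#lx\n",
	       (void *)&regs, fake_kernel_stack_pointer(&regs));
	return 0;
}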
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index a4737dddfd5..64cf2d24fad 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -48,9 +48,15 @@
48#endif 48#endif
49 49
50#ifdef CONFIG_X86_64 50#ifdef CONFIG_X86_64
51#ifdef CONFIG_PARAVIRT
52/* Paravirtualized systems may not have PSE or PGE available */
51#define NEED_PSE 0 53#define NEED_PSE 0
52#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
53#define NEED_PGE 0 54#define NEED_PGE 0
55#else
56#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31)
57#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31)
58#endif
59#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
54#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) 60#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
55#define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) 61#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
56#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) 62#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index bdc2ada05ae..4093d1ed6db 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -33,7 +33,6 @@ struct x86_quirks {
33 int (*setup_ioapic_ids)(void); 33 int (*setup_ioapic_ids)(void);
34}; 34};
35 35
36extern void x86_quirk_pre_intr_init(void);
37extern void x86_quirk_intr_init(void); 36extern void x86_quirk_intr_init(void);
38 37
39extern void x86_quirk_trap_init(void); 38extern void x86_quirk_trap_init(void);
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 7761a5d554b..598457cbd0f 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -117,7 +117,7 @@ typedef unsigned long sigset_t;
117#define MINSIGSTKSZ 2048 117#define MINSIGSTKSZ 2048
118#define SIGSTKSZ 8192 118#define SIGSTKSZ 8192
119 119
120#include <asm-generic/signal.h> 120#include <asm-generic/signal-defs.h>
121 121
122#ifndef __ASSEMBLY__ 122#ifndef __ASSEMBLY__
123 123
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 19e0d88b966..6a84ed166ae 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -180,7 +180,7 @@ extern int safe_smp_processor_id(void);
180static inline int logical_smp_processor_id(void) 180static inline int logical_smp_processor_id(void)
181{ 181{
182 /* we don't want to mark this access volatile - bad code generation */ 182 /* we don't want to mark this access volatile - bad code generation */
183 return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); 183 return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
184} 184}
185 185
186#endif 186#endif
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index e3cc3c063ec..4517d6b9318 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,7 +27,7 @@
27#else /* CONFIG_X86_32 */ 27#else /* CONFIG_X86_32 */
28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ 28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
29# define MAX_PHYSADDR_BITS 44 29# define MAX_PHYSADDR_BITS 44
30# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */ 30# define MAX_PHYSMEM_BITS 46
31#endif 31#endif
32 32
33#endif /* CONFIG_SPARSEMEM */ 33#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index e5e6caffec8..b7e5db87639 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -172,7 +172,7 @@ static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
172 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1; 172 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
173} 173}
174 174
175#ifndef CONFIG_PARAVIRT 175#ifndef CONFIG_PARAVIRT_SPINLOCKS
176 176
177static inline int __raw_spin_is_locked(raw_spinlock_t *lock) 177static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
178{ 178{
@@ -206,7 +206,7 @@ static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
206 __raw_spin_lock(lock); 206 __raw_spin_lock(lock);
207} 207}
208 208
209#endif 209#endif /* CONFIG_PARAVIRT_SPINLOCKS */
210 210
211static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) 211static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
212{ 212{
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 0e0e3ba827f..c86f452256d 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -177,10 +177,18 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
177 * No 3D Now! 177 * No 3D Now!
178 */ 178 */
179 179
180#ifndef CONFIG_KMEMCHECK
180#define memcpy(t, f, n) \ 181#define memcpy(t, f, n) \
181 (__builtin_constant_p((n)) \ 182 (__builtin_constant_p((n)) \
182 ? __constant_memcpy((t), (f), (n)) \ 183 ? __constant_memcpy((t), (f), (n)) \
183 : __memcpy((t), (f), (n))) 184 : __memcpy((t), (f), (n)))
185#else
186/*
187 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
188 * because it means that we know both memory operands in advance.
189 */
190#define memcpy(t, f, n) __memcpy((t), (f), (n))
191#endif
184 192
185#endif 193#endif
186 194
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 2afe164bf1e..19e2c468fc2 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,6 +27,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
27 function. */ 27 function. */
28 28
29#define __HAVE_ARCH_MEMCPY 1 29#define __HAVE_ARCH_MEMCPY 1
30#ifndef CONFIG_KMEMCHECK
30#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 31#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
31extern void *memcpy(void *to, const void *from, size_t len); 32extern void *memcpy(void *to, const void *from, size_t len);
32#else 33#else
@@ -42,6 +43,13 @@ extern void *__memcpy(void *to, const void *from, size_t len);
42 __ret; \ 43 __ret; \
43}) 44})
44#endif 45#endif
46#else
47/*
48 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
49 * because it means that we know both memory operands in advance.
50 */
51#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
52#endif
45 53
46#define __HAVE_ARCH_MEMSET 54#define __HAVE_ARCH_MEMSET
47void *memset(void *s, int c, size_t n); 55void *memset(void *s, int c, size_t n);
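[Editor's illustration, not part of the commit: under CONFIG_KMEMCHECK the hunks above force memcpy() to the open-coded REP-based __memcpy/__inline_memcpy, so source, destination and length remain visible as explicit operands of a single string instruction rather than whatever sequence the compiler's builtin would emit. A user-space analogue of such a copy, using x86-specific GNU C inline assembly, purely for illustration:]

/* Sketch: REP MOVSB copy with both memory operands passed explicitly. */
#include <stdio.h>

static void *rep_movsb_copy(void *dst, const void *src, unsigned long len)
{
	void *ret = dst;

	asm volatile("rep movsb"
		     : "+D" (dst), "+S" (src), "+c" (len)
		     : : "memory");
	return ret;
}

int main(void)
{
	char src[] = "both operands known in advance";
	char dst[sizeof(src)];

	rep_movsb_copy(dst, src, sizeof(src));
	printf("%s\n", dst);
	return 0;
}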
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 82ada75f3eb..85574b7c1bc 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -225,6 +225,7 @@ struct __attribute__ ((__packed__)) vmcb {
225#define SVM_EVTINJ_VALID_ERR (1 << 11) 225#define SVM_EVTINJ_VALID_ERR (1 << 11)
226 226
227#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK 227#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
228#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
228 229
229#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR 230#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
230#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI 231#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 7043408f690..372b76edd63 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * syscalls.h - Linux syscall interfaces (arch-specific) 2 * syscalls.h - Linux syscall interfaces (arch-specific)
3 * 3 *
4 * Copyright (c) 2008 Jaswinder Singh 4 * Copyright (c) 2008 Jaswinder Singh Rajput
5 * 5 *
6 * This file is released under the GPLv2. 6 * This file is released under the GPLv2.
7 * See the file COPYING for more details. 7 * See the file COPYING for more details.
@@ -12,50 +12,55 @@
12 12
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/linkage.h> 14#include <linux/linkage.h>
15#include <linux/types.h>
16#include <linux/signal.h> 15#include <linux/signal.h>
16#include <linux/types.h>
17 17
18/* Common in X86_32 and X86_64 */ 18/* Common in X86_32 and X86_64 */
19/* kernel/ioport.c */ 19/* kernel/ioport.c */
20asmlinkage long sys_ioperm(unsigned long, unsigned long, int); 20asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
21 21
22/* kernel/process.c */
23int sys_fork(struct pt_regs *);
24int sys_vfork(struct pt_regs *);
25
22/* kernel/ldt.c */ 26/* kernel/ldt.c */
23asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); 27asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
24 28
29/* kernel/signal.c */
30long sys_rt_sigreturn(struct pt_regs *);
31
25/* kernel/tls.c */ 32/* kernel/tls.c */
26asmlinkage int sys_set_thread_area(struct user_desc __user *); 33asmlinkage int sys_set_thread_area(struct user_desc __user *);
27asmlinkage int sys_get_thread_area(struct user_desc __user *); 34asmlinkage int sys_get_thread_area(struct user_desc __user *);
28 35
29/* X86_32 only */ 36/* X86_32 only */
30#ifdef CONFIG_X86_32 37#ifdef CONFIG_X86_32
38/* kernel/ioport.c */
39long sys_iopl(struct pt_regs *);
40
31/* kernel/process_32.c */ 41/* kernel/process_32.c */
32int sys_fork(struct pt_regs *);
33int sys_clone(struct pt_regs *); 42int sys_clone(struct pt_regs *);
34int sys_vfork(struct pt_regs *);
35int sys_execve(struct pt_regs *); 43int sys_execve(struct pt_regs *);
36 44
37/* kernel/signal_32.c */ 45/* kernel/signal.c */
38asmlinkage int sys_sigsuspend(int, int, old_sigset_t); 46asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
39asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, 47asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
40 struct old_sigaction __user *); 48 struct old_sigaction __user *);
41int sys_sigaltstack(struct pt_regs *); 49int sys_sigaltstack(struct pt_regs *);
42unsigned long sys_sigreturn(struct pt_regs *); 50unsigned long sys_sigreturn(struct pt_regs *);
43long sys_rt_sigreturn(struct pt_regs *);
44
45/* kernel/ioport.c */
46long sys_iopl(struct pt_regs *);
47 51
48/* kernel/sys_i386_32.c */ 52/* kernel/sys_i386_32.c */
53struct mmap_arg_struct;
54struct sel_arg_struct;
55struct oldold_utsname;
56struct old_utsname;
57
49asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, 58asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
50 unsigned long, unsigned long, unsigned long); 59 unsigned long, unsigned long, unsigned long);
51struct mmap_arg_struct;
52asmlinkage int old_mmap(struct mmap_arg_struct __user *); 60asmlinkage int old_mmap(struct mmap_arg_struct __user *);
53struct sel_arg_struct;
54asmlinkage int old_select(struct sel_arg_struct __user *); 61asmlinkage int old_select(struct sel_arg_struct __user *);
55asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); 62asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
56struct old_utsname;
57asmlinkage int sys_uname(struct old_utsname __user *); 63asmlinkage int sys_uname(struct old_utsname __user *);
58struct oldold_utsname;
59asmlinkage int sys_olduname(struct oldold_utsname __user *); 64asmlinkage int sys_olduname(struct oldold_utsname __user *);
60 65
61/* kernel/vm86_32.c */ 66/* kernel/vm86_32.c */
@@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *);
65#else /* CONFIG_X86_32 */ 70#else /* CONFIG_X86_32 */
66 71
67/* X86_64 only */ 72/* X86_64 only */
73/* kernel/ioport.c */
74asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
75
68/* kernel/process_64.c */ 76/* kernel/process_64.c */
69asmlinkage long sys_fork(struct pt_regs *);
70asmlinkage long sys_clone(unsigned long, unsigned long, 77asmlinkage long sys_clone(unsigned long, unsigned long,
71 void __user *, void __user *, 78 void __user *, void __user *,
72 struct pt_regs *); 79 struct pt_regs *);
73asmlinkage long sys_vfork(struct pt_regs *);
74asmlinkage long sys_execve(char __user *, char __user * __user *, 80asmlinkage long sys_execve(char __user *, char __user * __user *,
75 char __user * __user *, 81 char __user * __user *,
76 struct pt_regs *); 82 struct pt_regs *);
77long sys_arch_prctl(int, unsigned long); 83long sys_arch_prctl(int, unsigned long);
78 84
79/* kernel/ioport.c */ 85/* kernel/signal.c */
80asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
81
82/* kernel/signal_64.c */
83asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, 86asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
84 struct pt_regs *); 87 struct pt_regs *);
85long sys_rt_sigreturn(struct pt_regs *);
86 88
87/* kernel/sys_x86_64.c */ 89/* kernel/sys_x86_64.c */
90struct new_utsname;
91
88asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, 92asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
89 unsigned long, unsigned long, unsigned long); 93 unsigned long, unsigned long, unsigned long);
90struct new_utsname;
91asmlinkage long sys_uname(struct new_utsname __user *); 94asmlinkage long sys_uname(struct new_utsname __user *);
92 95
93#endif /* CONFIG_X86_32 */ 96#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
index f72956331c4..c4ee8056bac 100644
--- a/arch/x86/include/asm/termios.h
+++ b/arch/x86/include/asm/termios.h
@@ -67,6 +67,7 @@ static inline int user_termio_to_kernel_termios(struct ktermios *termios,
67 SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); 67 SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
68 SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); 68 SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
69 SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); 69 SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
70 get_user(termios->c_line, &termio->c_line);
70 return copy_from_user(termios->c_cc, termio->c_cc, NCC); 71 return copy_from_user(termios->c_cc, termio->c_cc, NCC);
71} 72}
72 73
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8820a73ae09..b0783520988 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -94,7 +94,8 @@ struct thread_info {
94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ 96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
97#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */ 97#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
98#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */
98 99
99#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 100#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
100#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 101#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -116,6 +117,7 @@ struct thread_info {
116#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 117#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
117#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 118#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
118#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) 119#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
120#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
119#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) 121#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE)
120 122
121/* work to do in syscall_trace_enter() */ 123/* work to do in syscall_trace_enter() */
@@ -152,9 +154,9 @@ struct thread_info {
152 154
153/* thread information allocation */ 155/* thread information allocation */
154#ifdef CONFIG_DEBUG_STACK_USAGE 156#ifdef CONFIG_DEBUG_STACK_USAGE
155#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) 157#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
156#else 158#else
157#define THREAD_FLAGS GFP_KERNEL 159#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK)
158#endif 160#endif
159 161
160#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR 162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
index b5c9d45c981..1375cfc9396 100644
--- a/arch/x86/include/asm/timex.h
+++ b/arch/x86/include/asm/timex.h
@@ -4,9 +4,7 @@
4#include <asm/processor.h> 4#include <asm/processor.h>
5#include <asm/tsc.h> 5#include <asm/tsc.h>
6 6
7/* The PIT ticks at this frequency (in HZ): */ 7/* Assume we use the PIT time source for the clock tick */
8#define PIT_TICK_RATE 1193182
9
10#define CLOCK_TICK_RATE PIT_TICK_RATE 8#define CLOCK_TICK_RATE PIT_TICK_RATE
11 9
12#define ARCH_HAS_READ_CURRENT_TIMER 10#define ARCH_HAS_READ_CURRENT_TIMER
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index a5ecc9c33e9..7f3eba08e7d 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -172,6 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start,
172 flush_tlb_all(); 172 flush_tlb_all();
173} 173}
174 174
175extern void zap_low_mappings(void); 175extern void zap_low_mappings(bool early);
176 176
177#endif /* _ASM_X86_TLBFLUSH_H */ 177#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index f44b49abca4..066ef590d7e 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -203,7 +203,8 @@ struct pci_bus;
203void x86_pci_root_bus_res_quirks(struct pci_bus *b); 203void x86_pci_root_bus_res_quirks(struct pci_bus *b);
204 204
205#ifdef CONFIG_SMP 205#ifdef CONFIG_SMP
206#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids) 206#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
207 (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
207#define smt_capable() (smp_num_siblings > 1) 208#define smt_capable() (smp_num_siblings > 1)
208#endif 209#endif
209 210
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0d5342515b8..bfd74c032fc 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_TRAPS_H 2#define _ASM_X86_TRAPS_H
3 3
4#include <asm/debugreg.h> 4#include <asm/debugreg.h>
5#include <asm/siginfo.h> /* TRAP_TRACE, ... */
5 6
6#ifdef CONFIG_X86_32 7#ifdef CONFIG_X86_32
7#define dotraplinkage 8#define dotraplinkage
@@ -13,6 +14,9 @@ asmlinkage void divide_error(void);
13asmlinkage void debug(void); 14asmlinkage void debug(void);
14asmlinkage void nmi(void); 15asmlinkage void nmi(void);
15asmlinkage void int3(void); 16asmlinkage void int3(void);
17asmlinkage void xen_debug(void);
18asmlinkage void xen_int3(void);
19asmlinkage void xen_stack_segment(void);
16asmlinkage void overflow(void); 20asmlinkage void overflow(void);
17asmlinkage void bounds(void); 21asmlinkage void bounds(void);
18asmlinkage void invalid_op(void); 22asmlinkage void invalid_op(void);
@@ -74,7 +78,6 @@ static inline int get_si_code(unsigned long condition)
74} 78}
75 79
76extern int panic_on_unrecovered_nmi; 80extern int panic_on_unrecovered_nmi;
77extern int kstack_depth_to_print;
78 81
79void math_error(void __user *); 82void math_error(void __user *);
80void math_emulate(struct math_emu_info *); 83void math_emulate(struct math_emu_info *);
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
index e6f73632007..09b97745772 100644
--- a/arch/x86/include/asm/types.h
+++ b/arch/x86/include/asm/types.h
@@ -14,12 +14,6 @@ typedef unsigned short umode_t;
14 */ 14 */
15#ifdef __KERNEL__ 15#ifdef __KERNEL__
16 16
17#ifdef CONFIG_X86_32
18# define BITS_PER_LONG 32
19#else
20# define BITS_PER_LONG 64
21#endif
22
23#ifndef __ASSEMBLY__ 17#ifndef __ASSEMBLY__
24 18
25typedef u64 dma64_addr_t; 19typedef u64 dma64_addr_t;
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74cf8d..732a3070615 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,8 @@
340#define __NR_inotify_init1 332 340#define __NR_inotify_init1 332
341#define __NR_preadv 333 341#define __NR_preadv 333
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_counter_open 336
343 345
344#ifdef __KERNEL__ 346#ifdef __KERNEL__
345 347
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index f8182946232..900e1617e67 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -657,7 +657,10 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
657__SYSCALL(__NR_preadv, sys_preadv) 657__SYSCALL(__NR_preadv, sys_preadv)
658#define __NR_pwritev 296 658#define __NR_pwritev 296
659__SYSCALL(__NR_pwritev, sys_pwritev) 659__SYSCALL(__NR_pwritev, sys_pwritev)
660 660#define __NR_rt_tgsigqueueinfo 297
661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662#define __NR_perf_counter_open 298
663__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
661 664
662#ifndef __NO_STUBS 665#ifndef __NO_STUBS
663#define __ARCH_WANT_OLD_READDIR 666#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 9b0e61bf7a8..bddd44f2f0a 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -37,7 +37,7 @@
37#define UV_CPUS_PER_ACT_STATUS 32 37#define UV_CPUS_PER_ACT_STATUS 32
38#define UV_ACT_STATUS_MASK 0x3 38#define UV_ACT_STATUS_MASK 0x3
39#define UV_ACT_STATUS_SIZE 2 39#define UV_ACT_STATUS_SIZE 2
40#define UV_ACTIVATION_DESCRIPTOR_SIZE 32 40#define UV_ADP_SIZE 32
41#define UV_DISTRIBUTION_SIZE 256 41#define UV_DISTRIBUTION_SIZE 256
42#define UV_SW_ACK_NPENDING 8 42#define UV_SW_ACK_NPENDING 8
43#define UV_NET_ENDPOINT_INTD 0x38 43#define UV_NET_ENDPOINT_INTD 0x38
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index d3a98ea1062..341070f7ad5 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -133,6 +133,7 @@ struct uv_scir_s {
133struct uv_hub_info_s { 133struct uv_hub_info_s {
134 unsigned long global_mmr_base; 134 unsigned long global_mmr_base;
135 unsigned long gpa_mask; 135 unsigned long gpa_mask;
136 unsigned int gnode_extra;
136 unsigned long gnode_upper; 137 unsigned long gnode_upper;
137 unsigned long lowmem_remap_top; 138 unsigned long lowmem_remap_top;
138 unsigned long lowmem_remap_base; 139 unsigned long lowmem_remap_base;
@@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
159 * p - PNODE (local part of nsids, right shifted 1) 160 * p - PNODE (local part of nsids, right shifted 1)
160 */ 161 */
161#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) 162#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
162#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper) 163#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
164#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1)
163 165
164#define UV_LOCAL_MMR_BASE 0xf4000000UL 166#define UV_LOCAL_MMR_BASE 0xf4000000UL
165#define UV_GLOBAL_MMR32_BASE 0xf8000000UL 167#define UV_GLOBAL_MMR32_BASE 0xf8000000UL
@@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
173#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) 175#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
174 176
175#define UV_GLOBAL_MMR64_PNODE_BITS(p) \ 177#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
176 ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT) 178 ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
177 179
178#define UV_APIC_PNODE_SHIFT 6 180#define UV_APIC_PNODE_SHIFT 6
179 181
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 498f944010b..11be5ad2e0e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -247,6 +247,7 @@ enum vmcs_field {
247#define EXIT_REASON_MSR_READ 31 247#define EXIT_REASON_MSR_READ 31
248#define EXIT_REASON_MSR_WRITE 32 248#define EXIT_REASON_MSR_WRITE 32
249#define EXIT_REASON_MWAIT_INSTRUCTION 36 249#define EXIT_REASON_MWAIT_INSTRUCTION 36
250#define EXIT_REASON_MCE_DURING_VMENTRY 41
250#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 251#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
251#define EXIT_REASON_APIC_ACCESS 44 252#define EXIT_REASON_APIC_ACCESS 44
252#define EXIT_REASON_EPT_VIOLATION 48 253#define EXIT_REASON_EPT_VIOLATION 48
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 11b3bb86e17..7fcf6f3dbcc 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,5 +1,10 @@
1#ifdef CONFIG_KMEMCHECK
2/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
3# include <asm-generic/xor.h>
4#else
1#ifdef CONFIG_X86_32 5#ifdef CONFIG_X86_32
2# include "xor_32.h" 6# include "xor_32.h"
3#else 7#else
4# include "xor_64.h" 8# include "xor_64.h"
5#endif 9#endif
10#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cbc78182917..b67efd1cf59 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -28,7 +28,7 @@ CFLAGS_paravirt.o := $(nostackp)
28obj-y := process_$(BITS).o signal.o entry_$(BITS).o 28obj-y := process_$(BITS).o signal.o entry_$(BITS).o
29obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 29obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
30obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o 30obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
31obj-y += setup.o i8259.o irqinit_$(BITS).o 31obj-y += setup.o i8259.o irqinit.o
32obj-$(CONFIG_X86_VISWS) += visws_quirks.o 32obj-$(CONFIG_X86_VISWS) += visws_quirks.o
33obj-$(CONFIG_X86_32) += probe_roms_32.o 33obj-$(CONFIG_X86_32) += probe_roms_32.o
34obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 34obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -73,7 +73,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
73obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 73obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
74obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 74obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
75obj-$(CONFIG_KPROBES) += kprobes.o 75obj-$(CONFIG_KPROBES) += kprobes.o
76obj-$(CONFIG_MODULES) += module_$(BITS).o 76obj-$(CONFIG_MODULES) += module.o
77obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o 77obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
78obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o 78obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
79obj-$(CONFIG_KGDB) += kgdb.o 79obj-$(CONFIG_KGDB) += kgdb.o
@@ -90,7 +90,8 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
90obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o 90obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
91obj-$(CONFIG_KVM_GUEST) += kvm.o 91obj-$(CONFIG_KVM_GUEST) += kvm.o
92obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 92obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
93obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o 93obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
94obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
94obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 95obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
95 96
96obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 97obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 723989d7f80..631086159c5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -33,6 +33,7 @@
33#include <linux/irq.h> 33#include <linux/irq.h>
34#include <linux/bootmem.h> 34#include <linux/bootmem.h>
35#include <linux/ioport.h> 35#include <linux/ioport.h>
36#include <linux/pci.h>
36 37
37#include <asm/pgtable.h> 38#include <asm/pgtable.h>
38#include <asm/io_apic.h> 39#include <asm/io_apic.h>
@@ -522,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
522 * success: return IRQ number (>=0) 523 * success: return IRQ number (>=0)
523 * failure: return < 0 524 * failure: return < 0
524 */ 525 */
525int acpi_register_gsi(u32 gsi, int triggering, int polarity) 526int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
526{ 527{
527 unsigned int irq; 528 unsigned int irq;
528 unsigned int plat_gsi = gsi; 529 unsigned int plat_gsi = gsi;
@@ -532,14 +533,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
532 * Make sure all (legacy) PCI IRQs are set as level-triggered. 533 * Make sure all (legacy) PCI IRQs are set as level-triggered.
533 */ 534 */
534 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { 535 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
535 if (triggering == ACPI_LEVEL_SENSITIVE) 536 if (trigger == ACPI_LEVEL_SENSITIVE)
536 eisa_set_level_irq(gsi); 537 eisa_set_level_irq(gsi);
537 } 538 }
538#endif 539#endif
539 540
540#ifdef CONFIG_X86_IO_APIC 541#ifdef CONFIG_X86_IO_APIC
541 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { 542 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
542 plat_gsi = mp_register_gsi(gsi, triggering, polarity); 543 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
543 } 544 }
544#endif 545#endif
545 acpi_gsi_to_irq(plat_gsi, &irq); 546 acpi_gsi_to_irq(plat_gsi, &irq);
@@ -903,10 +904,8 @@ extern int es7000_plat;
903#endif 904#endif
904 905
905static struct { 906static struct {
906 int apic_id;
907 int gsi_base; 907 int gsi_base;
908 int gsi_end; 908 int gsi_end;
909 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
910} mp_ioapic_routing[MAX_IO_APICS]; 909} mp_ioapic_routing[MAX_IO_APICS];
911 910
912int mp_find_ioapic(int gsi) 911int mp_find_ioapic(int gsi)
@@ -986,16 +985,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
986 985
987 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 986 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
988 mp_ioapics[idx].apicid = uniq_ioapic_id(id); 987 mp_ioapics[idx].apicid = uniq_ioapic_id(id);
989#ifdef CONFIG_X86_32
990 mp_ioapics[idx].apicver = io_apic_get_version(idx); 988 mp_ioapics[idx].apicver = io_apic_get_version(idx);
991#else 989
992 mp_ioapics[idx].apicver = 0;
993#endif
994 /* 990 /*
995 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 991 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
996 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 992 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
997 */ 993 */
998 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
999 mp_ioapic_routing[idx].gsi_base = gsi_base; 994 mp_ioapic_routing[idx].gsi_base = gsi_base;
1000 mp_ioapic_routing[idx].gsi_end = gsi_base + 995 mp_ioapic_routing[idx].gsi_end = gsi_base +
1001 io_apic_get_redir_entries(idx); 996 io_apic_get_redir_entries(idx);
@@ -1158,26 +1153,52 @@ void __init mp_config_acpi_legacy_irqs(void)
1158 } 1153 }
1159} 1154}
1160 1155
1161int mp_register_gsi(u32 gsi, int triggering, int polarity) 1156static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
1157 int polarity)
1162{ 1158{
1159#ifdef CONFIG_X86_MPPARSE
1160 struct mpc_intsrc mp_irq;
1161 struct pci_dev *pdev;
1162 unsigned char number;
1163 unsigned int devfn;
1163 int ioapic; 1164 int ioapic;
1164 int ioapic_pin; 1165 u8 pin;
1165#ifdef CONFIG_X86_32
1166#define MAX_GSI_NUM 4096
1167#define IRQ_COMPRESSION_START 64
1168 1166
1169 static int pci_irq = IRQ_COMPRESSION_START; 1167 if (!acpi_ioapic)
1170 /* 1168 return 0;
1171 * Mapping between Global System Interrupts, which 1169 if (!dev)
1172 * represent all possible interrupts, and IRQs 1170 return 0;
1173 * assigned to actual devices. 1171 if (dev->bus != &pci_bus_type)
1174 */ 1172 return 0;
1175 static int gsi_to_irq[MAX_GSI_NUM]; 1173
1176#else 1174 pdev = to_pci_dev(dev);
1175 number = pdev->bus->number;
1176 devfn = pdev->devfn;
1177 pin = pdev->pin;
1178 /* print the entry should happen on mptable identically */
1179 mp_irq.type = MP_INTSRC;
1180 mp_irq.irqtype = mp_INT;
1181 mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1182 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1183 mp_irq.srcbus = number;
1184 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1185 ioapic = mp_find_ioapic(gsi);
1186 mp_irq.dstapic = mp_ioapics[ioapic].apicid;
1187 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1188
1189 save_mp_irq(&mp_irq);
1190#endif
1191 return 0;
1192}
1193
1194int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1195{
1196 int ioapic;
1197 int ioapic_pin;
1198 struct io_apic_irq_attr irq_attr;
1177 1199
1178 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) 1200 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
1179 return gsi; 1201 return gsi;
1180#endif
1181 1202
1182 /* Don't set up the ACPI SCI because it's already set up */ 1203 /* Don't set up the ACPI SCI because it's already set up */
1183 if (acpi_gbl_FADT.sci_interrupt == gsi) 1204 if (acpi_gbl_FADT.sci_interrupt == gsi)
@@ -1196,93 +1217,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
1196 gsi = ioapic_renumber_irq(ioapic, gsi); 1217 gsi = ioapic_renumber_irq(ioapic, gsi);
1197#endif 1218#endif
1198 1219
1199 /*
1200 * Avoid pin reprogramming. PRTs typically include entries
1201 * with redundant pin->gsi mappings (but unique PCI devices);
1202 * we only program the IOAPIC on the first.
1203 */
1204 if (ioapic_pin > MP_MAX_IOAPIC_PIN) { 1220 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1205 printk(KERN_ERR "Invalid reference to IOAPIC pin " 1221 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1206 "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 1222 "%d-%d\n", mp_ioapics[ioapic].apicid,
1207 ioapic_pin); 1223 ioapic_pin);
1208 return gsi; 1224 return gsi;
1209 } 1225 }
1210 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
1211 pr_debug("Pin %d-%d already programmed\n",
1212 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1213#ifdef CONFIG_X86_32
1214 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
1215#else
1216 return gsi;
1217#endif
1218 }
1219
1220 set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
1221#ifdef CONFIG_X86_32
1222 /*
1223 * For GSI >= 64, use IRQ compression
1224 */
1225 if ((gsi >= IRQ_COMPRESSION_START)
1226 && (triggering == ACPI_LEVEL_SENSITIVE)) {
1227 /*
1228 * For PCI devices assign IRQs in order, avoiding gaps
1229 * due to unused I/O APIC pins.
1230 */
1231 int irq = gsi;
1232 if (gsi < MAX_GSI_NUM) {
1233 /*
1234 * Retain the VIA chipset work-around (gsi > 15), but
1235 * avoid a problem where the 8254 timer (IRQ0) is setup
1236 * via an override (so it's not on pin 0 of the ioapic),
1237 * and at the same time, the pin 0 interrupt is a PCI
1238 * type. The gsi > 15 test could cause these two pins
1239 * to be shared as IRQ0, and they are not shareable.
1240 * So test for this condition, and if necessary, avoid
1241 * the pin collision.
1242 */
1243 gsi = pci_irq++;
1244 /*
1245 * Don't assign IRQ used by ACPI SCI
1246 */
1247 if (gsi == acpi_gbl_FADT.sci_interrupt)
1248 gsi = pci_irq++;
1249 gsi_to_irq[irq] = gsi;
1250 } else {
1251 printk(KERN_ERR "GSI %u is too high\n", gsi);
1252 return gsi;
1253 }
1254 }
1255#endif
1256 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
1257 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
1258 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1259 return gsi;
1260}
1261 1226
1262int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, 1227 if (enable_update_mptable)
1263 u32 gsi, int triggering, int polarity) 1228 mp_config_acpi_gsi(dev, gsi, trigger, polarity);
1264{
1265#ifdef CONFIG_X86_MPPARSE
1266 struct mpc_intsrc mp_irq;
1267 int ioapic;
1268 1229
1269 if (!acpi_ioapic) 1230 set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
1270 return 0; 1231 trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
1232 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
1233 io_apic_set_pci_routing(dev, gsi, &irq_attr);
1271 1234
1272 /* print the entry should happen on mptable identically */ 1235 return gsi;
1273 mp_irq.type = MP_INTSRC;
1274 mp_irq.irqtype = mp_INT;
1275 mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1276 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1277 mp_irq.srcbus = number;
1278 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1279 ioapic = mp_find_ioapic(gsi);
1280 mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
1281 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1282
1283 save_mp_irq(&mp_irq);
1284#endif
1285 return 0;
1286} 1236}
1287 1237
1288/* 1238/*
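
The mp_config_acpi_gsi() helper added above packs the ACPI trigger/polarity and the PCI device/pin into the MP-table irqflag and srcbusirq fields. A standalone sketch of that encoding; the ACPI_* constants here are illustrative stand-ins (the real values come from the ACPICA headers), only the bit layout mirrors the patch:

#include <stdio.h>

#define ACPI_EDGE_SENSITIVE	1	/* illustrative values only */
#define ACPI_LEVEL_SENSITIVE	2
#define ACPI_ACTIVE_HIGH	1
#define ACPI_ACTIVE_LOW		2

/* bits 3:2 = trigger (01 edge, 11 level), bits 1:0 = polarity */
static unsigned int mp_irqflag(int trigger, int polarity)
{
	return (trigger == ACPI_EDGE_SENSITIVE ? 0x04 : 0x0c) |
	       (polarity == ACPI_ACTIVE_HIGH ? 0x01 : 0x03);
}

/* PCI slot number in bits 6:2, INTx pin (0-based) in bits 1:0 */
static unsigned int mp_srcbusirq(unsigned int devfn, unsigned int pin)
{
	return (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
}

int main(void)
{
	unsigned int devfn = (0x1f << 3) | 2;	/* device 00:1f.2 */

	/* INTA, level-triggered, active low */
	printf("irqflag   = 0x%02x\n", mp_irqflag(ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW));
	printf("srcbusirq = 0x%02x\n", mp_srcbusirq(devfn, 1));
	return 0;
}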
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 1c31cc0e9de..167bc16ce0e 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -9,7 +9,7 @@
9always := wakeup.bin 9always := wakeup.bin
10targets := wakeup.elf wakeup.lds 10targets := wakeup.elf wakeup.lds
11 11
12wakeup-y += wakeup.o wakemain.o video-mode.o copy.o 12wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
13 13
14# The link order of the video-*.o modules can matter. In particular, 14# The link order of the video-*.o modules can matter. In particular,
15# video-vga.o *must* be listed first, followed by video-vesa.o. 15# video-vga.o *must* be listed first, followed by video-vesa.o.
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
new file mode 100644
index 00000000000..f51eb0bb56c
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/bioscall.S
@@ -0,0 +1 @@
#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
new file mode 100644
index 00000000000..6206033ba20
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/regs.c
@@ -0,0 +1 @@
#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 7c243a2c511..ca93638ba43 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -104,7 +104,7 @@ int acpi_save_state_mem(void)
104 initial_gs = per_cpu_offset(smp_processor_id()); 104 initial_gs = per_cpu_offset(smp_processor_id());
105#endif 105#endif
106 initial_code = (unsigned long)wakeup_long64; 106 initial_code = (unsigned long)wakeup_long64;
107 saved_magic = 0x123456789abcdef0; 107 saved_magic = 0x123456789abcdef0L;
108#endif /* CONFIG_64BIT */ 108#endif /* CONFIG_64BIT */
109 109
110 return 0; 110 return 0;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99dad5..1c60554537c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,7 +55,16 @@ struct iommu_cmd {
55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
56 struct unity_map_entry *e); 56 struct unity_map_entry *e);
57static struct dma_ops_domain *find_protection_domain(u16 devid); 57static struct dma_ops_domain *find_protection_domain(u16 devid);
58static u64* alloc_pte(struct protection_domain *dom,
59 unsigned long address, u64
60 **pte_page, gfp_t gfp);
61static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
62 unsigned long start_page,
63 unsigned int pages);
58 64
65#ifndef BUS_NOTIFY_UNBOUND_DRIVER
66#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
67#endif
59 68
60#ifdef CONFIG_AMD_IOMMU_STATS 69#ifdef CONFIG_AMD_IOMMU_STATS
61 70
@@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
213{ 222{
214 struct amd_iommu *iommu; 223 struct amd_iommu *iommu;
215 224
216 list_for_each_entry(iommu, &amd_iommu_list, list) 225 for_each_iommu(iommu)
217 iommu_poll_events(iommu); 226 iommu_poll_events(iommu);
218 227
219 return IRQ_HANDLED; 228 return IRQ_HANDLED;
@@ -440,7 +449,7 @@ static void iommu_flush_domain(u16 domid)
440 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 449 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
441 domid, 1, 1); 450 domid, 1, 1);
442 451
443 list_for_each_entry(iommu, &amd_iommu_list, list) { 452 for_each_iommu(iommu) {
444 spin_lock_irqsave(&iommu->lock, flags); 453 spin_lock_irqsave(&iommu->lock, flags);
445 __iommu_queue_command(iommu, &cmd); 454 __iommu_queue_command(iommu, &cmd);
446 __iommu_completion_wait(iommu); 455 __iommu_completion_wait(iommu);
@@ -449,6 +458,35 @@ static void iommu_flush_domain(u16 domid)
449 } 458 }
450} 459}
451 460
461void amd_iommu_flush_all_domains(void)
462{
463 int i;
464
465 for (i = 1; i < MAX_DOMAIN_ID; ++i) {
466 if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
467 continue;
468 iommu_flush_domain(i);
469 }
470}
471
472void amd_iommu_flush_all_devices(void)
473{
474 struct amd_iommu *iommu;
475 int i;
476
477 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
478 if (amd_iommu_pd_table[i] == NULL)
479 continue;
480
481 iommu = amd_iommu_rlookup_table[i];
482 if (!iommu)
483 continue;
484
485 iommu_queue_inv_dev_entry(iommu, i);
486 iommu_completion_wait(iommu);
487 }
488}
489
452/**************************************************************************** 490/****************************************************************************
453 * 491 *
454 * The functions below are used the create the page table mappings for 492 * The functions below are used the create the page table mappings for
@@ -468,7 +506,7 @@ static int iommu_map_page(struct protection_domain *dom,
468 unsigned long phys_addr, 506 unsigned long phys_addr,
469 int prot) 507 int prot)
470{ 508{
471 u64 __pte, *pte, *page; 509 u64 __pte, *pte;
472 510
473 bus_addr = PAGE_ALIGN(bus_addr); 511 bus_addr = PAGE_ALIGN(bus_addr);
474 phys_addr = PAGE_ALIGN(phys_addr); 512 phys_addr = PAGE_ALIGN(phys_addr);
@@ -477,27 +515,7 @@ static int iommu_map_page(struct protection_domain *dom,
477 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) 515 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
478 return -EINVAL; 516 return -EINVAL;
479 517
480 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; 518 pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
481
482 if (!IOMMU_PTE_PRESENT(*pte)) {
483 page = (u64 *)get_zeroed_page(GFP_KERNEL);
484 if (!page)
485 return -ENOMEM;
486 *pte = IOMMU_L2_PDE(virt_to_phys(page));
487 }
488
489 pte = IOMMU_PTE_PAGE(*pte);
490 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
491
492 if (!IOMMU_PTE_PRESENT(*pte)) {
493 page = (u64 *)get_zeroed_page(GFP_KERNEL);
494 if (!page)
495 return -ENOMEM;
496 *pte = IOMMU_L1_PDE(virt_to_phys(page));
497 }
498
499 pte = IOMMU_PTE_PAGE(*pte);
500 pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
501 519
502 if (IOMMU_PTE_PRESENT(*pte)) 520 if (IOMMU_PTE_PRESENT(*pte))
503 return -EBUSY; 521 return -EBUSY;
@@ -595,7 +613,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
595 * as allocated in the aperture 613 * as allocated in the aperture
596 */ 614 */
597 if (addr < dma_dom->aperture_size) 615 if (addr < dma_dom->aperture_size)
598 __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); 616 __set_bit(addr >> PAGE_SHIFT,
617 dma_dom->aperture[0]->bitmap);
599 } 618 }
600 619
601 return 0; 620 return 0;
@@ -632,42 +651,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
632 ****************************************************************************/ 651 ****************************************************************************/
633 652
634/* 653/*
635 * The address allocator core function. 654 * The address allocator core functions.
636 * 655 *
637 * called with domain->lock held 656 * called with domain->lock held
638 */ 657 */
658
659/*
660 * This function checks if there is a PTE for a given dma address. If
661 * there is one, it returns the pointer to it.
662 */
663static u64* fetch_pte(struct protection_domain *domain,
664 unsigned long address)
665{
666 u64 *pte;
667
668 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
669
670 if (!IOMMU_PTE_PRESENT(*pte))
671 return NULL;
672
673 pte = IOMMU_PTE_PAGE(*pte);
674 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
675
676 if (!IOMMU_PTE_PRESENT(*pte))
677 return NULL;
678
679 pte = IOMMU_PTE_PAGE(*pte);
680 pte = &pte[IOMMU_PTE_L0_INDEX(address)];
681
682 return pte;
683}
684
685/*
686 * This function is used to add a new aperture range to an existing
687 * aperture in case of dma_ops domain allocation or address allocation
688 * failure.
689 */
690static int alloc_new_range(struct amd_iommu *iommu,
691 struct dma_ops_domain *dma_dom,
692 bool populate, gfp_t gfp)
693{
694 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
695 int i;
696
697#ifdef CONFIG_IOMMU_STRESS
698 populate = false;
699#endif
700
701 if (index >= APERTURE_MAX_RANGES)
702 return -ENOMEM;
703
704 dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
705 if (!dma_dom->aperture[index])
706 return -ENOMEM;
707
708 dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
709 if (!dma_dom->aperture[index]->bitmap)
710 goto out_free;
711
712 dma_dom->aperture[index]->offset = dma_dom->aperture_size;
713
714 if (populate) {
715 unsigned long address = dma_dom->aperture_size;
716 int i, num_ptes = APERTURE_RANGE_PAGES / 512;
717 u64 *pte, *pte_page;
718
719 for (i = 0; i < num_ptes; ++i) {
720 pte = alloc_pte(&dma_dom->domain, address,
721 &pte_page, gfp);
722 if (!pte)
723 goto out_free;
724
725 dma_dom->aperture[index]->pte_pages[i] = pte_page;
726
727 address += APERTURE_RANGE_SIZE / 64;
728 }
729 }
730
731 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
732
733	/* Initialize the exclusion range if necessary */
734 if (iommu->exclusion_start &&
735 iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
736 iommu->exclusion_start < dma_dom->aperture_size) {
737 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
738 int pages = iommu_num_pages(iommu->exclusion_start,
739 iommu->exclusion_length,
740 PAGE_SIZE);
741 dma_ops_reserve_addresses(dma_dom, startpage, pages);
742 }
743
744 /*
745 * Check for areas already mapped as present in the new aperture
746 * range and mark those pages as reserved in the allocator. Such
747 * mappings may already exist as a result of requested unity
748 * mappings for devices.
749 */
750 for (i = dma_dom->aperture[index]->offset;
751 i < dma_dom->aperture_size;
752 i += PAGE_SIZE) {
753 u64 *pte = fetch_pte(&dma_dom->domain, i);
754 if (!pte || !IOMMU_PTE_PRESENT(*pte))
755 continue;
756
757 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
758 }
759
760 return 0;
761
762out_free:
763 free_page((unsigned long)dma_dom->aperture[index]->bitmap);
764
765 kfree(dma_dom->aperture[index]);
766 dma_dom->aperture[index] = NULL;
767
768 return -ENOMEM;
769}
770
771static unsigned long dma_ops_area_alloc(struct device *dev,
772 struct dma_ops_domain *dom,
773 unsigned int pages,
774 unsigned long align_mask,
775 u64 dma_mask,
776 unsigned long start)
777{
778 unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
779 int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
780 int i = start >> APERTURE_RANGE_SHIFT;
781 unsigned long boundary_size;
782 unsigned long address = -1;
783 unsigned long limit;
784
785 next_bit >>= PAGE_SHIFT;
786
787 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
788 PAGE_SIZE) >> PAGE_SHIFT;
789
790 for (;i < max_index; ++i) {
791 unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
792
793 if (dom->aperture[i]->offset >= dma_mask)
794 break;
795
796 limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
797 dma_mask >> PAGE_SHIFT);
798
799 address = iommu_area_alloc(dom->aperture[i]->bitmap,
800 limit, next_bit, pages, 0,
801 boundary_size, align_mask);
802 if (address != -1) {
803 address = dom->aperture[i]->offset +
804 (address << PAGE_SHIFT);
805 dom->next_address = address + (pages << PAGE_SHIFT);
806 break;
807 }
808
809 next_bit = 0;
810 }
811
812 return address;
813}
814
639static unsigned long dma_ops_alloc_addresses(struct device *dev, 815static unsigned long dma_ops_alloc_addresses(struct device *dev,
640 struct dma_ops_domain *dom, 816 struct dma_ops_domain *dom,
641 unsigned int pages, 817 unsigned int pages,
642 unsigned long align_mask, 818 unsigned long align_mask,
643 u64 dma_mask) 819 u64 dma_mask)
644{ 820{
645 unsigned long limit;
646 unsigned long address; 821 unsigned long address;
647 unsigned long boundary_size;
648 822
649 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 823#ifdef CONFIG_IOMMU_STRESS
650 PAGE_SIZE) >> PAGE_SHIFT; 824 dom->next_address = 0;
651 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, 825 dom->need_flush = true;
652 dma_mask >> PAGE_SHIFT); 826#endif
653 827
654 if (dom->next_bit >= limit) { 828 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
655 dom->next_bit = 0; 829 dma_mask, dom->next_address);
656 dom->need_flush = true;
657 }
658 830
659 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
660 0 , boundary_size, align_mask);
661 if (address == -1) { 831 if (address == -1) {
662 address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 832 dom->next_address = 0;
663 0, boundary_size, align_mask); 833 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
834 dma_mask, 0);
664 dom->need_flush = true; 835 dom->need_flush = true;
665 } 836 }
666 837
667 if (likely(address != -1)) { 838 if (unlikely(address == -1))
668 dom->next_bit = address + pages;
669 address <<= PAGE_SHIFT;
670 } else
671 address = bad_dma_address; 839 address = bad_dma_address;
672 840
673 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 841 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
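
The hunk above replaces the single flat aperture bitmap with an array of fixed-size ranges, each carrying its own bitmap, and dma_ops_area_alloc() resumes scanning at next_address. A scaled-down user-space sketch of that range-indexed allocator; a simple linear bit scan stands in for iommu_area_alloc(), and the range and page counts are toy values:

#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT	12
#define RANGE_PAGES	64	/* pages per aperture range (toy value) */
#define RANGE_SIZE	((unsigned long)RANGE_PAGES << PAGE_SHIFT)
#define MAX_RANGES	4

struct range {
	unsigned long offset;			/* start address of this range */
	unsigned char bitmap[RANGE_PAGES];	/* one byte per page, 1 = used */
};

struct aperture {
	struct range *ranges[MAX_RANGES];
	unsigned long size;			/* total aperture size in bytes */
	unsigned long next_address;		/* where the last allocation ended */
};

/* find 'pages' consecutive free pages in one range, starting at 'start' */
static long scan_range(struct range *r, unsigned int pages, unsigned int start)
{
	unsigned int i, j;

	for (i = start; i + pages <= RANGE_PAGES; i++) {
		for (j = 0; j < pages; j++)
			if (r->bitmap[i + j])
				break;
		if (j == pages) {
			memset(&r->bitmap[i], 1, pages);
			return i;
		}
	}
	return -1;
}

static unsigned long area_alloc(struct aperture *ap, unsigned int pages)
{
	unsigned int start_bit = (ap->next_address % RANGE_SIZE) >> PAGE_SHIFT;
	unsigned long i = ap->next_address / RANGE_SIZE;

	for (; i < ap->size / RANGE_SIZE; i++) {
		long bit = scan_range(ap->ranges[i], pages, start_bit);

		if (bit >= 0) {
			unsigned long addr = ap->ranges[i]->offset +
					     ((unsigned long)bit << PAGE_SHIFT);
			ap->next_address = addr + ((unsigned long)pages << PAGE_SHIFT);
			return addr;
		}
		start_bit = 0;		/* later ranges are scanned from the start */
	}
	return (unsigned long)-1;	/* caller may grow the aperture and retry */
}

int main(void)
{
	static struct range r0 = { .offset = 0 }, r1 = { .offset = RANGE_SIZE };
	struct aperture ap = { .ranges = { &r0, &r1 }, .size = 2 * RANGE_SIZE };

	r0.bitmap[0] = 1;	/* never hand out address 0, as in the patch */
	printf("first  alloc: %#lx\n", area_alloc(&ap, 8));
	printf("second alloc: %#lx\n", area_alloc(&ap, 8));
	return 0;
}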
@@ -684,11 +852,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
684 unsigned long address, 852 unsigned long address,
685 unsigned int pages) 853 unsigned int pages)
686{ 854{
687 address >>= PAGE_SHIFT; 855 unsigned i = address >> APERTURE_RANGE_SHIFT;
688 iommu_area_free(dom->bitmap, address, pages); 856 struct aperture_range *range = dom->aperture[i];
689 857
690 if (address >= dom->next_bit) 858 BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
859
860#ifdef CONFIG_IOMMU_STRESS
861 if (i < 4)
862 return;
863#endif
864
865 if (address >= dom->next_address)
691 dom->need_flush = true; 866 dom->need_flush = true;
867
868 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
869
870 iommu_area_free(range->bitmap, address, pages);
871
692} 872}
693 873
694/**************************************************************************** 874/****************************************************************************
@@ -736,12 +916,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
736 unsigned long start_page, 916 unsigned long start_page,
737 unsigned int pages) 917 unsigned int pages)
738{ 918{
739 unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; 919 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
740 920
741 if (start_page + pages > last_page) 921 if (start_page + pages > last_page)
742 pages = last_page - start_page; 922 pages = last_page - start_page;
743 923
744 iommu_area_reserve(dom->bitmap, start_page, pages); 924 for (i = start_page; i < start_page + pages; ++i) {
925 int index = i / APERTURE_RANGE_PAGES;
926 int page = i % APERTURE_RANGE_PAGES;
927 __set_bit(page, dom->aperture[index]->bitmap);
928 }
745} 929}
746 930
747static void free_pagetable(struct protection_domain *domain) 931static void free_pagetable(struct protection_domain *domain)
@@ -780,14 +964,19 @@ static void free_pagetable(struct protection_domain *domain)
780 */ 964 */
781static void dma_ops_domain_free(struct dma_ops_domain *dom) 965static void dma_ops_domain_free(struct dma_ops_domain *dom)
782{ 966{
967 int i;
968
783 if (!dom) 969 if (!dom)
784 return; 970 return;
785 971
786 free_pagetable(&dom->domain); 972 free_pagetable(&dom->domain);
787 973
788 kfree(dom->pte_pages); 974 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
789 975 if (!dom->aperture[i])
790 kfree(dom->bitmap); 976 continue;
977 free_page((unsigned long)dom->aperture[i]->bitmap);
978 kfree(dom->aperture[i]);
979 }
791 980
792 kfree(dom); 981 kfree(dom);
793} 982}
@@ -797,19 +986,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
797 * It also intializes the page table and the address allocator data 986 * It also intializes the page table and the address allocator data
798 * structures required for the dma_ops interface 987 * structures required for the dma_ops interface
799 */ 988 */
800static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, 989static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
801 unsigned order)
802{ 990{
803 struct dma_ops_domain *dma_dom; 991 struct dma_ops_domain *dma_dom;
804 unsigned i, num_pte_pages;
805 u64 *l2_pde;
806 u64 address;
807
808 /*
809 * Currently the DMA aperture must be between 32 MB and 1GB in size
810 */
811 if ((order < 25) || (order > 30))
812 return NULL;
813 992
814 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); 993 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
815 if (!dma_dom) 994 if (!dma_dom)
@@ -826,55 +1005,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
826 dma_dom->domain.priv = dma_dom; 1005 dma_dom->domain.priv = dma_dom;
827 if (!dma_dom->domain.pt_root) 1006 if (!dma_dom->domain.pt_root)
828 goto free_dma_dom; 1007 goto free_dma_dom;
829 dma_dom->aperture_size = (1ULL << order);
830 dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
831 GFP_KERNEL);
832 if (!dma_dom->bitmap)
833 goto free_dma_dom;
834 /*
835 * mark the first page as allocated so we never return 0 as
836 * a valid dma-address. So we can use 0 as error value
837 */
838 dma_dom->bitmap[0] = 1;
839 dma_dom->next_bit = 0;
840 1008
841 dma_dom->need_flush = false; 1009 dma_dom->need_flush = false;
842 dma_dom->target_dev = 0xffff; 1010 dma_dom->target_dev = 0xffff;
843 1011
844 /* Intialize the exclusion range if necessary */ 1012 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
845 if (iommu->exclusion_start && 1013 goto free_dma_dom;
846 iommu->exclusion_start < dma_dom->aperture_size) {
847 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
848 int pages = iommu_num_pages(iommu->exclusion_start,
849 iommu->exclusion_length,
850 PAGE_SIZE);
851 dma_ops_reserve_addresses(dma_dom, startpage, pages);
852 }
853 1014
854 /* 1015 /*
855 * At the last step, build the page tables so we don't need to 1016 * mark the first page as allocated so we never return 0 as
856 * allocate page table pages in the dma_ops mapping/unmapping 1017 * a valid dma-address. So we can use 0 as error value
857 * path.
858 */ 1018 */
859 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); 1019 dma_dom->aperture[0]->bitmap[0] = 1;
860 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), 1020 dma_dom->next_address = 0;
861 GFP_KERNEL);
862 if (!dma_dom->pte_pages)
863 goto free_dma_dom;
864
865 l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
866 if (l2_pde == NULL)
867 goto free_dma_dom;
868 1021
869 dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
870
871 for (i = 0; i < num_pte_pages; ++i) {
872 dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
873 if (!dma_dom->pte_pages[i])
874 goto free_dma_dom;
875 address = virt_to_phys(dma_dom->pte_pages[i]);
876 l2_pde[i] = IOMMU_L1_PDE(address);
877 }
878 1022
879 return dma_dom; 1023 return dma_dom;
880 1024
@@ -983,7 +1127,6 @@ static int device_change_notifier(struct notifier_block *nb,
983 struct protection_domain *domain; 1127 struct protection_domain *domain;
984 struct dma_ops_domain *dma_domain; 1128 struct dma_ops_domain *dma_domain;
985 struct amd_iommu *iommu; 1129 struct amd_iommu *iommu;
986 int order = amd_iommu_aperture_order;
987 unsigned long flags; 1130 unsigned long flags;
988 1131
989 if (devid > amd_iommu_last_bdf) 1132 if (devid > amd_iommu_last_bdf)
@@ -1002,17 +1145,7 @@ static int device_change_notifier(struct notifier_block *nb,
1002 "to a non-dma-ops domain\n", dev_name(dev)); 1145 "to a non-dma-ops domain\n", dev_name(dev));
1003 1146
1004 switch (action) { 1147 switch (action) {
1005 case BUS_NOTIFY_BOUND_DRIVER: 1148 case BUS_NOTIFY_UNBOUND_DRIVER:
1006 if (domain)
1007 goto out;
1008 dma_domain = find_protection_domain(devid);
1009 if (!dma_domain)
1010 dma_domain = iommu->default_dom;
1011 attach_device(iommu, &dma_domain->domain, devid);
1012 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
1013 "device %s\n", dma_domain->domain.id, dev_name(dev));
1014 break;
1015 case BUS_NOTIFY_UNBIND_DRIVER:
1016 if (!domain) 1149 if (!domain)
1017 goto out; 1150 goto out;
1018 detach_device(domain, devid); 1151 detach_device(domain, devid);
@@ -1022,7 +1155,7 @@ static int device_change_notifier(struct notifier_block *nb,
1022 dma_domain = find_protection_domain(devid); 1155 dma_domain = find_protection_domain(devid);
1023 if (dma_domain) 1156 if (dma_domain)
1024 goto out; 1157 goto out;
1025 dma_domain = dma_ops_domain_alloc(iommu, order); 1158 dma_domain = dma_ops_domain_alloc(iommu);
1026 if (!dma_domain) 1159 if (!dma_domain)
1027 goto out; 1160 goto out;
1028 dma_domain->target_dev = devid; 1161 dma_domain->target_dev = devid;
@@ -1133,8 +1266,8 @@ static int get_device_resources(struct device *dev,
1133 dma_dom = (*iommu)->default_dom; 1266 dma_dom = (*iommu)->default_dom;
1134 *domain = &dma_dom->domain; 1267 *domain = &dma_dom->domain;
1135 attach_device(*iommu, *domain, *bdf); 1268 attach_device(*iommu, *domain, *bdf);
1136 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 1269 DUMP_printk("Using protection domain %d for device %s\n",
1137 "device %s\n", (*domain)->id, dev_name(dev)); 1270 (*domain)->id, dev_name(dev));
1138 } 1271 }
1139 1272
1140 if (domain_for_device(_bdf) == NULL) 1273 if (domain_for_device(_bdf) == NULL)
@@ -1144,6 +1277,66 @@ static int get_device_resources(struct device *dev,
1144} 1277}
1145 1278
1146/* 1279/*
1280 * If the pte_page is not yet allocated this function is called
1281 */
1282static u64* alloc_pte(struct protection_domain *dom,
1283 unsigned long address, u64 **pte_page, gfp_t gfp)
1284{
1285 u64 *pte, *page;
1286
1287 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
1288
1289 if (!IOMMU_PTE_PRESENT(*pte)) {
1290 page = (u64 *)get_zeroed_page(gfp);
1291 if (!page)
1292 return NULL;
1293 *pte = IOMMU_L2_PDE(virt_to_phys(page));
1294 }
1295
1296 pte = IOMMU_PTE_PAGE(*pte);
1297 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
1298
1299 if (!IOMMU_PTE_PRESENT(*pte)) {
1300 page = (u64 *)get_zeroed_page(gfp);
1301 if (!page)
1302 return NULL;
1303 *pte = IOMMU_L1_PDE(virt_to_phys(page));
1304 }
1305
1306 pte = IOMMU_PTE_PAGE(*pte);
1307
1308 if (pte_page)
1309 *pte_page = pte;
1310
1311 pte = &pte[IOMMU_PTE_L0_INDEX(address)];
1312
1313 return pte;
1314}
1315
1316/*
1317 * This function fetches the PTE for a given address in the aperture
1318 */
1319static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1320 unsigned long address)
1321{
1322 struct aperture_range *aperture;
1323 u64 *pte, *pte_page;
1324
1325 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1326 if (!aperture)
1327 return NULL;
1328
1329 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1330 if (!pte) {
1331 pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
1332 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1333 } else
1334 pte += IOMMU_PTE_L0_INDEX(address);
1335
1336 return pte;
1337}
1338
1339/*
1147 * This is the generic map function. It maps one 4kb page at paddr to 1340 * This is the generic map function. It maps one 4kb page at paddr to
1148 * the given address in the DMA address space for the domain. 1341 * the given address in the DMA address space for the domain.
1149 */ 1342 */
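
alloc_pte() and fetch_pte() above walk a three-level page table, allocating a zeroed page whenever an intermediate level is missing. A user-space sketch of that walk; the index macros and the present bit are simplified stand-ins for the kernel's IOMMU_PTE_* helpers, and table entries store plain pointers instead of physical addresses:

#include <stdint.h>
#include <stdlib.h>

#define ENTRIES		512
#define PTE_PRESENT	1ULL
#define L2_INDEX(a)	(((a) >> 30) & (ENTRIES - 1))
#define L1_INDEX(a)	(((a) >> 21) & (ENTRIES - 1))
#define L0_INDEX(a)	(((a) >> 12) & (ENTRIES - 1))

/* In this sketch a table entry stores a pointer rather than a physical
 * address; the low bit doubles as the present flag (calloc returns
 * sufficiently aligned memory, so that bit is free). */
static uint64_t *next_level(uint64_t *entry)
{
	if (!(*entry & PTE_PRESENT)) {
		uint64_t *page = calloc(ENTRIES, sizeof(*page));

		if (!page)
			return NULL;
		*entry = (uint64_t)(uintptr_t)page | PTE_PRESENT;
	}
	return (uint64_t *)(uintptr_t)(*entry & ~PTE_PRESENT);
}

/* return a pointer to the L0 entry for 'address', building the path on demand */
static uint64_t *walk_alloc(uint64_t *pt_root, unsigned long address)
{
	uint64_t *l1, *l0;

	l1 = next_level(&pt_root[L2_INDEX(address)]);
	if (!l1)
		return NULL;
	l0 = next_level(&l1[L1_INDEX(address)]);
	if (!l0)
		return NULL;
	return &l0[L0_INDEX(address)];
}

int main(void)
{
	static uint64_t root[ENTRIES];
	uint64_t *pte = walk_alloc(root, 0x12345000UL);

	if (pte)
		*pte = 0xabc000ULL | PTE_PRESENT;	/* "map" the page */
	return pte ? 0 : 1;
}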
@@ -1159,8 +1352,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1159 1352
1160 paddr &= PAGE_MASK; 1353 paddr &= PAGE_MASK;
1161 1354
1162 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; 1355 pte = dma_ops_get_pte(dom, address);
1163 pte += IOMMU_PTE_L0_INDEX(address); 1356 if (!pte)
1357 return bad_dma_address;
1164 1358
1165 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1359 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1166 1360
@@ -1185,14 +1379,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1185 struct dma_ops_domain *dom, 1379 struct dma_ops_domain *dom,
1186 unsigned long address) 1380 unsigned long address)
1187{ 1381{
1382 struct aperture_range *aperture;
1188 u64 *pte; 1383 u64 *pte;
1189 1384
1190 if (address >= dom->aperture_size) 1385 if (address >= dom->aperture_size)
1191 return; 1386 return;
1192 1387
1193 WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); 1388 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1389 if (!aperture)
1390 return;
1391
1392 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1393 if (!pte)
1394 return;
1194 1395
1195 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
1196 pte += IOMMU_PTE_L0_INDEX(address); 1396 pte += IOMMU_PTE_L0_INDEX(address);
1197 1397
1198 WARN_ON(!*pte); 1398 WARN_ON(!*pte);
@@ -1216,7 +1416,7 @@ static dma_addr_t __map_single(struct device *dev,
1216 u64 dma_mask) 1416 u64 dma_mask)
1217{ 1417{
1218 dma_addr_t offset = paddr & ~PAGE_MASK; 1418 dma_addr_t offset = paddr & ~PAGE_MASK;
1219 dma_addr_t address, start; 1419 dma_addr_t address, start, ret;
1220 unsigned int pages; 1420 unsigned int pages;
1221 unsigned long align_mask = 0; 1421 unsigned long align_mask = 0;
1222 int i; 1422 int i;
@@ -1232,14 +1432,33 @@ static dma_addr_t __map_single(struct device *dev,
1232 if (align) 1432 if (align)
1233 align_mask = (1UL << get_order(size)) - 1; 1433 align_mask = (1UL << get_order(size)) - 1;
1234 1434
1435retry:
1235 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1436 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1236 dma_mask); 1437 dma_mask);
1237 if (unlikely(address == bad_dma_address)) 1438 if (unlikely(address == bad_dma_address)) {
1238 goto out; 1439 /*
1440 * setting next_address here will let the address
1441 * allocator only scan the new allocated range in the
1442 * first run. This is a small optimization.
1443 */
1444 dma_dom->next_address = dma_dom->aperture_size;
1445
1446 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
1447 goto out;
1448
1449 /*
1450	 * aperture was successfully enlarged by 128 MB, try
1451 * allocation again
1452 */
1453 goto retry;
1454 }
1239 1455
1240 start = address; 1456 start = address;
1241 for (i = 0; i < pages; ++i) { 1457 for (i = 0; i < pages; ++i) {
1242 dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1458 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
1459 if (ret == bad_dma_address)
1460 goto out_unmap;
1461
1243 paddr += PAGE_SIZE; 1462 paddr += PAGE_SIZE;
1244 start += PAGE_SIZE; 1463 start += PAGE_SIZE;
1245 } 1464 }
@@ -1255,6 +1474,17 @@ static dma_addr_t __map_single(struct device *dev,
1255 1474
1256out: 1475out:
1257 return address; 1476 return address;
1477
1478out_unmap:
1479
1480 for (--i; i >= 0; --i) {
1481 start -= PAGE_SIZE;
1482 dma_ops_domain_unmap(iommu, dma_dom, start);
1483 }
1484
1485 dma_ops_free_addresses(dma_dom, address, pages);
1486
1487 return bad_dma_address;
1258} 1488}
1259 1489
1260/* 1490/*
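
The __map_single() change above adds two pieces of control flow: when no address can be found the aperture is enlarged and the allocation retried, and when mapping one of the pages fails midway the pages already mapped are undone and the address range is freed. A compact sketch of that retry-and-unwind pattern; the allocator, grow, map and unmap helpers are trivial stand-ins so the example runs, not the kernel functions:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define BAD_DMA_ADDRESS	(~0UL)

/* trivial stand-ins: a bump allocator over an aperture that can be
 * enlarged once, and map/unmap calls that only log */
static unsigned long aperture_size = 4 * PAGE_SIZE;
static unsigned long next_free = PAGE_SIZE;	/* never hand out 0 */

static unsigned long alloc_addresses(unsigned int pages)
{
	unsigned long addr = next_free;

	if (next_free + pages * PAGE_SIZE > aperture_size)
		return BAD_DMA_ADDRESS;
	next_free += pages * PAGE_SIZE;
	return addr;
}

static int grow_aperture(void)
{
	if (aperture_size >= 16 * PAGE_SIZE)
		return -1;			/* cannot grow any further */
	aperture_size += 4 * PAGE_SIZE;
	return 0;
}

static int map_page(unsigned long dma, unsigned long phys)
{
	printf("map   %#lx -> %#lx\n", dma, phys);
	return 0;
}

static void unmap_page(unsigned long dma)
{
	printf("unmap %#lx\n", dma);
}

static void free_addresses(unsigned long addr, unsigned int pages)
{
	(void)addr; (void)pages;		/* no-op in the sketch */
}

static unsigned long map_range(unsigned long paddr, unsigned int pages)
{
	unsigned long address, start;
	unsigned int i;

retry:
	address = alloc_addresses(pages);
	if (address == BAD_DMA_ADDRESS) {
		if (grow_aperture() != 0)
			return BAD_DMA_ADDRESS;	/* out of address space */
		goto retry;			/* scan the newly added range */
	}

	start = address;
	for (i = 0; i < pages; i++) {
		if (map_page(start, paddr) != 0)
			goto out_unmap;
		paddr += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	return address;

out_unmap:
	while (i--) {				/* undo the pages already mapped */
		start -= PAGE_SIZE;
		unmap_page(start);
	}
	free_addresses(address, pages);
	return BAD_DMA_ADDRESS;
}

int main(void)
{
	map_range(0x100000UL, 2);
	map_range(0x200000UL, 4);		/* forces a grow-and-retry */
	return 0;
}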
@@ -1537,8 +1767,10 @@ static void *alloc_coherent(struct device *dev, size_t size,
1537 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1767 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1538 size, DMA_BIDIRECTIONAL, true, dma_mask); 1768 size, DMA_BIDIRECTIONAL, true, dma_mask);
1539 1769
1540 if (*dma_addr == bad_dma_address) 1770 if (*dma_addr == bad_dma_address) {
1771 spin_unlock_irqrestore(&domain->lock, flags);
1541 goto out_free; 1772 goto out_free;
1773 }
1542 1774
1543 iommu_completion_wait(iommu); 1775 iommu_completion_wait(iommu);
1544 1776
@@ -1625,7 +1857,6 @@ static void prealloc_protection_domains(void)
1625 struct pci_dev *dev = NULL; 1857 struct pci_dev *dev = NULL;
1626 struct dma_ops_domain *dma_dom; 1858 struct dma_ops_domain *dma_dom;
1627 struct amd_iommu *iommu; 1859 struct amd_iommu *iommu;
1628 int order = amd_iommu_aperture_order;
1629 u16 devid; 1860 u16 devid;
1630 1861
1631 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1862 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -1638,7 +1869,7 @@ static void prealloc_protection_domains(void)
1638 iommu = amd_iommu_rlookup_table[devid]; 1869 iommu = amd_iommu_rlookup_table[devid];
1639 if (!iommu) 1870 if (!iommu)
1640 continue; 1871 continue;
1641 dma_dom = dma_ops_domain_alloc(iommu, order); 1872 dma_dom = dma_ops_domain_alloc(iommu);
1642 if (!dma_dom) 1873 if (!dma_dom)
1643 continue; 1874 continue;
1644 init_unity_mappings_for_device(dma_dom, devid); 1875 init_unity_mappings_for_device(dma_dom, devid);
@@ -1664,7 +1895,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {
1664int __init amd_iommu_init_dma_ops(void) 1895int __init amd_iommu_init_dma_ops(void)
1665{ 1896{
1666 struct amd_iommu *iommu; 1897 struct amd_iommu *iommu;
1667 int order = amd_iommu_aperture_order;
1668 int ret; 1898 int ret;
1669 1899
1670 /* 1900 /*
@@ -1672,8 +1902,8 @@ int __init amd_iommu_init_dma_ops(void)
1672 * found in the system. Devices not assigned to any other 1902 * found in the system. Devices not assigned to any other
1673 * protection domain will be assigned to the default one. 1903 * protection domain will be assigned to the default one.
1674 */ 1904 */
1675 list_for_each_entry(iommu, &amd_iommu_list, list) { 1905 for_each_iommu(iommu) {
1676 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1906 iommu->default_dom = dma_ops_domain_alloc(iommu);
1677 if (iommu->default_dom == NULL) 1907 if (iommu->default_dom == NULL)
1678 return -ENOMEM; 1908 return -ENOMEM;
1679 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 1909 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -1710,7 +1940,7 @@ int __init amd_iommu_init_dma_ops(void)
1710 1940
1711free_domains: 1941free_domains:
1712 1942
1713 list_for_each_entry(iommu, &amd_iommu_list, list) { 1943 for_each_iommu(iommu) {
1714 if (iommu->default_dom) 1944 if (iommu->default_dom)
1715 dma_ops_domain_free(iommu->default_dom); 1945 dma_ops_domain_free(iommu->default_dom);
1716 } 1946 }
@@ -1842,7 +2072,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
1842 2072
1843 old_domain = domain_for_device(devid); 2073 old_domain = domain_for_device(devid);
1844 if (old_domain) 2074 if (old_domain)
1845 return -EBUSY; 2075 detach_device(old_domain, devid);
1846 2076
1847 attach_device(iommu, domain, devid); 2077 attach_device(iommu, domain, devid);
1848 2078
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be0902da..238989ec077 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -115,15 +115,21 @@ struct ivmd_header {
115 u64 range_length; 115 u64 range_length;
116} __attribute__((packed)); 116} __attribute__((packed));
117 117
118bool amd_iommu_dump;
119
118static int __initdata amd_iommu_detected; 120static int __initdata amd_iommu_detected;
119 121
120u16 amd_iommu_last_bdf; /* largest PCI device id we have 122u16 amd_iommu_last_bdf; /* largest PCI device id we have
121 to handle */ 123 to handle */
122LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
123 we find in ACPI */ 125 we find in ACPI */
124unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
125bool amd_iommu_isolate = true; /* if true, device isolation is 129bool amd_iommu_isolate = true; /* if true, device isolation is
126 enabled */ 130 enabled */
131#endif
132
127bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
128 134
129LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid)
175static inline unsigned long tbl_size(int entry_size) 181static inline unsigned long tbl_size(int entry_size)
176{ 182{
177 unsigned shift = PAGE_SHIFT + 183 unsigned shift = PAGE_SHIFT +
178 get_order(amd_iommu_last_bdf * entry_size); 184 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
179 185
180 return 1UL << shift; 186 return 1UL << shift;
181} 187}
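
The tbl_size() fix above sizes the per-device tables for amd_iommu_last_bdf + 1 entries, since last_bdf is itself a valid device id, and rounds the result up to whole pages via get_order(). A small sketch of that computation with get_order() re-implemented for user space; the entry sizes used in main() are examples only:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* smallest 'order' such that (PAGE_SIZE << order) >= size */
static unsigned int get_order(unsigned long size)
{
	unsigned int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		size >>= 1;
		order++;
	}
	return order;
}

static unsigned long tbl_size(unsigned int last_bdf, unsigned int entry_size)
{
	/* last_bdf is the largest valid device id, so there are
	 * last_bdf + 1 entries -- the "+ 1" is what the patch adds */
	return 1UL << (PAGE_SHIFT + get_order((last_bdf + 1UL) * entry_size));
}

int main(void)
{
	/* e.g. a full 16-bit BDF space with 32-byte and 2-byte entries */
	printf("device table: %lu bytes\n", tbl_size(0xffff, 32));
	printf("alias table:  %lu bytes\n", tbl_size(0xffff, 2));
	return 0;
}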
@@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size)
193 * This function set the exclusion range in the IOMMU. DMA accesses to the 199 * This function set the exclusion range in the IOMMU. DMA accesses to the
194 * exclusion range are passed through untranslated 200 * exclusion range are passed through untranslated
195 */ 201 */
196static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) 202static void iommu_set_exclusion_range(struct amd_iommu *iommu)
197{ 203{
198 u64 start = iommu->exclusion_start & PAGE_MASK; 204 u64 start = iommu->exclusion_start & PAGE_MASK;
199 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; 205 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
@@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
225} 231}
226 232
227/* Generic functions to enable/disable certain features of the IOMMU. */ 233/* Generic functions to enable/disable certain features of the IOMMU. */
228static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) 234static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
229{ 235{
230 u32 ctrl; 236 u32 ctrl;
231 237
@@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
244} 250}
245 251
246/* Function to enable the hardware */ 252/* Function to enable the hardware */
247static void __init iommu_enable(struct amd_iommu *iommu) 253static void iommu_enable(struct amd_iommu *iommu)
248{ 254{
249 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", 255 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
250 dev_name(&iommu->dev->dev), iommu->cap_ptr); 256 dev_name(&iommu->dev->dev), iommu->cap_ptr);
@@ -252,11 +258,9 @@ static void __init iommu_enable(struct amd_iommu *iommu)
252 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 258 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
253} 259}
254 260
255/* Function to enable IOMMU event logging and event interrupts */ 261static void iommu_disable(struct amd_iommu *iommu)
256static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
257{ 262{
258 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); 263 iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
259 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
260} 264}
261 265
262/* 266/*
@@ -413,25 +417,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
413{ 417{
414 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 418 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
415 get_order(CMD_BUFFER_SIZE)); 419 get_order(CMD_BUFFER_SIZE));
416 u64 entry;
417 420
418 if (cmd_buf == NULL) 421 if (cmd_buf == NULL)
419 return NULL; 422 return NULL;
420 423
421 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 424 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
422 425
423 entry = (u64)virt_to_phys(cmd_buf); 426 return cmd_buf;
427}
428
429/*
430 * This function writes the command buffer address to the hardware and
431 * enables it.
432 */
433static void iommu_enable_command_buffer(struct amd_iommu *iommu)
434{
435 u64 entry;
436
437 BUG_ON(iommu->cmd_buf == NULL);
438
439 entry = (u64)virt_to_phys(iommu->cmd_buf);
424 entry |= MMIO_CMD_SIZE_512; 440 entry |= MMIO_CMD_SIZE_512;
441
425 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 442 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
426 &entry, sizeof(entry)); 443 &entry, sizeof(entry));
427 444
428 /* set head and tail to zero manually */ 445 /* set head and tail to zero manually */
429 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 446 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
430 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 447 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
431 448
432 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); 449 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
433
434 return cmd_buf;
435} 450}
436 451
437static void __init free_command_buffer(struct amd_iommu *iommu) 452static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -443,20 +458,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
443/* allocates the memory where the IOMMU will log its events to */ 458/* allocates the memory where the IOMMU will log its events to */
444static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) 459static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
445{ 460{
446 u64 entry;
447 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 461 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
448 get_order(EVT_BUFFER_SIZE)); 462 get_order(EVT_BUFFER_SIZE));
449 463
450 if (iommu->evt_buf == NULL) 464 if (iommu->evt_buf == NULL)
451 return NULL; 465 return NULL;
452 466
467 return iommu->evt_buf;
468}
469
470static void iommu_enable_event_buffer(struct amd_iommu *iommu)
471{
472 u64 entry;
473
474 BUG_ON(iommu->evt_buf == NULL);
475
453 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; 476 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
477
454 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, 478 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
455 &entry, sizeof(entry)); 479 &entry, sizeof(entry));
456 480
457 iommu->evt_buf_size = EVT_BUFFER_SIZE; 481 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
458
459 return iommu->evt_buf;
460} 482}
461 483
462static void __init free_event_buffer(struct amd_iommu *iommu) 484static void __init free_event_buffer(struct amd_iommu *iommu)
@@ -596,32 +618,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
596 p += sizeof(struct ivhd_header); 618 p += sizeof(struct ivhd_header);
597 end += h->length; 619 end += h->length;
598 620
621
599 while (p < end) { 622 while (p < end) {
600 e = (struct ivhd_entry *)p; 623 e = (struct ivhd_entry *)p;
601 switch (e->type) { 624 switch (e->type) {
602 case IVHD_DEV_ALL: 625 case IVHD_DEV_ALL:
626
627 DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
628 " last device %02x:%02x.%x flags: %02x\n",
629 PCI_BUS(iommu->first_device),
630 PCI_SLOT(iommu->first_device),
631 PCI_FUNC(iommu->first_device),
632 PCI_BUS(iommu->last_device),
633 PCI_SLOT(iommu->last_device),
634 PCI_FUNC(iommu->last_device),
635 e->flags);
636
603 for (dev_i = iommu->first_device; 637 for (dev_i = iommu->first_device;
604 dev_i <= iommu->last_device; ++dev_i) 638 dev_i <= iommu->last_device; ++dev_i)
605 set_dev_entry_from_acpi(iommu, dev_i, 639 set_dev_entry_from_acpi(iommu, dev_i,
606 e->flags, 0); 640 e->flags, 0);
607 break; 641 break;
608 case IVHD_DEV_SELECT: 642 case IVHD_DEV_SELECT:
643
644 DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
645 "flags: %02x\n",
646 PCI_BUS(e->devid),
647 PCI_SLOT(e->devid),
648 PCI_FUNC(e->devid),
649 e->flags);
650
609 devid = e->devid; 651 devid = e->devid;
610 set_dev_entry_from_acpi(iommu, devid, e->flags, 0); 652 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
611 break; 653 break;
612 case IVHD_DEV_SELECT_RANGE_START: 654 case IVHD_DEV_SELECT_RANGE_START:
655
656 DUMP_printk(" DEV_SELECT_RANGE_START\t "
657 "devid: %02x:%02x.%x flags: %02x\n",
658 PCI_BUS(e->devid),
659 PCI_SLOT(e->devid),
660 PCI_FUNC(e->devid),
661 e->flags);
662
613 devid_start = e->devid; 663 devid_start = e->devid;
614 flags = e->flags; 664 flags = e->flags;
615 ext_flags = 0; 665 ext_flags = 0;
616 alias = false; 666 alias = false;
617 break; 667 break;
618 case IVHD_DEV_ALIAS: 668 case IVHD_DEV_ALIAS:
669
670 DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
671 "flags: %02x devid_to: %02x:%02x.%x\n",
672 PCI_BUS(e->devid),
673 PCI_SLOT(e->devid),
674 PCI_FUNC(e->devid),
675 e->flags,
676 PCI_BUS(e->ext >> 8),
677 PCI_SLOT(e->ext >> 8),
678 PCI_FUNC(e->ext >> 8));
679
619 devid = e->devid; 680 devid = e->devid;
620 devid_to = e->ext >> 8; 681 devid_to = e->ext >> 8;
621 set_dev_entry_from_acpi(iommu, devid, e->flags, 0); 682 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
622 amd_iommu_alias_table[devid] = devid_to; 683 amd_iommu_alias_table[devid] = devid_to;
623 break; 684 break;
624 case IVHD_DEV_ALIAS_RANGE: 685 case IVHD_DEV_ALIAS_RANGE:
686
687 DUMP_printk(" DEV_ALIAS_RANGE\t\t "
688 "devid: %02x:%02x.%x flags: %02x "
689 "devid_to: %02x:%02x.%x\n",
690 PCI_BUS(e->devid),
691 PCI_SLOT(e->devid),
692 PCI_FUNC(e->devid),
693 e->flags,
694 PCI_BUS(e->ext >> 8),
695 PCI_SLOT(e->ext >> 8),
696 PCI_FUNC(e->ext >> 8));
697
625 devid_start = e->devid; 698 devid_start = e->devid;
626 flags = e->flags; 699 flags = e->flags;
627 devid_to = e->ext >> 8; 700 devid_to = e->ext >> 8;
@@ -629,17 +702,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
629 alias = true; 702 alias = true;
630 break; 703 break;
631 case IVHD_DEV_EXT_SELECT: 704 case IVHD_DEV_EXT_SELECT:
705
706 DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
707 "flags: %02x ext: %08x\n",
708 PCI_BUS(e->devid),
709 PCI_SLOT(e->devid),
710 PCI_FUNC(e->devid),
711 e->flags, e->ext);
712
632 devid = e->devid; 713 devid = e->devid;
633 set_dev_entry_from_acpi(iommu, devid, e->flags, 714 set_dev_entry_from_acpi(iommu, devid, e->flags,
634 e->ext); 715 e->ext);
635 break; 716 break;
636 case IVHD_DEV_EXT_SELECT_RANGE: 717 case IVHD_DEV_EXT_SELECT_RANGE:
718
719 DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
720 "%02x:%02x.%x flags: %02x ext: %08x\n",
721 PCI_BUS(e->devid),
722 PCI_SLOT(e->devid),
723 PCI_FUNC(e->devid),
724 e->flags, e->ext);
725
637 devid_start = e->devid; 726 devid_start = e->devid;
638 flags = e->flags; 727 flags = e->flags;
639 ext_flags = e->ext; 728 ext_flags = e->ext;
640 alias = false; 729 alias = false;
641 break; 730 break;
642 case IVHD_DEV_RANGE_END: 731 case IVHD_DEV_RANGE_END:
732
733 DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
734 PCI_BUS(e->devid),
735 PCI_SLOT(e->devid),
736 PCI_FUNC(e->devid));
737
643 devid = e->devid; 738 devid = e->devid;
644 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 739 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
645 if (alias) 740 if (alias)
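
The DUMP_printk lines added above decode each 16-bit device id into bus:device.function with PCI_BUS/PCI_SLOT/PCI_FUNC. A tiny sketch of that decoding; the macro definitions follow the standard PCI layout (bus in the top byte, devfn in the low byte) and the sample id is made up:

#include <stdio.h>

#define PCI_BUS(x)	(((x) >> 8) & 0xff)
#define PCI_SLOT(x)	(((x) >> 3) & 0x1f)
#define PCI_FUNC(x)	((x) & 0x07)

int main(void)
{
	unsigned int devid = 0x0a10;	/* example id from an IVRS entry */

	printf("devid %04x -> %02x:%02x.%x\n", devid,
	       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid));
	return 0;
}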
@@ -679,7 +774,7 @@ static void __init free_iommu_all(void)
679{ 774{
680 struct amd_iommu *iommu, *next; 775 struct amd_iommu *iommu, *next;
681 776
682 list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { 777 for_each_iommu_safe(iommu, next) {
683 list_del(&iommu->list); 778 list_del(&iommu->list);
684 free_iommu_one(iommu); 779 free_iommu_one(iommu);
685 kfree(iommu); 780 kfree(iommu);
@@ -710,7 +805,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
710 if (!iommu->mmio_base) 805 if (!iommu->mmio_base)
711 return -ENOMEM; 806 return -ENOMEM;
712 807
713 iommu_set_device_table(iommu);
714 iommu->cmd_buf = alloc_command_buffer(iommu); 808 iommu->cmd_buf = alloc_command_buffer(iommu);
715 if (!iommu->cmd_buf) 809 if (!iommu->cmd_buf)
716 return -ENOMEM; 810 return -ENOMEM;
@@ -746,6 +840,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)
746 h = (struct ivhd_header *)p; 840 h = (struct ivhd_header *)p;
747 switch (*p) { 841 switch (*p) {
748 case ACPI_IVHD_TYPE: 842 case ACPI_IVHD_TYPE:
843
844 DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
845 "seg: %d flags: %01x info %04x\n",
846 PCI_BUS(h->devid), PCI_SLOT(h->devid),
847 PCI_FUNC(h->devid), h->cap_ptr,
848 h->pci_seg, h->flags, h->info);
849 DUMP_printk(" mmio-addr: %016llx\n",
850 h->mmio_phys);
851
749 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); 852 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
750 if (iommu == NULL) 853 if (iommu == NULL)
751 return -ENOMEM; 854 return -ENOMEM;
@@ -773,56 +876,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)
773 * 876 *
774 ****************************************************************************/ 877 ****************************************************************************/
775 878
776static int __init iommu_setup_msix(struct amd_iommu *iommu)
777{
778 struct amd_iommu *curr;
779 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
780 int nvec = 0, i;
781
782 list_for_each_entry(curr, &amd_iommu_list, list) {
783 if (curr->dev == iommu->dev) {
784 entries[nvec].entry = curr->evt_msi_num;
785 entries[nvec].vector = 0;
786 curr->int_enabled = true;
787 nvec++;
788 }
789 }
790
791 if (pci_enable_msix(iommu->dev, entries, nvec)) {
792 pci_disable_msix(iommu->dev);
793 return 1;
794 }
795
796 for (i = 0; i < nvec; ++i) {
797 int r = request_irq(entries->vector, amd_iommu_int_handler,
798 IRQF_SAMPLE_RANDOM,
799 "AMD IOMMU",
800 NULL);
801 if (r)
802 goto out_free;
803 }
804
805 return 0;
806
807out_free:
808 for (i -= 1; i >= 0; --i)
809 free_irq(entries->vector, NULL);
810
811 pci_disable_msix(iommu->dev);
812
813 return 1;
814}
815
816static int __init iommu_setup_msi(struct amd_iommu *iommu) 879static int __init iommu_setup_msi(struct amd_iommu *iommu)
817{ 880{
818 int r; 881 int r;
819 struct amd_iommu *curr;
820
821 list_for_each_entry(curr, &amd_iommu_list, list) {
822 if (curr->dev == iommu->dev)
823 curr->int_enabled = true;
824 }
825
826 882
827 if (pci_enable_msi(iommu->dev)) 883 if (pci_enable_msi(iommu->dev))
828 return 1; 884 return 1;
@@ -837,17 +893,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
837 return 1; 893 return 1;
838 } 894 }
839 895
896 iommu->int_enabled = true;
897 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
898
840 return 0; 899 return 0;
841} 900}
842 901
843static int __init iommu_init_msi(struct amd_iommu *iommu) 902static int iommu_init_msi(struct amd_iommu *iommu)
844{ 903{
845 if (iommu->int_enabled) 904 if (iommu->int_enabled)
846 return 0; 905 return 0;
847 906
848 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) 907 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
849 return iommu_setup_msix(iommu);
850 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
851 return iommu_setup_msi(iommu); 908 return iommu_setup_msi(iommu);
852 909
853 return 1; 910 return 1;
@@ -899,6 +956,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
899static int __init init_unity_map_range(struct ivmd_header *m) 956static int __init init_unity_map_range(struct ivmd_header *m)
900{ 957{
901 struct unity_map_entry *e = 0; 958 struct unity_map_entry *e = 0;
959 char *s;
902 960
903 e = kzalloc(sizeof(*e), GFP_KERNEL); 961 e = kzalloc(sizeof(*e), GFP_KERNEL);
904 if (e == NULL) 962 if (e == NULL)
@@ -906,14 +964,19 @@ static int __init init_unity_map_range(struct ivmd_header *m)
906 964
907 switch (m->type) { 965 switch (m->type) {
908 default: 966 default:
967 kfree(e);
968 return 0;
909 case ACPI_IVMD_TYPE: 969 case ACPI_IVMD_TYPE:
970		s = "IVMD_TYPE\t\t\t";
910 e->devid_start = e->devid_end = m->devid; 971 e->devid_start = e->devid_end = m->devid;
911 break; 972 break;
912 case ACPI_IVMD_TYPE_ALL: 973 case ACPI_IVMD_TYPE_ALL:
974 s = "IVMD_TYPE_ALL\t\t";
913 e->devid_start = 0; 975 e->devid_start = 0;
914 e->devid_end = amd_iommu_last_bdf; 976 e->devid_end = amd_iommu_last_bdf;
915 break; 977 break;
916 case ACPI_IVMD_TYPE_RANGE: 978 case ACPI_IVMD_TYPE_RANGE:
979 s = "IVMD_TYPE_RANGE\t\t";
917 e->devid_start = m->devid; 980 e->devid_start = m->devid;
918 e->devid_end = m->aux; 981 e->devid_end = m->aux;
919 break; 982 break;
@@ -922,6 +985,13 @@ static int __init init_unity_map_range(struct ivmd_header *m)
922 e->address_end = e->address_start + PAGE_ALIGN(m->range_length); 985 e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
923 e->prot = m->flags >> 1; 986 e->prot = m->flags >> 1;
924 987
988 DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
989 " range_start: %016llx range_end: %016llx flags: %x\n", s,
990 PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
991 PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
992 PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
993 e->address_start, e->address_end, m->flags);
994
925 list_add_tail(&e->list, &amd_iommu_unity_map); 995 list_add_tail(&e->list, &amd_iommu_unity_map);
926 996
927 return 0; 997 return 0;
@@ -967,18 +1037,28 @@ static void init_device_table(void)
967 * This function finally enables all IOMMUs found in the system after 1037 * This function finally enables all IOMMUs found in the system after
968 * they have been initialized 1038 * they have been initialized
969 */ 1039 */
970static void __init enable_iommus(void) 1040static void enable_iommus(void)
971{ 1041{
972 struct amd_iommu *iommu; 1042 struct amd_iommu *iommu;
973 1043
974 list_for_each_entry(iommu, &amd_iommu_list, list) { 1044 for_each_iommu(iommu) {
1045 iommu_set_device_table(iommu);
1046 iommu_enable_command_buffer(iommu);
1047 iommu_enable_event_buffer(iommu);
975 iommu_set_exclusion_range(iommu); 1048 iommu_set_exclusion_range(iommu);
976 iommu_init_msi(iommu); 1049 iommu_init_msi(iommu);
977 iommu_enable_event_logging(iommu);
978 iommu_enable(iommu); 1050 iommu_enable(iommu);
979 } 1051 }
980} 1052}
981 1053
1054static void disable_iommus(void)
1055{
1056 struct amd_iommu *iommu;
1057
1058 for_each_iommu(iommu)
1059 iommu_disable(iommu);
1060}
1061
982/* 1062/*
983 * Suspend/Resume support 1063 * Suspend/Resume support
984 * disable suspend until real resume implemented 1064 * disable suspend until real resume implemented
@@ -986,12 +1066,31 @@ static void __init enable_iommus(void)
986 1066
987static int amd_iommu_resume(struct sys_device *dev) 1067static int amd_iommu_resume(struct sys_device *dev)
988{ 1068{
1069 /*
1070 * Disable IOMMUs before reprogramming the hardware registers.
1071 * IOMMU is still enabled from the resume kernel.
1072 */
1073 disable_iommus();
1074
1075 /* re-load the hardware */
1076 enable_iommus();
1077
1078 /*
1079 * we have to flush after the IOMMUs are enabled because a
1080 * disabled IOMMU will never execute the commands we send
1081 */
1082 amd_iommu_flush_all_domains();
1083 amd_iommu_flush_all_devices();
1084
989 return 0; 1085 return 0;
990} 1086}
991 1087
992static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) 1088static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
993{ 1089{
994 return -EINVAL; 1090 /* disable IOMMUs to go out of the way for BIOS */
1091 disable_iommus();
1092
1093 return 0;
995} 1094}
996 1095
997static struct sysdev_class amd_iommu_sysdev_class = { 1096static struct sysdev_class amd_iommu_sysdev_class = {
@@ -1137,9 +1236,6 @@ int __init amd_iommu_init(void)
1137 1236
1138 enable_iommus(); 1237 enable_iommus();
1139 1238
1140 printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
1141 (1 << (amd_iommu_aperture_order-20)));
1142
1143 printk(KERN_INFO "AMD IOMMU: device isolation "); 1239 printk(KERN_INFO "AMD IOMMU: device isolation ");
1144 if (amd_iommu_isolate) 1240 if (amd_iommu_isolate)
1145 printk("enabled\n"); 1241 printk("enabled\n");
@@ -1211,6 +1307,13 @@ void __init amd_iommu_detect(void)
1211 * 1307 *
1212 ****************************************************************************/ 1308 ****************************************************************************/
1213 1309
1310static int __init parse_amd_iommu_dump(char *str)
1311{
1312 amd_iommu_dump = true;
1313
1314 return 1;
1315}
1316
1214static int __init parse_amd_iommu_options(char *str) 1317static int __init parse_amd_iommu_options(char *str)
1215{ 1318{
1216 for (; *str; ++str) { 1319 for (; *str; ++str) {
@@ -1225,15 +1328,5 @@ static int __init parse_amd_iommu_options(char *str)
1225 return 1; 1328 return 1;
1226} 1329}
1227 1330
1228static int __init parse_amd_iommu_size_options(char *str) 1331__setup("amd_iommu_dump", parse_amd_iommu_dump);
1229{
1230 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
1231
1232 if ((order > 24) && (order < 31))
1233 amd_iommu_aperture_order = order;
1234
1235 return 1;
1236}
1237
1238__setup("amd_iommu=", parse_amd_iommu_options); 1332__setup("amd_iommu=", parse_amd_iommu_options);
1239__setup("amd_iommu_size=", parse_amd_iommu_size_options);
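parse_amd_iommu_options() above walks its argument character by character and matches keywords at each position. A standalone sketch of that pattern follows; the keywords ("fullflush", "off") and the flags they set are illustrative stand-ins, since the strings actually recognized are not part of this excerpt.

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

static bool opt_fullflush;	/* hypothetical flags for the demo */
static bool opt_off;

/* Same shape as an early __setup() handler: scan the string and match
 * known keywords wherever they start. */
static int parse_example_options(const char *str)
{
	for (; *str; ++str) {
		if (strncmp(str, "fullflush", 9) == 0)
			opt_fullflush = true;
		if (strncmp(str, "off", 3) == 0)
			opt_off = true;
	}
	return 1;
}

int main(void)
{
	parse_example_options("fullflush,off");
	printf("fullflush=%d off=%d\n", opt_fullflush, opt_off);
	return 0;
}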
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f2870920f24..8c7c042ecad 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
14 * Mikael Pettersson : PM converted to driver model. 14 * Mikael Pettersson : PM converted to driver model.
15 */ 15 */
16 16
17#include <linux/perf_counter.h>
17#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
18#include <linux/mc146818rtc.h> 19#include <linux/mc146818rtc.h>
19#include <linux/acpi_pmtmr.h> 20#include <linux/acpi_pmtmr.h>
@@ -34,6 +35,7 @@
34#include <linux/smp.h> 35#include <linux/smp.h>
35#include <linux/mm.h> 36#include <linux/mm.h>
36 37
38#include <asm/perf_counter.h>
37#include <asm/pgalloc.h> 39#include <asm/pgalloc.h>
38#include <asm/atomic.h> 40#include <asm/atomic.h>
39#include <asm/mpspec.h> 41#include <asm/mpspec.h>
@@ -98,6 +100,29 @@ early_param("lapic", parse_lapic);
98/* Local APIC was disabled by the BIOS and enabled by the kernel */ 100/* Local APIC was disabled by the BIOS and enabled by the kernel */
99static int enabled_via_apicbase; 101static int enabled_via_apicbase;
100 102
103/*
104 * Handle interrupt mode configuration register (IMCR).
105 * This register controls whether the interrupt signals
106 * that reach the BSP come from the master PIC or from the
107 * local APIC. Before entering Symmetric I/O Mode, either
108 * the BIOS or the operating system must switch out of
109 * PIC Mode by changing the IMCR.
110 */
111static inline void imcr_pic_to_apic(void)
112{
113 /* select IMCR register */
114 outb(0x70, 0x22);
115 /* NMI and 8259 INTR go through APIC */
116 outb(0x01, 0x23);
117}
118
119static inline void imcr_apic_to_pic(void)
120{
121 /* select IMCR register */
122 outb(0x70, 0x22);
123 /* NMI and 8259 INTR go directly to BSP */
124 outb(0x00, 0x23);
125}
101#endif 126#endif
102 127
103#ifdef CONFIG_X86_64 128#ifdef CONFIG_X86_64
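The IMCR helpers above program I/O ports 0x22/0x23, which cannot be touched from ordinary user space, so this standalone model replaces outb() with writes into an array that stands in for the port space. Only the select-then-write protocol is taken from the hunk; the simulation itself is an assumption for the demo.

#include <stdio.h>
#include <stdint.h>

static uint8_t port_space[0x100];	/* fake I/O port space for the demo */
static uint8_t imcr;			/* the register behind ports 0x22/0x23 */

static void sim_outb(uint8_t value, uint16_t port)
{
	port_space[port] = value;
	/* Writing 0x70 to port 0x22 selects the IMCR; the following write to
	 * port 0x23 then lands in that register. */
	if (port == 0x23 && port_space[0x22] == 0x70)
		imcr = value;
}

static void imcr_pic_to_apic(void)
{
	sim_outb(0x70, 0x22);	/* select IMCR register */
	sim_outb(0x01, 0x23);	/* NMI and 8259 INTR go through APIC */
}

static void imcr_apic_to_pic(void)
{
	sim_outb(0x70, 0x22);	/* select IMCR register */
	sim_outb(0x00, 0x23);	/* NMI and 8259 INTR go directly to BSP */
}

int main(void)
{
	imcr_pic_to_apic();
	printf("IMCR after pic_to_apic: %#x\n", (unsigned)imcr);
	imcr_apic_to_pic();
	printf("IMCR after apic_to_pic: %#x\n", (unsigned)imcr);
	return 0;
}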
@@ -111,13 +136,19 @@ static __init int setup_apicpmtimer(char *s)
111__setup("apicpmtimer", setup_apicpmtimer); 136__setup("apicpmtimer", setup_apicpmtimer);
112#endif 137#endif
113 138
139int x2apic_mode;
114#ifdef CONFIG_X86_X2APIC 140#ifdef CONFIG_X86_X2APIC
115int x2apic;
116/* x2apic enabled before OS handover */ 141/* x2apic enabled before OS handover */
117static int x2apic_preenabled; 142static int x2apic_preenabled;
118static int disable_x2apic; 143static int disable_x2apic;
119static __init int setup_nox2apic(char *str) 144static __init int setup_nox2apic(char *str)
120{ 145{
146 if (x2apic_enabled()) {
147 pr_warning("Bios already enabled x2apic, "
148 "can't enforce nox2apic");
149 return 0;
150 }
151
121 disable_x2apic = 1; 152 disable_x2apic = 1;
122 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 153 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
123 return 0; 154 return 0;
@@ -209,6 +240,31 @@ static int modern_apic(void)
209 return lapic_get_version() >= 0x14; 240 return lapic_get_version() >= 0x14;
210} 241}
211 242
243/*
244 * bare function to substitute write operation
245 * and it's _that_ fast :)
246 */
247static void native_apic_write_dummy(u32 reg, u32 v)
248{
249 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
250}
251
252static u32 native_apic_read_dummy(u32 reg)
253{
254 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
255 return 0;
256}
257
258/*
259 * right after this call apic->write/read doesn't do anything
260 * note that there is no restore operation it works one way
261 */
262void apic_disable(void)
263{
264 apic->read = native_apic_read_dummy;
265 apic->write = native_apic_write_dummy;
266}
267
212void native_apic_wait_icr_idle(void) 268void native_apic_wait_icr_idle(void)
213{ 269{
214 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 270 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
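apic_disable() above swaps the apic ops' read/write members for warning stubs, with no way back. A self-contained sketch of that one-way function-pointer swap; the structure and names here are invented for the demo, only the pattern mirrors the hunk.

#include <stdio.h>
#include <stdint.h>

/* Cut-down ops structure standing in for 'struct apic' (demo only). */
struct demo_apic_ops {
	uint32_t (*read)(uint32_t reg);
	void (*write)(uint32_t reg, uint32_t v);
};

static uint32_t real_read(uint32_t reg)
{
	(void)reg;
	return 0x12345678;	/* pretend hardware value */
}

static void real_write(uint32_t reg, uint32_t v)
{
	(void)reg; (void)v;	/* would touch hardware */
}

/* Dummy replacements: complain and do nothing. */
static uint32_t dummy_read(uint32_t reg)
{
	fprintf(stderr, "APIC read of reg %#x after disable\n", (unsigned)reg);
	return 0;
}

static void dummy_write(uint32_t reg, uint32_t v)
{
	(void)v;
	fprintf(stderr, "APIC write of reg %#x after disable\n", (unsigned)reg);
}

static struct demo_apic_ops ops = { real_read, real_write };

/* One-way switch: after this, ops.read/ops.write no longer do anything real. */
static void demo_apic_disable(void)
{
	ops.read = dummy_read;
	ops.write = dummy_write;
}

int main(void)
{
	printf("before: %#x\n", (unsigned)ops.read(0x30));
	demo_apic_disable();
	printf("after:  %#x\n", (unsigned)ops.read(0x30));
	return 0;
}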
@@ -348,7 +404,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
348 404
349static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) 405static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
350{ 406{
351 unsigned long reg = (lvt_off << 4) + APIC_EILVT0; 407 unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
352 unsigned int v = (mask << 16) | (msg_type << 8) | vector; 408 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
353 409
354 apic_write(reg, v); 410 apic_write(reg, v);
@@ -815,7 +871,7 @@ void clear_local_APIC(void)
815 u32 v; 871 u32 v;
816 872
817 /* APIC hasn't been mapped yet */ 873 /* APIC hasn't been mapped yet */
818 if (!x2apic && !apic_phys) 874 if (!x2apic_mode && !apic_phys)
819 return; 875 return;
820 876
821 maxlvt = lapic_get_maxlvt(); 877 maxlvt = lapic_get_maxlvt();
@@ -843,7 +899,7 @@ void clear_local_APIC(void)
843 } 899 }
844 900
845 /* lets not touch this if we didn't frob it */ 901 /* lets not touch this if we didn't frob it */
846#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) 902#ifdef CONFIG_X86_THERMAL_VECTOR
847 if (maxlvt >= 5) { 903 if (maxlvt >= 5) {
848 v = apic_read(APIC_LVTTHMR); 904 v = apic_read(APIC_LVTTHMR);
849 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 905 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -1133,6 +1189,7 @@ void __cpuinit setup_local_APIC(void)
1133 apic_write(APIC_ESR, 0); 1189 apic_write(APIC_ESR, 0);
1134 } 1190 }
1135#endif 1191#endif
1192 perf_counters_lapic_init();
1136 1193
1137 preempt_disable(); 1194 preempt_disable();
1138 1195
@@ -1287,7 +1344,7 @@ void check_x2apic(void)
1287{ 1344{
1288 if (x2apic_enabled()) { 1345 if (x2apic_enabled()) {
1289 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); 1346 pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
1290 x2apic_preenabled = x2apic = 1; 1347 x2apic_preenabled = x2apic_mode = 1;
1291 } 1348 }
1292} 1349}
1293 1350
@@ -1295,7 +1352,7 @@ void enable_x2apic(void)
1295{ 1352{
1296 int msr, msr2; 1353 int msr, msr2;
1297 1354
1298 if (!x2apic) 1355 if (!x2apic_mode)
1299 return; 1356 return;
1300 1357
1301 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1358 rdmsr(MSR_IA32_APICBASE, msr, msr2);
@@ -1304,6 +1361,7 @@ void enable_x2apic(void)
1304 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1361 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1305 } 1362 }
1306} 1363}
1364#endif /* CONFIG_X86_X2APIC */
1307 1365
1308void __init enable_IR_x2apic(void) 1366void __init enable_IR_x2apic(void)
1309{ 1367{
@@ -1312,32 +1370,21 @@ void __init enable_IR_x2apic(void)
1312 unsigned long flags; 1370 unsigned long flags;
1313 struct IO_APIC_route_entry **ioapic_entries = NULL; 1371 struct IO_APIC_route_entry **ioapic_entries = NULL;
1314 1372
1315	if (!cpu_has_x2apic)
1316		return;
1317
1318	if (!x2apic_preenabled && disable_x2apic) {
1319		pr_info("Skipped enabling x2apic and Interrupt-remapping "
1320			"because of nox2apic\n");
1321		return;
1322	}
1323
1324	if (x2apic_preenabled && disable_x2apic)
1325		panic("Bios already enabled x2apic, can't enforce nox2apic");
1326
1327	if (!x2apic_preenabled && skip_ioapic_setup) {
1328		pr_info("Skipped enabling x2apic and Interrupt-remapping "
1329			"because of skipping io-apic setup\n");
1330		return;
1331	}
1332
1333	ret = dmar_table_init();
1334	if (ret) {
1335		pr_info("dmar_table_init() failed with %d:\n", ret);
1336
1337		if (x2apic_preenabled)
1338			panic("x2apic enabled by bios. But IR enabling failed");
1339		else
1340			pr_info("Not enabling x2apic,Intr-remapping\n");
1341		return;
1342	}
1343
 1373	ret = dmar_table_init();
 1374	if (ret) {
 1375		pr_debug("dmar_table_init() failed with %d:\n", ret);
 1376		goto ir_failed;
 1377	}
 1378
 1379	if (!intr_remapping_supported()) {
 1380		pr_debug("intr-remapping not supported\n");
 1381		goto ir_failed;
 1382	}
 1383
 1384
 1385	if (!x2apic_preenabled && skip_ioapic_setup) {
 1386		pr_info("Skipped enabling intr-remap because of skipping "
 1387			"io-apic setup\n");
 1388		return;
 1389	}
 1390
@@ -1357,19 +1404,16 @@ void __init enable_IR_x2apic(void)
1357 mask_IO_APIC_setup(ioapic_entries); 1404 mask_IO_APIC_setup(ioapic_entries);
1358 mask_8259A(); 1405 mask_8259A();
1359 1406
1360 ret = enable_intr_remapping(EIM_32BIT_APIC_ID); 1407 ret = enable_intr_remapping(x2apic_supported());
1361
1362 if (ret && x2apic_preenabled) {
1363 local_irq_restore(flags);
1364 panic("x2apic enabled by bios. But IR enabling failed");
1365 }
1366
1367 if (ret) 1408 if (ret)
1368 goto end_restore; 1409 goto end_restore;
1369 1410
1370	if (!x2apic) {
1371		x2apic = 1;
 1411	pr_info("Enabled Interrupt-remapping\n");
 1412
1413 if (x2apic_supported() && !x2apic_mode) {
1414 x2apic_mode = 1;
1372 enable_x2apic(); 1415 enable_x2apic();
1416 pr_info("Enabled x2apic\n");
1373 } 1417 }
1374 1418
1375end_restore: 1419end_restore:
@@ -1378,37 +1422,34 @@ end_restore:
1378 * IR enabling failed 1422 * IR enabling failed
1379 */ 1423 */
1380 restore_IO_APIC_setup(ioapic_entries); 1424 restore_IO_APIC_setup(ioapic_entries);
1381 else
1382 reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);
1383 1425
1384 unmask_8259A(); 1426 unmask_8259A();
1385 local_irq_restore(flags); 1427 local_irq_restore(flags);
1386 1428
1387end: 1429end:
1388 if (!ret) {
1389 if (!x2apic_preenabled)
1390 pr_info("Enabled x2apic and interrupt-remapping\n");
1391 else
1392 pr_info("Enabled Interrupt-remapping\n");
1393 } else
1394 pr_err("Failed to enable Interrupt-remapping and x2apic\n");
1395 if (ioapic_entries) 1430 if (ioapic_entries)
1396 free_ioapic_entries(ioapic_entries); 1431 free_ioapic_entries(ioapic_entries);
1432
1433 if (!ret)
1434 return;
1435
1436ir_failed:
1437 if (x2apic_preenabled)
1438 panic("x2apic enabled by bios. But IR enabling failed");
1439 else if (cpu_has_x2apic)
1440 pr_info("Not enabling x2apic,Intr-remapping\n");
1397#else 1441#else
1398 if (!cpu_has_x2apic) 1442 if (!cpu_has_x2apic)
1399 return; 1443 return;
1400 1444
1401 if (x2apic_preenabled) 1445 if (x2apic_preenabled)
1402 panic("x2apic enabled prior OS handover," 1446 panic("x2apic enabled prior OS handover,"
1403 " enable CONFIG_INTR_REMAP"); 1447 " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
1404
1405 pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1406 " and x2apic\n");
1407#endif 1448#endif
1408 1449
1409 return; 1450 return;
1410} 1451}
1411#endif /* CONFIG_X86_X2APIC */ 1452
1412 1453
1413#ifdef CONFIG_X86_64 1454#ifdef CONFIG_X86_64
1414/* 1455/*
@@ -1425,7 +1466,6 @@ static int __init detect_init_APIC(void)
1425 } 1466 }
1426 1467
1427 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; 1468 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1428 boot_cpu_physical_apicid = 0;
1429 return 0; 1469 return 0;
1430} 1470}
1431#else 1471#else
@@ -1539,32 +1579,49 @@ void __init early_init_lapic_mapping(void)
1539 */ 1579 */
1540void __init init_apic_mappings(void) 1580void __init init_apic_mappings(void)
1541{ 1581{
1542 if (x2apic) { 1582 unsigned int new_apicid;
1583
1584 if (x2apic_mode) {
1543 boot_cpu_physical_apicid = read_apic_id(); 1585 boot_cpu_physical_apicid = read_apic_id();
1544 return; 1586 return;
1545 } 1587 }
1546 1588
1547	/*
 1589	/* If no local APIC can be found return early */
1548 * If no local APIC can be found then set up a fake all
1549 * zeroes page to simulate the local APIC and another
1550 * one for the IO-APIC.
1551 */
1552 if (!smp_found_config && detect_init_APIC()) { 1590 if (!smp_found_config && detect_init_APIC()) {
1553		apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
1554		apic_phys = __pa(apic_phys);
1555	} else
 1591		/* lets NOP'ify apic operations */
 1592		pr_info("APIC: disable apic facility\n");
 1593		apic_disable();
1594 } else {
1556 apic_phys = mp_lapic_addr; 1595 apic_phys = mp_lapic_addr;
1557 1596
1558	set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1559	apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
1560				APIC_BASE, apic_phys);
 1597		/*
 1598		 * acpi lapic path already maps that address in
 1599		 * acpi_register_lapic_address()
1600 */
1601 if (!acpi_lapic)
1602 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1603
1604 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
1605 APIC_BASE, apic_phys);
1606 }
1561 1607
1562 /* 1608 /*
1563 * Fetch the APIC ID of the BSP in case we have a 1609 * Fetch the APIC ID of the BSP in case we have a
1564 * default configuration (or the MP table is broken). 1610 * default configuration (or the MP table is broken).
1565 */ 1611 */
1566	if (boot_cpu_physical_apicid == -1U)
1567		boot_cpu_physical_apicid = read_apic_id();
 1612	new_apicid = read_apic_id();
 1613	if (boot_cpu_physical_apicid != new_apicid) {
1614 boot_cpu_physical_apicid = new_apicid;
1615 /*
1616 * yeah -- we lie about apic_version
1617 * in case if apic was disabled via boot option
1618 * but it's not a problem for SMP compiled kernel
1619 * since smp_sanity_check is prepared for such a case
1620 * and disable smp mode
1621 */
1622 apic_version[new_apicid] =
1623 GET_APIC_VERSION(apic_read(APIC_LVR));
1624 }
1568} 1625}
1569 1626
1570/* 1627/*
@@ -1733,8 +1790,7 @@ void __init connect_bsp_APIC(void)
1733 */ 1790 */
1734 apic_printk(APIC_VERBOSE, "leaving PIC mode, " 1791 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1735 "enabling APIC mode.\n"); 1792 "enabling APIC mode.\n");
1736 outb(0x70, 0x22); 1793 imcr_pic_to_apic();
1737 outb(0x01, 0x23);
1738 } 1794 }
1739#endif 1795#endif
1740 if (apic->enable_apic_mode) 1796 if (apic->enable_apic_mode)
@@ -1762,8 +1818,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1762 */ 1818 */
1763 apic_printk(APIC_VERBOSE, "disabling APIC mode, " 1819 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1764 "entering PIC mode.\n"); 1820 "entering PIC mode.\n");
1765 outb(0x70, 0x22); 1821 imcr_apic_to_pic();
1766 outb(0x00, 0x23);
1767 return; 1822 return;
1768 } 1823 }
1769#endif 1824#endif
@@ -1962,17 +2017,17 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1962 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 2017 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1963 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 2018 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1964 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 2019 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1965#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) 2020#ifdef CONFIG_X86_THERMAL_VECTOR
1966 if (maxlvt >= 5) 2021 if (maxlvt >= 5)
1967 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 2022 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1968#endif 2023#endif
1969 2024
1970 local_irq_save(flags); 2025 local_irq_save(flags);
1971 disable_local_APIC(); 2026 disable_local_APIC();
1972#ifdef CONFIG_INTR_REMAP 2027
1973 if (intr_remapping_enabled) 2028 if (intr_remapping_enabled)
1974 disable_intr_remapping(); 2029 disable_intr_remapping();
1975#endif 2030
1976 local_irq_restore(flags); 2031 local_irq_restore(flags);
1977 return 0; 2032 return 0;
1978} 2033}
@@ -1982,42 +2037,34 @@ static int lapic_resume(struct sys_device *dev)
1982 unsigned int l, h; 2037 unsigned int l, h;
1983 unsigned long flags; 2038 unsigned long flags;
1984 int maxlvt; 2039 int maxlvt;
1985 2040 int ret = 0;
1986#ifdef CONFIG_INTR_REMAP
1987 int ret;
1988 struct IO_APIC_route_entry **ioapic_entries = NULL; 2041 struct IO_APIC_route_entry **ioapic_entries = NULL;
1989 2042
1990 if (!apic_pm_state.active) 2043 if (!apic_pm_state.active)
1991 return 0; 2044 return 0;
1992 2045
1993 local_irq_save(flags); 2046 local_irq_save(flags);
1994	if (x2apic) {
 2047	if (intr_remapping_enabled) {
1995 ioapic_entries = alloc_ioapic_entries(); 2048 ioapic_entries = alloc_ioapic_entries();
1996 if (!ioapic_entries) { 2049 if (!ioapic_entries) {
1997 WARN(1, "Alloc ioapic_entries in lapic resume failed."); 2050 WARN(1, "Alloc ioapic_entries in lapic resume failed.");
1998			return -ENOMEM;
 2051			ret = -ENOMEM;
2052 goto restore;
1999 } 2053 }
2000 2054
2001 ret = save_IO_APIC_setup(ioapic_entries); 2055 ret = save_IO_APIC_setup(ioapic_entries);
2002 if (ret) { 2056 if (ret) {
2003 WARN(1, "Saving IO-APIC state failed: %d\n", ret); 2057 WARN(1, "Saving IO-APIC state failed: %d\n", ret);
2004 free_ioapic_entries(ioapic_entries); 2058 free_ioapic_entries(ioapic_entries);
2005			return ret;
 2059			goto restore;
2006 } 2060 }
2007 2061
2008 mask_IO_APIC_setup(ioapic_entries); 2062 mask_IO_APIC_setup(ioapic_entries);
2009 mask_8259A(); 2063 mask_8259A();
2010 enable_x2apic();
2011 } 2064 }
2012#else
2013 if (!apic_pm_state.active)
2014 return 0;
2015 2065
2016 local_irq_save(flags); 2066 if (x2apic_mode)
2017 if (x2apic)
2018 enable_x2apic(); 2067 enable_x2apic();
2019#endif
2020
2021 else { 2068 else {
2022 /* 2069 /*
2023 * Make sure the APICBASE points to the right address 2070 * Make sure the APICBASE points to the right address
@@ -2055,21 +2102,16 @@ static int lapic_resume(struct sys_device *dev)
2055 apic_write(APIC_ESR, 0); 2102 apic_write(APIC_ESR, 0);
2056 apic_read(APIC_ESR); 2103 apic_read(APIC_ESR);
2057 2104
2058#ifdef CONFIG_INTR_REMAP 2105 if (intr_remapping_enabled) {
2059 if (intr_remapping_enabled) 2106 reenable_intr_remapping(x2apic_mode);
2060 reenable_intr_remapping(EIM_32BIT_APIC_ID);
2061
2062 if (x2apic) {
2063 unmask_8259A(); 2107 unmask_8259A();
2064 restore_IO_APIC_setup(ioapic_entries); 2108 restore_IO_APIC_setup(ioapic_entries);
2065 free_ioapic_entries(ioapic_entries); 2109 free_ioapic_entries(ioapic_entries);
2066 } 2110 }
2067#endif 2111restore:
2068
2069 local_irq_restore(flags); 2112 local_irq_restore(flags);
2070 2113
2071 2114 return ret;
2072 return 0;
2073} 2115}
2074 2116
2075/* 2117/*
@@ -2117,31 +2159,14 @@ static void apic_pm_activate(void) { }
2117#endif /* CONFIG_PM */ 2159#endif /* CONFIG_PM */
2118 2160
2119#ifdef CONFIG_X86_64 2161#ifdef CONFIG_X86_64
2120/* 2162
2121 * apic_is_clustered_box() -- Check if we can expect good TSC 2163static int __cpuinit apic_cluster_num(void)
2122 *
2123 * Thus far, the major user of this is IBM's Summit2 series:
2124 *
2125 * Clustered boxes may have unsynced TSC problems if they are
2126 * multi-chassis. Use available data to take a good guess.
2127 * If in doubt, go HPET.
2128 */
2129__cpuinit int apic_is_clustered_box(void)
2130{ 2164{
2131 int i, clusters, zeros; 2165 int i, clusters, zeros;
2132 unsigned id; 2166 unsigned id;
2133 u16 *bios_cpu_apicid; 2167 u16 *bios_cpu_apicid;
2134 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); 2168 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
2135 2169
2136 /*
2137 * there is not this kind of box with AMD CPU yet.
2138 * Some AMD box with quadcore cpu and 8 sockets apicid
2139 * will be [4, 0x23] or [8, 0x27] could be thought to
2140 * vsmp box still need checking...
2141 */
2142 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
2143 return 0;
2144
2145 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); 2170 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
2146 bitmap_zero(clustermap, NUM_APIC_CLUSTERS); 2171 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
2147 2172
@@ -2177,18 +2202,67 @@ __cpuinit int apic_is_clustered_box(void)
2177 ++zeros; 2202 ++zeros;
2178 } 2203 }
2179 2204
2180	/* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
2181	 * not guaranteed to be synced between boards
2182	 */
2183	if (is_vsmp_box() && clusters > 1)
 2205	return clusters;
 2206}
 2207
 2208static int __cpuinitdata multi_checked;
2209static int __cpuinitdata multi;
2210
2211static int __cpuinit set_multi(const struct dmi_system_id *d)
2212{
2213 if (multi)
2214 return 0;
2215 pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
2216 multi = 1;
2217 return 0;
2218}
2219
2220static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
2221 {
2222 .callback = set_multi,
2223 .ident = "IBM System Summit2",
2224 .matches = {
2225 DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
2226 DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
2227 },
2228 },
2229 {}
2230};
2231
2232static void __cpuinit dmi_check_multi(void)
2233{
2234 if (multi_checked)
2235 return;
2236
2237 dmi_check_system(multi_dmi_table);
2238 multi_checked = 1;
2239}
2240
2241/*
2242 * apic_is_clustered_box() -- Check if we can expect good TSC
2243 *
2244 * Thus far, the major user of this is IBM's Summit2 series:
2245 * Clustered boxes may have unsynced TSC problems if they are
2246 * multi-chassis.
2247 * Use DMI to check them
2248 */
2249__cpuinit int apic_is_clustered_box(void)
2250{
2251 dmi_check_multi();
2252 if (multi)
2184 return 1; 2253 return 1;
2185 2254
2255 if (!is_vsmp_box())
2256 return 0;
2257
2186	/*
2187	 * If clusters > 2, then should be multi-chassis.
2188	 * May have to revisit this when multi-core + hyperthreaded CPUs come
2189	 * out, but AFAIK this will work even for them.
2190	 */
2191	return (clusters > 2);
 2258	/*
 2259	 * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
 2260	 * not guaranteed to be synced between boards
 2261	 */
 2262	if (apic_cluster_num() > 1)
 2263		return 1;
 2264
 2265	return 0;
2192} 2266}
2193#endif 2267#endif
2194 2268
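apic_cluster_num() above counts how many APIC clusters the BIOS-reported ids fall into; only part of its loop is visible in this hunk. A standalone sketch of the counting idea, assuming the cluster id is the upper nibble of an 8-bit xAPIC id (that detail is an assumption here):

#include <stdio.h>
#include <stdint.h>

#define NUM_DEMO_CLUSTERS 16	/* 8-bit id, upper nibble = cluster (assumption) */

/* One bit per cluster seen, then count the set bits. */
static int count_clusters(const uint8_t *apicid, int n)
{
	uint32_t clustermap = 0;
	int clusters = 0;
	int i;

	for (i = 0; i < n; i++)
		clustermap |= 1u << (apicid[i] >> 4);

	for (i = 0; i < NUM_DEMO_CLUSTERS; i++)
		if (clustermap & (1u << i))
			clusters++;

	return clusters;
}

int main(void)
{
	/* Hypothetical APIC ids spread over three clusters (0x0, 0x1, 0x2). */
	uint8_t ids[] = { 0x00, 0x01, 0x10, 0x11, 0x20 };

	printf("clusters: %d\n", count_clusters(ids, (int)(sizeof(ids) / sizeof(ids[0]))));
	return 0;
}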
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 306e5e88fb6..d0c99abc26c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -161,7 +161,7 @@ static int flat_apic_id_registered(void)
161 161
162static int flat_phys_pkg_id(int initial_apic_id, int index_msb) 162static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
163{ 163{
164 return hard_smp_processor_id() >> index_msb; 164 return initial_apic_id >> index_msb;
165} 165}
166 166
167struct apic apic_flat = { 167struct apic apic_flat = {
@@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
235 * regardless of how many processors are present (x86_64 ES7000 235 * regardless of how many processors are present (x86_64 ES7000
236 * is an example). 236 * is an example).
237 */ 237 */
238 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && 238 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
239 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { 239 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
240 printk(KERN_DEBUG "system APIC only can use physical flat"); 240 printk(KERN_DEBUG "system APIC only can use physical flat");
241 return 1; 241 return 1;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 1c11b819f24..69328ac8de9 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi)
145 return gsi; 145 return gsi;
146} 146}
147 147
148static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) 148static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
149{ 149{
150 unsigned long vect = 0, psaival = 0; 150 unsigned long vect = 0, psaival = 0;
151 151
@@ -254,7 +254,7 @@ static int parse_unisys_oem(char *oemptr)
254} 254}
255 255
256#ifdef CONFIG_ACPI 256#ifdef CONFIG_ACPI
257static int find_unisys_acpi_oem_table(unsigned long *oem_addr) 257static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
258{ 258{
259 struct acpi_table_header *header = NULL; 259 struct acpi_table_header *header = NULL;
260 struct es7000_oem_table *table; 260 struct es7000_oem_table *table;
@@ -285,7 +285,7 @@ static int find_unisys_acpi_oem_table(unsigned long *oem_addr)
285 return 0; 285 return 0;
286} 286}
287 287
288static void unmap_unisys_acpi_oem_table(unsigned long oem_addr) 288static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
289{ 289{
290 if (!oem_addr) 290 if (!oem_addr)
291 return; 291 return;
@@ -306,7 +306,7 @@ static int es7000_check_dsdt(void)
306static int es7000_acpi_ret; 306static int es7000_acpi_ret;
307 307
308/* Hook from generic ACPI tables.c */ 308/* Hook from generic ACPI tables.c */
309static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 309static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
310{ 310{
311 unsigned long oem_addr = 0; 311 unsigned long oem_addr = 0;
312 int check_dsdt; 312 int check_dsdt;
@@ -717,7 +717,7 @@ struct apic apic_es7000_cluster = {
717 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 717 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
718}; 718};
719 719
720struct apic apic_es7000 = { 720struct apic __refdata apic_es7000 = {
721 721
722 .name = "es7000", 722 .name = "es7000",
723 .probe = probe_es7000, 723 .probe = probe_es7000,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 30da617d18e..ef8d9290c7e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -59,6 +59,7 @@
59#include <asm/setup.h> 59#include <asm/setup.h>
60#include <asm/irq_remapping.h> 60#include <asm/irq_remapping.h>
61#include <asm/hpet.h> 61#include <asm/hpet.h>
62#include <asm/hw_irq.h>
62#include <asm/uv/uv_hub.h> 63#include <asm/uv/uv_hub.h>
63#include <asm/uv/uv_irq.h> 64#include <asm/uv/uv_irq.h>
64 65
@@ -129,12 +130,9 @@ struct irq_pin_list {
129 struct irq_pin_list *next; 130 struct irq_pin_list *next;
130}; 131};
131 132
132static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) 133static struct irq_pin_list *get_one_free_irq_2_pin(int node)
133{ 134{
134 struct irq_pin_list *pin; 135 struct irq_pin_list *pin;
135 int node;
136
137 node = cpu_to_node(cpu);
138 136
139 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); 137 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
140 138
@@ -148,9 +146,6 @@ struct irq_cfg {
148 unsigned move_cleanup_count; 146 unsigned move_cleanup_count;
149 u8 vector; 147 u8 vector;
150 u8 move_in_progress : 1; 148 u8 move_in_progress : 1;
151#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
152 u8 move_desc_pending : 1;
153#endif
154}; 149};
155 150
156/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 151/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -182,16 +177,18 @@ int __init arch_early_irq_init(void)
182 struct irq_cfg *cfg; 177 struct irq_cfg *cfg;
183 struct irq_desc *desc; 178 struct irq_desc *desc;
184 int count; 179 int count;
180 int node;
185 int i; 181 int i;
186 182
187 cfg = irq_cfgx; 183 cfg = irq_cfgx;
188 count = ARRAY_SIZE(irq_cfgx); 184 count = ARRAY_SIZE(irq_cfgx);
185 node= cpu_to_node(boot_cpu_id);
189 186
190 for (i = 0; i < count; i++) { 187 for (i = 0; i < count; i++) {
191 desc = irq_to_desc(i); 188 desc = irq_to_desc(i);
192 desc->chip_data = &cfg[i]; 189 desc->chip_data = &cfg[i];
193 alloc_bootmem_cpumask_var(&cfg[i].domain); 190 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
194 alloc_bootmem_cpumask_var(&cfg[i].old_domain); 191 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
195 if (i < NR_IRQS_LEGACY) 192 if (i < NR_IRQS_LEGACY)
196 cpumask_setall(cfg[i].domain); 193 cpumask_setall(cfg[i].domain);
197 } 194 }
@@ -212,12 +209,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
212 return cfg; 209 return cfg;
213} 210}
214 211
215static struct irq_cfg *get_one_free_irq_cfg(int cpu) 212static struct irq_cfg *get_one_free_irq_cfg(int node)
216{ 213{
217 struct irq_cfg *cfg; 214 struct irq_cfg *cfg;
218 int node;
219
220 node = cpu_to_node(cpu);
221 215
222 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); 216 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
223 if (cfg) { 217 if (cfg) {
@@ -238,13 +232,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
238 return cfg; 232 return cfg;
239} 233}
240 234
241int arch_init_chip_data(struct irq_desc *desc, int cpu) 235int arch_init_chip_data(struct irq_desc *desc, int node)
242{ 236{
243 struct irq_cfg *cfg; 237 struct irq_cfg *cfg;
244 238
245 cfg = desc->chip_data; 239 cfg = desc->chip_data;
246 if (!cfg) { 240 if (!cfg) {
247 desc->chip_data = get_one_free_irq_cfg(cpu); 241 desc->chip_data = get_one_free_irq_cfg(node);
248 if (!desc->chip_data) { 242 if (!desc->chip_data) {
249 printk(KERN_ERR "can not alloc irq_cfg\n"); 243 printk(KERN_ERR "can not alloc irq_cfg\n");
250 BUG_ON(1); 244 BUG_ON(1);
@@ -254,10 +248,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)
254 return 0; 248 return 0;
255} 249}
256 250
257#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC 251/* for move_irq_desc */
258
259static void 252static void
260init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) 253init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
261{ 254{
262 struct irq_pin_list *old_entry, *head, *tail, *entry; 255 struct irq_pin_list *old_entry, *head, *tail, *entry;
263 256
@@ -266,7 +259,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
266 if (!old_entry) 259 if (!old_entry)
267 return; 260 return;
268 261
269 entry = get_one_free_irq_2_pin(cpu); 262 entry = get_one_free_irq_2_pin(node);
270 if (!entry) 263 if (!entry)
271 return; 264 return;
272 265
@@ -276,7 +269,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
276 tail = entry; 269 tail = entry;
277 old_entry = old_entry->next; 270 old_entry = old_entry->next;
278 while (old_entry) { 271 while (old_entry) {
279 entry = get_one_free_irq_2_pin(cpu); 272 entry = get_one_free_irq_2_pin(node);
280 if (!entry) { 273 if (!entry) {
281 entry = head; 274 entry = head;
282 while (entry) { 275 while (entry) {
@@ -316,12 +309,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
316} 309}
317 310
318void arch_init_copy_chip_data(struct irq_desc *old_desc, 311void arch_init_copy_chip_data(struct irq_desc *old_desc,
319 struct irq_desc *desc, int cpu) 312 struct irq_desc *desc, int node)
320{ 313{
321 struct irq_cfg *cfg; 314 struct irq_cfg *cfg;
322 struct irq_cfg *old_cfg; 315 struct irq_cfg *old_cfg;
323 316
324 cfg = get_one_free_irq_cfg(cpu); 317 cfg = get_one_free_irq_cfg(node);
325 318
326 if (!cfg) 319 if (!cfg)
327 return; 320 return;
@@ -332,7 +325,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
332 325
333 memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); 326 memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
334 327
335 init_copy_irq_2_pin(old_cfg, cfg, cpu); 328 init_copy_irq_2_pin(old_cfg, cfg, node);
336} 329}
337 330
338static void free_irq_cfg(struct irq_cfg *old_cfg) 331static void free_irq_cfg(struct irq_cfg *old_cfg)
@@ -356,19 +349,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
356 old_desc->chip_data = NULL; 349 old_desc->chip_data = NULL;
357 } 350 }
358} 351}
359 352/* end for move_irq_desc */
360static void
361set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
362{
363 struct irq_cfg *cfg = desc->chip_data;
364
365 if (!cfg->move_in_progress) {
366 /* it means that domain is not changed */
367 if (!cpumask_intersects(desc->affinity, mask))
368 cfg->move_desc_pending = 1;
369 }
370}
371#endif
372 353
373#else 354#else
374static struct irq_cfg *irq_cfg(unsigned int irq) 355static struct irq_cfg *irq_cfg(unsigned int irq)
@@ -378,13 +359,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
378 359
379#endif 360#endif
380 361
381#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
382static inline void
383set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
384{
385}
386#endif
387
388struct io_apic { 362struct io_apic {
389 unsigned int index; 363 unsigned int index;
390 unsigned int unused[3]; 364 unsigned int unused[3];
@@ -518,132 +492,18 @@ static void ioapic_mask_entry(int apic, int pin)
518 spin_unlock_irqrestore(&ioapic_lock, flags); 492 spin_unlock_irqrestore(&ioapic_lock, flags);
519} 493}
520 494
521#ifdef CONFIG_SMP
522static void send_cleanup_vector(struct irq_cfg *cfg)
523{
524 cpumask_var_t cleanup_mask;
525
526 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
527 unsigned int i;
528 cfg->move_cleanup_count = 0;
529 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
530 cfg->move_cleanup_count++;
531 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
532 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
533 } else {
534 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
535 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
536 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
537 free_cpumask_var(cleanup_mask);
538 }
539 cfg->move_in_progress = 0;
540}
541
542static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
543{
544 int apic, pin;
545 struct irq_pin_list *entry;
546 u8 vector = cfg->vector;
547
548 entry = cfg->irq_2_pin;
549 for (;;) {
550 unsigned int reg;
551
552 if (!entry)
553 break;
554
555 apic = entry->apic;
556 pin = entry->pin;
557 /*
558 * With interrupt-remapping, destination information comes
559 * from interrupt-remapping table entry.
560 */
561 if (!irq_remapped(irq))
562 io_apic_write(apic, 0x11 + pin*2, dest);
563 reg = io_apic_read(apic, 0x10 + pin*2);
564 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
565 reg |= vector;
566 io_apic_modify(apic, 0x10 + pin*2, reg);
567 if (!entry->next)
568 break;
569 entry = entry->next;
570 }
571}
572
573static int
574assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
575
576/*
577 * Either sets desc->affinity to a valid value, and returns
578 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
579 * leaves desc->affinity untouched.
580 */
581static unsigned int
582set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
583{
584 struct irq_cfg *cfg;
585 unsigned int irq;
586
587 if (!cpumask_intersects(mask, cpu_online_mask))
588 return BAD_APICID;
589
590 irq = desc->irq;
591 cfg = desc->chip_data;
592 if (assign_irq_vector(irq, cfg, mask))
593 return BAD_APICID;
594
595 /* check that before desc->addinity get updated */
596 set_extra_move_desc(desc, mask);
597
598 cpumask_copy(desc->affinity, mask);
599
600 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
601}
602
603static void
604set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
605{
606 struct irq_cfg *cfg;
607 unsigned long flags;
608 unsigned int dest;
609 unsigned int irq;
610
611 irq = desc->irq;
612 cfg = desc->chip_data;
613
614 spin_lock_irqsave(&ioapic_lock, flags);
615 dest = set_desc_affinity(desc, mask);
616 if (dest != BAD_APICID) {
617 /* Only the high 8 bits are valid. */
618 dest = SET_APIC_LOGICAL_ID(dest);
619 __target_IO_APIC_irq(irq, dest, cfg);
620 }
621 spin_unlock_irqrestore(&ioapic_lock, flags);
622}
623
624static void
625set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
626{
627 struct irq_desc *desc;
628
629 desc = irq_to_desc(irq);
630
631 set_ioapic_affinity_irq_desc(desc, mask);
632}
633#endif /* CONFIG_SMP */
634
635/* 495/*
636 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are 496 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
637 * shared ISA-space IRQs, so we have to support them. We are super 497 * shared ISA-space IRQs, so we have to support them. We are super
638 * fast in the common case, and fast for shared ISA-space IRQs. 498 * fast in the common case, and fast for shared ISA-space IRQs.
639 */ 499 */
640static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) 500static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
641{ 501{
642 struct irq_pin_list *entry; 502 struct irq_pin_list *entry;
643 503
644 entry = cfg->irq_2_pin; 504 entry = cfg->irq_2_pin;
645 if (!entry) { 505 if (!entry) {
646 entry = get_one_free_irq_2_pin(cpu); 506 entry = get_one_free_irq_2_pin(node);
647 if (!entry) { 507 if (!entry) {
648 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", 508 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
649 apic, pin); 509 apic, pin);
@@ -663,7 +523,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
663 entry = entry->next; 523 entry = entry->next;
664 } 524 }
665 525
666 entry->next = get_one_free_irq_2_pin(cpu); 526 entry->next = get_one_free_irq_2_pin(node);
667 entry = entry->next; 527 entry = entry->next;
668 entry->apic = apic; 528 entry->apic = apic;
669 entry->pin = pin; 529 entry->pin = pin;
@@ -672,7 +532,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
672/* 532/*
673 * Reroute an IRQ to a different pin. 533 * Reroute an IRQ to a different pin.
674 */ 534 */
675static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, 535static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
676 int oldapic, int oldpin, 536 int oldapic, int oldpin,
677 int newapic, int newpin) 537 int newapic, int newpin)
678{ 538{
@@ -692,7 +552,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
692 552
693 /* why? call replace before add? */ 553 /* why? call replace before add? */
694 if (!replaced) 554 if (!replaced)
695 add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); 555 add_pin_to_irq_node(cfg, node, newapic, newpin);
696} 556}
697 557
698static inline void io_apic_modify_irq(struct irq_cfg *cfg, 558static inline void io_apic_modify_irq(struct irq_cfg *cfg,
@@ -850,7 +710,6 @@ static int __init ioapic_pirq_setup(char *str)
850__setup("pirq=", ioapic_pirq_setup); 710__setup("pirq=", ioapic_pirq_setup);
851#endif /* CONFIG_X86_32 */ 711#endif /* CONFIG_X86_32 */
852 712
853#ifdef CONFIG_INTR_REMAP
854struct IO_APIC_route_entry **alloc_ioapic_entries(void) 713struct IO_APIC_route_entry **alloc_ioapic_entries(void)
855{ 714{
856 int apic; 715 int apic;
@@ -948,20 +807,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
948 return 0; 807 return 0;
949} 808}
950 809
951void reinit_intr_remapped_IO_APIC(int intr_remapping,
952 struct IO_APIC_route_entry **ioapic_entries)
953
954{
955 /*
956 * for now plain restore of previous settings.
957 * TBD: In the case of OS enabling interrupt-remapping,
958 * IO-APIC RTE's need to be setup to point to interrupt-remapping
959 * table entries. for now, do a plain restore, and wait for
960 * the setup_IO_APIC_irqs() to do proper initialization.
961 */
962 restore_IO_APIC_setup(ioapic_entries);
963}
964
965void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) 810void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
966{ 811{
967 int apic; 812 int apic;
@@ -971,7 +816,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
971 816
972 kfree(ioapic_entries); 817 kfree(ioapic_entries);
973} 818}
974#endif
975 819
976/* 820/*
977 * Find the IRQ entry number of a certain pin. 821 * Find the IRQ entry number of a certain pin.
@@ -1032,54 +876,6 @@ static int __init find_isa_irq_apic(int irq, int type)
1032 return -1; 876 return -1;
1033} 877}
1034 878
1035/*
1036 * Find a specific PCI IRQ entry.
1037 * Not an __init, possibly needed by modules
1038 */
1039static int pin_2_irq(int idx, int apic, int pin);
1040
1041int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
1042{
1043 int apic, i, best_guess = -1;
1044
1045 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
1046 bus, slot, pin);
1047 if (test_bit(bus, mp_bus_not_pci)) {
1048 apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
1049 return -1;
1050 }
1051 for (i = 0; i < mp_irq_entries; i++) {
1052 int lbus = mp_irqs[i].srcbus;
1053
1054 for (apic = 0; apic < nr_ioapics; apic++)
1055 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
1056 mp_irqs[i].dstapic == MP_APIC_ALL)
1057 break;
1058
1059 if (!test_bit(lbus, mp_bus_not_pci) &&
1060 !mp_irqs[i].irqtype &&
1061 (bus == lbus) &&
1062 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
1063 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
1064
1065 if (!(apic || IO_APIC_IRQ(irq)))
1066 continue;
1067
1068 if (pin == (mp_irqs[i].srcbusirq & 3))
1069 return irq;
1070 /*
1071 * Use the first all-but-pin matching entry as a
1072 * best-guess fuzzy result for broken mptables.
1073 */
1074 if (best_guess < 0)
1075 best_guess = irq;
1076 }
1077 }
1078 return best_guess;
1079}
1080
1081EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
1082
1083#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 879#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1084/* 880/*
1085 * EISA Edge/Level control register, ELCR 881 * EISA Edge/Level control register, ELCR
@@ -1298,6 +1094,64 @@ static int pin_2_irq(int idx, int apic, int pin)
1298 return irq; 1094 return irq;
1299} 1095}
1300 1096
1097/*
1098 * Find a specific PCI IRQ entry.
1099 * Not an __init, possibly needed by modules
1100 */
1101int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
1102 struct io_apic_irq_attr *irq_attr)
1103{
1104 int apic, i, best_guess = -1;
1105
1106 apic_printk(APIC_DEBUG,
1107 "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
1108 bus, slot, pin);
1109 if (test_bit(bus, mp_bus_not_pci)) {
1110 apic_printk(APIC_VERBOSE,
1111 "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
1112 return -1;
1113 }
1114 for (i = 0; i < mp_irq_entries; i++) {
1115 int lbus = mp_irqs[i].srcbus;
1116
1117 for (apic = 0; apic < nr_ioapics; apic++)
1118 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
1119 mp_irqs[i].dstapic == MP_APIC_ALL)
1120 break;
1121
1122 if (!test_bit(lbus, mp_bus_not_pci) &&
1123 !mp_irqs[i].irqtype &&
1124 (bus == lbus) &&
1125 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
1126 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
1127
1128 if (!(apic || IO_APIC_IRQ(irq)))
1129 continue;
1130
1131 if (pin == (mp_irqs[i].srcbusirq & 3)) {
1132 set_io_apic_irq_attr(irq_attr, apic,
1133 mp_irqs[i].dstirq,
1134 irq_trigger(i),
1135 irq_polarity(i));
1136 return irq;
1137 }
1138 /*
1139 * Use the first all-but-pin matching entry as a
1140 * best-guess fuzzy result for broken mptables.
1141 */
1142 if (best_guess < 0) {
1143 set_io_apic_irq_attr(irq_attr, apic,
1144 mp_irqs[i].dstirq,
1145 irq_trigger(i),
1146 irq_polarity(i));
1147 best_guess = irq;
1148 }
1149 }
1150 }
1151 return best_guess;
1152}
1153EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
1154
1301void lock_vector_lock(void) 1155void lock_vector_lock(void)
1302{ 1156{
1303 /* Used to the online set of cpus does not change 1157 /* Used to the online set of cpus does not change
@@ -1628,58 +1482,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1628 ioapic_write_entry(apic_id, pin, entry); 1482 ioapic_write_entry(apic_id, pin, entry);
1629} 1483}
1630 1484
1485static struct {
1486 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
1487} mp_ioapic_routing[MAX_IO_APICS];
1488
1631static void __init setup_IO_APIC_irqs(void) 1489static void __init setup_IO_APIC_irqs(void)
1632{ 1490{
1633 int apic_id, pin, idx, irq; 1491 int apic_id = 0, pin, idx, irq;
1634 int notcon = 0; 1492 int notcon = 0;
1635 struct irq_desc *desc; 1493 struct irq_desc *desc;
1636 struct irq_cfg *cfg; 1494 struct irq_cfg *cfg;
1637 int cpu = boot_cpu_id; 1495 int node = cpu_to_node(boot_cpu_id);
1638 1496
1639 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1497 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1640 1498
1641 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { 1499#ifdef CONFIG_ACPI
1642 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1500 if (!acpi_disabled && acpi_ioapic) {
1643 1501 apic_id = mp_find_ioapic(0);
1644 idx = find_irq_entry(apic_id, pin, mp_INT); 1502 if (apic_id < 0)
1645 if (idx == -1) { 1503 apic_id = 0;
1646 if (!notcon) { 1504 }
1647 notcon = 1; 1505#endif
1648 apic_printk(APIC_VERBOSE,
1649 KERN_DEBUG " %d-%d",
1650 mp_ioapics[apic_id].apicid, pin);
1651 } else
1652 apic_printk(APIC_VERBOSE, " %d-%d",
1653 mp_ioapics[apic_id].apicid, pin);
1654 continue;
1655 }
1656 if (notcon) {
1657 apic_printk(APIC_VERBOSE,
1658 " (apicid-pin) not connected\n");
1659 notcon = 0;
1660 }
1661 1506
1662 irq = pin_2_irq(idx, apic_id, pin); 1507 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1508 idx = find_irq_entry(apic_id, pin, mp_INT);
1509 if (idx == -1) {
1510 if (!notcon) {
1511 notcon = 1;
1512 apic_printk(APIC_VERBOSE,
1513 KERN_DEBUG " %d-%d",
1514 mp_ioapics[apic_id].apicid, pin);
1515 } else
1516 apic_printk(APIC_VERBOSE, " %d-%d",
1517 mp_ioapics[apic_id].apicid, pin);
1518 continue;
1519 }
1520 if (notcon) {
1521 apic_printk(APIC_VERBOSE,
1522 " (apicid-pin) not connected\n");
1523 notcon = 0;
1524 }
1663 1525
1664 /* 1526 irq = pin_2_irq(idx, apic_id, pin);
1665 * Skip the timer IRQ if there's a quirk handler
1666 * installed and if it returns 1:
1667 */
1668 if (apic->multi_timer_check &&
1669 apic->multi_timer_check(apic_id, irq))
1670 continue;
1671 1527
1672 desc = irq_to_desc_alloc_cpu(irq, cpu); 1528 /*
1673 if (!desc) { 1529 * Skip the timer IRQ if there's a quirk handler
1674 printk(KERN_INFO "can not get irq_desc for %d\n", irq); 1530 * installed and if it returns 1:
1675 continue; 1531 */
1676 } 1532 if (apic->multi_timer_check &&
1677 cfg = desc->chip_data; 1533 apic->multi_timer_check(apic_id, irq))
1678 add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); 1534 continue;
1679 1535
1680 setup_IO_APIC_irq(apic_id, pin, irq, desc, 1536 desc = irq_to_desc_alloc_node(irq, node);
1681 irq_trigger(idx), irq_polarity(idx)); 1537 if (!desc) {
1538 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1539 continue;
1682 } 1540 }
1541 cfg = desc->chip_data;
1542 add_pin_to_irq_node(cfg, node, apic_id, pin);
1543 /*
1544 * don't mark it in pin_programmed, so later acpi could
1545 * set it correctly when irq < 16
1546 */
1547 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1548 irq_trigger(idx), irq_polarity(idx));
1683 } 1549 }
1684 1550
1685 if (notcon) 1551 if (notcon)
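The new mp_ioapic_routing[] above carries a pin_programmed bitmap per IO-APIC; the comment notes the pin is deliberately not marked here so ACPI can set it later, and the set/test users sit outside this hunk. A minimal standalone sketch of such a per-pin bitmap (array sizes are assumptions for the demo):

#include <stdio.h>
#include <stdbool.h>

#define DEMO_MAX_PIN 127	/* highest pin index tracked (assumption) */
#define DEMO_MAX_IOAPICS 4	/* number of IO-APICs in the demo */

/* One "already programmed" bit per (IO-APIC, pin) pair. */
static unsigned char pin_programmed[DEMO_MAX_IOAPICS][(DEMO_MAX_PIN + 8) / 8];

static bool pin_is_programmed(int apic, int pin)
{
	return pin_programmed[apic][pin / 8] & (1u << (pin % 8));
}

static void mark_pin_programmed(int apic, int pin)
{
	pin_programmed[apic][pin / 8] |= 1u << (pin % 8);
}

int main(void)
{
	mark_pin_programmed(0, 9);	/* hypothetical: IO-APIC 0, pin 9 set up */

	printf("apic 0 pin 9:  %s\n", pin_is_programmed(0, 9) ? "programmed" : "free");
	printf("apic 0 pin 10: %s\n", pin_is_programmed(0, 10) ? "programmed" : "free");
	return 0;
}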
@@ -1869,7 +1735,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base)
1869 1735
1870__apicdebuginit(void) print_local_APIC(void *dummy) 1736__apicdebuginit(void) print_local_APIC(void *dummy)
1871{ 1737{
1872 unsigned int v, ver, maxlvt; 1738 unsigned int i, v, ver, maxlvt;
1873 u64 icr; 1739 u64 icr;
1874 1740
1875 if (apic_verbosity == APIC_QUIET) 1741 if (apic_verbosity == APIC_QUIET)
@@ -1957,6 +1823,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1957 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); 1823 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1958 v = apic_read(APIC_TDCR); 1824 v = apic_read(APIC_TDCR);
1959 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); 1825 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1826
1827 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
1828 v = apic_read(APIC_EFEAT);
1829 maxlvt = (v >> 16) & 0xff;
1830 printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
1831 v = apic_read(APIC_ECTRL);
1832 printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
1833 for (i = 0; i < maxlvt; i++) {
1834 v = apic_read(APIC_EILVTn(i));
1835 printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
1836 }
1837 }
1960 printk("\n"); 1838 printk("\n");
1961} 1839}
1962 1840
@@ -2005,6 +1883,11 @@ __apicdebuginit(void) print_PIC(void)
2005__apicdebuginit(int) print_all_ICs(void) 1883__apicdebuginit(int) print_all_ICs(void)
2006{ 1884{
2007 print_PIC(); 1885 print_PIC();
1886
1887 /* don't print out if apic is not there */
1888 if (!cpu_has_apic || disable_apic)
1889 return 0;
1890
2008 print_all_local_APICs(); 1891 print_all_local_APICs();
2009 print_IO_APIC(); 1892 print_IO_APIC();
2010 1893
@@ -2360,6 +2243,118 @@ static int ioapic_retrigger_irq(unsigned int irq)
2360 */ 2243 */
2361 2244
2362#ifdef CONFIG_SMP 2245#ifdef CONFIG_SMP
2246static void send_cleanup_vector(struct irq_cfg *cfg)
2247{
2248 cpumask_var_t cleanup_mask;
2249
2250 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
2251 unsigned int i;
2252 cfg->move_cleanup_count = 0;
2253 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2254 cfg->move_cleanup_count++;
2255 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2256 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
2257 } else {
2258 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
2259 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
2260 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2261 free_cpumask_var(cleanup_mask);
2262 }
2263 cfg->move_in_progress = 0;
2264}
2265
2266static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
2267{
2268 int apic, pin;
2269 struct irq_pin_list *entry;
2270 u8 vector = cfg->vector;
2271
2272 entry = cfg->irq_2_pin;
2273 for (;;) {
2274 unsigned int reg;
2275
2276 if (!entry)
2277 break;
2278
2279 apic = entry->apic;
2280 pin = entry->pin;
2281 /*
2282 * With interrupt-remapping, destination information comes
2283 * from interrupt-remapping table entry.
2284 */
2285 if (!irq_remapped(irq))
2286 io_apic_write(apic, 0x11 + pin*2, dest);
2287 reg = io_apic_read(apic, 0x10 + pin*2);
2288 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2289 reg |= vector;
2290 io_apic_modify(apic, 0x10 + pin*2, reg);
2291 if (!entry->next)
2292 break;
2293 entry = entry->next;
2294 }
2295}
2296
2297static int
2298assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
2299
2300/*
2301 * Either sets desc->affinity to a valid value, and returns
2302 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
2303 * leaves desc->affinity untouched.
2304 */
2305static unsigned int
2306set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
2307{
2308 struct irq_cfg *cfg;
2309 unsigned int irq;
2310
2311 if (!cpumask_intersects(mask, cpu_online_mask))
2312 return BAD_APICID;
2313
2314 irq = desc->irq;
2315 cfg = desc->chip_data;
2316 if (assign_irq_vector(irq, cfg, mask))
2317 return BAD_APICID;
2318
2319 cpumask_copy(desc->affinity, mask);
2320
2321 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
2322}
2323
2324static int
2325set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2326{
2327 struct irq_cfg *cfg;
2328 unsigned long flags;
2329 unsigned int dest;
2330 unsigned int irq;
2331 int ret = -1;
2332
2333 irq = desc->irq;
2334 cfg = desc->chip_data;
2335
2336 spin_lock_irqsave(&ioapic_lock, flags);
2337 dest = set_desc_affinity(desc, mask);
2338 if (dest != BAD_APICID) {
2339 /* Only the high 8 bits are valid. */
2340 dest = SET_APIC_LOGICAL_ID(dest);
2341 __target_IO_APIC_irq(irq, dest, cfg);
2342 ret = 0;
2343 }
2344 spin_unlock_irqrestore(&ioapic_lock, flags);
2345
2346 return ret;
2347}
2348
2349static int
2350set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
2351{
2352 struct irq_desc *desc;
2353
2354 desc = irq_to_desc(irq);
2355
2356 return set_ioapic_affinity_irq_desc(desc, mask);
2357}
2363 2358
2364#ifdef CONFIG_INTR_REMAP 2359#ifdef CONFIG_INTR_REMAP
2365 2360
@@ -2374,26 +2369,25 @@ static int ioapic_retrigger_irq(unsigned int irq)
2374 * Real vector that is used for interrupting cpu will be coming from 2369 * Real vector that is used for interrupting cpu will be coming from
2375 * the interrupt-remapping table entry. 2370 * the interrupt-remapping table entry.
2376 */ 2371 */
2377static void 2372static int
2378migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2373migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2379{ 2374{
2380 struct irq_cfg *cfg; 2375 struct irq_cfg *cfg;
2381 struct irte irte; 2376 struct irte irte;
2382 unsigned int dest; 2377 unsigned int dest;
2383 unsigned int irq; 2378 unsigned int irq;
2379 int ret = -1;
2384 2380
2385 if (!cpumask_intersects(mask, cpu_online_mask)) 2381 if (!cpumask_intersects(mask, cpu_online_mask))
2386 return; 2382 return ret;
2387 2383
2388 irq = desc->irq; 2384 irq = desc->irq;
2389 if (get_irte(irq, &irte)) 2385 if (get_irte(irq, &irte))
2390 return; 2386 return ret;
2391 2387
2392 cfg = desc->chip_data; 2388 cfg = desc->chip_data;
2393 if (assign_irq_vector(irq, cfg, mask)) 2389 if (assign_irq_vector(irq, cfg, mask))
2394 return; 2390 return ret;
2395
2396 set_extra_move_desc(desc, mask);
2397 2391
2398 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); 2392 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2399 2393
@@ -2409,27 +2403,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2409 send_cleanup_vector(cfg); 2403 send_cleanup_vector(cfg);
2410 2404
2411 cpumask_copy(desc->affinity, mask); 2405 cpumask_copy(desc->affinity, mask);
2406
2407 return 0;
2412} 2408}
2413 2409
2414/* 2410/*
2415 * Migrates the IRQ destination in the process context. 2411 * Migrates the IRQ destination in the process context.
2416 */ 2412 */
2417static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2413static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2418 const struct cpumask *mask) 2414 const struct cpumask *mask)
2419{ 2415{
2420 migrate_ioapic_irq_desc(desc, mask); 2416 return migrate_ioapic_irq_desc(desc, mask);
2421} 2417}
2422static void set_ir_ioapic_affinity_irq(unsigned int irq, 2418static int set_ir_ioapic_affinity_irq(unsigned int irq,
2423 const struct cpumask *mask) 2419 const struct cpumask *mask)
2424{ 2420{
2425 struct irq_desc *desc = irq_to_desc(irq); 2421 struct irq_desc *desc = irq_to_desc(irq);
2426 2422
2427 set_ir_ioapic_affinity_irq_desc(desc, mask); 2423 return set_ir_ioapic_affinity_irq_desc(desc, mask);
2428} 2424}
2429#else 2425#else
2430static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2426static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2431 const struct cpumask *mask) 2427 const struct cpumask *mask)
2432{ 2428{
2429 return 0;
2433} 2430}
2434#endif 2431#endif
2435 2432
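
The hunks above and below share one theme: the ->set_affinity style handlers in this file are converted from returning void to returning int, so callers can tell "affinity reprogrammed" apart from "request rejected". A minimal sketch of the resulting convention, using only helpers that already appear in this diff (irq_to_desc, set_desc_affinity, BAD_APICID); the function name itself is invented for illustration:

static int example_set_affinity(unsigned int irq, const struct cpumask *mask)
{
	struct irq_desc *desc = irq_to_desc(irq);
	unsigned int dest;

	dest = set_desc_affinity(desc, mask);
	if (dest == BAD_APICID)
		return -1;	/* request rejected, nothing was reprogrammed */

	/* ... reprogram the routing hardware for 'dest' here ... */

	return 0;		/* affinity updated */
}
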
@@ -2491,86 +2488,19 @@ static void irq_complete_move(struct irq_desc **descp)
2491 struct irq_cfg *cfg = desc->chip_data; 2488 struct irq_cfg *cfg = desc->chip_data;
2492 unsigned vector, me; 2489 unsigned vector, me;
2493 2490
2494 if (likely(!cfg->move_in_progress)) { 2491 if (likely(!cfg->move_in_progress))
2495#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2496 if (likely(!cfg->move_desc_pending))
2497 return;
2498
2499 /* domain has not changed, but affinity did */
2500 me = smp_processor_id();
2501 if (cpumask_test_cpu(me, desc->affinity)) {
2502 *descp = desc = move_irq_desc(desc, me);
2503 /* get the new one */
2504 cfg = desc->chip_data;
2505 cfg->move_desc_pending = 0;
2506 }
2507#endif
2508 return; 2492 return;
2509 }
2510 2493
2511 vector = ~get_irq_regs()->orig_ax; 2494 vector = ~get_irq_regs()->orig_ax;
2512 me = smp_processor_id(); 2495 me = smp_processor_id();
2513 2496
2514 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { 2497 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2515#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
2516 *descp = desc = move_irq_desc(desc, me);
2517 /* get the new one */
2518 cfg = desc->chip_data;
2519#endif
2520 send_cleanup_vector(cfg); 2498 send_cleanup_vector(cfg);
2521 }
2522} 2499}
2523#else 2500#else
2524static inline void irq_complete_move(struct irq_desc **descp) {} 2501static inline void irq_complete_move(struct irq_desc **descp) {}
2525#endif 2502#endif
2526 2503
2527static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2528{
2529 int apic, pin;
2530 struct irq_pin_list *entry;
2531
2532 entry = cfg->irq_2_pin;
2533 for (;;) {
2534
2535 if (!entry)
2536 break;
2537
2538 apic = entry->apic;
2539 pin = entry->pin;
2540 io_apic_eoi(apic, pin);
2541 entry = entry->next;
2542 }
2543}
2544
2545static void
2546eoi_ioapic_irq(struct irq_desc *desc)
2547{
2548 struct irq_cfg *cfg;
2549 unsigned long flags;
2550 unsigned int irq;
2551
2552 irq = desc->irq;
2553 cfg = desc->chip_data;
2554
2555 spin_lock_irqsave(&ioapic_lock, flags);
2556 __eoi_ioapic_irq(irq, cfg);
2557 spin_unlock_irqrestore(&ioapic_lock, flags);
2558}
2559
2560#ifdef CONFIG_X86_X2APIC
2561static void ack_x2apic_level(unsigned int irq)
2562{
2563 struct irq_desc *desc = irq_to_desc(irq);
2564 ack_x2APIC_irq();
2565 eoi_ioapic_irq(desc);
2566}
2567
2568static void ack_x2apic_edge(unsigned int irq)
2569{
2570 ack_x2APIC_irq();
2571}
2572#endif
2573
2574static void ack_apic_edge(unsigned int irq) 2504static void ack_apic_edge(unsigned int irq)
2575{ 2505{
2576 struct irq_desc *desc = irq_to_desc(irq); 2506 struct irq_desc *desc = irq_to_desc(irq);
@@ -2634,9 +2564,6 @@ static void ack_apic_level(unsigned int irq)
2634 */ 2564 */
2635 ack_APIC_irq(); 2565 ack_APIC_irq();
2636 2566
2637 if (irq_remapped(irq))
2638 eoi_ioapic_irq(desc);
2639
2640 /* Now we can move and renable the irq */ 2567 /* Now we can move and renable the irq */
2641 if (unlikely(do_unmask_irq)) { 2568 if (unlikely(do_unmask_irq)) {
2642 /* Only migrate the irq if the ack has been received. 2569 /* Only migrate the irq if the ack has been received.
@@ -2683,22 +2610,50 @@ static void ack_apic_level(unsigned int irq)
2683} 2610}
2684 2611
2685#ifdef CONFIG_INTR_REMAP 2612#ifdef CONFIG_INTR_REMAP
2613static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2614{
2615 int apic, pin;
2616 struct irq_pin_list *entry;
2617
2618 entry = cfg->irq_2_pin;
2619 for (;;) {
2620
2621 if (!entry)
2622 break;
2623
2624 apic = entry->apic;
2625 pin = entry->pin;
2626 io_apic_eoi(apic, pin);
2627 entry = entry->next;
2628 }
2629}
2630
2631static void
2632eoi_ioapic_irq(struct irq_desc *desc)
2633{
2634 struct irq_cfg *cfg;
2635 unsigned long flags;
2636 unsigned int irq;
2637
2638 irq = desc->irq;
2639 cfg = desc->chip_data;
2640
2641 spin_lock_irqsave(&ioapic_lock, flags);
2642 __eoi_ioapic_irq(irq, cfg);
2643 spin_unlock_irqrestore(&ioapic_lock, flags);
2644}
2645
2686static void ir_ack_apic_edge(unsigned int irq) 2646static void ir_ack_apic_edge(unsigned int irq)
2687{ 2647{
2688#ifdef CONFIG_X86_X2APIC 2648 ack_APIC_irq();
2689 if (x2apic_enabled())
2690 return ack_x2apic_edge(irq);
2691#endif
2692 return ack_apic_edge(irq);
2693} 2649}
2694 2650
2695static void ir_ack_apic_level(unsigned int irq) 2651static void ir_ack_apic_level(unsigned int irq)
2696{ 2652{
2697#ifdef CONFIG_X86_X2APIC 2653 struct irq_desc *desc = irq_to_desc(irq);
2698 if (x2apic_enabled()) 2654
2699 return ack_x2apic_level(irq); 2655 ack_APIC_irq();
2700#endif 2656 eoi_ioapic_irq(desc);
2701 return ack_apic_level(irq);
2702} 2657}
2703#endif /* CONFIG_INTR_REMAP */ 2658#endif /* CONFIG_INTR_REMAP */
2704 2659
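
With interrupt remapping enabled, the ack path no longer branches on x2APIC at run time: ir_ack_apic_edge() is a bare ack_APIC_irq(), and ir_ack_apic_level() additionally EOIs the IO-APIC pins via eoi_ioapic_irq(). Roughly how a remapping-aware irq_chip wires these callbacks up, as a sketch (field names follow the 2.6.30-era struct irq_chip; the definition below is illustrative, not necessarily the file's actual chip definition):

static struct irq_chip example_ir_ioapic_chip = {
	.name		= "IR-IO-APIC",
	.ack		= ir_ack_apic_edge,	/* edge: ack the local APIC only */
	.eoi		= ir_ack_apic_level,	/* level: ack + explicit IO-APIC EOI */
#ifdef CONFIG_SMP
	.set_affinity	= set_ir_ioapic_affinity_irq,
#endif
};
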
@@ -2903,7 +2858,7 @@ static inline void __init check_timer(void)
2903{ 2858{
2904 struct irq_desc *desc = irq_to_desc(0); 2859 struct irq_desc *desc = irq_to_desc(0);
2905 struct irq_cfg *cfg = desc->chip_data; 2860 struct irq_cfg *cfg = desc->chip_data;
2906 int cpu = boot_cpu_id; 2861 int node = cpu_to_node(boot_cpu_id);
2907 int apic1, pin1, apic2, pin2; 2862 int apic1, pin1, apic2, pin2;
2908 unsigned long flags; 2863 unsigned long flags;
2909 int no_pin1 = 0; 2864 int no_pin1 = 0;
@@ -2969,7 +2924,7 @@ static inline void __init check_timer(void)
2969 * Ok, does IRQ0 through the IOAPIC work? 2924 * Ok, does IRQ0 through the IOAPIC work?
2970 */ 2925 */
2971 if (no_pin1) { 2926 if (no_pin1) {
2972 add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); 2927 add_pin_to_irq_node(cfg, node, apic1, pin1);
2973 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2928 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
2974 } else { 2929 } else {
2975 /* for edge trigger, setup_IO_APIC_irq already 2930 /* for edge trigger, setup_IO_APIC_irq already
@@ -3006,7 +2961,7 @@ static inline void __init check_timer(void)
3006 /* 2961 /*
3007 * legacy devices should be connected to IO APIC #0 2962 * legacy devices should be connected to IO APIC #0
3008 */ 2963 */
3009 replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); 2964 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
3010 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2965 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
3011 enable_8259A_irq(0); 2966 enable_8259A_irq(0);
3012 if (timer_irq_works()) { 2967 if (timer_irq_works()) {
@@ -3218,14 +3173,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY;
3218/* 3173/*
3219 * Dynamic irq allocate and deallocation 3174 * Dynamic irq allocate and deallocation
3220 */ 3175 */
3221unsigned int create_irq_nr(unsigned int irq_want) 3176unsigned int create_irq_nr(unsigned int irq_want, int node)
3222{ 3177{
3223 /* Allocate an unused irq */ 3178 /* Allocate an unused irq */
3224 unsigned int irq; 3179 unsigned int irq;
3225 unsigned int new; 3180 unsigned int new;
3226 unsigned long flags; 3181 unsigned long flags;
3227 struct irq_cfg *cfg_new = NULL; 3182 struct irq_cfg *cfg_new = NULL;
3228 int cpu = boot_cpu_id;
3229 struct irq_desc *desc_new = NULL; 3183 struct irq_desc *desc_new = NULL;
3230 3184
3231 irq = 0; 3185 irq = 0;
@@ -3234,7 +3188,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
3234 3188
3235 spin_lock_irqsave(&vector_lock, flags); 3189 spin_lock_irqsave(&vector_lock, flags);
3236 for (new = irq_want; new < nr_irqs; new++) { 3190 for (new = irq_want; new < nr_irqs; new++) {
3237 desc_new = irq_to_desc_alloc_cpu(new, cpu); 3191 desc_new = irq_to_desc_alloc_node(new, node);
3238 if (!desc_new) { 3192 if (!desc_new) {
3239 printk(KERN_INFO "can not get irq_desc for %d\n", new); 3193 printk(KERN_INFO "can not get irq_desc for %d\n", new);
3240 continue; 3194 continue;
@@ -3243,6 +3197,9 @@ unsigned int create_irq_nr(unsigned int irq_want)
3243 3197
3244 if (cfg_new->vector != 0) 3198 if (cfg_new->vector != 0)
3245 continue; 3199 continue;
3200
3201 desc_new = move_irq_desc(desc_new, node);
3202
3246 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 3203 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3247 irq = new; 3204 irq = new;
3248 break; 3205 break;
@@ -3260,11 +3217,12 @@ unsigned int create_irq_nr(unsigned int irq_want)
3260 3217
3261int create_irq(void) 3218int create_irq(void)
3262{ 3219{
3220 int node = cpu_to_node(boot_cpu_id);
3263 unsigned int irq_want; 3221 unsigned int irq_want;
3264 int irq; 3222 int irq;
3265 3223
3266 irq_want = nr_irqs_gsi; 3224 irq_want = nr_irqs_gsi;
3267 irq = create_irq_nr(irq_want); 3225 irq = create_irq_nr(irq_want, node);
3268 3226
3269 if (irq == 0) 3227 if (irq == 0)
3270 irq = -1; 3228 irq = -1;
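
create_irq_nr() now carries a NUMA node so that the irq descriptor (and, with the move_irq_desc() call added above, its backing memory) ends up on the node of the device that will use the interrupt. Callers either derive the node from a device or fall back to the boot CPU's node, as create_irq() does here and arch_setup_msi_irqs() does further down. A sketch of the device-driven case; the wrapper function is invented, while dev_to_node(), create_irq_nr() and nr_irqs_gsi are used the same way later in this diff:

static int example_alloc_irq_for(struct pci_dev *pdev)
{
	int node = dev_to_node(&pdev->dev);	/* NUMA node the device hangs off */
	unsigned int irq;

	irq = create_irq_nr(nr_irqs_gsi, node);	/* descriptor allocated on 'node' */
	if (irq == 0)
		return -1;			/* no free irq/vector available */

	return irq;
}
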
@@ -3366,7 +3324,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3366} 3324}
3367 3325
3368#ifdef CONFIG_SMP 3326#ifdef CONFIG_SMP
3369static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3327static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3370{ 3328{
3371 struct irq_desc *desc = irq_to_desc(irq); 3329 struct irq_desc *desc = irq_to_desc(irq);
3372 struct irq_cfg *cfg; 3330 struct irq_cfg *cfg;
@@ -3375,7 +3333,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3375 3333
3376 dest = set_desc_affinity(desc, mask); 3334 dest = set_desc_affinity(desc, mask);
3377 if (dest == BAD_APICID) 3335 if (dest == BAD_APICID)
3378 return; 3336 return -1;
3379 3337
3380 cfg = desc->chip_data; 3338 cfg = desc->chip_data;
3381 3339
@@ -3387,13 +3345,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3387 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3345 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3388 3346
3389 write_msi_msg_desc(desc, &msg); 3347 write_msi_msg_desc(desc, &msg);
3348
3349 return 0;
3390} 3350}
3391#ifdef CONFIG_INTR_REMAP 3351#ifdef CONFIG_INTR_REMAP
3392/* 3352/*
3393 * Migrate the MSI irq to another cpumask. This migration is 3353 * Migrate the MSI irq to another cpumask. This migration is
3394 * done in the process context using interrupt-remapping hardware. 3354 * done in the process context using interrupt-remapping hardware.
3395 */ 3355 */
3396static void 3356static int
3397ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3357ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3398{ 3358{
3399 struct irq_desc *desc = irq_to_desc(irq); 3359 struct irq_desc *desc = irq_to_desc(irq);
@@ -3402,11 +3362,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3402 struct irte irte; 3362 struct irte irte;
3403 3363
3404 if (get_irte(irq, &irte)) 3364 if (get_irte(irq, &irte))
3405 return; 3365 return -1;
3406 3366
3407 dest = set_desc_affinity(desc, mask); 3367 dest = set_desc_affinity(desc, mask);
3408 if (dest == BAD_APICID) 3368 if (dest == BAD_APICID)
3409 return; 3369 return -1;
3410 3370
3411 irte.vector = cfg->vector; 3371 irte.vector = cfg->vector;
3412 irte.dest_id = IRTE_DEST(dest); 3372 irte.dest_id = IRTE_DEST(dest);
@@ -3423,6 +3383,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3423 */ 3383 */
3424 if (cfg->move_in_progress) 3384 if (cfg->move_in_progress)
3425 send_cleanup_vector(cfg); 3385 send_cleanup_vector(cfg);
3386
3387 return 0;
3426} 3388}
3427 3389
3428#endif 3390#endif
@@ -3518,15 +3480,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3518 unsigned int irq_want; 3480 unsigned int irq_want;
3519 struct intel_iommu *iommu = NULL; 3481 struct intel_iommu *iommu = NULL;
3520 int index = 0; 3482 int index = 0;
3483 int node;
3521 3484
3522 /* x86 doesn't support multiple MSI yet */ 3485 /* x86 doesn't support multiple MSI yet */
3523 if (type == PCI_CAP_ID_MSI && nvec > 1) 3486 if (type == PCI_CAP_ID_MSI && nvec > 1)
3524 return 1; 3487 return 1;
3525 3488
3489 node = dev_to_node(&dev->dev);
3526 irq_want = nr_irqs_gsi; 3490 irq_want = nr_irqs_gsi;
3527 sub_handle = 0; 3491 sub_handle = 0;
3528 list_for_each_entry(msidesc, &dev->msi_list, list) { 3492 list_for_each_entry(msidesc, &dev->msi_list, list) {
3529 irq = create_irq_nr(irq_want); 3493 irq = create_irq_nr(irq_want, node);
3530 if (irq == 0) 3494 if (irq == 0)
3531 return -1; 3495 return -1;
3532 irq_want = irq + 1; 3496 irq_want = irq + 1;
@@ -3576,7 +3540,7 @@ void arch_teardown_msi_irq(unsigned int irq)
3576 3540
3577#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) 3541#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
3578#ifdef CONFIG_SMP 3542#ifdef CONFIG_SMP
3579static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3543static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3580{ 3544{
3581 struct irq_desc *desc = irq_to_desc(irq); 3545 struct irq_desc *desc = irq_to_desc(irq);
3582 struct irq_cfg *cfg; 3546 struct irq_cfg *cfg;
@@ -3585,7 +3549,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3585 3549
3586 dest = set_desc_affinity(desc, mask); 3550 dest = set_desc_affinity(desc, mask);
3587 if (dest == BAD_APICID) 3551 if (dest == BAD_APICID)
3588 return; 3552 return -1;
3589 3553
3590 cfg = desc->chip_data; 3554 cfg = desc->chip_data;
3591 3555
@@ -3597,6 +3561,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3597 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3561 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3598 3562
3599 dmar_msi_write(irq, &msg); 3563 dmar_msi_write(irq, &msg);
3564
3565 return 0;
3600} 3566}
3601 3567
3602#endif /* CONFIG_SMP */ 3568#endif /* CONFIG_SMP */
@@ -3630,7 +3596,7 @@ int arch_setup_dmar_msi(unsigned int irq)
3630#ifdef CONFIG_HPET_TIMER 3596#ifdef CONFIG_HPET_TIMER
3631 3597
3632#ifdef CONFIG_SMP 3598#ifdef CONFIG_SMP
3633static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3599static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3634{ 3600{
3635 struct irq_desc *desc = irq_to_desc(irq); 3601 struct irq_desc *desc = irq_to_desc(irq);
3636 struct irq_cfg *cfg; 3602 struct irq_cfg *cfg;
@@ -3639,7 +3605,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3639 3605
3640 dest = set_desc_affinity(desc, mask); 3606 dest = set_desc_affinity(desc, mask);
3641 if (dest == BAD_APICID) 3607 if (dest == BAD_APICID)
3642 return; 3608 return -1;
3643 3609
3644 cfg = desc->chip_data; 3610 cfg = desc->chip_data;
3645 3611
@@ -3651,6 +3617,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3651 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3617 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3652 3618
3653 hpet_msi_write(irq, &msg); 3619 hpet_msi_write(irq, &msg);
3620
3621 return 0;
3654} 3622}
3655 3623
3656#endif /* CONFIG_SMP */ 3624#endif /* CONFIG_SMP */
@@ -3707,7 +3675,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3707 write_ht_irq_msg(irq, &msg); 3675 write_ht_irq_msg(irq, &msg);
3708} 3676}
3709 3677
3710static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) 3678static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3711{ 3679{
3712 struct irq_desc *desc = irq_to_desc(irq); 3680 struct irq_desc *desc = irq_to_desc(irq);
3713 struct irq_cfg *cfg; 3681 struct irq_cfg *cfg;
@@ -3715,11 +3683,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3715 3683
3716 dest = set_desc_affinity(desc, mask); 3684 dest = set_desc_affinity(desc, mask);
3717 if (dest == BAD_APICID) 3685 if (dest == BAD_APICID)
3718 return; 3686 return -1;
3719 3687
3720 cfg = desc->chip_data; 3688 cfg = desc->chip_data;
3721 3689
3722 target_ht_irq(irq, dest, cfg->vector); 3690 target_ht_irq(irq, dest, cfg->vector);
3691
3692 return 0;
3723} 3693}
3724 3694
3725#endif 3695#endif
@@ -3794,6 +3764,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3794 unsigned long flags; 3764 unsigned long flags;
3795 int err; 3765 int err;
3796 3766
3767 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3768
3797 cfg = irq_cfg(irq); 3769 cfg = irq_cfg(irq);
3798 3770
3799 err = assign_irq_vector(irq, cfg, eligible_cpu); 3771 err = assign_irq_vector(irq, cfg, eligible_cpu);
@@ -3807,15 +3779,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3807 3779
3808 mmr_value = 0; 3780 mmr_value = 0;
3809 entry = (struct uv_IO_APIC_route_entry *)&mmr_value; 3781 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3810 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); 3782 entry->vector = cfg->vector;
3811 3783 entry->delivery_mode = apic->irq_delivery_mode;
3812 entry->vector = cfg->vector; 3784 entry->dest_mode = apic->irq_dest_mode;
3813 entry->delivery_mode = apic->irq_delivery_mode; 3785 entry->polarity = 0;
3814 entry->dest_mode = apic->irq_dest_mode; 3786 entry->trigger = 0;
3815 entry->polarity = 0; 3787 entry->mask = 0;
3816 entry->trigger = 0; 3788 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3817 entry->mask = 0;
3818 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3819 3789
3820 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3790 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3821 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 3791 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3833,10 +3803,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3833 struct uv_IO_APIC_route_entry *entry; 3803 struct uv_IO_APIC_route_entry *entry;
3834 int mmr_pnode; 3804 int mmr_pnode;
3835 3805
3806 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3807
3836 mmr_value = 0; 3808 mmr_value = 0;
3837 entry = (struct uv_IO_APIC_route_entry *)&mmr_value; 3809 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3838 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3839
3840 entry->mask = 1; 3810 entry->mask = 1;
3841 3811
3842 mmr_pnode = uv_blade_to_pnode(mmr_blade); 3812 mmr_pnode = uv_blade_to_pnode(mmr_blade);
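
Both UV helpers replace a run-time BUG_ON() buried after the cast with a BUILD_BUG_ON() hoisted to the top of the function. The difference: BUILD_BUG_ON() is evaluated by the compiler, so a struct uv_IO_APIC_route_entry that no longer fits in an unsigned long fails the build instead of panicking the first time the MMR is written. Simplified form of the macro (the real definition lives in <linux/kernel.h>):

/* compile-time assertion: a negative array size is a hard compile error */
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)]))

/* as placed at the top of both functions above, before mmr_value is overlaid */
BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
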
@@ -3900,6 +3870,71 @@ int __init arch_probe_nr_irqs(void)
3900} 3870}
3901#endif 3871#endif
3902 3872
3873static int __io_apic_set_pci_routing(struct device *dev, int irq,
3874 struct io_apic_irq_attr *irq_attr)
3875{
3876 struct irq_desc *desc;
3877 struct irq_cfg *cfg;
3878 int node;
3879 int ioapic, pin;
3880 int trigger, polarity;
3881
3882 ioapic = irq_attr->ioapic;
3883 if (!IO_APIC_IRQ(irq)) {
3884 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3885 ioapic);
3886 return -EINVAL;
3887 }
3888
3889 if (dev)
3890 node = dev_to_node(dev);
3891 else
3892 node = cpu_to_node(boot_cpu_id);
3893
3894 desc = irq_to_desc_alloc_node(irq, node);
3895 if (!desc) {
3896 printk(KERN_INFO "can not get irq_desc %d\n", irq);
3897 return 0;
3898 }
3899
3900 pin = irq_attr->ioapic_pin;
3901 trigger = irq_attr->trigger;
3902 polarity = irq_attr->polarity;
3903
3904 /*
3905 * IRQs < 16 are already in the irq_2_pin[] map
3906 */
3907 if (irq >= NR_IRQS_LEGACY) {
3908 cfg = desc->chip_data;
3909 add_pin_to_irq_node(cfg, node, ioapic, pin);
3910 }
3911
3912 setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
3913
3914 return 0;
3915}
3916
3917int io_apic_set_pci_routing(struct device *dev, int irq,
3918 struct io_apic_irq_attr *irq_attr)
3919{
3920 int ioapic, pin;
3921 /*
3922 * Avoid pin reprogramming. PRTs typically include entries
3923 * with redundant pin->gsi mappings (but unique PCI devices);
3924 * we only program the IOAPIC on the first.
3925 */
3926 ioapic = irq_attr->ioapic;
3927 pin = irq_attr->ioapic_pin;
3928 if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
3929 pr_debug("Pin %d-%d already programmed\n",
3930 mp_ioapics[ioapic].apicid, pin);
3931 return 0;
3932 }
3933 set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
3934
3935 return __io_apic_set_pci_routing(dev, irq, irq_attr);
3936}
3937
3903/* -------------------------------------------------------------------------- 3938/* --------------------------------------------------------------------------
3904 ACPI-based IOAPIC Configuration 3939 ACPI-based IOAPIC Configuration
3905 -------------------------------------------------------------------------- */ 3940 -------------------------------------------------------------------------- */
@@ -3980,6 +4015,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3980 4015
3981 return apic_id; 4016 return apic_id;
3982} 4017}
4018#endif
3983 4019
3984int __init io_apic_get_version(int ioapic) 4020int __init io_apic_get_version(int ioapic)
3985{ 4021{
@@ -3992,39 +4028,6 @@ int __init io_apic_get_version(int ioapic)
3992 4028
3993 return reg_01.bits.version; 4029 return reg_01.bits.version;
3994} 4030}
3995#endif
3996
3997int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
3998{
3999 struct irq_desc *desc;
4000 struct irq_cfg *cfg;
4001 int cpu = boot_cpu_id;
4002
4003 if (!IO_APIC_IRQ(irq)) {
4004 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
4005 ioapic);
4006 return -EINVAL;
4007 }
4008
4009 desc = irq_to_desc_alloc_cpu(irq, cpu);
4010 if (!desc) {
4011 printk(KERN_INFO "can not get irq_desc %d\n", irq);
4012 return 0;
4013 }
4014
4015 /*
4016 * IRQs < 16 are already in the irq_2_pin[] map
4017 */
4018 if (irq >= NR_IRQS_LEGACY) {
4019 cfg = desc->chip_data;
4020 add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
4021 }
4022
4023 setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
4024
4025 return 0;
4026}
4027
4028 4031
4029int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) 4032int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4030{ 4033{
@@ -4055,51 +4058,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4055#ifdef CONFIG_SMP 4058#ifdef CONFIG_SMP
4056void __init setup_ioapic_dest(void) 4059void __init setup_ioapic_dest(void)
4057{ 4060{
4058 int pin, ioapic, irq, irq_entry; 4061 int pin, ioapic = 0, irq, irq_entry;
4059 struct irq_desc *desc; 4062 struct irq_desc *desc;
4060 struct irq_cfg *cfg;
4061 const struct cpumask *mask; 4063 const struct cpumask *mask;
4062 4064
4063 if (skip_ioapic_setup == 1) 4065 if (skip_ioapic_setup == 1)
4064 return; 4066 return;
4065 4067
4066 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { 4068#ifdef CONFIG_ACPI
4067 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 4069 if (!acpi_disabled && acpi_ioapic) {
4068 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 4070 ioapic = mp_find_ioapic(0);
4069 if (irq_entry == -1) 4071 if (ioapic < 0)
4070 continue; 4072 ioapic = 0;
4071 irq = pin_2_irq(irq_entry, ioapic, pin); 4073 }
4072 4074#endif
4073 /* setup_IO_APIC_irqs could fail to get vector for some device
4074 * when you have too many devices, because at that time only boot
4075 * cpu is online.
4076 */
4077 desc = irq_to_desc(irq);
4078 cfg = desc->chip_data;
4079 if (!cfg->vector) {
4080 setup_IO_APIC_irq(ioapic, pin, irq, desc,
4081 irq_trigger(irq_entry),
4082 irq_polarity(irq_entry));
4083 continue;
4084 4075
4085 } 4076 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4077 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4078 if (irq_entry == -1)
4079 continue;
4080 irq = pin_2_irq(irq_entry, ioapic, pin);
4086 4081
4087 /* 4082 desc = irq_to_desc(irq);
4088 * Honour affinities which have been set in early boot
4089 */
4090 if (desc->status &
4091 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4092 mask = desc->affinity;
4093 else
4094 mask = apic->target_cpus();
4095 4083
4096 if (intr_remapping_enabled) 4084 /*
4097 set_ir_ioapic_affinity_irq_desc(desc, mask); 4085 * Honour affinities which have been set in early boot
4098 else 4086 */
4099 set_ioapic_affinity_irq_desc(desc, mask); 4087 if (desc->status &
4100 } 4088 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4089 mask = desc->affinity;
4090 else
4091 mask = apic->target_cpus();
4101 4092
4093 if (intr_remapping_enabled)
4094 set_ir_ioapic_affinity_irq_desc(desc, mask);
4095 else
4096 set_ioapic_affinity_irq_desc(desc, mask);
4102 } 4097 }
4098
4103} 4099}
4104#endif 4100#endif
4105 4101
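
The old io_apic_set_pci_routing(ioapic, pin, irq, triggering, polarity) is replaced by a two-level version that takes a struct device (for NUMA placement of the descriptor) plus a struct io_apic_irq_attr, and that skips pins already marked in the pin_programmed bitmap, so redundant ACPI PRT entries no longer reprogram the same pin. A sketch of a caller, assuming the IO-APIC, pin, trigger and polarity are already known; only the attr fields referenced in the hunk above are used, and the wrapper itself is invented:

static int example_route_gsi(struct device *dev, int irq,
			     int ioapic, int pin, int trigger, int polarity)
{
	struct io_apic_irq_attr irq_attr;

	irq_attr.ioapic     = ioapic;
	irq_attr.ioapic_pin = pin;
	irq_attr.trigger    = trigger;
	irq_attr.polarity   = polarity;

	/* duplicate calls for an already programmed pin return 0 without touching hardware */
	return io_apic_set_pci_routing(dev, irq, &irq_attr);
}
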
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index ce4fbfa315a..b3025b43b63 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
66 66
67static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
68{ 68{
69#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) 69#if defined(CONFIG_X86_NEW_MCE)
70 return atomic_read(&mce_entry) > 0; 70 return atomic_read(&mce_entry) > 0;
71#endif 71#endif
72 return 0; 72 return 0;
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)
104} 104}
105#endif 105#endif
106 106
107static void report_broken_nmi(int cpu, int *prev_nmi_count) 107static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
108{ 108{
109 printk(KERN_CONT "\n"); 109 printk(KERN_CONT "\n");
110 110
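
Two small fixes in nmi.c: mce_in_progress() is now keyed off CONFIG_X86_NEW_MCE, the unified machine-check code, and report_broken_nmi() takes unsigned int * because the per-CPU NMI counters it receives are stored unsigned, so the old int * parameter mixed pointer signedness. Minimal illustration of the latter; the array and caller names are invented:

static unsigned int example_prev_counts[NR_CPUS];	/* how the counters are typed */

static void example_check(int cpu)
{
	/* with the unsigned parameter this call no longer mismatches pointer types */
	report_broken_nmi(cpu, example_prev_counts);
}
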
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 01eda2ac65e..440a8bccd91 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -160,7 +160,6 @@ extern struct apic apic_summit;
160extern struct apic apic_bigsmp; 160extern struct apic apic_bigsmp;
161extern struct apic apic_es7000; 161extern struct apic apic_es7000;
162extern struct apic apic_es7000_cluster; 162extern struct apic apic_es7000_cluster;
163extern struct apic apic_default;
164 163
165struct apic *apic = &apic_default; 164struct apic *apic = &apic_default;
166EXPORT_SYMBOL_GPL(apic); 165EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 1783652bb0e..bc3e880f9b8 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = {
50void __init default_setup_apic_routing(void) 50void __init default_setup_apic_routing(void)
51{ 51{
52#ifdef CONFIG_X86_X2APIC 52#ifdef CONFIG_X86_X2APIC
53 if (x2apic && (apic != &apic_x2apic_phys && 53 if (x2apic_mode && (apic != &apic_x2apic_phys &&
54#ifdef CONFIG_X86_UV 54#ifdef CONFIG_X86_UV
55 apic != &apic_x2apic_uv_x && 55 apic != &apic_x2apic_uv_x &&
56#endif 56#endif
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9cfe1f415d8..344eee4ac0a 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){
173 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); 173 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
174} 174}
175 175
176
177/* In clustered mode, the high nibble of APIC ID is a cluster number.
178 * The low nibble is a 4-bit bitmap. */
179#define XAPIC_DEST_CPUS_SHIFT 4
180#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
181#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
182
183#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) 176#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
184 177
185static const struct cpumask *summit_target_cpus(void) 178static const struct cpumask *summit_target_cpus(void)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 4a903e2f0d1..8e4cbb255c3 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -10,7 +10,7 @@
10#include <asm/apic.h> 10#include <asm/apic.h>
11#include <asm/ipi.h> 11#include <asm/ipi.h>
12 12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); 13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14 14
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{ 16{
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 2bda6935297..096d19aea2f 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
105 cpumask_set_cpu(cpu, retmask); 105 cpumask_set_cpu(cpu, retmask);
106} 106}
107 107
108static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) 108static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
109{ 109{
110#ifdef CONFIG_SMP 110#ifdef CONFIG_SMP
111 unsigned long val; 111 unsigned long val;
@@ -463,7 +463,7 @@ static void uv_heartbeat(unsigned long ignored)
463 uv_set_scir_bits(bits); 463 uv_set_scir_bits(bits);
464 464
465 /* enable next timer period */ 465 /* enable next timer period */
466 mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); 466 mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
467} 467}
468 468
469static void __cpuinit uv_heartbeat_enable(int cpu) 469static void __cpuinit uv_heartbeat_enable(int cpu)
@@ -562,7 +562,7 @@ void __init uv_system_init(void)
562 union uvh_node_id_u node_id; 562 union uvh_node_id_u node_id;
563 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; 563 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
564 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; 564 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
565 int max_pnode = 0; 565 int gnode_extra, max_pnode = 0;
566 unsigned long mmr_base, present, paddr; 566 unsigned long mmr_base, present, paddr;
567 unsigned short pnode_mask; 567 unsigned short pnode_mask;
568 568
@@ -574,6 +574,13 @@ void __init uv_system_init(void)
574 mmr_base = 574 mmr_base =
575 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 575 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
576 ~UV_MMR_ENABLE; 576 ~UV_MMR_ENABLE;
577 pnode_mask = (1 << n_val) - 1;
578 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
579 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
580 gnode_upper = ((unsigned long)gnode_extra << m_val);
581 printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
582 n_val, m_val, gnode_upper, gnode_extra);
583
577 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 584 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
578 585
579 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) 586 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@ -583,15 +590,18 @@ void __init uv_system_init(void)
583 590
584 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 591 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
585 uv_blade_info = kmalloc(bytes, GFP_KERNEL); 592 uv_blade_info = kmalloc(bytes, GFP_KERNEL);
593 BUG_ON(!uv_blade_info);
586 594
587 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); 595 get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
588 596
589 bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); 597 bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
590 uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); 598 uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
599 BUG_ON(!uv_node_to_blade);
591 memset(uv_node_to_blade, 255, bytes); 600 memset(uv_node_to_blade, 255, bytes);
592 601
593 bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); 602 bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
594 uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); 603 uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
604 BUG_ON(!uv_cpu_to_blade);
595 memset(uv_cpu_to_blade, 255, bytes); 605 memset(uv_cpu_to_blade, 255, bytes);
596 606
597 blade = 0; 607 blade = 0;
@@ -607,11 +617,6 @@ void __init uv_system_init(void)
607 } 617 }
608 } 618 }
609 619
610 pnode_mask = (1 << n_val) - 1;
611 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
612 gnode_upper = (((unsigned long)node_id.s.node_id) &
613 ~((1 << n_val) - 1)) << m_val;
614
615 uv_bios_init(); 620 uv_bios_init();
616 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 621 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
617 &sn_coherency_id, &sn_region_size); 622 &sn_coherency_id, &sn_region_size);
@@ -634,6 +639,7 @@ void __init uv_system_init(void)
634 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; 639 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
635 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 640 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
636 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 641 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
642 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
637 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 643 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
638 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 644 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
639 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 645 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
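
uv_system_init() now derives gnode_extra before the blade scan and stores it in each CPU's uv_hub_info alongside gnode_upper. A worked example of the new arithmetic, with made-up register values chosen only to make the bit manipulation visible:

/*
 *   node_id.s.node_id = 0x2c0, n_val = 6, m_val = 40
 *
 *   gnode_extra = (0x2c0 & ~((1 << 6) - 1)) >> 1
 *               = (0x2c0 & ~0x3f) >> 1 = 0x2c0 >> 1 = 0x160
 *   gnode_upper = (unsigned long)0x160 << 40 = 0x1600000000000
 *
 * i.e. the node-id bits above the low n_val bits, kept both as a raw value
 * (gnode_extra) and pre-shifted by m_val (gnode_upper) for use when forming
 * global addresses.
 */
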
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 49e0939bac4..79302e9a33a 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1233,9 +1233,9 @@ static int suspend(int vetoable)
1233 int err; 1233 int err;
1234 struct apm_user *as; 1234 struct apm_user *as;
1235 1235
1236 device_suspend(PMSG_SUSPEND); 1236 dpm_suspend_start(PMSG_SUSPEND);
1237 1237
1238 device_power_down(PMSG_SUSPEND); 1238 dpm_suspend_noirq(PMSG_SUSPEND);
1239 1239
1240 local_irq_disable(); 1240 local_irq_disable();
1241 sysdev_suspend(PMSG_SUSPEND); 1241 sysdev_suspend(PMSG_SUSPEND);
@@ -1259,9 +1259,9 @@ static int suspend(int vetoable)
1259 sysdev_resume(); 1259 sysdev_resume();
1260 local_irq_enable(); 1260 local_irq_enable();
1261 1261
1262 device_power_up(PMSG_RESUME); 1262 dpm_resume_noirq(PMSG_RESUME);
1263 1263
1264 device_resume(PMSG_RESUME); 1264 dpm_resume_end(PMSG_RESUME);
1265 queue_event(APM_NORMAL_RESUME, NULL); 1265 queue_event(APM_NORMAL_RESUME, NULL);
1266 spin_lock(&user_list_lock); 1266 spin_lock(&user_list_lock);
1267 for (as = user_list; as != NULL; as = as->next) { 1267 for (as = user_list; as != NULL; as = as->next) {
@@ -1277,7 +1277,7 @@ static void standby(void)
1277{ 1277{
1278 int err; 1278 int err;
1279 1279
1280 device_power_down(PMSG_SUSPEND); 1280 dpm_suspend_noirq(PMSG_SUSPEND);
1281 1281
1282 local_irq_disable(); 1282 local_irq_disable();
1283 sysdev_suspend(PMSG_SUSPEND); 1283 sysdev_suspend(PMSG_SUSPEND);
@@ -1291,7 +1291,7 @@ static void standby(void)
1291 sysdev_resume(); 1291 sysdev_resume();
1292 local_irq_enable(); 1292 local_irq_enable();
1293 1293
1294 device_power_up(PMSG_RESUME); 1294 dpm_resume_noirq(PMSG_RESUME);
1295} 1295}
1296 1296
1297static apm_event_t get_event(void) 1297static apm_event_t get_event(void)
@@ -1376,7 +1376,7 @@ static void check_events(void)
1376 ignore_bounce = 1; 1376 ignore_bounce = 1;
1377 if ((event != APM_NORMAL_RESUME) 1377 if ((event != APM_NORMAL_RESUME)
1378 || (ignore_normal_resume == 0)) { 1378 || (ignore_normal_resume == 0)) {
1379 device_resume(PMSG_RESUME); 1379 dpm_resume_end(PMSG_RESUME);
1380 queue_event(event, NULL); 1380 queue_event(event, NULL);
1381 } 1381 }
1382 ignore_normal_resume = 0; 1382 ignore_normal_resume = 0;
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 5a6aa1c1162..dfdbf640389 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -126,6 +126,7 @@ void foo(void)
126#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 126#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
127 BLANK(); 127 BLANK();
128 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 128 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
129 OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
129 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); 130 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
130 131
131 BLANK(); 132 BLANK();
@@ -146,4 +147,5 @@ void foo(void)
146 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 147 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
147 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 148 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
148 OFFSET(BP_version, boot_params, hdr.version); 149 OFFSET(BP_version, boot_params, hdr.version);
150 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
149} 151}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72f062fb4b..898ecc47e12 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -125,6 +125,7 @@ int main(void)
125 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 125 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
126 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 126 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
127 OFFSET(BP_version, boot_params, hdr.version); 127 OFFSET(BP_version, boot_params, hdr.version);
128 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
128 129
129 BLANK(); 130 BLANK();
130 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 131 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
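
Both asm-offsets files export hdr.kernel_alignment as BP_kernel_alignment. OFFSET() entries like this are how C structure layouts reach assembly: kbuild compiles the file, scans the generated assembly for the markers, and emits them as constants in asm-offsets.h. Roughly, with the macro definitions as in include/linux/kbuild.h of this era:

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

#define OFFSET(sym, str, mem) \
	DEFINE(sym, offsetof(struct str, mem))

/* so boot assembly can later do, e.g.:  movl BP_kernel_alignment(%esi), %eax */
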
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e..3efcb2b96a1 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details, features and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
25 25
26obj-$(CONFIG_X86_MCE) += mcheck/ 26obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
27obj-$(CONFIG_MTRR) += mtrr/
28obj-$(CONFIG_CPU_FREQ) += cpufreq/
29 27
30obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 28obj-$(CONFIG_X86_MCE) += mcheck/
29obj-$(CONFIG_MTRR) += mtrr/
30obj-$(CONFIG_CPU_FREQ) += cpufreq/
31
32obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
31 33
32quiet_cmd_mkcapflags = MKCAP $@ 34quiet_cmd_mkcapflags = MKCAP $@
33 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 35 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7e4a459daa6..e5b27d8f1b4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -6,6 +6,7 @@
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/apic.h> 7#include <asm/apic.h>
8#include <asm/cpu.h> 8#include <asm/cpu.h>
9#include <asm/pci-direct.h>
9 10
10#ifdef CONFIG_X86_64 11#ifdef CONFIG_X86_64
11# include <asm/numa_64.h> 12# include <asm/numa_64.h>
@@ -272,7 +273,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
272#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 273#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
273 int cpu = smp_processor_id(); 274 int cpu = smp_processor_id();
274 int node; 275 int node;
275 unsigned apicid = hard_smp_processor_id(); 276 unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
276 277
277 node = c->phys_proc_id; 278 node = c->phys_proc_id;
278 if (apicid_to_node[apicid] != NUMA_NO_NODE) 279 if (apicid_to_node[apicid] != NUMA_NO_NODE)
@@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
351 (c->x86_model == 8 && c->x86_mask >= 8)) 352 (c->x86_model == 8 && c->x86_mask >= 8))
352 set_cpu_cap(c, X86_FEATURE_K6_MTRR); 353 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
353#endif 354#endif
355#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
356 /* check CPU config space for extended APIC ID */
357 if (c->x86 >= 0xf) {
358 unsigned int val;
359 val = read_pci_config(0, 24, 0, 0x68);
360 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
361 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
362 }
363#endif
354} 364}
355 365
356static void __cpuinit init_amd(struct cpuinfo_x86 *c) 366static void __cpuinit init_amd(struct cpuinfo_x86 *c)
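
The new early_init_amd() block probes for the extended APIC ID straight from PCI configuration space, before the PCI core is up. For context, the same probe with the magic arguments spelled out in comments; read_pci_config() comes from <asm/pci-direct.h>, newly included above, and talks type-1 config cycles directly, with bus 0, device 0x18, function 0 addressing the on-die northbridge on family 0x0f and later parts:

	unsigned int val;

	/* dword 0x68 of the node-0 northbridge: bus 0, dev 24 (0x18), fn 0 */
	val = read_pci_config(0, 24, 0, 0x68);

	/* the feature is only advertised when both bit 17 and bit 18 are set */
	if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
		set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
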
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c1caefc82e6..9fa33886c0d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
13#include <linux/io.h> 13#include <linux/io.h>
14 14
15#include <asm/stackprotector.h> 15#include <asm/stackprotector.h>
16#include <asm/perf_counter.h>
16#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
17#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -114,6 +115,13 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
114} }; 115} };
115EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 116EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
116 117
118static int __init x86_xsave_setup(char *s)
119{
120 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
121 return 1;
122}
123__setup("noxsave", x86_xsave_setup);
124
117#ifdef CONFIG_X86_32 125#ifdef CONFIG_X86_32
118static int cachesize_override __cpuinitdata = -1; 126static int cachesize_override __cpuinitdata = -1;
119static int disable_x86_serial_nr __cpuinitdata = 1; 127static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -292,7 +300,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
292 return NULL; /* Not found */ 300 return NULL; /* Not found */
293} 301}
294 302
295__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 303__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
304__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
296 305
297void load_percpu_segment(int cpu) 306void load_percpu_segment(int cpu)
298{ 307{
@@ -478,7 +487,6 @@ out:
478static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) 487static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
479{ 488{
480 char *v = c->x86_vendor_id; 489 char *v = c->x86_vendor_id;
481 static int printed;
482 int i; 490 int i;
483 491
484 for (i = 0; i < X86_VENDOR_NUM; i++) { 492 for (i = 0; i < X86_VENDOR_NUM; i++) {
@@ -495,13 +503,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
495 } 503 }
496 } 504 }
497 505
498 if (!printed) { 506 printk_once(KERN_ERR
499 printed++; 507 "CPU: vendor_id '%s' unknown, using generic init.\n" \
500 printk(KERN_ERR 508 "CPU: Your system may be unstable.\n", v);
501 "CPU: vendor_id '%s' unknown, using generic init.\n", v);
502
503 printk(KERN_ERR "CPU: Your system may be unstable.\n");
504 }
505 509
506 c->x86_vendor = X86_VENDOR_UNKNOWN; 510 c->x86_vendor = X86_VENDOR_UNKNOWN;
507 this_cpu = &default_cpu; 511 this_cpu = &default_cpu;
@@ -761,6 +765,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
761 if (this_cpu->c_identify) 765 if (this_cpu->c_identify)
762 this_cpu->c_identify(c); 766 this_cpu->c_identify(c);
763 767
768 /* Clear/Set all flags overriden by options, after probe */
769 for (i = 0; i < NCAPINTS; i++) {
770 c->x86_capability[i] &= ~cpu_caps_cleared[i];
771 c->x86_capability[i] |= cpu_caps_set[i];
772 }
773
764#ifdef CONFIG_X86_64 774#ifdef CONFIG_X86_64
765 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); 775 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
766#endif 776#endif
@@ -806,6 +816,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
806#endif 816#endif
807 817
808 init_hypervisor(c); 818 init_hypervisor(c);
819
820 /*
821 * Clear/Set all flags overriden by options, need do it
822 * before following smp all cpus cap AND.
823 */
824 for (i = 0; i < NCAPINTS; i++) {
825 c->x86_capability[i] &= ~cpu_caps_cleared[i];
826 c->x86_capability[i] |= cpu_caps_set[i];
827 }
828
809 /* 829 /*
810 * On SMP, boot_cpu_data holds the common feature set between 830 * On SMP, boot_cpu_data holds the common feature set between
811 * all CPUs; so make sure that we indicate which features are 831 * all CPUs; so make sure that we indicate which features are
@@ -818,10 +838,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
818 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 838 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
819 } 839 }
820 840
821 /* Clear all flags overriden by options */
822 for (i = 0; i < NCAPINTS; i++)
823 c->x86_capability[i] &= ~cleared_cpu_caps[i];
824
825#ifdef CONFIG_X86_MCE 841#ifdef CONFIG_X86_MCE
826 /* Init Machine Check Exception if available. */ 842 /* Init Machine Check Exception if available. */
827 mcheck_init(c); 843 mcheck_init(c);
@@ -854,6 +870,7 @@ void __init identify_boot_cpu(void)
854#else 870#else
855 vgetcpu_set_mode(); 871 vgetcpu_set_mode();
856#endif 872#endif
873 init_hw_perf_counters();
857} 874}
858 875
859void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 876void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
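
common.c grows a pair of override masks, cpu_caps_cleared and cpu_caps_set, which identify_cpu() now applies both right after ->c_identify() and again before the boot_cpu_data feature AND, so command-line overrides stick on every CPU rather than only the boot CPU. The new "noxsave" option is one user; a sketch of the same pattern for a hypothetical option (option name and chosen feature bit are made up, and setup_clear_cpu_cap() is assumed to record the bit in cpu_caps_cleared as the loops above expect):

static int __init example_nofoo_setup(char *s)
{
	/* any feature bit works; X86_FEATURE_XMM is just an example */
	setup_clear_cpu_cap(X86_FEATURE_XMM);
	return 1;
}
__setup("nofoo", example_nofoo_setup);
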
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 46e29ab96c6..6b2a52dd040 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -32,9 +32,7 @@
32 32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); 33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); 34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
35static DEFINE_PER_CPU(unsigned, cpu_modelflag);
36static DEFINE_PER_CPU(int, cpu_priv_count); 35static DEFINE_PER_CPU(int, cpu_priv_count);
37static DEFINE_PER_CPU(unsigned, cpu_model);
38 36
39static DEFINE_MUTEX(cpu_debug_lock); 37static DEFINE_MUTEX(cpu_debug_lock);
40 38
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {
80 { "value", CPU_REG_ALL, 1 }, 78 { "value", CPU_REG_ALL, 1 },
81}; 79};
82 80
83/* Intel Registers Range */ 81/* CPU Registers Range */
84static struct cpu_debug_range cpu_intel_range[] = { 82static struct cpu_debug_range cpu_reg_range[] = {
85 { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, 83 { 0x00000000, 0x00000001, CPU_MC, },
86 { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, 84 { 0x00000006, 0x00000007, CPU_MONITOR, },
87 { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, 85 { 0x00000010, 0x00000010, CPU_TIME, },
88 { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, 86 { 0x00000011, 0x00000013, CPU_PMC, },
89 { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, 87 { 0x00000017, 0x00000017, CPU_PLATFORM, },
90 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, 88 { 0x0000001B, 0x0000001B, CPU_APIC, },
91 89 { 0x0000002A, 0x0000002B, CPU_POWERON, },
92 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, 90 { 0x0000002C, 0x0000002C, CPU_FREQ, },
93 { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, 91 { 0x0000003A, 0x0000003A, CPU_CONTROL, },
94 { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, 92 { 0x00000040, 0x00000047, CPU_LBRANCH, },
95 { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, 93 { 0x00000060, 0x00000067, CPU_LBRANCH, },
96 94 { 0x00000079, 0x00000079, CPU_BIOS, },
97 { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, 95 { 0x00000088, 0x0000008A, CPU_CACHE, },
98 { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, 96 { 0x0000008B, 0x0000008B, CPU_BIOS, },
99 { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, 97 { 0x0000009B, 0x0000009B, CPU_MONITOR, },
100 { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, 98 { 0x000000C1, 0x000000C4, CPU_PMC, },
101 99 { 0x000000CD, 0x000000CD, CPU_FREQ, },
102 { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, 100 { 0x000000E7, 0x000000E8, CPU_PERF, },
103 { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, 101 { 0x000000FE, 0x000000FE, CPU_MTRR, },
104 { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, 102
105 { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, 103 { 0x00000116, 0x0000011E, CPU_CACHE, },
106 104 { 0x00000174, 0x00000176, CPU_SYSENTER, },
107 { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, 105 { 0x00000179, 0x0000017B, CPU_MC, },
108 { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, 106 { 0x00000186, 0x00000189, CPU_PMC, },
109 { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, 107 { 0x00000198, 0x00000199, CPU_PERF, },
110 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, 108 { 0x0000019A, 0x0000019A, CPU_TIME, },
111 109 { 0x0000019B, 0x0000019D, CPU_THERM, },
112 { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, 110 { 0x000001A0, 0x000001A0, CPU_MISC, },
113 { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, 111 { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
114 { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, 112 { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
115 { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, 113 { 0x000001D9, 0x000001D9, CPU_DEBUG, },
116 { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, 114 { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
117 115
118 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, 116 { 0x00000200, 0x0000020F, CPU_MTRR, },
119 { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, 117 { 0x00000250, 0x00000250, CPU_MTRR, },
120 { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, 118 { 0x00000258, 0x00000259, CPU_MTRR, },
121 { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, 119 { 0x00000268, 0x0000026F, CPU_MTRR, },
122 { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, 120 { 0x00000277, 0x00000277, CPU_PAT, },
123 { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, 121 { 0x000002FF, 0x000002FF, CPU_MTRR, },
124 { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, 122
125 { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, 123 { 0x00000300, 0x00000311, CPU_PMC, },
126 124 { 0x00000345, 0x00000345, CPU_PMC, },
127 { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, 125 { 0x00000360, 0x00000371, CPU_PMC, },
128 { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, 126 { 0x0000038D, 0x00000390, CPU_PMC, },
129 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, 127 { 0x000003A0, 0x000003BE, CPU_PMC, },
130 { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, 128 { 0x000003C0, 0x000003CD, CPU_PMC, },
131 { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, 129 { 0x000003E0, 0x000003E1, CPU_PMC, },
132 { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, 130 { 0x000003F0, 0x000003F2, CPU_PMC, },
133 { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, 131
134 { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, 132 { 0x00000400, 0x00000417, CPU_MC, },
135 133 { 0x00000480, 0x0000048B, CPU_VMX, },
136 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, 134
137 { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, 135 { 0x00000600, 0x00000600, CPU_DEBUG, },
138 { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, 136 { 0x00000680, 0x0000068F, CPU_LBRANCH, },
139 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, 137 { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
140 { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, 138
141 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, 139 { 0x000107CC, 0x000107D3, CPU_PMC, },
142 140
143 { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, 141 { 0xC0000080, 0xC0000080, CPU_FEATURES, },
144 { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, 142 { 0xC0000081, 0xC0000084, CPU_CALL, },
145 { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, 143 { 0xC0000100, 0xC0000102, CPU_BASE, },
146 { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, 144 { 0xC0000103, 0xC0000103, CPU_TIME, },
147 { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, 145
148 { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, 146 { 0xC0010000, 0xC0010007, CPU_PMC, },
149 { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, 147 { 0xC0010010, 0xC0010010, CPU_CONF, },
150 { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, 148 { 0xC0010015, 0xC0010015, CPU_CONF, },
151 { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, 149 { 0xC0010016, 0xC001001A, CPU_MTRR, },
152 { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, 150 { 0xC001001D, 0xC001001D, CPU_MTRR, },
153 { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, 151 { 0xC001001F, 0xC001001F, CPU_CONF, },
154 { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, 152 { 0xC0010030, 0xC0010035, CPU_BIOS, },
155 153 { 0xC0010044, 0xC0010048, CPU_MC, },
156 { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, 154 { 0xC0010050, 0xC0010056, CPU_SMM, },
157 { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, 155 { 0xC0010058, 0xC0010058, CPU_CONF, },
158 { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, 156 { 0xC0010060, 0xC0010060, CPU_CACHE, },
159 { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, 157 { 0xC0010061, 0xC0010068, CPU_SMM, },
160 { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, 158 { 0xC0010069, 0xC001006B, CPU_SMM, },
161 { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, 159 { 0xC0010070, 0xC0010071, CPU_SMM, },
162 { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, 160 { 0xC0010111, 0xC0010113, CPU_SMM, },
163 { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, 161 { 0xC0010114, 0xC0010118, CPU_SVM, },
164 { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, 162 { 0xC0010140, 0xC0010141, CPU_OSVM, },
165 { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, 163 { 0xC0011022, 0xC0011023, CPU_CONF, },
166 { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
167
168 { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
169 { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
170 { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
171
172 { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
173
174 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
175 { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
176 { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
177 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
178}; 164};
179 165
180/* AMD Registers Range */
181static struct cpu_debug_range cpu_amd_range[] = {
182 { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
183 { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
184 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
185 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
186 { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
187 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
188
189 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
190 { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
191 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
192 { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
193
194 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
195 { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
196 { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
197 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
198 { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
199 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
200
201 { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
202
203 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
204 { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
205 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
206 { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
207
208 { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
209 { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
210 { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
211 { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
212 { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
213 { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
214 { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
215 { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
216 { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
217 { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
218 { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
219 { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
220 { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
221 { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
222 { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
223 { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
224 { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
225 { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
226};
227
228
229/* Intel */
230static int get_intel_modelflag(unsigned model)
231{
232 int flag;
233
234 switch (model) {
235 case 0x0501:
236 case 0x0502:
237 case 0x0504:
238 flag = CPU_INTEL_PENTIUM;
239 break;
240 case 0x0601:
241 case 0x0603:
242 case 0x0605:
243 case 0x0607:
244 case 0x0608:
245 case 0x060A:
246 case 0x060B:
247 flag = CPU_INTEL_P6;
248 break;
249 case 0x0609:
250 case 0x060D:
251 flag = CPU_INTEL_PENTIUM_M;
252 break;
253 case 0x060E:
254 flag = CPU_INTEL_CORE;
255 break;
256 case 0x060F:
257 case 0x0617:
258 flag = CPU_INTEL_CORE2;
259 break;
260 case 0x061C:
261 flag = CPU_INTEL_ATOM;
262 break;
263 case 0x0F00:
264 case 0x0F01:
265 case 0x0F02:
266 case 0x0F03:
267 case 0x0F04:
268 flag = CPU_INTEL_XEON_P4;
269 break;
270 case 0x0F06:
271 flag = CPU_INTEL_XEON_MP;
272 break;
273 default:
274 flag = CPU_NONE;
275 break;
276 }
277
278 return flag;
279}
280
281/* AMD */
282static int get_amd_modelflag(unsigned model)
283{
284 int flag;
285
286 switch (model >> 8) {
287 case 0x6:
288 flag = CPU_AMD_K6;
289 break;
290 case 0x7:
291 flag = CPU_AMD_K7;
292 break;
293 case 0x8:
294 flag = CPU_AMD_K8;
295 break;
296 case 0xf:
297 flag = CPU_AMD_0F;
298 break;
299 case 0x10:
300 flag = CPU_AMD_10;
301 break;
302 case 0x11:
303 flag = CPU_AMD_11;
304 break;
305 default:
306 flag = CPU_NONE;
307 break;
308 }
309
310 return flag;
311}
312
313static int get_cpu_modelflag(unsigned cpu)
314{
315 int flag;
316
317 flag = per_cpu(cpu_model, cpu);
318
319 switch (flag >> 16) {
320 case X86_VENDOR_INTEL:
321 flag = get_intel_modelflag(flag);
322 break;
323 case X86_VENDOR_AMD:
324 flag = get_amd_modelflag(flag & 0xffff);
325 break;
326 default:
327 flag = CPU_NONE;
328 break;
329 }
330
331 return flag;
332}
333
334static int get_cpu_range_count(unsigned cpu)
335{
336 int index;
337
338 switch (per_cpu(cpu_model, cpu) >> 16) {
339 case X86_VENDOR_INTEL:
340 index = ARRAY_SIZE(cpu_intel_range);
341 break;
342 case X86_VENDOR_AMD:
343 index = ARRAY_SIZE(cpu_amd_range);
344 break;
345 default:
346 index = 0;
347 break;
348 }
349
350 return index;
351}
352
353static int is_typeflag_valid(unsigned cpu, unsigned flag) 166static int is_typeflag_valid(unsigned cpu, unsigned flag)
354{ 167{
355 unsigned vendor, modelflag; 168 int i;
356 int i, index;
357 169
358 /* Standard Registers should be always valid */ 170 /* Standard Registers should be always valid */
359 if (flag >= CPU_TSS) 171 if (flag >= CPU_TSS)
360 return 1; 172 return 1;
361 173
362 modelflag = per_cpu(cpu_modelflag, cpu); 174 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
363 vendor = per_cpu(cpu_model, cpu) >> 16; 175 if (cpu_reg_range[i].flag == flag)
364 index = get_cpu_range_count(cpu); 176 return 1;
365
366 for (i = 0; i < index; i++) {
367 switch (vendor) {
368 case X86_VENDOR_INTEL:
369 if ((cpu_intel_range[i].model & modelflag) &&
370 (cpu_intel_range[i].flag & flag))
371 return 1;
372 break;
373 case X86_VENDOR_AMD:
374 if ((cpu_amd_range[i].model & modelflag) &&
375 (cpu_amd_range[i].flag & flag))
376 return 1;
377 break;
378 }
379 } 177 }
380 178
381 /* Invalid */ 179 /* Invalid */
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
385static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, 183static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
386 int index, unsigned flag) 184 int index, unsigned flag)
387{ 185{
388 unsigned modelflag; 186 if (cpu_reg_range[index].flag == flag) {
389 187 *min = cpu_reg_range[index].min;
390 modelflag = per_cpu(cpu_modelflag, cpu); 188 *max = cpu_reg_range[index].max;
391 *max = 0; 189 } else
392 switch (per_cpu(cpu_model, cpu) >> 16) { 190 *max = 0;
393 case X86_VENDOR_INTEL:
394 if ((cpu_intel_range[index].model & modelflag) &&
395 (cpu_intel_range[index].flag & flag)) {
396 *min = cpu_intel_range[index].min;
397 *max = cpu_intel_range[index].max;
398 }
399 break;
400 case X86_VENDOR_AMD:
401 if ((cpu_amd_range[index].model & modelflag) &&
402 (cpu_amd_range[index].flag & flag)) {
403 *min = cpu_amd_range[index].min;
404 *max = cpu_amd_range[index].max;
405 }
406 break;
407 }
408 191
409 return *max; 192 return *max;
410} 193}
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
434 unsigned msr, msr_min, msr_max; 217 unsigned msr, msr_min, msr_max;
435 struct cpu_private *priv; 218 struct cpu_private *priv;
436 u32 low, high; 219 u32 low, high;
437 int i, range; 220 int i;
438 221
439 if (seq) { 222 if (seq) {
440 priv = seq->private; 223 priv = seq->private;
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
446 } 229 }
447 } 230 }
448 231
449 range = get_cpu_range_count(cpu); 232 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
450
451 for (i = 0; i < range; i++) {
452 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) 233 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
453 continue; 234 continue;
454 235
@@ -588,8 +369,20 @@ static void print_apic(void *arg)
588 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); 369 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
589 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); 370 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
590 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); 371 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
591#endif /* CONFIG_X86_LOCAL_APIC */ 372 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
373 unsigned int i, v, maxeilvt;
374
375 v = apic_read(APIC_EFEAT);
376 maxeilvt = (v >> 16) & 0xff;
377 seq_printf(seq, " EFEAT\t\t: %08x\n", v);
378 seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
592 379
380 for (i = 0; i < maxeilvt; i++) {
381 v = apic_read(APIC_EILVTn(i));
382 seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
383 }
384 }
385#endif /* CONFIG_X86_LOCAL_APIC */
593 seq_printf(seq, "\n MSR\t:\n"); 386 seq_printf(seq, "\n MSR\t:\n");
594} 387}
595 388
@@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
788{ 581{
789 struct dentry *cpu_dentry = NULL; 582 struct dentry *cpu_dentry = NULL;
790 unsigned reg, reg_min, reg_max; 583 unsigned reg, reg_min, reg_max;
791 int i, range, err = 0; 584 int i, err = 0;
792 char reg_dir[12]; 585 char reg_dir[12];
793 u32 low, high; 586 u32 low, high;
794 587
795 range = get_cpu_range_count(cpu); 588 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
796
797 for (i = 0; i < range; i++) {
798 if (!get_cpu_range(cpu, &reg_min, &reg_max, i, 589 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
799 cpu_base[type].flag)) 590 cpu_base[type].flag))
800 continue; 591 continue;
@@ -850,10 +641,6 @@ static int cpu_init_cpu(void)
850 cpui = &cpu_data(cpu); 641 cpui = &cpu_data(cpu);
851 if (!cpu_has(cpui, X86_FEATURE_MSR)) 642 if (!cpu_has(cpui, X86_FEATURE_MSR))
852 continue; 643 continue;
853 per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
854 (cpui->x86 << 8) |
855 (cpui->x86_model));
856 per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
857 644
858 sprintf(cpu_dir, "cpu%d", cpu); 645 sprintf(cpu_dir, "cpu%d", cpu);
859 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); 646 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
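The cpu_debug hunks above collapse the per-vendor, model-gated MSR tables into a single flat cpu_reg_range[] that is simply scanned with ARRAY_SIZE(). A minimal user-space sketch of that flat range lookup (illustrative names and values, not the kernel's table):

#include <stdio.h>

struct reg_range {
	unsigned min;	/* first MSR in the range */
	unsigned max;	/* last MSR in the range */
	unsigned flag;	/* register class this range belongs to */
};

/* Illustrative table: one entry per contiguous MSR range. */
static const struct reg_range reg_range[] = {
	{ 0x00000010, 0x00000010, 1 /* "time" class */ },
	{ 0x00000200, 0x0000020F, 2 /* "mtrr" class */ },
	{ 0xC0000080, 0xC0000080, 3 /* "features" class */ },
};

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

/* First-match scan, analogous to the simplified range handling above. */
static int lookup_range(unsigned flag, unsigned *min, unsigned *max)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(reg_range); i++) {
		if (reg_range[i].flag == flag) {
			*min = reg_range[i].min;
			*max = reg_range[i].max;
			return 1;
		}
	}
	return 0;
}

int main(void)
{
	unsigned lo, hi;

	if (lookup_range(2, &lo, &hi))
		printf("range: %#x..%#x\n", lo, hi);
	return 0;
}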
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 52c83987547..f138c6c389b 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -220,11 +220,14 @@ config X86_LONGHAUL
220 If in doubt, say N. 220 If in doubt, say N.
221 221
222config X86_E_POWERSAVER 222config X86_E_POWERSAVER
223 tristate "VIA C7 Enhanced PowerSaver" 223 tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
224 select CPU_FREQ_TABLE 224 select CPU_FREQ_TABLE
225 depends on X86_32 225 depends on X86_32 && EXPERIMENTAL
226 help 226 help
227 This adds the CPUFreq driver for VIA C7 processors. 227 This adds the CPUFreq driver for VIA C7 processors. However, this driver
228 does not have any safeguards to prevent operating the CPU out of spec
229 and is thus considered dangerous. Please use the regular ACPI cpufreq
230 driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
228 231
229 If in doubt, say N. 232 If in doubt, say N.
230 233
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 208ecf6643d..ae9b503220c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)
90{ 90{
91 struct cpuinfo_x86 *cpu = &cpu_data(cpuid); 91 struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
92 92
93 if (cpu->x86_vendor != X86_VENDOR_INTEL || 93 return cpu_has(cpu, X86_FEATURE_EST);
94 !cpu_has(cpu, X86_FEATURE_EST))
95 return 0;
96
97 return 1;
98} 94}
99 95
100static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) 96static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
@@ -550,7 +546,7 @@ static int __init acpi_cpufreq_early_init(void)
550 return -ENOMEM; 546 return -ENOMEM;
551 } 547 }
552 for_each_possible_cpu(i) { 548 for_each_possible_cpu(i) {
553 if (!alloc_cpumask_var_node( 549 if (!zalloc_cpumask_var_node(
554 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, 550 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
555 GFP_KERNEL, cpu_to_node(i))) { 551 GFP_KERNEL, cpu_to_node(i))) {
556 552
@@ -693,8 +689,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
693 if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && 689 if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
694 policy->cpuinfo.transition_latency > 20 * 1000) { 690 policy->cpuinfo.transition_latency > 20 * 1000) {
695 policy->cpuinfo.transition_latency = 20 * 1000; 691 policy->cpuinfo.transition_latency = 20 * 1000;
696 printk_once(KERN_INFO "Capping off P-state tranision" 692 printk_once(KERN_INFO
697 " latency at 20 uS\n"); 693 "P-state transition latency capped at 20 uS\n");
698 } 694 }
699 695
700 /* table init */ 696 /* table init */
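Several drivers in this series move from alloc_cpumask_var() to zalloc_cpumask_var(), which hands back an already-zeroed mask. A rough sketch of the two patterns side by side (assuming a normal GFP_KERNEL context; example_alloc() is a hypothetical caller, not part of the patch):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

int example_alloc(void)
{
	cpumask_var_t mask;

	/* Old pattern: allocate, then clear by hand. */
	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	cpumask_clear(mask);
	free_cpumask_var(mask);

	/* New pattern: the allocation comes back zeroed. */
	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	free_cpumask_var(mask);

	return 0;
}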
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 6ac55bd341a..86961519372 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -168,6 +168,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
168 case 0x0E: /* Core */ 168 case 0x0E: /* Core */
169 case 0x0F: /* Core Duo */ 169 case 0x0F: /* Core Duo */
170 case 0x16: /* Celeron Core */ 170 case 0x16: /* Celeron Core */
171 case 0x1C: /* Atom */
171 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; 172 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
172 return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE); 173 return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
173 case 0x0D: /* Pentium M (Dothan) */ 174 case 0x0D: /* Pentium M (Dothan) */
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 3c28ccd4974..d47c775eb0a 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -168,10 +168,12 @@ static int check_powernow(void)
168 return 1; 168 return 1;
169} 169}
170 170
171#ifdef CONFIG_X86_POWERNOW_K7_ACPI
171static void invalidate_entry(unsigned int entry) 172static void invalidate_entry(unsigned int entry)
172{ 173{
173 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; 174 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
174} 175}
176#endif
175 177
176static int get_ranges(unsigned char *pst) 178static int get_ranges(unsigned char *pst)
177{ 179{
@@ -320,7 +322,7 @@ static int powernow_acpi_init(void)
320 goto err0; 322 goto err0;
321 } 323 }
322 324
323 if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, 325 if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
324 GFP_KERNEL)) { 326 GFP_KERNEL)) {
325 retval = -ENOMEM; 327 retval = -ENOMEM;
326 goto err05; 328 goto err05;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 4709ead2db5..cf52215d9eb 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -649,6 +649,20 @@ static void print_basics(struct powernow_k8_data *data)
649 data->batps); 649 data->batps);
650} 650}
651 651
652static u32 freq_from_fid_did(u32 fid, u32 did)
653{
654 u32 mhz = 0;
655
656 if (boot_cpu_data.x86 == 0x10)
657 mhz = (100 * (fid + 0x10)) >> did;
658 else if (boot_cpu_data.x86 == 0x11)
659 mhz = (100 * (fid + 8)) >> did;
660 else
661 BUG();
662
663 return mhz * 1000;
664}
665
652static int fill_powernow_table(struct powernow_k8_data *data, 666static int fill_powernow_table(struct powernow_k8_data *data,
653 struct pst_s *pst, u8 maxvid) 667 struct pst_s *pst, u8 maxvid)
654{ 668{
@@ -821,7 +835,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
821{ 835{
822 struct cpufreq_frequency_table *powernow_table; 836 struct cpufreq_frequency_table *powernow_table;
823 int ret_val = -ENODEV; 837 int ret_val = -ENODEV;
824 acpi_integer space_id; 838 acpi_integer control, status;
825 839
826 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 840 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
827 dprintk("register performance failed: bad ACPI data\n"); 841 dprintk("register performance failed: bad ACPI data\n");
@@ -834,12 +848,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
834 goto err_out; 848 goto err_out;
835 } 849 }
836 850
837 space_id = data->acpi_data.control_register.space_id; 851 control = data->acpi_data.control_register.space_id;
838 if ((space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || 852 status = data->acpi_data.status_register.space_id;
839 (space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { 853
854 if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
855 (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
840 dprintk("Invalid control/status registers (%x - %x)\n", 856 dprintk("Invalid control/status registers (%x - %x)\n",
841 data->acpi_data.control_register.space_id, 857 control, status);
842 space_id);
843 goto err_out; 858 goto err_out;
844 } 859 }
845 860
@@ -872,7 +887,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
872 /* notify BIOS that we exist */ 887 /* notify BIOS that we exist */
873 acpi_processor_notify_smm(THIS_MODULE); 888 acpi_processor_notify_smm(THIS_MODULE);
874 889
875 if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { 890 if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
876 printk(KERN_ERR PFX 891 printk(KERN_ERR PFX
877 "unable to alloc powernow_k8_data cpumask\n"); 892 "unable to alloc powernow_k8_data cpumask\n");
878 ret_val = -ENOMEM; 893 ret_val = -ENOMEM;
@@ -923,8 +938,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
923 938
924 powernow_table[i].index = index; 939 powernow_table[i].index = index;
925 940
926 powernow_table[i].frequency = 941 /* Frequency may be rounded for these */
927 data->acpi_data.states[i].core_frequency * 1000; 942 if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) {
943 powernow_table[i].frequency =
944 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
945 } else
946 powernow_table[i].frequency =
947 data->acpi_data.states[i].core_frequency * 1000;
928 } 948 }
929 return 0; 949 return 0;
930} 950}
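For family 0x10 and 0x11 parts, the new freq_from_fid_did() derives the core frequency from the hardware fid/did fields instead of trusting the rounded ACPI core_frequency. A small stand-alone illustration of the family 0x10 arithmetic (the fid/did values below are made up for the example):

#include <stdio.h>

/* Family 0x10: frequency = 100 MHz * (CpuFid + 0x10) >> CpuDid */
static unsigned freq_khz_fam10(unsigned fid, unsigned did)
{
	return ((100 * (fid + 0x10)) >> did) * 1000;
}

int main(void)
{
	/* fid=0x10, did=0 -> 100 * 0x20 = 3200 MHz */
	printf("%u kHz\n", freq_khz_fam10(0x10, 0));
	/* fid=0x10, did=1 -> halved -> 1600 MHz */
	printf("%u kHz\n", freq_khz_fam10(0x10, 1));
	return 0;
}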
@@ -1215,13 +1235,16 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
1215 return cpufreq_frequency_table_verify(pol, data->powernow_table); 1235 return cpufreq_frequency_table_verify(pol, data->powernow_table);
1216} 1236}
1217 1237
1238static const char ACPI_PSS_BIOS_BUG_MSG[] =
1239 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
1240 KERN_ERR FW_BUG PFX "Try again with latest BIOS.\n";
1241
1218/* per CPU init entry point to the driver */ 1242/* per CPU init entry point to the driver */
1219static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) 1243static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1220{ 1244{
1221 struct powernow_k8_data *data; 1245 struct powernow_k8_data *data;
1222 cpumask_t oldmask; 1246 cpumask_t oldmask;
1223 int rc; 1247 int rc;
1224 static int print_once;
1225 1248
1226 if (!cpu_online(pol->cpu)) 1249 if (!cpu_online(pol->cpu))
1227 return -ENODEV; 1250 return -ENODEV;
@@ -1244,19 +1267,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1244 * an UP version, and is deprecated by AMD. 1267 * an UP version, and is deprecated by AMD.
1245 */ 1268 */
1246 if (num_online_cpus() != 1) { 1269 if (num_online_cpus() != 1) {
1247 /* 1270 printk_once(ACPI_PSS_BIOS_BUG_MSG);
1248 * Replace this one with print_once as soon as such a
1249 * thing gets introduced
1250 */
1251 if (!print_once) {
1252 WARN_ONCE(1, KERN_ERR FW_BUG PFX "Your BIOS "
1253 "does not provide ACPI _PSS objects "
1254 "in a way that Linux understands. "
1255 "Please report this to the Linux ACPI"
1256 " maintainers and complain to your "
1257 "BIOS vendor.\n");
1258 print_once++;
1259 }
1260 goto err_out; 1271 goto err_out;
1261 } 1272 }
1262 if (pol->cpu != 0) { 1273 if (pol->cpu != 0) {
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index c9f1fdc0283..55c831ed71c 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -471,7 +471,7 @@ static int centrino_target (struct cpufreq_policy *policy,
471 471
472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) 472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL)))
473 return -ENOMEM; 473 return -ENOMEM;
474 if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { 474 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
475 free_cpumask_var(saved_mask); 475 free_cpumask_var(saved_mask);
476 return -ENOMEM; 476 return -ENOMEM;
477 } 477 }
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7437fa133c0..3260ab04499 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -86,6 +86,29 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
86 */ 86 */
87 if (c->x86 == 6 && c->x86_model < 15) 87 if (c->x86 == 6 && c->x86_model < 15)
88 clear_cpu_cap(c, X86_FEATURE_PAT); 88 clear_cpu_cap(c, X86_FEATURE_PAT);
89
90#ifdef CONFIG_KMEMCHECK
91 /*
92 * P4s have a "fast strings" feature which causes single-
93 * stepping REP instructions to only generate a #DB on
94 * cache-line boundaries.
95 *
96 * Ingo Molnar reported a Pentium D (model 6) and a Xeon
97 * (model 2) with the same problem.
98 */
99 if (c->x86 == 15) {
100 u64 misc_enable;
101
102 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
103
104 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
105 printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
106
107 misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
108 wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
109 }
110 }
111#endif
89} 112}
90 113
91#ifdef CONFIG_X86_32 114#ifdef CONFIG_X86_32
@@ -229,12 +252,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
229} 252}
230#endif 253#endif
231 254
232static void __cpuinit srat_detect_node(void) 255static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
233{ 256{
234#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 257#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
235 unsigned node; 258 unsigned node;
236 int cpu = smp_processor_id(); 259 int cpu = smp_processor_id();
237 int apicid = hard_smp_processor_id(); 260 int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
238 261
239 /* Don't do the funky fallback heuristics the AMD version employs 262 /* Don't do the funky fallback heuristics the AMD version employs
240 for now. */ 263 for now. */
@@ -400,7 +423,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
400 } 423 }
401 424
402 /* Work around errata */ 425 /* Work around errata */
403 srat_detect_node(); 426 srat_detect_node(c);
404 427
405 if (cpu_has(c, X86_FEATURE_VMX)) 428 if (cpu_has(c, X86_FEATURE_VMX))
406 detect_vmx_virtcap(c); 429 detect_vmx_virtcap(c);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 483eda96e10..789efe217e1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,6 +17,7 @@
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20#include <asm/k8.h>
20 21
21#define LVL_1_INST 1 22#define LVL_1_INST 1
22#define LVL_1_DATA 2 23#define LVL_1_DATA 2
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs {
159 unsigned long can_disable; 160 unsigned long can_disable;
160}; 161};
161 162
162#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
163static struct pci_device_id k8_nb_id[] = {
164 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
165 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
166 {}
167};
168#endif
169
170unsigned short num_cache_leaves; 163unsigned short num_cache_leaves;
171 164
172/* AMD doesn't have CPUID4. Emulate it here to report the same 165/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -207,10 +200,17 @@ union l3_cache {
207}; 200};
208 201
209static const unsigned short __cpuinitconst assocs[] = { 202static const unsigned short __cpuinitconst assocs[] = {
210 [1] = 1, [2] = 2, [4] = 4, [6] = 8, 203 [1] = 1,
211 [8] = 16, [0xa] = 32, [0xb] = 48, 204 [2] = 2,
205 [4] = 4,
206 [6] = 8,
207 [8] = 16,
208 [0xa] = 32,
209 [0xb] = 48,
212 [0xc] = 64, 210 [0xc] = 64,
213 [0xf] = 0xffff // ?? 211 [0xd] = 96,
212 [0xe] = 128,
213 [0xf] = 0xffff /* fully associative - no way to show this currently */
214}; 214};
215 215
216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; 216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
@@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
271 eax->split.type = types[leaf]; 271 eax->split.type = types[leaf];
272 eax->split.level = levels[leaf]; 272 eax->split.level = levels[leaf];
273 if (leaf == 3) 273 if (leaf == 3)
274 eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; 274 eax->split.num_threads_sharing =
275 current_cpu_data.x86_max_cores - 1;
275 else 276 else
276 eax->split.num_threads_sharing = 0; 277 eax->split.num_threads_sharing = 0;
277 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; 278 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
@@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
291{ 292{
292 if (index < 3) 293 if (index < 3)
293 return; 294 return;
295
296 if (boot_cpu_data.x86 == 0x11)
297 return;
298
299 /* see erratum #382 */
300 if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
301 return;
302
294 this_leaf->can_disable = 1; 303 this_leaf->can_disable = 1;
295} 304}
296 305
@@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
696#define to_object(k) container_of(k, struct _index_kobject, kobj) 705#define to_object(k) container_of(k, struct _index_kobject, kobj)
697#define to_attr(a) container_of(a, struct _cache_attr, attr) 706#define to_attr(a) container_of(a, struct _cache_attr, attr)
698 707
699#ifdef CONFIG_PCI 708static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
700static struct pci_dev *get_k8_northbridge(int node) 709 unsigned int index)
701{
702 struct pci_dev *dev = NULL;
703 int i;
704
705 for (i = 0; i <= node; i++) {
706 do {
707 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
708 if (!dev)
709 break;
710 } while (!pci_match_id(&k8_nb_id[0], dev));
711 if (!dev)
712 break;
713 }
714 return dev;
715}
716#else
717static struct pci_dev *get_k8_northbridge(int node)
718{
719 return NULL;
720}
721#endif
722
723static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
724{ 710{
725 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); 711 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
726 int node = cpu_to_node(cpumask_first(mask)); 712 int node = cpu_to_node(cpu);
727 struct pci_dev *dev = NULL; 713 struct pci_dev *dev = node_to_k8_nb_misc(node);
728 ssize_t ret = 0; 714 unsigned int reg = 0;
729 int i;
730 715
731 if (!this_leaf->can_disable) 716 if (!this_leaf->can_disable)
732 return sprintf(buf, "Feature not enabled\n");
733
734 dev = get_k8_northbridge(node);
735 if (!dev) {
736 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
737 return -EINVAL; 717 return -EINVAL;
738 }
739 718
740 for (i = 0; i < 2; i++) { 719 if (!dev)
741 unsigned int reg; 720 return -EINVAL;
742 721
743 pci_read_config_dword(dev, 0x1BC + i * 4, &reg); 722 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
723 return sprintf(buf, "%x\n", reg);
724}
744 725
745 ret += sprintf(buf, "%sEntry: %d\n", buf, i); 726#define SHOW_CACHE_DISABLE(index) \
746 ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", 727static ssize_t \
747 buf, 728show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
748 reg & 0x80000000 ? "Disabled" : "Allowed", 729{ \
749 reg & 0x40000000 ? "Disabled" : "Allowed"); 730 return show_cache_disable(this_leaf, buf, index); \
750 ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
751 buf, (reg & 0x30000) >> 16, reg & 0xfff);
752 }
753 return ret;
754} 731}
732SHOW_CACHE_DISABLE(0)
733SHOW_CACHE_DISABLE(1)
755 734
756static ssize_t 735static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
757store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, 736 const char *buf, size_t count, unsigned int index)
758 size_t count)
759{ 737{
760 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); 738 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
761 int node = cpu_to_node(cpumask_first(mask)); 739 int node = cpu_to_node(cpu);
762 struct pci_dev *dev = NULL; 740 struct pci_dev *dev = node_to_k8_nb_misc(node);
763 unsigned int ret, index, val; 741 unsigned long val = 0;
742 unsigned int scrubber = 0;
764 743
765 if (!this_leaf->can_disable) 744 if (!this_leaf->can_disable)
766 return 0;
767
768 if (strlen(buf) > 15)
769 return -EINVAL; 745 return -EINVAL;
770 746
771 ret = sscanf(buf, "%x %x", &index, &val); 747 if (!capable(CAP_SYS_ADMIN))
772 if (ret != 2) 748 return -EPERM;
749
750 if (!dev)
773 return -EINVAL; 751 return -EINVAL;
774 if (index > 1) 752
753 if (strict_strtoul(buf, 10, &val) < 0)
775 return -EINVAL; 754 return -EINVAL;
776 755
777 val |= 0xc0000000; 756 val |= 0xc0000000;
778 dev = get_k8_northbridge(node); 757
779 if (!dev) { 758 pci_read_config_dword(dev, 0x58, &scrubber);
780 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); 759 scrubber &= ~0x1f000000;
781 return -EINVAL; 760 pci_write_config_dword(dev, 0x58, scrubber);
782 }
783 761
784 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); 762 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
785 wbinvd(); 763 wbinvd();
786 pci_write_config_dword(dev, 0x1BC + index * 4, val); 764 pci_write_config_dword(dev, 0x1BC + index * 4, val);
765 return count;
766}
787 767
788 return 1; 768#define STORE_CACHE_DISABLE(index) \
769static ssize_t \
770store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
771 const char *buf, size_t count) \
772{ \
773 return store_cache_disable(this_leaf, buf, count, index); \
789} 774}
775STORE_CACHE_DISABLE(0)
776STORE_CACHE_DISABLE(1)
790 777
791struct _cache_attr { 778struct _cache_attr {
792 struct attribute attr; 779 struct attribute attr;
@@ -808,7 +795,10 @@ define_one_ro(size);
808define_one_ro(shared_cpu_map); 795define_one_ro(shared_cpu_map);
809define_one_ro(shared_cpu_list); 796define_one_ro(shared_cpu_list);
810 797
811static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); 798static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
799 show_cache_disable_0, store_cache_disable_0);
800static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
801 show_cache_disable_1, store_cache_disable_1);
812 802
813static struct attribute * default_attrs[] = { 803static struct attribute * default_attrs[] = {
814 &type.attr, 804 &type.attr,
@@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = {
820 &size.attr, 810 &size.attr,
821 &shared_cpu_map.attr, 811 &shared_cpu_map.attr,
822 &shared_cpu_list.attr, 812 &shared_cpu_list.attr,
823 &cache_disable.attr, 813 &cache_disable_0.attr,
814 &cache_disable_1.attr,
824 NULL 815 NULL
825}; 816};
826 817
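The single cache_disable sysfs file becomes cache_disable_0 and cache_disable_1, with the per-index show/store handlers stamped out by the SHOW_CACHE_DISABLE()/STORE_CACHE_DISABLE() macros. A condensed, user-space sketch of that macro-stamping pattern (hypothetical names, not the patch itself):

#include <stdio.h>

/* Common worker takes the index explicitly... */
static int show_thing(char *buf, unsigned int index)
{
	return sprintf(buf, "index %u\n", index);
}

/* ...and a macro stamps out one thin per-index wrapper. */
#define SHOW_THING(index)			\
static int show_thing_##index(char *buf)	\
{						\
	return show_thing(buf, index);		\
}
SHOW_THING(0)
SHOW_THING(1)

int main(void)
{
	char buf[32];

	show_thing_0(buf);
	fputs(buf, stdout);
	show_thing_1(buf);
	fputs(buf, stdout);
	return 0;
}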
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index b2f89829bbe..45004faf67e 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,7 +1,11 @@
1obj-y = mce_$(BITS).o therm_throt.o 1obj-y = mce.o therm_throt.o
2 2
3obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o 3obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o 4obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
5obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
6obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o
7obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o mce_intel.o
5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o 8obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o 9obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
7obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o 10obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
11obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index dd3af6e7b39..89e51042415 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -2,11 +2,10 @@
2 * Athlon specific Machine Check Exception Reporting 2 * Athlon specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@redhat.com> 3 * (C) Copyright 2002 Dave Jones <davej@redhat.com>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
@@ -15,12 +14,12 @@
15 14
16#include "mce.h" 15#include "mce.h"
17 16
18/* Machine Check Handler For AMD Athlon/Duron */ 17/* Machine Check Handler For AMD Athlon/Duron: */
19static void k7_machine_check(struct pt_regs *regs, long error_code) 18static void k7_machine_check(struct pt_regs *regs, long error_code)
20{ 19{
21 int recover = 1;
22 u32 alow, ahigh, high, low; 20 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth; 21 u32 mcgstl, mcgsth;
22 int recover = 1;
24 int i; 23 int i;
25 24
26 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 25 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -32,15 +31,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
32 31
33 for (i = 1; i < nr_mce_banks; i++) { 32 for (i = 1; i < nr_mce_banks; i++) {
34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); 33 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
35 if (high&(1<<31)) { 34 if (high & (1<<31)) {
36 char misc[20]; 35 char misc[20];
37 char addr[24]; 36 char addr[24];
38 misc[0] = addr[0] = '\0'; 37
38 misc[0] = '\0';
39 addr[0] = '\0';
40
39 if (high & (1<<29)) 41 if (high & (1<<29))
40 recover |= 1; 42 recover |= 1;
41 if (high & (1<<25)) 43 if (high & (1<<25))
42 recover |= 2; 44 recover |= 2;
43 high &= ~(1<<31); 45 high &= ~(1<<31);
46
44 if (high & (1<<27)) { 47 if (high & (1<<27)) {
45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); 48 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow); 49 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,27 +52,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 52 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
50 snprintf(addr, 24, " at %08x%08x", ahigh, alow); 53 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
51 } 54 }
55
52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", 56 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr); 57 smp_processor_id(), i, high, low, misc, addr);
54 /* Clear it */ 58
59 /* Clear it: */
55 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); 60 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
56 /* Serialize */ 61 /* Serialize: */
57 wmb(); 62 wmb();
58 add_taint(TAINT_MACHINE_CHECK); 63 add_taint(TAINT_MACHINE_CHECK);
59 } 64 }
60 } 65 }
61 66
62 if (recover&2) 67 if (recover & 2)
63 panic("CPU context corrupt"); 68 panic("CPU context corrupt");
64 if (recover&1) 69 if (recover & 1)
65 panic("Unable to continue"); 70 panic("Unable to continue");
71
66 printk(KERN_EMERG "Attempting to continue.\n"); 72 printk(KERN_EMERG "Attempting to continue.\n");
73
67 mcgstl &= ~(1<<2); 74 mcgstl &= ~(1<<2);
68 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 75 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
69} 76}
70 77
71 78
72/* AMD K7 machine check is Intel like */ 79/* AMD K7 machine check is Intel like: */
73void amd_mcheck_init(struct cpuinfo_x86 *c) 80void amd_mcheck_init(struct cpuinfo_x86 *c)
74{ 81{
75 u32 l, h; 82 u32 l, h;
@@ -79,21 +86,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
79 return; 86 return;
80 87
81 machine_check_vector = k7_machine_check; 88 machine_check_vector = k7_machine_check;
89 /* Make sure the vector pointer is visible before we enable MCEs: */
82 wmb(); 90 wmb();
83 91
84 printk(KERN_INFO "Intel machine check architecture supported.\n"); 92 printk(KERN_INFO "Intel machine check architecture supported.\n");
93
85 rdmsr(MSR_IA32_MCG_CAP, l, h); 94 rdmsr(MSR_IA32_MCG_CAP, l, h);
86 if (l & (1<<8)) /* Control register present ? */ 95 if (l & (1<<8)) /* Control register present ? */
87 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 96 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
88 nr_mce_banks = l & 0xff; 97 nr_mce_banks = l & 0xff;
89 98
90 /* Clear status for MC index 0 separately, we don't touch CTL, 99 /*
91 * as some K7 Athlons cause spurious MCEs when its enabled. */ 100 * Clear status for MC index 0 separately, we don't touch CTL,
101 * as some K7 Athlons cause spurious MCEs when its enabled:
102 */
92 if (boot_cpu_data.x86 == 6) { 103 if (boot_cpu_data.x86 == 6) {
93 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); 104 wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
94 i = 1; 105 i = 1;
95 } else 106 } else
96 i = 0; 107 i = 0;
108
97 for (; i < nr_mce_banks; i++) { 109 for (; i < nr_mce_banks; i++) {
98 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); 110 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
99 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); 111 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
new file mode 100644
index 00000000000..a3a235a53f0
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -0,0 +1,127 @@
1/*
2 * Machine check injection support.
3 * Copyright 2008 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Authors:
11 * Andi Kleen
12 * Ying Huang
13 */
14#include <linux/uaccess.h>
15#include <linux/module.h>
16#include <linux/timer.h>
17#include <linux/kernel.h>
18#include <linux/string.h>
19#include <linux/fs.h>
20#include <linux/smp.h>
21#include <asm/mce.h>
22
23/* Update fake mce registers on current CPU. */
24static void inject_mce(struct mce *m)
25{
26 struct mce *i = &per_cpu(injectm, m->extcpu);
27
 28 /* Make sure no one reads a partially written injectm */
29 i->finished = 0;
30 mb();
31 m->finished = 0;
32 /* First set the fields after finished */
33 i->extcpu = m->extcpu;
34 mb();
35 /* Now write record in order, finished last (except above) */
36 memcpy(i, m, sizeof(struct mce));
37 /* Finally activate it */
38 mb();
39 i->finished = 1;
40}
41
42struct delayed_mce {
43 struct timer_list timer;
44 struct mce m;
45};
46
47/* Inject mce on current CPU */
48static void raise_mce(unsigned long data)
49{
50 struct delayed_mce *dm = (struct delayed_mce *)data;
51 struct mce *m = &dm->m;
52 int cpu = m->extcpu;
53
54 inject_mce(m);
55 if (m->status & MCI_STATUS_UC) {
56 struct pt_regs regs;
57 memset(&regs, 0, sizeof(struct pt_regs));
58 regs.ip = m->ip;
59 regs.cs = m->cs;
60 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
61 do_machine_check(&regs, 0);
62 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
63 } else {
64 mce_banks_t b;
65 memset(&b, 0xff, sizeof(mce_banks_t));
66 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
67 machine_check_poll(0, &b);
68 mce_notify_irq();
69 printk(KERN_INFO "Finished machine check poll on CPU %d\n",
70 cpu);
71 }
72 kfree(dm);
73}
74
75/* Error injection interface */
76static ssize_t mce_write(struct file *filp, const char __user *ubuf,
77 size_t usize, loff_t *off)
78{
79 struct delayed_mce *dm;
80 struct mce m;
81
82 if (!capable(CAP_SYS_ADMIN))
83 return -EPERM;
84 /*
85 * There are some cases where real MSR reads could slip
86 * through.
87 */
88 if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
89 return -EIO;
90
91 if ((unsigned long)usize > sizeof(struct mce))
92 usize = sizeof(struct mce);
93 if (copy_from_user(&m, ubuf, usize))
94 return -EFAULT;
95
96 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
97 return -EINVAL;
98
99 dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
100 if (!dm)
101 return -ENOMEM;
102
103 /*
104 * Need to give user space some time to set everything up,
105 * so do it a jiffie or two later everywhere.
106 * Should we use a hrtimer here for better synchronization?
107 */
108 memcpy(&dm->m, &m, sizeof(struct mce));
109 setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
110 dm->timer.expires = jiffies + 2;
111 add_timer_on(&dm->timer, m.extcpu);
112 return usize;
113}
114
115static int inject_init(void)
116{
117 printk(KERN_INFO "Machine check injector initialized\n");
118 mce_chrdev_ops.write = mce_write;
119 return 0;
120}
121
122module_init(inject_init);
123/*
124 * Cannot tolerate unloading currently because we cannot
125 * guarantee all openers of mce_chrdev will get a reference to us.
126 */
127MODULE_LICENSE("GPL");
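inject_mce() publishes the fake record behind a 'finished' flag with memory barriers on both sides, so a racing reader never sees a half-written struct. A generic sketch of that publish/consume ordering using GCC builtins (illustrative, not the injector's code):

#include <stdio.h>
#include <string.h>

struct record {
	int finished;	/* 1 once the payload below is valid */
	int payload[4];
};

static struct record shared;

/* Writer: invalidate, fill in, then publish. */
static void publish(const int *data)
{
	shared.finished = 0;
	__sync_synchronize();		/* order the clear before the copy */
	memcpy(shared.payload, data, sizeof(shared.payload));
	__sync_synchronize();		/* order the copy before the flag */
	shared.finished = 1;
}

/* Reader: only trust the payload once 'finished' is seen. */
static int consume(int *out)
{
	if (!shared.finished)
		return 0;
	__sync_synchronize();
	memcpy(out, shared.payload, sizeof(shared.payload));
	return 1;
}

int main(void)
{
	int in[4] = { 1, 2, 3, 4 }, out[4] = { 0 };

	publish(in);
	if (consume(out))
		printf("read %d %d %d %d\n", out[0], out[1], out[2], out[3]);
	return 0;
}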
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
new file mode 100644
index 00000000000..54dcb8ff12e
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -0,0 +1,15 @@
1#include <asm/mce.h>
2
3enum severity_level {
4 MCE_NO_SEVERITY,
5 MCE_KEEP_SEVERITY,
6 MCE_SOME_SEVERITY,
7 MCE_AO_SEVERITY,
8 MCE_UC_SEVERITY,
9 MCE_AR_SEVERITY,
10 MCE_PANIC_SEVERITY,
11};
12
13int mce_severity(struct mce *a, int tolerant, char **msg);
14
15extern int mce_ser;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
new file mode 100644
index 00000000000..ff0807f9705
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -0,0 +1,218 @@
1/*
2 * MCE grading rules.
3 * Copyright 2008, 2009 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Author: Andi Kleen
11 */
12#include <linux/kernel.h>
13#include <linux/seq_file.h>
14#include <linux/init.h>
15#include <linux/debugfs.h>
16#include <asm/mce.h>
17
18#include "mce-internal.h"
19
20/*
21 * Grade an mce by severity. In general the most severe ones are processed
 22 * first. Since there are quite a lot of combinations, test the bits in a
23 * table-driven way. The rules are simply processed in order, first
24 * match wins.
25 *
26 * Note this is only used for machine check exceptions, the corrected
27 * errors use much simpler rules. The exceptions still check for the corrected
28 * errors, but only to leave them alone for the CMCI handler (except for
29 * panic situations)
30 */
31
32enum context { IN_KERNEL = 1, IN_USER = 2 };
33enum ser { SER_REQUIRED = 1, NO_SER = 2 };
34
35static struct severity {
36 u64 mask;
37 u64 result;
38 unsigned char sev;
39 unsigned char mcgmask;
40 unsigned char mcgres;
41 unsigned char ser;
42 unsigned char context;
43 unsigned char covered;
44 char *msg;
45} severities[] = {
46#define KERNEL .context = IN_KERNEL
47#define USER .context = IN_USER
48#define SER .ser = SER_REQUIRED
49#define NOSER .ser = NO_SER
50#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
51#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
52#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
53#define MCGMASK(x, res, s, m, r...) \
54 { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
55#define MASK(x, y, s, m, r...) \
56 { .mask = x, .result = y, SEV(s), .msg = m, ## r }
57#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
58#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
59#define MCACOD 0xffff
60
61 BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
62 BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
63 BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
64 /* When MCIP is not set something is very confused */
65 MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
 66 /* Neither return nor error IP -- no chance to recover -> PANIC */
67 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
68 "Neither restart nor error IP"),
69 MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
70 KERNEL),
71 BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
72 MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
73 "Spurious not enabled", SER),
74
75 /* ignore OVER for UCNA */
76 MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
77 "Uncorrected no action required", SER),
78 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
79 "Illegal combination (UCNA with AR=1)", SER),
80 MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
81
82 /* AR add known MCACODs here */
83 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
84 "Action required with lost events", SER),
85 MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
86 "Action required; unknown MCACOD", SER),
87
88 /* known AO MCACODs: */
89 MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
90 "Action optional: memory scrubbing error", SER),
91 MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
92 "Action optional: last level cache writeback error", SER),
93
94 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
95 "Action optional unknown MCACOD", SER),
96 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
97 "Action optional with lost events", SER),
98 BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
99 BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
100 BITSET(0, SOME, "No match") /* always matches. keep at end */
101};
102
103/*
104 * If the EIPV bit is set, it means the saved IP is the
105 * instruction which caused the MCE.
106 */
107static int error_context(struct mce *m)
108{
109 if (m->mcgstatus & MCG_STATUS_EIPV)
110 return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
111 /* Unknown, assume kernel */
112 return IN_KERNEL;
113}
114
115int mce_severity(struct mce *a, int tolerant, char **msg)
116{
117 enum context ctx = error_context(a);
118 struct severity *s;
119
120 for (s = severities;; s++) {
121 if ((a->status & s->mask) != s->result)
122 continue;
123 if ((a->mcgstatus & s->mcgmask) != s->mcgres)
124 continue;
125 if (s->ser == SER_REQUIRED && !mce_ser)
126 continue;
127 if (s->ser == NO_SER && mce_ser)
128 continue;
129 if (s->context && ctx != s->context)
130 continue;
131 if (msg)
132 *msg = s->msg;
133 s->covered = 1;
134 if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
135 if (panic_on_oops || tolerant < 1)
136 return MCE_PANIC_SEVERITY;
137 }
138 return s->sev;
139 }
140}
141
142static void *s_start(struct seq_file *f, loff_t *pos)
143{
144 if (*pos >= ARRAY_SIZE(severities))
145 return NULL;
146 return &severities[*pos];
147}
148
149static void *s_next(struct seq_file *f, void *data, loff_t *pos)
150{
151 if (++(*pos) >= ARRAY_SIZE(severities))
152 return NULL;
153 return &severities[*pos];
154}
155
156static void s_stop(struct seq_file *f, void *data)
157{
158}
159
160static int s_show(struct seq_file *f, void *data)
161{
162 struct severity *ser = data;
163 seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
164 return 0;
165}
166
167static const struct seq_operations severities_seq_ops = {
168 .start = s_start,
169 .next = s_next,
170 .stop = s_stop,
171 .show = s_show,
172};
173
174static int severities_coverage_open(struct inode *inode, struct file *file)
175{
176 return seq_open(file, &severities_seq_ops);
177}
178
179static ssize_t severities_coverage_write(struct file *file,
180 const char __user *ubuf,
181 size_t count, loff_t *ppos)
182{
183 int i;
184 for (i = 0; i < ARRAY_SIZE(severities); i++)
185 severities[i].covered = 0;
186 return count;
187}
188
189static const struct file_operations severities_coverage_fops = {
190 .open = severities_coverage_open,
191 .release = seq_release,
192 .read = seq_read,
193 .write = severities_coverage_write,
194};
195
196static int __init severities_debugfs_init(void)
197{
198 struct dentry *dmce = NULL, *fseverities_coverage = NULL;
199
200 dmce = debugfs_create_dir("mce", NULL);
201 if (dmce == NULL)
202 goto err_out;
203 fseverities_coverage = debugfs_create_file("severities-coverage",
204 0444, dmce, NULL,
205 &severities_coverage_fops);
206 if (fseverities_coverage == NULL)
207 goto err_out;
208
209 return 0;
210
211err_out:
212 if (fseverities_coverage)
213 debugfs_remove(fseverities_coverage);
214 if (dmce)
215 debugfs_remove(dmce);
216 return -ENOMEM;
217}
218late_initcall(severities_debugfs_init);
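mce_severity() walks severities[] in order and the first rule whose masked status equals its expected result wins. A tiny stand-alone illustration of the mask/result matching (the rules and bit values here are made up):

#include <stdio.h>
#include <stdint.h>

struct rule {
	uint64_t mask;		/* bits the rule looks at */
	uint64_t result;	/* value those bits must have */
	const char *msg;
};

/* First match wins, so order from most to least specific. */
static const struct rule rules[] = {
	{ 0x1, 0x0, "invalid (VAL clear)" },
	{ 0x6, 0x6, "uncorrected and overflowed" },
	{ 0x4, 0x4, "uncorrected" },
	{ 0x0, 0x0, "no match (catch-all)" },
};

static const char *grade(uint64_t status)
{
	unsigned i;

	/* The catch-all rule always matches, so the loop terminates. */
	for (i = 0; ; i++)
		if ((status & rules[i].mask) == rules[i].result)
			return rules[i].msg;
}

int main(void)
{
	printf("%s\n", grade(0x5));	/* VAL + UC -> "uncorrected" */
	printf("%s\n", grade(0x7));	/* VAL + UC + OVER -> "uncorrected and overflowed" */
	return 0;
}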
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
new file mode 100644
index 00000000000..fabba15e455
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,1964 @@
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h>
15#include <linux/kallsyms.h>
16#include <linux/rcupdate.h>
17#include <linux/kobject.h>
18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
22#include <linux/string.h>
23#include <linux/sysdev.h>
24#include <linux/delay.h>
25#include <linux/ctype.h>
26#include <linux/sched.h>
27#include <linux/sysfs.h>
28#include <linux/types.h>
29#include <linux/init.h>
30#include <linux/kmod.h>
31#include <linux/poll.h>
32#include <linux/nmi.h>
33#include <linux/cpu.h>
34#include <linux/smp.h>
35#include <linux/fs.h>
36#include <linux/mm.h>
37
38#include <asm/processor.h>
39#include <asm/hw_irq.h>
40#include <asm/apic.h>
41#include <asm/idle.h>
42#include <asm/ipi.h>
43#include <asm/mce.h>
44#include <asm/msr.h>
45
46#include "mce-internal.h"
47#include "mce.h"
48
49/* Handle unconfigured int18 (should never happen) */
50static void unexpected_machine_check(struct pt_regs *regs, long error_code)
51{
52 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
53 smp_processor_id());
54}
55
56/* Call the installed machine check handler for this CPU setup. */
57void (*machine_check_vector)(struct pt_regs *, long error_code) =
58 unexpected_machine_check;
59
60int mce_disabled;
61
62#ifdef CONFIG_X86_NEW_MCE
63
64#define MISC_MCELOG_MINOR 227
65
66#define SPINUNIT 100 /* 100ns */
67
68atomic_t mce_entry;
69
70DEFINE_PER_CPU(unsigned, mce_exception_count);
71
72/*
73 * Tolerant levels:
74 * 0: always panic on uncorrected errors, log corrected errors
75 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
76 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
77 * 3: never panic or SIGBUS, log all errors (for testing only)
78 */
79static int tolerant = 1;
80static int banks;
81static u64 *bank;
82static unsigned long notify_user;
83static int rip_msr;
84static int mce_bootlog = -1;
85static int monarch_timeout = -1;
86static int mce_panic_timeout;
87static int mce_dont_log_ce;
88int mce_cmci_disabled;
89int mce_ignore_ce;
90int mce_ser;
91
92static char trigger[128];
93static char *trigger_argv[2] = { trigger, NULL };
94
95static unsigned long dont_init_banks;
96
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
98static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing;
100
101
102/* MCA banks polled by the period polling timer for corrected events */
103DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
105};
106
107static inline int skip_bank_init(int i)
108{
109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
110}
111
112static DEFINE_PER_CPU(struct work_struct, mce_work);
113
114/* Do initial initialization of a struct mce */
115void mce_setup(struct mce *m)
116{
117 memset(m, 0, sizeof(struct mce));
118 m->cpu = m->extcpu = smp_processor_id();
119 rdtscll(m->tsc);
120 /* We hope get_seconds stays lockless */
121 m->time = get_seconds();
122 m->cpuvendor = boot_cpu_data.x86_vendor;
123 m->cpuid = cpuid_eax(1);
124#ifdef CONFIG_SMP
125 m->socketid = cpu_data(m->extcpu).phys_proc_id;
126#endif
127 m->apicid = cpu_data(m->extcpu).initial_apicid;
128 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
129}
130
131DEFINE_PER_CPU(struct mce, injectm);
132EXPORT_PER_CPU_SYMBOL_GPL(injectm);
133
134/*
135 * Lockless MCE logging infrastructure.
136 * This avoids deadlocks on printk locks without having to break locks. Also
137 * separate MCEs from kernel messages to avoid bogus bug reports.
138 */
139
140static struct mce_log mcelog = {
141 .signature = MCE_LOG_SIGNATURE,
142 .len = MCE_LOG_LEN,
143 .recordlen = sizeof(struct mce),
144};
145
146void mce_log(struct mce *mce)
147{
148 unsigned next, entry;
149
150 mce->finished = 0;
151 wmb();
152 for (;;) {
153 entry = rcu_dereference(mcelog.next);
154 for (;;) {
155 /*
156 * When the buffer fills up discard new entries.
157 * Assume that the earlier errors are the more
158 * interesting ones:
159 */
160 if (entry >= MCE_LOG_LEN) {
161 set_bit(MCE_OVERFLOW,
162 (unsigned long *)&mcelog.flags);
163 return;
164 }
165 /* Old left over entry. Skip: */
166 if (mcelog.entry[entry].finished) {
167 entry++;
168 continue;
169 }
170 break;
171 }
172 smp_rmb();
173 next = entry + 1;
174 if (cmpxchg(&mcelog.next, entry, next) == entry)
175 break;
176 }
177 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
178 wmb();
179 mcelog.entry[entry].finished = 1;
180 wmb();
181
182 mce->finished = 1;
183 set_bit(0, &notify_user);
184}
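mce_log() reserves a slot in the global log by advancing mcelog.next with cmpxchg(), so concurrent loggers never claim the same entry. A user-space sketch of that reservation loop with the GCC compare-and-swap builtin (sizes and names are illustrative):

#include <stdio.h>

#define LOG_LEN 32

static int log_entries[LOG_LEN];
static unsigned log_next;	/* index of the next free slot */

/* Returns the reserved slot, or -1 when the log is full. */
static int reserve_slot(void)
{
	unsigned entry;

	for (;;) {
		entry = log_next;
		if (entry >= LOG_LEN)
			return -1;	/* full: drop the record */
		/* Claim the slot only if nobody raced us to it. */
		if (__sync_bool_compare_and_swap(&log_next, entry, entry + 1))
			return entry;
	}
}

int main(void)
{
	int slot = reserve_slot();

	if (slot >= 0) {
		log_entries[slot] = 42;
		printf("wrote slot %d\n", slot);
	}
	return 0;
}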
185
186static void print_mce(struct mce *m)
187{
188 printk(KERN_EMERG
189 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
190 m->extcpu, m->mcgstatus, m->bank, m->status);
191 if (m->ip) {
192 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
193 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
194 m->cs, m->ip);
195 if (m->cs == __KERNEL_CS)
196 print_symbol("{%s}", m->ip);
197 printk("\n");
198 }
199 printk(KERN_EMERG "TSC %llx ", m->tsc);
200 if (m->addr)
201 printk("ADDR %llx ", m->addr);
202 if (m->misc)
203 printk("MISC %llx ", m->misc);
204 printk("\n");
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid);
208}
209
210static void print_mce_head(void)
211{
212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
213}
214
215static void print_mce_tail(void)
216{
217 printk(KERN_EMERG "This is not a software problem!\n"
218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
219}
220
221#define PANIC_TIMEOUT 5 /* 5 seconds */
222
223static atomic_t mce_paniced;
224
225/* Panic in progress. Enable interrupts and wait for final IPI */
226static void wait_for_panic(void)
227{
228 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
229 preempt_disable();
230 local_irq_enable();
231 while (timeout-- > 0)
232 udelay(1);
233 if (panic_timeout == 0)
234 panic_timeout = mce_panic_timeout;
235 panic("Panicing machine check CPU died");
236}
237
238static void mce_panic(char *msg, struct mce *final, char *exp)
239{
240 int i;
241
242 /*
243 * Make sure only one CPU runs in machine check panic
244 */
245 if (atomic_add_return(1, &mce_paniced) > 1)
246 wait_for_panic();
247 barrier();
248
249 bust_spinlocks(1);
250 console_verbose();
251 print_mce_head();
252 /* First print corrected ones that are still unlogged */
253 for (i = 0; i < MCE_LOG_LEN; i++) {
254 struct mce *m = &mcelog.entry[i];
255 if (!(m->status & MCI_STATUS_VAL))
256 continue;
257 if (!(m->status & MCI_STATUS_UC))
258 print_mce(m);
259 }
260 /* Now print uncorrected but with the final one last */
261 for (i = 0; i < MCE_LOG_LEN; i++) {
262 struct mce *m = &mcelog.entry[i];
263 if (!(m->status & MCI_STATUS_VAL))
264 continue;
265 if (!(m->status & MCI_STATUS_UC))
266 continue;
267 if (!final || memcmp(m, final, sizeof(struct mce)))
268 print_mce(m);
269 }
270 if (final)
271 print_mce(final);
272 if (cpu_missing)
273 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
274 print_mce_tail();
275 if (exp)
276 printk(KERN_EMERG "Machine check: %s\n", exp);
277 if (panic_timeout == 0)
278 panic_timeout = mce_panic_timeout;
279 panic(msg);
280}
281
282/* Support code for software error injection */
283
284static int msr_to_offset(u32 msr)
285{
286 unsigned bank = __get_cpu_var(injectm.bank);
287 if (msr == rip_msr)
288 return offsetof(struct mce, ip);
289 if (msr == MSR_IA32_MC0_STATUS + bank*4)
290 return offsetof(struct mce, status);
291 if (msr == MSR_IA32_MC0_ADDR + bank*4)
292 return offsetof(struct mce, addr);
293 if (msr == MSR_IA32_MC0_MISC + bank*4)
294 return offsetof(struct mce, misc);
295 if (msr == MSR_IA32_MCG_STATUS)
296 return offsetof(struct mce, mcgstatus);
297 return -1;
298}
299
300/* MSR access wrappers used for error injection */
301static u64 mce_rdmsrl(u32 msr)
302{
303 u64 v;
304 if (__get_cpu_var(injectm).finished) {
305 int offset = msr_to_offset(msr);
306 if (offset < 0)
307 return 0;
308 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
309 }
310 rdmsrl(msr, v);
311 return v;
312}
313
314static void mce_wrmsrl(u32 msr, u64 v)
315{
316 if (__get_cpu_var(injectm).finished) {
317 int offset = msr_to_offset(msr);
318 if (offset >= 0)
319 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
320 return;
321 }
322 wrmsrl(msr, v);
323}
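msr_to_offset() maps an MSR number to the offset of the matching struct mce field, which lets mce_rdmsrl()/mce_wrmsrl() serve injected values instead of touching hardware. A user-space sketch of the offsetof() redirection (the struct and register numbers are hypothetical):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct fake_regs {
	uint64_t status;
	uint64_t addr;
};

static struct fake_regs injected = { 0xdead, 0xbeef };

/* Map a register number onto a field offset, -1 if not emulated. */
static int reg_to_offset(unsigned reg)
{
	switch (reg) {
	case 0x401: return offsetof(struct fake_regs, status);
	case 0x402: return offsetof(struct fake_regs, addr);
	default:    return -1;
	}
}

static uint64_t read_reg(unsigned reg)
{
	int off = reg_to_offset(reg);

	if (off < 0)
		return 0;	/* the kernel would fall back to real hardware here */
	return *(uint64_t *)((char *)&injected + off);
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)read_reg(0x401));
	return 0;
}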
324
325/*
326 * Simple lockless ring to communicate PFNs from the exception handler with the
327 * process context work function. This is vastly simplified because there's
328 * only a single reader and a single writer.
329 */
330#define MCE_RING_SIZE 16 /* we use one entry less */
331
332struct mce_ring {
333 unsigned short start;
334 unsigned short end;
335 unsigned long ring[MCE_RING_SIZE];
336};
337static DEFINE_PER_CPU(struct mce_ring, mce_ring);
338
339/* Runs with CPU affinity in workqueue */
340static int mce_ring_empty(void)
341{
342 struct mce_ring *r = &__get_cpu_var(mce_ring);
343
344 return r->start == r->end;
345}
346
347static int mce_ring_get(unsigned long *pfn)
348{
349 struct mce_ring *r;
350 int ret = 0;
351
352 *pfn = 0;
353 get_cpu();
354 r = &__get_cpu_var(mce_ring);
355 if (r->start == r->end)
356 goto out;
357 *pfn = r->ring[r->start];
358 r->start = (r->start + 1) % MCE_RING_SIZE;
359 ret = 1;
360out:
361 put_cpu();
362 return ret;
363}
364
365/* Always runs in MCE context with preempt off */
366static int mce_ring_add(unsigned long pfn)
367{
368 struct mce_ring *r = &__get_cpu_var(mce_ring);
369 unsigned next;
370
371 next = (r->end + 1) % MCE_RING_SIZE;
372 if (next == r->start)
373 return -1;
374 r->ring[r->end] = pfn;
375 wmb();
376 r->end = next;
377 return 0;
378}
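The PFN ring is a single-producer/single-consumer circular buffer that keeps one slot unused: start == end means empty, (end + 1) % SIZE == start means full. A small stand-alone version of the same ring (illustrative, not the kernel's per-CPU instance):

#include <stdio.h>

#define RING_SIZE 16	/* one slot is always left unused */

struct ring {
	unsigned short start;	/* consumer index */
	unsigned short end;	/* producer index */
	unsigned long slot[RING_SIZE];
};

static int ring_add(struct ring *r, unsigned long v)
{
	unsigned next = (r->end + 1) % RING_SIZE;

	if (next == r->start)
		return -1;	/* full */
	r->slot[r->end] = v;
	r->end = next;
	return 0;
}

static int ring_get(struct ring *r, unsigned long *v)
{
	if (r->start == r->end)
		return 0;	/* empty */
	*v = r->slot[r->start];
	r->start = (r->start + 1) % RING_SIZE;
	return 1;
}

int main(void)
{
	struct ring r = { 0, 0, { 0 } };
	unsigned long v;

	ring_add(&r, 0x1234);
	if (ring_get(&r, &v))
		printf("got %#lx\n", v);
	return 0;
}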
379
380int mce_available(struct cpuinfo_x86 *c)
381{
382 if (mce_disabled)
383 return 0;
384 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
385}
386
387static void mce_schedule_work(void)
388{
389 if (!mce_ring_empty()) {
390 struct work_struct *work = &__get_cpu_var(mce_work);
391 if (!work_pending(work))
392 schedule_work(work);
393 }
394}
395
396/*
397 * Get the address of the instruction at the time of the machine check
398 * error.
399 */
400static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
401{
402
403 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
404 m->ip = regs->ip;
405 m->cs = regs->cs;
406 } else {
407 m->ip = 0;
408 m->cs = 0;
409 }
410 if (rip_msr)
411 m->ip = mce_rdmsrl(rip_msr);
412}
413
414#ifdef CONFIG_X86_LOCAL_APIC
415/*
416 * Called after interrupts have been reenabled again
417 * when a MCE happened during an interrupts off region
418 * in the kernel.
419 */
420asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
421{
422 ack_APIC_irq();
423 exit_idle();
424 irq_enter();
425 mce_notify_irq();
426 mce_schedule_work();
427 irq_exit();
428}
429#endif
430
431static void mce_report_event(struct pt_regs *regs)
432{
433 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
434 mce_notify_irq();
435 /*
436 * Triggering the work queue here is just an insurance
437 * policy in case the syscall exit notify handler
438 * doesn't run soon enough or ends up running on the
439 * wrong CPU (can happen when audit sleeps)
440 */
441 mce_schedule_work();
442 return;
443 }
444
445#ifdef CONFIG_X86_LOCAL_APIC
446 /*
447 * Without APIC do not notify. The event will be picked
448 * up eventually.
449 */
450 if (!cpu_has_apic)
451 return;
452
453 /*
454 * When interrupts are disabled we cannot use
455 * kernel services safely. Trigger a self interrupt
456 * through the APIC to instead do the notification
457 * after interrupts are reenabled again.
458 */
459 apic->send_IPI_self(MCE_SELF_VECTOR);
460
461 /*
462 * Wait for idle afterwards again so that we don't leave the
463 * APIC in a non idle state because the normal APIC writes
464 * cannot exclude us.
465 */
466 apic_wait_icr_idle();
467#endif
468}
469
470DEFINE_PER_CPU(unsigned, mce_poll_count);
471
472/*
473 * Poll for corrected events or events that happened before reset.
474 * Those are just logged through /dev/mcelog.
475 *
476 * This is executed in standard interrupt context.
477 *
478 * Note: the spec recommends panicking for fatal unsignalled
479 * errors here. However this would be quite problematic --
480 * we would need to reimplement the Monarch handling and
481 * it would mess up the exclusion between the exception handler
482 * and the poll handler -- so we skip this for now.
483 * These cases should not happen anyway, or only when the CPU
484 * is already totally confused. In this case it's likely it will
485 * not fully execute the machine check handler either.
486 */
487void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
488{
489 struct mce m;
490 int i;
491
492 __get_cpu_var(mce_poll_count)++;
493
494 mce_setup(&m);
495
496 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
497 for (i = 0; i < banks; i++) {
498 if (!bank[i] || !test_bit(i, *b))
499 continue;
500
501 m.misc = 0;
502 m.addr = 0;
503 m.bank = i;
504 m.tsc = 0;
505
506 barrier();
507 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
508 if (!(m.status & MCI_STATUS_VAL))
509 continue;
510
511 /*
512 * Uncorrected or signalled events are handled by the exception
513 * handler when it is enabled, so don't process those here.
514 *
515 * TBD do the same check for MCI_STATUS_EN here?
516 */
517 if (!(flags & MCP_UC) &&
518 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
519 continue;
520
521 if (m.status & MCI_STATUS_MISCV)
522 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
523 if (m.status & MCI_STATUS_ADDRV)
524 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
525
526 if (!(flags & MCP_TIMESTAMP))
527 m.tsc = 0;
528 /*
529 * Don't get the IP here because it's unlikely to
530 * have anything to do with the actual error location.
531 */
532 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
533 mce_log(&m);
534 add_taint(TAINT_MACHINE_CHECK);
535 }
536
537 /*
538 * Clear state for this bank.
539 */
540 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
541 }
542
543 /*
544 * Don't clear MCG_STATUS here because it's only defined for
545 * exceptions.
546 */
547
548 sync_core();
549}
550EXPORT_SYMBOL_GPL(machine_check_poll);
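
/*
 * Besides the boot time scan in mce_init(), machine_check_poll() is
 * driven from the periodic mcheck_timer below and, on Intel CPUs with
 * CMCI support, from the corrected machine check interrupt handler.
 */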
551
552/*
553 * Do a quick check if any of the events requires a panic.
554 * This decides if we keep the events around or clear them.
555 */
556static int mce_no_way_out(struct mce *m, char **msg)
557{
558 int i;
559
560 for (i = 0; i < banks; i++) {
561 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
562 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
563 return 1;
564 }
565 return 0;
566}
567
568/*
569 * Variable to establish order between CPUs while scanning.
571 * Each CPU spins initially until mce_executing equals its callin number.
571 */
572static atomic_t mce_executing;
573
574/*
575 * Defines order of CPUs on entry. First CPU becomes Monarch.
576 */
577static atomic_t mce_callin;
578
579/*
580 * Check if a timeout waiting for other CPUs happened.
581 */
582static int mce_timed_out(u64 *t)
583{
584 /*
585 * The others already did panic for some reason.
586 * Bail out like in a timeout.
587 * rmb() to tell the compiler that system_state
588 * might have been modified by someone else.
589 */
590 rmb();
591 if (atomic_read(&mce_paniced))
592 wait_for_panic();
593 if (!monarch_timeout)
594 goto out;
595 if ((s64)*t < SPINUNIT) {
596 /* CHECKME: Make panic default for 1 too? */
597 if (tolerant < 1)
598 mce_panic("Timeout synchronizing machine check over CPUs",
599 NULL, NULL);
600 cpu_missing = 1;
601 return 1;
602 }
603 *t -= SPINUNIT;
604out:
605 touch_nmi_watchdog();
606 return 0;
607}
608
609/*
610 * The Monarch's reign. The Monarch is the CPU who entered
611 * the machine check handler first. It waits for the others to
612 * raise the exception too and then grades them. If any error is
613 * fatal it panics; only then are the others allowed to continue.
614 *
615 * The other CPUs entering the MCE handler will be controlled by the
616 * Monarch. They are called Subjects.
617 *
618 * This way we prevent any potential data corruption in an unrecoverable case
619 * and also make sure that all CPUs' errors are always examined.
620 *
621 * This also detects the case of a machine check event coming from outer
622 * space (not detected by any CPU). In this case some external agent wants
623 * us to shut down, so panic too.
624 *
625 * The other CPUs might still decide to panic if the handler happens
626 * in an unrecoverable place, but in this case the system is in a semi-stable
627 * state and won't corrupt anything by itself. It's ok to let the others
628 * continue for a bit first.
629 *
630 * All the spin loops have timeouts; when a timeout happens a CPU
631 * typically elects itself to be Monarch.
632 */
633static void mce_reign(void)
634{
635 int cpu;
636 struct mce *m = NULL;
637 int global_worst = 0;
638 char *msg = NULL;
639 char *nmsg = NULL;
640
641 /*
642 * This CPU is the Monarch and the other CPUs have run
643 * through their handlers.
644 * Grade the severity of the errors of all the CPUs.
645 */
646 for_each_possible_cpu(cpu) {
647 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
648 &nmsg);
649 if (severity > global_worst) {
650 msg = nmsg;
651 global_worst = severity;
652 m = &per_cpu(mces_seen, cpu);
653 }
654 }
655
656 /*
657 * Cannot recover? Panic here then.
658 * This dumps all the mces in the log buffer and stops the
659 * other CPUs.
660 */
661 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
662 mce_panic("Fatal Machine check", m, msg);
663
664 /*
665 * For UC somewhere we let the CPU that detects it handle it.
666 * The others must also be allowed to continue, otherwise the
667 * handling CPU could deadlock on a lock.
668 */
669
670 /*
671 * No machine check event found. Must be some external
672 * source or one CPU is hung. Panic.
673 */
674 if (!m && tolerant < 3)
675 mce_panic("Machine check from unknown source", NULL, NULL);
676
677 /*
678 * Now clear all the mces_seen so that they don't reappear on
679 * the next mce.
680 */
681 for_each_possible_cpu(cpu)
682 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
683}
684
685static atomic_t global_nwo;
686
687/*
688 * Start of Monarch synchronization. This waits until all CPUs have
689 * entered the exception handler and then determines if any of them
690 * saw a fatal event that requires a panic. Then it releases the CPUs
691 * to run their scanning loops one by one in callin order.
692 * TBD double check parallel CPU hotunplug
693 */
694static int mce_start(int no_way_out, int *order)
695{
696 int nwo;
697 int cpus = num_online_cpus();
698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
699
700 if (!timeout) {
701 *order = -1;
702 return no_way_out;
703 }
704
705 atomic_add(no_way_out, &global_nwo);
706
707 /*
708 * Wait for everyone.
709 */
710 while (atomic_read(&mce_callin) != cpus) {
711 if (mce_timed_out(&timeout)) {
712 atomic_set(&global_nwo, 0);
713 *order = -1;
714 return no_way_out;
715 }
716 ndelay(SPINUNIT);
717 }
718
719 /*
720 * Cache the global no_way_out state.
721 */
722 nwo = atomic_read(&global_nwo);
723
724 /*
725 * Monarch starts executing now, the others wait.
726 */
727 if (*order == 1) {
728 atomic_set(&mce_executing, 1);
729 return nwo;
730 }
731
732 /*
733 * Now start the scanning loop one by one
734 * in the original callin order.
735 * This way an error in a shared bank is seen by only
736 * one CPU before it is cleared, avoiding duplicates.
737 */
738 while (atomic_read(&mce_executing) < *order) {
739 if (mce_timed_out(&timeout)) {
740 atomic_set(&global_nwo, 0);
741 *order = -1;
742 return no_way_out;
743 }
744 ndelay(SPINUNIT);
745 }
746 return nwo;
747}
748
749/*
750 * Synchronize between CPUs after main scanning loop.
751 * This invokes the bulk of the Monarch processing.
752 */
753static int mce_end(int order)
754{
755 int ret = -1;
756 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
757
758 if (!timeout)
759 goto reset;
760 if (order < 0)
761 goto reset;
762
763 /*
764 * Allow others to run.
765 */
766 atomic_inc(&mce_executing);
767
768 if (order == 1) {
769 /* CHECKME: Can this race with a parallel hotplug? */
770 int cpus = num_online_cpus();
771
772 /*
773 * Monarch: Wait for everyone to go through their scanning
774 * loops.
775 */
776 while (atomic_read(&mce_executing) <= cpus) {
777 if (mce_timed_out(&timeout))
778 goto reset;
779 ndelay(SPINUNIT);
780 }
781
782 mce_reign();
783 barrier();
784 ret = 0;
785 } else {
786 /*
787 * Subject: Wait for Monarch to finish.
788 */
789 while (atomic_read(&mce_executing) != 0) {
790 if (mce_timed_out(&timeout))
791 goto reset;
792 ndelay(SPINUNIT);
793 }
794
795 /*
796 * Don't reset anything. That's done by the Monarch.
797 */
798 return 0;
799 }
800
801 /*
802 * Reset all global state.
803 */
804reset:
805 atomic_set(&global_nwo, 0);
806 atomic_set(&mce_callin, 0);
807 barrier();
808
809 /*
810 * Let others run again.
811 */
812 atomic_set(&mce_executing, 0);
813 return ret;
814}
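
/*
 * Rough rendezvous sketch for three CPUs ('order' is the
 * atomic_add_return() value from mce_callin in do_machine_check()):
 *
 *	CPU with order 1 (Monarch) waits until mce_callin == 3, sets
 *	mce_executing to 1 and scans its banks.
 *	CPU with order 2 waits in mce_start() until mce_executing >= 2.
 *	CPU with order 3 waits in mce_start() until mce_executing >= 3.
 *
 * Each CPU increments mce_executing in mce_end() when its scan is
 * done, so every Subject is released exactly when its predecessor has
 * finished, and the Monarch reigns once mce_executing exceeds the
 * number of online CPUs.
 */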
815
816/*
817 * Check if the address reported by the CPU is in a format we can parse.
818 * It would be possible to add code for most other cases, but all would
819 * be somewhat complicated (e.g. segment offset would require an instruction
820 * parser). So only support physical addresses up to page granularity for now.
821 */
822static int mce_usable_address(struct mce *m)
823{
824 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
825 return 0;
826 if ((m->misc & 0x3f) > PAGE_SHIFT)
827 return 0;
828 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
829 return 0;
830 return 1;
831}
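
/*
 * The MCi_MISC layout assumed above: bits 5:0 give the least
 * significant valid bit of the recorded address, so a value larger
 * than PAGE_SHIFT means the address cannot be resolved to a single
 * page, and bits 8:6 encode the address mode, which must be
 * MCM_ADDR_PHYS for a physical address usable by memory_failure().
 */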
832
833static void mce_clear_state(unsigned long *toclear)
834{
835 int i;
836
837 for (i = 0; i < banks; i++) {
838 if (test_bit(i, toclear))
839 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
840 }
841}
842
843/*
844 * The actual machine check handler. This only handles real
845 * exceptions when something got corrupted coming in through int 18.
846 *
847 * This is executed in NMI context not subject to normal locking rules. This
848 * implies that most kernel services cannot be safely used. Don't even
849 * think about putting a printk in there!
850 *
851 * On Intel systems this is entered on all CPUs in parallel through
852 * MCE broadcast. However some CPUs might be broken beyond repair,
853 * so always be careful when synchronizing with others.
854 */
855void do_machine_check(struct pt_regs *regs, long error_code)
856{
857 struct mce m, *final;
858 int i;
859 int worst = 0;
860 int severity;
861 /*
862 * Establish sequential order between the CPUs entering the machine
863 * check handler.
864 */
865 int order;
866
867 /*
868 * If no_way_out gets set, there is no safe way to recover from this
869 * MCE. If tolerant is cranked up, we'll try anyway.
870 */
871 int no_way_out = 0;
872 /*
873 * If kill_it gets set, there might be a way to recover from this
874 * error.
875 */
876 int kill_it = 0;
877 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
878 char *msg = "Unknown";
879
880 atomic_inc(&mce_entry);
881
882 __get_cpu_var(mce_exception_count)++;
883
884 if (notify_die(DIE_NMI, "machine check", regs, error_code,
885 18, SIGKILL) == NOTIFY_STOP)
886 goto out;
887 if (!banks)
888 goto out;
889
890 order = atomic_add_return(1, &mce_callin);
891 mce_setup(&m);
892
893 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
894 no_way_out = mce_no_way_out(&m, &msg);
895
896 final = &__get_cpu_var(mces_seen);
897 *final = m;
898
899 barrier();
900
901 /*
902	 * When there is no restart IP we must always kill or panic.
903 */
904 if (!(m.mcgstatus & MCG_STATUS_RIPV))
905 kill_it = 1;
906
907 /*
908 * Go through all the banks in exclusion of the other CPUs.
909 * This way we don't report duplicated events on shared banks
910 * because the first one to see it will clear it.
911 */
912 no_way_out = mce_start(no_way_out, &order);
913 for (i = 0; i < banks; i++) {
914 __clear_bit(i, toclear);
915 if (!bank[i])
916 continue;
917
918 m.misc = 0;
919 m.addr = 0;
920 m.bank = i;
921
922 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
923 if ((m.status & MCI_STATUS_VAL) == 0)
924 continue;
925
926 /*
927		 * Non-uncorrected or non-signalled errors are handled by
928		 * machine_check_poll(). Leave them alone, unless this panics.
929 */
930 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
931 !no_way_out)
932 continue;
933
934 /*
935 * Set taint even when machine check was not enabled.
936 */
937 add_taint(TAINT_MACHINE_CHECK);
938
939 severity = mce_severity(&m, tolerant, NULL);
940
941 /*
942		 * When the machine check was for the corrected handler, don't
943		 * touch it unless we're panicking.
944 */
945 if (severity == MCE_KEEP_SEVERITY && !no_way_out)
946 continue;
947 __set_bit(i, toclear);
948 if (severity == MCE_NO_SEVERITY) {
949 /*
950 * Machine check event was not enabled. Clear, but
951 * ignore.
952 */
953 continue;
954 }
955
956 /*
957 * Kill on action required.
958 */
959 if (severity == MCE_AR_SEVERITY)
960 kill_it = 1;
961
962 if (m.status & MCI_STATUS_MISCV)
963 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
964 if (m.status & MCI_STATUS_ADDRV)
965 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
966
967 /*
968 * Action optional error. Queue address for later processing.
969 * When the ring overflows we just ignore the AO error.
970 * RED-PEN add some logging mechanism when
971		 * mce_usable_address() or mce_ring_add() fails.
972 * RED-PEN don't ignore overflow for tolerant == 0
973 */
974 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
975 mce_ring_add(m.addr >> PAGE_SHIFT);
976
977 mce_get_rip(&m, regs);
978 mce_log(&m);
979
980 if (severity > worst) {
981 *final = m;
982 worst = severity;
983 }
984 }
985
986 if (!no_way_out)
987 mce_clear_state(toclear);
988
989 /*
990 * Do most of the synchronization with other CPUs.
991 * When there's any problem use only local no_way_out state.
992 */
993 if (mce_end(order) < 0)
994 no_way_out = worst >= MCE_PANIC_SEVERITY;
995
996 /*
997 * If we have decided that we just CAN'T continue, and the user
998 * has not set tolerant to an insane level, give up and die.
999 *
1000 * This is mainly used in the case when the system doesn't
1001 * support MCE broadcasting or it has been disabled.
1002 */
1003 if (no_way_out && tolerant < 3)
1004 mce_panic("Fatal machine check on current CPU", final, msg);
1005
1006 /*
1007 * If the error seems to be unrecoverable, something should be
1008 * done. Try to kill as little as possible. If we can kill just
1009 * one task, do that. If the user has set the tolerance very
1010 * high, don't try to do anything at all.
1011 */
1012
1013 if (kill_it && tolerant < 3)
1014 force_sig(SIGBUS, current);
1015
1016 /* notify userspace ASAP */
1017 set_thread_flag(TIF_MCE_NOTIFY);
1018
1019 if (worst > 0)
1020 mce_report_event(regs);
1021 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1022out:
1023 atomic_dec(&mce_entry);
1024 sync_core();
1025}
1026EXPORT_SYMBOL_GPL(do_machine_check);
1027
1028/* Dummy to break the dependency; the actual code is in mm/memory-failure.c */
1029void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1030{
1031 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1032}
1033
1034/*
1035 * Called after mce notification in process context. This code
1036 * is allowed to sleep. Call the high level VM handler to process
1037 * any corrupted pages.
1038 * Assume that the work queue code only calls this one at a time
1039 * per CPU.
1040 * Note we don't disable preemption, so this code might run on the wrong
1041 * CPU. In this case the event is picked up by the scheduled work queue.
1042 * This is merely a fast path to expedite processing in some common
1043 * cases.
1044 */
1045void mce_notify_process(void)
1046{
1047 unsigned long pfn;
1048 mce_notify_irq();
1049 while (mce_ring_get(&pfn))
1050 memory_failure(pfn, MCE_VECTOR);
1051}
1052
1053static void mce_process_work(struct work_struct *dummy)
1054{
1055 mce_notify_process();
1056}
1057
1058#ifdef CONFIG_X86_MCE_INTEL
1059/***
1060 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1062 * @status: Event status information
1063 *
1064 * This function should be called by the thermal interrupt after the
1065 * event has been processed and the decision was made to log the event
1066 * further.
1067 *
1068 * The status parameter will be saved to the 'status' field of 'struct mce'
1069 * and historically has been the register value of the
1070 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1071 */
1072void mce_log_therm_throt_event(__u64 status)
1073{
1074 struct mce m;
1075
1076 mce_setup(&m);
1077 m.bank = MCE_THERMAL_BANK;
1078 m.status = status;
1079 mce_log(&m);
1080}
1081#endif /* CONFIG_X86_MCE_INTEL */
1082
1083/*
1084 * Periodic polling timer for "silent" machine check errors. If the
1085 * poller finds an MCE, poll 2x faster. When the poller finds no more
1086 * errors, poll 2x slower (up to check_interval seconds).
1087 */
1088static int check_interval = 5 * 60; /* 5 minutes */
1089
1090static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
1091static DEFINE_PER_CPU(struct timer_list, mce_timer);
1092
1093static void mcheck_timer(unsigned long data)
1094{
1095 struct timer_list *t = &per_cpu(mce_timer, data);
1096 int *n;
1097
1098 WARN_ON(smp_processor_id() != data);
1099
1100 if (mce_available(&current_cpu_data)) {
1101 machine_check_poll(MCP_TIMESTAMP,
1102 &__get_cpu_var(mce_poll_banks));
1103 }
1104
1105 /*
1106 * Alert userspace if needed. If we logged an MCE, reduce the
1107 * polling interval, otherwise increase the polling interval.
1108 */
1109 n = &__get_cpu_var(next_interval);
1110 if (mce_notify_irq())
1111 *n = max(*n/2, HZ/100);
1112 else
1113 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1114
1115 t->expires = jiffies + *n;
1116 add_timer(t);
1117}
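
/*
 * Example with HZ=1000 and the default 5 minute check_interval: after
 * a poll that logged an event the delay halves, down to a floor of
 * HZ/100 (10ms); after quiet polls it doubles again, capped at roughly
 * 5 minutes.
 */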
1118
1119static void mce_do_trigger(struct work_struct *work)
1120{
1121 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
1122}
1123
1124static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1125
1126/*
1127 * Notify the user(s) about new machine check events.
1128 * Can be called from interrupt context, but not from machine check/NMI
1129 * context.
1130 */
1131int mce_notify_irq(void)
1132{
1133 /* Not more than two messages every minute */
1134 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1135
1136 clear_thread_flag(TIF_MCE_NOTIFY);
1137
1138 if (test_and_clear_bit(0, &notify_user)) {
1139 wake_up_interruptible(&mce_wait);
1140
1141 /*
1142 * There is no risk of missing notifications because
1143 * work_pending is always cleared before the function is
1144 * executed.
1145 */
1146 if (trigger[0] && !work_pending(&mce_trigger_work))
1147 schedule_work(&mce_trigger_work);
1148
1149 if (__ratelimit(&ratelimit))
1150 printk(KERN_INFO "Machine check events logged\n");
1151
1152 return 1;
1153 }
1154 return 0;
1155}
1156EXPORT_SYMBOL_GPL(mce_notify_irq);
1157
1158/*
1159 * Initialize Machine Checks for a CPU.
1160 */
1161static int mce_cap_init(void)
1162{
1163 unsigned b;
1164 u64 cap;
1165
1166 rdmsrl(MSR_IA32_MCG_CAP, cap);
1167
1168 b = cap & MCG_BANKCNT_MASK;
1169 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1170
1171 if (b > MAX_NR_BANKS) {
1172 printk(KERN_WARNING
1173 "MCE: Using only %u machine check banks out of %u\n",
1174 MAX_NR_BANKS, b);
1175 b = MAX_NR_BANKS;
1176 }
1177
1178 /* Don't support asymmetric configurations today */
1179 WARN_ON(banks != 0 && b != banks);
1180 banks = b;
1181 if (!bank) {
1182 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
1183 if (!bank)
1184 return -ENOMEM;
1185 memset(bank, 0xff, banks * sizeof(u64));
1186 }
1187
1188 /* Use accurate RIP reporting if available. */
1189 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1190 rip_msr = MSR_IA32_MCG_EIP;
1191
1192 if (cap & MCG_SER_P)
1193 mce_ser = 1;
1194
1195 return 0;
1196}
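
/*
 * Each bank[i] holds the value that mce_init() later writes to the
 * MCi_CTL register of bank i; the all-ones pattern set up here enables
 * reporting of every error type until a quirk or the per bank sysfs
 * file clears individual bits.
 */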
1197
1198static void mce_init(void)
1199{
1200 mce_banks_t all_banks;
1201 u64 cap;
1202 int i;
1203
1204 /*
1205 * Log the machine checks left over from the previous reset.
1206 */
1207 bitmap_fill(all_banks, MAX_NR_BANKS);
1208 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1209
1210 set_in_cr4(X86_CR4_MCE);
1211
1212 rdmsrl(MSR_IA32_MCG_CAP, cap);
1213 if (cap & MCG_CTL_P)
1214 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1215
1216 for (i = 0; i < banks; i++) {
1217 if (skip_bank_init(i))
1218 continue;
1219 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
1220 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
1221 }
1222}
1223
1224/* Add per CPU specific workarounds here */
1225static void mce_cpu_quirks(struct cpuinfo_x86 *c)
1226{
1227 /* This should be disabled by the BIOS, but isn't always */
1228 if (c->x86_vendor == X86_VENDOR_AMD) {
1229 if (c->x86 == 15 && banks > 4) {
1230 /*
1231 * disable GART TBL walk error reporting, which
1232 * trips off incorrectly with the IOMMU & 3ware
1233 * & Cerberus:
1234 */
1235 clear_bit(10, (unsigned long *)&bank[4]);
1236 }
1237 if (c->x86 <= 17 && mce_bootlog < 0) {
1238 /*
1239 * Lots of broken BIOS around that don't clear them
1240 * by default and leave crap in there. Don't log:
1241 */
1242 mce_bootlog = 0;
1243 }
1244 /*
1245 * Various K7s with broken bank 0 around. Always disable
1246 * by default.
1247 */
1248 if (c->x86 == 6)
1249 bank[0] = 0;
1250 }
1251
1252 if (c->x86_vendor == X86_VENDOR_INTEL) {
1253 /*
1254 * SDM documents that on family 6 bank 0 should not be written
1255		 * because it aliases to another special BIOS-controlled
1256		 * register.
1257		 * But it is not aliased anymore on model 0x1a+.
1258 * Don't ignore bank 0 completely because there could be a
1259 * valid event later, merely don't write CTL0.
1260 */
1261
1262 if (c->x86 == 6 && c->x86_model < 0x1A)
1263 __set_bit(0, &dont_init_banks);
1264
1265 /*
1266 * All newer Intel systems support MCE broadcasting. Enable
1267 * synchronization with a one second timeout.
1268 */
1269 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1270 monarch_timeout < 0)
1271 monarch_timeout = USEC_PER_SEC;
1272 }
1273 if (monarch_timeout < 0)
1274 monarch_timeout = 0;
1275 if (mce_bootlog != 0)
1276 mce_panic_timeout = 30;
1277}
1278
1279static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1280{
1281 if (c->x86 != 5)
1282 return;
1283 switch (c->x86_vendor) {
1284 case X86_VENDOR_INTEL:
1285 if (mce_p5_enabled())
1286 intel_p5_mcheck_init(c);
1287 break;
1288 case X86_VENDOR_CENTAUR:
1289 winchip_mcheck_init(c);
1290 break;
1291 }
1292}
1293
1294static void mce_cpu_features(struct cpuinfo_x86 *c)
1295{
1296 switch (c->x86_vendor) {
1297 case X86_VENDOR_INTEL:
1298 mce_intel_feature_init(c);
1299 break;
1300 case X86_VENDOR_AMD:
1301 mce_amd_feature_init(c);
1302 break;
1303 default:
1304 break;
1305 }
1306}
1307
1308static void mce_init_timer(void)
1309{
1310 struct timer_list *t = &__get_cpu_var(mce_timer);
1311 int *n = &__get_cpu_var(next_interval);
1312
1313 if (mce_ignore_ce)
1314 return;
1315
1316 *n = check_interval * HZ;
1317 if (!*n)
1318 return;
1319 setup_timer(t, mcheck_timer, smp_processor_id());
1320 t->expires = round_jiffies(jiffies + *n);
1321 add_timer(t);
1322}
1323
1324/*
1325 * Called for each booted CPU to set up machine checks.
1326 * Must be called with preempt off:
1327 */
1328void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
1329{
1330 if (mce_disabled)
1331 return;
1332
1333 mce_ancient_init(c);
1334
1335 if (!mce_available(c))
1336 return;
1337
1338 if (mce_cap_init() < 0) {
1339 mce_disabled = 1;
1340 return;
1341 }
1342 mce_cpu_quirks(c);
1343
1344 machine_check_vector = do_machine_check;
1345
1346 mce_init();
1347 mce_cpu_features(c);
1348 mce_init_timer();
1349 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1350}
1351
1352/*
1353 * Character device to read and clear the MCE log.
1354 */
1355
1356static DEFINE_SPINLOCK(mce_state_lock);
1357static int open_count; /* #times opened */
1358static int open_exclu; /* already open exclusive? */
1359
1360static int mce_open(struct inode *inode, struct file *file)
1361{
1362 spin_lock(&mce_state_lock);
1363
1364 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
1365 spin_unlock(&mce_state_lock);
1366
1367 return -EBUSY;
1368 }
1369
1370 if (file->f_flags & O_EXCL)
1371 open_exclu = 1;
1372 open_count++;
1373
1374 spin_unlock(&mce_state_lock);
1375
1376 return nonseekable_open(inode, file);
1377}
1378
1379static int mce_release(struct inode *inode, struct file *file)
1380{
1381 spin_lock(&mce_state_lock);
1382
1383 open_count--;
1384 open_exclu = 0;
1385
1386 spin_unlock(&mce_state_lock);
1387
1388 return 0;
1389}
1390
1391static void collect_tscs(void *data)
1392{
1393 unsigned long *cpu_tsc = (unsigned long *)data;
1394
1395 rdtscll(cpu_tsc[smp_processor_id()]);
1396}
1397
1398static DEFINE_MUTEX(mce_read_mutex);
1399
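/*
 * Reading /dev/mcelog is two phased: first the entries up to
 * mcelog.next are copied out and the index is atomically reset; then,
 * after synchronize_sched() and a TSC snapshot on every CPU, any entry
 * whose timestamp predates the snapshot is treated as a straggler that
 * was still being written during phase one and is copied out as well.
 */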
1400static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1401 loff_t *off)
1402{
1403 char __user *buf = ubuf;
1404 unsigned long *cpu_tsc;
1405 unsigned prev, next;
1406 int i, err;
1407
1408 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1409 if (!cpu_tsc)
1410 return -ENOMEM;
1411
1412 mutex_lock(&mce_read_mutex);
1413 next = rcu_dereference(mcelog.next);
1414
1415 /* Only supports full reads right now */
1416 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
1417 mutex_unlock(&mce_read_mutex);
1418 kfree(cpu_tsc);
1419
1420 return -EINVAL;
1421 }
1422
1423 err = 0;
1424 prev = 0;
1425 do {
1426 for (i = prev; i < next; i++) {
1427 unsigned long start = jiffies;
1428
1429 while (!mcelog.entry[i].finished) {
1430 if (time_after_eq(jiffies, start + 2)) {
1431 memset(mcelog.entry + i, 0,
1432 sizeof(struct mce));
1433 goto timeout;
1434 }
1435 cpu_relax();
1436 }
1437 smp_rmb();
1438 err |= copy_to_user(buf, mcelog.entry + i,
1439 sizeof(struct mce));
1440 buf += sizeof(struct mce);
1441timeout:
1442 ;
1443 }
1444
1445 memset(mcelog.entry + prev, 0,
1446 (next - prev) * sizeof(struct mce));
1447 prev = next;
1448 next = cmpxchg(&mcelog.next, prev, 0);
1449 } while (next != prev);
1450
1451 synchronize_sched();
1452
1453 /*
1454 * Collect entries that were still getting written before the
1455 * synchronize.
1456 */
1457 on_each_cpu(collect_tscs, cpu_tsc, 1);
1458
1459 for (i = next; i < MCE_LOG_LEN; i++) {
1460 if (mcelog.entry[i].finished &&
1461 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
1462 err |= copy_to_user(buf, mcelog.entry+i,
1463 sizeof(struct mce));
1464 smp_rmb();
1465 buf += sizeof(struct mce);
1466 memset(&mcelog.entry[i], 0, sizeof(struct mce));
1467 }
1468 }
1469 mutex_unlock(&mce_read_mutex);
1470 kfree(cpu_tsc);
1471
1472 return err ? -EFAULT : buf - ubuf;
1473}
1474
1475static unsigned int mce_poll(struct file *file, poll_table *wait)
1476{
1477 poll_wait(file, &mce_wait, wait);
1478 if (rcu_dereference(mcelog.next))
1479 return POLLIN | POLLRDNORM;
1480 return 0;
1481}
1482
1483static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1484{
1485 int __user *p = (int __user *)arg;
1486
1487 if (!capable(CAP_SYS_ADMIN))
1488 return -EPERM;
1489
1490 switch (cmd) {
1491 case MCE_GET_RECORD_LEN:
1492 return put_user(sizeof(struct mce), p);
1493 case MCE_GET_LOG_LEN:
1494 return put_user(MCE_LOG_LEN, p);
1495 case MCE_GETCLEAR_FLAGS: {
1496 unsigned flags;
1497
1498 do {
1499 flags = mcelog.flags;
1500 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1501
1502 return put_user(flags, p);
1503 }
1504 default:
1505 return -ENOTTY;
1506 }
1507}
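
/*
 * Minimal sketch of a user space consumer (roughly what mcelog does);
 * the record and log lengths are queried first because only full log
 * reads are supported:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int recl, logl;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recl);
 *	ioctl(fd, MCE_GET_LOG_LEN, &logl);
 *	char *buf = malloc(recl * logl);
 *	ssize_t n = read(fd, buf, recl * logl);
 */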
1508
1509/* Modified in mce-inject.c, so not static or const */
1510struct file_operations mce_chrdev_ops = {
1511 .open = mce_open,
1512 .release = mce_release,
1513 .read = mce_read,
1514 .poll = mce_poll,
1515 .unlocked_ioctl = mce_ioctl,
1516};
1517EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1518
1519static struct miscdevice mce_log_device = {
1520 MISC_MCELOG_MINOR,
1521 "mcelog",
1522 &mce_chrdev_ops,
1523};
1524
1525/*
1526 * mce=off Disables machine check
1527 * mce=no_cmci Disables CMCI
1528 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1529 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1530 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1531 * monarchtimeout is how long to wait for other CPUs on machine
1532 * check, or 0 to not wait
1533 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1534 * mce=nobootlog Don't log MCEs from before booting.
1535 */
1536static int __init mcheck_enable(char *str)
1537{
1538 if (*str == 0)
1539 enable_p5_mce();
1540 if (*str == '=')
1541 str++;
1542 if (!strcmp(str, "off"))
1543 mce_disabled = 1;
1544 else if (!strcmp(str, "no_cmci"))
1545 mce_cmci_disabled = 1;
1546 else if (!strcmp(str, "dont_log_ce"))
1547 mce_dont_log_ce = 1;
1548 else if (!strcmp(str, "ignore_ce"))
1549 mce_ignore_ce = 1;
1550 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1551 mce_bootlog = (str[0] == 'b');
1552 else if (isdigit(str[0])) {
1553 get_option(&str, &tolerant);
1554 if (*str == ',') {
1555 ++str;
1556 get_option(&str, &monarch_timeout);
1557 }
1558 } else {
1559 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1560 str);
1561 return 0;
1562 }
1563 return 1;
1564}
1565__setup("mce", mcheck_enable);
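
/*
 * Example command lines: "mce=off" disables machine checks completely,
 * "mce=2,500000" sets tolerant=2 with a 0.5 second (500000 usec)
 * monarch timeout, and "mce=nobootlog" suppresses logging of events
 * left over from before the reboot.
 */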
1566
1567/*
1568 * Sysfs support
1569 */
1570
1571/*
1572 * Disable machine checks on suspend and shutdown. We can't really handle
1573 * them later.
1574 */
1575static int mce_disable(void)
1576{
1577 int i;
1578
1579 for (i = 0; i < banks; i++) {
1580 if (!skip_bank_init(i))
1581 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1582 }
1583 return 0;
1584}
1585
1586static int mce_suspend(struct sys_device *dev, pm_message_t state)
1587{
1588 return mce_disable();
1589}
1590
1591static int mce_shutdown(struct sys_device *dev)
1592{
1593 return mce_disable();
1594}
1595
1596/*
1597 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1598 * Only one CPU is active at this time, the others get re-added later using
1599 * CPU hotplug:
1600 */
1601static int mce_resume(struct sys_device *dev)
1602{
1603 mce_init();
1604 mce_cpu_features(&current_cpu_data);
1605
1606 return 0;
1607}
1608
1609static void mce_cpu_restart(void *data)
1610{
1611 del_timer_sync(&__get_cpu_var(mce_timer));
1612 if (mce_available(&current_cpu_data))
1613 mce_init();
1614 mce_init_timer();
1615}
1616
1617/* Reinit MCEs after user configuration changes */
1618static void mce_restart(void)
1619{
1620 on_each_cpu(mce_cpu_restart, NULL, 1);
1621}
1622
1623static struct sysdev_class mce_sysclass = {
1624 .suspend = mce_suspend,
1625 .shutdown = mce_shutdown,
1626 .resume = mce_resume,
1627 .name = "machinecheck",
1628};
1629
1630DEFINE_PER_CPU(struct sys_device, mce_dev);
1631
1632__cpuinitdata
1633void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1634
1635static struct sysdev_attribute *bank_attrs;
1636
1637static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1638 char *buf)
1639{
1640 u64 b = bank[attr - bank_attrs];
1641
1642 return sprintf(buf, "%llx\n", b);
1643}
1644
1645static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1646 const char *buf, size_t size)
1647{
1648 u64 new;
1649
1650 if (strict_strtoull(buf, 0, &new) < 0)
1651 return -EINVAL;
1652
1653 bank[attr - bank_attrs] = new;
1654 mce_restart();
1655
1656 return size;
1657}
1658
1659static ssize_t
1660show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1661{
1662 strcpy(buf, trigger);
1663 strcat(buf, "\n");
1664 return strlen(trigger) + 1;
1665}
1666
1667static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1668 const char *buf, size_t siz)
1669{
1670 char *p;
1671 int len;
1672
1673 strncpy(trigger, buf, sizeof(trigger));
1674 trigger[sizeof(trigger)-1] = 0;
1675 len = strlen(trigger);
1676 p = strchr(trigger, '\n');
1677
1678	if (p)
1679 *p = 0;
1680
1681 return len;
1682}
1683
1684static ssize_t store_int_with_restart(struct sys_device *s,
1685 struct sysdev_attribute *attr,
1686 const char *buf, size_t size)
1687{
1688 ssize_t ret = sysdev_store_int(s, attr, buf, size);
1689 mce_restart();
1690 return ret;
1691}
1692
1693static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1694static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1695static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1696
1697static struct sysdev_ext_attribute attr_check_interval = {
1698 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1699 store_int_with_restart),
1700 &check_interval
1701};
1702
1703static struct sysdev_attribute *mce_attrs[] = {
1704 &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
1705 &attr_monarch_timeout.attr,
1706 NULL
1707};
1708
1709static cpumask_var_t mce_dev_initialized;
1710
1711/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1712static __cpuinit int mce_create_device(unsigned int cpu)
1713{
1714 int err;
1715 int i;
1716
1717 if (!mce_available(&boot_cpu_data))
1718 return -EIO;
1719
1720 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1721 per_cpu(mce_dev, cpu).id = cpu;
1722 per_cpu(mce_dev, cpu).cls = &mce_sysclass;
1723
1724 err = sysdev_register(&per_cpu(mce_dev, cpu));
1725 if (err)
1726 return err;
1727
1728 for (i = 0; mce_attrs[i]; i++) {
1729 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1730 if (err)
1731 goto error;
1732 }
1733 for (i = 0; i < banks; i++) {
1734 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1735 &bank_attrs[i]);
1736 if (err)
1737 goto error2;
1738 }
1739 cpumask_set_cpu(cpu, mce_dev_initialized);
1740
1741 return 0;
1742error2:
1743 while (--i >= 0)
1744 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1745error:
1746 while (--i >= 0)
1747 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1748
1749 sysdev_unregister(&per_cpu(mce_dev, cpu));
1750
1751 return err;
1752}
1753
1754static __cpuinit void mce_remove_device(unsigned int cpu)
1755{
1756 int i;
1757
1758 if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1759 return;
1760
1761 for (i = 0; mce_attrs[i]; i++)
1762 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1763
1764 for (i = 0; i < banks; i++)
1765 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1766
1767 sysdev_unregister(&per_cpu(mce_dev, cpu));
1768 cpumask_clear_cpu(cpu, mce_dev_initialized);
1769}
1770
1771/* Make sure there are no machine checks on offlined CPUs. */
1772static void mce_disable_cpu(void *h)
1773{
1774 unsigned long action = *(unsigned long *)h;
1775 int i;
1776
1777 if (!mce_available(&current_cpu_data))
1778 return;
1779 if (!(action & CPU_TASKS_FROZEN))
1780 cmci_clear();
1781 for (i = 0; i < banks; i++) {
1782 if (!skip_bank_init(i))
1783 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1784 }
1785}
1786
1787static void mce_reenable_cpu(void *h)
1788{
1789 unsigned long action = *(unsigned long *)h;
1790 int i;
1791
1792 if (!mce_available(&current_cpu_data))
1793 return;
1794
1795 if (!(action & CPU_TASKS_FROZEN))
1796 cmci_reenable();
1797 for (i = 0; i < banks; i++) {
1798 if (!skip_bank_init(i))
1799 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1800 }
1801}
1802
1803/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1804static int __cpuinit
1805mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1806{
1807 unsigned int cpu = (unsigned long)hcpu;
1808 struct timer_list *t = &per_cpu(mce_timer, cpu);
1809
1810 switch (action) {
1811 case CPU_ONLINE:
1812 case CPU_ONLINE_FROZEN:
1813 mce_create_device(cpu);
1814 if (threshold_cpu_callback)
1815 threshold_cpu_callback(action, cpu);
1816 break;
1817 case CPU_DEAD:
1818 case CPU_DEAD_FROZEN:
1819 if (threshold_cpu_callback)
1820 threshold_cpu_callback(action, cpu);
1821 mce_remove_device(cpu);
1822 break;
1823 case CPU_DOWN_PREPARE:
1824 case CPU_DOWN_PREPARE_FROZEN:
1825 del_timer_sync(t);
1826 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1827 break;
1828 case CPU_DOWN_FAILED:
1829 case CPU_DOWN_FAILED_FROZEN:
1830 t->expires = round_jiffies(jiffies +
1831 __get_cpu_var(next_interval));
1832 add_timer_on(t, cpu);
1833 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1834 break;
1835 case CPU_POST_DEAD:
1836 /* intentionally ignoring frozen here */
1837 cmci_rediscover(cpu);
1838 break;
1839 }
1840 return NOTIFY_OK;
1841}
1842
1843static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1844 .notifier_call = mce_cpu_callback,
1845};
1846
1847static __init int mce_init_banks(void)
1848{
1849 int i;
1850
1851 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1852 GFP_KERNEL);
1853 if (!bank_attrs)
1854 return -ENOMEM;
1855
1856 for (i = 0; i < banks; i++) {
1857 struct sysdev_attribute *a = &bank_attrs[i];
1858
1859 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1860 if (!a->attr.name)
1861 goto nomem;
1862
1863 a->attr.mode = 0644;
1864 a->show = show_bank;
1865 a->store = set_bank;
1866 }
1867 return 0;
1868
1869nomem:
1870 while (--i >= 0)
1871 kfree(bank_attrs[i].attr.name);
1872 kfree(bank_attrs);
1873 bank_attrs = NULL;
1874
1875 return -ENOMEM;
1876}
1877
1878static __init int mce_init_device(void)
1879{
1880 int err;
1881 int i = 0;
1882
1883 if (!mce_available(&boot_cpu_data))
1884 return -EIO;
1885
1886 alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1887
1888 err = mce_init_banks();
1889 if (err)
1890 return err;
1891
1892 err = sysdev_class_register(&mce_sysclass);
1893 if (err)
1894 return err;
1895
1896 for_each_online_cpu(i) {
1897 err = mce_create_device(i);
1898 if (err)
1899 return err;
1900 }
1901
1902 register_hotcpu_notifier(&mce_cpu_notifier);
1903 misc_register(&mce_log_device);
1904
1905 return err;
1906}
1907
1908device_initcall(mce_init_device);
1909
1910#else /* CONFIG_X86_OLD_MCE: */
1911
1912int nr_mce_banks;
1913EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
1914
1915/* This has to be run for each processor */
1916void mcheck_init(struct cpuinfo_x86 *c)
1917{
1918 if (mce_disabled == 1)
1919 return;
1920
1921 switch (c->x86_vendor) {
1922 case X86_VENDOR_AMD:
1923 amd_mcheck_init(c);
1924 break;
1925
1926 case X86_VENDOR_INTEL:
1927 if (c->x86 == 5)
1928 intel_p5_mcheck_init(c);
1929 if (c->x86 == 6)
1930 intel_p6_mcheck_init(c);
1931 if (c->x86 == 15)
1932 intel_p4_mcheck_init(c);
1933 break;
1934
1935 case X86_VENDOR_CENTAUR:
1936 if (c->x86 == 5)
1937 winchip_mcheck_init(c);
1938 break;
1939
1940 default:
1941 break;
1942 }
1943 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1944}
1945
1946static int __init mcheck_enable(char *str)
1947{
1948 mce_disabled = -1;
1949 return 1;
1950}
1951
1952__setup("mce", mcheck_enable);
1953
1954#endif /* CONFIG_X86_OLD_MCE */
1955
1956/*
1957 * Old style boot options parsing. Only for compatibility.
1958 */
1959static int __init mcheck_disable(char *str)
1960{
1961 mce_disabled = 1;
1962 return 1;
1963}
1964__setup("nomce", mcheck_disable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
index ae9f628838f..84a552b458c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -1,14 +1,38 @@
 #include <linux/init.h>
 #include <asm/mce.h>
 
+#ifdef CONFIG_X86_OLD_MCE
 void amd_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
+#endif
+
+#ifdef CONFIG_X86_ANCIENT_MCE
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
+extern int mce_p5_enable;
+static inline int mce_p5_enabled(void) { return mce_p5_enable; }
+static inline void enable_p5_mce(void) { mce_p5_enable = 1; }
+#else
+static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline int mce_p5_enabled(void) { return 0; }
+static inline void enable_p5_mce(void) { }
+#endif
 
 /* Call the installed machine check handler for this CPU setup. */
 extern void (*machine_check_vector)(struct pt_regs *, long error_code);
 
+#ifdef CONFIG_X86_OLD_MCE
+
 extern int nr_mce_banks;
 
+void intel_set_thermal_handler(void);
+
+#else
+
+static inline void intel_set_thermal_handler(void) { }
+
+#endif
+
+void intel_init_thermal(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
deleted file mode 100644
index 3552119b091..00000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ /dev/null
@@ -1,76 +0,0 @@
1/*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/smp.h>
11#include <linux/thread_info.h>
12
13#include <asm/processor.h>
14#include <asm/system.h>
15#include <asm/mce.h>
16
17#include "mce.h"
18
19int mce_disabled;
20int nr_mce_banks;
21
22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
23
24/* Handle unconfigured int18 (should never happen) */
25static void unexpected_machine_check(struct pt_regs *regs, long error_code)
26{
27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
28}
29
30/* Call the installed machine check handler for this CPU setup. */
31void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
32
33/* This has to be run for each processor */
34void mcheck_init(struct cpuinfo_x86 *c)
35{
36 if (mce_disabled == 1)
37 return;
38
39 switch (c->x86_vendor) {
40 case X86_VENDOR_AMD:
41 amd_mcheck_init(c);
42 break;
43
44 case X86_VENDOR_INTEL:
45 if (c->x86 == 5)
46 intel_p5_mcheck_init(c);
47 if (c->x86 == 6)
48 intel_p6_mcheck_init(c);
49 if (c->x86 == 15)
50 intel_p4_mcheck_init(c);
51 break;
52
53 case X86_VENDOR_CENTAUR:
54 if (c->x86 == 5)
55 winchip_mcheck_init(c);
56 break;
57
58 default:
59 break;
60 }
61}
62
63static int __init mcheck_disable(char *str)
64{
65 mce_disabled = 1;
66 return 1;
67}
68
69static int __init mcheck_enable(char *str)
70{
71 mce_disabled = -1;
72 return 1;
73}
74
75__setup("nomce", mcheck_disable);
76__setup("mce", mcheck_enable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
deleted file mode 100644
index 6fb0b359d2a..00000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ /dev/null
@@ -1,1187 +0,0 @@
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/smp_lock.h>
15#include <linux/string.h>
16#include <linux/rcupdate.h>
17#include <linux/kallsyms.h>
18#include <linux/sysdev.h>
19#include <linux/miscdevice.h>
20#include <linux/fs.h>
21#include <linux/capability.h>
22#include <linux/cpu.h>
23#include <linux/percpu.h>
24#include <linux/poll.h>
25#include <linux/thread_info.h>
26#include <linux/ctype.h>
27#include <linux/kmod.h>
28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
32#include <asm/processor.h>
33#include <asm/msr.h>
34#include <asm/mce.h>
35#include <asm/uaccess.h>
36#include <asm/smp.h>
37#include <asm/idle.h>
38
39#define MISC_MCELOG_MINOR 227
40
41atomic_t mce_entry;
42
43static int mce_dont_init;
44
45/*
46 * Tolerant levels:
47 * 0: always panic on uncorrected errors, log corrected errors
48 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
49 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
50 * 3: never panic or SIGBUS, log all errors (for testing only)
51 */
52static int tolerant = 1;
53static int banks;
54static u64 *bank;
55static unsigned long notify_user;
56static int rip_msr;
57static int mce_bootlog = -1;
58static atomic_t mce_events;
59
60static char trigger[128];
61static char *trigger_argv[2] = { trigger, NULL };
62
63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
64
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
78/*
79 * Lockless MCE logging infrastructure.
80 * This avoids deadlocks on printk locks without having to break locks. Also
81 * separate MCEs from kernel messages to avoid bogus bug reports.
82 */
83
84static struct mce_log mcelog = {
85 MCE_LOG_SIGNATURE,
86 MCE_LOG_LEN,
87};
88
89void mce_log(struct mce *mce)
90{
91 unsigned next, entry;
92 atomic_inc(&mce_events);
93 mce->finished = 0;
94 wmb();
95 for (;;) {
96 entry = rcu_dereference(mcelog.next);
97 for (;;) {
98 /* When the buffer fills up discard new entries. Assume
99 that the earlier errors are the more interesting. */
100 if (entry >= MCE_LOG_LEN) {
101 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
102 return;
103 }
104 /* Old left over entry. Skip. */
105 if (mcelog.entry[entry].finished) {
106 entry++;
107 continue;
108 }
109 break;
110 }
111 smp_rmb();
112 next = entry + 1;
113 if (cmpxchg(&mcelog.next, entry, next) == entry)
114 break;
115 }
116 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
117 wmb();
118 mcelog.entry[entry].finished = 1;
119 wmb();
120
121 set_bit(0, &notify_user);
122}
123
124static void print_mce(struct mce *m)
125{
126 printk(KERN_EMERG "\n"
127 KERN_EMERG "HARDWARE ERROR\n"
128 KERN_EMERG
129 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
130 m->cpu, m->mcgstatus, m->bank, m->status);
131 if (m->ip) {
132 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
133 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
134 m->cs, m->ip);
135 if (m->cs == __KERNEL_CS)
136 print_symbol("{%s}", m->ip);
137 printk("\n");
138 }
139 printk(KERN_EMERG "TSC %llx ", m->tsc);
140 if (m->addr)
141 printk("ADDR %llx ", m->addr);
142 if (m->misc)
143 printk("MISC %llx ", m->misc);
144 printk("\n");
145 printk(KERN_EMERG "This is not a software problem!\n");
146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
147 "and contact your hardware vendor\n");
148}
149
150static void mce_panic(char *msg, struct mce *backup, unsigned long start)
151{
152 int i;
153
154 oops_begin();
155 for (i = 0; i < MCE_LOG_LEN; i++) {
156 unsigned long tsc = mcelog.entry[i].tsc;
157
158 if (time_before(tsc, start))
159 continue;
160 print_mce(&mcelog.entry[i]);
161 if (backup && mcelog.entry[i].tsc == backup->tsc)
162 backup = NULL;
163 }
164 if (backup)
165 print_mce(backup);
166 panic(msg);
167}
168
169int mce_available(struct cpuinfo_x86 *c)
170{
171 if (mce_dont_init)
172 return 0;
173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
174}
175
176static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
177{
178 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
179 m->ip = regs->ip;
180 m->cs = regs->cs;
181 } else {
182 m->ip = 0;
183 m->cs = 0;
184 }
185 if (rip_msr) {
186 /* Assume the RIP in the MSR is exact. Is this true? */
187 m->mcgstatus |= MCG_STATUS_EIPV;
188 rdmsrl(rip_msr, m->ip);
189 m->cs = 0;
190 }
191}
192
193/*
194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242 if (!(flags & MCP_DONTLOG)) {
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245 }
246
247 /*
248 * Clear state for this bank.
249 */
250 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
251 }
252
253 /*
254 * Don't clear MCG_STATUS here because it's only defined for
255 * exceptions.
256 */
257}
258
259/*
260 * The actual machine check handler. This only handles real
261 * exceptions when something got corrupted coming in through int 18.
262 *
263 * This is executed in NMI context not subject to normal locking rules. This
264 * implies that most kernel services cannot be safely used. Don't even
265 * think about putting a printk in there!
266 */
267void do_machine_check(struct pt_regs * regs, long error_code)
268{
269 struct mce m, panicm;
270 u64 mcestart = 0;
271 int i;
272 int panicm_found = 0;
273 /*
274 * If no_way_out gets set, there is no safe way to recover from this
275 * MCE. If tolerant is cranked up, we'll try anyway.
276 */
277 int no_way_out = 0;
278 /*
279 * If kill_it gets set, there might be a way to recover from this
280 * error.
281 */
282 int kill_it = 0;
283 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
284
285 atomic_inc(&mce_entry);
286
287 if (notify_die(DIE_NMI, "machine check", regs, error_code,
288 18, SIGKILL) == NOTIFY_STOP)
289 goto out2;
290 if (!banks)
291 goto out2;
292
293 mce_setup(&m);
294
295 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
296 /* if the restart IP is not valid, we're done for */
297 if (!(m.mcgstatus & MCG_STATUS_RIPV))
298 no_way_out = 1;
299
300 rdtscll(mcestart);
301 barrier();
302
303 for (i = 0; i < banks; i++) {
304 __clear_bit(i, toclear);
305 if (!bank[i])
306 continue;
307
308 m.misc = 0;
309 m.addr = 0;
310 m.bank = i;
311
312 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
313 if ((m.status & MCI_STATUS_VAL) == 0)
314 continue;
315
316 /*
317 * Non uncorrected errors are handled by machine_check_poll
318 * Leave them alone.
319 */
320 if ((m.status & MCI_STATUS_UC) == 0)
321 continue;
322
323 /*
324 * Set taint even when machine check was not enabled.
325 */
326 add_taint(TAINT_MACHINE_CHECK);
327
328 __set_bit(i, toclear);
329
330 if (m.status & MCI_STATUS_EN) {
331 /* if PCC was set, there's no way out */
332 no_way_out |= !!(m.status & MCI_STATUS_PCC);
333 /*
334 * If this error was uncorrectable and there was
335 * an overflow, we're in trouble. If no overflow,
336 * we might get away with just killing a task.
337 */
338 if (m.status & MCI_STATUS_UC) {
339 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
340 no_way_out = 1;
341 kill_it = 1;
342 }
343 } else {
344 /*
345 * Machine check event was not enabled. Clear, but
346 * ignore.
347 */
348 continue;
349 }
350
351 if (m.status & MCI_STATUS_MISCV)
352 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
353 if (m.status & MCI_STATUS_ADDRV)
354 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
355
356 mce_get_rip(&m, regs);
357 mce_log(&m);
358
359 /* Did this bank cause the exception? */
360 /* Assume that the bank with uncorrectable errors did it,
361 and that there is only a single one. */
362 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
363 panicm = m;
364 panicm_found = 1;
365 }
366 }
367
368 /* If we didn't find an uncorrectable error, pick
369 the last one (shouldn't happen, just being safe). */
370 if (!panicm_found)
371 panicm = m;
372
373 /*
374 * If we have decided that we just CAN'T continue, and the user
375 * has not set tolerant to an insane level, give up and die.
376 */
377 if (no_way_out && tolerant < 3)
378 mce_panic("Machine check", &panicm, mcestart);
379
380 /*
381 * If the error seems to be unrecoverable, something should be
382 * done. Try to kill as little as possible. If we can kill just
383 * one task, do that. If the user has set the tolerance very
384 * high, don't try to do anything at all.
385 */
386 if (kill_it && tolerant < 3) {
387 int user_space = 0;
388
389 /*
390 * If the EIPV bit is set, it means the saved IP is the
391 * instruction which caused the MCE.
392 */
393 if (m.mcgstatus & MCG_STATUS_EIPV)
394 user_space = panicm.ip && (panicm.cs & 3);
395
396 /*
397 * If we know that the error was in user space, send a
398 * SIGBUS. Otherwise, panic if tolerance is low.
399 *
400 * force_sig() takes an awful lot of locks and has a slight
401 * risk of deadlocking.
402 */
403 if (user_space) {
404 force_sig(SIGBUS, current);
405 } else if (panic_on_oops || tolerant < 2) {
406 mce_panic("Uncorrected machine check",
407 &panicm, mcestart);
408 }
409 }
410
411 /* notify userspace ASAP */
412 set_thread_flag(TIF_MCE_NOTIFY);
413
414 /* the last thing we do is clear state */
415 for (i = 0; i < banks; i++) {
416 if (test_bit(i, toclear))
417 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
418 }
419 wrmsrl(MSR_IA32_MCG_STATUS, 0);
420 out2:
421 atomic_dec(&mce_entry);
422}
423
424#ifdef CONFIG_X86_MCE_INTEL
425/***
426 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
427 * @cpu: The CPU on which the event occurred.
428 * @status: Event status information
429 *
430 * This function should be called by the thermal interrupt after the
431 * event has been processed and the decision was made to log the event
432 * further.
433 *
434 * The status parameter will be saved to the 'status' field of 'struct mce'
435 * and historically has been the register value of the
436 * MSR_IA32_THERMAL_STATUS (Intel) msr.
437 */
438void mce_log_therm_throt_event(__u64 status)
439{
440 struct mce m;
441
442 mce_setup(&m);
443 m.bank = MCE_THERMAL_BANK;
444 m.status = status;
445 mce_log(&m);
446}
447#endif /* CONFIG_X86_MCE_INTEL */
448
449/*
450 * Periodic polling timer for "silent" machine check errors. If the
451 * poller finds an MCE, poll 2x faster. When the poller finds no more
452 * errors, poll 2x slower (up to check_interval seconds).
453 */
454
455static int check_interval = 5 * 60; /* 5 minutes */
456static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
457static void mcheck_timer(unsigned long);
458static DEFINE_PER_CPU(struct timer_list, mce_timer);
459
460static void mcheck_timer(unsigned long data)
461{
462 struct timer_list *t = &per_cpu(mce_timer, data);
463 int *n;
464
465 WARN_ON(smp_processor_id() != data);
466
467 if (mce_available(&current_cpu_data))
468 machine_check_poll(MCP_TIMESTAMP,
469 &__get_cpu_var(mce_poll_banks));
470
471 /*
472 * Alert userspace if needed. If we logged an MCE, reduce the
473 * polling interval, otherwise increase the polling interval.
474 */
475 n = &__get_cpu_var(next_interval);
476 if (mce_notify_user()) {
477 *n = max(*n/2, HZ/100);
478 } else {
479 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
480 }
481
482 t->expires = jiffies + *n;
483 add_timer(t);
484}
485
486static void mce_do_trigger(struct work_struct *work)
487{
488 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
489}
490
491static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
492
493/*
494 * Notify the user(s) about new machine check events.
495 * Can be called from interrupt context, but not from machine check/NMI
496 * context.
497 */
498int mce_notify_user(void)
499{
500 /* Not more than two messages every minute */
501 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
502
503 clear_thread_flag(TIF_MCE_NOTIFY);
504 if (test_and_clear_bit(0, &notify_user)) {
505 wake_up_interruptible(&mce_wait);
506
507 /*
508 * There is no risk of missing notifications because
509 * work_pending is always cleared before the function is
510 * executed.
511 */
512 if (trigger[0] && !work_pending(&mce_trigger_work))
513 schedule_work(&mce_trigger_work);
514
515 if (__ratelimit(&ratelimit))
516 printk(KERN_INFO "Machine check events logged\n");
517
518 return 1;
519 }
520 return 0;
521}
522
523/* see if the idle task needs to notify userspace */
524static int
525mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
526{
527 /* IDLE_END should be safe - interrupts are back on */
528 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
529 mce_notify_user();
530
531 return NOTIFY_OK;
532}
533
534static struct notifier_block mce_idle_notifier = {
535 .notifier_call = mce_idle_callback,
536};
537
538static __init int periodic_mcheck_init(void)
539{
540 idle_notifier_register(&mce_idle_notifier);
541 return 0;
542}
543__initcall(periodic_mcheck_init);
544
545/*
546 * Initialize Machine Checks for a CPU.
547 */
548static int mce_cap_init(void)
549{
550 u64 cap;
551 unsigned b;
552
553 rdmsrl(MSR_IA32_MCG_CAP, cap);
554 b = cap & 0xff;
555 if (b > MAX_NR_BANKS) {
556 printk(KERN_WARNING
557 "MCE: Using only %u machine check banks out of %u\n",
558 MAX_NR_BANKS, b);
559 b = MAX_NR_BANKS;
560 }
561
562 /* Don't support asymmetric configurations today */
563 WARN_ON(banks != 0 && b != banks);
564 banks = b;
565 if (!bank) {
566 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
567 if (!bank)
568 return -ENOMEM;
569 memset(bank, 0xff, banks * sizeof(u64));
570 }
571
572 /* Use accurate RIP reporting if available. */
573 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
574 rip_msr = MSR_IA32_MCG_EIP;
575
576 return 0;
577}
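For reference, a hedged sketch of the MCG_CAP decoding mce_cap_init() performs: bank count in bits 7:0, the extended-register-present bit it tests at bit 9, and the extended register count in bits 23:16. The register value below is made up for illustration; rip_msr is only used when that count is at least 9.

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t cap = 0x000000000009020aULL;	/* made-up example value */
		unsigned int banks   = cap & 0xff;		/* bits 7:0   - bank count */
		unsigned int ext_p   = (cap >> 9) & 1;		/* bit 9      - extended regs present */
		unsigned int ext_cnt = (cap >> 16) & 0xff;	/* bits 23:16 - extended reg count */

		printf("banks=%u ext_p=%u ext_regs=%u\n", banks, ext_p, ext_cnt);
		return 0;
	}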
578
579static void mce_init(void *dummy)
580{
581 u64 cap;
582 int i;
583 mce_banks_t all_banks;
584
585 /*
586 * Log the machine checks left over from the previous reset.
587 */
588 bitmap_fill(all_banks, MAX_NR_BANKS);
589 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
590
591 set_in_cr4(X86_CR4_MCE);
592
593 rdmsrl(MSR_IA32_MCG_CAP, cap);
594 if (cap & MCG_CTL_P)
595 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
596
597 for (i = 0; i < banks; i++) {
598 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
599 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
600 }
601}
602
603/* Add per CPU specific workarounds here */
604static void mce_cpu_quirks(struct cpuinfo_x86 *c)
605{
606 /* This should be disabled by the BIOS, but isn't always */
607 if (c->x86_vendor == X86_VENDOR_AMD) {
608 if (c->x86 == 15 && banks > 4)
609 /* disable GART TBL walk error reporting, which trips off
610 incorrectly with the IOMMU & 3ware & Cerberus. */
611 clear_bit(10, (unsigned long *)&bank[4]);
612		if (c->x86 <= 17 && mce_bootlog < 0)
613			/* Lots of broken BIOSes around that don't clear them
614			   by default and leave crap in there. Don't log. */
615 mce_bootlog = 0;
616 }
617
618}
619
620static void mce_cpu_features(struct cpuinfo_x86 *c)
621{
622 switch (c->x86_vendor) {
623 case X86_VENDOR_INTEL:
624 mce_intel_feature_init(c);
625 break;
626 case X86_VENDOR_AMD:
627 mce_amd_feature_init(c);
628 break;
629 default:
630 break;
631 }
632}
633
634static void mce_init_timer(void)
635{
636 struct timer_list *t = &__get_cpu_var(mce_timer);
637 int *n = &__get_cpu_var(next_interval);
638
639 *n = check_interval * HZ;
640 if (!*n)
641 return;
642 setup_timer(t, mcheck_timer, smp_processor_id());
643 t->expires = round_jiffies(jiffies + *n);
644 add_timer(t);
645}
646
647/*
648 * Called for each booted CPU to set up machine checks.
649 * Must be called with preempt off.
650 */
651void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
652{
653 if (!mce_available(c))
654 return;
655
656 if (mce_cap_init() < 0) {
657 mce_dont_init = 1;
658 return;
659 }
660 mce_cpu_quirks(c);
661
662 mce_init(NULL);
663 mce_cpu_features(c);
664 mce_init_timer();
665}
666
667/*
668 * Character device to read and clear the MCE log.
669 */
670
671static DEFINE_SPINLOCK(mce_state_lock);
672static int open_count; /* #times opened */
673static int open_exclu; /* already open exclusive? */
674
675static int mce_open(struct inode *inode, struct file *file)
676{
677 lock_kernel();
678 spin_lock(&mce_state_lock);
679
680 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
681 spin_unlock(&mce_state_lock);
682 unlock_kernel();
683 return -EBUSY;
684 }
685
686 if (file->f_flags & O_EXCL)
687 open_exclu = 1;
688 open_count++;
689
690 spin_unlock(&mce_state_lock);
691 unlock_kernel();
692
693 return nonseekable_open(inode, file);
694}
695
696static int mce_release(struct inode *inode, struct file *file)
697{
698 spin_lock(&mce_state_lock);
699
700 open_count--;
701 open_exclu = 0;
702
703 spin_unlock(&mce_state_lock);
704
705 return 0;
706}
707
708static void collect_tscs(void *data)
709{
710 unsigned long *cpu_tsc = (unsigned long *)data;
711
712 rdtscll(cpu_tsc[smp_processor_id()]);
713}
714
715static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
716 loff_t *off)
717{
718 unsigned long *cpu_tsc;
719 static DEFINE_MUTEX(mce_read_mutex);
720 unsigned prev, next;
721 char __user *buf = ubuf;
722 int i, err;
723
724 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
725 if (!cpu_tsc)
726 return -ENOMEM;
727
728 mutex_lock(&mce_read_mutex);
729 next = rcu_dereference(mcelog.next);
730
731 /* Only supports full reads right now */
732 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
733 mutex_unlock(&mce_read_mutex);
734 kfree(cpu_tsc);
735 return -EINVAL;
736 }
737
738 err = 0;
739 prev = 0;
740 do {
741 for (i = prev; i < next; i++) {
742 unsigned long start = jiffies;
743
744 while (!mcelog.entry[i].finished) {
745 if (time_after_eq(jiffies, start + 2)) {
746 memset(mcelog.entry + i, 0,
747 sizeof(struct mce));
748 goto timeout;
749 }
750 cpu_relax();
751 }
752 smp_rmb();
753 err |= copy_to_user(buf, mcelog.entry + i,
754 sizeof(struct mce));
755 buf += sizeof(struct mce);
756timeout:
757 ;
758 }
759
760 memset(mcelog.entry + prev, 0,
761 (next - prev) * sizeof(struct mce));
762 prev = next;
763 next = cmpxchg(&mcelog.next, prev, 0);
764 } while (next != prev);
765
766 synchronize_sched();
767
768 /*
769 * Collect entries that were still getting written before the
770 * synchronize.
771 */
772 on_each_cpu(collect_tscs, cpu_tsc, 1);
773 for (i = next; i < MCE_LOG_LEN; i++) {
774 if (mcelog.entry[i].finished &&
775 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
776 err |= copy_to_user(buf, mcelog.entry+i,
777 sizeof(struct mce));
778 smp_rmb();
779 buf += sizeof(struct mce);
780 memset(&mcelog.entry[i], 0, sizeof(struct mce));
781 }
782 }
783 mutex_unlock(&mce_read_mutex);
784 kfree(cpu_tsc);
785 return err ? -EFAULT : buf - ubuf;
786}
787
788static unsigned int mce_poll(struct file *file, poll_table *wait)
789{
790 poll_wait(file, &mce_wait, wait);
791 if (rcu_dereference(mcelog.next))
792 return POLLIN | POLLRDNORM;
793 return 0;
794}
795
796static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
797{
798 int __user *p = (int __user *)arg;
799
800 if (!capable(CAP_SYS_ADMIN))
801 return -EPERM;
802 switch (cmd) {
803 case MCE_GET_RECORD_LEN:
804 return put_user(sizeof(struct mce), p);
805 case MCE_GET_LOG_LEN:
806 return put_user(MCE_LOG_LEN, p);
807 case MCE_GETCLEAR_FLAGS: {
808 unsigned flags;
809
810 do {
811 flags = mcelog.flags;
812 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
813 return put_user(flags, p);
814 }
815 default:
816 return -ENOTTY;
817 }
818}
819
820static const struct file_operations mce_chrdev_ops = {
821 .open = mce_open,
822 .release = mce_release,
823 .read = mce_read,
824 .poll = mce_poll,
825 .unlocked_ioctl = mce_ioctl,
826};
827
828static struct miscdevice mce_log_device = {
829 MISC_MCELOG_MINOR,
830 "mcelog",
831 &mce_chrdev_ops,
832};
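A hedged userspace sketch of how a reader such as mcelog might drive this device. The ioctl numbers are assumptions mirroring the asm/mce.h ABI of this era (they are not defined in this file), and error handling is minimal:

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	#define MCE_GET_RECORD_LEN	_IOR('M', 1, int)	/* assumed ABI values */
	#define MCE_GET_LOG_LEN		_IOR('M', 2, int)

	int main(void)
	{
		int fd = open("/dev/mcelog", O_RDONLY);
		int rec_len, log_len;
		char *buf;
		ssize_t n;

		if (fd < 0)
			return 1;
		if (ioctl(fd, MCE_GET_RECORD_LEN, &rec_len) < 0 ||
		    ioctl(fd, MCE_GET_LOG_LEN, &log_len) < 0)
			return 1;

		/* mce_read() rejects partial reads, so ask for the whole log. */
		buf = malloc((size_t)rec_len * log_len);
		if (!buf)
			return 1;
		n = read(fd, buf, (size_t)rec_len * log_len);
		printf("read %zd bytes (%d records max, %d bytes each)\n",
		       n, log_len, rec_len);

		free(buf);
		close(fd);
		return 0;
	}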
833
834/*
835 * Old style boot options parsing. Only for compatibility.
836 */
837static int __init mcheck_disable(char *str)
838{
839 mce_dont_init = 1;
840 return 1;
841}
842
843/* mce=off disables machine check.
844 mce=TOLERANCELEVEL (number, see above)
845 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
846 mce=nobootlog Don't log MCEs from before booting. */
847static int __init mcheck_enable(char *str)
848{
849 if (!strcmp(str, "off"))
850 mce_dont_init = 1;
851	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
852 mce_bootlog = str[0] == 'b';
853 else if (isdigit(str[0]))
854 get_option(&str, &tolerant);
855 else
856		printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n", str);
857 return 1;
858}
859
860__setup("nomce", mcheck_disable);
861__setup("mce=", mcheck_enable);
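Illustrative command-line forms accepted by the parser above:

	mce=off          disable machine checks entirely
	mce=2            set the tolerance level to 2
	mce=bootlog      log MCEs left over from before boot, even on AMD
	mce=nobootlog    suppress boot-time MCE logging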
862
863/*
864 * Sysfs support
865 */
866
867/*
868 * Disable machine checks on suspend and shutdown. We can't really handle
869 * them later.
870 */
871static int mce_disable(void)
872{
873 int i;
874
875 for (i = 0; i < banks; i++)
876 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
877 return 0;
878}
879
880static int mce_suspend(struct sys_device *dev, pm_message_t state)
881{
882 return mce_disable();
883}
884
885static int mce_shutdown(struct sys_device *dev)
886{
887 return mce_disable();
888}
889
890/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
891 Only one CPU is active at this time, the others get re-added later using
892 CPU hotplug. */
893static int mce_resume(struct sys_device *dev)
894{
895 mce_init(NULL);
896 mce_cpu_features(&current_cpu_data);
897 return 0;
898}
899
900static void mce_cpu_restart(void *data)
901{
902 del_timer_sync(&__get_cpu_var(mce_timer));
903 if (mce_available(&current_cpu_data))
904 mce_init(NULL);
905 mce_init_timer();
906}
907
908/* Reinit MCEs after user configuration changes */
909static void mce_restart(void)
910{
911 on_each_cpu(mce_cpu_restart, NULL, 1);
912}
913
914static struct sysdev_class mce_sysclass = {
915 .suspend = mce_suspend,
916 .shutdown = mce_shutdown,
917 .resume = mce_resume,
918 .name = "machinecheck",
919};
920
921DEFINE_PER_CPU(struct sys_device, device_mce);
922void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
923
924/* Why are there no generic functions for this? */
925#define ACCESSOR(name, var, start) \
926 static ssize_t show_ ## name(struct sys_device *s, \
927 struct sysdev_attribute *attr, \
928 char *buf) { \
929 return sprintf(buf, "%lx\n", (unsigned long)var); \
930 } \
931 static ssize_t set_ ## name(struct sys_device *s, \
932 struct sysdev_attribute *attr, \
933 const char *buf, size_t siz) { \
934 char *end; \
935 unsigned long new = simple_strtoul(buf, &end, 0); \
936 if (end == buf) return -EINVAL; \
937 var = new; \
938 start; \
939 return end-buf; \
940 } \
941 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
942
943static struct sysdev_attribute *bank_attrs;
944
945static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
946 char *buf)
947{
948 u64 b = bank[attr - bank_attrs];
949 return sprintf(buf, "%llx\n", b);
950}
951
952static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
953 const char *buf, size_t siz)
954{
955 char *end;
956 u64 new = simple_strtoull(buf, &end, 0);
957 if (end == buf)
958 return -EINVAL;
959 bank[attr - bank_attrs] = new;
960 mce_restart();
961 return end-buf;
962}
963
964static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
965 char *buf)
966{
967 strcpy(buf, trigger);
968 strcat(buf, "\n");
969 return strlen(trigger) + 1;
970}
971
972static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
973			const char *buf, size_t siz)
974{
975 char *p;
976 int len;
977 strncpy(trigger, buf, sizeof(trigger));
978 trigger[sizeof(trigger)-1] = 0;
979 len = strlen(trigger);
980 p = strchr(trigger, '\n');
981	if (p) *p = 0;
982 return len;
983}
984
985static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
986static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
987ACCESSOR(check_interval, check_interval, mce_restart())
988static struct sysdev_attribute *mce_attributes[] = {
989 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
990 NULL
991};
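A hedged sketch of poking the resulting sysfs controls from userspace. The path follows the "machinecheck" sysdev class registered above (one directory per CPU); writing check_interval goes through the ACCESSOR() store method, which calls mce_restart():

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/devices/system/machinecheck/machinecheck0/check_interval", "w");

		if (!f)
			return 1;
		fprintf(f, "%d\n", 300);	/* poll every 5 minutes */
		return fclose(f) ? 1 : 0;
	}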
992
993static cpumask_var_t mce_device_initialized;
994
995/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
996static __cpuinit int mce_create_device(unsigned int cpu)
997{
998 int err;
999 int i;
1000
1001 if (!mce_available(&boot_cpu_data))
1002 return -EIO;
1003
1004 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
1005 per_cpu(device_mce,cpu).id = cpu;
1006 per_cpu(device_mce,cpu).cls = &mce_sysclass;
1007
1008 err = sysdev_register(&per_cpu(device_mce,cpu));
1009 if (err)
1010 return err;
1011
1012 for (i = 0; mce_attributes[i]; i++) {
1013 err = sysdev_create_file(&per_cpu(device_mce,cpu),
1014 mce_attributes[i]);
1015 if (err)
1016 goto error;
1017 }
1018 for (i = 0; i < banks; i++) {
1019 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1020 &bank_attrs[i]);
1021 if (err)
1022 goto error2;
1023 }
1024 cpumask_set_cpu(cpu, mce_device_initialized);
1025
1026 return 0;
1027error2:
1028 while (--i >= 0) {
1029 sysdev_remove_file(&per_cpu(device_mce, cpu),
1030 &bank_attrs[i]);
1031 }
1032error:
1033 while (--i >= 0) {
1034 sysdev_remove_file(&per_cpu(device_mce,cpu),
1035 mce_attributes[i]);
1036 }
1037 sysdev_unregister(&per_cpu(device_mce,cpu));
1038
1039 return err;
1040}
1041
1042static __cpuinit void mce_remove_device(unsigned int cpu)
1043{
1044 int i;
1045
1046 if (!cpumask_test_cpu(cpu, mce_device_initialized))
1047 return;
1048
1049 for (i = 0; mce_attributes[i]; i++)
1050 sysdev_remove_file(&per_cpu(device_mce,cpu),
1051 mce_attributes[i]);
1052 for (i = 0; i < banks; i++)
1053 sysdev_remove_file(&per_cpu(device_mce, cpu),
1054 &bank_attrs[i]);
1055 sysdev_unregister(&per_cpu(device_mce,cpu));
1056 cpumask_clear_cpu(cpu, mce_device_initialized);
1057}
1058
1059/* Make sure there are no machine checks on offlined CPUs. */
1060static void mce_disable_cpu(void *h)
1061{
1062 int i;
1063 unsigned long action = *(unsigned long *)h;
1064
1065 if (!mce_available(&current_cpu_data))
1066 return;
1067 if (!(action & CPU_TASKS_FROZEN))
1068 cmci_clear();
1069 for (i = 0; i < banks; i++)
1070 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1071}
1072
1073static void mce_reenable_cpu(void *h)
1074{
1075 int i;
1076 unsigned long action = *(unsigned long *)h;
1077
1078 if (!mce_available(&current_cpu_data))
1079 return;
1080 if (!(action & CPU_TASKS_FROZEN))
1081 cmci_reenable();
1082 for (i = 0; i < banks; i++)
1083 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1084}
1085
1086/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1087static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
1088 unsigned long action, void *hcpu)
1089{
1090 unsigned int cpu = (unsigned long)hcpu;
1091 struct timer_list *t = &per_cpu(mce_timer, cpu);
1092
1093 switch (action) {
1094 case CPU_ONLINE:
1095 case CPU_ONLINE_FROZEN:
1096 mce_create_device(cpu);
1097 if (threshold_cpu_callback)
1098 threshold_cpu_callback(action, cpu);
1099 break;
1100 case CPU_DEAD:
1101 case CPU_DEAD_FROZEN:
1102 if (threshold_cpu_callback)
1103 threshold_cpu_callback(action, cpu);
1104 mce_remove_device(cpu);
1105 break;
1106 case CPU_DOWN_PREPARE:
1107 case CPU_DOWN_PREPARE_FROZEN:
1108 del_timer_sync(t);
1109 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1110 break;
1111 case CPU_DOWN_FAILED:
1112 case CPU_DOWN_FAILED_FROZEN:
1113 t->expires = round_jiffies(jiffies +
1114 __get_cpu_var(next_interval));
1115 add_timer_on(t, cpu);
1116 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1117 break;
1118 case CPU_POST_DEAD:
1119 /* intentionally ignoring frozen here */
1120 cmci_rediscover(cpu);
1121 break;
1122 }
1123 return NOTIFY_OK;
1124}
1125
1126static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1127 .notifier_call = mce_cpu_callback,
1128};
1129
1130static __init int mce_init_banks(void)
1131{
1132 int i;
1133
1134 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1135 GFP_KERNEL);
1136 if (!bank_attrs)
1137 return -ENOMEM;
1138
1139 for (i = 0; i < banks; i++) {
1140 struct sysdev_attribute *a = &bank_attrs[i];
1141 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1142 if (!a->attr.name)
1143 goto nomem;
1144 a->attr.mode = 0644;
1145 a->show = show_bank;
1146 a->store = set_bank;
1147 }
1148 return 0;
1149
1150nomem:
1151 while (--i >= 0)
1152 kfree(bank_attrs[i].attr.name);
1153 kfree(bank_attrs);
1154 bank_attrs = NULL;
1155 return -ENOMEM;
1156}
1157
1158static __init int mce_init_device(void)
1159{
1160 int err;
1161 int i = 0;
1162
1163 if (!mce_available(&boot_cpu_data))
1164 return -EIO;
1165
1166 alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
1167
1168 err = mce_init_banks();
1169 if (err)
1170 return err;
1171
1172 err = sysdev_class_register(&mce_sysclass);
1173 if (err)
1174 return err;
1175
1176 for_each_online_cpu(i) {
1177 err = mce_create_device(i);
1178 if (err)
1179 return err;
1180 }
1181
1182 register_hotcpu_notifier(&mce_cpu_notifier);
1183 misc_register(&mce_log_device);
1184 return err;
1185}
1186
1187device_initcall(mce_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 56dde9c4bc9..ddae21620bd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -13,22 +13,22 @@
13 * 13 *
14 * All MC4_MISCi registers are shared between multi-cores 14 * All MC4_MISCi registers are shared between multi-cores
15 */ 15 */
16
17#include <linux/cpu.h>
18#include <linux/errno.h>
19#include <linux/init.h>
20#include <linux/interrupt.h> 16#include <linux/interrupt.h>
21#include <linux/kobject.h>
22#include <linux/notifier.h> 17#include <linux/notifier.h>
23#include <linux/sched.h> 18#include <linux/kobject.h>
24#include <linux/smp.h> 19#include <linux/percpu.h>
25#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/errno.h>
22#include <linux/sched.h>
26#include <linux/sysfs.h> 23#include <linux/sysfs.h>
24#include <linux/init.h>
25#include <linux/cpu.h>
26#include <linux/smp.h>
27
27#include <asm/apic.h> 28#include <asm/apic.h>
29#include <asm/idle.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
29#include <asm/msr.h> 31#include <asm/msr.h>
30#include <asm/percpu.h>
31#include <asm/idle.h>
32 32
33#define PFX "mce_threshold: " 33#define PFX "mce_threshold: "
34#define VERSION "version 1.1.1" 34#define VERSION "version 1.1.1"
@@ -48,26 +48,26 @@
48#define MCG_XBLK_ADDR 0xC0000400 48#define MCG_XBLK_ADDR 0xC0000400
49 49
50struct threshold_block { 50struct threshold_block {
51 unsigned int block; 51 unsigned int block;
52 unsigned int bank; 52 unsigned int bank;
53 unsigned int cpu; 53 unsigned int cpu;
54 u32 address; 54 u32 address;
55 u16 interrupt_enable; 55 u16 interrupt_enable;
56 u16 threshold_limit; 56 u16 threshold_limit;
57 struct kobject kobj; 57 struct kobject kobj;
58 struct list_head miscj; 58 struct list_head miscj;
59}; 59};
60 60
61/* defaults used early on boot */ 61/* defaults used early on boot */
62static struct threshold_block threshold_defaults = { 62static struct threshold_block threshold_defaults = {
63 .interrupt_enable = 0, 63 .interrupt_enable = 0,
64 .threshold_limit = THRESHOLD_MAX, 64 .threshold_limit = THRESHOLD_MAX,
65}; 65};
66 66
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject *kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73 73
@@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void);
86 */ 86 */
87 87
88struct thresh_restart { 88struct thresh_restart {
89 struct threshold_block *b; 89 struct threshold_block *b;
90 int reset; 90 int reset;
91 u16 old_limit; 91 u16 old_limit;
92}; 92};
93 93
94/* must be called with correct cpu affinity */ 94/* must be called with correct cpu affinity */
@@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr)
110 } else if (tr->old_limit) { /* change limit w/o reset */ 110 } else if (tr->old_limit) { /* change limit w/o reset */
111 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 111 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
112 (tr->old_limit - tr->b->threshold_limit); 112 (tr->old_limit - tr->b->threshold_limit);
113
113 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 114 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
114 (new_count & THRESHOLD_MAX); 115 (new_count & THRESHOLD_MAX);
115 } 116 }
@@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr)
125/* cpu init entry point, called from mce.c with preempt off */ 126/* cpu init entry point, called from mce.c with preempt off */
126void mce_amd_feature_init(struct cpuinfo_x86 *c) 127void mce_amd_feature_init(struct cpuinfo_x86 *c)
127{ 128{
128 unsigned int bank, block;
129 unsigned int cpu = smp_processor_id(); 129 unsigned int cpu = smp_processor_id();
130 u8 lvt_off;
131 u32 low = 0, high = 0, address = 0; 130 u32 low = 0, high = 0, address = 0;
131 unsigned int bank, block;
132 struct thresh_restart tr; 132 struct thresh_restart tr;
133 u8 lvt_off;
133 134
134 for (bank = 0; bank < NR_BANKS; ++bank) { 135 for (bank = 0; bank < NR_BANKS; ++bank) {
135 for (block = 0; block < NR_BLOCKS; ++block) { 136 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
140 if (!address) 141 if (!address)
141 break; 142 break;
142 address += MCG_XBLK_ADDR; 143 address += MCG_XBLK_ADDR;
143 } 144 } else
144 else
145 ++address; 145 ++address;
146 146
147 if (rdmsr_safe(address, &low, &high)) 147 if (rdmsr_safe(address, &low, &high))
@@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
193 */ 193 */
194static void amd_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
195{ 195{
196 u32 low = 0, high = 0, address = 0;
196 unsigned int bank, block; 197 unsigned int bank, block;
197 struct mce m; 198 struct mce m;
198 u32 low = 0, high = 0, address = 0;
199 199
200 mce_setup(&m); 200 mce_setup(&m);
201 201
@@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void)
204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) 204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
205 continue; 205 continue;
206 for (block = 0; block < NR_BLOCKS; ++block) { 206 for (block = 0; block < NR_BLOCKS; ++block) {
207 if (block == 0) 207 if (block == 0) {
208 address = MSR_IA32_MC0_MISC + bank * 4; 208 address = MSR_IA32_MC0_MISC + bank * 4;
209 else if (block == 1) { 209 } else if (block == 1) {
210 address = (low & MASK_BLKPTR_LO) >> 21; 210 address = (low & MASK_BLKPTR_LO) >> 21;
211 if (!address) 211 if (!address)
212 break; 212 break;
213 address += MCG_XBLK_ADDR; 213 address += MCG_XBLK_ADDR;
214 } 214 } else {
215 else
216 ++address; 215 ++address;
216 }
217 217
218 if (rdmsr_safe(address, &low, &high)) 218 if (rdmsr_safe(address, &low, &high))
219 break; 219 break;
@@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void)
229 (high & MASK_LOCKED_HI)) 229 (high & MASK_LOCKED_HI))
230 continue; 230 continue;
231 231
232 /* Log the machine check that caused the threshold 232 /*
233 event. */ 233 * Log the machine check that caused the threshold
234 * event.
235 */
234 machine_check_poll(MCP_TIMESTAMP, 236 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks)); 237 &__get_cpu_var(mce_poll_banks));
236 238
@@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void)
254 256
255struct threshold_attr { 257struct threshold_attr {
256 struct attribute attr; 258 struct attribute attr;
257 ssize_t(*show) (struct threshold_block *, char *); 259 ssize_t (*show) (struct threshold_block *, char *);
258 ssize_t(*store) (struct threshold_block *, const char *, size_t count); 260 ssize_t (*store) (struct threshold_block *, const char *, size_t count);
259}; 261};
260 262
261#define SHOW_FIELDS(name) \ 263#define SHOW_FIELDS(name) \
262static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ 264static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
263{ \ 265{ \
264 return sprintf(buf, "%lx\n", (unsigned long) b->name); \ 266 return sprintf(buf, "%lx\n", (unsigned long) b->name); \
265} 267}
266SHOW_FIELDS(interrupt_enable) 268SHOW_FIELDS(interrupt_enable)
267SHOW_FIELDS(threshold_limit) 269SHOW_FIELDS(threshold_limit)
268 270
269static ssize_t store_interrupt_enable(struct threshold_block *b, 271static ssize_t
270 const char *buf, size_t count) 272store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
271{ 273{
272 char *end;
273 struct thresh_restart tr; 274 struct thresh_restart tr;
274 unsigned long new = simple_strtoul(buf, &end, 0); 275 unsigned long new;
275 if (end == buf) 276
277 if (strict_strtoul(buf, 0, &new) < 0)
276 return -EINVAL; 278 return -EINVAL;
279
277 b->interrupt_enable = !!new; 280 b->interrupt_enable = !!new;
278 281
279 tr.b = b; 282 tr.b = b;
280 tr.reset = 0; 283 tr.reset = 0;
281 tr.old_limit = 0; 284 tr.old_limit = 0;
285
282 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 286 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
283 287
284 return end - buf; 288 return size;
285} 289}
286 290
287static ssize_t store_threshold_limit(struct threshold_block *b, 291static ssize_t
288 const char *buf, size_t count) 292store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
289{ 293{
290 char *end;
291 struct thresh_restart tr; 294 struct thresh_restart tr;
292 unsigned long new = simple_strtoul(buf, &end, 0); 295 unsigned long new;
293 if (end == buf) 296
297 if (strict_strtoul(buf, 0, &new) < 0)
294 return -EINVAL; 298 return -EINVAL;
299
295 if (new > THRESHOLD_MAX) 300 if (new > THRESHOLD_MAX)
296 new = THRESHOLD_MAX; 301 new = THRESHOLD_MAX;
297 if (new < 1) 302 if (new < 1)
298 new = 1; 303 new = 1;
304
299 tr.old_limit = b->threshold_limit; 305 tr.old_limit = b->threshold_limit;
300 b->threshold_limit = new; 306 b->threshold_limit = new;
301 tr.b = b; 307 tr.b = b;
@@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
303 309
304 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 310 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
305 311
306 return end - buf; 312 return size;
307} 313}
308 314
309struct threshold_block_cross_cpu { 315struct threshold_block_cross_cpu {
310 struct threshold_block *tb; 316 struct threshold_block *tb;
311 long retval; 317 long retval;
312}; 318};
313 319
314static void local_error_count_handler(void *_tbcc) 320static void local_error_count_handler(void *_tbcc)
@@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b,
338 return 1; 344 return 1;
339} 345}
340 346
341#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ 347#define RW_ATTR(val) \
342 .attr = {.name = __stringify(_name), .mode = _mode }, \ 348static struct threshold_attr val = { \
343 .show = _show, \ 349 .attr = {.name = __stringify(val), .mode = 0644 }, \
344 .store = _store, \ 350 .show = show_## val, \
351 .store = store_## val, \
345}; 352};
346 353
347#define RW_ATTR(name) \
348static struct threshold_attr name = \
349 THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
350
351RW_ATTR(interrupt_enable); 354RW_ATTR(interrupt_enable);
352RW_ATTR(threshold_limit); 355RW_ATTR(threshold_limit);
353RW_ATTR(error_count); 356RW_ATTR(error_count);
@@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = {
359 NULL 362 NULL
360}; 363};
361 364
362#define to_block(k) container_of(k, struct threshold_block, kobj) 365#define to_block(k) container_of(k, struct threshold_block, kobj)
363#define to_attr(a) container_of(a, struct threshold_attr, attr) 366#define to_attr(a) container_of(a, struct threshold_attr, attr)
364 367
365static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) 368static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
366{ 369{
367 struct threshold_block *b = to_block(kobj); 370 struct threshold_block *b = to_block(kobj);
368 struct threshold_attr *a = to_attr(attr); 371 struct threshold_attr *a = to_attr(attr);
369 ssize_t ret; 372 ssize_t ret;
373
370 ret = a->show ? a->show(b, buf) : -EIO; 374 ret = a->show ? a->show(b, buf) : -EIO;
375
371 return ret; 376 return ret;
372} 377}
373 378
@@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
377 struct threshold_block *b = to_block(kobj); 382 struct threshold_block *b = to_block(kobj);
378 struct threshold_attr *a = to_attr(attr); 383 struct threshold_attr *a = to_attr(attr);
379 ssize_t ret; 384 ssize_t ret;
385
380 ret = a->store ? a->store(b, buf, count) : -EIO; 386 ret = a->store ? a->store(b, buf, count) : -EIO;
387
381 return ret; 388 return ret;
382} 389}
383 390
384static struct sysfs_ops threshold_ops = { 391static struct sysfs_ops threshold_ops = {
385 .show = show, 392 .show = show,
386 .store = store, 393 .store = store,
387}; 394};
388 395
389static struct kobj_type threshold_ktype = { 396static struct kobj_type threshold_ktype = {
390 .sysfs_ops = &threshold_ops, 397 .sysfs_ops = &threshold_ops,
391 .default_attrs = default_attrs, 398 .default_attrs = default_attrs,
392}; 399};
393 400
394static __cpuinit int allocate_threshold_blocks(unsigned int cpu, 401static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
@@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
396 unsigned int block, 403 unsigned int block,
397 u32 address) 404 u32 address)
398{ 405{
399 int err;
400 u32 low, high;
401 struct threshold_block *b = NULL; 406 struct threshold_block *b = NULL;
407 u32 low, high;
408 int err;
402 409
403 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) 410 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
404 return 0; 411 return 0;
@@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
421 if (!b) 428 if (!b)
422 return -ENOMEM; 429 return -ENOMEM;
423 430
424 b->block = block; 431 b->block = block;
425 b->bank = bank; 432 b->bank = bank;
426 b->cpu = cpu; 433 b->cpu = cpu;
427 b->address = address; 434 b->address = address;
428 b->interrupt_enable = 0; 435 b->interrupt_enable = 0;
429 b->threshold_limit = THRESHOLD_MAX; 436 b->threshold_limit = THRESHOLD_MAX;
430 437
431 INIT_LIST_HEAD(&b->miscj); 438 INIT_LIST_HEAD(&b->miscj);
432 439
433 if (per_cpu(threshold_banks, cpu)[bank]->blocks) 440 if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
434 list_add(&b->miscj, 441 list_add(&b->miscj,
435 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); 442 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
436 else 443 } else {
437 per_cpu(threshold_banks, cpu)[bank]->blocks = b; 444 per_cpu(threshold_banks, cpu)[bank]->blocks = b;
445 }
438 446
439 err = kobject_init_and_add(&b->kobj, &threshold_ktype, 447 err = kobject_init_and_add(&b->kobj, &threshold_ktype,
440 per_cpu(threshold_banks, cpu)[bank]->kobj, 448 per_cpu(threshold_banks, cpu)[bank]->kobj,
@@ -447,8 +455,9 @@ recurse:
447 if (!address) 455 if (!address)
448 return 0; 456 return 0;
449 address += MCG_XBLK_ADDR; 457 address += MCG_XBLK_ADDR;
450 } else 458 } else {
451 ++address; 459 ++address;
460 }
452 461
453 err = allocate_threshold_blocks(cpu, bank, ++block, address); 462 err = allocate_threshold_blocks(cpu, bank, ++block, address);
454 if (err) 463 if (err)
@@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
500 if (!b) 509 if (!b)
501 goto out; 510 goto out;
502 511
503 err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, 512 err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
504 b->kobj, name); 513 b->kobj, name);
505 if (err) 514 if (err)
506 goto out; 515 goto out;
507 516
508 cpumask_copy(b->cpus, cpu_core_mask(cpu)); 517 cpumask_copy(b->cpus, cpu_core_mask(cpu));
509 per_cpu(threshold_banks, cpu)[bank] = b; 518 per_cpu(threshold_banks, cpu)[bank] = b;
519
510 goto out; 520 goto out;
511 } 521 }
512#endif 522#endif
@@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
522 goto out; 532 goto out;
523 } 533 }
524 534
525 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); 535 b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
526 if (!b->kobj) 536 if (!b->kobj)
527 goto out_free; 537 goto out_free;
528 538
@@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
542 if (i == cpu) 552 if (i == cpu)
543 continue; 553 continue;
544 554
545 err = sysfs_create_link(&per_cpu(device_mce, i).kobj, 555 err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
546 b->kobj, name); 556 b->kobj, name);
547 if (err) 557 if (err)
548 goto out; 558 goto out;
@@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu,
605 615
606static void threshold_remove_bank(unsigned int cpu, int bank) 616static void threshold_remove_bank(unsigned int cpu, int bank)
607{ 617{
608 int i = 0;
609 struct threshold_bank *b; 618 struct threshold_bank *b;
610 char name[32]; 619 char name[32];
620 int i = 0;
611 621
612 b = per_cpu(threshold_banks, cpu)[bank]; 622 b = per_cpu(threshold_banks, cpu)[bank];
613
614 if (!b) 623 if (!b)
615 return; 624 return;
616
617 if (!b->blocks) 625 if (!b->blocks)
618 goto free_out; 626 goto free_out;
619 627
@@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
622#ifdef CONFIG_SMP 630#ifdef CONFIG_SMP
623 /* sibling symlink */ 631 /* sibling symlink */
624 if (shared_bank[bank] && b->blocks->cpu != cpu) { 632 if (shared_bank[bank] && b->blocks->cpu != cpu) {
625 sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); 633 sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
626 per_cpu(threshold_banks, cpu)[bank] = NULL; 634 per_cpu(threshold_banks, cpu)[bank] = NULL;
635
627 return; 636 return;
628 } 637 }
629#endif 638#endif
@@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
633 if (i == cpu) 642 if (i == cpu)
634 continue; 643 continue;
635 644
636 sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); 645 sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
637 per_cpu(threshold_banks, i)[bank] = NULL; 646 per_cpu(threshold_banks, i)[bank] = NULL;
638 } 647 }
639 648
@@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu)
659} 668}
660 669
661/* get notified when a cpu comes on/off */ 670/* get notified when a cpu comes on/off */
662static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, 671static void __cpuinit
663 unsigned int cpu) 672amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
664{ 673{
665 if (cpu >= NR_CPUS)
666 return;
667
668 switch (action) { 674 switch (action) {
669 case CPU_ONLINE: 675 case CPU_ONLINE:
670 case CPU_ONLINE_FROZEN: 676 case CPU_ONLINE_FROZEN:
@@ -686,11 +692,12 @@ static __init int threshold_init_device(void)
686 /* to hit CPUs online before the notifier is up */ 692 /* to hit CPUs online before the notifier is up */
687 for_each_online_cpu(lcpu) { 693 for_each_online_cpu(lcpu) {
688 int err = threshold_create_device(lcpu); 694 int err = threshold_create_device(lcpu);
695
689 if (err) 696 if (err)
690 return err; 697 return err;
691 } 698 }
692 threshold_cpu_callback = amd_64_threshold_cpu_callback; 699 threshold_cpu_callback = amd_64_threshold_cpu_callback;
700
693 return 0; 701 return 0;
694} 702}
695
696device_initcall(threshold_init_device); 703device_initcall(threshold_init_device);
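A hedged userspace sketch of adjusting one of the per-block threshold controls this file creates. The threshold_bank4/misc0 directory names are illustrative and vary with the CPU's bank/block layout; the write lands in store_threshold_limit(), which clamps the value to [1, THRESHOLD_MAX]:

	#include <stdio.h>

	int main(void)
	{
		const char *path =
			"/sys/devices/system/machinecheck/machinecheck0/threshold_bank4/misc0/threshold_limit";
		FILE *f = fopen(path, "w");

		if (!f)
			return 1;
		fprintf(f, "%d\n", 10);	/* new error-count threshold */
		return fclose(f) ? 1 : 0;
	}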
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
new file mode 100644
index 00000000000..2b011d2d857
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -0,0 +1,74 @@
1/*
2 * Common code for Intel machine checks
3 */
4#include <linux/interrupt.h>
5#include <linux/kernel.h>
6#include <linux/types.h>
7#include <linux/init.h>
8#include <linux/smp.h>
9
10#include <asm/therm_throt.h>
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/apic.h>
14#include <asm/msr.h>
15
16#include "mce.h"
17
18void intel_init_thermal(struct cpuinfo_x86 *c)
19{
20 unsigned int cpu = smp_processor_id();
21 int tm2 = 0;
22 u32 l, h;
23
24 /* Thermal monitoring depends on ACPI and clock modulation*/
25 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
26 return;
27
28 /*
29 * First check if its enabled already, in which case there might
30 * be some SMM goo which handles it, so we can't even put a handler
31 * since it might be delivered via SMI already:
32 */
33 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
34 h = apic_read(APIC_LVTTHMR);
35 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
36 printk(KERN_DEBUG
37 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
38 return;
39 }
40
41 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
42 tm2 = 1;
43
44 /* Check whether a vector already exists */
45 if (h & APIC_VECTOR_MASK) {
46 printk(KERN_DEBUG
47 "CPU%d: Thermal LVT vector (%#x) already installed\n",
48 cpu, (h & APIC_VECTOR_MASK));
49 return;
50 }
51
52 /* We'll mask the thermal vector in the lapic till we're ready: */
53 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
54 apic_write(APIC_LVTTHMR, h);
55
56 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
57 wrmsr(MSR_IA32_THERM_INTERRUPT,
58 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
59
60 intel_set_thermal_handler();
61
62 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
63 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
64
65 /* Unmask the thermal vector: */
66 l = apic_read(APIC_LVTTHMR);
67 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
68
69 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
70 cpu, tm2 ? "TM2" : "TM1");
71
72 /* enable thermal throttle processing */
73 atomic_set(&therm_throt_en, 1);
74}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index cef3ee30744..f2ef6952c40 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -15,7 +15,8 @@
15#include <asm/hw_irq.h> 15#include <asm/hw_irq.h>
16#include <asm/idle.h> 16#include <asm/idle.h>
17#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18#include <asm/apic.h> 18
19#include "mce.h"
19 20
20asmlinkage void smp_thermal_interrupt(void) 21asmlinkage void smp_thermal_interrupt(void)
21{ 22{
@@ -27,67 +28,13 @@ asmlinkage void smp_thermal_interrupt(void)
27 irq_enter(); 28 irq_enter();
28 29
29 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 30 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
30 if (therm_throt_process(msr_val & 1)) 31 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
31 mce_log_therm_throt_event(msr_val); 32 mce_log_therm_throt_event(msr_val);
32 33
33 inc_irq_stat(irq_thermal_count); 34 inc_irq_stat(irq_thermal_count);
34 irq_exit(); 35 irq_exit();
35} 36}
36 37
37static void intel_init_thermal(struct cpuinfo_x86 *c)
38{
39 u32 l, h;
40 int tm2 = 0;
41 unsigned int cpu = smp_processor_id();
42
43 if (!cpu_has(c, X86_FEATURE_ACPI))
44 return;
45
46 if (!cpu_has(c, X86_FEATURE_ACC))
47 return;
48
49 /* first check if TM1 is already enabled by the BIOS, in which
50 * case there might be some SMM goo which handles it, so we can't even
51 * put a handler since it might be delivered via SMI already.
52 */
53 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
54 h = apic_read(APIC_LVTTHMR);
55 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
56 printk(KERN_DEBUG
57 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
58 return;
59 }
60
61 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
62 tm2 = 1;
63
64 if (h & APIC_VECTOR_MASK) {
65 printk(KERN_DEBUG
66 "CPU%d: Thermal LVT vector (%#x) already "
67 "installed\n", cpu, (h & APIC_VECTOR_MASK));
68 return;
69 }
70
71 h = THERMAL_APIC_VECTOR;
72 h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
73 apic_write(APIC_LVTTHMR, h);
74
75 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
76 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
77
78 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
79 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
80
81 l = apic_read(APIC_LVTTHMR);
82 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
83 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
84 cpu, tm2 ? "TM2" : "TM1");
85
86 /* enable thermal throttle processing */
87 atomic_set(&therm_throt_en, 1);
88 return;
89}
90
91/* 38/*
92 * Support for Intel Correct Machine Check Interrupts. This allows 39 * Support for Intel Correct Machine Check Interrupts. This allows
93 * the CPU to raise an interrupt when a corrected machine check happened. 40 * the CPU to raise an interrupt when a corrected machine check happened.
@@ -109,6 +56,9 @@ static int cmci_supported(int *banks)
109{ 56{
110 u64 cap; 57 u64 cap;
111 58
59 if (mce_cmci_disabled || mce_ignore_ce)
60 return 0;
61
112 /* 62 /*
113 * Vendor check is not strictly needed, but the initial 63 * Vendor check is not strictly needed, but the initial
114 * initialization is vendor keyed and this 64 * initialization is vendor keyed and this
@@ -132,7 +82,7 @@ static int cmci_supported(int *banks)
132static void intel_threshold_interrupt(void) 82static void intel_threshold_interrupt(void)
133{ 83{
134 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); 84 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
135 mce_notify_user(); 85 mce_notify_irq();
136} 86}
137 87
138static void print_update(char *type, int *hdr, int num) 88static void print_update(char *type, int *hdr, int num)
@@ -248,7 +198,7 @@ void cmci_rediscover(int dying)
248 return; 198 return;
249 cpumask_copy(old, &current->cpus_allowed); 199 cpumask_copy(old, &current->cpus_allowed);
250 200
251 for_each_online_cpu (cpu) { 201 for_each_online_cpu(cpu) {
252 if (cpu == dying) 202 if (cpu == dying)
253 continue; 203 continue;
254 if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) 204 if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index a74af128efc..70b710420f7 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -6,15 +6,14 @@
6 * This file contains routines to check for non-fatal MCEs every 15s 6 * This file contains routines to check for non-fatal MCEs every 15s
7 * 7 *
8 */ 8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/jiffies.h>
14#include <linux/workqueue.h>
15#include <linux/interrupt.h> 9#include <linux/interrupt.h>
16#include <linux/smp.h> 10#include <linux/workqueue.h>
11#include <linux/jiffies.h>
12#include <linux/kernel.h>
17#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/init.h>
16#include <linux/smp.h>
18 17
19#include <asm/processor.h> 18#include <asm/processor.h>
20#include <asm/system.h> 19#include <asm/system.h>
@@ -22,9 +21,9 @@
22 21
23#include "mce.h" 22#include "mce.h"
24 23
25static int firstbank; 24static int firstbank;
26 25
27#define MCE_RATE 15*HZ /* timer rate is 15s */ 26#define MCE_RATE (15*HZ) /* timer rate is 15s */
28 27
29static void mce_checkregs(void *info) 28static void mce_checkregs(void *info)
30{ 29{
@@ -34,23 +33,24 @@ static void mce_checkregs(void *info)
34 for (i = firstbank; i < nr_mce_banks; i++) { 33 for (i = firstbank; i < nr_mce_banks; i++) {
35 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); 34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
36 35
37 if (high & (1<<31)) { 36 if (!(high & (1<<31)))
38 printk(KERN_INFO "MCE: The hardware reports a non " 37 continue;
39 "fatal, correctable incident occurred on " 38
40 "CPU %d.\n", 39 printk(KERN_INFO "MCE: The hardware reports a non fatal, "
40 "correctable incident occurred on CPU %d.\n",
41 smp_processor_id()); 41 smp_processor_id());
42 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); 42
43 43 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
44 /* 44
45 * Scrub the error so we don't pick it up in MCE_RATE 45 /*
46 * seconds time. 46 * Scrub the error so we don't pick it up in MCE_RATE
47 */ 47 * seconds time:
48 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); 48 */
49 49 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
50 /* Serialize */ 50
51 wmb(); 51 /* Serialize: */
52 add_taint(TAINT_MACHINE_CHECK); 52 wmb();
53 } 53 add_taint(TAINT_MACHINE_CHECK);
54 } 54 }
55} 55}
56 56
@@ -77,16 +77,17 @@ static int __init init_nonfatal_mce_checker(void)
77 77
78 /* Some Athlons misbehave when we frob bank 0 */ 78 /* Some Athlons misbehave when we frob bank 0 */
79 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && 79 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
80 boot_cpu_data.x86 == 6) 80 boot_cpu_data.x86 == 6)
81 firstbank = 1; 81 firstbank = 1;
82 else 82 else
83 firstbank = 0; 83 firstbank = 0;
84 84
85 /* 85 /*
86 * Check for non-fatal errors every MCE_RATE s 86 * Check for non-fatal errors every MCE_RATE s
87 */ 87 */
88 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); 88 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
89 printk(KERN_INFO "Machine check exception polling timer started.\n"); 89 printk(KERN_INFO "Machine check exception polling timer started.\n");
90
90 return 0; 91 return 0;
91} 92}
92module_init(init_nonfatal_mce_checker); 93module_init(init_nonfatal_mce_checker);
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index f53bdcbaf38..82cee108a2d 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -2,18 +2,17 @@
2 * P4 specific Machine Check Exception Reporting 2 * P4 specific Machine Check Exception Reporting
3 */ 3 */
4 4
5#include <linux/init.h>
6#include <linux/types.h>
7#include <linux/kernel.h>
8#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h> 9#include <linux/smp.h>
10 10
11#include <asm/therm_throt.h>
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/system.h> 13#include <asm/system.h>
13#include <asm/msr.h>
14#include <asm/apic.h> 14#include <asm/apic.h>
15 15#include <asm/msr.h>
16#include <asm/therm_throt.h>
17 16
18#include "mce.h" 17#include "mce.h"
19 18
@@ -36,6 +35,7 @@ static int mce_num_extended_msrs;
36 35
37 36
38#ifdef CONFIG_X86_MCE_P4THERMAL 37#ifdef CONFIG_X86_MCE_P4THERMAL
38
39static void unexpected_thermal_interrupt(struct pt_regs *regs) 39static void unexpected_thermal_interrupt(struct pt_regs *regs)
40{ 40{
41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", 41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
@@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs)
43 add_taint(TAINT_MACHINE_CHECK); 43 add_taint(TAINT_MACHINE_CHECK);
44} 44}
45 45
46/* P4/Xeon Thermal transition interrupt handler */ 46/* P4/Xeon Thermal transition interrupt handler: */
47static void intel_thermal_interrupt(struct pt_regs *regs) 47static void intel_thermal_interrupt(struct pt_regs *regs)
48{ 48{
49 __u64 msr_val; 49 __u64 msr_val;
@@ -51,11 +51,12 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
51 ack_APIC_irq(); 51 ack_APIC_irq();
52 52
53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
54 therm_throt_process(msr_val & 0x1); 54 therm_throt_process(msr_val & THERM_STATUS_PROCHOT);
55} 55}
56 56
57/* Thermal interrupt handler for this CPU setup */ 57/* Thermal interrupt handler for this CPU setup: */
58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; 58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) =
59 unexpected_thermal_interrupt;
59 60
60void smp_thermal_interrupt(struct pt_regs *regs) 61void smp_thermal_interrupt(struct pt_regs *regs)
61{ 62{
@@ -65,67 +66,15 @@ void smp_thermal_interrupt(struct pt_regs *regs)
65 irq_exit(); 66 irq_exit();
66} 67}
67 68
68/* P4/Xeon Thermal regulation detect and init */ 69void intel_set_thermal_handler(void)
69static void intel_init_thermal(struct cpuinfo_x86 *c)
70{ 70{
71 u32 l, h;
72 unsigned int cpu = smp_processor_id();
73
74 /* Thermal monitoring */
75 if (!cpu_has(c, X86_FEATURE_ACPI))
76 return; /* -ENODEV */
77
78 /* Clock modulation */
79 if (!cpu_has(c, X86_FEATURE_ACC))
80 return; /* -ENODEV */
81
82 /* first check if its enabled already, in which case there might
83 * be some SMM goo which handles it, so we can't even put a handler
84 * since it might be delivered via SMI already -zwanem.
85 */
86 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
87 h = apic_read(APIC_LVTTHMR);
88 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
90 cpu);
91 return; /* -EBUSY */
92 }
93
94 /* check whether a vector already exists, temporarily masked? */
95 if (h & APIC_VECTOR_MASK) {
96 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
97 "installed\n",
98 cpu, (h & APIC_VECTOR_MASK));
99 return; /* -EBUSY */
100 }
101
102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write(APIC_LVTTHMR, h);
106
107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
109
110 /* ok we're good to go... */
111 vendor_thermal_interrupt = intel_thermal_interrupt; 71 vendor_thermal_interrupt = intel_thermal_interrupt;
112
113 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
114 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
115
116 l = apic_read(APIC_LVTTHMR);
117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119
120 /* enable thermal throttle processing */
121 atomic_set(&therm_throt_en, 1);
122 return;
123} 72}
124#endif /* CONFIG_X86_MCE_P4THERMAL */
125 73
74#endif /* CONFIG_X86_MCE_P4THERMAL */
126 75
127/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ 76/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
128static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) 77static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
129{ 78{
130 u32 h; 79 u32 h;
131 80
@@ -143,9 +92,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
143 92
144static void intel_machine_check(struct pt_regs *regs, long error_code) 93static void intel_machine_check(struct pt_regs *regs, long error_code)
145{ 94{
146 int recover = 1;
147 u32 alow, ahigh, high, low; 95 u32 alow, ahigh, high, low;
148 u32 mcgstl, mcgsth; 96 u32 mcgstl, mcgsth;
97 int recover = 1;
149 int i; 98 int i;
150 99
151 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 100 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -157,7 +106,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
157 106
158 if (mce_num_extended_msrs > 0) { 107 if (mce_num_extended_msrs > 0) {
159 struct intel_mce_extended_msrs dbg; 108 struct intel_mce_extended_msrs dbg;
109
160 intel_get_extended_msrs(&dbg); 110 intel_get_extended_msrs(&dbg);
111
161 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" 112 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
162 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" 113 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
163 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", 114 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
@@ -171,6 +122,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
171 if (high & (1<<31)) { 122 if (high & (1<<31)) {
172 char misc[20]; 123 char misc[20];
173 char addr[24]; 124 char addr[24];
125
174 misc[0] = addr[0] = '\0'; 126 misc[0] = addr[0] = '\0';
175 if (high & (1<<29)) 127 if (high & (1<<29))
176 recover |= 1; 128 recover |= 1;
@@ -196,6 +148,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
196 panic("Unable to continue"); 148 panic("Unable to continue");
197 149
198 printk(KERN_EMERG "Attempting to continue.\n"); 150 printk(KERN_EMERG "Attempting to continue.\n");
151
199 /* 152 /*
200 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 153 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
201 * recoverable/continuable.This will allow BIOS to look at the MSRs 154 * recoverable/continuable.This will allow BIOS to look at the MSRs
@@ -217,7 +170,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
217 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 170 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
218} 171}
219 172
220
221void intel_p4_mcheck_init(struct cpuinfo_x86 *c) 173void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
222{ 174{
223 u32 l, h; 175 u32 l, h;
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index c9f77ea69ed..015f481ab1b 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -2,11 +2,10 @@
2 * P5 specific Machine Check Exception Reporting 2 * P5 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
@@ -15,39 +14,58 @@
15 14
16#include "mce.h" 15#include "mce.h"
17 16
18/* Machine check handler for Pentium class Intel */ 17/* By default disabled */
18int mce_p5_enable;
19
20/* Machine check handler for Pentium class Intel CPUs: */
19static void pentium_machine_check(struct pt_regs *regs, long error_code) 21static void pentium_machine_check(struct pt_regs *regs, long error_code)
20{ 22{
21 u32 loaddr, hi, lotype; 23 u32 loaddr, hi, lotype;
24
22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); 25 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
23 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); 26 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
24 printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); 27
25 if (lotype&(1<<5)) 28 printk(KERN_EMERG
26 printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); 29 "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
30 smp_processor_id(), loaddr, lotype);
31
32 if (lotype & (1<<5)) {
33 printk(KERN_EMERG
34 "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
35 smp_processor_id());
36 }
37
27 add_taint(TAINT_MACHINE_CHECK); 38 add_taint(TAINT_MACHINE_CHECK);
28} 39}
29 40
30/* Set up machine check reporting for processors with Intel style MCE */ 41/* Set up machine check reporting for processors with Intel style MCE: */
31void intel_p5_mcheck_init(struct cpuinfo_x86 *c) 42void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
32{ 43{
33 u32 l, h; 44 u32 l, h;
34 45
35 /*Check for MCE support */ 46 /* Check for MCE support: */
36 if (!cpu_has(c, X86_FEATURE_MCE)) 47 if (!cpu_has(c, X86_FEATURE_MCE))
37 return; 48 return;
38 49
39 /* Default P5 to off as its often misconnected */ 50#ifdef CONFIG_X86_OLD_MCE
51 /* Default P5 to off as its often misconnected: */
40 if (mce_disabled != -1) 52 if (mce_disabled != -1)
41 return; 53 return;
54#endif
55
42 machine_check_vector = pentium_machine_check; 56 machine_check_vector = pentium_machine_check;
57 /* Make sure the vector pointer is visible before we enable MCEs: */
43 wmb(); 58 wmb();
44 59
45 /* Read registers before enabling */ 60 /* Read registers before enabling: */
46 rdmsr(MSR_IA32_P5_MC_ADDR, l, h); 61 rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
47 rdmsr(MSR_IA32_P5_MC_TYPE, l, h); 62 rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
48 printk(KERN_INFO "Intel old style machine check architecture supported.\n"); 63 printk(KERN_INFO
64 "Intel old style machine check architecture supported.\n");
49 65
50 /* Enable MCE */ 66 /* Enable MCE: */
51 set_in_cr4(X86_CR4_MCE); 67 set_in_cr4(X86_CR4_MCE);
52 printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); 68 printk(KERN_INFO
69 "Intel old style machine check reporting enabled on CPU#%d.\n",
70 smp_processor_id());
53} 71}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 2ac52d7b434..43c24e66745 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -2,11 +2,10 @@
2 * P6 specific Machine Check Exception Reporting 2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
@@ -18,9 +17,9 @@
18/* Machine Check Handler For PII/PIII */ 17/* Machine Check Handler For PII/PIII */
19static void intel_machine_check(struct pt_regs *regs, long error_code) 18static void intel_machine_check(struct pt_regs *regs, long error_code)
20{ 19{
21 int recover = 1;
22 u32 alow, ahigh, high, low; 20 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth; 21 u32 mcgstl, mcgsth;
22 int recover = 1;
24 int i; 23 int i;
25 24
26 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 25 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -35,12 +34,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
35 if (high & (1<<31)) { 34 if (high & (1<<31)) {
36 char misc[20]; 35 char misc[20];
37 char addr[24]; 36 char addr[24];
38 misc[0] = addr[0] = '\0'; 37
38 misc[0] = '\0';
39 addr[0] = '\0';
40
39 if (high & (1<<29)) 41 if (high & (1<<29))
40 recover |= 1; 42 recover |= 1;
41 if (high & (1<<25)) 43 if (high & (1<<25))
42 recover |= 2; 44 recover |= 2;
43 high &= ~(1<<31); 45 high &= ~(1<<31);
46
44 if (high & (1<<27)) { 47 if (high & (1<<27)) {
45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); 48 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow); 49 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,6 +52,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 52 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
50 snprintf(addr, 24, " at %08x%08x", ahigh, alow); 53 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
51 } 54 }
55
52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", 56 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr); 57 smp_processor_id(), i, high, low, misc, addr);
54 } 58 }
@@ -63,16 +67,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
63 /* 67 /*
64 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 68 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
65 * recoverable/continuable.This will allow BIOS to look at the MSRs 69 * recoverable/continuable.This will allow BIOS to look at the MSRs
66 * for errors if the OS could not log the error. 70 * for errors if the OS could not log the error:
67 */ 71 */
68 for (i = 0; i < nr_mce_banks; i++) { 72 for (i = 0; i < nr_mce_banks; i++) {
69 unsigned int msr; 73 unsigned int msr;
74
70 msr = MSR_IA32_MC0_STATUS+i*4; 75 msr = MSR_IA32_MC0_STATUS+i*4;
71 rdmsr(msr, low, high); 76 rdmsr(msr, low, high);
72 if (high & (1<<31)) { 77 if (high & (1<<31)) {
73 /* Clear it */ 78 /* Clear it: */
74 wrmsr(msr, 0UL, 0UL); 79 wrmsr(msr, 0UL, 0UL);
75 /* Serialize */ 80 /* Serialize: */
76 wmb(); 81 wmb();
77 add_taint(TAINT_MACHINE_CHECK); 82 add_taint(TAINT_MACHINE_CHECK);
78 } 83 }
@@ -81,7 +86,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
81 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 86 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
82} 87}
83 88
84/* Set up machine check reporting for processors with Intel style MCE */ 89/* Set up machine check reporting for processors with Intel style MCE: */
85void intel_p6_mcheck_init(struct cpuinfo_x86 *c) 90void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
86{ 91{
87 u32 l, h; 92 u32 l, h;
@@ -97,6 +102,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
97 102
98 /* Ok machine check is available */ 103 /* Ok machine check is available */
99 machine_check_vector = intel_machine_check; 104 machine_check_vector = intel_machine_check;
105 /* Make sure the vector pointer is visible before we enable MCEs: */
100 wmb(); 106 wmb();
101 107
102 printk(KERN_INFO "Intel machine check architecture supported.\n"); 108 printk(KERN_INFO "Intel machine check architecture supported.\n");
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d5ae2243f0b..7b1ae2e20ba 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -1,7 +1,7 @@
1/* 1/*
2 *
3 * Thermal throttle event support code (such as syslog messaging and rate 2 * Thermal throttle event support code (such as syslog messaging and rate
4 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). 3 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
4 *
5 * This allows consistent reporting of CPU thermal throttle events. 5 * This allows consistent reporting of CPU thermal throttle events.
6 * 6 *
7 * Maintains a counter in /sys that keeps track of the number of thermal 7 * Maintains a counter in /sys that keeps track of the number of thermal
@@ -13,43 +13,43 @@
13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. 13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
14 * Inspired by Ross Biro's and Al Borchers' counter code. 14 * Inspired by Ross Biro's and Al Borchers' counter code.
15 */ 15 */
16 16#include <linux/notifier.h>
17#include <linux/jiffies.h>
17#include <linux/percpu.h> 18#include <linux/percpu.h>
18#include <linux/sysdev.h> 19#include <linux/sysdev.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <asm/cpu.h> 21
21#include <linux/notifier.h>
22#include <linux/jiffies.h>
23#include <asm/therm_throt.h> 22#include <asm/therm_throt.h>
24 23
25/* How long to wait between reporting thermal events */ 24/* How long to wait between reporting thermal events */
26#define CHECK_INTERVAL (300 * HZ) 25#define CHECK_INTERVAL (300 * HZ)
27 26
28static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; 27static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
29static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); 28static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
30atomic_t therm_throt_en = ATOMIC_INIT(0); 29
30atomic_t therm_throt_en = ATOMIC_INIT(0);
31 31
32#ifdef CONFIG_SYSFS 32#ifdef CONFIG_SYSFS
33#define define_therm_throt_sysdev_one_ro(_name) \ 33#define define_therm_throt_sysdev_one_ro(_name) \
34 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 34 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
35 35
36#define define_therm_throt_sysdev_show_func(name) \ 36#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \ 38 struct sysdev_attribute *attr, \
39 char *buf) \ 39 char *buf) \
40{ \ 40{ \
41 unsigned int cpu = dev->id; \ 41 unsigned int cpu = dev->id; \
42 ssize_t ret; \ 42 ssize_t ret; \
43 \ 43 \
44 preempt_disable(); /* CPU hotplug */ \ 44 preempt_disable(); /* CPU hotplug */ \
45 if (cpu_online(cpu)) \ 45 if (cpu_online(cpu)) \
46 ret = sprintf(buf, "%lu\n", \ 46 ret = sprintf(buf, "%lu\n", \
47 per_cpu(thermal_throttle_##name, cpu)); \ 47 per_cpu(thermal_throttle_##name, cpu)); \
48 else \ 48 else \
49 ret = 0; \ 49 ret = 0; \
50 preempt_enable(); \ 50 preempt_enable(); \
51 \ 51 \
52 return ret; \ 52 return ret; \
53} 53}
54 54
55define_therm_throt_sysdev_show_func(count); 55define_therm_throt_sysdev_show_func(count);
@@ -61,8 +61,8 @@ static struct attribute *thermal_throttle_attrs[] = {
61}; 61};
62 62
63static struct attribute_group thermal_throttle_attr_group = { 63static struct attribute_group thermal_throttle_attr_group = {
64 .attrs = thermal_throttle_attrs, 64 .attrs = thermal_throttle_attrs,
65 .name = "thermal_throttle" 65 .name = "thermal_throttle"
66}; 66};
67#endif /* CONFIG_SYSFS */ 67#endif /* CONFIG_SYSFS */
68 68
@@ -110,10 +110,11 @@ int therm_throt_process(int curr)
110} 110}
111 111
112#ifdef CONFIG_SYSFS 112#ifdef CONFIG_SYSFS
113/* Add/Remove thermal_throttle interface for CPU device */ 113/* Add/Remove thermal_throttle interface for CPU device: */
114static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 114static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
115{ 115{
116 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); 116 return sysfs_create_group(&sys_dev->kobj,
117 &thermal_throttle_attr_group);
117} 118}
118 119
119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 120static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
@@ -121,19 +122,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
121 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 122 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
122} 123}
123 124
124/* Mutex protecting device creation against CPU hotplug */ 125/* Mutex protecting device creation against CPU hotplug: */
125static DEFINE_MUTEX(therm_cpu_lock); 126static DEFINE_MUTEX(therm_cpu_lock);
126 127
127/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 128/* Get notified when a cpu comes on/off. Be hotplug friendly. */
128static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, 129static __cpuinit int
129 unsigned long action, 130thermal_throttle_cpu_callback(struct notifier_block *nfb,
130 void *hcpu) 131 unsigned long action,
132 void *hcpu)
131{ 133{
132 unsigned int cpu = (unsigned long)hcpu; 134 unsigned int cpu = (unsigned long)hcpu;
133 struct sys_device *sys_dev; 135 struct sys_device *sys_dev;
134 int err = 0; 136 int err = 0;
135 137
136 sys_dev = get_cpu_sysdev(cpu); 138 sys_dev = get_cpu_sysdev(cpu);
139
137 switch (action) { 140 switch (action) {
138 case CPU_UP_PREPARE: 141 case CPU_UP_PREPARE:
139 case CPU_UP_PREPARE_FROZEN: 142 case CPU_UP_PREPARE_FROZEN:
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 23ee9e730f7..d746df2909c 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -17,7 +17,7 @@ static void default_threshold_interrupt(void)
17 17
18void (*mce_threshold_vector)(void) = default_threshold_interrupt; 18void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19 19
20asmlinkage void mce_threshold_interrupt(void) 20asmlinkage void smp_threshold_interrupt(void)
21{ 21{
22 exit_idle(); 22 exit_idle();
23 irq_enter(); 23 irq_enter();
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 2a043d89811..81b02487090 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -2,11 +2,10 @@
2 * IDT Winchip specific Machine Check Exception Reporting 2 * IDT Winchip specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10 9
11#include <asm/processor.h> 10#include <asm/processor.h>
12#include <asm/system.h> 11#include <asm/system.h>
@@ -14,7 +13,7 @@
14 13
15#include "mce.h" 14#include "mce.h"
16 15
17/* Machine check handler for WinChip C6 */ 16/* Machine check handler for WinChip C6: */
18static void winchip_machine_check(struct pt_regs *regs, long error_code) 17static void winchip_machine_check(struct pt_regs *regs, long error_code)
19{ 18{
20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 19 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
@@ -25,12 +24,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
25void winchip_mcheck_init(struct cpuinfo_x86 *c) 24void winchip_mcheck_init(struct cpuinfo_x86 *c)
26{ 25{
27 u32 lo, hi; 26 u32 lo, hi;
27
28 machine_check_vector = winchip_machine_check; 28 machine_check_vector = winchip_machine_check;
29 /* Make sure the vector pointer is visible before we enable MCEs: */
29 wmb(); 30 wmb();
31
30 rdmsr(MSR_IDT_FCR1, lo, hi); 32 rdmsr(MSR_IDT_FCR1, lo, hi);
31 lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ 33 lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
32 lo &= ~(1<<4); /* Enable MCE */ 34 lo &= ~(1<<4); /* Enable MCE */
33 wrmsr(MSR_IDT_FCR1, lo, hi); 35 wrmsr(MSR_IDT_FCR1, lo, hi);
36
34 set_in_cr4(X86_CR4_MCE); 37 set_in_cr4(X86_CR4_MCE);
35 printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); 38
39 printk(KERN_INFO
40 "Winchip machine check reporting enabled on CPU#0.\n");
36} 41}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ce0fe4b5c04..1d584a18a50 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)
808 808
809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) 809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
810 return 0; 810 return 0;
811 rdmsr(MTRRdefType_MSR, def, dummy); 811 rdmsr(MSR_MTRRdefType, def, dummy);
812 def &= 0xff; 812 def &= 0xff;
813 if (def != MTRR_TYPE_UNCACHABLE) 813 if (def != MTRR_TYPE_UNCACHABLE)
814 return 0; 814 return 0;
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1003 */ 1003 */
1004 if (!is_cpu(INTEL) || disable_mtrr_trim) 1004 if (!is_cpu(INTEL) || disable_mtrr_trim)
1005 return 0; 1005 return 0;
1006 rdmsr(MTRRdefType_MSR, def, dummy); 1006 rdmsr(MSR_MTRRdefType, def, dummy);
1007 def &= 0xff; 1007 def &= 0xff;
1008 if (def != MTRR_TYPE_UNCACHABLE) 1008 if (def != MTRR_TYPE_UNCACHABLE)
1009 return 0; 1009 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0b776c09aff..0543f69f0b2 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -20,9 +20,9 @@ struct fixed_range_block {
20}; 20};
21 21
22static struct fixed_range_block fixed_range_blocks[] = { 22static struct fixed_range_block fixed_range_blocks[] = {
23 { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ 23 { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
24 { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ 24 { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
25 { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ 25 { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
26 {} 26 {}
27}; 27};
28 28
@@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs)
194 194
195 k8_check_syscfg_dram_mod_en(); 195 k8_check_syscfg_dram_mod_en();
196 196
197 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); 197 rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
198 198
199 for (i = 0; i < 2; i++) 199 for (i = 0; i < 2; i++)
200 rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); 200 rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
201 for (i = 0; i < 8; i++) 201 for (i = 0; i < 8; i++)
202 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); 202 rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
203} 203}
204 204
205void mtrr_save_fixed_ranges(void *info) 205void mtrr_save_fixed_ranges(void *info)
@@ -275,7 +275,11 @@ static void __init print_mtrr_state(void)
275 } 275 }
276 printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", 276 printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
277 mtrr_state.enabled & 2 ? "en" : "dis"); 277 mtrr_state.enabled & 2 ? "en" : "dis");
278 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; 278 if (size_or_mask & 0xffffffffUL)
279 high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
280 else
281 high_width = ffs(size_or_mask>>32) + 32 - 1;
282 high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
279 for (i = 0; i < num_var_ranges; ++i) { 283 for (i = 0; i < num_var_ranges; ++i) {
280 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) 284 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
281 printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", 285 printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n",
@@ -306,7 +310,7 @@ void __init get_mtrr_state(void)
306 310
307 vrs = mtrr_state.var_ranges; 311 vrs = mtrr_state.var_ranges;
308 312
309 rdmsr(MTRRcap_MSR, lo, dummy); 313 rdmsr(MSR_MTRRcap, lo, dummy);
310 mtrr_state.have_fixed = (lo >> 8) & 1; 314 mtrr_state.have_fixed = (lo >> 8) & 1;
311 315
312 for (i = 0; i < num_var_ranges; i++) 316 for (i = 0; i < num_var_ranges; i++)
@@ -314,7 +318,7 @@ void __init get_mtrr_state(void)
314 if (mtrr_state.have_fixed) 318 if (mtrr_state.have_fixed)
315 get_fixed_ranges(mtrr_state.fixed_ranges); 319 get_fixed_ranges(mtrr_state.fixed_ranges);
316 320
317 rdmsr(MTRRdefType_MSR, lo, dummy); 321 rdmsr(MSR_MTRRdefType, lo, dummy);
318 mtrr_state.def_type = (lo & 0xff); 322 mtrr_state.def_type = (lo & 0xff);
319 mtrr_state.enabled = (lo & 0xc00) >> 10; 323 mtrr_state.enabled = (lo & 0xc00) >> 10;
320 324
@@ -579,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
579 __flush_tlb(); 583 __flush_tlb();
580 584
581 /* Save MTRR state */ 585 /* Save MTRR state */
582 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 586 rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
583 587
584 /* Disable MTRRs, and set the default type to uncached */ 588 /* Disable MTRRs, and set the default type to uncached */
585 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); 589 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
586} 590}
587 591
588static void post_set(void) __releases(set_atomicity_lock) 592static void post_set(void) __releases(set_atomicity_lock)
@@ -591,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock)
591 __flush_tlb(); 595 __flush_tlb();
592 596
593 /* Intel (P6) standard MTRRs */ 597 /* Intel (P6) standard MTRRs */
594 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 598 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
595 599
596 /* Enable caches */ 600 /* Enable caches */
597 write_cr0(read_cr0() & 0xbfffffff); 601 write_cr0(read_cr0() & 0xbfffffff);
@@ -703,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
703static int generic_have_wrcomb(void) 707static int generic_have_wrcomb(void)
704{ 708{
705 unsigned long config, dummy; 709 unsigned long config, dummy;
706 rdmsr(MTRRcap_MSR, config, dummy); 710 rdmsr(MSR_MTRRcap, config, dummy);
707 return (config & (1 << 10)); 711 return (config & (1 << 10));
708} 712}
709 713
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 03cda01f57c..8fc248b5aea 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)
104 unsigned long config = 0, dummy; 104 unsigned long config = 0, dummy;
105 105
106 if (use_intel()) { 106 if (use_intel()) {
107 rdmsr(MTRRcap_MSR, config, dummy); 107 rdmsr(MSR_MTRRcap, config, dummy);
108 } else if (is_cpu(AMD)) 108 } else if (is_cpu(AMD))
109 config = 2; 109 config = 2;
110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) 110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 77f67f7b347..7538b767f20 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,21 +5,6 @@
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/stddef.h> 6#include <linux/stddef.h>
7 7
8#define MTRRcap_MSR 0x0fe
9#define MTRRdefType_MSR 0x2ff
10
11#define MTRRfix64K_00000_MSR 0x250
12#define MTRRfix16K_80000_MSR 0x258
13#define MTRRfix16K_A0000_MSR 0x259
14#define MTRRfix4K_C0000_MSR 0x268
15#define MTRRfix4K_C8000_MSR 0x269
16#define MTRRfix4K_D0000_MSR 0x26a
17#define MTRRfix4K_D8000_MSR 0x26b
18#define MTRRfix4K_E0000_MSR 0x26c
19#define MTRRfix4K_E8000_MSR 0x26d
20#define MTRRfix4K_F0000_MSR 0x26e
21#define MTRRfix4K_F8000_MSR 0x26f
22
23#define MTRR_CHANGE_MASK_FIXED 0x01 8#define MTRR_CHANGE_MASK_FIXED 0x01
24#define MTRR_CHANGE_MASK_VARIABLE 0x02 9#define MTRR_CHANGE_MASK_VARIABLE 0x02
25#define MTRR_CHANGE_MASK_DEFTYPE 0x04 10#define MTRR_CHANGE_MASK_DEFTYPE 0x04
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 7f7e2753685..1f5fb1588d1 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
35 35
36 if (use_intel()) 36 if (use_intel())
37 /* Save MTRR state */ 37 /* Save MTRR state */
38 rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); 38 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
39 else 39 else
40 /* Cyrix ARRs - everything else were excluded at the top */ 40 /* Cyrix ARRs - everything else were excluded at the top */
41 ctxt->ccr3 = getCx86(CX86_CCR3); 41 ctxt->ccr3 = getCx86(CX86_CCR3);
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
46{ 46{
47 if (use_intel()) 47 if (use_intel())
48 /* Disable MTRRs, and set the default type to uncached */ 48 /* Disable MTRRs, and set the default type to uncached */
49 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, 49 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
50 ctxt->deftype_hi); 50 ctxt->deftype_hi);
51 else if (is_cpu(CYRIX)) 51 else if (is_cpu(CYRIX))
52 /* Cyrix ARRs - everything else were excluded at the top */ 52 /* Cyrix ARRs - everything else were excluded at the top */
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
64 /* Restore MTRRdefType */ 64 /* Restore MTRRdefType */
65 if (use_intel()) 65 if (use_intel())
66 /* Intel (P6) standard MTRRs */ 66 /* Intel (P6) standard MTRRs */
67 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); 67 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
68 else 68 else
69 /* Cyrix ARRs - everything else was excluded at the top */ 69 /* Cyrix ARRs - everything else was excluded at the top */
70 setCx86(CX86_CCR3, ctxt->ccr3); 70 setCx86(CX86_CCR3, ctxt->ccr3);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 00000000000..275bc142cd5
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1711 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 *
10 * For licencing details see kernel-base/COPYING
11 */
12
13#include <linux/perf_counter.h>
14#include <linux/capability.h>
15#include <linux/notifier.h>
16#include <linux/hardirq.h>
17#include <linux/kprobes.h>
18#include <linux/module.h>
19#include <linux/kdebug.h>
20#include <linux/sched.h>
21#include <linux/uaccess.h>
22
23#include <asm/apic.h>
24#include <asm/stacktrace.h>
25#include <asm/nmi.h>
26
27static u64 perf_counter_mask __read_mostly;
28
29struct cpu_hw_counters {
30 struct perf_counter *counters[X86_PMC_IDX_MAX];
31 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
32 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
33 unsigned long interrupts;
34 int enabled;
35};
36
37/*
38 * struct x86_pmu - generic x86 pmu
39 */
40struct x86_pmu {
41 const char *name;
42 int version;
43 int (*handle_irq)(struct pt_regs *);
44 void (*disable_all)(void);
45 void (*enable_all)(void);
46 void (*enable)(struct hw_perf_counter *, int);
47 void (*disable)(struct hw_perf_counter *, int);
48 unsigned eventsel;
49 unsigned perfctr;
50 u64 (*event_map)(int);
51 u64 (*raw_event)(u64);
52 int max_events;
53 int num_counters;
54 int num_counters_fixed;
55 int counter_bits;
56 u64 counter_mask;
57 u64 max_period;
58 u64 intel_ctrl;
59};
60
61static struct x86_pmu x86_pmu __read_mostly;
62
63static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
64 .enabled = 1,
65};
66
67/*
68 * Intel PerfMon v3. Used on Core2 and later.
69 */
70static const u64 intel_perfmon_event_map[] =
71{
72 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
73 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
74 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
75 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
76 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
77 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
78 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
79};
80
81static u64 intel_pmu_event_map(int event)
82{
83 return intel_perfmon_event_map[event];
84}
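The codes in intel_perfmon_event_map follow the architectural EVNTSEL layout — event select in bits 0-7, unit mask in bits 8-15 — which is also what intel_pmu_raw_event() below masks out of a user-supplied raw config. A minimal standalone sketch (not part of the patch) that decodes one table entry:

	/* Decode 0x412e (PERF_COUNT_HW_CACHE_MISSES): event 0x2e, umask 0x41. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long code = 0x412e;
		unsigned int event_select = code & 0xff;         /* 0x2e: LLC event */
		unsigned int unit_mask    = (code >> 8) & 0xff;  /* 0x41: misses */

		printf("event=0x%02x umask=0x%02x\n", event_select, unit_mask);
		return 0;
	}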
85
86/*
87 * Generalized hw caching related event table, filled
88 * in on a per model basis. A value of 0 means
89 * 'not supported', -1 means 'event makes no sense on
90 * this CPU', any other value means the raw event
91 * ID.
92 */
93
94#define C(x) PERF_COUNT_HW_CACHE_##x
95
96static u64 __read_mostly hw_cache_event_ids
97 [PERF_COUNT_HW_CACHE_MAX]
98 [PERF_COUNT_HW_CACHE_OP_MAX]
99 [PERF_COUNT_HW_CACHE_RESULT_MAX];
100
101static const u64 nehalem_hw_cache_event_ids
102 [PERF_COUNT_HW_CACHE_MAX]
103 [PERF_COUNT_HW_CACHE_OP_MAX]
104 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
105{
106 [ C(L1D) ] = {
107 [ C(OP_READ) ] = {
108 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
109 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
110 },
111 [ C(OP_WRITE) ] = {
112 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
113 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
114 },
115 [ C(OP_PREFETCH) ] = {
116 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
117 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
118 },
119 },
120 [ C(L1I ) ] = {
121 [ C(OP_READ) ] = {
122 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
123 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
124 },
125 [ C(OP_WRITE) ] = {
126 [ C(RESULT_ACCESS) ] = -1,
127 [ C(RESULT_MISS) ] = -1,
128 },
129 [ C(OP_PREFETCH) ] = {
130 [ C(RESULT_ACCESS) ] = 0x0,
131 [ C(RESULT_MISS) ] = 0x0,
132 },
133 },
134 [ C(LL ) ] = {
135 [ C(OP_READ) ] = {
136 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
137 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
138 },
139 [ C(OP_WRITE) ] = {
140 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
141 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
142 },
143 [ C(OP_PREFETCH) ] = {
144 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
145 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
146 },
147 },
148 [ C(DTLB) ] = {
149 [ C(OP_READ) ] = {
150 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
151 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
152 },
153 [ C(OP_WRITE) ] = {
154 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
155 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
156 },
157 [ C(OP_PREFETCH) ] = {
158 [ C(RESULT_ACCESS) ] = 0x0,
159 [ C(RESULT_MISS) ] = 0x0,
160 },
161 },
162 [ C(ITLB) ] = {
163 [ C(OP_READ) ] = {
164 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
165 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
166 },
167 [ C(OP_WRITE) ] = {
168 [ C(RESULT_ACCESS) ] = -1,
169 [ C(RESULT_MISS) ] = -1,
170 },
171 [ C(OP_PREFETCH) ] = {
172 [ C(RESULT_ACCESS) ] = -1,
173 [ C(RESULT_MISS) ] = -1,
174 },
175 },
176 [ C(BPU ) ] = {
177 [ C(OP_READ) ] = {
178 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
179 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
180 },
181 [ C(OP_WRITE) ] = {
182 [ C(RESULT_ACCESS) ] = -1,
183 [ C(RESULT_MISS) ] = -1,
184 },
185 [ C(OP_PREFETCH) ] = {
186 [ C(RESULT_ACCESS) ] = -1,
187 [ C(RESULT_MISS) ] = -1,
188 },
189 },
190};
191
192static const u64 core2_hw_cache_event_ids
193 [PERF_COUNT_HW_CACHE_MAX]
194 [PERF_COUNT_HW_CACHE_OP_MAX]
195 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
196{
197 [ C(L1D) ] = {
198 [ C(OP_READ) ] = {
199 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
200 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
201 },
202 [ C(OP_WRITE) ] = {
203 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
204 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
205 },
206 [ C(OP_PREFETCH) ] = {
207 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
208 [ C(RESULT_MISS) ] = 0,
209 },
210 },
211 [ C(L1I ) ] = {
212 [ C(OP_READ) ] = {
213 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
214 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
215 },
216 [ C(OP_WRITE) ] = {
217 [ C(RESULT_ACCESS) ] = -1,
218 [ C(RESULT_MISS) ] = -1,
219 },
220 [ C(OP_PREFETCH) ] = {
221 [ C(RESULT_ACCESS) ] = 0,
222 [ C(RESULT_MISS) ] = 0,
223 },
224 },
225 [ C(LL ) ] = {
226 [ C(OP_READ) ] = {
227 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
228 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
229 },
230 [ C(OP_WRITE) ] = {
231 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
232 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
233 },
234 [ C(OP_PREFETCH) ] = {
235 [ C(RESULT_ACCESS) ] = 0,
236 [ C(RESULT_MISS) ] = 0,
237 },
238 },
239 [ C(DTLB) ] = {
240 [ C(OP_READ) ] = {
241 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
242 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
243 },
244 [ C(OP_WRITE) ] = {
245 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
246 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
247 },
248 [ C(OP_PREFETCH) ] = {
249 [ C(RESULT_ACCESS) ] = 0,
250 [ C(RESULT_MISS) ] = 0,
251 },
252 },
253 [ C(ITLB) ] = {
254 [ C(OP_READ) ] = {
255 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
256 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
257 },
258 [ C(OP_WRITE) ] = {
259 [ C(RESULT_ACCESS) ] = -1,
260 [ C(RESULT_MISS) ] = -1,
261 },
262 [ C(OP_PREFETCH) ] = {
263 [ C(RESULT_ACCESS) ] = -1,
264 [ C(RESULT_MISS) ] = -1,
265 },
266 },
267 [ C(BPU ) ] = {
268 [ C(OP_READ) ] = {
269 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
270 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
271 },
272 [ C(OP_WRITE) ] = {
273 [ C(RESULT_ACCESS) ] = -1,
274 [ C(RESULT_MISS) ] = -1,
275 },
276 [ C(OP_PREFETCH) ] = {
277 [ C(RESULT_ACCESS) ] = -1,
278 [ C(RESULT_MISS) ] = -1,
279 },
280 },
281};
282
283static const u64 atom_hw_cache_event_ids
284 [PERF_COUNT_HW_CACHE_MAX]
285 [PERF_COUNT_HW_CACHE_OP_MAX]
286 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
287{
288 [ C(L1D) ] = {
289 [ C(OP_READ) ] = {
290 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
291 [ C(RESULT_MISS) ] = 0,
292 },
293 [ C(OP_WRITE) ] = {
294 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
295 [ C(RESULT_MISS) ] = 0,
296 },
297 [ C(OP_PREFETCH) ] = {
298 [ C(RESULT_ACCESS) ] = 0x0,
299 [ C(RESULT_MISS) ] = 0,
300 },
301 },
302 [ C(L1I ) ] = {
303 [ C(OP_READ) ] = {
304 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
305 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
306 },
307 [ C(OP_WRITE) ] = {
308 [ C(RESULT_ACCESS) ] = -1,
309 [ C(RESULT_MISS) ] = -1,
310 },
311 [ C(OP_PREFETCH) ] = {
312 [ C(RESULT_ACCESS) ] = 0,
313 [ C(RESULT_MISS) ] = 0,
314 },
315 },
316 [ C(LL ) ] = {
317 [ C(OP_READ) ] = {
318 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
319 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
320 },
321 [ C(OP_WRITE) ] = {
322 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
323 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
324 },
325 [ C(OP_PREFETCH) ] = {
326 [ C(RESULT_ACCESS) ] = 0,
327 [ C(RESULT_MISS) ] = 0,
328 },
329 },
330 [ C(DTLB) ] = {
331 [ C(OP_READ) ] = {
332 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
333 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
334 },
335 [ C(OP_WRITE) ] = {
336 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
337 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
338 },
339 [ C(OP_PREFETCH) ] = {
340 [ C(RESULT_ACCESS) ] = 0,
341 [ C(RESULT_MISS) ] = 0,
342 },
343 },
344 [ C(ITLB) ] = {
345 [ C(OP_READ) ] = {
346 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
347 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
348 },
349 [ C(OP_WRITE) ] = {
350 [ C(RESULT_ACCESS) ] = -1,
351 [ C(RESULT_MISS) ] = -1,
352 },
353 [ C(OP_PREFETCH) ] = {
354 [ C(RESULT_ACCESS) ] = -1,
355 [ C(RESULT_MISS) ] = -1,
356 },
357 },
358 [ C(BPU ) ] = {
359 [ C(OP_READ) ] = {
360 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
361 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
362 },
363 [ C(OP_WRITE) ] = {
364 [ C(RESULT_ACCESS) ] = -1,
365 [ C(RESULT_MISS) ] = -1,
366 },
367 [ C(OP_PREFETCH) ] = {
368 [ C(RESULT_ACCESS) ] = -1,
369 [ C(RESULT_MISS) ] = -1,
370 },
371 },
372};
373
374static u64 intel_pmu_raw_event(u64 event)
375{
376#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
377#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
378#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
379#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
380#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
381
382#define CORE_EVNTSEL_MASK \
383 (CORE_EVNTSEL_EVENT_MASK | \
384 CORE_EVNTSEL_UNIT_MASK | \
385 CORE_EVNTSEL_EDGE_MASK | \
386 CORE_EVNTSEL_INV_MASK | \
387 CORE_EVNTSEL_COUNTER_MASK)
388
389 return event & CORE_EVNTSEL_MASK;
390}
391
392static const u64 amd_0f_hw_cache_event_ids
393 [PERF_COUNT_HW_CACHE_MAX]
394 [PERF_COUNT_HW_CACHE_OP_MAX]
395 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
396{
397 [ C(L1D) ] = {
398 [ C(OP_READ) ] = {
399 [ C(RESULT_ACCESS) ] = 0,
400 [ C(RESULT_MISS) ] = 0,
401 },
402 [ C(OP_WRITE) ] = {
403 [ C(RESULT_ACCESS) ] = 0,
404 [ C(RESULT_MISS) ] = 0,
405 },
406 [ C(OP_PREFETCH) ] = {
407 [ C(RESULT_ACCESS) ] = 0,
408 [ C(RESULT_MISS) ] = 0,
409 },
410 },
411 [ C(L1I ) ] = {
412 [ C(OP_READ) ] = {
413 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
414 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
415 },
416 [ C(OP_WRITE) ] = {
417 [ C(RESULT_ACCESS) ] = -1,
418 [ C(RESULT_MISS) ] = -1,
419 },
420 [ C(OP_PREFETCH) ] = {
421 [ C(RESULT_ACCESS) ] = 0,
422 [ C(RESULT_MISS) ] = 0,
423 },
424 },
425 [ C(LL ) ] = {
426 [ C(OP_READ) ] = {
427 [ C(RESULT_ACCESS) ] = 0,
428 [ C(RESULT_MISS) ] = 0,
429 },
430 [ C(OP_WRITE) ] = {
431 [ C(RESULT_ACCESS) ] = 0,
432 [ C(RESULT_MISS) ] = 0,
433 },
434 [ C(OP_PREFETCH) ] = {
435 [ C(RESULT_ACCESS) ] = 0,
436 [ C(RESULT_MISS) ] = 0,
437 },
438 },
439 [ C(DTLB) ] = {
440 [ C(OP_READ) ] = {
441 [ C(RESULT_ACCESS) ] = 0,
442 [ C(RESULT_MISS) ] = 0,
443 },
444 [ C(OP_WRITE) ] = {
445 [ C(RESULT_ACCESS) ] = 0,
446 [ C(RESULT_MISS) ] = 0,
447 },
448 [ C(OP_PREFETCH) ] = {
449 [ C(RESULT_ACCESS) ] = 0,
450 [ C(RESULT_MISS) ] = 0,
451 },
452 },
453 [ C(ITLB) ] = {
454 [ C(OP_READ) ] = {
 455 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
456 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
457 },
458 [ C(OP_WRITE) ] = {
459 [ C(RESULT_ACCESS) ] = -1,
460 [ C(RESULT_MISS) ] = -1,
461 },
462 [ C(OP_PREFETCH) ] = {
463 [ C(RESULT_ACCESS) ] = -1,
464 [ C(RESULT_MISS) ] = -1,
465 },
466 },
467 [ C(BPU ) ] = {
468 [ C(OP_READ) ] = {
469 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
470 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
471 },
472 [ C(OP_WRITE) ] = {
473 [ C(RESULT_ACCESS) ] = -1,
474 [ C(RESULT_MISS) ] = -1,
475 },
476 [ C(OP_PREFETCH) ] = {
477 [ C(RESULT_ACCESS) ] = -1,
478 [ C(RESULT_MISS) ] = -1,
479 },
480 },
481};
482
483/*
484 * AMD Performance Monitor K7 and later.
485 */
486static const u64 amd_perfmon_event_map[] =
487{
488 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
489 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
490 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
491 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
492 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
493 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
494};
495
496static u64 amd_pmu_event_map(int event)
497{
498 return amd_perfmon_event_map[event];
499}
500
501static u64 amd_pmu_raw_event(u64 event)
502{
503#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
504#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
505#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
506#define K7_EVNTSEL_INV_MASK 0x000800000ULL
507#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
508
509#define K7_EVNTSEL_MASK \
510 (K7_EVNTSEL_EVENT_MASK | \
511 K7_EVNTSEL_UNIT_MASK | \
512 K7_EVNTSEL_EDGE_MASK | \
513 K7_EVNTSEL_INV_MASK | \
514 K7_EVNTSEL_COUNTER_MASK)
515
516 return event & K7_EVNTSEL_MASK;
517}
518
519/*
520 * Propagate counter elapsed time into the generic counter.
521 * Can only be executed on the CPU where the counter is active.
522 * Returns the delta events processed.
523 */
524static u64
525x86_perf_counter_update(struct perf_counter *counter,
526 struct hw_perf_counter *hwc, int idx)
527{
528 int shift = 64 - x86_pmu.counter_bits;
529 u64 prev_raw_count, new_raw_count;
530 s64 delta;
531
532 /*
533 * Careful: an NMI might modify the previous counter value.
534 *
535 * Our tactic to handle this is to first atomically read and
536 * exchange a new raw count - then add that new-prev delta
537 * count to the generic counter atomically:
538 */
539again:
540 prev_raw_count = atomic64_read(&hwc->prev_count);
541 rdmsrl(hwc->counter_base + idx, new_raw_count);
542
543 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
544 new_raw_count) != prev_raw_count)
545 goto again;
546
547 /*
548 * Now we have the new raw value and have updated the prev
549 * timestamp already. We can now calculate the elapsed delta
550 * (counter-)time and add that to the generic counter.
551 *
552 * Careful, not all hw sign-extends above the physical width
553 * of the count.
554 */
555 delta = (new_raw_count << shift) - (prev_raw_count << shift);
556 delta >>= shift;
557
558 atomic64_add(delta, &counter->count);
559 atomic64_sub(delta, &hwc->period_left);
560
561 return new_raw_count;
562}
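The shift arithmetic above sign-extends counters that are narrower than 64 bits, so the delta comes out right even across a counter wrap. A standalone sketch of the same trick, assuming a 48-bit counter (the width used for the AMD PMU later in this file):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		int shift = 64 - 48;                       /* 48-bit counter, as assumed above */
		uint64_t prev_raw = 0xFFFFFFFFFFF0ULL;     /* just below the 48-bit wrap point */
		uint64_t new_raw  = 0x000000000010ULL;     /* counter wrapped past zero */
		int64_t delta;

		/* Shift both values up to bit 63, subtract, then shift back down:
		 * the subtraction happens on properly sign-extended quantities. */
		delta = (int64_t)(new_raw << shift) - (int64_t)(prev_raw << shift);
		delta >>= shift;

		printf("delta = %lld events\n", (long long)delta);  /* prints 32 */
		return 0;
	}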
563
564static atomic_t active_counters;
565static DEFINE_MUTEX(pmc_reserve_mutex);
566
567static bool reserve_pmc_hardware(void)
568{
569 int i;
570
571 if (nmi_watchdog == NMI_LOCAL_APIC)
572 disable_lapic_nmi_watchdog();
573
574 for (i = 0; i < x86_pmu.num_counters; i++) {
575 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
576 goto perfctr_fail;
577 }
578
579 for (i = 0; i < x86_pmu.num_counters; i++) {
580 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
581 goto eventsel_fail;
582 }
583
584 return true;
585
586eventsel_fail:
587 for (i--; i >= 0; i--)
588 release_evntsel_nmi(x86_pmu.eventsel + i);
589
590 i = x86_pmu.num_counters;
591
592perfctr_fail:
593 for (i--; i >= 0; i--)
594 release_perfctr_nmi(x86_pmu.perfctr + i);
595
596 if (nmi_watchdog == NMI_LOCAL_APIC)
597 enable_lapic_nmi_watchdog();
598
599 return false;
600}
601
602static void release_pmc_hardware(void)
603{
604 int i;
605
606 for (i = 0; i < x86_pmu.num_counters; i++) {
607 release_perfctr_nmi(x86_pmu.perfctr + i);
608 release_evntsel_nmi(x86_pmu.eventsel + i);
609 }
610
611 if (nmi_watchdog == NMI_LOCAL_APIC)
612 enable_lapic_nmi_watchdog();
613}
614
615static void hw_perf_counter_destroy(struct perf_counter *counter)
616{
617 if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
618 release_pmc_hardware();
619 mutex_unlock(&pmc_reserve_mutex);
620 }
621}
622
623static inline int x86_pmu_initialized(void)
624{
625 return x86_pmu.handle_irq != NULL;
626}
627
628static inline int
629set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
630{
631 unsigned int cache_type, cache_op, cache_result;
632 u64 config, val;
633
634 config = attr->config;
635
636 cache_type = (config >> 0) & 0xff;
637 if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
638 return -EINVAL;
639
640 cache_op = (config >> 8) & 0xff;
641 if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
642 return -EINVAL;
643
644 cache_result = (config >> 16) & 0xff;
645 if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
646 return -EINVAL;
647
648 val = hw_cache_event_ids[cache_type][cache_op][cache_result];
649
650 if (val == 0)
651 return -ENOENT;
652
653 if (val == -1)
654 return -EINVAL;
655
656 hwc->config |= val;
657
658 return 0;
659}
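set_ext_hw_attr() expects the cache type, operation and result packed into attr->config one byte each (type in bits 0-7, op in bits 8-15, result in bits 16-23), matching the hw_cache_event_ids indexing above. A small sketch (not part of the patch) of composing such a config; the numeric values assume the standard perf_counter cache enums (L1D=0, OP_READ=0, RESULT_MISS=1):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		unsigned int cache_type   = 0;   /* PERF_COUNT_HW_CACHE_L1D */
		unsigned int cache_op     = 0;   /* PERF_COUNT_HW_CACHE_OP_READ */
		unsigned int cache_result = 1;   /* PERF_COUNT_HW_CACHE_RESULT_MISS */

		/* Layout decoded by set_ext_hw_attr(): type | (op << 8) | (result << 16) */
		uint64_t config = cache_type | (cache_op << 8) | (cache_result << 16);

		printf("attr.config = 0x%llx\n", (unsigned long long)config);  /* 0x10000 */
		return 0;
	}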
660
661/*
662 * Setup the hardware configuration for a given attr_type
663 */
664static int __hw_perf_counter_init(struct perf_counter *counter)
665{
666 struct perf_counter_attr *attr = &counter->attr;
667 struct hw_perf_counter *hwc = &counter->hw;
668 int err;
669
670 if (!x86_pmu_initialized())
671 return -ENODEV;
672
673 err = 0;
674 if (!atomic_inc_not_zero(&active_counters)) {
675 mutex_lock(&pmc_reserve_mutex);
676 if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
677 err = -EBUSY;
678 else
679 atomic_inc(&active_counters);
680 mutex_unlock(&pmc_reserve_mutex);
681 }
682 if (err)
683 return err;
684
685 /*
686 * Generate PMC IRQs:
687 * (keep 'enabled' bit clear for now)
688 */
689 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
690
691 /*
692 * Count user and OS events unless requested not to.
693 */
694 if (!attr->exclude_user)
695 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
696 if (!attr->exclude_kernel)
697 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
698
699 if (!hwc->sample_period) {
700 hwc->sample_period = x86_pmu.max_period;
701 hwc->last_period = hwc->sample_period;
702 atomic64_set(&hwc->period_left, hwc->sample_period);
703 }
704
705 counter->destroy = hw_perf_counter_destroy;
706
707 /*
 708 * Raw event types provide the config in the event structure
709 */
710 if (attr->type == PERF_TYPE_RAW) {
711 hwc->config |= x86_pmu.raw_event(attr->config);
712 return 0;
713 }
714
715 if (attr->type == PERF_TYPE_HW_CACHE)
716 return set_ext_hw_attr(hwc, attr);
717
718 if (attr->config >= x86_pmu.max_events)
719 return -EINVAL;
720 /*
721 * The generic map:
722 */
723 hwc->config |= x86_pmu.event_map(attr->config);
724
725 return 0;
726}
727
728static void intel_pmu_disable_all(void)
729{
730 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
731}
732
733static void amd_pmu_disable_all(void)
734{
735 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
736 int idx;
737
738 if (!cpuc->enabled)
739 return;
740
741 cpuc->enabled = 0;
742 /*
743 * ensure we write the disable before we start disabling the
744 * counters proper, so that amd_pmu_enable_counter() does the
745 * right thing.
746 */
747 barrier();
748
749 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
750 u64 val;
751
752 if (!test_bit(idx, cpuc->active_mask))
753 continue;
754 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
755 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
756 continue;
757 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
758 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
759 }
760}
761
762void hw_perf_disable(void)
763{
764 if (!x86_pmu_initialized())
765 return;
766 return x86_pmu.disable_all();
767}
768
769static void intel_pmu_enable_all(void)
770{
771 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
772}
773
774static void amd_pmu_enable_all(void)
775{
776 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
777 int idx;
778
779 if (cpuc->enabled)
780 return;
781
782 cpuc->enabled = 1;
783 barrier();
784
785 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
786 u64 val;
787
788 if (!test_bit(idx, cpuc->active_mask))
789 continue;
790 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
791 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
792 continue;
793 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
794 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
795 }
796}
797
798void hw_perf_enable(void)
799{
800 if (!x86_pmu_initialized())
801 return;
802 x86_pmu.enable_all();
803}
804
805static inline u64 intel_pmu_get_status(void)
806{
807 u64 status;
808
809 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
810
811 return status;
812}
813
814static inline void intel_pmu_ack_status(u64 ack)
815{
816 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
817}
818
819static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
820{
821 int err;
822 err = checking_wrmsrl(hwc->config_base + idx,
823 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
824}
825
826static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
827{
828 int err;
829 err = checking_wrmsrl(hwc->config_base + idx,
830 hwc->config);
831}
832
833static inline void
834intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
835{
836 int idx = __idx - X86_PMC_IDX_FIXED;
837 u64 ctrl_val, mask;
838 int err;
839
840 mask = 0xfULL << (idx * 4);
841
842 rdmsrl(hwc->config_base, ctrl_val);
843 ctrl_val &= ~mask;
844 err = checking_wrmsrl(hwc->config_base, ctrl_val);
845}
846
847static inline void
848intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
849{
850 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
851 intel_pmu_disable_fixed(hwc, idx);
852 return;
853 }
854
855 x86_pmu_disable_counter(hwc, idx);
856}
857
858static inline void
859amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
860{
861 x86_pmu_disable_counter(hwc, idx);
862}
863
864static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
865
866/*
867 * Set the next IRQ period, based on the hwc->period_left value.
868 * To be called with the counter disabled in hw:
869 */
870static int
871x86_perf_counter_set_period(struct perf_counter *counter,
872 struct hw_perf_counter *hwc, int idx)
873{
874 s64 left = atomic64_read(&hwc->period_left);
875 s64 period = hwc->sample_period;
876 int err, ret = 0;
877
878 /*
 879 * If we are way outside a reasonable range then just skip forward:
880 */
881 if (unlikely(left <= -period)) {
882 left = period;
883 atomic64_set(&hwc->period_left, left);
884 hwc->last_period = period;
885 ret = 1;
886 }
887
888 if (unlikely(left <= 0)) {
889 left += period;
890 atomic64_set(&hwc->period_left, left);
891 hwc->last_period = period;
892 ret = 1;
893 }
894 /*
 895 * Quirk: certain CPUs don't like it if just 1 event is left:
896 */
897 if (unlikely(left < 2))
898 left = 2;
899
900 if (left > x86_pmu.max_period)
901 left = x86_pmu.max_period;
902
903 per_cpu(prev_left[idx], smp_processor_id()) = left;
904
905 /*
906 * The hw counter starts counting from this counter offset,
 907 * mark it to be able to extract future deltas:
908 */
909 atomic64_set(&hwc->prev_count, (u64)-left);
910
911 err = checking_wrmsrl(hwc->counter_base + idx,
912 (u64)(-left) & x86_pmu.counter_mask);
913
914 return ret;
915}
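Writing (u64)(-left), masked to the counter width, means the hardware counter overflows after exactly 'left' more increments, which is what triggers the next interrupt. A standalone sketch of that arithmetic, assuming a 48-bit counter width:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		int counter_bits = 48;                     /* assumed width */
		uint64_t counter_mask = (1ULL << counter_bits) - 1;
		int64_t left = 1000;                       /* events until the next interrupt */

		uint64_t start = (uint64_t)(-left) & counter_mask;
		uint64_t after = (start + left) & counter_mask;   /* value after 'left' events */

		printf("start=0x%llx after=0x%llx\n",
		       (unsigned long long)start, (unsigned long long)after);  /* 'after' wraps to 0 */
		return 0;
	}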
916
917static inline void
918intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
919{
920 int idx = __idx - X86_PMC_IDX_FIXED;
921 u64 ctrl_val, bits, mask;
922 int err;
923
924 /*
925 * Enable IRQ generation (0x8),
926 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
927 * if requested:
928 */
929 bits = 0x8ULL;
930 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
931 bits |= 0x2;
932 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
933 bits |= 0x1;
934 bits <<= (idx * 4);
935 mask = 0xfULL << (idx * 4);
936
937 rdmsrl(hwc->config_base, ctrl_val);
938 ctrl_val &= ~mask;
939 ctrl_val |= bits;
940 err = checking_wrmsrl(hwc->config_base, ctrl_val);
941}
942
943static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
944{
945 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
946 intel_pmu_enable_fixed(hwc, idx);
947 return;
948 }
949
950 x86_pmu_enable_counter(hwc, idx);
951}
952
953static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
954{
955 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
956
957 if (cpuc->enabled)
958 x86_pmu_enable_counter(hwc, idx);
959 else
960 x86_pmu_disable_counter(hwc, idx);
961}
962
963static int
964fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
965{
966 unsigned int event;
967
968 if (!x86_pmu.num_counters_fixed)
969 return -1;
970
971 /*
972 * Quirk, IA32_FIXED_CTRs do not work on current Atom processors:
973 */
974 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
975 boot_cpu_data.x86_model == 28)
976 return -1;
977
978 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
979
980 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
981 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
982 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
983 return X86_PMC_IDX_FIXED_CPU_CYCLES;
984 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
985 return X86_PMC_IDX_FIXED_BUS_CYCLES;
986
987 return -1;
988}
989
990/*
991 * Find a PMC slot for the freshly enabled / scheduled in counter:
992 */
993static int x86_pmu_enable(struct perf_counter *counter)
994{
995 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
996 struct hw_perf_counter *hwc = &counter->hw;
997 int idx;
998
999 idx = fixed_mode_idx(counter, hwc);
1000 if (idx >= 0) {
1001 /*
1002 * Try to get the fixed counter, if that is already taken
1003 * then try to get a generic counter:
1004 */
1005 if (test_and_set_bit(idx, cpuc->used_mask))
1006 goto try_generic;
1007
1008 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1009 /*
1010 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
1011 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1012 */
1013 hwc->counter_base =
1014 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1015 hwc->idx = idx;
1016 } else {
1017 idx = hwc->idx;
1018 /* Try to get the previous generic counter again */
1019 if (test_and_set_bit(idx, cpuc->used_mask)) {
1020try_generic:
1021 idx = find_first_zero_bit(cpuc->used_mask,
1022 x86_pmu.num_counters);
1023 if (idx == x86_pmu.num_counters)
1024 return -EAGAIN;
1025
1026 set_bit(idx, cpuc->used_mask);
1027 hwc->idx = idx;
1028 }
1029 hwc->config_base = x86_pmu.eventsel;
1030 hwc->counter_base = x86_pmu.perfctr;
1031 }
1032
1033 perf_counters_lapic_init();
1034
1035 x86_pmu.disable(hwc, idx);
1036
1037 cpuc->counters[idx] = counter;
1038 set_bit(idx, cpuc->active_mask);
1039
1040 x86_perf_counter_set_period(counter, hwc, idx);
1041 x86_pmu.enable(hwc, idx);
1042
1043 return 0;
1044}
1045
1046static void x86_pmu_unthrottle(struct perf_counter *counter)
1047{
1048 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1049 struct hw_perf_counter *hwc = &counter->hw;
1050
1051 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1052 cpuc->counters[hwc->idx] != counter))
1053 return;
1054
1055 x86_pmu.enable(hwc, hwc->idx);
1056}
1057
1058void perf_counter_print_debug(void)
1059{
1060 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1061 struct cpu_hw_counters *cpuc;
1062 unsigned long flags;
1063 int cpu, idx;
1064
1065 if (!x86_pmu.num_counters)
1066 return;
1067
1068 local_irq_save(flags);
1069
1070 cpu = smp_processor_id();
1071 cpuc = &per_cpu(cpu_hw_counters, cpu);
1072
1073 if (x86_pmu.version >= 2) {
1074 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1075 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1076 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1077 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1078
1079 pr_info("\n");
1080 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1081 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1082 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1083 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1084 }
1085 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
1086
1087 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1088 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1089 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1090
1091 prev_left = per_cpu(prev_left[idx], cpu);
1092
1093 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1094 cpu, idx, pmc_ctrl);
1095 pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
1096 cpu, idx, pmc_count);
1097 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1098 cpu, idx, prev_left);
1099 }
1100 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1101 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1102
1103 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1104 cpu, idx, pmc_count);
1105 }
1106 local_irq_restore(flags);
1107}
1108
1109static void x86_pmu_disable(struct perf_counter *counter)
1110{
1111 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1112 struct hw_perf_counter *hwc = &counter->hw;
1113 int idx = hwc->idx;
1114
1115 /*
1116 * Must be done before we disable, otherwise the nmi handler
1117 * could reenable again:
1118 */
1119 clear_bit(idx, cpuc->active_mask);
1120 x86_pmu.disable(hwc, idx);
1121
1122 /*
1123 * Make sure the cleared pointer becomes visible before we
1124 * (potentially) free the counter:
1125 */
1126 barrier();
1127
1128 /*
1129 * Drain the remaining delta count out of a counter
1130 * that we are disabling:
1131 */
1132 x86_perf_counter_update(counter, hwc, idx);
1133 cpuc->counters[idx] = NULL;
1134 clear_bit(idx, cpuc->used_mask);
1135}
1136
1137/*
1138 * Save and restart an expired counter. Called by NMI contexts,
1139 * so it has to be careful about preempting normal counter ops:
1140 */
1141static int intel_pmu_save_and_restart(struct perf_counter *counter)
1142{
1143 struct hw_perf_counter *hwc = &counter->hw;
1144 int idx = hwc->idx;
1145 int ret;
1146
1147 x86_perf_counter_update(counter, hwc, idx);
1148 ret = x86_perf_counter_set_period(counter, hwc, idx);
1149
1150 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1151 intel_pmu_enable_counter(hwc, idx);
1152
1153 return ret;
1154}
1155
1156static void intel_pmu_reset(void)
1157{
1158 unsigned long flags;
1159 int idx;
1160
1161 if (!x86_pmu.num_counters)
1162 return;
1163
1164 local_irq_save(flags);
1165
1166 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1167
1168 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1169 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1170 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1171 }
1172 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1173 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1174 }
1175
1176 local_irq_restore(flags);
1177}
1178
1179
1180/*
1181 * This handler is triggered by the local APIC, so the APIC IRQ handling
1182 * rules apply:
1183 */
1184static int intel_pmu_handle_irq(struct pt_regs *regs)
1185{
1186 struct perf_sample_data data;
1187 struct cpu_hw_counters *cpuc;
1188 int bit, cpu, loops;
1189 u64 ack, status;
1190
1191 data.regs = regs;
1192 data.addr = 0;
1193
1194 cpu = smp_processor_id();
1195 cpuc = &per_cpu(cpu_hw_counters, cpu);
1196
1197 perf_disable();
1198 status = intel_pmu_get_status();
1199 if (!status) {
1200 perf_enable();
1201 return 0;
1202 }
1203
1204 loops = 0;
1205again:
1206 if (++loops > 100) {
1207 WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
1208 perf_counter_print_debug();
1209 intel_pmu_reset();
1210 perf_enable();
1211 return 1;
1212 }
1213
1214 inc_irq_stat(apic_perf_irqs);
1215 ack = status;
1216 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1217 struct perf_counter *counter = cpuc->counters[bit];
1218
1219 clear_bit(bit, (unsigned long *) &status);
1220 if (!test_bit(bit, cpuc->active_mask))
1221 continue;
1222
1223 if (!intel_pmu_save_and_restart(counter))
1224 continue;
1225
1226 if (perf_counter_overflow(counter, 1, &data))
1227 intel_pmu_disable_counter(&counter->hw, bit);
1228 }
1229
1230 intel_pmu_ack_status(ack);
1231
1232 /*
1233 * Repeat if there is more work to be done:
1234 */
1235 status = intel_pmu_get_status();
1236 if (status)
1237 goto again;
1238
1239 perf_enable();
1240
1241 return 1;
1242}
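The handler keeps draining MSR_CORE_PERF_GLOBAL_STATUS until it reads back zero, bounded by the 100-iteration guard above. A compact sketch (not part of the patch) of walking the set bits of such a status word, equivalent to the for_each_bit() loop:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t status = 0x5;                       /* pretend counters 0 and 2 overflowed */

		while (status) {
			int bit = __builtin_ctzll(status);   /* lowest set bit = overflowed counter */
			status &= status - 1;                /* clear it */
			printf("counter %d overflowed\n", bit);
		}
		return 0;
	}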
1243
1244static int amd_pmu_handle_irq(struct pt_regs *regs)
1245{
1246 struct perf_sample_data data;
1247 struct cpu_hw_counters *cpuc;
1248 struct perf_counter *counter;
1249 struct hw_perf_counter *hwc;
1250 int cpu, idx, handled = 0;
1251 u64 val;
1252
1253 data.regs = regs;
1254 data.addr = 0;
1255
1256 cpu = smp_processor_id();
1257 cpuc = &per_cpu(cpu_hw_counters, cpu);
1258
1259 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1260 if (!test_bit(idx, cpuc->active_mask))
1261 continue;
1262
1263 counter = cpuc->counters[idx];
1264 hwc = &counter->hw;
1265
1266 val = x86_perf_counter_update(counter, hwc, idx);
1267 if (val & (1ULL << (x86_pmu.counter_bits - 1)))
1268 continue;
1269
1270 /*
1271 * counter overflow
1272 */
1273 handled = 1;
1274 data.period = counter->hw.last_period;
1275
1276 if (!x86_perf_counter_set_period(counter, hwc, idx))
1277 continue;
1278
1279 if (perf_counter_overflow(counter, 1, &data))
1280 amd_pmu_disable_counter(hwc, idx);
1281 }
1282
1283 if (handled)
1284 inc_irq_stat(apic_perf_irqs);
1285
1286 return handled;
1287}
1288
1289void smp_perf_pending_interrupt(struct pt_regs *regs)
1290{
1291 irq_enter();
1292 ack_APIC_irq();
1293 inc_irq_stat(apic_pending_irqs);
1294 perf_counter_do_pending();
1295 irq_exit();
1296}
1297
1298void set_perf_counter_pending(void)
1299{
1300 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1301}
1302
1303void perf_counters_lapic_init(void)
1304{
1305 if (!x86_pmu_initialized())
1306 return;
1307
1308 /*
1309 * Always use NMI for PMU
1310 */
1311 apic_write(APIC_LVTPC, APIC_DM_NMI);
1312}
1313
1314static int __kprobes
1315perf_counter_nmi_handler(struct notifier_block *self,
1316 unsigned long cmd, void *__args)
1317{
1318 struct die_args *args = __args;
1319 struct pt_regs *regs;
1320
1321 if (!atomic_read(&active_counters))
1322 return NOTIFY_DONE;
1323
1324 switch (cmd) {
1325 case DIE_NMI:
1326 case DIE_NMI_IPI:
1327 break;
1328
1329 default:
1330 return NOTIFY_DONE;
1331 }
1332
1333 regs = args->regs;
1334
1335 apic_write(APIC_LVTPC, APIC_DM_NMI);
1336 /*
1337 * Can't rely on the handled return value to say it was our NMI: two
1338 * counters could trigger 'simultaneously', raising two back-to-back NMIs.
1339 *
1340 * If the first NMI handles both, the latter will be empty and daze
1341 * the CPU.
1342 */
1343 x86_pmu.handle_irq(regs);
1344
1345 return NOTIFY_STOP;
1346}
1347
1348static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
1349 .notifier_call = perf_counter_nmi_handler,
1350 .next = NULL,
1351 .priority = 1
1352};
1353
1354static struct x86_pmu intel_pmu = {
1355 .name = "Intel",
1356 .handle_irq = intel_pmu_handle_irq,
1357 .disable_all = intel_pmu_disable_all,
1358 .enable_all = intel_pmu_enable_all,
1359 .enable = intel_pmu_enable_counter,
1360 .disable = intel_pmu_disable_counter,
1361 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
1362 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
1363 .event_map = intel_pmu_event_map,
1364 .raw_event = intel_pmu_raw_event,
1365 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
1366 /*
1367 * Intel PMCs cannot be accessed sanely above 32 bit width,
1368 * so we install an artificial 1<<31 period regardless of
1369 * the generic counter period:
1370 */
1371 .max_period = (1ULL << 31) - 1,
1372};
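The max_period comment in the initializer above boils down to a clamp applied when the next sampling period is programmed. A minimal sketch of that clamp; the helper name is illustrative, not the kernel's:

#include <stdint.h>

static inline uint64_t clamp_period(uint64_t wanted, uint64_t max_period)
{
        return wanted > max_period ? max_period : wanted;
}

/* e.g. clamp_period(1ULL << 40, (1ULL << 31) - 1) == 0x7fffffff */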
1373
1374static struct x86_pmu amd_pmu = {
1375 .name = "AMD",
1376 .handle_irq = amd_pmu_handle_irq,
1377 .disable_all = amd_pmu_disable_all,
1378 .enable_all = amd_pmu_enable_all,
1379 .enable = amd_pmu_enable_counter,
1380 .disable = amd_pmu_disable_counter,
1381 .eventsel = MSR_K7_EVNTSEL0,
1382 .perfctr = MSR_K7_PERFCTR0,
1383 .event_map = amd_pmu_event_map,
1384 .raw_event = amd_pmu_raw_event,
1385 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
1386 .num_counters = 4,
1387 .counter_bits = 48,
1388 .counter_mask = (1ULL << 48) - 1,
1389 /* use highest bit to detect overflow */
1390 .max_period = (1ULL << 47) - 1,
1391};
1392
1393static int intel_pmu_init(void)
1394{
1395 union cpuid10_edx edx;
1396 union cpuid10_eax eax;
1397 unsigned int unused;
1398 unsigned int ebx;
1399 int version;
1400
1401 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
1402 return -ENODEV;
1403
1404 /*
1405 * Check whether the Architectural PerfMon supports
1406 * Branch Misses Retired Event or not.
1407 */
1408 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
1409 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
1410 return -ENODEV;
1411
1412 version = eax.split.version_id;
1413 if (version < 2)
1414 return -ENODEV;
1415
1416 x86_pmu = intel_pmu;
1417 x86_pmu.version = version;
1418 x86_pmu.num_counters = eax.split.num_counters;
1419 x86_pmu.counter_bits = eax.split.bit_width;
1420 x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
1421
1422 /*
1423 * Quirk: v2 perfmon does not report fixed-purpose counters, so
1424 * assume at least 3 counters:
1425 */
1426 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
1427
1428 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1429
1430 /*
1431 * Install the hw-cache-events table:
1432 */
1433 switch (boot_cpu_data.x86_model) {
1434 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
1435 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
1436 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
1437 case 29: /* six-core 45 nm xeon "Dunnington" */
1438 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
1439 sizeof(hw_cache_event_ids));
1440
1441 pr_cont("Core2 events, ");
1442 break;
1443 default:
1444 case 26:
1445 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1446 sizeof(hw_cache_event_ids));
1447
1448 pr_cont("Nehalem/Corei7 events, ");
1449 break;
1450 case 28:
1451 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
1452 sizeof(hw_cache_event_ids));
1453
1454 pr_cont("Atom events, ");
1455 break;
1456 }
1457 return 0;
1458}
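intel_pmu_init() above pulls the version, counter count and counter width out of CPUID leaf 0xA. A small user-space program (GCC/Clang <cpuid.h>) reading the same fields, handy for sanity-checking what the kernel will detect on a given machine; the field layout follows the architectural perfmon definition:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx))
                return 1;                         /* leaf not supported */

        printf("perfmon version : %u\n", eax & 0xff);
        printf("generic counters: %u\n", (eax >> 8) & 0xff);
        printf("counter bits    : %u\n", (eax >> 16) & 0xff);
        printf("fixed counters  : %u\n", edx & 0x1f);
        return 0;
}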
1459
1460static int amd_pmu_init(void)
1461{
1462 x86_pmu = amd_pmu;
1463
1464 switch (boot_cpu_data.x86) {
1465 case 0x0f:
1466 case 0x10:
1467 case 0x11:
1468 memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
1469 sizeof(hw_cache_event_ids));
1470
1471 pr_cont("AMD Family 0f/10/11 events, ");
1472 break;
1473 }
1474 return 0;
1475}
1476
1477void __init init_hw_perf_counters(void)
1478{
1479 int err;
1480
1481 pr_info("Performance Counters: ");
1482
1483 switch (boot_cpu_data.x86_vendor) {
1484 case X86_VENDOR_INTEL:
1485 err = intel_pmu_init();
1486 break;
1487 case X86_VENDOR_AMD:
1488 err = amd_pmu_init();
1489 break;
1490 default:
1491 return;
1492 }
1493 if (err != 0) {
1494 pr_cont("no PMU driver, software counters only.\n");
1495 return;
1496 }
1497
1498 pr_cont("%s PMU driver.\n", x86_pmu.name);
1499
1500 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1501 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1502 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
1503 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1504 }
1505 perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
1506 perf_max_counters = x86_pmu.num_counters;
1507
1508 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1509 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1510 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
1511 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1512 }
1513
1514 perf_counter_mask |=
1515 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1516
1517 perf_counters_lapic_init();
1518 register_die_notifier(&perf_counter_nmi_notifier);
1519
1520 pr_info("... version: %d\n", x86_pmu.version);
1521 pr_info("... bit width: %d\n", x86_pmu.counter_bits);
1522 pr_info("... generic counters: %d\n", x86_pmu.num_counters);
1523 pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
1524 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1525 pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed);
1526 pr_info("... counter mask: %016Lx\n", perf_counter_mask);
1527}
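init_hw_perf_counters() above folds the generic counters into the low bits of perf_counter_mask and the fixed counters in above them. A tiny sketch of the resulting mask shape, assuming the fixed index base (X86_PMC_IDX_FIXED) is 32, matching the global-ctrl MSR layout:

#include <stdint.h>

static uint64_t build_counter_mask(int num_generic, int num_fixed)
{
        const int fixed_base = 32;      /* assumed X86_PMC_IDX_FIXED */
        uint64_t mask = (1ULL << num_generic) - 1;

        mask |= ((1ULL << num_fixed) - 1) << fixed_base;
        return mask;                    /* 4 generic, 3 fixed -> 0x70000000f */
}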
1528
1529static inline void x86_pmu_read(struct perf_counter *counter)
1530{
1531 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
1532}
1533
1534static const struct pmu pmu = {
1535 .enable = x86_pmu_enable,
1536 .disable = x86_pmu_disable,
1537 .read = x86_pmu_read,
1538 .unthrottle = x86_pmu_unthrottle,
1539};
1540
1541const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
1542{
1543 int err;
1544
1545 err = __hw_perf_counter_init(counter);
1546 if (err)
1547 return ERR_PTR(err);
1548
1549 return &pmu;
1550}
1551
1552/*
1553 * callchain support
1554 */
1555
1556static inline
1557void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
1558{
1559 if (entry->nr < MAX_STACK_DEPTH)
1560 entry->ip[entry->nr++] = ip;
1561}
1562
1563static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1564static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1565
1566
1567static void
1568backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
1569{
1570 /* Ignore warnings */
1571}
1572
1573static void backtrace_warning(void *data, char *msg)
1574{
1575 /* Ignore warnings */
1576}
1577
1578static int backtrace_stack(void *data, char *name)
1579{
1580 /* Don't bother with IRQ stacks for now */
1581 return -1;
1582}
1583
1584static void backtrace_address(void *data, unsigned long addr, int reliable)
1585{
1586 struct perf_callchain_entry *entry = data;
1587
1588 if (reliable)
1589 callchain_store(entry, addr);
1590}
1591
1592static const struct stacktrace_ops backtrace_ops = {
1593 .warning = backtrace_warning,
1594 .warning_symbol = backtrace_warning_symbol,
1595 .stack = backtrace_stack,
1596 .address = backtrace_address,
1597};
1598
1599static void
1600perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1601{
1602 unsigned long bp;
1603 char *stack;
1604 int nr = entry->nr;
1605
1606 callchain_store(entry, instruction_pointer(regs));
1607
1608 stack = ((char *)regs + sizeof(struct pt_regs));
1609#ifdef CONFIG_FRAME_POINTER
1610 bp = frame_pointer(regs);
1611#else
1612 bp = 0;
1613#endif
1614
1615 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
1616
1617 entry->kernel = entry->nr - nr;
1618}
1619
1620
1621struct stack_frame {
1622 const void __user *next_fp;
1623 unsigned long return_address;
1624};
1625
1626static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
1627{
1628 int ret;
1629
1630 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
1631 return 0;
1632
1633 ret = 1;
1634 pagefault_disable();
1635 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
1636 ret = 0;
1637 pagefault_enable();
1638
1639 return ret;
1640}
1641
1642static void
1643perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1644{
1645 struct stack_frame frame;
1646 const void __user *fp;
1647 int nr = entry->nr;
1648
1649 regs = (struct pt_regs *)current->thread.sp0 - 1;
1650 fp = (void __user *)regs->bp;
1651
1652 callchain_store(entry, regs->ip);
1653
1654 while (entry->nr < MAX_STACK_DEPTH) {
1655 frame.next_fp = NULL;
1656 frame.return_address = 0;
1657
1658 if (!copy_stack_frame(fp, &frame))
1659 break;
1660
1661 if ((unsigned long)fp < user_stack_pointer(regs))
1662 break;
1663
1664 callchain_store(entry, frame.return_address);
1665 fp = frame.next_fp;
1666 }
1667
1668 entry->user = entry->nr - nr;
1669}
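perf_callchain_user() above walks the sampled task's frame-pointer chain: each frame is {next_fp, return_address}, and the walk stops when the chain stops moving toward older frames. A user-space sketch of the same walk on the current thread, assuming a build that keeps frame pointers (e.g. -O0 -fno-omit-frame-pointer):

#include <stdint.h>
#include <stdio.h>

struct frame {
        struct frame *next_fp;          /* saved frame pointer  */
        void         *return_address;   /* saved return address */
};

__attribute__((noinline)) static void show_callers(void)
{
        struct frame *fp = __builtin_frame_address(0);
        int depth;

        for (depth = 0; fp && depth < 16; depth++) {
                printf("  called from %p\n", fp->return_address);
                if ((uintptr_t)fp->next_fp <= (uintptr_t)fp)
                        break;          /* chain must move toward older frames */
                fp = fp->next_fp;
        }
}

int main(void)
{
        show_callers();
        return 0;
}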
1670
1671static void
1672perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1673{
1674 int is_user;
1675
1676 if (!regs)
1677 return;
1678
1679 is_user = user_mode(regs);
1680
1681 if (!current || current->pid == 0)
1682 return;
1683
1684 if (is_user && current->state != TASK_RUNNING)
1685 return;
1686
1687 if (!is_user)
1688 perf_callchain_kernel(regs, entry);
1689
1690 if (current->mm)
1691 perf_callchain_user(regs, entry);
1692}
1693
1694struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1695{
1696 struct perf_callchain_entry *entry;
1697
1698 if (in_nmi())
1699 entry = &__get_cpu_var(nmi_entry);
1700 else
1701 entry = &__get_cpu_var(irq_entry);
1702
1703 entry->nr = 0;
1704 entry->hv = 0;
1705 entry->kernel = 0;
1706 entry->user = 0;
1707
1708 perf_do_callchain(regs, entry);
1709
1710 return entry;
1711}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e3..d6f5b9fbde3 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/genapic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2ac1f0c2beb..b07af886124 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -182,6 +182,11 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
182 .notifier_call = cpuid_class_cpu_callback, 182 .notifier_call = cpuid_class_cpu_callback,
183}; 183};
184 184
185static char *cpuid_nodename(struct device *dev)
186{
187 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
188}
189
185static int __init cpuid_init(void) 190static int __init cpuid_init(void)
186{ 191{
187 int i, err = 0; 192 int i, err = 0;
@@ -198,6 +203,7 @@ static int __init cpuid_init(void)
198 err = PTR_ERR(cpuid_class); 203 err = PTR_ERR(cpuid_class);
199 goto out_chrdev; 204 goto out_chrdev;
200 } 205 }
206 cpuid_class->nodename = cpuid_nodename;
201 for_each_online_cpu(i) { 207 for_each_online_cpu(i) {
202 err = cpuid_device_create(i); 208 err = cpuid_device_create(i);
203 if (err != 0) 209 if (err != 0)
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index da87590b869..81086c227ab 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
29 unsigned long *sp, unsigned long bp, char *log_lvl); 29 unsigned long *sp, unsigned long bp, char *log_lvl);
30 30
31extern unsigned int code_bytes; 31extern unsigned int code_bytes;
32extern int kstack_depth_to_print;
33 32
34/* The form of the top of the frame on the stack */ 33/* The form of the top of the frame on the stack */
35struct stack_frame { 34struct stack_frame {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 00628130292..7271fa33d79 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
617 */ 617 */
618__init void e820_setup_gap(void) 618__init void e820_setup_gap(void)
619{ 619{
620 unsigned long gapstart, gapsize, round; 620 unsigned long gapstart, gapsize;
621 int found; 621 int found;
622 622
623 gapstart = 0x10000000; 623 gapstart = 0x10000000;
@@ -635,14 +635,9 @@ __init void e820_setup_gap(void)
635#endif 635#endif
636 636
637 /* 637 /*
638 * See how much we want to round up: start off with 638 * e820_reserve_resources_late protect stolen RAM already
639 * rounding to the next 1MB area.
640 */ 639 */
641 round = 0x100000; 640 pci_mem_start = gapstart;
642 while ((gapsize >> 4) > round)
643 round += round;
644 /* Fun with two's complement */
645 pci_mem_start = (gapstart + round) & -round;
646 641
647 printk(KERN_INFO 642 printk(KERN_INFO
648 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", 643 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
@@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void)
1371 } 1366 }
1372} 1367}
1373 1368
1369/* How much should we pad RAM ending depending on where it is? */
1370static unsigned long ram_alignment(resource_size_t pos)
1371{
1372 unsigned long mb = pos >> 20;
1373
1374 /* To 64kB in the first megabyte */
1375 if (!mb)
1376 return 64*1024;
1377
1378 /* To 1MB in the first 16MB */
1379 if (mb < 16)
1380 return 1024*1024;
1381
1382 /* To 32MB for anything above that */
1383 return 32*1024*1024;
1384}
1385
1374void __init e820_reserve_resources_late(void) 1386void __init e820_reserve_resources_late(void)
1375{ 1387{
1376 int i; 1388 int i;
@@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void)
1382 insert_resource_expand_to_fit(&iomem_resource, res); 1394 insert_resource_expand_to_fit(&iomem_resource, res);
1383 res++; 1395 res++;
1384 } 1396 }
1397
1398 /*
1399 * Try to bump up RAM regions to reasonable boundaries to
1400 * avoid stolen RAM:
1401 */
1402 for (i = 0; i < e820.nr_map; i++) {
1403 struct e820entry *entry = &e820_saved.map[i];
1404 resource_size_t start, end;
1405
1406 if (entry->type != E820_RAM)
1407 continue;
1408 start = entry->addr + entry->size;
1409 end = round_up(start, ram_alignment(start));
1410 if (start == end)
1411 continue;
1412 reserve_region_with_split(&iomem_resource, start,
1413 end - 1, "RAM buffer");
1414 }
1385} 1415}
1386 1416
1387char *__init default_machine_specific_memory_setup(void) 1417char *__init default_machine_specific_memory_setup(void)
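The e820.c hunks above add ram_alignment() and use it to pad the end of every RAM region up to a 64 KiB / 1 MiB / 32 MiB boundary before reserving the gap as a "RAM buffer". A stand-alone harness showing what those boundaries work out to for a few example region ends:

#include <stdio.h>

static unsigned long ram_alignment(unsigned long long pos)
{
        unsigned long mb = pos >> 20;

        if (!mb)
                return 64 * 1024;               /* 64 KiB below 1 MiB  */
        if (mb < 16)
                return 1024 * 1024;             /* 1 MiB below 16 MiB  */
        return 32 * 1024 * 1024;                /* 32 MiB above that   */
}

#define round_up(x, a) (((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

int main(void)
{
        unsigned long long ends[] = { 0x9fc00ULL, 0xa00000ULL, 0x7ff00000ULL };
        int i;

        for (i = 0; i < 3; i++)
                printf("region ends at %#llx -> padded to %#llx\n",
                       ends[i], round_up(ends[i], ram_alignment(ends[i])));
        return 0;
}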
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 76b8cd953de..ebdb85cf268 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func)
97} 97}
98 98
99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) 99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100static u32 __init ati_ixp4x0_rev(int num, int slot, int func) 101static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
101{ 102{
102 u32 d; 103 u32 d;
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
114 d &= 0xff; 115 d &= 0xff;
115 return d; 116 return d;
116} 117}
118#endif
117 119
118static void __init ati_bugs(int num, int slot, int func) 120static void __init ati_bugs(int num, int slot, int func)
119{ 121{
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 987f91f0f75..de74f0a3e0e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -963,6 +963,8 @@ END(\sym)
963#ifdef CONFIG_SMP 963#ifdef CONFIG_SMP
964apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ 964apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
965 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt 965 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
966apicinterrupt REBOOT_VECTOR \
967 reboot_interrupt smp_reboot_interrupt
966#endif 968#endif
967 969
968#ifdef CONFIG_X86_UV 970#ifdef CONFIG_X86_UV
@@ -994,10 +996,15 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
994#endif 996#endif
995 997
996apicinterrupt THRESHOLD_APIC_VECTOR \ 998apicinterrupt THRESHOLD_APIC_VECTOR \
997 threshold_interrupt mce_threshold_interrupt 999 threshold_interrupt smp_threshold_interrupt
998apicinterrupt THERMAL_APIC_VECTOR \ 1000apicinterrupt THERMAL_APIC_VECTOR \
999 thermal_interrupt smp_thermal_interrupt 1001 thermal_interrupt smp_thermal_interrupt
1000 1002
1003#ifdef CONFIG_X86_MCE
1004apicinterrupt MCE_SELF_VECTOR \
1005 mce_self_interrupt smp_mce_self_interrupt
1006#endif
1007
1001#ifdef CONFIG_SMP 1008#ifdef CONFIG_SMP
1002apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ 1009apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1003 call_function_single_interrupt smp_call_function_single_interrupt 1010 call_function_single_interrupt smp_call_function_single_interrupt
@@ -1012,6 +1019,11 @@ apicinterrupt ERROR_APIC_VECTOR \
1012apicinterrupt SPURIOUS_APIC_VECTOR \ 1019apicinterrupt SPURIOUS_APIC_VECTOR \
1013 spurious_interrupt smp_spurious_interrupt 1020 spurious_interrupt smp_spurious_interrupt
1014 1021
1022#ifdef CONFIG_PERF_COUNTERS
1023apicinterrupt LOCAL_PENDING_VECTOR \
1024 perf_pending_interrupt smp_perf_pending_interrupt
1025#endif
1026
1015/* 1027/*
1016 * Exception entry points. 1028 * Exception entry points.
1017 */ 1029 */
@@ -1366,10 +1378,15 @@ END(xen_failsafe_callback)
1366paranoidzeroentry_ist debug do_debug DEBUG_STACK 1378paranoidzeroentry_ist debug do_debug DEBUG_STACK
1367paranoidzeroentry_ist int3 do_int3 DEBUG_STACK 1379paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1368paranoiderrorentry stack_segment do_stack_segment 1380paranoiderrorentry stack_segment do_stack_segment
1381#ifdef CONFIG_XEN
1382zeroentry xen_debug do_debug
1383zeroentry xen_int3 do_int3
1384errorentry xen_stack_segment do_stack_segment
1385#endif
1369errorentry general_protection do_general_protection 1386errorentry general_protection do_general_protection
1370errorentry page_fault do_page_fault 1387errorentry page_fault do_page_fault
1371#ifdef CONFIG_X86_MCE 1388#ifdef CONFIG_X86_MCE
1372paranoidzeroentry machine_check do_machine_check 1389paranoidzeroentry machine_check *machine_check_vector(%rip)
1373#endif 1390#endif
1374 1391
1375 /* 1392 /*
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 18dfa30795c..b79c5533c42 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -442,7 +442,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
442 _ASM_EXTABLE(1b, 4b) 442 _ASM_EXTABLE(1b, 4b)
443 _ASM_EXTABLE(2b, 4b) 443 _ASM_EXTABLE(2b, 4b)
444 444
445 : [old] "=r" (old), [faulted] "=r" (faulted) 445 : [old] "=&r" (old), [faulted] "=r" (faulted)
446 : [parent] "r" (parent), [return_hooker] "r" (return_hooker) 446 : [parent] "r" (parent), [return_hooker] "r" (return_hooker)
447 : "memory" 447 : "memory"
448 ); 448 );
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 30683883e0c..dc5ed4bdd88 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -608,13 +608,6 @@ ignore_int:
608ENTRY(initial_code) 608ENTRY(initial_code)
609 .long i386_start_kernel 609 .long i386_start_kernel
610 610
611.section .text
612/*
613 * Real beginning of normal "text" segment
614 */
615ENTRY(stext)
616ENTRY(_stext)
617
618/* 611/*
619 * BSS section 612 * BSS section
620 */ 613 */
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 69451473dbd..51d959528b1 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -91,7 +91,7 @@ void arch_update_kernel_hw_breakpoint(void *unused)
91 */ 91 */
92 kdr7 = temp_kdr7; 92 kdr7 = temp_kdr7;
93 set_debugreg(kdr7 | current->thread.debugreg7, 7); 93 set_debugreg(kdr7 | current->thread.debugreg7, 7);
94 put_cpu_no_resched(); 94 put_cpu();
95} 95}
96 96
97/* 97/*
@@ -374,7 +374,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
374 rc = NOTIFY_DONE; 374 rc = NOTIFY_DONE;
375 375
376 set_debugreg(dr7, 7); 376 set_debugreg(dr7, 7);
377 put_cpu_no_resched(); 377 put_cpu();
378 return rc; 378 return rc;
379} 379}
380 380
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c2e0bb0890d..5cf36c053ac 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -7,6 +7,7 @@
7#include <linux/spinlock.h> 7#include <linux/spinlock.h>
8#include <linux/jiffies.h> 8#include <linux/jiffies.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/timex.h>
10#include <linux/delay.h> 11#include <linux/delay.h>
11#include <linux/init.h> 12#include <linux/init.h>
12#include <linux/io.h> 13#include <linux/io.h>
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index df3bf269bea..270ff83efc1 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -12,7 +12,6 @@
12 12
13static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 13static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 14static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
15struct mm_struct init_mm = INIT_MM(init_mm);
16 15
17/* 16/*
18 * Initial thread structure. 17 * Initial thread structure.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c3fe010d74c..b0cdde6932f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,8 @@
12#include <asm/io_apic.h> 12#include <asm/io_apic.h>
13#include <asm/irq.h> 13#include <asm/irq.h>
14#include <asm/idle.h> 14#include <asm/idle.h>
15#include <asm/mce.h>
16#include <asm/hw_irq.h>
15 17
16atomic_t irq_err_count; 18atomic_t irq_err_count;
17 19
@@ -24,9 +26,9 @@ void (*generic_interrupt_extension)(void) = NULL;
24 */ 26 */
25void ack_bad_irq(unsigned int irq) 27void ack_bad_irq(unsigned int irq)
26{ 28{
27 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); 29 if (printk_ratelimit())
30 pr_err("unexpected IRQ trap at vector %02x\n", irq);
28 31
29#ifdef CONFIG_X86_LOCAL_APIC
30 /* 32 /*
31 * Currently unexpected vectors happen only on SMP and APIC. 33 * Currently unexpected vectors happen only on SMP and APIC.
32 * We _must_ ack these because every local APIC has only N 34 * We _must_ ack these because every local APIC has only N
@@ -36,9 +38,7 @@ void ack_bad_irq(unsigned int irq)
36 * completely. 38 * completely.
37 * But only ack when the APIC is enabled -AK 39 * But only ack when the APIC is enabled -AK
38 */ 40 */
39 if (cpu_has_apic) 41 ack_APIC_irq();
40 ack_APIC_irq();
41#endif
42} 42}
43 43
44#define irq_stats(x) (&per_cpu(irq_stat, x)) 44#define irq_stats(x) (&per_cpu(irq_stat, x))
@@ -63,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
63 for_each_online_cpu(j) 63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n"); 65 seq_printf(p, " Spurious interrupts\n");
66 seq_printf(p, "%*s: ", prec, "CNT");
67 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance counter interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND");
71 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n");
66#endif 74#endif
67 if (generic_interrupt_extension) { 75 if (generic_interrupt_extension) {
68 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
@@ -89,13 +97,23 @@ static int show_other_interrupts(struct seq_file *p, int prec)
89 for_each_online_cpu(j) 97 for_each_online_cpu(j)
90 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
91 seq_printf(p, " Thermal event interrupts\n"); 99 seq_printf(p, " Thermal event interrupts\n");
92# ifdef CONFIG_X86_64 100# ifdef CONFIG_X86_MCE_THRESHOLD
93 seq_printf(p, "%*s: ", prec, "THR"); 101 seq_printf(p, "%*s: ", prec, "THR");
94 for_each_online_cpu(j) 102 for_each_online_cpu(j)
95 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 103 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
96 seq_printf(p, " Threshold APIC interrupts\n"); 104 seq_printf(p, " Threshold APIC interrupts\n");
97# endif 105# endif
98#endif 106#endif
107#ifdef CONFIG_X86_NEW_MCE
108 seq_printf(p, "%*s: ", prec, "MCE");
109 for_each_online_cpu(j)
110 seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
111 seq_printf(p, " Machine check exceptions\n");
112 seq_printf(p, "%*s: ", prec, "MCP");
113 for_each_online_cpu(j)
114 seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
115 seq_printf(p, " Machine check polls\n");
116#endif
99 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); 117 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
100#if defined(CONFIG_X86_IO_APIC) 118#if defined(CONFIG_X86_IO_APIC)
101 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); 119 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
@@ -166,6 +184,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
166#ifdef CONFIG_X86_LOCAL_APIC 184#ifdef CONFIG_X86_LOCAL_APIC
167 sum += irq_stats(cpu)->apic_timer_irqs; 185 sum += irq_stats(cpu)->apic_timer_irqs;
168 sum += irq_stats(cpu)->irq_spurious_count; 186 sum += irq_stats(cpu)->irq_spurious_count;
187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs;
169#endif 189#endif
170 if (generic_interrupt_extension) 190 if (generic_interrupt_extension)
171 sum += irq_stats(cpu)->generic_irqs; 191 sum += irq_stats(cpu)->generic_irqs;
@@ -176,9 +196,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
176#endif 196#endif
177#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_MCE
178 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
179# ifdef CONFIG_X86_64 199# ifdef CONFIG_X86_MCE_THRESHOLD
180 sum += irq_stats(cpu)->irq_threshold_count; 200 sum += irq_stats(cpu)->irq_threshold_count;
201# endif
181#endif 202#endif
203#ifdef CONFIG_X86_NEW_MCE
204 sum += per_cpu(mce_exception_count, cpu);
205 sum += per_cpu(mce_poll_count, cpu);
182#endif 206#endif
183 return sum; 207 return sum;
184} 208}
@@ -213,14 +237,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
213 irq = __get_cpu_var(vector_irq)[vector]; 237 irq = __get_cpu_var(vector_irq)[vector];
214 238
215 if (!handle_irq(irq, regs)) { 239 if (!handle_irq(irq, regs)) {
216#ifdef CONFIG_X86_64 240 ack_APIC_irq();
217 if (!disable_apic)
218 ack_APIC_irq();
219#endif
220 241
221 if (printk_ratelimit()) 242 if (printk_ratelimit())
222 printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", 243 pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
223 __func__, smp_processor_id(), vector, irq); 244 __func__, smp_processor_id(), vector, irq);
224 } 245 }
225 246
226 irq_exit(); 247 irq_exit();
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c
index 368b0a8836f..696f0e475c2 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit.c
@@ -1,20 +1,25 @@
1#include <linux/linkage.h>
1#include <linux/errno.h> 2#include <linux/errno.h>
2#include <linux/signal.h> 3#include <linux/signal.h>
3#include <linux/sched.h> 4#include <linux/sched.h>
4#include <linux/ioport.h> 5#include <linux/ioport.h>
5#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h>
6#include <linux/slab.h> 8#include <linux/slab.h>
7#include <linux/random.h> 9#include <linux/random.h>
10#include <linux/kprobes.h>
8#include <linux/init.h> 11#include <linux/init.h>
9#include <linux/kernel_stat.h> 12#include <linux/kernel_stat.h>
10#include <linux/sysdev.h> 13#include <linux/sysdev.h>
11#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/acpi.h>
12#include <linux/io.h> 16#include <linux/io.h>
13#include <linux/delay.h> 17#include <linux/delay.h>
14 18
15#include <asm/atomic.h> 19#include <asm/atomic.h>
16#include <asm/system.h> 20#include <asm/system.h>
17#include <asm/timer.h> 21#include <asm/timer.h>
22#include <asm/hw_irq.h>
18#include <asm/pgtable.h> 23#include <asm/pgtable.h>
19#include <asm/desc.h> 24#include <asm/desc.h>
20#include <asm/apic.h> 25#include <asm/apic.h>
@@ -22,7 +27,23 @@
22#include <asm/i8259.h> 27#include <asm/i8259.h>
23#include <asm/traps.h> 28#include <asm/traps.h>
24 29
30/*
31 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
32 * (these are usually mapped to vectors 0x30-0x3f)
33 */
34
35/*
36 * The IO-APIC gives us many more interrupt sources. Most of these
37 * are unused but an SMP system is supposed to have enough memory ...
38 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
39 * across the spectrum, so we really want to be prepared to get all
40 * of these. Plus, more powerful systems might have more than 64
41 * IO-APIC registers.
42 *
43 * (these are usually mapped into the 0x30-0xff vector range)
44 */
25 45
46#ifdef CONFIG_X86_32
26/* 47/*
27 * Note that on a 486, we don't want to do a SIGFPE on an irq13 48 * Note that on a 486, we don't want to do a SIGFPE on an irq13
28 * as the irq is unreliable, and exception 16 works correctly 49 * as the irq is unreliable, and exception 16 works correctly
@@ -52,30 +73,7 @@ static struct irqaction fpu_irq = {
52 .handler = math_error_irq, 73 .handler = math_error_irq,
53 .name = "fpu", 74 .name = "fpu",
54}; 75};
55
56void __init init_ISA_irqs(void)
57{
58 int i;
59
60#ifdef CONFIG_X86_LOCAL_APIC
61 init_bsp_APIC();
62#endif 76#endif
63 init_8259A(0);
64
65 /*
66 * 16 old-style INTA-cycle interrupts:
67 */
68 for (i = 0; i < NR_IRQS_LEGACY; i++) {
69 struct irq_desc *desc = irq_to_desc(i);
70
71 desc->status = IRQ_DISABLED;
72 desc->action = NULL;
73 desc->depth = 1;
74
75 set_irq_chip_and_handler_name(i, &i8259A_chip,
76 handle_level_irq, "XT");
77 }
78}
79 77
80/* 78/*
81 * IRQ2 is cascade interrupt to second interrupt controller 79 * IRQ2 is cascade interrupt to second interrupt controller
@@ -118,29 +116,37 @@ int vector_used_by_percpu_irq(unsigned int vector)
118 return 0; 116 return 0;
119} 117}
120 118
121/* Overridden in paravirt.c */ 119static void __init init_ISA_irqs(void)
122void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
123
124void __init native_init_IRQ(void)
125{ 120{
126 int i; 121 int i;
127 122
128 /* Execute any quirks before the call gates are initialised: */ 123#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
129 x86_quirk_pre_intr_init(); 124 init_bsp_APIC();
125#endif
126 init_8259A(0);
130 127
131 /* 128 /*
132 * Cover the whole vector space, no vector can escape 129 * 16 old-style INTA-cycle interrupts:
133 * us. (some of these will be overridden and become
134 * 'special' SMP interrupts)
135 */ 130 */
136 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 131 for (i = 0; i < NR_IRQS_LEGACY; i++) {
137 /* SYSCALL_VECTOR was reserved in trap_init. */ 132 struct irq_desc *desc = irq_to_desc(i);
138 if (i != SYSCALL_VECTOR) 133
139 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); 134 desc->status = IRQ_DISABLED;
135 desc->action = NULL;
136 desc->depth = 1;
137
138 set_irq_chip_and_handler_name(i, &i8259A_chip,
139 handle_level_irq, "XT");
140 } 140 }
141}
141 142
143/* Overridden in paravirt.c */
144void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
142 145
143#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) 146static void __init smp_intr_init(void)
147{
148#ifdef CONFIG_SMP
149#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
144 /* 150 /*
145 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 151 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
146 * IPI, driven by wakeup. 152 * IPI, driven by wakeup.
@@ -160,16 +166,35 @@ void __init native_init_IRQ(void)
160 /* IPI for generic function call */ 166 /* IPI for generic function call */
161 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 167 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
162 168
163 /* IPI for single call function */ 169 /* IPI for generic single function call */
164 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, 170 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
165 call_function_single_interrupt); 171 call_function_single_interrupt);
166 172
167 /* Low priority IPI to cleanup after moving an irq */ 173 /* Low priority IPI to cleanup after moving an irq */
168 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 174 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
169 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); 175 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
176
177 /* IPI used for rebooting/stopping */
178 alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
170#endif 179#endif
180#endif /* CONFIG_SMP */
181}
182
183static void __init apic_intr_init(void)
184{
185 smp_intr_init();
171 186
172#ifdef CONFIG_X86_LOCAL_APIC 187#ifdef CONFIG_X86_THERMAL_VECTOR
188 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
189#endif
190#ifdef CONFIG_X86_THRESHOLD
191 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
192#endif
193#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
194 alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
195#endif
196
197#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
173 /* self generated IPI for local APIC timer */ 198 /* self generated IPI for local APIC timer */
174 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 199 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
175 200
@@ -179,16 +204,59 @@ void __init native_init_IRQ(void)
179 /* IPI vectors for APIC spurious and error interrupts */ 204 /* IPI vectors for APIC spurious and error interrupts */
180 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 205 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
181 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 206 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
207
208 /* Performance monitoring interrupts: */
209# ifdef CONFIG_PERF_COUNTERS
210 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
211# endif
212
182#endif 213#endif
214}
183 215
184#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) 216/**
185 /* thermal monitor LVT interrupt */ 217 * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
186 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 218 *
219 * Description:
220 * Perform any necessary interrupt initialisation prior to setting up
221 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
222 * interrupts should be initialised here if the machine emulates a PC
223 * in any way.
224 **/
225static void __init x86_quirk_pre_intr_init(void)
226{
227#ifdef CONFIG_X86_32
228 if (x86_quirks->arch_pre_intr_init) {
229 if (x86_quirks->arch_pre_intr_init())
230 return;
231 }
187#endif 232#endif
233 init_ISA_irqs();
234}
235
236void __init native_init_IRQ(void)
237{
238 int i;
239
240 /* Execute any quirks before the call gates are initialised: */
241 x86_quirk_pre_intr_init();
242
243 apic_intr_init();
244
245 /*
246 * Cover the whole vector space, no vector can escape
247 * us. (some of these will be overridden and become
248 * 'special' SMP interrupts)
249 */
250 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
251 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
252 if (!test_bit(i, used_vectors))
253 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
254 }
188 255
189 if (!acpi_ioapic) 256 if (!acpi_ioapic)
190 setup_irq(2, &irq2); 257 setup_irq(2, &irq2);
191 258
259#ifdef CONFIG_X86_32
192 /* 260 /*
193 * Call quirks after call gates are initialised (usually add in 261 * Call quirks after call gates are initialised (usually add in
194 * the architecture specific gates): 262 * the architecture specific gates):
@@ -203,4 +271,5 @@ void __init native_init_IRQ(void)
203 setup_irq(FPU_IRQ, &fpu_irq); 271 setup_irq(FPU_IRQ, &fpu_irq);
204 272
205 irq_ctx_init(smp_processor_id()); 273 irq_ctx_init(smp_processor_id());
274#endif
206} 275}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
deleted file mode 100644
index 8cd10537fd4..00000000000
--- a/arch/x86/kernel/irqinit_64.c
+++ /dev/null
@@ -1,177 +0,0 @@
1#include <linux/linkage.h>
2#include <linux/errno.h>
3#include <linux/signal.h>
4#include <linux/sched.h>
5#include <linux/ioport.h>
6#include <linux/interrupt.h>
7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h>
10#include <linux/init.h>
11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h>
13#include <linux/bitops.h>
14#include <linux/acpi.h>
15#include <linux/io.h>
16#include <linux/delay.h>
17
18#include <asm/atomic.h>
19#include <asm/system.h>
20#include <asm/hw_irq.h>
21#include <asm/pgtable.h>
22#include <asm/desc.h>
23#include <asm/apic.h>
24#include <asm/i8259.h>
25
26/*
27 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
28 * (these are usually mapped to vectors 0x30-0x3f)
29 */
30
31/*
32 * The IO-APIC gives us many more interrupt sources. Most of these
33 * are unused but an SMP system is supposed to have enough memory ...
34 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
35 * across the spectrum, so we really want to be prepared to get all
36 * of these. Plus, more powerful systems might have more than 64
37 * IO-APIC registers.
38 *
39 * (these are usually mapped into the 0x30-0xff vector range)
40 */
41
42/*
43 * IRQ2 is cascade interrupt to second interrupt controller
44 */
45
46static struct irqaction irq2 = {
47 .handler = no_action,
48 .name = "cascade",
49};
50DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
51 [0 ... IRQ0_VECTOR - 1] = -1,
52 [IRQ0_VECTOR] = 0,
53 [IRQ1_VECTOR] = 1,
54 [IRQ2_VECTOR] = 2,
55 [IRQ3_VECTOR] = 3,
56 [IRQ4_VECTOR] = 4,
57 [IRQ5_VECTOR] = 5,
58 [IRQ6_VECTOR] = 6,
59 [IRQ7_VECTOR] = 7,
60 [IRQ8_VECTOR] = 8,
61 [IRQ9_VECTOR] = 9,
62 [IRQ10_VECTOR] = 10,
63 [IRQ11_VECTOR] = 11,
64 [IRQ12_VECTOR] = 12,
65 [IRQ13_VECTOR] = 13,
66 [IRQ14_VECTOR] = 14,
67 [IRQ15_VECTOR] = 15,
68 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
69};
70
71int vector_used_by_percpu_irq(unsigned int vector)
72{
73 int cpu;
74
75 for_each_online_cpu(cpu) {
76 if (per_cpu(vector_irq, cpu)[vector] != -1)
77 return 1;
78 }
79
80 return 0;
81}
82
83static void __init init_ISA_irqs(void)
84{
85 int i;
86
87 init_bsp_APIC();
88 init_8259A(0);
89
90 for (i = 0; i < NR_IRQS_LEGACY; i++) {
91 struct irq_desc *desc = irq_to_desc(i);
92
93 desc->status = IRQ_DISABLED;
94 desc->action = NULL;
95 desc->depth = 1;
96
97 /*
98 * 16 old-style INTA-cycle interrupts:
99 */
100 set_irq_chip_and_handler_name(i, &i8259A_chip,
101 handle_level_irq, "XT");
102 }
103}
104
105void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
106
107static void __init smp_intr_init(void)
108{
109#ifdef CONFIG_SMP
110 /*
111 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
112 * IPI, driven by wakeup.
113 */
114 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
115
116 /* IPIs for invalidation */
117 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
118 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
119 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
120 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
121 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
122 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
123 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
124 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
125
126 /* IPI for generic function call */
127 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
128
129 /* IPI for generic single function call */
130 alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
131 call_function_single_interrupt);
132
133 /* Low priority IPI to cleanup after moving an irq */
134 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
135 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
136#endif
137}
138
139static void __init apic_intr_init(void)
140{
141 smp_intr_init();
142
143 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
144 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
145
146 /* self generated IPI for local APIC timer */
147 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
148
149 /* generic IPI for platform specific use */
150 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
151
152 /* IPI vectors for APIC spurious and error interrupts */
153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
155}
156
157void __init native_init_IRQ(void)
158{
159 int i;
160
161 init_ISA_irqs();
162 /*
163 * Cover the whole vector space, no vector can escape
164 * us. (some of these will be overridden and become
165 * 'special' SMP interrupts)
166 */
167 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
168 int vector = FIRST_EXTERNAL_VECTOR + i;
169 if (vector != IA32_SYSCALL_VECTOR)
170 set_intr_gate(vector, interrupt[i]);
171 }
172
173 apic_intr_init();
174
175 if (!acpi_ioapic)
176 setup_irq(2, &irq2);
177}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f820b73c7f2..34e86b67550 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -143,7 +143,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
143 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 143 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
144 gdb_regs32[GDB_CS] = __KERNEL_CS; 144 gdb_regs32[GDB_CS] = __KERNEL_CS;
145 gdb_regs32[GDB_SS] = __KERNEL_DS; 145 gdb_regs32[GDB_SS] = __KERNEL_DS;
146 gdb_regs[GDB_PC] = p->thread.ip; 146 gdb_regs[GDB_PC] = 0;
147 gdb_regs[GDB_R8] = 0; 147 gdb_regs[GDB_R8] = 0;
148 gdb_regs[GDB_R9] = 0; 148 gdb_regs[GDB_R9] = 0;
149 gdb_regs[GDB_R10] = 0; 149 gdb_regs[GDB_R10] = 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33019ddb56b..a78ecad0c90 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,7 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <asm/timer.h>
30 31
31#define MMU_QUEUE_SIZE 1024 32#define MMU_QUEUE_SIZE 1024
32 33
@@ -195,7 +196,7 @@ static void kvm_leave_lazy_mmu(void)
195 struct kvm_para_state *state = kvm_para_state(); 196 struct kvm_para_state *state = kvm_para_state();
196 197
197 mmu_queue_flush(state); 198 mmu_queue_flush(state);
198 paravirt_leave_lazy(paravirt_get_lazy_mode()); 199 paravirt_leave_lazy_mmu();
199 state->mode = paravirt_get_lazy_mode(); 200 state->mode = paravirt_get_lazy_mode();
200} 201}
201 202
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void)
230 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; 231 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
231 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; 232 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
232 } 233 }
234#ifdef CONFIG_X86_IO_APIC
235 no_timer_check = 1;
236#endif
233} 237}
234 238
235void __init kvm_guest_init(void) 239void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 453b5795a5c..366baa17991 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,25 +13,13 @@
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15 */ 15 */
16#include <linux/platform_device.h>
17#include <linux/capability.h>
18#include <linux/miscdevice.h>
19#include <linux/firmware.h> 16#include <linux/firmware.h>
20#include <linux/spinlock.h>
21#include <linux/cpumask.h>
22#include <linux/pci_ids.h> 17#include <linux/pci_ids.h>
23#include <linux/uaccess.h> 18#include <linux/uaccess.h>
24#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
25#include <linux/kernel.h> 20#include <linux/kernel.h>
26#include <linux/module.h> 21#include <linux/module.h>
27#include <linux/mutex.h>
28#include <linux/sched.h>
29#include <linux/init.h>
30#include <linux/slab.h>
31#include <linux/cpu.h>
32#include <linux/pci.h> 22#include <linux/pci.h>
33#include <linux/fs.h>
34#include <linux/mm.h>
35 23
36#include <asm/microcode.h> 24#include <asm/microcode.h>
37#include <asm/processor.h> 25#include <asm/processor.h>
@@ -79,9 +67,6 @@ struct microcode_amd {
79#define UCODE_CONTAINER_SECTION_HDR 8 67#define UCODE_CONTAINER_SECTION_HDR 8
80#define UCODE_CONTAINER_HEADER_SIZE 12 68#define UCODE_CONTAINER_HEADER_SIZE 12
81 69
82/* serialize access to the physical write */
83static DEFINE_SPINLOCK(microcode_update_lock);
84
85static struct equiv_cpu_entry *equiv_cpu_table; 70static struct equiv_cpu_entry *equiv_cpu_table;
86 71
87static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 72static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
144 return 1; 129 return 1;
145} 130}
146 131
147static void apply_microcode_amd(int cpu) 132static int apply_microcode_amd(int cpu)
148{ 133{
149 unsigned long flags;
150 u32 rev, dummy; 134 u32 rev, dummy;
151 int cpu_num = raw_smp_processor_id(); 135 int cpu_num = raw_smp_processor_id();
152 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; 136 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)
156 BUG_ON(cpu_num != cpu); 140 BUG_ON(cpu_num != cpu);
157 141
158 if (mc_amd == NULL) 142 if (mc_amd == NULL)
159 return; 143 return 0;
160 144
161 spin_lock_irqsave(&microcode_update_lock, flags);
162 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); 145 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
163 /* get patch id after patching */ 146 /* get patch id after patching */
164 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); 147 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
165 spin_unlock_irqrestore(&microcode_update_lock, flags);
166 148
167 /* check current patch id and patch's id for match */ 149 /* check current patch id and patch's id for match */
168 if (rev != mc_amd->hdr.patch_id) { 150 if (rev != mc_amd->hdr.patch_id) {
169 printk(KERN_ERR "microcode: CPU%d: update failed " 151 printk(KERN_ERR "microcode: CPU%d: update failed "
170 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
171 return; 153 return -1;
172 } 154 }
173 155
174 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
175 cpu, rev); 157 cpu, rev);
176 158
177 uci->cpu_sig.rev = rev; 159 uci->cpu_sig.rev = rev;
160
161 return 0;
178} 162}
179 163
180static int get_ucode_data(void *to, const u8 *from, size_t n) 164static int get_ucode_data(void *to, const u8 *from, size_t n)
@@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf)
257 241
258static void free_equiv_cpu_table(void) 242static void free_equiv_cpu_table(void)
259{ 243{
260 if (equiv_cpu_table) { 244 vfree(equiv_cpu_table);
261 vfree(equiv_cpu_table); 245 equiv_cpu_table = NULL;
262 equiv_cpu_table = NULL;
263 }
264} 246}
265 247
266static int generic_load_microcode(int cpu, const u8 *data, size_t size) 248static enum ucode_state
249generic_load_microcode(int cpu, const u8 *data, size_t size)
267{ 250{
268 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 251 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
269 const u8 *ucode_ptr = data; 252 const u8 *ucode_ptr = data;
@@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
272 int new_rev = uci->cpu_sig.rev; 255 int new_rev = uci->cpu_sig.rev;
273 unsigned int leftover; 256 unsigned int leftover;
274 unsigned long offset; 257 unsigned long offset;
258 enum ucode_state state = UCODE_OK;
275 259
276 offset = install_equiv_cpu_table(ucode_ptr); 260 offset = install_equiv_cpu_table(ucode_ptr);
277 if (!offset) { 261 if (!offset) {
278 printk(KERN_ERR "microcode: failed to create " 262 printk(KERN_ERR "microcode: failed to create "
279 "equivalent cpu table\n"); 263 "equivalent cpu table\n");
280 return -EINVAL; 264 return UCODE_ERROR;
281 } 265 }
282 266
283 ucode_ptr += offset; 267 ucode_ptr += offset;
@@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
293 277
294 mc_header = (struct microcode_header_amd *)mc; 278 mc_header = (struct microcode_header_amd *)mc;
295 if (get_matching_microcode(cpu, mc, new_rev)) { 279 if (get_matching_microcode(cpu, mc, new_rev)) {
296 if (new_mc) 280 vfree(new_mc);
297 vfree(new_mc);
298 new_rev = mc_header->patch_id; 281 new_rev = mc_header->patch_id;
299 new_mc = mc; 282 new_mc = mc;
300 } else 283 } else
@@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
306 289
307 if (new_mc) { 290 if (new_mc) {
308 if (!leftover) { 291 if (!leftover) {
309 if (uci->mc) 292 vfree(uci->mc);
310 vfree(uci->mc);
311 uci->mc = new_mc; 293 uci->mc = new_mc;
312 pr_debug("microcode: CPU%d found a matching microcode " 294 pr_debug("microcode: CPU%d found a matching microcode "
313 "update with version 0x%x (current=0x%x)\n", 295 "update with version 0x%x (current=0x%x)\n",
314 cpu, new_rev, uci->cpu_sig.rev); 296 cpu, new_rev, uci->cpu_sig.rev);
315 } else 297 } else {
316 vfree(new_mc); 298 vfree(new_mc);
317 } 299 state = UCODE_ERROR;
300 }
301 } else
302 state = UCODE_NFOUND;
318 303
319 free_equiv_cpu_table(); 304 free_equiv_cpu_table();
320 305
321 return (int)leftover; 306 return state;
322} 307}
323 308
324static int request_microcode_fw(int cpu, struct device *device) 309static enum ucode_state request_microcode_fw(int cpu, struct device *device)
325{ 310{
326 const char *fw_name = "amd-ucode/microcode_amd.bin"; 311 const char *fw_name = "amd-ucode/microcode_amd.bin";
327 const struct firmware *firmware; 312 const struct firmware *firmware;
328 int ret; 313 enum ucode_state ret;
329
330 /* We should bind the task to the CPU */
331 BUG_ON(cpu != raw_smp_processor_id());
332 314
333 ret = request_firmware(&firmware, fw_name, device); 315 if (request_firmware(&firmware, fw_name, device)) {
334 if (ret) {
335 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); 316 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
336 return ret; 317 return UCODE_NFOUND;
337 } 318 }
338 319
339 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 320 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
@@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device)
343 return ret; 324 return ret;
344} 325}
345 326
346static int request_microcode_user(int cpu, const void __user *buf, size_t size) 327static enum ucode_state
328request_microcode_user(int cpu, const void __user *buf, size_t size)
347{ 329{
348 printk(KERN_INFO "microcode: AMD microcode update via " 330 printk(KERN_INFO "microcode: AMD microcode update via "
349 "/dev/cpu/microcode not supported\n"); 331 "/dev/cpu/microcode not supported\n");
350 return -1; 332 return UCODE_ERROR;
351} 333}
352 334
353static void microcode_fini_cpu_amd(int cpu) 335static void microcode_fini_cpu_amd(int cpu)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 98c470c069d..9371448290a 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -71,27 +71,18 @@
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h> 73#include <linux/platform_device.h>
74#include <linux/capability.h>
75#include <linux/miscdevice.h> 74#include <linux/miscdevice.h>
76#include <linux/firmware.h> 75#include <linux/capability.h>
77#include <linux/smp_lock.h> 76#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
79#include <linux/cpumask.h>
80#include <linux/uaccess.h>
81#include <linux/vmalloc.h>
82#include <linux/kernel.h> 77#include <linux/kernel.h>
83#include <linux/module.h> 78#include <linux/module.h>
84#include <linux/mutex.h> 79#include <linux/mutex.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
88#include <linux/cpu.h> 80#include <linux/cpu.h>
89#include <linux/fs.h> 81#include <linux/fs.h>
90#include <linux/mm.h> 82#include <linux/mm.h>
91 83
92#include <asm/microcode.h> 84#include <asm/microcode.h>
93#include <asm/processor.h> 85#include <asm/processor.h>
94#include <asm/msr.h>
95 86
96MODULE_DESCRIPTION("Microcode Update Driver"); 87MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 88MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");
101 92
102static struct microcode_ops *microcode_ops; 93static struct microcode_ops *microcode_ops;
103 94
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ 95/*
96 * Synchronization.
97 *
98 * All non cpu-hotplug-callback call sites use:
99 *
100 * - microcode_mutex to synchronize with each other;
101 * - get/put_online_cpus() to synchronize with
102 * the cpu-hotplug-callback call sites.
103 *
104 * We guarantee that only a single cpu is being
105 * updated at any particular moment of time.
106 */
105static DEFINE_MUTEX(microcode_mutex); 107static DEFINE_MUTEX(microcode_mutex);
106 108
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; 109struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info); 110EXPORT_SYMBOL_GPL(ucode_cpu_info);
109 111
112/*
113 * Operations that are run on a target cpu:
114 */
115
116struct cpu_info_ctx {
117 struct cpu_signature *cpu_sig;
118 int err;
119};
120
121static void collect_cpu_info_local(void *arg)
122{
123 struct cpu_info_ctx *ctx = arg;
124
125 ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
126 ctx->cpu_sig);
127}
128
129static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
130{
131 struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
132 int ret;
133
134 ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
135 if (!ret)
136 ret = ctx.err;
137
138 return ret;
139}
140
141static int collect_cpu_info(int cpu)
142{
143 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
144 int ret;
145
146 memset(uci, 0, sizeof(*uci));
147
148 ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
149 if (!ret)
150 uci->valid = 1;
151
152 return ret;
153}
154
155struct apply_microcode_ctx {
156 int err;
157};
158
159static void apply_microcode_local(void *arg)
160{
161 struct apply_microcode_ctx *ctx = arg;
162
163 ctx->err = microcode_ops->apply_microcode(smp_processor_id());
164}
165
166static int apply_microcode_on_target(int cpu)
167{
168 struct apply_microcode_ctx ctx = { .err = 0 };
169 int ret;
170
171 ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
172 if (!ret)
173 ret = ctx.err;
174
175 return ret;
176}
177
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE 178#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111static int do_microcode_update(const void __user *buf, size_t size) 179static int do_microcode_update(const void __user *buf, size_t size)
112{ 180{
113 cpumask_t old;
114 int error = 0; 181 int error = 0;
115 int cpu; 182 int cpu;
116 183
117 old = current->cpus_allowed;
118
119 for_each_online_cpu(cpu) { 184 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 185 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
186 enum ucode_state ustate;
121 187
122 if (!uci->valid) 188 if (!uci->valid)
123 continue; 189 continue;
124 190
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 191 ustate = microcode_ops->request_microcode_user(cpu, buf, size);
126 error = microcode_ops->request_microcode_user(cpu, buf, size); 192 if (ustate == UCODE_ERROR) {
127 if (error < 0) 193 error = -1;
128 goto out; 194 break;
129 if (!error) 195 } else if (ustate == UCODE_OK)
130 microcode_ops->apply_microcode(cpu); 196 apply_microcode_on_target(cpu);
131 } 197 }
132out: 198
133 set_cpus_allowed_ptr(current, &old);
134 return error; 199 return error;
135} 200}
136 201
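Editor's note: do_microcode_update() and the loaders below key off the new enum ucode_state return type. Its definition lives in asm/microcode.h, outside this hunk; it is presumably along these lines (a sketch, not the verbatim header):

/* Sketch of the tri-state result used by the microcode loaders. */
enum ucode_state {
	UCODE_OK,	/* new microcode found and staged for this CPU */
	UCODE_NFOUND,	/* nothing applicable found; not a hard error */
	UCODE_ERROR,	/* malformed data or a real failure */
};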
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)
143static ssize_t microcode_write(struct file *file, const char __user *buf, 208static ssize_t microcode_write(struct file *file, const char __user *buf,
144 size_t len, loff_t *ppos) 209 size_t len, loff_t *ppos)
145{ 210{
146 ssize_t ret; 211 ssize_t ret = -EINVAL;
147 212
148 if ((len >> PAGE_SHIFT) > num_physpages) { 213 if ((len >> PAGE_SHIFT) > num_physpages) {
149 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", 214 pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
150 num_physpages); 215 return ret;
151 return -EINVAL;
152 } 216 }
153 217
154 get_online_cpus(); 218 get_online_cpus();
155 mutex_lock(&microcode_mutex); 219 mutex_lock(&microcode_mutex);
156 220
157 ret = do_microcode_update(buf, len); 221 if (do_microcode_update(buf, len) == 0)
158 if (!ret)
159 ret = (ssize_t)len; 222 ret = (ssize_t)len;
160 223
161 mutex_unlock(&microcode_mutex); 224 mutex_unlock(&microcode_mutex);
@@ -165,15 +228,16 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
165} 228}
166 229
167static const struct file_operations microcode_fops = { 230static const struct file_operations microcode_fops = {
168 .owner = THIS_MODULE, 231 .owner = THIS_MODULE,
169 .write = microcode_write, 232 .write = microcode_write,
170 .open = microcode_open, 233 .open = microcode_open,
171}; 234};
172 235
173static struct miscdevice microcode_dev = { 236static struct miscdevice microcode_dev = {
174 .minor = MICROCODE_MINOR, 237 .minor = MICROCODE_MINOR,
175 .name = "microcode", 238 .name = "microcode",
176 .fops = &microcode_fops, 239 .devnode = "cpu/microcode",
240 .fops = &microcode_fops,
177}; 241};
178 242
179static int __init microcode_dev_init(void) 243static int __init microcode_dev_init(void)
@@ -182,9 +246,7 @@ static int __init microcode_dev_init(void)
182 246
183 error = misc_register(&microcode_dev); 247 error = misc_register(&microcode_dev);
184 if (error) { 248 if (error) {
185 printk(KERN_ERR 249 pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
186 "microcode: can't misc_register on minor=%d\n",
187 MICROCODE_MINOR);
188 return error; 250 return error;
189 } 251 }
190 252
@@ -205,42 +267,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
205/* fake device for request_firmware */ 267/* fake device for request_firmware */
206static struct platform_device *microcode_pdev; 268static struct platform_device *microcode_pdev;
207 269
208static long reload_for_cpu(void *unused) 270static int reload_for_cpu(int cpu)
209{ 271{
210 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); 272 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
211 int err = 0; 273 int err = 0;
212 274
213 mutex_lock(&microcode_mutex); 275 mutex_lock(&microcode_mutex);
214 if (uci->valid) { 276 if (uci->valid) {
215 err = microcode_ops->request_microcode_fw(smp_processor_id(), 277 enum ucode_state ustate;
216 &microcode_pdev->dev); 278
217 if (!err) 279 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
218 microcode_ops->apply_microcode(smp_processor_id()); 280 if (ustate == UCODE_OK)
281 apply_microcode_on_target(cpu);
282 else
283 if (ustate == UCODE_ERROR)
284 err = -EINVAL;
219 } 285 }
220 mutex_unlock(&microcode_mutex); 286 mutex_unlock(&microcode_mutex);
287
221 return err; 288 return err;
222} 289}
223 290
224static ssize_t reload_store(struct sys_device *dev, 291static ssize_t reload_store(struct sys_device *dev,
225 struct sysdev_attribute *attr, 292 struct sysdev_attribute *attr,
226 const char *buf, size_t sz) 293 const char *buf, size_t size)
227{ 294{
228 char *end; 295 unsigned long val;
229 unsigned long val = simple_strtoul(buf, &end, 0);
230 int err = 0;
231 int cpu = dev->id; 296 int cpu = dev->id;
297 int ret = 0;
298 char *end;
232 299
300 val = simple_strtoul(buf, &end, 0);
233 if (end == buf) 301 if (end == buf)
234 return -EINVAL; 302 return -EINVAL;
303
235 if (val == 1) { 304 if (val == 1) {
236 get_online_cpus(); 305 get_online_cpus();
237 if (cpu_online(cpu)) 306 if (cpu_online(cpu))
238 err = work_on_cpu(cpu, reload_for_cpu, NULL); 307 ret = reload_for_cpu(cpu);
239 put_online_cpus(); 308 put_online_cpus();
240 } 309 }
241 if (err) 310
242 return err; 311 if (!ret)
243 return sz; 312 ret = size;
313
314 return ret;
244} 315}
245 316
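Editor's note: the rewritten reload_store() follows the usual sysfs store convention — return the number of bytes consumed (here the full write size) on success, or a negative errno. A generic sketch of that convention, with hypothetical names:

#include <linux/kernel.h>
#include <linux/errno.h>

/* Hypothetical store handler: accept only the value 1 as a trigger. */
static ssize_t demo_store(const char *buf, size_t size)
{
	char *end;
	unsigned long val = simple_strtoul(buf, &end, 0);

	if (end == buf)		/* no digits at all: reject the write */
		return -EINVAL;
	if (val != 1)		/* anything but "1" is silently ignored */
		return size;

	/* ... trigger the action here ... */
	return size;		/* tell sysfs the whole write was consumed */
}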
246static ssize_t version_show(struct sys_device *dev, 317static ssize_t version_show(struct sys_device *dev,
@@ -271,11 +342,11 @@ static struct attribute *mc_default_attrs[] = {
271}; 342};
272 343
273static struct attribute_group mc_attr_group = { 344static struct attribute_group mc_attr_group = {
274 .attrs = mc_default_attrs, 345 .attrs = mc_default_attrs,
275 .name = "microcode", 346 .name = "microcode",
276}; 347};
277 348
278static void __microcode_fini_cpu(int cpu) 349static void microcode_fini_cpu(int cpu)
279{ 350{
280 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 351 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
281 352
@@ -283,103 +354,68 @@ static void __microcode_fini_cpu(int cpu)
283 uci->valid = 0; 354 uci->valid = 0;
284} 355}
285 356
286static void microcode_fini_cpu(int cpu) 357static enum ucode_state microcode_resume_cpu(int cpu)
287{
288 mutex_lock(&microcode_mutex);
289 __microcode_fini_cpu(cpu);
290 mutex_unlock(&microcode_mutex);
291}
292
293static void collect_cpu_info(int cpu)
294{ 358{
295 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 359 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
296 360
297 memset(uci, 0, sizeof(*uci)); 361 if (!uci->mc)
298 if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) 362 return UCODE_NFOUND;
299 uci->valid = 1; 363
364 pr_debug("microcode: CPU%d updated upon resume\n", cpu);
365 apply_microcode_on_target(cpu);
366
367 return UCODE_OK;
300} 368}
301 369
302static int microcode_resume_cpu(int cpu) 370static enum ucode_state microcode_init_cpu(int cpu)
303{ 371{
304 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 372 enum ucode_state ustate;
305 struct cpu_signature nsig;
306 373
307 pr_debug("microcode: CPU%d resumed\n", cpu); 374 if (collect_cpu_info(cpu))
375 return UCODE_ERROR;
308 376
309 if (!uci->mc) 377 /* --dimm. Trigger a delayed update? */
310 return 1; 378 if (system_state != SYSTEM_RUNNING)
379 return UCODE_NFOUND;
311 380
312 /* 381 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
313 * Let's verify that the 'cached' ucode does belong
314 * to this cpu (a bit of paranoia):
315 */
316 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
317 __microcode_fini_cpu(cpu);
318 printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
319 cpu);
320 return -1;
321 }
322 382
323 if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { 383 if (ustate == UCODE_OK) {
324 __microcode_fini_cpu(cpu); 384 pr_debug("microcode: CPU%d updated upon init\n", cpu);
325 printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", 385 apply_microcode_on_target(cpu);
326 cpu);
327 /* Should we look for a new ucode here? */
328 return 1;
329 } 386 }
330 387
331 return 0; 388 return ustate;
332} 389}
333 390
334static long microcode_update_cpu(void *unused) 391static enum ucode_state microcode_update_cpu(int cpu)
335{ 392{
336 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); 393 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
337 int err = 0; 394 enum ucode_state ustate;
338 395
339 /* 396 if (uci->valid)
340 * Check if the system resume is in progress (uci->valid != NULL), 397 ustate = microcode_resume_cpu(cpu);
341 * otherwise just request a firmware: 398 else
342 */ 399 ustate = microcode_init_cpu(cpu);
343 if (uci->valid) {
344 err = microcode_resume_cpu(smp_processor_id());
345 } else {
346 collect_cpu_info(smp_processor_id());
347 if (uci->valid && system_state == SYSTEM_RUNNING)
348 err = microcode_ops->request_microcode_fw(
349 smp_processor_id(),
350 &microcode_pdev->dev);
351 }
352 if (!err)
353 microcode_ops->apply_microcode(smp_processor_id());
354 return err;
355}
356 400
357static int microcode_init_cpu(int cpu) 401 return ustate;
358{
359 int err;
360 mutex_lock(&microcode_mutex);
361 err = work_on_cpu(cpu, microcode_update_cpu, NULL);
362 mutex_unlock(&microcode_mutex);
363
364 return err;
365} 402}
366 403
367static int mc_sysdev_add(struct sys_device *sys_dev) 404static int mc_sysdev_add(struct sys_device *sys_dev)
368{ 405{
369 int err, cpu = sys_dev->id; 406 int err, cpu = sys_dev->id;
370 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
371 407
372 if (!cpu_online(cpu)) 408 if (!cpu_online(cpu))
373 return 0; 409 return 0;
374 410
375 pr_debug("microcode: CPU%d added\n", cpu); 411 pr_debug("microcode: CPU%d added\n", cpu);
376 memset(uci, 0, sizeof(*uci));
377 412
378 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 413 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
379 if (err) 414 if (err)
380 return err; 415 return err;
381 416
382 err = microcode_init_cpu(cpu); 417 if (microcode_init_cpu(cpu) == UCODE_ERROR)
418 err = -EINVAL;
383 419
384 return err; 420 return err;
385} 421}
@@ -400,19 +436,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
400static int mc_sysdev_resume(struct sys_device *dev) 436static int mc_sysdev_resume(struct sys_device *dev)
401{ 437{
402 int cpu = dev->id; 438 int cpu = dev->id;
439 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
403 440
404 if (!cpu_online(cpu)) 441 if (!cpu_online(cpu))
405 return 0; 442 return 0;
406 443
407 /* only CPU 0 will apply ucode here */ 444 /*
408 microcode_update_cpu(NULL); 445 * All non-bootup cpus are still disabled,
446 * so only CPU 0 will apply ucode here.
447 *
448 * Moreover, there can be no concurrent
449 * updates from any other places at this point.
450 */
451 WARN_ON(cpu != 0);
452
453 if (uci->valid && uci->mc)
454 microcode_ops->apply_microcode(cpu);
455
409 return 0; 456 return 0;
410} 457}
411 458
412static struct sysdev_driver mc_sysdev_driver = { 459static struct sysdev_driver mc_sysdev_driver = {
413 .add = mc_sysdev_add, 460 .add = mc_sysdev_add,
414 .remove = mc_sysdev_remove, 461 .remove = mc_sysdev_remove,
415 .resume = mc_sysdev_resume, 462 .resume = mc_sysdev_resume,
416}; 463};
417 464
418static __cpuinit int 465static __cpuinit int
@@ -425,15 +472,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
425 switch (action) { 472 switch (action) {
426 case CPU_ONLINE: 473 case CPU_ONLINE:
427 case CPU_ONLINE_FROZEN: 474 case CPU_ONLINE_FROZEN:
428 if (microcode_init_cpu(cpu)) 475 microcode_update_cpu(cpu);
429 printk(KERN_ERR "microcode: failed to init CPU%d\n",
430 cpu);
431 case CPU_DOWN_FAILED: 476 case CPU_DOWN_FAILED:
432 case CPU_DOWN_FAILED_FROZEN: 477 case CPU_DOWN_FAILED_FROZEN:
433 pr_debug("microcode: CPU%d added\n", cpu); 478 pr_debug("microcode: CPU%d added\n", cpu);
434 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 479 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
435 printk(KERN_ERR "microcode: Failed to create the sysfs " 480 pr_err("microcode: Failed to create group for CPU%d\n", cpu);
436 "group for CPU%d\n", cpu);
437 break; 481 break;
438 case CPU_DOWN_PREPARE: 482 case CPU_DOWN_PREPARE:
439 case CPU_DOWN_PREPARE_FROZEN: 483 case CPU_DOWN_PREPARE_FROZEN:
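Editor's note: mc_cpu_callback() is a standard CPU-hotplug notifier — it dispatches on the action code and lets other notifiers run. A stripped-down skeleton of that mechanism (names and actions here are illustrative, not the driver's):

#include <linux/cpu.h>
#include <linux/notifier.h>

static int __cpuinit demo_cpu_callback(struct notifier_block *nb,
				       unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		/* cpu came up: (re)load its per-CPU state here */
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		/* cpu went away: release its per-CPU state here */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __refdata demo_cpu_notifier = {
	.notifier_call = demo_cpu_callback,
};

/* registered once at init: register_hotcpu_notifier(&demo_cpu_notifier); */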
@@ -465,13 +509,10 @@ static int __init microcode_init(void)
465 microcode_ops = init_amd_microcode(); 509 microcode_ops = init_amd_microcode();
466 510
467 if (!microcode_ops) { 511 if (!microcode_ops) {
468 printk(KERN_ERR "microcode: no support for this CPU vendor\n"); 512 pr_err("microcode: no support for this CPU vendor\n");
469 return -ENODEV; 513 return -ENODEV;
470 } 514 }
471 515
472 error = microcode_dev_init();
473 if (error)
474 return error;
475 microcode_pdev = platform_device_register_simple("microcode", -1, 516 microcode_pdev = platform_device_register_simple("microcode", -1,
476 NULL, 0); 517 NULL, 0);
477 if (IS_ERR(microcode_pdev)) { 518 if (IS_ERR(microcode_pdev)) {
@@ -480,23 +521,31 @@ static int __init microcode_init(void)
480 } 521 }
481 522
482 get_online_cpus(); 523 get_online_cpus();
524 mutex_lock(&microcode_mutex);
525
483 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); 526 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
527
528 mutex_unlock(&microcode_mutex);
484 put_online_cpus(); 529 put_online_cpus();
530
485 if (error) { 531 if (error) {
486 microcode_dev_exit();
487 platform_device_unregister(microcode_pdev); 532 platform_device_unregister(microcode_pdev);
488 return error; 533 return error;
489 } 534 }
490 535
536 error = microcode_dev_init();
537 if (error)
538 return error;
539
491 register_hotcpu_notifier(&mc_cpu_notifier); 540 register_hotcpu_notifier(&mc_cpu_notifier);
492 541
493 printk(KERN_INFO 542 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
494 "Microcode Update Driver: v" MICROCODE_VERSION
495 " <tigran@aivazian.fsnet.co.uk>," 543 " <tigran@aivazian.fsnet.co.uk>,"
496 " Peter Oruba\n"); 544 " Peter Oruba\n");
497 545
498 return 0; 546 return 0;
499} 547}
548module_init(microcode_init);
500 549
501static void __exit microcode_exit(void) 550static void __exit microcode_exit(void)
502{ 551{
@@ -505,16 +554,17 @@ static void __exit microcode_exit(void)
505 unregister_hotcpu_notifier(&mc_cpu_notifier); 554 unregister_hotcpu_notifier(&mc_cpu_notifier);
506 555
507 get_online_cpus(); 556 get_online_cpus();
557 mutex_lock(&microcode_mutex);
558
508 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); 559 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
560
561 mutex_unlock(&microcode_mutex);
509 put_online_cpus(); 562 put_online_cpus();
510 563
511 platform_device_unregister(microcode_pdev); 564 platform_device_unregister(microcode_pdev);
512 565
513 microcode_ops = NULL; 566 microcode_ops = NULL;
514 567
515 printk(KERN_INFO 568 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
516 "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
517} 569}
518
519module_init(microcode_init);
520module_exit(microcode_exit); 570module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 149b9ec7c1a..0d334ddd0a9 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,24 +70,11 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h>
74#include <linux/capability.h>
75#include <linux/miscdevice.h>
76#include <linux/firmware.h> 73#include <linux/firmware.h>
77#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
79#include <linux/cpumask.h>
80#include <linux/uaccess.h> 74#include <linux/uaccess.h>
81#include <linux/vmalloc.h>
82#include <linux/kernel.h> 75#include <linux/kernel.h>
83#include <linux/module.h> 76#include <linux/module.h>
84#include <linux/mutex.h> 77#include <linux/vmalloc.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
88#include <linux/cpu.h>
89#include <linux/fs.h>
90#include <linux/mm.h>
91 78
92#include <asm/microcode.h> 79#include <asm/microcode.h>
93#include <asm/processor.h> 80#include <asm/processor.h>
@@ -150,13 +137,9 @@ struct extended_sigtable {
150 137
151#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) 138#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
152 139
153/* serialize access to the physical write to MSR 0x79 */
154static DEFINE_SPINLOCK(microcode_update_lock);
155
156static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) 140static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
157{ 141{
158 struct cpuinfo_x86 *c = &cpu_data(cpu_num); 142 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
159 unsigned long flags;
160 unsigned int val[2]; 143 unsigned int val[2];
161 144
162 memset(csig, 0, sizeof(*csig)); 145 memset(csig, 0, sizeof(*csig));
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
176 csig->pf = 1 << ((val[1] >> 18) & 7); 159 csig->pf = 1 << ((val[1] >> 18) & 7);
177 } 160 }
178 161
179 /* serialize access to the physical write to MSR 0x79 */
180 spin_lock_irqsave(&microcode_update_lock, flags);
181
182 wrmsr(MSR_IA32_UCODE_REV, 0, 0); 162 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
183 /* see notes above for revision 1.07. Apparent chip bug */ 163 /* see notes above for revision 1.07. Apparent chip bug */
184 sync_core(); 164 sync_core();
185 /* get the current revision from MSR 0x8B */ 165 /* get the current revision from MSR 0x8B */
186 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 166 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
187 spin_unlock_irqrestore(&microcode_update_lock, flags);
188 167
189 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", 168 printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
190 csig->sig, csig->pf, csig->rev); 169 cpu_num, csig->sig, csig->pf, csig->rev);
191 170
192 return 0; 171 return 0;
193} 172}
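Editor's note: with the update now executed on the target CPU via smp_call_function_single(), and call sites serialized by microcode_mutex, the old microcode_update_lock spinlock becomes redundant — there is only ever one writer of MSR 0x79 at a time. The revision read-back sequence itself is unchanged; a sketch of it, assuming the usual MSR helpers:

#include <asm/msr.h>
#include <asm/processor.h>

/* Must run on the CPU whose microcode revision we want (e.g. via IPI). */
static unsigned int demo_read_ucode_rev(void)
{
	unsigned int low, rev;

	wrmsr(MSR_IA32_UCODE_REV, 0, 0);	/* reset the revision report */
	sync_core();				/* serialize; see the rev 1.07 note */
	rdmsr(MSR_IA32_UCODE_REV, low, rev);	/* current revision lands in EDX */

	return rev;
}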
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
318 return 0; 297 return 0;
319} 298}
320 299
321static void apply_microcode(int cpu) 300static int apply_microcode(int cpu)
322{ 301{
323 struct microcode_intel *mc_intel; 302 struct microcode_intel *mc_intel;
324 struct ucode_cpu_info *uci; 303 struct ucode_cpu_info *uci;
325 unsigned long flags;
326 unsigned int val[2]; 304 unsigned int val[2];
327 int cpu_num; 305 int cpu_num;
328 306
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu)
334 BUG_ON(cpu_num != cpu); 312 BUG_ON(cpu_num != cpu);
335 313
336 if (mc_intel == NULL) 314 if (mc_intel == NULL)
337 return; 315 return 0;
338
339 /* serialize access to the physical write to MSR 0x79 */
340 spin_lock_irqsave(&microcode_update_lock, flags);
341 316
342 /* write microcode via MSR 0x79 */ 317 /* write microcode via MSR 0x79 */
343 wrmsr(MSR_IA32_UCODE_WRITE, 318 wrmsr(MSR_IA32_UCODE_WRITE,
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu)
351 /* get the current revision from MSR 0x8B */ 326 /* get the current revision from MSR 0x8B */
352 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 327 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
353 328
354 spin_unlock_irqrestore(&microcode_update_lock, flags);
355 if (val[1] != mc_intel->hdr.rev) { 329 if (val[1] != mc_intel->hdr.rev) {
356 printk(KERN_ERR "microcode: CPU%d update from revision " 330 printk(KERN_ERR "microcode: CPU%d update "
357 "0x%x to 0x%x failed\n", 331 "to revision 0x%x failed\n",
358 cpu_num, uci->cpu_sig.rev, val[1]); 332 cpu_num, mc_intel->hdr.rev);
359 return; 333 return -1;
360 } 334 }
361 printk(KERN_INFO "microcode: CPU%d updated from revision " 335 printk(KERN_INFO "microcode: CPU%d updated to revision "
362 "0x%x to 0x%x, date = %04x-%02x-%02x \n", 336 "0x%x, date = %04x-%02x-%02x \n",
363 cpu_num, uci->cpu_sig.rev, val[1], 337 cpu_num, val[1],
364 mc_intel->hdr.date & 0xffff, 338 mc_intel->hdr.date & 0xffff,
365 mc_intel->hdr.date >> 24, 339 mc_intel->hdr.date >> 24,
366 (mc_intel->hdr.date >> 16) & 0xff); 340 (mc_intel->hdr.date >> 16) & 0xff);
367 341
368 uci->cpu_sig.rev = val[1]; 342 uci->cpu_sig.rev = val[1];
343
344 return 0;
369} 345}
370 346
371static int generic_load_microcode(int cpu, void *data, size_t size, 347static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
372 int (*get_ucode_data)(void *, const void *, size_t)) 348 int (*get_ucode_data)(void *, const void *, size_t))
373{ 349{
374 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 350 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
375 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 351 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
376 int new_rev = uci->cpu_sig.rev; 352 int new_rev = uci->cpu_sig.rev;
377 unsigned int leftover = size; 353 unsigned int leftover = size;
354 enum ucode_state state = UCODE_OK;
378 355
379 while (leftover) { 356 while (leftover) {
380 struct microcode_header_intel mc_header; 357 struct microcode_header_intel mc_header;
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
412 leftover -= mc_size; 389 leftover -= mc_size;
413 } 390 }
414 391
415 if (!new_mc) 392 if (leftover) {
393 if (new_mc)
394 vfree(new_mc);
395 state = UCODE_ERROR;
416 goto out; 396 goto out;
397 }
417 398
418 if (leftover) { 399 if (!new_mc) {
419 vfree(new_mc); 400 state = UCODE_NFOUND;
420 goto out; 401 goto out;
421 } 402 }
422 403
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
427 pr_debug("microcode: CPU%d found a matching microcode update with" 408 pr_debug("microcode: CPU%d found a matching microcode update with"
428 " version 0x%x (current=0x%x)\n", 409 " version 0x%x (current=0x%x)\n",
429 cpu, new_rev, uci->cpu_sig.rev); 410 cpu, new_rev, uci->cpu_sig.rev);
430 411out:
431 out: 412 return state;
432 return (int)leftover;
433} 413}
434 414
435static int get_ucode_fw(void *to, const void *from, size_t n) 415static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
438 return 0; 418 return 0;
439} 419}
440 420
441static int request_microcode_fw(int cpu, struct device *device) 421static enum ucode_state request_microcode_fw(int cpu, struct device *device)
442{ 422{
443 char name[30]; 423 char name[30];
444 struct cpuinfo_x86 *c = &cpu_data(cpu); 424 struct cpuinfo_x86 *c = &cpu_data(cpu);
445 const struct firmware *firmware; 425 const struct firmware *firmware;
446 int ret; 426 enum ucode_state ret;
447 427
448 /* We should bind the task to the CPU */
449 BUG_ON(cpu != raw_smp_processor_id());
450 sprintf(name, "intel-ucode/%02x-%02x-%02x", 428 sprintf(name, "intel-ucode/%02x-%02x-%02x",
451 c->x86, c->x86_model, c->x86_mask); 429 c->x86, c->x86_model, c->x86_mask);
452 ret = request_firmware(&firmware, name, device); 430
453 if (ret) { 431 if (request_firmware(&firmware, name, device)) {
454 pr_debug("microcode: data file %s load failed\n", name); 432 pr_debug("microcode: data file %s load failed\n", name);
455 return ret; 433 return UCODE_NFOUND;
456 } 434 }
457 435
458 ret = generic_load_microcode(cpu, (void *)firmware->data, 436 ret = generic_load_microcode(cpu, (void *)firmware->data,
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
468 return copy_from_user(to, from, n); 446 return copy_from_user(to, from, n);
469} 447}
470 448
471static int request_microcode_user(int cpu, const void __user *buf, size_t size) 449static enum ucode_state
450request_microcode_user(int cpu, const void __user *buf, size_t size)
472{ 451{
473 /* We should bind the task to the CPU */
474 BUG_ON(cpu != raw_smp_processor_id());
475
476 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); 452 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
477} 453}
478 454
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module.c
index c23880b90b5..89f386f044e 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module.c
@@ -1,6 +1,5 @@
1/* Kernel module help for x86-64 1/* Kernel module help for x86.
2 Copyright (C) 2001 Rusty Russell. 2 Copyright (C) 2001 Rusty Russell.
3 Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
4 3
5 This program is free software; you can redistribute it and/or modify 4 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 5 it under the terms of the GNU General Public License as published by
@@ -22,23 +21,18 @@
22#include <linux/fs.h> 21#include <linux/fs.h>
23#include <linux/string.h> 22#include <linux/string.h>
24#include <linux/kernel.h> 23#include <linux/kernel.h>
25#include <linux/mm.h>
26#include <linux/slab.h>
27#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h>
28 26
29#include <asm/system.h> 27#include <asm/system.h>
30#include <asm/page.h> 28#include <asm/page.h>
31#include <asm/pgtable.h> 29#include <asm/pgtable.h>
32 30
31#if 0
32#define DEBUGP printk
33#else
33#define DEBUGP(fmt...) 34#define DEBUGP(fmt...)
34 35#endif
35#ifndef CONFIG_UML
36void module_free(struct module *mod, void *module_region)
37{
38 vfree(module_region);
39 /* FIXME: If module_region == mod->init_region, trim exception
40 table entries. */
41}
42 36
43void *module_alloc(unsigned long size) 37void *module_alloc(unsigned long size)
44{ 38{
@@ -54,9 +48,15 @@ void *module_alloc(unsigned long size)
54 if (!area) 48 if (!area)
55 return NULL; 49 return NULL;
56 50
57 return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); 51 return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
52 PAGE_KERNEL_EXEC);
53}
54
55/* Free memory returned from module_alloc */
56void module_free(struct module *mod, void *module_region)
57{
58 vfree(module_region);
58} 59}
59#endif
60 60
61/* We don't need anything special. */ 61/* We don't need anything special. */
62int module_frob_arch_sections(Elf_Ehdr *hdr, 62int module_frob_arch_sections(Elf_Ehdr *hdr,
@@ -67,6 +67,58 @@ int module_frob_arch_sections(Elf_Ehdr *hdr,
67 return 0; 67 return 0;
68} 68}
69 69
70#ifdef CONFIG_X86_32
71int apply_relocate(Elf32_Shdr *sechdrs,
72 const char *strtab,
73 unsigned int symindex,
74 unsigned int relsec,
75 struct module *me)
76{
77 unsigned int i;
78 Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
79 Elf32_Sym *sym;
80 uint32_t *location;
81
82 DEBUGP("Applying relocate section %u to %u\n", relsec,
83 sechdrs[relsec].sh_info);
84 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
85 /* This is where to make the change */
86 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
87 + rel[i].r_offset;
88 /* This is the symbol it is referring to. Note that all
89 undefined symbols have been resolved. */
90 sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
91 + ELF32_R_SYM(rel[i].r_info);
92
93 switch (ELF32_R_TYPE(rel[i].r_info)) {
94 case R_386_32:
95 /* We add the value into the location given */
96 *location += sym->st_value;
97 break;
98 case R_386_PC32:
 99 /* Add the value, subtract its position */
100 *location += sym->st_value - (uint32_t)location;
101 break;
102 default:
103 printk(KERN_ERR "module %s: Unknown relocation: %u\n",
104 me->name, ELF32_R_TYPE(rel[i].r_info));
105 return -ENOEXEC;
106 }
107 }
108 return 0;
109}
110
111int apply_relocate_add(Elf32_Shdr *sechdrs,
112 const char *strtab,
113 unsigned int symindex,
114 unsigned int relsec,
115 struct module *me)
116{
117 printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
118 me->name);
119 return -ENOEXEC;
120}
121#else /*X86_64*/
70int apply_relocate_add(Elf64_Shdr *sechdrs, 122int apply_relocate_add(Elf64_Shdr *sechdrs,
71 const char *strtab, 123 const char *strtab,
72 unsigned int symindex, 124 unsigned int symindex,
@@ -147,6 +199,8 @@ int apply_relocate(Elf_Shdr *sechdrs,
147 return -ENOSYS; 199 return -ENOSYS;
148} 200}
149 201
202#endif
203
150int module_finalize(const Elf_Ehdr *hdr, 204int module_finalize(const Elf_Ehdr *hdr,
151 const Elf_Shdr *sechdrs, 205 const Elf_Shdr *sechdrs,
152 struct module *me) 206 struct module *me)
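Editor's note: the 32-bit relocation handling folded into the unified module.c boils down to two formulas — R_386_32 stores S + A (absolute) and R_386_PC32 stores S + A - P (PC-relative), where S is the resolved symbol value, A the addend already sitting in the patched word, and P the address of that word. A self-contained illustration of the arithmetic (plain user-space C with made-up addresses, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t sym   = 0x08049000;	/* S: resolved symbol value  */
	uint32_t place = 0x08048010;	/* P: address being patched  */
	uint32_t word  = 0x00000004;	/* A: addend stored in place */

	uint32_t r_386_32   = word + sym;		/* S + A     */
	uint32_t r_386_pc32 = word + sym - place;	/* S + A - P */

	printf("R_386_32   -> %#x\n", r_386_32);
	printf("R_386_PC32 -> %#x\n", r_386_pc32);
	return 0;
}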
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
deleted file mode 100644
index 0edd819050e..00000000000
--- a/arch/x86/kernel/module_32.c
+++ /dev/null
@@ -1,152 +0,0 @@
1/* Kernel module help for i386.
2 Copyright (C) 2001 Rusty Russell.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18#include <linux/moduleloader.h>
19#include <linux/elf.h>
20#include <linux/vmalloc.h>
21#include <linux/fs.h>
22#include <linux/string.h>
23#include <linux/kernel.h>
24#include <linux/bug.h>
25
26#if 0
27#define DEBUGP printk
28#else
29#define DEBUGP(fmt...)
30#endif
31
32void *module_alloc(unsigned long size)
33{
34 if (size == 0)
35 return NULL;
36 return vmalloc_exec(size);
37}
38
39
40/* Free memory returned from module_alloc */
41void module_free(struct module *mod, void *module_region)
42{
43 vfree(module_region);
44 /* FIXME: If module_region == mod->init_region, trim exception
45 table entries. */
46}
47
48/* We don't need anything special. */
49int module_frob_arch_sections(Elf_Ehdr *hdr,
50 Elf_Shdr *sechdrs,
51 char *secstrings,
52 struct module *mod)
53{
54 return 0;
55}
56
57int apply_relocate(Elf32_Shdr *sechdrs,
58 const char *strtab,
59 unsigned int symindex,
60 unsigned int relsec,
61 struct module *me)
62{
63 unsigned int i;
64 Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
65 Elf32_Sym *sym;
66 uint32_t *location;
67
68 DEBUGP("Applying relocate section %u to %u\n", relsec,
69 sechdrs[relsec].sh_info);
70 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
71 /* This is where to make the change */
72 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
73 + rel[i].r_offset;
74 /* This is the symbol it is referring to. Note that all
75 undefined symbols have been resolved. */
76 sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
77 + ELF32_R_SYM(rel[i].r_info);
78
79 switch (ELF32_R_TYPE(rel[i].r_info)) {
80 case R_386_32:
81 /* We add the value into the location given */
82 *location += sym->st_value;
83 break;
84 case R_386_PC32:
 85 /* Add the value, subtract its position */
86 *location += sym->st_value - (uint32_t)location;
87 break;
88 default:
89 printk(KERN_ERR "module %s: Unknown relocation: %u\n",
90 me->name, ELF32_R_TYPE(rel[i].r_info));
91 return -ENOEXEC;
92 }
93 }
94 return 0;
95}
96
97int apply_relocate_add(Elf32_Shdr *sechdrs,
98 const char *strtab,
99 unsigned int symindex,
100 unsigned int relsec,
101 struct module *me)
102{
103 printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
104 me->name);
105 return -ENOEXEC;
106}
107
108int module_finalize(const Elf_Ehdr *hdr,
109 const Elf_Shdr *sechdrs,
110 struct module *me)
111{
112 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
113 *para = NULL;
114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
115
116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
117 if (!strcmp(".text", secstrings + s->sh_name))
118 text = s;
119 if (!strcmp(".altinstructions", secstrings + s->sh_name))
120 alt = s;
121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
122 locks = s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s;
125 }
126
127 if (alt) {
128 /* patch .altinstructions */
129 void *aseg = (void *)alt->sh_addr;
130 apply_alternatives(aseg, aseg + alt->sh_size);
131 }
132 if (locks && text) {
133 void *lseg = (void *)locks->sh_addr;
134 void *tseg = (void *)text->sh_addr;
135 alternatives_smp_module_add(me, me->name,
136 lseg, lseg + locks->sh_size,
137 tseg, tseg + text->sh_size);
138 }
139
140 if (para) {
141 void *pseg = (void *)para->sh_addr;
142 apply_paravirt(pseg, pseg + para->sh_size);
143 }
144
145 return module_bug_finalize(hdr, sechdrs, me);
146}
147
148void module_arch_cleanup(struct module *mod)
149{
150 alternatives_smp_module_del(mod);
151 module_bug_cleanup(mod);
152}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 70fd7e414c1..651c93b2886 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -17,6 +17,7 @@
17#include <linux/acpi.h> 17#include <linux/acpi.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/pci.h>
20 21
21#include <asm/mtrr.h> 22#include <asm/mtrr.h>
22#include <asm/mpspec.h> 23#include <asm/mpspec.h>
@@ -870,24 +871,17 @@ static
870inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} 871inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
871#endif /* CONFIG_X86_IO_APIC */ 872#endif /* CONFIG_X86_IO_APIC */
872 873
873static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, 874static int
874 int count) 875check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
875{ 876{
876 if (!mpc_new_phys) { 877 int ret = 0;
877 pr_info("No spare slots, try to append...take your risk, " 878
878 "new mpc_length %x\n", count); 879 if (!mpc_new_phys || count <= mpc_new_length) {
879 } else { 880 WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
880 if (count <= mpc_new_length) 881 return -1;
881 pr_info("No spare slots, try to append..., "
882 "new mpc_length %x\n", count);
883 else {
884 pr_err("mpc_new_length %lx is too small\n",
885 mpc_new_length);
886 return -1;
887 }
888 } 882 }
889 883
890 return 0; 884 return ret;
891} 885}
892 886
893static int __init replace_intsrc_all(struct mpc_table *mpc, 887static int __init replace_intsrc_all(struct mpc_table *mpc,
@@ -946,7 +940,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
946 } else { 940 } else {
947 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 941 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
948 count += sizeof(struct mpc_intsrc); 942 count += sizeof(struct mpc_intsrc);
949 if (!check_slot(mpc_new_phys, mpc_new_length, count)) 943 if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
950 goto out; 944 goto out;
951 assign_to_mpc_intsrc(&mp_irqs[i], m); 945 assign_to_mpc_intsrc(&mp_irqs[i], m);
952 mpc->length = count; 946 mpc->length = count;
@@ -963,11 +957,14 @@ out:
963 return 0; 957 return 0;
964} 958}
965 959
966static int __initdata enable_update_mptable; 960int enable_update_mptable;
967 961
968static int __init update_mptable_setup(char *str) 962static int __init update_mptable_setup(char *str)
969{ 963{
970 enable_update_mptable = 1; 964 enable_update_mptable = 1;
965#ifdef CONFIG_PCI
966 pci_routeirq = 1;
967#endif
971 return 0; 968 return 0;
972} 969}
973early_param("update_mptable", update_mptable_setup); 970early_param("update_mptable", update_mptable_setup);
@@ -980,6 +977,9 @@ static int __initdata alloc_mptable;
980static int __init parse_alloc_mptable_opt(char *p) 977static int __init parse_alloc_mptable_opt(char *p)
981{ 978{
982 enable_update_mptable = 1; 979 enable_update_mptable = 1;
980#ifdef CONFIG_PCI
981 pci_routeirq = 1;
982#endif
983 alloc_mptable = 1; 983 alloc_mptable = 1;
984 if (!p) 984 if (!p)
985 return 0; 985 return 0;
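Editor's note: both boot options now force pci_routeirq in addition to enabling the mptable update, and both are wired up through the usual early_param() mechanism. A minimal sketch of that mechanism, with a hypothetical option name:

#include <linux/init.h>
#include <linux/kernel.h>

static int demo_feature __initdata;

/* Handles "demo_feature" (or "demo_feature=<arg>") on the kernel command line. */
static int __init demo_feature_setup(char *str)
{
	demo_feature = 1;
	if (str && *str)
		pr_info("demo_feature: extra argument '%s' ignored\n", str);
	return 0;
}
early_param("demo_feature", demo_feature_setup);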
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 3cf3413ec62..98fd6cd4e3a 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -196,6 +196,11 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
196 .notifier_call = msr_class_cpu_callback, 196 .notifier_call = msr_class_cpu_callback,
197}; 197};
198 198
199static char *msr_nodename(struct device *dev)
200{
201 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
202}
203
199static int __init msr_init(void) 204static int __init msr_init(void)
200{ 205{
201 int i, err = 0; 206 int i, err = 0;
@@ -212,6 +217,7 @@ static int __init msr_init(void)
212 err = PTR_ERR(msr_class); 217 err = PTR_ERR(msr_class);
213 goto out_chrdev; 218 goto out_chrdev;
214 } 219 }
220 msr_class->nodename = msr_nodename;
215 for_each_online_cpu(i) { 221 for_each_online_cpu(i) {
216 err = msr_device_create(i); 222 err = msr_device_create(i);
217 if (err != 0) 223 if (err != 0)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 8e45f446488..70ec9b951d7 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -134,7 +134,9 @@ static void *get_call_destination(u8 type)
134 .pv_irq_ops = pv_irq_ops, 134 .pv_irq_ops = pv_irq_ops,
135 .pv_apic_ops = pv_apic_ops, 135 .pv_apic_ops = pv_apic_ops,
136 .pv_mmu_ops = pv_mmu_ops, 136 .pv_mmu_ops = pv_mmu_ops,
137#ifdef CONFIG_PARAVIRT_SPINLOCKS
137 .pv_lock_ops = pv_lock_ops, 138 .pv_lock_ops = pv_lock_ops,
139#endif
138 }; 140 };
139 return *((void **)&tmpl + type); 141 return *((void **)&tmpl + type);
140} 142}
@@ -246,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
246 248
247static inline void enter_lazy(enum paravirt_lazy_mode mode) 249static inline void enter_lazy(enum paravirt_lazy_mode mode)
248{ 250{
249 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); 251 BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
250 BUG_ON(preemptible());
251 252
252 __get_cpu_var(paravirt_lazy_mode) = mode; 253 percpu_write(paravirt_lazy_mode, mode);
253} 254}
254 255
255void paravirt_leave_lazy(enum paravirt_lazy_mode mode) 256static void leave_lazy(enum paravirt_lazy_mode mode)
256{ 257{
257 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); 258 BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
258 BUG_ON(preemptible());
259 259
260 __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; 260 percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
261} 261}
262 262
263void paravirt_enter_lazy_mmu(void) 263void paravirt_enter_lazy_mmu(void)
@@ -267,22 +267,36 @@ void paravirt_enter_lazy_mmu(void)
267 267
268void paravirt_leave_lazy_mmu(void) 268void paravirt_leave_lazy_mmu(void)
269{ 269{
270 paravirt_leave_lazy(PARAVIRT_LAZY_MMU); 270 leave_lazy(PARAVIRT_LAZY_MMU);
271} 271}
272 272
273void paravirt_enter_lazy_cpu(void) 273void paravirt_start_context_switch(struct task_struct *prev)
274{ 274{
275 BUG_ON(preemptible());
276
277 if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
278 arch_leave_lazy_mmu_mode();
279 set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
280 }
275 enter_lazy(PARAVIRT_LAZY_CPU); 281 enter_lazy(PARAVIRT_LAZY_CPU);
276} 282}
277 283
278void paravirt_leave_lazy_cpu(void) 284void paravirt_end_context_switch(struct task_struct *next)
279{ 285{
280 paravirt_leave_lazy(PARAVIRT_LAZY_CPU); 286 BUG_ON(preemptible());
287
288 leave_lazy(PARAVIRT_LAZY_CPU);
289
290 if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
291 arch_enter_lazy_mmu_mode();
281} 292}
282 293
283enum paravirt_lazy_mode paravirt_get_lazy_mode(void) 294enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
284{ 295{
285 return __get_cpu_var(paravirt_lazy_mode); 296 if (in_interrupt())
297 return PARAVIRT_LAZY_NONE;
298
299 return percpu_read(paravirt_lazy_mode);
286} 300}
287 301
288void arch_flush_lazy_mmu_mode(void) 302void arch_flush_lazy_mmu_mode(void)
@@ -290,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void)
290 preempt_disable(); 304 preempt_disable();
291 305
292 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 306 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
293 WARN_ON(preempt_count() == 1);
294 arch_leave_lazy_mmu_mode(); 307 arch_leave_lazy_mmu_mode();
295 arch_enter_lazy_mmu_mode(); 308 arch_enter_lazy_mmu_mode();
296 } 309 }
@@ -298,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void)
298 preempt_enable(); 311 preempt_enable();
299} 312}
300 313
301void arch_flush_lazy_cpu_mode(void)
302{
303 preempt_disable();
304
305 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
306 WARN_ON(preempt_count() == 1);
307 arch_leave_lazy_cpu_mode();
308 arch_enter_lazy_cpu_mode();
309 }
310
311 preempt_enable();
312}
313
314struct pv_info pv_info = { 314struct pv_info pv_info = {
315 .name = "bare hardware", 315 .name = "bare hardware",
316 .paravirt_enabled = 0, 316 .paravirt_enabled = 0,
@@ -402,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = {
402 .set_iopl_mask = native_set_iopl_mask, 402 .set_iopl_mask = native_set_iopl_mask,
403 .io_delay = native_io_delay, 403 .io_delay = native_io_delay,
404 404
405 .lazy_mode = { 405 .start_context_switch = paravirt_nop,
406 .enter = paravirt_nop, 406 .end_context_switch = paravirt_nop,
407 .leave = paravirt_nop,
408 },
409}; 407};
410 408
411struct pv_apic_ops pv_apic_ops = { 409struct pv_apic_ops pv_apic_ops = {
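Editor's note: the lazy-mode bookkeeping above switches from __get_cpu_var() to the percpu_read()/percpu_write() accessors, which on x86 compile down to a single segment-relative instruction instead of an address calculation through smp_processor_id(). A sketch of that access pattern, using an illustrative variable name:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, demo_mode);

static void demo_set_mode(int mode)
{
	/* one %fs/%gs-relative store on x86, no explicit CPU lookup */
	percpu_write(demo_mode, mode);
}

static int demo_get_mode(void)
{
	return percpu_read(demo_mode);
}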
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 755c21e906f..971a3bec47a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {
186 186
187static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; 187static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
188 188
189/* enable this to stress test the chip's TCE cache */
190#ifdef CONFIG_IOMMU_DEBUG
191static int debugging = 1;
192
193static inline unsigned long verify_bit_range(unsigned long* bitmap,
194 int expected, unsigned long start, unsigned long end)
195{
196 unsigned long idx = start;
197
198 BUG_ON(start >= end);
199
200 while (idx < end) {
201 if (!!test_bit(idx, bitmap) != expected)
202 return idx;
203 ++idx;
204 }
205
206 /* all bits have the expected value */
207 return ~0UL;
208}
209#else /* debugging is disabled */
210static int debugging;
211
212static inline unsigned long verify_bit_range(unsigned long* bitmap,
213 int expected, unsigned long start, unsigned long end)
214{
215 return ~0UL;
216}
217
218#endif /* CONFIG_IOMMU_DEBUG */
219
220static inline int translation_enabled(struct iommu_table *tbl) 189static inline int translation_enabled(struct iommu_table *tbl)
221{ 190{
222 /* only PHBs with translation enabled have an IOMMU table */ 191 /* only PHBs with translation enabled have an IOMMU table */
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
228{ 197{
229 unsigned long index; 198 unsigned long index;
230 unsigned long end; 199 unsigned long end;
231 unsigned long badbit;
232 unsigned long flags; 200 unsigned long flags;
233 201
234 index = start_addr >> PAGE_SHIFT; 202 index = start_addr >> PAGE_SHIFT;
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
243 211
244 spin_lock_irqsave(&tbl->it_lock, flags); 212 spin_lock_irqsave(&tbl->it_lock, flags);
245 213
246 badbit = verify_bit_range(tbl->it_map, 0, index, end);
247 if (badbit != ~0UL) {
248 if (printk_ratelimit())
249 printk(KERN_ERR "Calgary: entry already allocated at "
250 "0x%lx tbl %p dma 0x%lx npages %u\n",
251 badbit, tbl, start_addr, npages);
252 }
253
254 iommu_area_reserve(tbl->it_map, index, npages); 214 iommu_area_reserve(tbl->it_map, index, npages);
255 215
256 spin_unlock_irqrestore(&tbl->it_lock, flags); 216 spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
326 unsigned int npages) 286 unsigned int npages)
327{ 287{
328 unsigned long entry; 288 unsigned long entry;
329 unsigned long badbit;
330 unsigned long badend; 289 unsigned long badend;
331 unsigned long flags; 290 unsigned long flags;
332 291
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
346 305
347 spin_lock_irqsave(&tbl->it_lock, flags); 306 spin_lock_irqsave(&tbl->it_lock, flags);
348 307
349 badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
350 if (badbit != ~0UL) {
351 if (printk_ratelimit())
352 printk(KERN_ERR "Calgary: bit is off at 0x%lx "
353 "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
354 badbit, tbl, dma_addr, entry, npages);
355 }
356
357 iommu_area_free(tbl->it_map, entry, npages); 308 iommu_area_free(tbl->it_map, entry, npages);
358 309
359 spin_unlock_irqrestore(&tbl->it_lock, flags); 310 spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void)
1488 iommu_detected = 1; 1439 iommu_detected = 1;
1489 calgary_detected = 1; 1440 calgary_detected = 1;
1490 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); 1441 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
1491 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1442 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
1492 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1443 specified_table_size);
1493 debugging ? "enabled" : "disabled");
1494 1444
1495 /* swiotlb for devices that aren't behind the Calgary. */ 1445 /* swiotlb for devices that aren't behind the Calgary. */
1496 if (max_pfn > MAX_DMA32_PFN) 1446 if (max_pfn > MAX_DMA32_PFN)
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index b284b58c035..cfd9f906389 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -144,48 +144,21 @@ static void flush_gart(void)
144} 144}
145 145
146#ifdef CONFIG_IOMMU_LEAK 146#ifdef CONFIG_IOMMU_LEAK
147
148#define SET_LEAK(x) \
149 do { \
150 if (iommu_leak_tab) \
151 iommu_leak_tab[x] = __builtin_return_address(0);\
152 } while (0)
153
154#define CLEAR_LEAK(x) \
155 do { \
156 if (iommu_leak_tab) \
157 iommu_leak_tab[x] = NULL; \
158 } while (0)
159
160/* Debugging aid for drivers that don't free their IOMMU tables */ 147/* Debugging aid for drivers that don't free their IOMMU tables */
161static void **iommu_leak_tab;
162static int leak_trace; 148static int leak_trace;
163static int iommu_leak_pages = 20; 149static int iommu_leak_pages = 20;
164 150
165static void dump_leak(void) 151static void dump_leak(void)
166{ 152{
167 int i;
168 static int dump; 153 static int dump;
169 154
170 if (dump || !iommu_leak_tab) 155 if (dump)
171 return; 156 return;
172 dump = 1; 157 dump = 1;
173 show_stack(NULL, NULL);
174 158
175 /* Very crude. dump some from the end of the table too */ 159 show_stack(NULL, NULL);
176 printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", 160 debug_dma_dump_mappings(NULL);
177 iommu_leak_pages);
178 for (i = 0; i < iommu_leak_pages; i += 2) {
179 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
180 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
181 0);
182 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
183 }
184 printk(KERN_DEBUG "\n");
185} 161}
186#else
187# define SET_LEAK(x)
188# define CLEAR_LEAK(x)
189#endif 162#endif
190 163
191static void iommu_full(struct device *dev, size_t size, int dir) 164static void iommu_full(struct device *dev, size_t size, int dir)
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
248 221
249 for (i = 0; i < npages; i++) { 222 for (i = 0; i < npages; i++) {
250 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); 223 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
251 SET_LEAK(iommu_page + i);
252 phys_mem += PAGE_SIZE; 224 phys_mem += PAGE_SIZE;
253 } 225 }
254 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 226 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
294 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); 266 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
295 for (i = 0; i < npages; i++) { 267 for (i = 0; i < npages; i++) {
296 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 268 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
297 CLEAR_LEAK(iommu_page + i);
298 } 269 }
299 free_iommu(iommu_page, npages); 270 free_iommu(iommu_page, npages);
300} 271}
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
377 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); 348 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
378 while (pages--) { 349 while (pages--) {
379 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 350 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
380 SET_LEAK(iommu_page);
381 addr += PAGE_SIZE; 351 addr += PAGE_SIZE;
382 iommu_page++; 352 iommu_page++;
383 } 353 }
@@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
688 658
689 agp_gatt_table = gatt; 659 agp_gatt_table = gatt;
690 660
691 enable_gart_translations();
692
693 error = sysdev_class_register(&gart_sysdev_class); 661 error = sysdev_class_register(&gart_sysdev_class);
694 if (!error) 662 if (!error)
695 error = sysdev_register(&device_gart); 663 error = sysdev_register(&device_gart);
@@ -801,11 +769,12 @@ void __init gart_iommu_init(void)
801 769
802#ifdef CONFIG_IOMMU_LEAK 770#ifdef CONFIG_IOMMU_LEAK
803 if (leak_trace) { 771 if (leak_trace) {
804 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 772 int ret;
805 get_order(iommu_pages*sizeof(void *))); 773
806 if (!iommu_leak_tab) 774 ret = dma_debug_resize_entries(iommu_pages);
775 if (ret)
807 printk(KERN_DEBUG 776 printk(KERN_DEBUG
808 "PCI-DMA: Cannot allocate leak trace area\n"); 777 "PCI-DMA: Cannot trace all the entries\n");
809 } 778 }
810#endif 779#endif
811 780
@@ -845,6 +814,14 @@ void __init gart_iommu_init(void)
845 * the pages as Not-Present: 814 * the pages as Not-Present:
846 */ 815 */
847 wbinvd(); 816 wbinvd();
817
818 /*
819 * Now all caches are flushed and we can safely enable
820 * GART hardware. Doing it early leaves the possibility
821 * of stale cache entries that can lead to GART PTE
822 * errors.
823 */
824 enable_gart_translations();
848 825
849 /* 826 /*
850 * Try to workaround a bug (thanks to BenH): 827 * Try to workaround a bug (thanks to BenH):
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 221a3853e26..a1712f2b50f 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
28 return paddr; 28 return paddr;
29} 29}
30 30
31phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) 31phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
32{ 32{
33 return baddr; 33 return baddr;
34} 34}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 19a686c401b..fc6e4b773fc 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,9 +8,11 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h>
11#include <trace/power.h> 12#include <trace/power.h>
12#include <asm/system.h> 13#include <asm/system.h>
13#include <asm/apic.h> 14#include <asm/apic.h>
15#include <asm/syscalls.h>
14#include <asm/idle.h> 16#include <asm/idle.h>
15#include <asm/uaccess.h> 17#include <asm/uaccess.h>
16#include <asm/i387.h> 18#include <asm/i387.h>
@@ -65,7 +67,7 @@ void arch_task_cache_init(void)
65 task_xstate_cachep = 67 task_xstate_cachep =
66 kmem_cache_create("task_xstate", xstate_size, 68 kmem_cache_create("task_xstate", xstate_size,
67 __alignof__(union thread_xstate), 69 __alignof__(union thread_xstate),
68 SLAB_PANIC, NULL); 70 SLAB_PANIC | SLAB_NOTRACK, NULL);
69} 71}
70 72
71/* 73/*
@@ -604,3 +606,16 @@ static int __init idle_setup(char *str)
604} 606}
605early_param("idle", idle_setup); 607early_param("idle", idle_setup);
606 608
609unsigned long arch_align_stack(unsigned long sp)
610{
611 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
612 sp -= get_random_int() % 8192;
613 return sp & ~0xf;
614}
615
616unsigned long arch_randomize_brk(struct mm_struct *mm)
617{
618 unsigned long range_end = mm->brk + 0x02000000;
619 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
620}
621
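Editor's note: the consolidated arch_align_stack() subtracts up to 8 KB of random offset and then rounds down to a 16-byte boundary, so the effective randomization is at most 512 distinct stack starts. A small user-space illustration of that arithmetic (all values made up):

#include <stdio.h>

int main(void)
{
	unsigned long sp  = 0x7fffffffe000UL;	/* hypothetical initial stack top        */
	unsigned long rnd = 5123;		/* stand-in for get_random_int() % 8192  */

	unsigned long aligned = (sp - rnd) & ~0xfUL;	/* subtract, then 16-byte align */

	printf("sp=%#lx rnd=%lu -> %#lx\n", sp, rnd, aligned);
	return 0;
}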
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 297ffff2ffc..00a8fe4c58b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,8 +9,6 @@
9 * This file handles the architecture-dependent parts of process handling.. 9 * This file handles the architecture-dependent parts of process handling..
10 */ 10 */
11 11
12#include <stdarg.h>
13
14#include <linux/stackprotector.h> 12#include <linux/stackprotector.h>
15#include <linux/cpu.h> 13#include <linux/cpu.h>
16#include <linux/errno.h> 14#include <linux/errno.h>
@@ -33,7 +31,6 @@
33#include <linux/module.h> 31#include <linux/module.h>
34#include <linux/kallsyms.h> 32#include <linux/kallsyms.h>
35#include <linux/ptrace.h> 33#include <linux/ptrace.h>
36#include <linux/random.h>
37#include <linux/personality.h> 34#include <linux/personality.h>
38#include <linux/tick.h> 35#include <linux/tick.h>
39#include <linux/percpu.h> 36#include <linux/percpu.h>
@@ -419,7 +416,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
419 * done before math_state_restore, so the TS bit is up 416 * done before math_state_restore, so the TS bit is up
420 * to date. 417 * to date.
421 */ 418 */
422 arch_leave_lazy_cpu_mode(); 419 arch_end_context_switch(next_p);
423 420
424 /* If the task has used fpu the last 5 timeslices, just do a full 421 /* If the task has used fpu the last 5 timeslices, just do a full
425 * restore of the math state immediately to avoid the trap; the 422 * restore of the math state immediately to avoid the trap; the
@@ -526,15 +523,3 @@ unsigned long get_wchan(struct task_struct *p)
526 return 0; 523 return 0;
527} 524}
528 525
529unsigned long arch_align_stack(unsigned long sp)
530{
531 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
532 sp -= get_random_int() % 8192;
533 return sp & ~0xf;
534}
535
536unsigned long arch_randomize_brk(struct mm_struct *mm)
537{
538 unsigned long range_end = mm->brk + 0x02000000;
539 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
540}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index f7b276d4b3f..89c46f1259d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,8 +14,6 @@
14 * This file handles the architecture-dependent parts of process handling.. 14 * This file handles the architecture-dependent parts of process handling..
15 */ 15 */
16 16
17#include <stdarg.h>
18
19#include <linux/stackprotector.h> 17#include <linux/stackprotector.h>
20#include <linux/cpu.h> 18#include <linux/cpu.h>
21#include <linux/errno.h> 19#include <linux/errno.h>
@@ -32,7 +30,6 @@
32#include <linux/delay.h> 30#include <linux/delay.h>
33#include <linux/module.h> 31#include <linux/module.h>
34#include <linux/ptrace.h> 32#include <linux/ptrace.h>
35#include <linux/random.h>
36#include <linux/notifier.h> 33#include <linux/notifier.h>
37#include <linux/kprobes.h> 34#include <linux/kprobes.h>
38#include <linux/kdebug.h> 35#include <linux/kdebug.h>
@@ -442,7 +439,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
442 * done before math_state_restore, so the TS bit is up 439 * done before math_state_restore, so the TS bit is up
443 * to date. 440 * to date.
444 */ 441 */
445 arch_leave_lazy_cpu_mode(); 442 arch_end_context_switch(next_p);
446 443
447 /* 444 /*
448 * Switch FS and GS. 445 * Switch FS and GS.
@@ -692,15 +689,3 @@ long sys_arch_prctl(int code, unsigned long addr)
692 return do_arch_prctl(current, code, addr); 689 return do_arch_prctl(current, code, addr);
693} 690}
694 691
695unsigned long arch_align_stack(unsigned long sp)
696{
697 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
698 sp -= get_random_int() % 8192;
699 return sp & ~0xf;
700}
701
702unsigned long arch_randomize_brk(struct mm_struct *mm)
703{
704 unsigned long range_end = mm->brk + 0x02000000;
705 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
706}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 7563b31b4f0..af71d06624b 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,5 +491,42 @@ void force_hpet_resume(void)
491 break; 491 break;
492 } 492 }
493} 493}
494#endif
495
496#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
497/* Set correct numa_node information for AMD NB functions */
498static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{
500 struct pci_dev *nb_ht;
501 unsigned int devfn;
502 u32 val;
503
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
505 nb_ht = pci_get_slot(dev->bus, devfn);
506 if (!nb_ht)
507 return;
508
509 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7);
511 pci_dev_put(dev);
512}
494 513
514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
515 quirk_amd_nb_node);
516DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
517 quirk_amd_nb_node);
518DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
519 quirk_amd_nb_node);
520DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
521 quirk_amd_nb_node);
522DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
523 quirk_amd_nb_node);
524DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
525 quirk_amd_nb_node);
526DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
527 quirk_amd_nb_node);
528DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
529 quirk_amd_nb_node);
530DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
531 quirk_amd_nb_node);
495#endif 532#endif
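
The quirk added above hooks into the generic PCI fixup machinery: pci_get_slot() grabs function 0 of the same slot, the node id is read from config dword 0x60, and set_dev_node() propagates it to the struct device. A minimal sketch of that fixup pattern follows; the vendor/device IDs and register layout are placeholders for illustration, not taken from this commit:

/*
 * Illustrative sketch of a final PCI fixup that assigns a NUMA node.
 * The 0x1022/0x1100 IDs are hypothetical placeholders.
 */
#include <linux/pci.h>

static void __init example_nb_set_node(struct pci_dev *dev)
{
	u32 val;

	/* device-specific register assumed to encode the node in its low bits */
	pci_read_config_dword(dev, 0x60, &val);
	set_dev_node(&dev->dev, val & 7);
}
DECLARE_PCI_FIXUP_FINAL(0x1022, 0x1100, example_nb_set_node);
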
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1340dad417f..d2d1ce8170f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
192 DMI_MATCH(DMI_BOARD_NAME, "0KP561"), 192 DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
193 }, 193 },
194 }, 194 },
195 { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
196 .callback = set_bios_reboot,
197 .ident = "Dell OptiPlex 360",
198 .matches = {
199 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
200 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
201 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
202 },
203 },
195 { /* Handle problems with rebooting on Dell 2400's */ 204 { /* Handle problems with rebooting on Dell 2400's */
196 .callback = set_bios_reboot, 205 .callback = set_bios_reboot,
197 .ident = "Dell PowerEdge 2400", 206 .ident = "Dell PowerEdge 2400",
@@ -232,6 +241,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
232 DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"), 241 DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
233 }, 242 },
234 }, 243 },
244 { /* Handle problems with rebooting on Sony VGN-Z540N */
245 .callback = set_bios_reboot,
246 .ident = "Sony VGN-Z540N",
247 .matches = {
248 DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
249 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
250 },
251 },
235 { } 252 { }
236}; 253};
237 254
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4158439bf6..be5ae80f897 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,14 @@
112#define ARCH_SETUP 112#define ARCH_SETUP
113#endif 113#endif
114 114
115/*
116 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
117 * The direct mapping extends to max_pfn_mapped, so that we can directly access
118 * apertures, ACPI and other tables without having to play with fixmaps.
119 */
120unsigned long max_low_pfn_mapped;
121unsigned long max_pfn_mapped;
122
115RESERVE_BRK(dmi_alloc, 65536); 123RESERVE_BRK(dmi_alloc, 65536);
116 124
117unsigned int boot_cpu_id __read_mostly; 125unsigned int boot_cpu_id __read_mostly;
@@ -214,8 +222,8 @@ unsigned long mmu_cr4_features;
214unsigned long mmu_cr4_features = X86_CR4_PAE; 222unsigned long mmu_cr4_features = X86_CR4_PAE;
215#endif 223#endif
216 224
217/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 225/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
218int bootloader_type; 226int bootloader_type, bootloader_version;
219 227
220/* 228/*
221 * Setup options 229 * Setup options
@@ -293,15 +301,13 @@ static void __init reserve_brk(void)
293 301
294#ifdef CONFIG_BLK_DEV_INITRD 302#ifdef CONFIG_BLK_DEV_INITRD
295 303
296#ifdef CONFIG_X86_32
297
298#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 304#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
299static void __init relocate_initrd(void) 305static void __init relocate_initrd(void)
300{ 306{
301 307
302 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 308 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
303 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 309 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
304 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; 310 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
305 u64 ramdisk_here; 311 u64 ramdisk_here;
306 unsigned long slop, clen, mapaddr; 312 unsigned long slop, clen, mapaddr;
307 char *p, *q; 313 char *p, *q;
@@ -357,14 +363,13 @@ static void __init relocate_initrd(void)
357 ramdisk_image, ramdisk_image + ramdisk_size - 1, 363 ramdisk_image, ramdisk_image + ramdisk_size - 1,
358 ramdisk_here, ramdisk_here + ramdisk_size - 1); 364 ramdisk_here, ramdisk_here + ramdisk_size - 1);
359} 365}
360#endif
361 366
362static void __init reserve_initrd(void) 367static void __init reserve_initrd(void)
363{ 368{
364 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 369 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
365 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 370 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
366 u64 ramdisk_end = ramdisk_image + ramdisk_size; 371 u64 ramdisk_end = ramdisk_image + ramdisk_size;
367 u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; 372 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
368 373
369 if (!boot_params.hdr.type_of_loader || 374 if (!boot_params.hdr.type_of_loader ||
370 !ramdisk_image || !ramdisk_size) 375 !ramdisk_image || !ramdisk_size)
@@ -394,14 +399,8 @@ static void __init reserve_initrd(void)
394 return; 399 return;
395 } 400 }
396 401
397#ifdef CONFIG_X86_32
398 relocate_initrd(); 402 relocate_initrd();
399#else 403
400 printk(KERN_ERR "initrd extends beyond end of memory "
401 "(0x%08llx > 0x%08llx)\ndisabling initrd\n",
402 ramdisk_end, end_of_lowmem);
403 initrd_start = 0;
404#endif
405 free_early(ramdisk_image, ramdisk_end); 404 free_early(ramdisk_image, ramdisk_end);
406} 405}
407#else 406#else
@@ -706,6 +705,12 @@ void __init setup_arch(char **cmdline_p)
706#endif 705#endif
707 saved_video_mode = boot_params.hdr.vid_mode; 706 saved_video_mode = boot_params.hdr.vid_mode;
708 bootloader_type = boot_params.hdr.type_of_loader; 707 bootloader_type = boot_params.hdr.type_of_loader;
708 if ((bootloader_type >> 4) == 0xe) {
709 bootloader_type &= 0xf;
710 bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
711 }
712 bootloader_version = bootloader_type & 0xf;
713 bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
709 714
710#ifdef CONFIG_BLK_DEV_RAM 715#ifdef CONFIG_BLK_DEV_RAM
711 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; 716 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
@@ -854,12 +859,16 @@ void __init setup_arch(char **cmdline_p)
854 max_low_pfn = max_pfn; 859 max_low_pfn = max_pfn;
855 860
856 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 861 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
862 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
857#endif 863#endif
858 864
859#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION 865#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
860 setup_bios_corruption_check(); 866 setup_bios_corruption_check();
861#endif 867#endif
862 868
869 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
870 max_pfn_mapped<<PAGE_SHIFT);
871
863 reserve_brk(); 872 reserve_brk();
864 873
865 /* max_pfn_mapped is updated here */ 874 /* max_pfn_mapped is updated here */
@@ -997,24 +1006,6 @@ void __init setup_arch(char **cmdline_p)
997#ifdef CONFIG_X86_32 1006#ifdef CONFIG_X86_32
998 1007
999/** 1008/**
1000 * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
1001 *
1002 * Description:
1003 * Perform any necessary interrupt initialisation prior to setting up
1004 * the "ordinary" interrupt call gates. For legacy reasons, the ISA
1005 * interrupts should be initialised here if the machine emulates a PC
1006 * in any way.
1007 **/
1008void __init x86_quirk_pre_intr_init(void)
1009{
1010 if (x86_quirks->arch_pre_intr_init) {
1011 if (x86_quirks->arch_pre_intr_init())
1012 return;
1013 }
1014 init_ISA_irqs();
1015}
1016
1017/**
1018 * x86_quirk_intr_init - post gate setup interrupt initialisation 1009 * x86_quirk_intr_init - post gate setup interrupt initialisation
1019 * 1010 *
1020 * Description: 1011 * Description:
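
The bootloader_type handling added earlier in this file folds the header's ext_loader_type/ext_loader_ver fields into the existing integers whenever the legacy loader nibble reads 0xE. A stand-alone sketch of that decoding, with made-up header values, shows the resulting encoding:

/*
 * Stand-alone sketch of the ext_loader decoding added to setup_arch();
 * the header values below are made up for illustration.
 */
#include <stdio.h>

int main(void)
{
	unsigned char type_of_loader = 0xe4;	/* legacy type 0xE, version 4 */
	unsigned char ext_loader_type = 0x05;	/* extended loader type */
	unsigned char ext_loader_ver  = 0x02;	/* extended version bits */
	int bootloader_type, bootloader_version;

	bootloader_type = type_of_loader;
	if ((bootloader_type >> 4) == 0xe) {
		bootloader_type &= 0xf;
		bootloader_type |= (ext_loader_type + 0x10) << 4;
	}
	bootloader_version  = bootloader_type & 0xf;
	bootloader_version |= ext_loader_ver << 4;

	/* prints: type 0x154 version 0x24 */
	printf("type 0x%x version 0x%x\n", bootloader_type, bootloader_version);
	return 0;
}
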
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 3a97a4cf187..9c3f0823e6a 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -160,8 +160,10 @@ static ssize_t __init setup_pcpu_remap(size_t static_size)
160 /* 160 /*
161 * If large page isn't supported, there's no benefit in doing 161 * If large page isn't supported, there's no benefit in doing
162 * this. Also, on non-NUMA, embedding is better. 162 * this. Also, on non-NUMA, embedding is better.
163 *
164 * NOTE: disabled for now.
163 */ 165 */
164 if (!cpu_has_pse || !pcpu_need_numa()) 166 if (true || !cpu_has_pse || !pcpu_need_numa())
165 return -EINVAL; 167 return -EINVAL;
166 168
167 /* 169 /*
@@ -423,6 +425,14 @@ void __init setup_per_cpu_areas(void)
423 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 425 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
424#endif 426#endif
425 427
428#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
429 /*
430 * make sure boot cpu node_number is right, when boot cpu is on the
431 * node that doesn't have mem installed
432 */
433 per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
434#endif
435
426 /* Setup node to cpumask map */ 436 /* Setup node to cpumask map */
427 setup_node_to_cpumask_map(); 437 setup_node_to_cpumask_map();
428 438
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index f33d2e0ef09..0f89a4f20db 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#include <linux/sched.h> 9#include <linux/sched.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
@@ -25,11 +24,11 @@
25#include <asm/ucontext.h> 24#include <asm/ucontext.h>
26#include <asm/i387.h> 25#include <asm/i387.h>
27#include <asm/vdso.h> 26#include <asm/vdso.h>
27#include <asm/mce.h>
28 28
29#ifdef CONFIG_X86_64 29#ifdef CONFIG_X86_64
30#include <asm/proto.h> 30#include <asm/proto.h>
31#include <asm/ia32_unistd.h> 31#include <asm/ia32_unistd.h>
32#include <asm/mce.h>
33#endif /* CONFIG_X86_64 */ 32#endif /* CONFIG_X86_64 */
34 33
35#include <asm/syscall.h> 34#include <asm/syscall.h>
@@ -848,10 +847,10 @@ static void do_signal(struct pt_regs *regs)
848void 847void
849do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 848do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
850{ 849{
851#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) 850#ifdef CONFIG_X86_NEW_MCE
852 /* notify userspace of pending MCEs */ 851 /* notify userspace of pending MCEs */
853 if (thread_info_flags & _TIF_MCE_NOTIFY) 852 if (thread_info_flags & _TIF_MCE_NOTIFY)
854 mce_notify_user(); 853 mce_notify_process();
855#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ 854#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
856 855
857 /* deal with pending signal delivery */ 856 /* deal with pending signal delivery */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 13f33ea8cca..ec1de97600e 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask)
150 * this function calls the 'stop' function on all other CPUs in the system. 150 * this function calls the 'stop' function on all other CPUs in the system.
151 */ 151 */
152 152
153asmlinkage void smp_reboot_interrupt(void)
154{
155 ack_APIC_irq();
156 irq_enter();
157 stop_this_cpu(NULL);
158 irq_exit();
159}
160
153static void native_smp_send_stop(void) 161static void native_smp_send_stop(void)
154{ 162{
155 unsigned long flags; 163 unsigned long flags;
164 unsigned long wait;
156 165
157 if (reboot_force) 166 if (reboot_force)
158 return; 167 return;
159 168
160 smp_call_function(stop_this_cpu, NULL, 0); 169 /*
170 * Use an own vector here because smp_call_function
171 * does lots of things not suitable in a panic situation.
172 * On most systems we could also use an NMI here,
173 * but there are a few systems around where NMI
174 * is problematic so stay with an non NMI for now
175 * (this implies we cannot stop CPUs spinning with irq off
176 * currently)
177 */
178 if (num_online_cpus() > 1) {
179 apic->send_IPI_allbutself(REBOOT_VECTOR);
180
181 /* Don't wait longer than a second */
182 wait = USEC_PER_SEC;
183 while (num_online_cpus() > 1 && wait--)
184 udelay(1);
185 }
186
161 local_irq_save(flags); 187 local_irq_save(flags);
162 disable_local_APIC(); 188 disable_local_APIC();
163 local_irq_restore(flags); 189 local_irq_restore(flags);
@@ -172,6 +198,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
172{ 198{
173 ack_APIC_irq(); 199 ack_APIC_irq();
174 inc_irq_stat(irq_resched_count); 200 inc_irq_stat(irq_resched_count);
201 /*
202 * KVM uses this interrupt to force a cpu out of guest mode
203 */
175} 204}
176 205
177void smp_call_function_interrupt(struct pt_regs *regs) 206void smp_call_function_interrupt(struct pt_regs *regs)
@@ -193,19 +222,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
193} 222}
194 223
195struct smp_ops smp_ops = { 224struct smp_ops smp_ops = {
196 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 225 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
197 .smp_prepare_cpus = native_smp_prepare_cpus, 226 .smp_prepare_cpus = native_smp_prepare_cpus,
198 .smp_cpus_done = native_smp_cpus_done, 227 .smp_cpus_done = native_smp_cpus_done,
199 228
200 .smp_send_stop = native_smp_send_stop, 229 .smp_send_stop = native_smp_send_stop,
201 .smp_send_reschedule = native_smp_send_reschedule, 230 .smp_send_reschedule = native_smp_send_reschedule,
202 231
203 .cpu_up = native_cpu_up, 232 .cpu_up = native_cpu_up,
204 .cpu_die = native_cpu_die, 233 .cpu_die = native_cpu_die,
205 .cpu_disable = native_cpu_disable, 234 .cpu_disable = native_cpu_disable,
206 .play_dead = native_play_dead, 235 .play_dead = native_play_dead,
207 236
208 .send_call_func_ipi = native_send_call_func_ipi, 237 .send_call_func_ipi = native_send_call_func_ipi,
209 .send_call_func_single_ipi = native_send_call_func_single_ipi, 238 .send_call_func_single_ipi = native_send_call_func_single_ipi,
210}; 239};
211EXPORT_SYMBOL_GPL(smp_ops); 240EXPORT_SYMBOL_GPL(smp_ops);
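
The rewritten stop path above avoids smp_call_function() in a panic-like context: it fires a dedicated REBOOT_VECTOR IPI and then polls for at most one second before carrying on. The waiting pattern in isolation looks roughly like this; send_stop_ipi() and cpus_still_online() are placeholders standing in for the real APIC and cpu-mask calls:

/*
 * Sketch of the bounded-wait pattern used by native_smp_send_stop().
 * send_stop_ipi() and cpus_still_online() are placeholders for
 * apic->send_IPI_allbutself(REBOOT_VECTOR) and num_online_cpus() > 1.
 */
#include <stdbool.h>

#define USEC_PER_SEC 1000000UL

extern void send_stop_ipi(void);
extern bool cpus_still_online(void);
extern void udelay(unsigned long usecs);

static void stop_other_cpus(void)
{
	unsigned long wait = USEC_PER_SEC;	/* never wait longer than 1s */

	send_stop_ipi();
	while (cpus_still_online() && wait--)
		udelay(1);
	/* proceed to disable the local APIC even if some CPUs never stopped */
}
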
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2b2652d205c..dee0f3d814a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -506,7 +506,7 @@ void __inquire_remote_apic(int apicid)
506 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this 506 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
507 * won't ... remember to clear down the APIC, etc later. 507 * won't ... remember to clear down the APIC, etc later.
508 */ 508 */
509int __devinit 509int __cpuinit
510wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) 510wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
511{ 511{
512 unsigned long send_status, accept_status = 0; 512 unsigned long send_status, accept_status = 0;
@@ -540,7 +540,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
540 return (send_status | accept_status); 540 return (send_status | accept_status);
541} 541}
542 542
543int __devinit 543static int __cpuinit
544wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) 544wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
545{ 545{
546 unsigned long send_status, accept_status = 0; 546 unsigned long send_status, accept_status = 0;
@@ -824,10 +824,12 @@ do_rest:
824 /* mark "stuck" area as not stuck */ 824 /* mark "stuck" area as not stuck */
825 *((volatile unsigned long *)trampoline_base) = 0; 825 *((volatile unsigned long *)trampoline_base) = 0;
826 826
827 /* 827 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
828 * Cleanup possible dangling ends... 828 /*
829 */ 829 * Cleanup possible dangling ends...
830 smpboot_restore_warm_reset_vector(); 830 */
831 smpboot_restore_warm_reset_vector();
832 }
831 833
832 return boot_error; 834 return boot_error;
833} 835}
@@ -873,7 +875,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
873 875
874 err = do_boot_cpu(apicid, cpu); 876 err = do_boot_cpu(apicid, cpu);
875 877
876 zap_low_mappings(); 878 zap_low_mappings(false);
877 low_mappings = 0; 879 low_mappings = 0;
878#else 880#else
879 err = do_boot_cpu(apicid, cpu); 881 err = do_boot_cpu(apicid, cpu);
@@ -992,10 +994,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
992 */ 994 */
993 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && 995 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
994 !cpu_has_apic) { 996 !cpu_has_apic) {
995 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 997 if (!disable_apic) {
996 boot_cpu_physical_apicid); 998 pr_err("BIOS bug, local APIC #%d not detected!...\n",
997 printk(KERN_ERR "... forcing use of dummy APIC emulation." 999 boot_cpu_physical_apicid);
1000 pr_err("... forcing use of dummy APIC emulation."
998 "(tell your hw vendor)\n"); 1001 "(tell your hw vendor)\n");
1002 }
999 smpboot_clear_io_apic(); 1003 smpboot_clear_io_apic();
1000 arch_disable_smp_support(); 1004 arch_disable_smp_support();
1001 return -1; 1005 return -1;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 4aaf7e48394..c3eb207181f 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace *trace)
77} 77}
78EXPORT_SYMBOL_GPL(save_stack_trace); 78EXPORT_SYMBOL_GPL(save_stack_trace);
79 79
80void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
81{
82 dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
83 if (trace->nr_entries < trace->max_entries)
84 trace->entries[trace->nr_entries++] = ULONG_MAX;
85}
86
80void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 87void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
81{ 88{
82 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); 89 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c8736b49..d51321ddafd 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,5 @@ ENTRY(sys_call_table)
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_preadv 335 .long sys_preadv
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_counter_open
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ed0c33761e6..124d40c575d 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode)
715 struct bau_desc *adp; 715 struct bau_desc *adp;
716 struct bau_desc *ad2; 716 struct bau_desc *ad2;
717 717
718 adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); 718 /*
719 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
720 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
721 */
722 adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
723 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
719 BUG_ON(!adp); 724 BUG_ON(!adp);
720 725
721 pa = uv_gpa(adp); /* need the real nasid*/ 726 pa = uv_gpa(adp); /* need the real nasid*/
@@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode)
729 (n << UV_DESC_BASE_PNODE_SHIFT | m)); 734 (n << UV_DESC_BASE_PNODE_SHIFT | m));
730 } 735 }
731 736
732 for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { 737 /*
738 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
739 * cpu even though we only use the first one; one descriptor can
740 * describe a broadcast to 256 nodes.
741 */
742 for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
743 i++, ad2++) {
733 memset(ad2, 0, sizeof(struct bau_desc)); 744 memset(ad2, 0, sizeof(struct bau_desc));
734 ad2->header.sw_ack_flag = 1; 745 ad2->header.sw_ack_flag = 1;
735 /* 746 /*
@@ -832,7 +843,7 @@ static int __init uv_bau_init(void)
832 return 0; 843 return 0;
833 844
834 for_each_possible_cpu(cur_cpu) 845 for_each_possible_cpu(cur_cpu)
835 alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 846 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
836 GFP_KERNEL, cpu_to_node(cur_cpu)); 847 GFP_KERNEL, cpu_to_node(cur_cpu));
837 848
838 uv_bau_retry_limit = 1; 849 uv_bau_retry_limit = 1;
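
The allocation change above replaces a magic 16384 with an expression built from the quantities named in the new comment, and they multiply out to the same size: 64-byte descriptors, 8 items per cpu, 32 cpus per blade. A trivial check, using only the constants quoted in that comment:

/* Sanity-check sketch: constants are those quoted in the comment above. */
#define BAU_DESC_BYTES		64	/* sizeof(struct bau_desc) */
#define ITEMS_PER_DESCRIPTOR	8	/* UV_ITEMS_PER_DESCRIPTOR */
#define ADP_SIZE		32	/* UV_ADP_SIZE, cpus per blade */

int main(void)
{
	/* 64 * 8 * 32 == 16384, the old hard-coded allocation size */
	return (BAU_DESC_BYTES * ITEMS_PER_DESCRIPTOR * ADP_SIZE == 16384) ? 0 : 1;
}
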
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 124a4d5a95b..286d64eba31 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -45,6 +45,7 @@
45#include <linux/edac.h> 45#include <linux/edac.h>
46#endif 46#endif
47 47
48#include <asm/kmemcheck.h>
48#include <asm/stacktrace.h> 49#include <asm/stacktrace.h>
49#include <asm/processor.h> 50#include <asm/processor.h>
50#include <asm/debugreg.h> 51#include <asm/debugreg.h>
@@ -534,6 +535,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
534 535
535 get_debugreg(dr6, 6); 536 get_debugreg(dr6, 6);
536 537
538 /* Catch kmemcheck conditions first of all! */
539 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
540 return;
541
537 /* DR6 may or may not be cleared by the CPU */ 542 /* DR6 may or may not be cleared by the CPU */
538 set_debugreg(0, 6); 543 set_debugreg(0, 6);
539 /* 544 /*
@@ -777,15 +782,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
777 782
778 return new_kesp; 783 return new_kesp;
779} 784}
780#else 785#endif
786
781asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) 787asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
782{ 788{
783} 789}
784 790
785asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) 791asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
786{ 792{
787} 793}
788#endif
789 794
790/* 795/*
791 * 'math_state_restore()' saves the current math information in the 796 * 'math_state_restore()' saves the current math information in the
@@ -818,9 +823,6 @@ asmlinkage void math_state_restore(void)
818 } 823 }
819 824
820 clts(); /* Allow maths ops (or we recurse) */ 825 clts(); /* Allow maths ops (or we recurse) */
821#ifdef CONFIG_X86_32
822 restore_fpu(tsk);
823#else
824 /* 826 /*
825 * Paranoid restore. send a SIGSEGV if we fail to restore the state. 827 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
826 */ 828 */
@@ -829,7 +831,7 @@ asmlinkage void math_state_restore(void)
829 force_sig(SIGSEGV, tsk); 831 force_sig(SIGSEGV, tsk);
830 return; 832 return;
831 } 833 }
832#endif 834
833 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 835 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
834 tsk->fpu_counter++; 836 tsk->fpu_counter++;
835} 837}
@@ -924,8 +926,13 @@ void __init trap_init(void)
924#endif 926#endif
925 set_intr_gate(19, &simd_coprocessor_error); 927 set_intr_gate(19, &simd_coprocessor_error);
926 928
929 /* Reserve all the builtin and the syscall vector: */
930 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
931 set_bit(i, used_vectors);
932
927#ifdef CONFIG_IA32_EMULATION 933#ifdef CONFIG_IA32_EMULATION
928 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 934 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
935 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
929#endif 936#endif
930 937
931#ifdef CONFIG_X86_32 938#ifdef CONFIG_X86_32
@@ -942,17 +949,9 @@ void __init trap_init(void)
942 } 949 }
943 950
944 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 951 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
945#endif
946
947 /* Reserve all the builtin and the syscall vector: */
948 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
949 set_bit(i, used_vectors);
950
951#ifdef CONFIG_X86_64
952 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
953#else
954 set_bit(SYSCALL_VECTOR, used_vectors); 952 set_bit(SYSCALL_VECTOR, used_vectors);
955#endif 953#endif
954
956 /* 955 /*
957 * Should be a barrier for any external CPU state: 956 * Should be a barrier for any external CPU state:
958 */ 957 */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index d57de05dc43..ae3180c506a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -9,6 +9,7 @@
9#include <linux/delay.h> 9#include <linux/delay.h>
10#include <linux/clocksource.h> 10#include <linux/clocksource.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/timex.h>
12 13
13#include <asm/hpet.h> 14#include <asm/hpet.h>
14#include <asm/timer.h> 15#include <asm/timer.h>
@@ -384,13 +385,13 @@ unsigned long native_calibrate_tsc(void)
384{ 385{
385 u64 tsc1, tsc2, delta, ref1, ref2; 386 u64 tsc1, tsc2, delta, ref1, ref2;
386 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 387 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
387 unsigned long flags, latch, ms, fast_calibrate, tsc_khz; 388 unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
388 int hpet = is_hpet_enabled(), i, loopmin; 389 int hpet = is_hpet_enabled(), i, loopmin;
389 390
390 tsc_khz = get_hypervisor_tsc_freq(); 391 hv_tsc_khz = get_hypervisor_tsc_freq();
391 if (tsc_khz) { 392 if (hv_tsc_khz) {
392 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); 393 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
393 return tsc_khz; 394 return hv_tsc_khz;
394 } 395 }
395 396
396 local_irq_save(flags); 397 local_irq_save(flags);
@@ -710,7 +711,16 @@ static cycle_t read_tsc(struct clocksource *cs)
710#ifdef CONFIG_X86_64 711#ifdef CONFIG_X86_64
711static cycle_t __vsyscall_fn vread_tsc(void) 712static cycle_t __vsyscall_fn vread_tsc(void)
712{ 713{
713 cycle_t ret = (cycle_t)vget_cycles(); 714 cycle_t ret;
715
716 /*
717 * Surround the RDTSC by barriers, to make sure it's not
718 * speculated to outside the seqlock critical section and
719 * does not cause time warps:
720 */
721 rdtsc_barrier();
722 ret = (cycle_t)vget_cycles();
723 rdtsc_barrier();
714 724
715 return ret >= __vsyscall_gtod_data.clock.cycle_last ? 725 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
716 ret : __vsyscall_gtod_data.clock.cycle_last; 726 ret : __vsyscall_gtod_data.clock.cycle_last;
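
The vread_tsc() change above brackets the RDTSC with rdtsc_barrier() so the read cannot be speculated outside the seqlock-protected window; the matching barriers are removed from do_vgettimeofday() in the vsyscall_64.c hunk later in this diff, since the vread hook now provides them. The equivalent fencing can be sketched in user space with compiler intrinsics; this is only an illustration, not the kernel's alternatives-patched rdtsc_barrier():

/* User-space sketch of a fenced TSC read (illustrative only). */
#include <stdint.h>
#include <x86intrin.h>

static inline uint64_t fenced_rdtsc(void)
{
	uint64_t t;

	_mm_lfence();	/* don't start the read before earlier loads retire */
	t = __rdtsc();
	_mm_lfence();	/* don't let later loads pass the read */
	return t;
}
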
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bf36328f6ef..027b5b49899 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;
34 * of a critical section, to be able to prove TSC time-warps: 34 * of a critical section, to be able to prove TSC time-warps:
35 */ 35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; 36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
37
37static __cpuinitdata cycles_t last_tsc; 38static __cpuinitdata cycles_t last_tsc;
38static __cpuinitdata cycles_t max_warp; 39static __cpuinitdata cycles_t max_warp;
39static __cpuinitdata int nr_warps; 40static __cpuinitdata int nr_warps;
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
113 return; 114 return;
114 115
115 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
116 printk(KERN_INFO 117 pr_info("Skipping synchronization checks as TSC is reliable.\n");
117 "Skipping synchronization checks as TSC is reliable.\n");
118 return; 118 return;
119 } 119 }
120 120
121 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", 121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu); 122 smp_processor_id(), cpu);
123 123
124 /* 124 /*
125 * Reset it - in case this is a second bootup: 125 * Reset it - in case this is a second bootup:
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)
143 143
144 if (nr_warps) { 144 if (nr_warps) {
145 printk("\n"); 145 printk("\n");
146 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 " turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 printk(" passed.\n");
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)
195 while (atomic_read(&stop_count) != cpus) 195 while (atomic_read(&stop_count) != cpus)
196 cpu_relax(); 196 cpu_relax();
197} 197}
198#undef NR_LOOPS
199
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index d7ac84e7fc1..9c4e6253905 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
287 info->regs.pt.ds = 0; 287 info->regs.pt.ds = 0;
288 info->regs.pt.es = 0; 288 info->regs.pt.es = 0;
289 info->regs.pt.fs = 0; 289 info->regs.pt.fs = 0;
290 290#ifndef CONFIG_X86_32_LAZY_GS
291/* we are clearing gs later just before "jmp resume_userspace", 291 info->regs.pt.gs = 0;
292 * because it is not saved/restored. 292#endif
293 */
294 293
295/* 294/*
296 * The flags register is also special: we cannot trust that the user 295 * The flags register is also special: we cannot trust that the user
@@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
318 } 317 }
319 318
320/* 319/*
321 * Save old state, set default return value (%ax) to 0 320 * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
322 */ 321 */
323 info->regs32->ax = 0; 322 info->regs32->ax = VM86_SIGNAL;
324 tsk->thread.saved_sp0 = tsk->thread.sp0; 323 tsk->thread.saved_sp0 = tsk->thread.sp0;
325 tsk->thread.saved_fs = info->regs32->fs; 324 tsk->thread.saved_fs = info->regs32->fs;
326 tsk->thread.saved_gs = get_user_gs(info->regs32); 325 tsk->thread.saved_gs = get_user_gs(info->regs32);
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
343 __asm__ __volatile__( 342 __asm__ __volatile__(
344 "movl %0,%%esp\n\t" 343 "movl %0,%%esp\n\t"
345 "movl %1,%%ebp\n\t" 344 "movl %1,%%ebp\n\t"
345#ifdef CONFIG_X86_32_LAZY_GS
346 "mov %2, %%gs\n\t" 346 "mov %2, %%gs\n\t"
347#endif
347 "jmp resume_userspace" 348 "jmp resume_userspace"
348 : /* no outputs */ 349 : /* no outputs */
349 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); 350 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95deb9f2211..b263423fbe2 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
462} 462}
463#endif 463#endif
464 464
465static void vmi_enter_lazy_cpu(void) 465static void vmi_start_context_switch(struct task_struct *prev)
466{ 466{
467 paravirt_enter_lazy_cpu(); 467 paravirt_start_context_switch(prev);
468 vmi_ops.set_lazy_mode(2); 468 vmi_ops.set_lazy_mode(2);
469} 469}
470 470
471static void vmi_end_context_switch(struct task_struct *next)
472{
473 vmi_ops.set_lazy_mode(0);
474 paravirt_end_context_switch(next);
475}
476
471static void vmi_enter_lazy_mmu(void) 477static void vmi_enter_lazy_mmu(void)
472{ 478{
473 paravirt_enter_lazy_mmu(); 479 paravirt_enter_lazy_mmu();
474 vmi_ops.set_lazy_mode(1); 480 vmi_ops.set_lazy_mode(1);
475} 481}
476 482
477static void vmi_leave_lazy(void) 483static void vmi_leave_lazy_mmu(void)
478{ 484{
479 paravirt_leave_lazy(paravirt_get_lazy_mode());
480 vmi_ops.set_lazy_mode(0); 485 vmi_ops.set_lazy_mode(0);
486 paravirt_leave_lazy_mmu();
481} 487}
482 488
483static inline int __init check_vmi_rom(struct vrom_header *rom) 489static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -711,14 +717,14 @@ static inline int __init activate_vmi(void)
711 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); 717 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
712 para_fill(pv_cpu_ops.io_delay, IODelay); 718 para_fill(pv_cpu_ops.io_delay, IODelay);
713 719
714 para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, 720 para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
715 set_lazy_mode, SetLazyMode); 721 set_lazy_mode, SetLazyMode);
716 para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, 722 para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
717 set_lazy_mode, SetLazyMode); 723 set_lazy_mode, SetLazyMode);
718 724
719 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, 725 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
720 set_lazy_mode, SetLazyMode); 726 set_lazy_mode, SetLazyMode);
721 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, 727 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
722 set_lazy_mode, SetLazyMode); 728 set_lazy_mode, SetLazyMode);
723 729
724 /* user and kernel flush are just handled with different flags to FlushTLB */ 730 /* user and kernel flush are just handled with different flags to FlushTLB */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 849ee611f01..367e8788204 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,5 +1,433 @@
1/*
2 * ld script for the x86 kernel
3 *
4 * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
5 *
6 * Modernisation, unification and other changes and fixes:
7 * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org>
8 *
9 *
10 * Don't define absolute symbols until and unless you know that symbol
11 * value is should remain constant even if kernel image is relocated
12 * at run time. Absolute symbols are not relocated. If symbol value should
13 * change if kernel is relocated, make the symbol section relative and
14 * put it inside the section definition.
15 */
16
1#ifdef CONFIG_X86_32 17#ifdef CONFIG_X86_32
2# include "vmlinux_32.lds.S" 18#define LOAD_OFFSET __PAGE_OFFSET
3#else 19#else
4# include "vmlinux_64.lds.S" 20#define LOAD_OFFSET __START_KERNEL_map
5#endif 21#endif
22
23#include <asm-generic/vmlinux.lds.h>
24#include <asm/asm-offsets.h>
25#include <asm/thread_info.h>
26#include <asm/page_types.h>
27#include <asm/cache.h>
28#include <asm/boot.h>
29
30#undef i386 /* in case the preprocessor is a 32bit one */
31
32OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
33
34#ifdef CONFIG_X86_32
35OUTPUT_ARCH(i386)
36ENTRY(phys_startup_32)
37jiffies = jiffies_64;
38#else
39OUTPUT_ARCH(i386:x86-64)
40ENTRY(phys_startup_64)
41jiffies_64 = jiffies;
42#endif
43
44PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */
47#ifdef CONFIG_X86_64
48 user PT_LOAD FLAGS(7); /* RWE */
49 data.init PT_LOAD FLAGS(7); /* RWE */
50#ifdef CONFIG_SMP
51 percpu PT_LOAD FLAGS(7); /* RWE */
52#endif
53 data.init2 PT_LOAD FLAGS(7); /* RWE */
54#endif
55 note PT_NOTE FLAGS(0); /* ___ */
56}
57
58SECTIONS
59{
60#ifdef CONFIG_X86_32
61 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
62 phys_startup_32 = startup_32 - LOAD_OFFSET;
63#else
64 . = __START_KERNEL;
65 phys_startup_64 = startup_64 - LOAD_OFFSET;
66#endif
67
68 /* Text and read-only data */
69
70 /* bootstrapping code */
71 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
72 _text = .;
73 *(.text.head)
74 } :text = 0x9090
75
76 /* The rest of the text */
77 .text : AT(ADDR(.text) - LOAD_OFFSET) {
78#ifdef CONFIG_X86_32
79 /* not really needed, already page aligned */
80 . = ALIGN(PAGE_SIZE);
81 *(.text.page_aligned)
82#endif
83 . = ALIGN(8);
84 _stext = .;
85 TEXT_TEXT
86 SCHED_TEXT
87 LOCK_TEXT
88 KPROBES_TEXT
89 IRQENTRY_TEXT
90 *(.fixup)
91 *(.gnu.warning)
92 /* End of text section */
93 _etext = .;
94 } :text = 0x9090
95
96 NOTES :text :note
97
98 /* Exception table */
99 . = ALIGN(16);
100 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
101 __start___ex_table = .;
102 *(__ex_table)
103 __stop___ex_table = .;
104 } :text = 0x9090
105
106 RODATA
107
108 /* Data */
109 . = ALIGN(PAGE_SIZE);
110 .data : AT(ADDR(.data) - LOAD_OFFSET) {
111 /* Start of data section */
112 _sdata = .;
113 DATA_DATA
114 CONSTRUCTORS
115
116#ifdef CONFIG_X86_64
117 /* End of data section */
118 _edata = .;
119#endif
120 } :data
121
122#ifdef CONFIG_X86_32
123 /* 32 bit has nosave before _edata */
124 . = ALIGN(PAGE_SIZE);
125 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
126 __nosave_begin = .;
127 *(.data.nosave)
128 . = ALIGN(PAGE_SIZE);
129 __nosave_end = .;
130 }
131#endif
132
133 . = ALIGN(PAGE_SIZE);
134 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
135 *(.data.page_aligned)
136 *(.data.idt)
137 }
138
139#ifdef CONFIG_X86_32
140 . = ALIGN(32);
141#else
142 . = ALIGN(PAGE_SIZE);
143 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
144#endif
145 .data.cacheline_aligned :
146 AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
147 *(.data.cacheline_aligned)
148 }
149
150 /* rarely changed data like cpu maps */
151#ifdef CONFIG_X86_32
152 . = ALIGN(32);
153#else
154 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
155#endif
156 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
157 *(.data.read_mostly)
158
159#ifdef CONFIG_X86_32
160 /* End of data section */
161 _edata = .;
162#endif
163 }
164
165#ifdef CONFIG_X86_64
166
167#define VSYSCALL_ADDR (-10*1024*1024)
168#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
169 SIZEOF(.data.read_mostly) + 4095) & ~(4095))
170#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
171 SIZEOF(.data.read_mostly) + 4095) & ~(4095))
172
173#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
174#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
175
176#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
177#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
178
179 . = VSYSCALL_ADDR;
180 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
181 *(.vsyscall_0)
182 } :user
183
184 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
185
186 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
187 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
188 *(.vsyscall_fn)
189 }
190
191 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
192 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
193 *(.vsyscall_gtod_data)
194 }
195
196 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
197 .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
198 *(.vsyscall_clock)
199 }
200 vsyscall_clock = VVIRT(.vsyscall_clock);
201
202
203 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
204 *(.vsyscall_1)
205 }
206 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
207 *(.vsyscall_2)
208 }
209
210 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
211 *(.vgetcpu_mode)
212 }
213 vgetcpu_mode = VVIRT(.vgetcpu_mode);
214
215 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
216 .jiffies : AT(VLOAD(.jiffies)) {
217 *(.jiffies)
218 }
219 jiffies = VVIRT(.jiffies);
220
221 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
222 *(.vsyscall_3)
223 }
224
225 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
226
227#undef VSYSCALL_ADDR
228#undef VSYSCALL_PHYS_ADDR
229#undef VSYSCALL_VIRT_ADDR
230#undef VLOAD_OFFSET
231#undef VLOAD
232#undef VVIRT_OFFSET
233#undef VVIRT
234
235#endif /* CONFIG_X86_64 */
236
237 /* init_task */
238 . = ALIGN(THREAD_SIZE);
239 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
240 *(.data.init_task)
241 }
242#ifdef CONFIG_X86_64
243 :data.init
244#endif
245
246 /*
247 * smp_locks might be freed after init
248 * start/end must be page aligned
249 */
250 . = ALIGN(PAGE_SIZE);
251 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
252 __smp_locks = .;
253 *(.smp_locks)
254 __smp_locks_end = .;
255 . = ALIGN(PAGE_SIZE);
256 }
257
258 /* Init code and data - will be freed after init */
259 . = ALIGN(PAGE_SIZE);
260 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
261 __init_begin = .; /* paired with __init_end */
262 _sinittext = .;
263 INIT_TEXT
264 _einittext = .;
265 }
266
267 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
268 INIT_DATA
269 }
270
271 . = ALIGN(16);
272 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
273 __setup_start = .;
274 *(.init.setup)
275 __setup_end = .;
276 }
277 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
278 __initcall_start = .;
279 INITCALLS
280 __initcall_end = .;
281 }
282
283 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
284 __con_initcall_start = .;
285 *(.con_initcall.init)
286 __con_initcall_end = .;
287 }
288
289 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
290 __x86_cpu_dev_start = .;
291 *(.x86_cpu_dev.init)
292 __x86_cpu_dev_end = .;
293 }
294
295 SECURITY_INIT
296
297 . = ALIGN(8);
298 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
299 __parainstructions = .;
300 *(.parainstructions)
301 __parainstructions_end = .;
302 }
303
304 . = ALIGN(8);
305 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
306 __alt_instructions = .;
307 *(.altinstructions)
308 __alt_instructions_end = .;
309 }
310
311 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
312 *(.altinstr_replacement)
313 }
314
315 /*
316 * .exit.text is discard at runtime, not link time, to deal with
317 * references from .altinstructions and .eh_frame
318 */
319 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
320 EXIT_TEXT
321 }
322
323 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
324 EXIT_DATA
325 }
326
327#ifdef CONFIG_BLK_DEV_INITRD
328 . = ALIGN(PAGE_SIZE);
329 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
330 __initramfs_start = .;
331 *(.init.ramfs)
332 __initramfs_end = .;
333 }
334#endif
335
336#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
337 /*
338 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
339 * output PHDR, so the next output section - __data_nosave - should
340 * start another section data.init2. Also, pda should be at the head of
341 * percpu area. Preallocate it and define the percpu offset symbol
342 * so that it can be accessed as a percpu variable.
343 */
344 . = ALIGN(PAGE_SIZE);
345 PERCPU_VADDR(0, :percpu)
346#else
347 PERCPU(PAGE_SIZE)
348#endif
349
350 . = ALIGN(PAGE_SIZE);
351
352 /* freed after init ends here */
353 .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
354 __init_end = .;
355 }
356
357#ifdef CONFIG_X86_64
358 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
359 . = ALIGN(PAGE_SIZE);
360 __nosave_begin = .;
361 *(.data.nosave)
362 . = ALIGN(PAGE_SIZE);
363 __nosave_end = .;
364 } :data.init2
365 /* use another section data.init2, see PERCPU_VADDR() above */
366#endif
367
368 /* BSS */
369 . = ALIGN(PAGE_SIZE);
370 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
371 __bss_start = .;
372 *(.bss.page_aligned)
373 *(.bss)
374 . = ALIGN(4);
375 __bss_stop = .;
376 }
377
378 . = ALIGN(PAGE_SIZE);
379 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
380 __brk_base = .;
381 . += 64 * 1024; /* 64k alignment slop space */
382 *(.brk_reservation) /* areas brk users have reserved */
383 __brk_limit = .;
384 }
385
386 .end : AT(ADDR(.end) - LOAD_OFFSET) {
387 _end = .;
388 }
389
390 /* Sections to be discarded */
391 /DISCARD/ : {
392 *(.exitcall.exit)
393 *(.eh_frame)
394 *(.discard)
395 }
396
397 STABS_DEBUG
398 DWARF_DEBUG
399}
400
401
402#ifdef CONFIG_X86_32
403ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
404 "kernel image bigger than KERNEL_IMAGE_SIZE")
405#else
406/*
407 * Per-cpu symbols which need to be offset from __per_cpu_load
408 * for the boot processor.
409 */
410#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
411INIT_PER_CPU(gdt_page);
412INIT_PER_CPU(irq_stack_union);
413
414/*
415 * Build-time check on the image size:
416 */
417ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
418 "kernel image bigger than KERNEL_IMAGE_SIZE")
419
420#ifdef CONFIG_SMP
421ASSERT((per_cpu__irq_stack_union == 0),
422 "irq_stack_union is not at start of per-cpu area");
423#endif
424
425#endif /* CONFIG_X86_32 */
426
427#ifdef CONFIG_KEXEC
428#include <asm/kexec.h>
429
430ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
431 "kexec control code size is too big")
432#endif
433
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
deleted file mode 100644
index 62ad500d55f..00000000000
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,229 +0,0 @@
1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 *
4 * Don't define absolute symbols until and unless you know that symbol
5 * value is should remain constant even if kernel image is relocated
6 * at run time. Absolute symbols are not relocated. If symbol value should
7 * change if kernel is relocated, make the symbol section relative and
8 * put it inside the section definition.
9 */
10
11#define LOAD_OFFSET __PAGE_OFFSET
12
13#include <asm-generic/vmlinux.lds.h>
14#include <asm/thread_info.h>
15#include <asm/page_types.h>
16#include <asm/cache.h>
17#include <asm/boot.h>
18
19OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
20OUTPUT_ARCH(i386)
21ENTRY(phys_startup_32)
22jiffies = jiffies_64;
23
24PHDRS {
25 text PT_LOAD FLAGS(5); /* R_E */
26 data PT_LOAD FLAGS(7); /* RWE */
27 note PT_NOTE FLAGS(0); /* ___ */
28}
29SECTIONS
30{
31 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
32 phys_startup_32 = startup_32 - LOAD_OFFSET;
33
34 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
35 _text = .; /* Text and read-only data */
36 *(.text.head)
37 } :text = 0x9090
38
39 /* read-only */
40 .text : AT(ADDR(.text) - LOAD_OFFSET) {
41 . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */
42 *(.text.page_aligned)
43 TEXT_TEXT
44 SCHED_TEXT
45 LOCK_TEXT
46 KPROBES_TEXT
47 IRQENTRY_TEXT
48 *(.fixup)
49 *(.gnu.warning)
50 _etext = .; /* End of text section */
51 } :text = 0x9090
52
53 NOTES :text :note
54
55 . = ALIGN(16); /* Exception table */
56 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
57 __start___ex_table = .;
58 *(__ex_table)
59 __stop___ex_table = .;
60 } :text = 0x9090
61
62 RODATA
63
64 /* writeable */
65 . = ALIGN(PAGE_SIZE);
66 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
67 DATA_DATA
68 CONSTRUCTORS
69 } :data
70
71 . = ALIGN(PAGE_SIZE);
72 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
73 __nosave_begin = .;
74 *(.data.nosave)
75 . = ALIGN(PAGE_SIZE);
76 __nosave_end = .;
77 }
78
79 . = ALIGN(PAGE_SIZE);
80 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
81 *(.data.page_aligned)
82 *(.data.idt)
83 }
84
85 . = ALIGN(32);
86 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
87 *(.data.cacheline_aligned)
88 }
89
90 /* rarely changed data like cpu maps */
91 . = ALIGN(32);
92 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
93 *(.data.read_mostly)
94 _edata = .; /* End of data section */
95 }
96
97 . = ALIGN(THREAD_SIZE); /* init_task */
98 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
99 *(.data.init_task)
100 }
101
102 /* might get freed after init */
103 . = ALIGN(PAGE_SIZE);
104 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
105 __smp_locks = .;
106 *(.smp_locks)
107 __smp_locks_end = .;
108 }
109 /* will be freed after init
110 * Following ALIGN() is required to make sure no other data falls on the
111 * same page where __smp_alt_end is pointing as that page might be freed
112 * after boot. Always make sure that ALIGN() directive is present after
113 * the section which contains __smp_alt_end.
114 */
115 . = ALIGN(PAGE_SIZE);
116
117 /* will be freed after init */
118 . = ALIGN(PAGE_SIZE); /* Init code and data */
119 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
120 __init_begin = .;
121 _sinittext = .;
122 INIT_TEXT
123 _einittext = .;
124 }
125 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
126 INIT_DATA
127 }
128 . = ALIGN(16);
129 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
130 __setup_start = .;
131 *(.init.setup)
132 __setup_end = .;
133 }
134 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
135 __initcall_start = .;
136 INITCALLS
137 __initcall_end = .;
138 }
139 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
140 __con_initcall_start = .;
141 *(.con_initcall.init)
142 __con_initcall_end = .;
143 }
144 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
145 __x86_cpu_dev_start = .;
146 *(.x86_cpu_dev.init)
147 __x86_cpu_dev_end = .;
148 }
149 SECURITY_INIT
150 . = ALIGN(4);
151 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
152 __alt_instructions = .;
153 *(.altinstructions)
154 __alt_instructions_end = .;
155 }
156 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
157 *(.altinstr_replacement)
158 }
159 . = ALIGN(4);
160 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
161 __parainstructions = .;
162 *(.parainstructions)
163 __parainstructions_end = .;
164 }
165 /* .exit.text is discard at runtime, not link time, to deal with references
166 from .altinstructions and .eh_frame */
167 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
168 EXIT_TEXT
169 }
170 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
171 EXIT_DATA
172 }
173#if defined(CONFIG_BLK_DEV_INITRD)
174 . = ALIGN(PAGE_SIZE);
175 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
176 __initramfs_start = .;
177 *(.init.ramfs)
178 __initramfs_end = .;
179 }
180#endif
181 PERCPU(PAGE_SIZE)
182 . = ALIGN(PAGE_SIZE);
183 /* freed after init ends here */
184
185 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
186 __init_end = .;
187 __bss_start = .; /* BSS */
188 *(.bss.page_aligned)
189 *(.bss)
190 . = ALIGN(4);
191 __bss_stop = .;
192 }
193
194 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
195 . = ALIGN(PAGE_SIZE);
196 __brk_base = . ;
197 . += 64 * 1024 ; /* 64k alignment slop space */
198 *(.brk_reservation) /* areas brk users have reserved */
199 __brk_limit = . ;
200 }
201
202 .end : AT(ADDR(.end) - LOAD_OFFSET) {
203 _end = . ;
204 }
205
206 /* Sections to be discarded */
207 /DISCARD/ : {
208 *(.exitcall.exit)
209 *(.discard)
210 }
211
212 STABS_DEBUG
213
214 DWARF_DEBUG
215}
216
217/*
218 * Build-time check on the image size:
219 */
220ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
221 "kernel image bigger than KERNEL_IMAGE_SIZE")
222
223#ifdef CONFIG_KEXEC
224/* Link time checks */
225#include <asm/kexec.h>
226
227ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
228 "kexec control code size is too big")
229#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
deleted file mode 100644
index c8742507b03..00000000000
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,298 +0,0 @@
1/* ld script to make x86-64 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */
4
5#define LOAD_OFFSET __START_KERNEL_map
6
7#include <asm-generic/vmlinux.lds.h>
8#include <asm/asm-offsets.h>
9#include <asm/page_types.h>
10
11#undef i386 /* in case the preprocessor is a 32bit one */
12
13OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
14OUTPUT_ARCH(i386:x86-64)
15ENTRY(phys_startup_64)
16jiffies_64 = jiffies;
17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */
22#ifdef CONFIG_SMP
23 percpu PT_LOAD FLAGS(7); /* RWE */
24#endif
25 data.init2 PT_LOAD FLAGS(7); /* RWE */
26 note PT_NOTE FLAGS(0); /* ___ */
27}
28SECTIONS
29{
30 . = __START_KERNEL;
31 phys_startup_64 = startup_64 - LOAD_OFFSET;
32 .text : AT(ADDR(.text) - LOAD_OFFSET) {
33 _text = .; /* Text and read-only data */
34 /* First the code that has to be first for bootstrapping */
35 *(.text.head)
36 _stext = .;
37 /* Then the rest */
38 TEXT_TEXT
39 SCHED_TEXT
40 LOCK_TEXT
41 KPROBES_TEXT
42 IRQENTRY_TEXT
43 *(.fixup)
44 *(.gnu.warning)
45 _etext = .; /* End of text section */
46 } :text = 0x9090
47
48 NOTES :text :note
49
50 . = ALIGN(16); /* Exception table */
51 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
52 __start___ex_table = .;
53 *(__ex_table)
54 __stop___ex_table = .;
55 } :text = 0x9090
56
57 RODATA
58
59 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
60 /* Data */
61 .data : AT(ADDR(.data) - LOAD_OFFSET) {
62 DATA_DATA
63 CONSTRUCTORS
64 _edata = .; /* End of data section */
65 } :data
66
67
68 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
69 . = ALIGN(PAGE_SIZE);
70 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
71 *(.data.cacheline_aligned)
72 }
73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
74 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
75 *(.data.read_mostly)
76 }
77
78#define VSYSCALL_ADDR (-10*1024*1024)
79#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
80#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
81
82#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
83#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
84
85#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
86#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
87
88 . = VSYSCALL_ADDR;
89 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
90 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
91
92 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
93 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
94 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
95 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
96 { *(.vsyscall_gtod_data) }
97 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
98 .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
99 { *(.vsyscall_clock) }
100 vsyscall_clock = VVIRT(.vsyscall_clock);
101
102
103 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
104 { *(.vsyscall_1) }
105 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
106 { *(.vsyscall_2) }
107
108 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
109 vgetcpu_mode = VVIRT(.vgetcpu_mode);
110
111 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
112 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
113 jiffies = VVIRT(.jiffies);
114
115 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
116 { *(.vsyscall_3) }
117
118 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
119
120#undef VSYSCALL_ADDR
121#undef VSYSCALL_PHYS_ADDR
122#undef VSYSCALL_VIRT_ADDR
123#undef VLOAD_OFFSET
124#undef VLOAD
125#undef VVIRT_OFFSET
126#undef VVIRT
127
128 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
129 . = ALIGN(THREAD_SIZE); /* init_task */
130 *(.data.init_task)
131 }:data.init
132
133 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
134 . = ALIGN(PAGE_SIZE);
135 *(.data.page_aligned)
136 }
137
138 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
139 /* might get freed after init */
140 . = ALIGN(PAGE_SIZE);
141 __smp_alt_begin = .;
142 __smp_locks = .;
143 *(.smp_locks)
144 __smp_locks_end = .;
145 . = ALIGN(PAGE_SIZE);
146 __smp_alt_end = .;
147 }
148
149 . = ALIGN(PAGE_SIZE); /* Init code and data */
150 __init_begin = .; /* paired with __init_end */
151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
152 _sinittext = .;
153 INIT_TEXT
154 _einittext = .;
155 }
156 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
157 __initdata_begin = .;
158 INIT_DATA
159 __initdata_end = .;
160 }
161
162 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
163 . = ALIGN(16);
164 __setup_start = .;
165 *(.init.setup)
166 __setup_end = .;
167 }
168 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
169 __initcall_start = .;
170 INITCALLS
171 __initcall_end = .;
172 }
173 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
174 __con_initcall_start = .;
175 *(.con_initcall.init)
176 __con_initcall_end = .;
177 }
178 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
179 __x86_cpu_dev_start = .;
180 *(.x86_cpu_dev.init)
181 __x86_cpu_dev_end = .;
182 }
183 SECURITY_INIT
184
185 . = ALIGN(8);
186 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
187 __parainstructions = .;
188 *(.parainstructions)
189 __parainstructions_end = .;
190 }
191
192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
193 . = ALIGN(8);
194 __alt_instructions = .;
195 *(.altinstructions)
196 __alt_instructions_end = .;
197 }
198 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
199 *(.altinstr_replacement)
200 }
 201	/* .exit.text is discarded at runtime, not link time, to deal with references
 202	   from .altinstructions and .eh_frame */
203 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
204 EXIT_TEXT
205 }
206 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
207 EXIT_DATA
208 }
209
210#ifdef CONFIG_BLK_DEV_INITRD
211 . = ALIGN(PAGE_SIZE);
212 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
213 __initramfs_start = .;
214 *(.init.ramfs)
215 __initramfs_end = .;
216 }
217#endif
218
219#ifdef CONFIG_SMP
220 /*
221 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
222 * output PHDR, so the next output section - __data_nosave - should
223 * start another section data.init2. Also, pda should be at the head of
224 * percpu area. Preallocate it and define the percpu offset symbol
225 * so that it can be accessed as a percpu variable.
226 */
227 . = ALIGN(PAGE_SIZE);
228 PERCPU_VADDR(0, :percpu)
229#else
230 PERCPU(PAGE_SIZE)
231#endif
232
233 . = ALIGN(PAGE_SIZE);
234 __init_end = .;
235
236 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
237 . = ALIGN(PAGE_SIZE);
238 __nosave_begin = .;
239 *(.data.nosave)
240 . = ALIGN(PAGE_SIZE);
241 __nosave_end = .;
242 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
243
244 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
245 . = ALIGN(PAGE_SIZE);
246 __bss_start = .; /* BSS */
247 *(.bss.page_aligned)
248 *(.bss)
249 __bss_stop = .;
250 }
251
252 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
253 . = ALIGN(PAGE_SIZE);
254 __brk_base = . ;
255 . += 64 * 1024 ; /* 64k alignment slop space */
256 *(.brk_reservation) /* areas brk users have reserved */
257 __brk_limit = . ;
258 }
259
260 _end = . ;
261
262 /* Sections to be discarded */
263 /DISCARD/ : {
264 *(.exitcall.exit)
265 *(.eh_frame)
266 *(.discard)
267 }
268
269 STABS_DEBUG
270
271 DWARF_DEBUG
272}
273
274 /*
275 * Per-cpu symbols which need to be offset from __per_cpu_load
276 * for the boot processor.
277 */
278#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
279INIT_PER_CPU(gdt_page);
280INIT_PER_CPU(irq_stack_union);
281
282/*
283 * Build-time check on the image size:
284 */
285ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
286 "kernel image bigger than KERNEL_IMAGE_SIZE")
287
288#ifdef CONFIG_SMP
289ASSERT((per_cpu__irq_stack_union == 0),
290 "irq_stack_union is not at start of per-cpu area");
291#endif
292
293#ifdef CONFIG_KEXEC
294#include <asm/kexec.h>
295
296ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
297 "kexec control code size is too big")
298#endif
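
A note on the VSYSCALL_*, VLOAD() and VVIRT() macros near the top of the linker script above: they round the end of .data.read_mostly up to the next 4 KiB page and then translate the fixed vsyscall link address (-10 MB) to the place where those sections are actually stored and mapped in the image. The user-space C sketch below reproduces that arithmetic with made-up placeholder addresses; it is illustrative only and not part of the patch.

/*
 * Illustrative sketch only: the page-rounding and load-vs-virtual offset
 * arithmetic behind VSYSCALL_PHYS_ADDR, VSYSCALL_VIRT_ADDR, VLOAD() and
 * VVIRT() in the linker script above.  The sample addresses are placeholders,
 * not values from a real kernel image.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_MASK_4K   4095ULL
#define VSYSCALL_ADDR  0xffffffffff600000ULL   /* -10 MB, the fixed vsyscall link address */

static uint64_t page_align_up(uint64_t x)
{
	return (x + PAGE_MASK_4K) & ~PAGE_MASK_4K;   /* same as (x + 4095) & ~(4095) */
}

int main(void)
{
	/* Placeholders standing in for LOADADDR/ADDR/SIZEOF(.data.read_mostly). */
	uint64_t read_mostly_load = 0x0000000001a00040ULL;
	uint64_t read_mostly_virt = 0xffffffff81a00040ULL;
	uint64_t read_mostly_size = 0x2fc0;

	/* VSYSCALL_PHYS_ADDR / VSYSCALL_VIRT_ADDR: first page boundary after .data.read_mostly */
	uint64_t vsyscall_phys = page_align_up(read_mostly_load + read_mostly_size);
	uint64_t vsyscall_virt = page_align_up(read_mostly_virt + read_mostly_size);

	/* VLOAD(x) = ADDR(x) - (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR): a section linked
	 * at the fixed vsyscall address is stored right after .data.read_mostly. */
	uint64_t vload_offset = VSYSCALL_ADDR - vsyscall_phys;
	uint64_t vvirt_offset = VSYSCALL_ADDR - vsyscall_virt;

	printf("VSYSCALL_PHYS_ADDR = %#llx\n", (unsigned long long)vsyscall_phys);
	printf("VSYSCALL_VIRT_ADDR = %#llx\n", (unsigned long long)vsyscall_virt);
	printf("VLOAD(.vsyscall_0) = %#llx\n", (unsigned long long)(VSYSCALL_ADDR - vload_offset));
	printf("VVIRT(.vsyscall_0) = %#llx\n", (unsigned long long)(VSYSCALL_ADDR - vvirt_offset));
	return 0;
}

The point of the two offsets is that the .vsyscall_* sections are linked at a fixed high virtual address but laid out in the file right after .data.read_mostly, so AT(VLOAD(x)) supplies the load address while the VVIRT(x) aliases give the addresses the rest of the kernel uses to reach the same data.
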
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 44153afc906..25ee06a80aa 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
132 return; 132 return;
133 } 133 }
134 134
135 /*
136 * Surround the RDTSC by barriers, to make sure it's not
137 * speculated to outside the seqlock critical section and
138 * does not cause time warps:
139 */
140 rdtsc_barrier();
141 now = vread(); 135 now = vread();
142 rdtsc_barrier();
143
144 base = __vsyscall_gtod_data.clock.cycle_last; 136 base = __vsyscall_gtod_data.clock.cycle_last;
145 mask = __vsyscall_gtod_data.clock.mask; 137 mask = __vsyscall_gtod_data.clock.mask;
146 mult = __vsyscall_gtod_data.clock.mult; 138 mult = __vsyscall_gtod_data.clock.mult;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a58504ea78c..8600a09e0c6 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -50,6 +50,9 @@ config KVM_INTEL
50 Provides support for KVM on Intel processors equipped with the VT 50 Provides support for KVM on Intel processors equipped with the VT
51 extensions. 51 extensions.
52 52
53 To compile this as a module, choose M here: the module
54 will be called kvm-intel.
55
53config KVM_AMD 56config KVM_AMD
54 tristate "KVM for AMD processors support" 57 tristate "KVM for AMD processors support"
55 depends on KVM 58 depends on KVM
@@ -57,6 +60,9 @@ config KVM_AMD
57 Provides support for KVM on AMD processors equipped with the AMD-V 60 Provides support for KVM on AMD processors equipped with the AMD-V
58 (SVM) extensions. 61 (SVM) extensions.
59 62
63 To compile this as a module, choose M here: the module
64 will be called kvm-amd.
65
60config KVM_TRACE 66config KVM_TRACE
61 bool "KVM trace support" 67 bool "KVM trace support"
62 depends on KVM && SYSFS 68 depends on KVM && SYSFS
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d3ec292f00f..b43c4efafe8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ endif
14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
15 15
16kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ 16kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
17 i8254.o 17 i8254.o timer.o
18obj-$(CONFIG_KVM) += kvm.o 18obj-$(CONFIG_KVM) += kvm.o
19kvm-intel-objs = vmx.o 19kvm-intel-objs = vmx.o
20obj-$(CONFIG_KVM_INTEL) += kvm-intel.o 20obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c13bb92d315..4d6f0d293ee 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -98,6 +98,37 @@ static int pit_get_gate(struct kvm *kvm, int channel)
98 return kvm->arch.vpit->pit_state.channels[channel].gate; 98 return kvm->arch.vpit->pit_state.channels[channel].gate;
99} 99}
100 100
101static s64 __kpit_elapsed(struct kvm *kvm)
102{
103 s64 elapsed;
104 ktime_t remaining;
105 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
106
107 /*
108 * The Counter does not stop when it reaches zero. In
109 * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
110 * the highest count, either FFFF hex for binary counting
111 * or 9999 for BCD counting, and continues counting.
112 * Modes 2 and 3 are periodic; the Counter reloads
113 * itself with the initial count and continues counting
114 * from there.
115 */
116 remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
117 elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
118 elapsed = mod_64(elapsed, ps->pit_timer.period);
119
120 return elapsed;
121}
122
123static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
124 int channel)
125{
126 if (channel == 0)
127 return __kpit_elapsed(kvm);
128
129 return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
130}
131
101static int pit_get_count(struct kvm *kvm, int channel) 132static int pit_get_count(struct kvm *kvm, int channel)
102{ 133{
103 struct kvm_kpit_channel_state *c = 134 struct kvm_kpit_channel_state *c =
@@ -107,7 +138,7 @@ static int pit_get_count(struct kvm *kvm, int channel)
107 138
108 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 139 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
109 140
110 t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); 141 t = kpit_elapsed(kvm, c, channel);
111 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); 142 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
112 143
113 switch (c->mode) { 144 switch (c->mode) {
@@ -137,7 +168,7 @@ static int pit_get_out(struct kvm *kvm, int channel)
137 168
138 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 169 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
139 170
140 t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); 171 t = kpit_elapsed(kvm, c, channel);
141 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); 172 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
142 173
143 switch (c->mode) { 174 switch (c->mode) {
@@ -193,28 +224,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
193 } 224 }
194} 225}
195 226
196static int __pit_timer_fn(struct kvm_kpit_state *ps)
197{
198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
199 struct kvm_kpit_timer *pt = &ps->pit_timer;
200
201 if (!atomic_inc_and_test(&pt->pending))
202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
203
204 if (!pt->reinject)
205 atomic_set(&pt->pending, 1);
206
207 if (vcpu0 && waitqueue_active(&vcpu0->wq))
208 wake_up_interruptible(&vcpu0->wq);
209
210 hrtimer_add_expires_ns(&pt->timer, pt->period);
211 pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
212 if (pt->period)
213 ps->channels[0].count_load_time = ktime_get();
214
215 return (pt->period == 0 ? 0 : 1);
216}
217
218int pit_has_pending_timer(struct kvm_vcpu *vcpu) 227int pit_has_pending_timer(struct kvm_vcpu *vcpu)
219{ 228{
220 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 229 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -235,21 +244,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
235 spin_unlock(&ps->inject_lock); 244 spin_unlock(&ps->inject_lock);
236} 245}
237 246
238static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
239{
240 struct kvm_kpit_state *ps;
241 int restart_timer = 0;
242
243 ps = container_of(data, struct kvm_kpit_state, pit_timer.timer);
244
245 restart_timer = __pit_timer_fn(ps);
246
247 if (restart_timer)
248 return HRTIMER_RESTART;
249 else
250 return HRTIMER_NORESTART;
251}
252
253void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 247void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
254{ 248{
255 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 249 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -263,15 +257,26 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
263 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 257 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
264} 258}
265 259
266static void destroy_pit_timer(struct kvm_kpit_timer *pt) 260static void destroy_pit_timer(struct kvm_timer *pt)
267{ 261{
268 pr_debug("pit: execute del timer!\n"); 262 pr_debug("pit: execute del timer!\n");
269 hrtimer_cancel(&pt->timer); 263 hrtimer_cancel(&pt->timer);
270} 264}
271 265
266static bool kpit_is_periodic(struct kvm_timer *ktimer)
267{
268 struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
269 pit_timer);
270 return ps->is_periodic;
271}
272
273static struct kvm_timer_ops kpit_ops = {
274 .is_periodic = kpit_is_periodic,
275};
276
272static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) 277static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
273{ 278{
274 struct kvm_kpit_timer *pt = &ps->pit_timer; 279 struct kvm_timer *pt = &ps->pit_timer;
275 s64 interval; 280 s64 interval;
276 281
277 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 282 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -280,8 +285,14 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
280 285
281 /* TODO The new value only affected after the retriggered */ 286 /* TODO The new value only affected after the retriggered */
282 hrtimer_cancel(&pt->timer); 287 hrtimer_cancel(&pt->timer);
283 pt->period = (is_period == 0) ? 0 : interval; 288 pt->period = interval;
284 pt->timer.function = pit_timer_fn; 289 ps->is_periodic = is_period;
290
291 pt->timer.function = kvm_timer_fn;
292 pt->t_ops = &kpit_ops;
293 pt->kvm = ps->pit->kvm;
294 pt->vcpu_id = 0;
295
285 atomic_set(&pt->pending, 0); 296 atomic_set(&pt->pending, 0);
286 ps->irq_ack = 1; 297 ps->irq_ack = 1;
287 298
@@ -298,23 +309,23 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
298 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); 309 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
299 310
300 /* 311 /*
301 * Though spec said the state of 8254 is undefined after power-up, 312 * The largest possible initial count is 0; this is equivalent
 302 * seems some tricky OS like Windows XP depends on IRQ0 interrupt 313 * to 2^16 for binary counting and 10^4 for BCD counting.
303 * when booting up.
304 * So here setting initialize rate for it, and not a specific number
305 */ 314 */
306 if (val == 0) 315 if (val == 0)
307 val = 0x10000; 316 val = 0x10000;
308 317
309 ps->channels[channel].count_load_time = ktime_get();
310 ps->channels[channel].count = val; 318 ps->channels[channel].count = val;
311 319
312 if (channel != 0) 320 if (channel != 0) {
321 ps->channels[channel].count_load_time = ktime_get();
313 return; 322 return;
323 }
314 324
315 /* Two types of timer 325 /* Two types of timer
316 * mode 1 is one shot, mode 2 is period, otherwise del timer */ 326 * mode 1 is one shot, mode 2 is period, otherwise del timer */
317 switch (ps->channels[0].mode) { 327 switch (ps->channels[0].mode) {
328 case 0:
318 case 1: 329 case 1:
319 /* FIXME: enhance mode 4 precision */ 330 /* FIXME: enhance mode 4 precision */
320 case 4: 331 case 4:
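
The __kpit_elapsed() helper added above derives how far the emulated PIT has advanced from the hrtimer backing it: period minus remaining time, reduced modulo the period so a counter that has wrapped stays within one cycle; pit_get_count() then converts that to PIT ticks with muldiv64(). The standalone sketch below reproduces that arithmetic; the constants are rough illustrative stand-ins, not values taken from the patch.

/*
 * Standalone sketch of the elapsed-time arithmetic in __kpit_elapsed() and
 * the ns -> PIT-tick conversion done by pit_get_count() via muldiv64().
 * Plain C, no kernel APIs; all values are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

/* On 64-bit hosts the kernel's mod_64() is just the % operator. */
static int64_t mod_64(int64_t x, int64_t y)
{
	return x % y;
}

int main(void)
{
	const int64_t nsec_per_sec = 1000000000LL;
	const int64_t pit_hz       = 1193182LL;    /* ~1.193 MHz PIT input clock (approximate) */

	int64_t period_ns    = 54925000;           /* stand-in for ps->pit_timer.period */
	int64_t remaining_ns = 20000000;           /* stand-in for hrtimer_expires_remaining() */

	/* elapsed = period - remaining, wrapped into the current cycle */
	int64_t elapsed_ns = mod_64(period_ns - remaining_ns, period_ns);

	/* pit_get_count(): d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC) */
	int64_t elapsed_ticks = elapsed_ns * pit_hz / nsec_per_sec;

	printf("elapsed %lld ns (~%lld PIT ticks into the current cycle)\n",
	       (long long)elapsed_ns, (long long)elapsed_ticks);
	return 0;
}
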
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 6acbe4b505d..bbd863ff60b 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -3,15 +3,6 @@
3 3
4#include "iodev.h" 4#include "iodev.h"
5 5
6struct kvm_kpit_timer {
7 struct hrtimer timer;
8 int irq;
9 s64 period; /* unit: ns */
10 s64 scheduled;
11 atomic_t pending;
12 bool reinject;
13};
14
15struct kvm_kpit_channel_state { 6struct kvm_kpit_channel_state {
16 u32 count; /* can be 65536 */ 7 u32 count; /* can be 65536 */
17 u16 latched_count; 8 u16 latched_count;
@@ -30,7 +21,8 @@ struct kvm_kpit_channel_state {
30 21
31struct kvm_kpit_state { 22struct kvm_kpit_state {
32 struct kvm_kpit_channel_state channels[3]; 23 struct kvm_kpit_channel_state channels[3];
33 struct kvm_kpit_timer pit_timer; 24 struct kvm_timer pit_timer;
25 bool is_periodic;
34 u32 speaker_data_on; 26 u32 speaker_data_on;
35 struct mutex lock; 27 struct mutex lock;
36 struct kvm_pit *pit; 28 struct kvm_pit *pit;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index cf17ed52f6f..96dfbb6ad2a 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -24,6 +24,7 @@
24 24
25#include "irq.h" 25#include "irq.h"
26#include "i8254.h" 26#include "i8254.h"
27#include "x86.h"
27 28
28/* 29/*
29 * check if there are pending timer events 30 * check if there are pending timer events
@@ -48,6 +49,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
48{ 49{
49 struct kvm_pic *s; 50 struct kvm_pic *s;
50 51
52 if (!irqchip_in_kernel(v->kvm))
53 return v->arch.interrupt.pending;
54
51 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ 55 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
52 if (kvm_apic_accept_pic_intr(v)) { 56 if (kvm_apic_accept_pic_intr(v)) {
53 s = pic_irqchip(v->kvm); /* PIC */ 57 s = pic_irqchip(v->kvm); /* PIC */
@@ -67,6 +71,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
67 struct kvm_pic *s; 71 struct kvm_pic *s;
68 int vector; 72 int vector;
69 73
74 if (!irqchip_in_kernel(v->kvm))
75 return v->arch.interrupt.nr;
76
70 vector = kvm_get_apic_interrupt(v); /* APIC */ 77 vector = kvm_get_apic_interrupt(v); /* APIC */
71 if (vector == -1) { 78 if (vector == -1) {
72 if (kvm_apic_accept_pic_intr(v)) { 79 if (kvm_apic_accept_pic_intr(v)) {
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
new file mode 100644
index 00000000000..26bd6ba74e1
--- /dev/null
+++ b/arch/x86/kvm/kvm_timer.h
@@ -0,0 +1,18 @@
1
2struct kvm_timer {
3 struct hrtimer timer;
4 s64 period; /* unit: ns */
5 atomic_t pending; /* accumulated triggered timers */
6 bool reinject;
7 struct kvm_timer_ops *t_ops;
8 struct kvm *kvm;
9 int vcpu_id;
10};
11
12struct kvm_timer_ops {
13 bool (*is_periodic)(struct kvm_timer *);
14};
15
16
17enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
18
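
kvm_timer.h above only declares kvm_timer_fn(); its definition is not in the hunks shown here. Judging from the per-device handlers this patch removes (__pit_timer_fn()/pit_timer_fn() and __apic_timer_fn()/apic_timer_fn()), the shared handler presumably accumulates a pending tick, wakes the target vcpu, and re-arms the hrtimer only when t_ops->is_periodic() says so. The user-space model below sketches that dispatch pattern; it is an illustration of the abstraction, not the kernel implementation.

/*
 * User-space model of the kvm_timer / kvm_timer_ops split introduced above:
 * one generic tick handler asks an ops callback whether the timer is periodic
 * and re-arms only in that case.  Mirrors the shape of the removed
 * pit_timer_fn()/apic_timer_fn() logic; illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

struct model_timer;

struct model_timer_ops {
	bool (*is_periodic)(struct model_timer *t);
};

struct model_timer {
	long period_ns;
	int  pending;                      /* accumulated, undelivered ticks */
	const struct model_timer_ops *ops;
};

enum restart { NORESTART, RESTART };

/* Generic handler in the spirit of kvm_timer_fn(): count the tick, then let
 * the device-specific ops decide whether the timer keeps running. */
static enum restart timer_fire(struct model_timer *t)
{
	t->pending++;                      /* in the kernel this also kicks/wakes the vcpu */
	return t->ops->is_periodic(t) ? RESTART : NORESTART;
}

static bool pit_like_is_periodic(struct model_timer *t)
{
	(void)t;
	return true;                       /* e.g. 8254 modes 2 and 3 */
}

int main(void)
{
	static const struct model_timer_ops pit_ops = { .is_periodic = pit_like_is_periodic };
	struct model_timer pit = { .period_ns = 54925000, .pending = 0, .ops = &pit_ops };

	for (int i = 0; i < 3; i++)
		printf("fire -> %s, pending=%d\n",
		       timer_fire(&pit) == RESTART ? "RESTART" : "NORESTART",
		       pit.pending);
	return 0;
}

The design point is that the 8254 and the local APIC timer differ only in how "periodic" is decided, so that single predicate is all the ops structure needs to carry for the handler to be shared.
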
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f0b67f2cdd6..ae99d83f81a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -196,20 +196,15 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
196} 196}
197EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); 197EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
198 198
199int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) 199static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
200 int vector, int level, int trig_mode);
201
202int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
200{ 203{
201 struct kvm_lapic *apic = vcpu->arch.apic; 204 struct kvm_lapic *apic = vcpu->arch.apic;
202 205
203 if (!apic_test_and_set_irr(vec, apic)) { 206 return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
204 /* a new pending irq is set in IRR */ 207 irq->level, irq->trig_mode);
205 if (trig)
206 apic_set_vector(vec, apic->regs + APIC_TMR);
207 else
208 apic_clear_vector(vec, apic->regs + APIC_TMR);
209 kvm_vcpu_kick(apic->vcpu);
210 return 1;
211 }
212 return 0;
213} 208}
214 209
215static inline int apic_find_highest_isr(struct kvm_lapic *apic) 210static inline int apic_find_highest_isr(struct kvm_lapic *apic)
@@ -250,7 +245,7 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
250 245
251int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) 246int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
252{ 247{
253 return kvm_apic_id(apic) == dest; 248 return dest == 0xff || kvm_apic_id(apic) == dest;
254} 249}
255 250
256int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) 251int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
@@ -279,37 +274,34 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
279 return result; 274 return result;
280} 275}
281 276
282static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 277int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
283 int short_hand, int dest, int dest_mode) 278 int short_hand, int dest, int dest_mode)
284{ 279{
285 int result = 0; 280 int result = 0;
286 struct kvm_lapic *target = vcpu->arch.apic; 281 struct kvm_lapic *target = vcpu->arch.apic;
287 282
288 apic_debug("target %p, source %p, dest 0x%x, " 283 apic_debug("target %p, source %p, dest 0x%x, "
289 "dest_mode 0x%x, short_hand 0x%x", 284 "dest_mode 0x%x, short_hand 0x%x\n",
290 target, source, dest, dest_mode, short_hand); 285 target, source, dest, dest_mode, short_hand);
291 286
292 ASSERT(!target); 287 ASSERT(!target);
293 switch (short_hand) { 288 switch (short_hand) {
294 case APIC_DEST_NOSHORT: 289 case APIC_DEST_NOSHORT:
295 if (dest_mode == 0) { 290 if (dest_mode == 0)
296 /* Physical mode. */ 291 /* Physical mode. */
297 if ((dest == 0xFF) || (dest == kvm_apic_id(target))) 292 result = kvm_apic_match_physical_addr(target, dest);
298 result = 1; 293 else
299 } else
300 /* Logical mode. */ 294 /* Logical mode. */
301 result = kvm_apic_match_logical_addr(target, dest); 295 result = kvm_apic_match_logical_addr(target, dest);
302 break; 296 break;
303 case APIC_DEST_SELF: 297 case APIC_DEST_SELF:
304 if (target == source) 298 result = (target == source);
305 result = 1;
306 break; 299 break;
307 case APIC_DEST_ALLINC: 300 case APIC_DEST_ALLINC:
308 result = 1; 301 result = 1;
309 break; 302 break;
310 case APIC_DEST_ALLBUT: 303 case APIC_DEST_ALLBUT:
311 if (target != source) 304 result = (target != source);
312 result = 1;
313 break; 305 break;
314 default: 306 default:
315 printk(KERN_WARNING "Bad dest shorthand value %x\n", 307 printk(KERN_WARNING "Bad dest shorthand value %x\n",
@@ -327,20 +319,22 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
327static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 319static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
328 int vector, int level, int trig_mode) 320 int vector, int level, int trig_mode)
329{ 321{
330 int orig_irr, result = 0; 322 int result = 0;
331 struct kvm_vcpu *vcpu = apic->vcpu; 323 struct kvm_vcpu *vcpu = apic->vcpu;
332 324
333 switch (delivery_mode) { 325 switch (delivery_mode) {
334 case APIC_DM_FIXED:
335 case APIC_DM_LOWEST: 326 case APIC_DM_LOWEST:
327 vcpu->arch.apic_arb_prio++;
328 case APIC_DM_FIXED:
336 /* FIXME add logic for vcpu on reset */ 329 /* FIXME add logic for vcpu on reset */
337 if (unlikely(!apic_enabled(apic))) 330 if (unlikely(!apic_enabled(apic)))
338 break; 331 break;
339 332
340 orig_irr = apic_test_and_set_irr(vector, apic); 333 result = !apic_test_and_set_irr(vector, apic);
341 if (orig_irr && trig_mode) { 334 if (!result) {
342 apic_debug("level trig mode repeatedly for vector %d", 335 if (trig_mode)
343 vector); 336 apic_debug("level trig mode repeatedly for "
337 "vector %d", vector);
344 break; 338 break;
345 } 339 }
346 340
@@ -349,10 +343,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
349 apic_set_vector(vector, apic->regs + APIC_TMR); 343 apic_set_vector(vector, apic->regs + APIC_TMR);
350 } else 344 } else
351 apic_clear_vector(vector, apic->regs + APIC_TMR); 345 apic_clear_vector(vector, apic->regs + APIC_TMR);
352
353 kvm_vcpu_kick(vcpu); 346 kvm_vcpu_kick(vcpu);
354
355 result = (orig_irr == 0);
356 break; 347 break;
357 348
358 case APIC_DM_REMRD: 349 case APIC_DM_REMRD:
@@ -364,12 +355,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
364 break; 355 break;
365 356
366 case APIC_DM_NMI: 357 case APIC_DM_NMI:
358 result = 1;
367 kvm_inject_nmi(vcpu); 359 kvm_inject_nmi(vcpu);
368 kvm_vcpu_kick(vcpu); 360 kvm_vcpu_kick(vcpu);
369 break; 361 break;
370 362
371 case APIC_DM_INIT: 363 case APIC_DM_INIT:
372 if (level) { 364 if (level) {
365 result = 1;
373 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 366 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
374 printk(KERN_DEBUG 367 printk(KERN_DEBUG
375 "INIT on a runnable vcpu %d\n", 368 "INIT on a runnable vcpu %d\n",
@@ -386,6 +379,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
386 apic_debug("SIPI to vcpu %d vector 0x%02x\n", 379 apic_debug("SIPI to vcpu %d vector 0x%02x\n",
387 vcpu->vcpu_id, vector); 380 vcpu->vcpu_id, vector);
388 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 381 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
382 result = 1;
389 vcpu->arch.sipi_vector = vector; 383 vcpu->arch.sipi_vector = vector;
390 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 384 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
391 kvm_vcpu_kick(vcpu); 385 kvm_vcpu_kick(vcpu);
@@ -408,43 +402,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
408 return result; 402 return result;
409} 403}
410 404
411static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, 405int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
412 unsigned long bitmap)
413{
414 int last;
415 int next;
416 struct kvm_lapic *apic = NULL;
417
418 last = kvm->arch.round_robin_prev_vcpu;
419 next = last;
420
421 do {
422 if (++next == KVM_MAX_VCPUS)
423 next = 0;
424 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
425 continue;
426 apic = kvm->vcpus[next]->arch.apic;
427 if (apic && apic_enabled(apic))
428 break;
429 apic = NULL;
430 } while (next != last);
431 kvm->arch.round_robin_prev_vcpu = next;
432
433 if (!apic)
434 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
435
436 return apic;
437}
438
439struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
440 unsigned long bitmap)
441{ 406{
442 struct kvm_lapic *apic; 407 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
443
444 apic = kvm_apic_round_robin(kvm, vector, bitmap);
445 if (apic)
446 return apic->vcpu;
447 return NULL;
448} 408}
449 409
450static void apic_set_eoi(struct kvm_lapic *apic) 410static void apic_set_eoi(struct kvm_lapic *apic)
@@ -472,47 +432,24 @@ static void apic_send_ipi(struct kvm_lapic *apic)
472{ 432{
473 u32 icr_low = apic_get_reg(apic, APIC_ICR); 433 u32 icr_low = apic_get_reg(apic, APIC_ICR);
474 u32 icr_high = apic_get_reg(apic, APIC_ICR2); 434 u32 icr_high = apic_get_reg(apic, APIC_ICR2);
435 struct kvm_lapic_irq irq;
475 436
476 unsigned int dest = GET_APIC_DEST_FIELD(icr_high); 437 irq.vector = icr_low & APIC_VECTOR_MASK;
477 unsigned int short_hand = icr_low & APIC_SHORT_MASK; 438 irq.delivery_mode = icr_low & APIC_MODE_MASK;
478 unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; 439 irq.dest_mode = icr_low & APIC_DEST_MASK;
479 unsigned int level = icr_low & APIC_INT_ASSERT; 440 irq.level = icr_low & APIC_INT_ASSERT;
480 unsigned int dest_mode = icr_low & APIC_DEST_MASK; 441 irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
481 unsigned int delivery_mode = icr_low & APIC_MODE_MASK; 442 irq.shorthand = icr_low & APIC_SHORT_MASK;
482 unsigned int vector = icr_low & APIC_VECTOR_MASK; 443 irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
483
484 struct kvm_vcpu *target;
485 struct kvm_vcpu *vcpu;
486 unsigned long lpr_map = 0;
487 int i;
488 444
489 apic_debug("icr_high 0x%x, icr_low 0x%x, " 445 apic_debug("icr_high 0x%x, icr_low 0x%x, "
490 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " 446 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
491 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", 447 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
492 icr_high, icr_low, short_hand, dest, 448 icr_high, icr_low, irq.shorthand, irq.dest_id,
493 trig_mode, level, dest_mode, delivery_mode, vector); 449 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
494 450 irq.vector);
495 for (i = 0; i < KVM_MAX_VCPUS; i++) {
496 vcpu = apic->vcpu->kvm->vcpus[i];
497 if (!vcpu)
498 continue;
499
500 if (vcpu->arch.apic &&
501 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
502 if (delivery_mode == APIC_DM_LOWEST)
503 set_bit(vcpu->vcpu_id, &lpr_map);
504 else
505 __apic_accept_irq(vcpu->arch.apic, delivery_mode,
506 vector, level, trig_mode);
507 }
508 }
509 451
510 if (delivery_mode == APIC_DM_LOWEST) { 452 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
511 target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
512 if (target != NULL)
513 __apic_accept_irq(target->arch.apic, delivery_mode,
514 vector, level, trig_mode);
515 }
516} 453}
517 454
518static u32 apic_get_tmcct(struct kvm_lapic *apic) 455static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -527,12 +464,13 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
527 if (apic_get_reg(apic, APIC_TMICT) == 0) 464 if (apic_get_reg(apic, APIC_TMICT) == 0)
528 return 0; 465 return 0;
529 466
530 remaining = hrtimer_expires_remaining(&apic->timer.dev); 467 remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer);
531 if (ktime_to_ns(remaining) < 0) 468 if (ktime_to_ns(remaining) < 0)
532 remaining = ktime_set(0, 0); 469 remaining = ktime_set(0, 0);
533 470
534 ns = mod_64(ktime_to_ns(remaining), apic->timer.period); 471 ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
535 tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); 472 tmcct = div64_u64(ns,
473 (APIC_BUS_CYCLE_NS * apic->divide_count));
536 474
537 return tmcct; 475 return tmcct;
538} 476}
@@ -619,25 +557,25 @@ static void update_divide_count(struct kvm_lapic *apic)
619 tdcr = apic_get_reg(apic, APIC_TDCR); 557 tdcr = apic_get_reg(apic, APIC_TDCR);
620 tmp1 = tdcr & 0xf; 558 tmp1 = tdcr & 0xf;
621 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; 559 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
622 apic->timer.divide_count = 0x1 << (tmp2 & 0x7); 560 apic->divide_count = 0x1 << (tmp2 & 0x7);
623 561
624 apic_debug("timer divide count is 0x%x\n", 562 apic_debug("timer divide count is 0x%x\n",
625 apic->timer.divide_count); 563 apic->divide_count);
626} 564}
627 565
628static void start_apic_timer(struct kvm_lapic *apic) 566static void start_apic_timer(struct kvm_lapic *apic)
629{ 567{
630 ktime_t now = apic->timer.dev.base->get_time(); 568 ktime_t now = apic->lapic_timer.timer.base->get_time();
631 569
632 apic->timer.period = apic_get_reg(apic, APIC_TMICT) * 570 apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) *
633 APIC_BUS_CYCLE_NS * apic->timer.divide_count; 571 APIC_BUS_CYCLE_NS * apic->divide_count;
634 atomic_set(&apic->timer.pending, 0); 572 atomic_set(&apic->lapic_timer.pending, 0);
635 573
636 if (!apic->timer.period) 574 if (!apic->lapic_timer.period)
637 return; 575 return;
638 576
639 hrtimer_start(&apic->timer.dev, 577 hrtimer_start(&apic->lapic_timer.timer,
640 ktime_add_ns(now, apic->timer.period), 578 ktime_add_ns(now, apic->lapic_timer.period),
641 HRTIMER_MODE_ABS); 579 HRTIMER_MODE_ABS);
642 580
643 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" 581 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
@@ -646,9 +584,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
646 "expire @ 0x%016" PRIx64 ".\n", __func__, 584 "expire @ 0x%016" PRIx64 ".\n", __func__,
647 APIC_BUS_CYCLE_NS, ktime_to_ns(now), 585 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
648 apic_get_reg(apic, APIC_TMICT), 586 apic_get_reg(apic, APIC_TMICT),
649 apic->timer.period, 587 apic->lapic_timer.period,
650 ktime_to_ns(ktime_add_ns(now, 588 ktime_to_ns(ktime_add_ns(now,
651 apic->timer.period))); 589 apic->lapic_timer.period)));
652} 590}
653 591
654static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) 592static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -730,7 +668,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
730 apic_set_reg(apic, APIC_LVTT + 0x10 * i, 668 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
731 lvt_val | APIC_LVT_MASKED); 669 lvt_val | APIC_LVT_MASKED);
732 } 670 }
733 atomic_set(&apic->timer.pending, 0); 671 atomic_set(&apic->lapic_timer.pending, 0);
734 672
735 } 673 }
736 break; 674 break;
@@ -762,7 +700,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
762 break; 700 break;
763 701
764 case APIC_TMICT: 702 case APIC_TMICT:
765 hrtimer_cancel(&apic->timer.dev); 703 hrtimer_cancel(&apic->lapic_timer.timer);
766 apic_set_reg(apic, APIC_TMICT, val); 704 apic_set_reg(apic, APIC_TMICT, val);
767 start_apic_timer(apic); 705 start_apic_timer(apic);
768 return; 706 return;
@@ -802,7 +740,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
802 if (!vcpu->arch.apic) 740 if (!vcpu->arch.apic)
803 return; 741 return;
804 742
805 hrtimer_cancel(&vcpu->arch.apic->timer.dev); 743 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
806 744
807 if (vcpu->arch.apic->regs_page) 745 if (vcpu->arch.apic->regs_page)
808 __free_page(vcpu->arch.apic->regs_page); 746 __free_page(vcpu->arch.apic->regs_page);
@@ -880,7 +818,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
880 ASSERT(apic != NULL); 818 ASSERT(apic != NULL);
881 819
882 /* Stop the timer in case it's a reset to an active apic */ 820 /* Stop the timer in case it's a reset to an active apic */
883 hrtimer_cancel(&apic->timer.dev); 821 hrtimer_cancel(&apic->lapic_timer.timer);
884 822
885 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); 823 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
886 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 824 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
@@ -905,11 +843,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
905 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 843 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
906 } 844 }
907 update_divide_count(apic); 845 update_divide_count(apic);
908 atomic_set(&apic->timer.pending, 0); 846 atomic_set(&apic->lapic_timer.pending, 0);
909 if (vcpu->vcpu_id == 0) 847 if (vcpu->vcpu_id == 0)
910 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 848 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
911 apic_update_ppr(apic); 849 apic_update_ppr(apic);
912 850
851 vcpu->arch.apic_arb_prio = 0;
852
913 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" 853 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
914 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, 854 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
915 vcpu, kvm_apic_id(apic), 855 vcpu, kvm_apic_id(apic),
@@ -917,16 +857,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
917} 857}
918EXPORT_SYMBOL_GPL(kvm_lapic_reset); 858EXPORT_SYMBOL_GPL(kvm_lapic_reset);
919 859
920int kvm_lapic_enabled(struct kvm_vcpu *vcpu) 860bool kvm_apic_present(struct kvm_vcpu *vcpu)
921{ 861{
922 struct kvm_lapic *apic = vcpu->arch.apic; 862 return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
923 int ret = 0; 863}
924
925 if (!apic)
926 return 0;
927 ret = apic_enabled(apic);
928 864
929 return ret; 865int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
866{
867 return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
930} 868}
931EXPORT_SYMBOL_GPL(kvm_lapic_enabled); 869EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
932 870
@@ -936,22 +874,11 @@ EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
936 *---------------------------------------------------------------------- 874 *----------------------------------------------------------------------
937 */ 875 */
938 876
939/* TODO: make sure __apic_timer_fn runs in current pCPU */ 877static bool lapic_is_periodic(struct kvm_timer *ktimer)
940static int __apic_timer_fn(struct kvm_lapic *apic)
941{ 878{
942 int result = 0; 879 struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
943 wait_queue_head_t *q = &apic->vcpu->wq; 880 lapic_timer);
944 881 return apic_lvtt_period(apic);
945 if(!atomic_inc_and_test(&apic->timer.pending))
946 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
947 if (waitqueue_active(q))
948 wake_up_interruptible(q);
949
950 if (apic_lvtt_period(apic)) {
951 result = 1;
952 hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
953 }
954 return result;
955} 882}
956 883
957int apic_has_pending_timer(struct kvm_vcpu *vcpu) 884int apic_has_pending_timer(struct kvm_vcpu *vcpu)
@@ -959,7 +886,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
959 struct kvm_lapic *lapic = vcpu->arch.apic; 886 struct kvm_lapic *lapic = vcpu->arch.apic;
960 887
961 if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) 888 if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
962 return atomic_read(&lapic->timer.pending); 889 return atomic_read(&lapic->lapic_timer.pending);
963 890
964 return 0; 891 return 0;
965} 892}
@@ -986,20 +913,9 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
986 kvm_apic_local_deliver(apic, APIC_LVT0); 913 kvm_apic_local_deliver(apic, APIC_LVT0);
987} 914}
988 915
989static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) 916static struct kvm_timer_ops lapic_timer_ops = {
990{ 917 .is_periodic = lapic_is_periodic,
991 struct kvm_lapic *apic; 918};
992 int restart_timer = 0;
993
994 apic = container_of(data, struct kvm_lapic, timer.dev);
995
996 restart_timer = __apic_timer_fn(apic);
997
998 if (restart_timer)
999 return HRTIMER_RESTART;
1000 else
1001 return HRTIMER_NORESTART;
1002}
1003 919
1004int kvm_create_lapic(struct kvm_vcpu *vcpu) 920int kvm_create_lapic(struct kvm_vcpu *vcpu)
1005{ 921{
@@ -1024,8 +940,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1024 memset(apic->regs, 0, PAGE_SIZE); 940 memset(apic->regs, 0, PAGE_SIZE);
1025 apic->vcpu = vcpu; 941 apic->vcpu = vcpu;
1026 942
1027 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 943 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
1028 apic->timer.dev.function = apic_timer_fn; 944 HRTIMER_MODE_ABS);
945 apic->lapic_timer.timer.function = kvm_timer_fn;
946 apic->lapic_timer.t_ops = &lapic_timer_ops;
947 apic->lapic_timer.kvm = vcpu->kvm;
948 apic->lapic_timer.vcpu_id = vcpu->vcpu_id;
949
1029 apic->base_address = APIC_DEFAULT_PHYS_BASE; 950 apic->base_address = APIC_DEFAULT_PHYS_BASE;
1030 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; 951 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
1031 952
@@ -1078,9 +999,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1078{ 999{
1079 struct kvm_lapic *apic = vcpu->arch.apic; 1000 struct kvm_lapic *apic = vcpu->arch.apic;
1080 1001
1081 if (apic && atomic_read(&apic->timer.pending) > 0) { 1002 if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
1082 if (kvm_apic_local_deliver(apic, APIC_LVTT)) 1003 if (kvm_apic_local_deliver(apic, APIC_LVTT))
1083 atomic_dec(&apic->timer.pending); 1004 atomic_dec(&apic->lapic_timer.pending);
1084 } 1005 }
1085} 1006}
1086 1007
@@ -1106,7 +1027,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1106 MSR_IA32_APICBASE_BASE; 1027 MSR_IA32_APICBASE_BASE;
1107 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 1028 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1108 apic_update_ppr(apic); 1029 apic_update_ppr(apic);
1109 hrtimer_cancel(&apic->timer.dev); 1030 hrtimer_cancel(&apic->lapic_timer.timer);
1110 update_divide_count(apic); 1031 update_divide_count(apic);
1111 start_apic_timer(apic); 1032 start_apic_timer(apic);
1112} 1033}
@@ -1119,7 +1040,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1119 if (!apic) 1040 if (!apic)
1120 return; 1041 return;
1121 1042
1122 timer = &apic->timer.dev; 1043 timer = &apic->lapic_timer.timer;
1123 if (hrtimer_cancel(timer)) 1044 if (hrtimer_cancel(timer))
1124 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 1045 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
1125} 1046}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 45ab6ee7120..a587f8349c4 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -2,18 +2,15 @@
2#define __KVM_X86_LAPIC_H 2#define __KVM_X86_LAPIC_H
3 3
4#include "iodev.h" 4#include "iodev.h"
5#include "kvm_timer.h"
5 6
6#include <linux/kvm_host.h> 7#include <linux/kvm_host.h>
7 8
8struct kvm_lapic { 9struct kvm_lapic {
9 unsigned long base_address; 10 unsigned long base_address;
10 struct kvm_io_device dev; 11 struct kvm_io_device dev;
11 struct { 12 struct kvm_timer lapic_timer;
12 atomic_t pending; 13 u32 divide_count;
13 s64 period; /* unit: ns */
14 u32 divide_count;
15 struct hrtimer dev;
16 } timer;
17 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
18 struct page *regs_page; 15 struct page *regs_page;
19 void *regs; 16 void *regs;
@@ -34,12 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
34 31
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 32int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 33int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
37int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); 34int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
38 35
39u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); 36u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
40void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); 37void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
41void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); 38void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
42int kvm_lapic_enabled(struct kvm_vcpu *vcpu); 39int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
40bool kvm_apic_present(struct kvm_vcpu *vcpu);
43int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); 41int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
44 42
45void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); 43void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b6caf1329b1..5c3d6e81a7d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -126,6 +126,7 @@ module_param(oos_shadow, bool, 0644);
126#define PFERR_PRESENT_MASK (1U << 0) 126#define PFERR_PRESENT_MASK (1U << 0)
127#define PFERR_WRITE_MASK (1U << 1) 127#define PFERR_WRITE_MASK (1U << 1)
128#define PFERR_USER_MASK (1U << 2) 128#define PFERR_USER_MASK (1U << 2)
129#define PFERR_RSVD_MASK (1U << 3)
129#define PFERR_FETCH_MASK (1U << 4) 130#define PFERR_FETCH_MASK (1U << 4)
130 131
131#define PT_DIRECTORY_LEVEL 2 132#define PT_DIRECTORY_LEVEL 2
@@ -177,7 +178,11 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
177static u64 __read_mostly shadow_user_mask; 178static u64 __read_mostly shadow_user_mask;
178static u64 __read_mostly shadow_accessed_mask; 179static u64 __read_mostly shadow_accessed_mask;
179static u64 __read_mostly shadow_dirty_mask; 180static u64 __read_mostly shadow_dirty_mask;
180static u64 __read_mostly shadow_mt_mask; 181
182static inline u64 rsvd_bits(int s, int e)
183{
184 return ((1ULL << (e - s + 1)) - 1) << s;
185}
181 186
182void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) 187void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
183{ 188{
@@ -193,14 +198,13 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
193EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); 198EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
194 199
195void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 200void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
196 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask) 201 u64 dirty_mask, u64 nx_mask, u64 x_mask)
197{ 202{
198 shadow_user_mask = user_mask; 203 shadow_user_mask = user_mask;
199 shadow_accessed_mask = accessed_mask; 204 shadow_accessed_mask = accessed_mask;
200 shadow_dirty_mask = dirty_mask; 205 shadow_dirty_mask = dirty_mask;
201 shadow_nx_mask = nx_mask; 206 shadow_nx_mask = nx_mask;
202 shadow_x_mask = x_mask; 207 shadow_x_mask = x_mask;
203 shadow_mt_mask = mt_mask;
204} 208}
205EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 209EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
206 210
@@ -219,11 +223,6 @@ static int is_nx(struct kvm_vcpu *vcpu)
219 return vcpu->arch.shadow_efer & EFER_NX; 223 return vcpu->arch.shadow_efer & EFER_NX;
220} 224}
221 225
222static int is_present_pte(unsigned long pte)
223{
224 return pte & PT_PRESENT_MASK;
225}
226
227static int is_shadow_present_pte(u64 pte) 226static int is_shadow_present_pte(u64 pte)
228{ 227{
229 return pte != shadow_trap_nonpresent_pte 228 return pte != shadow_trap_nonpresent_pte
@@ -1074,18 +1073,10 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1074 return NULL; 1073 return NULL;
1075} 1074}
1076 1075
1077static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
1078{
1079 list_del(&sp->oos_link);
1080 --kvm->stat.mmu_unsync_global;
1081}
1082
1083static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1076static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1084{ 1077{
1085 WARN_ON(!sp->unsync); 1078 WARN_ON(!sp->unsync);
1086 sp->unsync = 0; 1079 sp->unsync = 0;
1087 if (sp->global)
1088 kvm_unlink_unsync_global(kvm, sp);
1089 --kvm->stat.mmu_unsync; 1080 --kvm->stat.mmu_unsync;
1090} 1081}
1091 1082
@@ -1248,7 +1239,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1248 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); 1239 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
1249 sp->gfn = gfn; 1240 sp->gfn = gfn;
1250 sp->role = role; 1241 sp->role = role;
1251 sp->global = 0;
1252 hlist_add_head(&sp->hash_link, bucket); 1242 hlist_add_head(&sp->hash_link, bucket);
1253 if (!direct) { 1243 if (!direct) {
1254 if (rmap_write_protect(vcpu->kvm, gfn)) 1244 if (rmap_write_protect(vcpu->kvm, gfn))
@@ -1616,7 +1606,7 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1616 return mtrr_state->def_type; 1606 return mtrr_state->def_type;
1617} 1607}
1618 1608
1619static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) 1609u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1620{ 1610{
1621 u8 mtrr; 1611 u8 mtrr;
1622 1612
@@ -1626,6 +1616,7 @@ static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1626 mtrr = MTRR_TYPE_WRBACK; 1616 mtrr = MTRR_TYPE_WRBACK;
1627 return mtrr; 1617 return mtrr;
1628} 1618}
1619EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1629 1620
1630static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1621static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1631{ 1622{
@@ -1646,11 +1637,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1646 ++vcpu->kvm->stat.mmu_unsync; 1637 ++vcpu->kvm->stat.mmu_unsync;
1647 sp->unsync = 1; 1638 sp->unsync = 1;
1648 1639
1649 if (sp->global) { 1640 kvm_mmu_mark_parents_unsync(vcpu, sp);
1650 list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
1651 ++vcpu->kvm->stat.mmu_unsync_global;
1652 } else
1653 kvm_mmu_mark_parents_unsync(vcpu, sp);
1654 1641
1655 mmu_convert_notrap(sp); 1642 mmu_convert_notrap(sp);
1656 return 0; 1643 return 0;
@@ -1677,21 +1664,11 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1677static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1664static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1678 unsigned pte_access, int user_fault, 1665 unsigned pte_access, int user_fault,
1679 int write_fault, int dirty, int largepage, 1666 int write_fault, int dirty, int largepage,
1680 int global, gfn_t gfn, pfn_t pfn, bool speculative, 1667 gfn_t gfn, pfn_t pfn, bool speculative,
1681 bool can_unsync) 1668 bool can_unsync)
1682{ 1669{
1683 u64 spte; 1670 u64 spte;
1684 int ret = 0; 1671 int ret = 0;
1685 u64 mt_mask = shadow_mt_mask;
1686 struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
1687
1688 if (!global && sp->global) {
1689 sp->global = 0;
1690 if (sp->unsync) {
1691 kvm_unlink_unsync_global(vcpu->kvm, sp);
1692 kvm_mmu_mark_parents_unsync(vcpu, sp);
1693 }
1694 }
1695 1672
1696 /* 1673 /*
1697 * We don't set the accessed bit, since we sometimes want to see 1674 * We don't set the accessed bit, since we sometimes want to see
@@ -1711,16 +1688,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1711 spte |= shadow_user_mask; 1688 spte |= shadow_user_mask;
1712 if (largepage) 1689 if (largepage)
1713 spte |= PT_PAGE_SIZE_MASK; 1690 spte |= PT_PAGE_SIZE_MASK;
1714 if (mt_mask) { 1691 if (tdp_enabled)
1715 if (!kvm_is_mmio_pfn(pfn)) { 1692 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1716 mt_mask = get_memory_type(vcpu, gfn) << 1693 kvm_is_mmio_pfn(pfn));
1717 kvm_x86_ops->get_mt_mask_shift();
1718 mt_mask |= VMX_EPT_IGMT_BIT;
1719 } else
1720 mt_mask = MTRR_TYPE_UNCACHABLE <<
1721 kvm_x86_ops->get_mt_mask_shift();
1722 spte |= mt_mask;
1723 }
1724 1694
1725 spte |= (u64)pfn << PAGE_SHIFT; 1695 spte |= (u64)pfn << PAGE_SHIFT;
1726 1696
@@ -1765,8 +1735,8 @@ set_pte:
1765static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1735static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1766 unsigned pt_access, unsigned pte_access, 1736 unsigned pt_access, unsigned pte_access,
1767 int user_fault, int write_fault, int dirty, 1737 int user_fault, int write_fault, int dirty,
1768 int *ptwrite, int largepage, int global, 1738 int *ptwrite, int largepage, gfn_t gfn,
1769 gfn_t gfn, pfn_t pfn, bool speculative) 1739 pfn_t pfn, bool speculative)
1770{ 1740{
1771 int was_rmapped = 0; 1741 int was_rmapped = 0;
1772 int was_writeble = is_writeble_pte(*shadow_pte); 1742 int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1795,7 +1765,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1795 was_rmapped = 1; 1765 was_rmapped = 1;
1796 } 1766 }
1797 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, 1767 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1798 dirty, largepage, global, gfn, pfn, speculative, true)) { 1768 dirty, largepage, gfn, pfn, speculative, true)) {
1799 if (write_fault) 1769 if (write_fault)
1800 *ptwrite = 1; 1770 *ptwrite = 1;
1801 kvm_x86_ops->tlb_flush(vcpu); 1771 kvm_x86_ops->tlb_flush(vcpu);
@@ -1843,7 +1813,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1843 || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { 1813 || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
1844 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 1814 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
1845 0, write, 1, &pt_write, 1815 0, write, 1, &pt_write,
1846 largepage, 0, gfn, pfn, false); 1816 largepage, gfn, pfn, false);
1847 ++vcpu->stat.pf_fixed; 1817 ++vcpu->stat.pf_fixed;
1848 break; 1818 break;
1849 } 1819 }
@@ -1942,7 +1912,19 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
1942 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 1912 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1943} 1913}
1944 1914
1945static void mmu_alloc_roots(struct kvm_vcpu *vcpu) 1915static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
1916{
1917 int ret = 0;
1918
1919 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
1920 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
1921 ret = 1;
1922 }
1923
1924 return ret;
1925}
1926
1927static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
1946{ 1928{
1947 int i; 1929 int i;
1948 gfn_t root_gfn; 1930 gfn_t root_gfn;
@@ -1957,13 +1939,15 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1957 ASSERT(!VALID_PAGE(root)); 1939 ASSERT(!VALID_PAGE(root));
1958 if (tdp_enabled) 1940 if (tdp_enabled)
1959 direct = 1; 1941 direct = 1;
1942 if (mmu_check_root(vcpu, root_gfn))
1943 return 1;
1960 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 1944 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1961 PT64_ROOT_LEVEL, direct, 1945 PT64_ROOT_LEVEL, direct,
1962 ACC_ALL, NULL); 1946 ACC_ALL, NULL);
1963 root = __pa(sp->spt); 1947 root = __pa(sp->spt);
1964 ++sp->root_count; 1948 ++sp->root_count;
1965 vcpu->arch.mmu.root_hpa = root; 1949 vcpu->arch.mmu.root_hpa = root;
1966 return; 1950 return 0;
1967 } 1951 }
1968 direct = !is_paging(vcpu); 1952 direct = !is_paging(vcpu);
1969 if (tdp_enabled) 1953 if (tdp_enabled)
@@ -1980,6 +1964,8 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1980 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; 1964 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1981 } else if (vcpu->arch.mmu.root_level == 0) 1965 } else if (vcpu->arch.mmu.root_level == 0)
1982 root_gfn = 0; 1966 root_gfn = 0;
1967 if (mmu_check_root(vcpu, root_gfn))
1968 return 1;
1983 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 1969 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1984 PT32_ROOT_LEVEL, direct, 1970 PT32_ROOT_LEVEL, direct,
1985 ACC_ALL, NULL); 1971 ACC_ALL, NULL);
@@ -1988,6 +1974,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1988 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 1974 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1989 } 1975 }
1990 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 1976 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1977 return 0;
1991} 1978}
1992 1979
1993static void mmu_sync_roots(struct kvm_vcpu *vcpu) 1980static void mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2006,7 +1993,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2006 for (i = 0; i < 4; ++i) { 1993 for (i = 0; i < 4; ++i) {
2007 hpa_t root = vcpu->arch.mmu.pae_root[i]; 1994 hpa_t root = vcpu->arch.mmu.pae_root[i];
2008 1995
2009 if (root) { 1996 if (root && VALID_PAGE(root)) {
2010 root &= PT64_BASE_ADDR_MASK; 1997 root &= PT64_BASE_ADDR_MASK;
2011 sp = page_header(root); 1998 sp = page_header(root);
2012 mmu_sync_children(vcpu, sp); 1999 mmu_sync_children(vcpu, sp);
@@ -2014,15 +2001,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2014 } 2001 }
2015} 2002}
2016 2003
2017static void mmu_sync_global(struct kvm_vcpu *vcpu)
2018{
2019 struct kvm *kvm = vcpu->kvm;
2020 struct kvm_mmu_page *sp, *n;
2021
2022 list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
2023 kvm_sync_page(vcpu, sp);
2024}
2025
2026void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2004void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2027{ 2005{
2028 spin_lock(&vcpu->kvm->mmu_lock); 2006 spin_lock(&vcpu->kvm->mmu_lock);
@@ -2030,13 +2008,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2030 spin_unlock(&vcpu->kvm->mmu_lock); 2008 spin_unlock(&vcpu->kvm->mmu_lock);
2031} 2009}
2032 2010
2033void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
2034{
2035 spin_lock(&vcpu->kvm->mmu_lock);
2036 mmu_sync_global(vcpu);
2037 spin_unlock(&vcpu->kvm->mmu_lock);
2038}
2039
2040static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 2011static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
2041{ 2012{
2042 return vaddr; 2013 return vaddr;
@@ -2151,6 +2122,14 @@ static void paging_free(struct kvm_vcpu *vcpu)
2151 nonpaging_free(vcpu); 2122 nonpaging_free(vcpu);
2152} 2123}
2153 2124
2125static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2126{
2127 int bit7;
2128
2129 bit7 = (gpte >> 7) & 1;
2130 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
2131}
2132
2154#define PTTYPE 64 2133#define PTTYPE 64
2155#include "paging_tmpl.h" 2134#include "paging_tmpl.h"
2156#undef PTTYPE 2135#undef PTTYPE
@@ -2159,6 +2138,59 @@ static void paging_free(struct kvm_vcpu *vcpu)
2159#include "paging_tmpl.h" 2138#include "paging_tmpl.h"
2160#undef PTTYPE 2139#undef PTTYPE
2161 2140
2141static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2142{
2143 struct kvm_mmu *context = &vcpu->arch.mmu;
2144 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2145 u64 exb_bit_rsvd = 0;
2146
2147 if (!is_nx(vcpu))
2148 exb_bit_rsvd = rsvd_bits(63, 63);
2149 switch (level) {
2150 case PT32_ROOT_LEVEL:
2151 /* no rsvd bits for 2 level 4K page table entries */
2152 context->rsvd_bits_mask[0][1] = 0;
2153 context->rsvd_bits_mask[0][0] = 0;
2154 if (is_cpuid_PSE36())
2155 /* 36bits PSE 4MB page */
2156 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2157 else
2158 /* 32 bits PSE 4MB page */
2159 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2160 context->rsvd_bits_mask[1][0] = ~0ull;
2161 break;
2162 case PT32E_ROOT_LEVEL:
2163 context->rsvd_bits_mask[0][2] =
2164 rsvd_bits(maxphyaddr, 63) |
2165 rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
2166 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2167 rsvd_bits(maxphyaddr, 62); /* PDE */
2168 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2169 rsvd_bits(maxphyaddr, 62); /* PTE */
2170 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2171 rsvd_bits(maxphyaddr, 62) |
2172 rsvd_bits(13, 20); /* large page */
2173 context->rsvd_bits_mask[1][0] = ~0ull;
2174 break;
2175 case PT64_ROOT_LEVEL:
2176 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
2177 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2178 context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
2179 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2180 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2181 rsvd_bits(maxphyaddr, 51);
2182 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2183 rsvd_bits(maxphyaddr, 51);
2184 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2185 context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
2186 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2187 rsvd_bits(maxphyaddr, 51) |
2188 rsvd_bits(13, 20); /* large page */
2189 context->rsvd_bits_mask[1][0] = ~0ull;
2190 break;
2191 }
2192}
2193
2162static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) 2194static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2163{ 2195{
2164 struct kvm_mmu *context = &vcpu->arch.mmu; 2196 struct kvm_mmu *context = &vcpu->arch.mmu;
@@ -2179,6 +2211,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2179 2211
2180static int paging64_init_context(struct kvm_vcpu *vcpu) 2212static int paging64_init_context(struct kvm_vcpu *vcpu)
2181{ 2213{
2214 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2182 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); 2215 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2183} 2216}
2184 2217
@@ -2186,6 +2219,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2186{ 2219{
2187 struct kvm_mmu *context = &vcpu->arch.mmu; 2220 struct kvm_mmu *context = &vcpu->arch.mmu;
2188 2221
2222 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2189 context->new_cr3 = paging_new_cr3; 2223 context->new_cr3 = paging_new_cr3;
2190 context->page_fault = paging32_page_fault; 2224 context->page_fault = paging32_page_fault;
2191 context->gva_to_gpa = paging32_gva_to_gpa; 2225 context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2201,6 +2235,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2201 2235
2202static int paging32E_init_context(struct kvm_vcpu *vcpu) 2236static int paging32E_init_context(struct kvm_vcpu *vcpu)
2203{ 2237{
2238 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2204 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); 2239 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2205} 2240}
2206 2241
@@ -2221,12 +2256,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2221 context->gva_to_gpa = nonpaging_gva_to_gpa; 2256 context->gva_to_gpa = nonpaging_gva_to_gpa;
2222 context->root_level = 0; 2257 context->root_level = 0;
2223 } else if (is_long_mode(vcpu)) { 2258 } else if (is_long_mode(vcpu)) {
2259 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2224 context->gva_to_gpa = paging64_gva_to_gpa; 2260 context->gva_to_gpa = paging64_gva_to_gpa;
2225 context->root_level = PT64_ROOT_LEVEL; 2261 context->root_level = PT64_ROOT_LEVEL;
2226 } else if (is_pae(vcpu)) { 2262 } else if (is_pae(vcpu)) {
2263 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2227 context->gva_to_gpa = paging64_gva_to_gpa; 2264 context->gva_to_gpa = paging64_gva_to_gpa;
2228 context->root_level = PT32E_ROOT_LEVEL; 2265 context->root_level = PT32E_ROOT_LEVEL;
2229 } else { 2266 } else {
2267 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2230 context->gva_to_gpa = paging32_gva_to_gpa; 2268 context->gva_to_gpa = paging32_gva_to_gpa;
2231 context->root_level = PT32_ROOT_LEVEL; 2269 context->root_level = PT32_ROOT_LEVEL;
2232 } 2270 }
@@ -2290,9 +2328,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2290 goto out; 2328 goto out;
2291 spin_lock(&vcpu->kvm->mmu_lock); 2329 spin_lock(&vcpu->kvm->mmu_lock);
2292 kvm_mmu_free_some_pages(vcpu); 2330 kvm_mmu_free_some_pages(vcpu);
2293 mmu_alloc_roots(vcpu); 2331 r = mmu_alloc_roots(vcpu);
2294 mmu_sync_roots(vcpu); 2332 mmu_sync_roots(vcpu);
2295 spin_unlock(&vcpu->kvm->mmu_lock); 2333 spin_unlock(&vcpu->kvm->mmu_lock);
2334 if (r)
2335 goto out;
2296 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2336 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2297 kvm_mmu_flush_tlb(vcpu); 2337 kvm_mmu_flush_tlb(vcpu);
2298out: 2338out:
@@ -2638,14 +2678,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
2638 2678
2639static void free_mmu_pages(struct kvm_vcpu *vcpu) 2679static void free_mmu_pages(struct kvm_vcpu *vcpu)
2640{ 2680{
2641 struct kvm_mmu_page *sp;
2642
2643 while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2644 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
2645 struct kvm_mmu_page, link);
2646 kvm_mmu_zap_page(vcpu->kvm, sp);
2647 cond_resched();
2648 }
2649 free_page((unsigned long)vcpu->arch.mmu.pae_root); 2681 free_page((unsigned long)vcpu->arch.mmu.pae_root);
2650} 2682}
2651 2683
@@ -2710,7 +2742,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2710{ 2742{
2711 struct kvm_mmu_page *sp; 2743 struct kvm_mmu_page *sp;
2712 2744
2713 spin_lock(&kvm->mmu_lock);
2714 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 2745 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
2715 int i; 2746 int i;
2716 u64 *pt; 2747 u64 *pt;
@@ -2725,7 +2756,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2725 pt[i] &= ~PT_WRITABLE_MASK; 2756 pt[i] &= ~PT_WRITABLE_MASK;
2726 } 2757 }
2727 kvm_flush_remote_tlbs(kvm); 2758 kvm_flush_remote_tlbs(kvm);
2728 spin_unlock(&kvm->mmu_lock);
2729} 2759}
2730 2760
2731void kvm_mmu_zap_all(struct kvm *kvm) 2761void kvm_mmu_zap_all(struct kvm *kvm)
@@ -2897,8 +2927,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
2897 2927
2898static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 2928static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2899{ 2929{
2900 kvm_x86_ops->tlb_flush(vcpu); 2930 kvm_set_cr3(vcpu, vcpu->arch.cr3);
2901 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
2902 return 1; 2931 return 1;
2903} 2932}
2904 2933
@@ -3008,11 +3037,13 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3008 " in nonleaf level: levels %d gva %lx" 3037 " in nonleaf level: levels %d gva %lx"
3009 " level %d pte %llx\n", audit_msg, 3038 " level %d pte %llx\n", audit_msg,
3010 vcpu->arch.mmu.root_level, va, level, ent); 3039 vcpu->arch.mmu.root_level, va, level, ent);
3011 3040 else
3012 audit_mappings_page(vcpu, ent, va, level - 1); 3041 audit_mappings_page(vcpu, ent, va, level - 1);
3013 } else { 3042 } else {
3014 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); 3043 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
3015 hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT; 3044 gfn_t gfn = gpa >> PAGE_SHIFT;
3045 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3046 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3016 3047
3017 if (is_shadow_present_pte(ent) 3048 if (is_shadow_present_pte(ent)
3018 && (ent & PT64_BASE_ADDR_MASK) != hpa) 3049 && (ent & PT64_BASE_ADDR_MASK) != hpa)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index eaab2145f62..3494a2fb136 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
75 return vcpu->arch.cr0 & X86_CR0_PG; 75 return vcpu->arch.cr0 & X86_CR0_PG;
76} 76}
77 77
78static inline int is_present_pte(unsigned long pte)
79{
80 return pte & PT_PRESENT_MASK;
81}
82
78#endif 83#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bd70206c56..258e4591e1c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
123 gfn_t table_gfn; 123 gfn_t table_gfn;
124 unsigned index, pt_access, pte_access; 124 unsigned index, pt_access, pte_access;
125 gpa_t pte_gpa; 125 gpa_t pte_gpa;
126 int rsvd_fault = 0;
126 127
127 pgprintk("%s: addr %lx\n", __func__, addr); 128 pgprintk("%s: addr %lx\n", __func__, addr);
128walk: 129walk:
@@ -157,6 +158,10 @@ walk:
157 if (!is_present_pte(pte)) 158 if (!is_present_pte(pte))
158 goto not_present; 159 goto not_present;
159 160
161 rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
162 if (rsvd_fault)
163 goto access_error;
164
160 if (write_fault && !is_writeble_pte(pte)) 165 if (write_fault && !is_writeble_pte(pte))
161 if (user_fault || is_write_protection(vcpu)) 166 if (user_fault || is_write_protection(vcpu))
162 goto access_error; 167 goto access_error;
@@ -209,7 +214,6 @@ walk:
209 if (ret) 214 if (ret)
210 goto walk; 215 goto walk;
211 pte |= PT_DIRTY_MASK; 216 pte |= PT_DIRTY_MASK;
212 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
213 walker->ptes[walker->level - 1] = pte; 217 walker->ptes[walker->level - 1] = pte;
214 } 218 }
215 219
@@ -233,6 +237,8 @@ err:
233 walker->error_code |= PFERR_USER_MASK; 237 walker->error_code |= PFERR_USER_MASK;
234 if (fetch_fault) 238 if (fetch_fault)
235 walker->error_code |= PFERR_FETCH_MASK; 239 walker->error_code |= PFERR_FETCH_MASK;
240 if (rsvd_fault)
241 walker->error_code |= PFERR_RSVD_MASK;
236 return 0; 242 return 0;
237} 243}
238 244
@@ -262,8 +268,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
262 kvm_get_pfn(pfn); 268 kvm_get_pfn(pfn);
263 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 269 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
264 gpte & PT_DIRTY_MASK, NULL, largepage, 270 gpte & PT_DIRTY_MASK, NULL, largepage,
265 gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte), 271 gpte_to_gfn(gpte), pfn, true);
266 pfn, true);
267} 272}
268 273
269/* 274/*
@@ -297,7 +302,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
297 user_fault, write_fault, 302 user_fault, write_fault,
298 gw->ptes[gw->level-1] & PT_DIRTY_MASK, 303 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
299 ptwrite, largepage, 304 ptwrite, largepage,
300 gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
301 gw->gfn, pfn, false); 305 gw->gfn, pfn, false);
302 break; 306 break;
303 } 307 }
@@ -380,7 +384,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
380 return r; 384 return r;
381 385
382 /* 386 /*
383 * Look up the shadow pte for the faulting address. 387 * Look up the guest pte for the faulting address.
384 */ 388 */
385 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, 389 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
386 fetch_fault); 390 fetch_fault);
@@ -586,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
586 nr_present++; 590 nr_present++;
587 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 591 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
588 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 592 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
589 is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn, 593 is_dirty_pte(gpte), 0, gfn,
590 spte_to_pfn(sp->spt[i]), true, false); 594 spte_to_pfn(sp->spt[i]), true, false);
591 } 595 }
592 596
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1f8510c51d6..71510e07e69 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -19,6 +19,7 @@
19#include "irq.h" 19#include "irq.h"
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h" 21#include "kvm_cache_regs.h"
22#include "x86.h"
22 23
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/kernel.h> 25#include <linux/kernel.h>
@@ -69,7 +70,6 @@ module_param(npt, int, S_IRUGO);
69static int nested = 0; 70static int nested = 0;
70module_param(nested, int, S_IRUGO); 71module_param(nested, int, S_IRUGO);
71 72
72static void kvm_reput_irq(struct vcpu_svm *svm);
73static void svm_flush_tlb(struct kvm_vcpu *vcpu); 73static void svm_flush_tlb(struct kvm_vcpu *vcpu);
74 74
75static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); 75static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
@@ -132,24 +132,6 @@ static inline u32 svm_has(u32 feat)
132 return svm_features & feat; 132 return svm_features & feat;
133} 133}
134 134
135static inline u8 pop_irq(struct kvm_vcpu *vcpu)
136{
137 int word_index = __ffs(vcpu->arch.irq_summary);
138 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
139 int irq = word_index * BITS_PER_LONG + bit_index;
140
141 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
142 if (!vcpu->arch.irq_pending[word_index])
143 clear_bit(word_index, &vcpu->arch.irq_summary);
144 return irq;
145}
146
147static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
148{
149 set_bit(irq, vcpu->arch.irq_pending);
150 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
151}
152
153static inline void clgi(void) 135static inline void clgi(void)
154{ 136{
155 asm volatile (__ex(SVM_CLGI)); 137 asm volatile (__ex(SVM_CLGI));
@@ -214,17 +196,31 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
214 svm->vmcb->control.event_inj_err = error_code; 196 svm->vmcb->control.event_inj_err = error_code;
215} 197}
216 198
217static bool svm_exception_injected(struct kvm_vcpu *vcpu) 199static int is_external_interrupt(u32 info)
200{
201 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
202 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
203}
204
205static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
218{ 206{
219 struct vcpu_svm *svm = to_svm(vcpu); 207 struct vcpu_svm *svm = to_svm(vcpu);
208 u32 ret = 0;
220 209
221 return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); 210 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
211 ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
212 return ret & mask;
222} 213}
223 214
224static int is_external_interrupt(u32 info) 215static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
225{ 216{
226 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; 217 struct vcpu_svm *svm = to_svm(vcpu);
227 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); 218
219 if (mask == 0)
220 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
221 else
222 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
223
228} 224}
229 225
230static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 226static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
@@ -232,7 +228,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
232 struct vcpu_svm *svm = to_svm(vcpu); 228 struct vcpu_svm *svm = to_svm(vcpu);
233 229
234 if (!svm->next_rip) { 230 if (!svm->next_rip) {
235 printk(KERN_DEBUG "%s: NOP\n", __func__); 231 if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) !=
232 EMULATE_DONE)
233 printk(KERN_DEBUG "%s: NOP\n", __func__);
236 return; 234 return;
237 } 235 }
238 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) 236 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
@@ -240,9 +238,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
240 __func__, kvm_rip_read(vcpu), svm->next_rip); 238 __func__, kvm_rip_read(vcpu), svm->next_rip);
241 239
242 kvm_rip_write(vcpu, svm->next_rip); 240 kvm_rip_write(vcpu, svm->next_rip);
243 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 241 svm_set_interrupt_shadow(vcpu, 0);
244
245 vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK);
246} 242}
247 243
248static int has_svm(void) 244static int has_svm(void)
@@ -830,6 +826,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
830 if (!var->unusable) 826 if (!var->unusable)
831 var->type |= 0x1; 827 var->type |= 0x1;
832 break; 828 break;
829 case VCPU_SREG_SS:
830 /* On AMD CPUs sometimes the DB bit in the segment
831 * descriptor is left as 1, although the whole segment has
832 * been made unusable. Clear it here to pass an Intel VMX
833 * entry check when cross vendor migrating.
834 */
835 if (var->unusable)
836 var->db = 0;
837 break;
833 } 838 }
834} 839}
835 840
@@ -960,15 +965,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
960 965
961} 966}
962 967
963static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 968static void update_db_intercept(struct kvm_vcpu *vcpu)
964{ 969{
965 int old_debug = vcpu->guest_debug;
966 struct vcpu_svm *svm = to_svm(vcpu); 970 struct vcpu_svm *svm = to_svm(vcpu);
967 971
968 vcpu->guest_debug = dbg->control;
969
970 svm->vmcb->control.intercept_exceptions &= 972 svm->vmcb->control.intercept_exceptions &=
971 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 973 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
974
975 if (vcpu->arch.singlestep)
976 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
977
972 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 978 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
973 if (vcpu->guest_debug & 979 if (vcpu->guest_debug &
974 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 980 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -979,6 +985,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
979 1 << BP_VECTOR; 985 1 << BP_VECTOR;
980 } else 986 } else
981 vcpu->guest_debug = 0; 987 vcpu->guest_debug = 0;
988}
989
990static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
991{
992 int old_debug = vcpu->guest_debug;
993 struct vcpu_svm *svm = to_svm(vcpu);
994
995 vcpu->guest_debug = dbg->control;
996
997 update_db_intercept(vcpu);
982 998
983 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 999 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
984 svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; 1000 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
@@ -993,16 +1009,6 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
993 return 0; 1009 return 0;
994} 1010}
995 1011
996static int svm_get_irq(struct kvm_vcpu *vcpu)
997{
998 struct vcpu_svm *svm = to_svm(vcpu);
999 u32 exit_int_info = svm->vmcb->control.exit_int_info;
1000
1001 if (is_external_interrupt(exit_int_info))
1002 return exit_int_info & SVM_EVTINJ_VEC_MASK;
1003 return -1;
1004}
1005
1006static void load_host_msrs(struct kvm_vcpu *vcpu) 1012static void load_host_msrs(struct kvm_vcpu *vcpu)
1007{ 1013{
1008#ifdef CONFIG_X86_64 1014#ifdef CONFIG_X86_64
@@ -1107,17 +1113,8 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
1107 1113
1108static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1114static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1109{ 1115{
1110 u32 exit_int_info = svm->vmcb->control.exit_int_info;
1111 struct kvm *kvm = svm->vcpu.kvm;
1112 u64 fault_address; 1116 u64 fault_address;
1113 u32 error_code; 1117 u32 error_code;
1114 bool event_injection = false;
1115
1116 if (!irqchip_in_kernel(kvm) &&
1117 is_external_interrupt(exit_int_info)) {
1118 event_injection = true;
1119 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
1120 }
1121 1118
1122 fault_address = svm->vmcb->control.exit_info_2; 1119 fault_address = svm->vmcb->control.exit_info_2;
1123 error_code = svm->vmcb->control.exit_info_1; 1120 error_code = svm->vmcb->control.exit_info_1;
@@ -1137,23 +1134,40 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1137 */ 1134 */
1138 if (npt_enabled) 1135 if (npt_enabled)
1139 svm_flush_tlb(&svm->vcpu); 1136 svm_flush_tlb(&svm->vcpu);
1140 1137 else {
1141 if (!npt_enabled && event_injection) 1138 if (kvm_event_needs_reinjection(&svm->vcpu))
1142 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1139 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1140 }
1143 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1141 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1144} 1142}
1145 1143
1146static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1144static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1147{ 1145{
1148 if (!(svm->vcpu.guest_debug & 1146 if (!(svm->vcpu.guest_debug &
1149 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 1147 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1148 !svm->vcpu.arch.singlestep) {
1150 kvm_queue_exception(&svm->vcpu, DB_VECTOR); 1149 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1151 return 1; 1150 return 1;
1152 } 1151 }
1153 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1152
1154 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1153 if (svm->vcpu.arch.singlestep) {
1155 kvm_run->debug.arch.exception = DB_VECTOR; 1154 svm->vcpu.arch.singlestep = false;
1156 return 0; 1155 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1156 svm->vmcb->save.rflags &=
1157 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1158 update_db_intercept(&svm->vcpu);
1159 }
1160
1161 if (svm->vcpu.guest_debug &
1162 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
1163 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1164 kvm_run->debug.arch.pc =
1165 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1166 kvm_run->debug.arch.exception = DB_VECTOR;
1167 return 0;
1168 }
1169
1170 return 1;
1157} 1171}
1158 1172
1159static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1173static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1842,17 +1856,51 @@ static int task_switch_interception(struct vcpu_svm *svm,
1842 struct kvm_run *kvm_run) 1856 struct kvm_run *kvm_run)
1843{ 1857{
1844 u16 tss_selector; 1858 u16 tss_selector;
1859 int reason;
1860 int int_type = svm->vmcb->control.exit_int_info &
1861 SVM_EXITINTINFO_TYPE_MASK;
1862 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
1863 uint32_t type =
1864 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
1865 uint32_t idt_v =
1866 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
1845 1867
1846 tss_selector = (u16)svm->vmcb->control.exit_info_1; 1868 tss_selector = (u16)svm->vmcb->control.exit_info_1;
1869
1847 if (svm->vmcb->control.exit_info_2 & 1870 if (svm->vmcb->control.exit_info_2 &
1848 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) 1871 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
1849 return kvm_task_switch(&svm->vcpu, tss_selector, 1872 reason = TASK_SWITCH_IRET;
1850 TASK_SWITCH_IRET); 1873 else if (svm->vmcb->control.exit_info_2 &
1851 if (svm->vmcb->control.exit_info_2 & 1874 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
1852 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) 1875 reason = TASK_SWITCH_JMP;
1853 return kvm_task_switch(&svm->vcpu, tss_selector, 1876 else if (idt_v)
1854 TASK_SWITCH_JMP); 1877 reason = TASK_SWITCH_GATE;
1855 return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL); 1878 else
1879 reason = TASK_SWITCH_CALL;
1880
1881 if (reason == TASK_SWITCH_GATE) {
1882 switch (type) {
1883 case SVM_EXITINTINFO_TYPE_NMI:
1884 svm->vcpu.arch.nmi_injected = false;
1885 break;
1886 case SVM_EXITINTINFO_TYPE_EXEPT:
1887 kvm_clear_exception_queue(&svm->vcpu);
1888 break;
1889 case SVM_EXITINTINFO_TYPE_INTR:
1890 kvm_clear_interrupt_queue(&svm->vcpu);
1891 break;
1892 default:
1893 break;
1894 }
1895 }
1896
1897 if (reason != TASK_SWITCH_GATE ||
1898 int_type == SVM_EXITINTINFO_TYPE_SOFT ||
1899 (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
1900 (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
1901 skip_emulated_instruction(&svm->vcpu);
1902
1903 return kvm_task_switch(&svm->vcpu, tss_selector, reason);
1856} 1904}
1857 1905
1858static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1906static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1862,6 +1910,14 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1862 return 1; 1910 return 1;
1863} 1911}
1864 1912
1913static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1914{
1915 ++svm->vcpu.stat.nmi_window_exits;
1916 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
1917 svm->vcpu.arch.hflags |= HF_IRET_MASK;
1918 return 1;
1919}
1920
1865static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1921static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1866{ 1922{
1867 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) 1923 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
@@ -1879,8 +1935,14 @@ static int emulate_on_interception(struct vcpu_svm *svm,
1879 1935
1880static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1936static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1881{ 1937{
1938 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
1939 /* instruction emulation calls kvm_set_cr8() */
1882 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); 1940 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
1883 if (irqchip_in_kernel(svm->vcpu.kvm)) 1941 if (irqchip_in_kernel(svm->vcpu.kvm)) {
1942 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
1943 return 1;
1944 }
1945 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
1884 return 1; 1946 return 1;
1885 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 1947 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1886 return 0; 1948 return 0;
@@ -2090,8 +2152,9 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
2090 * If the user space waits to inject interrupts, exit as soon as 2152 * If the user space waits to inject interrupts, exit as soon as
2091 * possible 2153 * possible
2092 */ 2154 */
2093 if (kvm_run->request_interrupt_window && 2155 if (!irqchip_in_kernel(svm->vcpu.kvm) &&
2094 !svm->vcpu.arch.irq_summary) { 2156 kvm_run->request_interrupt_window &&
2157 !kvm_cpu_has_interrupt(&svm->vcpu)) {
2095 ++svm->vcpu.stat.irq_window_exits; 2158 ++svm->vcpu.stat.irq_window_exits;
2096 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2159 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2097 return 0; 2160 return 0;
@@ -2134,6 +2197,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2134 [SVM_EXIT_VINTR] = interrupt_window_interception, 2197 [SVM_EXIT_VINTR] = interrupt_window_interception,
2135 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ 2198 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
2136 [SVM_EXIT_CPUID] = cpuid_interception, 2199 [SVM_EXIT_CPUID] = cpuid_interception,
2200 [SVM_EXIT_IRET] = iret_interception,
2137 [SVM_EXIT_INVD] = emulate_on_interception, 2201 [SVM_EXIT_INVD] = emulate_on_interception,
2138 [SVM_EXIT_HLT] = halt_interception, 2202 [SVM_EXIT_HLT] = halt_interception,
2139 [SVM_EXIT_INVLPG] = invlpg_interception, 2203 [SVM_EXIT_INVLPG] = invlpg_interception,
@@ -2194,7 +2258,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2194 } 2258 }
2195 } 2259 }
2196 2260
2197 kvm_reput_irq(svm);
2198 2261
2199 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 2262 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2200 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2263 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2205,7 +2268,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2205 2268
2206 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 2269 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
2207 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 2270 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
2208 exit_code != SVM_EXIT_NPF) 2271 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
2209 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " 2272 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
2210 "exit_code 0x%x\n", 2273 "exit_code 0x%x\n",
2211 __func__, svm->vmcb->control.exit_int_info, 2274 __func__, svm->vmcb->control.exit_int_info,
@@ -2242,6 +2305,15 @@ static void pre_svm_run(struct vcpu_svm *svm)
2242 new_asid(svm, svm_data); 2305 new_asid(svm, svm_data);
2243} 2306}
2244 2307
2308static void svm_inject_nmi(struct kvm_vcpu *vcpu)
2309{
2310 struct vcpu_svm *svm = to_svm(vcpu);
2311
2312 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
2313 vcpu->arch.hflags |= HF_NMI_MASK;
2314 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
2315 ++vcpu->stat.nmi_injections;
2316}
2245 2317
2246static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) 2318static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2247{ 2319{
@@ -2257,134 +2329,71 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2257 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 2329 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
2258} 2330}
2259 2331
2260static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) 2332static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
2261{ 2333{
2262 struct vcpu_svm *svm = to_svm(vcpu); 2334 struct vcpu_svm *svm = to_svm(vcpu);
2263 2335
2264 nested_svm_intr(svm); 2336 svm->vmcb->control.event_inj = nr |
2265 2337 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2266 svm_inject_irq(svm, irq);
2267} 2338}
2268 2339
2269static void update_cr8_intercept(struct kvm_vcpu *vcpu) 2340static void svm_set_irq(struct kvm_vcpu *vcpu)
2270{ 2341{
2271 struct vcpu_svm *svm = to_svm(vcpu); 2342 struct vcpu_svm *svm = to_svm(vcpu);
2272 struct vmcb *vmcb = svm->vmcb;
2273 int max_irr, tpr;
2274 2343
2275 if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr) 2344 nested_svm_intr(svm);
2276 return;
2277 2345
2278 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2346 svm_queue_irq(vcpu, vcpu->arch.interrupt.nr);
2347}
2279 2348
2280 max_irr = kvm_lapic_find_highest_irr(vcpu); 2349static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
2281 if (max_irr == -1) 2350{
2282 return; 2351 struct vcpu_svm *svm = to_svm(vcpu);
2283 2352
2284 tpr = kvm_lapic_get_cr8(vcpu) << 4; 2353 if (irr == -1)
2354 return;
2285 2355
2286 if (tpr >= (max_irr & 0xf0)) 2356 if (tpr >= irr)
2287 vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; 2357 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
2288} 2358}
2289 2359
2290static void svm_intr_assist(struct kvm_vcpu *vcpu) 2360static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2291{ 2361{
2292 struct vcpu_svm *svm = to_svm(vcpu); 2362 struct vcpu_svm *svm = to_svm(vcpu);
2293 struct vmcb *vmcb = svm->vmcb; 2363 struct vmcb *vmcb = svm->vmcb;
2294 int intr_vector = -1; 2364 return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2295 2365 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2296 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
2297 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
2298 intr_vector = vmcb->control.exit_int_info &
2299 SVM_EVTINJ_VEC_MASK;
2300 vmcb->control.exit_int_info = 0;
2301 svm_inject_irq(svm, intr_vector);
2302 goto out;
2303 }
2304
2305 if (vmcb->control.int_ctl & V_IRQ_MASK)
2306 goto out;
2307
2308 if (!kvm_cpu_has_interrupt(vcpu))
2309 goto out;
2310
2311 if (nested_svm_intr(svm))
2312 goto out;
2313
2314 if (!(svm->vcpu.arch.hflags & HF_GIF_MASK))
2315 goto out;
2316
2317 if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
2318 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
2319 (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
2320 /* unable to deliver irq, set pending irq */
2321 svm_set_vintr(svm);
2322 svm_inject_irq(svm, 0x0);
2323 goto out;
2324 }
2325 /* Okay, we can deliver the interrupt: grab it and update PIC state. */
2326 intr_vector = kvm_cpu_get_interrupt(vcpu);
2327 svm_inject_irq(svm, intr_vector);
2328out:
2329 update_cr8_intercept(vcpu);
2330} 2366}
2331 2367
2332static void kvm_reput_irq(struct vcpu_svm *svm) 2368static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2333{ 2369{
2334 struct vmcb_control_area *control = &svm->vmcb->control; 2370 struct vcpu_svm *svm = to_svm(vcpu);
2335 2371 struct vmcb *vmcb = svm->vmcb;
2336 if ((control->int_ctl & V_IRQ_MASK) 2372 return (vmcb->save.rflags & X86_EFLAGS_IF) &&
2337 && !irqchip_in_kernel(svm->vcpu.kvm)) { 2373 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2338 control->int_ctl &= ~V_IRQ_MASK; 2374 (svm->vcpu.arch.hflags & HF_GIF_MASK);
2339 push_irq(&svm->vcpu, control->int_vector);
2340 }
2341
2342 svm->vcpu.arch.interrupt_window_open =
2343 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2344 (svm->vcpu.arch.hflags & HF_GIF_MASK);
2345} 2375}
2346 2376
2347static void svm_do_inject_vector(struct vcpu_svm *svm) 2377static void enable_irq_window(struct kvm_vcpu *vcpu)
2348{ 2378{
2349 struct kvm_vcpu *vcpu = &svm->vcpu; 2379 svm_set_vintr(to_svm(vcpu));
2350 int word_index = __ffs(vcpu->arch.irq_summary); 2380 svm_inject_irq(to_svm(vcpu), 0x0);
2351 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
2352 int irq = word_index * BITS_PER_LONG + bit_index;
2353
2354 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
2355 if (!vcpu->arch.irq_pending[word_index])
2356 clear_bit(word_index, &vcpu->arch.irq_summary);
2357 svm_inject_irq(svm, irq);
2358} 2381}
2359 2382
2360static void do_interrupt_requests(struct kvm_vcpu *vcpu, 2383static void enable_nmi_window(struct kvm_vcpu *vcpu)
2361 struct kvm_run *kvm_run)
2362{ 2384{
2363 struct vcpu_svm *svm = to_svm(vcpu); 2385 struct vcpu_svm *svm = to_svm(vcpu);
2364 struct vmcb_control_area *control = &svm->vmcb->control;
2365
2366 if (nested_svm_intr(svm))
2367 return;
2368 2386
2369 svm->vcpu.arch.interrupt_window_open = 2387 if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
2370 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && 2388 == HF_NMI_MASK)
2371 (svm->vmcb->save.rflags & X86_EFLAGS_IF) && 2389 return; /* IRET will cause a vm exit */
2372 (svm->vcpu.arch.hflags & HF_GIF_MASK));
2373 2390
 2374 if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) 2391 /* Something prevents NMI from being injected. Single step over
2375 /* 2392 possible problem (IRET or exception injection or interrupt
2376 * If interrupts enabled, and not blocked by sti or mov ss. Good. 2393 shadow) */
2377 */ 2394 vcpu->arch.singlestep = true;
2378 svm_do_inject_vector(svm); 2395 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2379 2396 update_db_intercept(vcpu);
2380 /*
2381 * Interrupts blocked. Wait for unblock.
2382 */
2383 if (!svm->vcpu.arch.interrupt_window_open &&
2384 (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
2385 svm_set_vintr(svm);
2386 else
2387 svm_clear_vintr(svm);
2388} 2397}
2389 2398
2390static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 2399static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2407,7 +2416,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
2407 2416
2408 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 2417 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
2409 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 2418 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
2410 kvm_lapic_set_tpr(vcpu, cr8); 2419 kvm_set_cr8(vcpu, cr8);
2411 } 2420 }
2412} 2421}
2413 2422
@@ -2416,14 +2425,54 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
2416 struct vcpu_svm *svm = to_svm(vcpu); 2425 struct vcpu_svm *svm = to_svm(vcpu);
2417 u64 cr8; 2426 u64 cr8;
2418 2427
2419 if (!irqchip_in_kernel(vcpu->kvm))
2420 return;
2421
2422 cr8 = kvm_get_cr8(vcpu); 2428 cr8 = kvm_get_cr8(vcpu);
2423 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 2429 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
2424 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 2430 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
2425} 2431}
2426 2432
2433static void svm_complete_interrupts(struct vcpu_svm *svm)
2434{
2435 u8 vector;
2436 int type;
2437 u32 exitintinfo = svm->vmcb->control.exit_int_info;
2438
2439 if (svm->vcpu.arch.hflags & HF_IRET_MASK)
2440 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
2441
2442 svm->vcpu.arch.nmi_injected = false;
2443 kvm_clear_exception_queue(&svm->vcpu);
2444 kvm_clear_interrupt_queue(&svm->vcpu);
2445
2446 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
2447 return;
2448
2449 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
2450 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
2451
2452 switch (type) {
2453 case SVM_EXITINTINFO_TYPE_NMI:
2454 svm->vcpu.arch.nmi_injected = true;
2455 break;
2456 case SVM_EXITINTINFO_TYPE_EXEPT:
2457 /* In case of software exception do not reinject an exception
 2458 vector, but re-execute the instruction instead */
2459 if (kvm_exception_is_soft(vector))
2460 break;
2461 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
2462 u32 err = svm->vmcb->control.exit_int_info_err;
2463 kvm_queue_exception_e(&svm->vcpu, vector, err);
2464
2465 } else
2466 kvm_queue_exception(&svm->vcpu, vector);
2467 break;
2468 case SVM_EXITINTINFO_TYPE_INTR:
2469 kvm_queue_interrupt(&svm->vcpu, vector, false);
2470 break;
2471 default:
2472 break;
2473 }
2474}
2475
2427#ifdef CONFIG_X86_64 2476#ifdef CONFIG_X86_64
2428#define R "r" 2477#define R "r"
2429#else 2478#else
@@ -2552,6 +2601,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2552 sync_cr8_to_lapic(vcpu); 2601 sync_cr8_to_lapic(vcpu);
2553 2602
2554 svm->next_rip = 0; 2603 svm->next_rip = 0;
2604
2605 svm_complete_interrupts(svm);
2555} 2606}
2556 2607
2557#undef R 2608#undef R
@@ -2617,7 +2668,7 @@ static int get_npt_level(void)
2617#endif 2668#endif
2618} 2669}
2619 2670
2620static int svm_get_mt_mask_shift(void) 2671static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
2621{ 2672{
2622 return 0; 2673 return 0;
2623} 2674}
@@ -2667,17 +2718,21 @@ static struct kvm_x86_ops svm_x86_ops = {
2667 .run = svm_vcpu_run, 2718 .run = svm_vcpu_run,
2668 .handle_exit = handle_exit, 2719 .handle_exit = handle_exit,
2669 .skip_emulated_instruction = skip_emulated_instruction, 2720 .skip_emulated_instruction = skip_emulated_instruction,
2721 .set_interrupt_shadow = svm_set_interrupt_shadow,
2722 .get_interrupt_shadow = svm_get_interrupt_shadow,
2670 .patch_hypercall = svm_patch_hypercall, 2723 .patch_hypercall = svm_patch_hypercall,
2671 .get_irq = svm_get_irq,
2672 .set_irq = svm_set_irq, 2724 .set_irq = svm_set_irq,
2725 .set_nmi = svm_inject_nmi,
2673 .queue_exception = svm_queue_exception, 2726 .queue_exception = svm_queue_exception,
2674 .exception_injected = svm_exception_injected, 2727 .interrupt_allowed = svm_interrupt_allowed,
2675 .inject_pending_irq = svm_intr_assist, 2728 .nmi_allowed = svm_nmi_allowed,
2676 .inject_pending_vectors = do_interrupt_requests, 2729 .enable_nmi_window = enable_nmi_window,
2730 .enable_irq_window = enable_irq_window,
2731 .update_cr8_intercept = update_cr8_intercept,
2677 2732
2678 .set_tss_addr = svm_set_tss_addr, 2733 .set_tss_addr = svm_set_tss_addr,
2679 .get_tdp_level = get_npt_level, 2734 .get_tdp_level = get_npt_level,
2680 .get_mt_mask_shift = svm_get_mt_mask_shift, 2735 .get_mt_mask = svm_get_mt_mask,
2681}; 2736};
2682 2737
2683static int __init svm_init(void) 2738static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
new file mode 100644
index 00000000000..86dbac072d0
--- /dev/null
+++ b/arch/x86/kvm/timer.c
@@ -0,0 +1,46 @@
1#include <linux/kvm_host.h>
2#include <linux/kvm.h>
3#include <linux/hrtimer.h>
4#include <asm/atomic.h>
5#include "kvm_timer.h"
6
7static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
8{
9 int restart_timer = 0;
10 wait_queue_head_t *q = &vcpu->wq;
11
12 /* FIXME: this code should not know anything about vcpus */
13 if (!atomic_inc_and_test(&ktimer->pending))
14 set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
15
16 if (!ktimer->reinject)
17 atomic_set(&ktimer->pending, 1);
18
19 if (waitqueue_active(q))
20 wake_up_interruptible(q);
21
22 if (ktimer->t_ops->is_periodic(ktimer)) {
23 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
24 restart_timer = 1;
25 }
26
27 return restart_timer;
28}
29
30enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
31{
32 int restart_timer;
33 struct kvm_vcpu *vcpu;
34 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
35
36 vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id];
37 if (!vcpu)
38 return HRTIMER_NORESTART;
39
40 restart_timer = __kvm_timer_fn(vcpu, ktimer);
41 if (restart_timer)
42 return HRTIMER_RESTART;
43 else
44 return HRTIMER_NORESTART;
45}
46
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bb481330716..e770bf349ec 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -32,26 +32,27 @@
32#include <asm/desc.h> 32#include <asm/desc.h>
33#include <asm/vmx.h> 33#include <asm/vmx.h>
34#include <asm/virtext.h> 34#include <asm/virtext.h>
35#include <asm/mce.h>
35 36
36#define __ex(x) __kvm_handle_fault_on_reboot(x) 37#define __ex(x) __kvm_handle_fault_on_reboot(x)
37 38
38MODULE_AUTHOR("Qumranet"); 39MODULE_AUTHOR("Qumranet");
39MODULE_LICENSE("GPL"); 40MODULE_LICENSE("GPL");
40 41
41static int bypass_guest_pf = 1; 42static int __read_mostly bypass_guest_pf = 1;
42module_param(bypass_guest_pf, bool, 0); 43module_param(bypass_guest_pf, bool, S_IRUGO);
43 44
44static int enable_vpid = 1; 45static int __read_mostly enable_vpid = 1;
45module_param(enable_vpid, bool, 0); 46module_param_named(vpid, enable_vpid, bool, 0444);
46 47
47static int flexpriority_enabled = 1; 48static int __read_mostly flexpriority_enabled = 1;
48module_param(flexpriority_enabled, bool, 0); 49module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
49 50
50static int enable_ept = 1; 51static int __read_mostly enable_ept = 1;
51module_param(enable_ept, bool, 0); 52module_param_named(ept, enable_ept, bool, S_IRUGO);
52 53
53static int emulate_invalid_guest_state = 0; 54static int __read_mostly emulate_invalid_guest_state = 0;
54module_param(emulate_invalid_guest_state, bool, 0); 55module_param(emulate_invalid_guest_state, bool, S_IRUGO);
55 56
56struct vmcs { 57struct vmcs {
57 u32 revision_id; 58 u32 revision_id;
@@ -97,6 +98,7 @@ struct vcpu_vmx {
97 int soft_vnmi_blocked; 98 int soft_vnmi_blocked;
98 ktime_t entry_time; 99 ktime_t entry_time;
99 s64 vnmi_blocked_time; 100 s64 vnmi_blocked_time;
101 u32 exit_reason;
100}; 102};
101 103
102static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 104static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -111,9 +113,10 @@ static DEFINE_PER_CPU(struct vmcs *, vmxarea);
111static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 113static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
112static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 114static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
113 115
114static struct page *vmx_io_bitmap_a; 116static unsigned long *vmx_io_bitmap_a;
115static struct page *vmx_io_bitmap_b; 117static unsigned long *vmx_io_bitmap_b;
116static struct page *vmx_msr_bitmap; 118static unsigned long *vmx_msr_bitmap_legacy;
119static unsigned long *vmx_msr_bitmap_longmode;
117 120
118static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 121static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
119static DEFINE_SPINLOCK(vmx_vpid_lock); 122static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -213,70 +216,78 @@ static inline int is_external_interrupt(u32 intr_info)
213 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 216 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
214} 217}
215 218
219static inline int is_machine_check(u32 intr_info)
220{
221 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
222 INTR_INFO_VALID_MASK)) ==
223 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
224}
225
216static inline int cpu_has_vmx_msr_bitmap(void) 226static inline int cpu_has_vmx_msr_bitmap(void)
217{ 227{
218 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS); 228 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
219} 229}
220 230
221static inline int cpu_has_vmx_tpr_shadow(void) 231static inline int cpu_has_vmx_tpr_shadow(void)
222{ 232{
223 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); 233 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
224} 234}
225 235
226static inline int vm_need_tpr_shadow(struct kvm *kvm) 236static inline int vm_need_tpr_shadow(struct kvm *kvm)
227{ 237{
228 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); 238 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
229} 239}
230 240
231static inline int cpu_has_secondary_exec_ctrls(void) 241static inline int cpu_has_secondary_exec_ctrls(void)
232{ 242{
233 return (vmcs_config.cpu_based_exec_ctrl & 243 return vmcs_config.cpu_based_exec_ctrl &
234 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); 244 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
235} 245}
236 246
237static inline bool cpu_has_vmx_virtualize_apic_accesses(void) 247static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
238{ 248{
239 return flexpriority_enabled 249 return vmcs_config.cpu_based_2nd_exec_ctrl &
240 && (vmcs_config.cpu_based_2nd_exec_ctrl & 250 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
241 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 251}
252
253static inline bool cpu_has_vmx_flexpriority(void)
254{
255 return cpu_has_vmx_tpr_shadow() &&
256 cpu_has_vmx_virtualize_apic_accesses();
242} 257}
243 258
244static inline int cpu_has_vmx_invept_individual_addr(void) 259static inline int cpu_has_vmx_invept_individual_addr(void)
245{ 260{
246 return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT)); 261 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
247} 262}
248 263
249static inline int cpu_has_vmx_invept_context(void) 264static inline int cpu_has_vmx_invept_context(void)
250{ 265{
251 return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT)); 266 return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
252} 267}
253 268
254static inline int cpu_has_vmx_invept_global(void) 269static inline int cpu_has_vmx_invept_global(void)
255{ 270{
256 return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT)); 271 return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
257} 272}
258 273
259static inline int cpu_has_vmx_ept(void) 274static inline int cpu_has_vmx_ept(void)
260{ 275{
261 return (vmcs_config.cpu_based_2nd_exec_ctrl & 276 return vmcs_config.cpu_based_2nd_exec_ctrl &
262 SECONDARY_EXEC_ENABLE_EPT); 277 SECONDARY_EXEC_ENABLE_EPT;
263}
264
265static inline int vm_need_ept(void)
266{
267 return (cpu_has_vmx_ept() && enable_ept);
268} 278}
269 279
270static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 280static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
271{ 281{
272 return ((cpu_has_vmx_virtualize_apic_accesses()) && 282 return flexpriority_enabled &&
273 (irqchip_in_kernel(kvm))); 283 (cpu_has_vmx_virtualize_apic_accesses()) &&
284 (irqchip_in_kernel(kvm));
274} 285}
275 286
276static inline int cpu_has_vmx_vpid(void) 287static inline int cpu_has_vmx_vpid(void)
277{ 288{
278 return (vmcs_config.cpu_based_2nd_exec_ctrl & 289 return vmcs_config.cpu_based_2nd_exec_ctrl &
279 SECONDARY_EXEC_ENABLE_VPID); 290 SECONDARY_EXEC_ENABLE_VPID;
280} 291}
281 292
282static inline int cpu_has_virtual_nmis(void) 293static inline int cpu_has_virtual_nmis(void)
@@ -284,6 +295,11 @@ static inline int cpu_has_virtual_nmis(void)
284 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 295 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
285} 296}
286 297
298static inline bool report_flexpriority(void)
299{
300 return flexpriority_enabled;
301}
302
287static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 303static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
288{ 304{
289 int i; 305 int i;
@@ -381,7 +397,7 @@ static inline void ept_sync_global(void)
381 397
382static inline void ept_sync_context(u64 eptp) 398static inline void ept_sync_context(u64 eptp)
383{ 399{
384 if (vm_need_ept()) { 400 if (enable_ept) {
385 if (cpu_has_vmx_invept_context()) 401 if (cpu_has_vmx_invept_context())
386 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); 402 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
387 else 403 else
@@ -391,7 +407,7 @@ static inline void ept_sync_context(u64 eptp)
391 407
392static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) 408static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
393{ 409{
394 if (vm_need_ept()) { 410 if (enable_ept) {
395 if (cpu_has_vmx_invept_individual_addr()) 411 if (cpu_has_vmx_invept_individual_addr())
396 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, 412 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
397 eptp, gpa); 413 eptp, gpa);
@@ -478,7 +494,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
478{ 494{
479 u32 eb; 495 u32 eb;
480 496
481 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); 497 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
482 if (!vcpu->fpu_active) 498 if (!vcpu->fpu_active)
483 eb |= 1u << NM_VECTOR; 499 eb |= 1u << NM_VECTOR;
484 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 500 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -488,9 +504,9 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
488 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 504 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
489 eb |= 1u << BP_VECTOR; 505 eb |= 1u << BP_VECTOR;
490 } 506 }
491 if (vcpu->arch.rmode.active) 507 if (vcpu->arch.rmode.vm86_active)
492 eb = ~0; 508 eb = ~0;
493 if (vm_need_ept()) 509 if (enable_ept)
494 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 510 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
495 vmcs_write32(EXCEPTION_BITMAP, eb); 511 vmcs_write32(EXCEPTION_BITMAP, eb);
496} 512}
@@ -724,29 +740,50 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
724 740
725static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 741static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
726{ 742{
727 if (vcpu->arch.rmode.active) 743 if (vcpu->arch.rmode.vm86_active)
728 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 744 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
729 vmcs_writel(GUEST_RFLAGS, rflags); 745 vmcs_writel(GUEST_RFLAGS, rflags);
730} 746}
731 747
748static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
749{
750 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
751 int ret = 0;
752
753 if (interruptibility & GUEST_INTR_STATE_STI)
754 ret |= X86_SHADOW_INT_STI;
755 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
756 ret |= X86_SHADOW_INT_MOV_SS;
757
758 return ret & mask;
759}
760
761static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
762{
763 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
764 u32 interruptibility = interruptibility_old;
765
766 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
767
768 if (mask & X86_SHADOW_INT_MOV_SS)
769 interruptibility |= GUEST_INTR_STATE_MOV_SS;
770 if (mask & X86_SHADOW_INT_STI)
771 interruptibility |= GUEST_INTR_STATE_STI;
772
773 if ((interruptibility != interruptibility_old))
774 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
775}
776
732static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 777static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
733{ 778{
734 unsigned long rip; 779 unsigned long rip;
735 u32 interruptibility;
736 780
737 rip = kvm_rip_read(vcpu); 781 rip = kvm_rip_read(vcpu);
738 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 782 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
739 kvm_rip_write(vcpu, rip); 783 kvm_rip_write(vcpu, rip);
740 784
741 /* 785 /* skipping an emulated instruction also counts */
742 * We emulated an instruction, so temporary interrupt blocking 786 vmx_set_interrupt_shadow(vcpu, 0);
743 * should be removed, if set.
744 */
745 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
746 if (interruptibility & 3)
747 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
748 interruptibility & ~3);
749 vcpu->arch.interrupt_window_open = 1;
750} 787}
751 788
752static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 789static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -760,7 +797,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
760 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 797 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
761 } 798 }
762 799
763 if (vcpu->arch.rmode.active) { 800 if (vcpu->arch.rmode.vm86_active) {
764 vmx->rmode.irq.pending = true; 801 vmx->rmode.irq.pending = true;
765 vmx->rmode.irq.vector = nr; 802 vmx->rmode.irq.vector = nr;
766 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 803 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -773,8 +810,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
773 return; 810 return;
774 } 811 }
775 812
776 if (nr == BP_VECTOR || nr == OF_VECTOR) { 813 if (kvm_exception_is_soft(nr)) {
777 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 814 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
815 vmx->vcpu.arch.event_exit_inst_len);
778 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 816 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
779 } else 817 } else
780 intr_info |= INTR_TYPE_HARD_EXCEPTION; 818 intr_info |= INTR_TYPE_HARD_EXCEPTION;
@@ -782,11 +820,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
782 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 820 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
783} 821}
784 822
785static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
786{
787 return false;
788}
789
790/* 823/*
791 * Swap MSR entry in host/guest MSR entry array. 824 * Swap MSR entry in host/guest MSR entry array.
792 */ 825 */
@@ -812,6 +845,7 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
812static void setup_msrs(struct vcpu_vmx *vmx) 845static void setup_msrs(struct vcpu_vmx *vmx)
813{ 846{
814 int save_nmsrs; 847 int save_nmsrs;
848 unsigned long *msr_bitmap;
815 849
816 vmx_load_host_state(vmx); 850 vmx_load_host_state(vmx);
817 save_nmsrs = 0; 851 save_nmsrs = 0;
@@ -847,6 +881,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
847 __find_msr_index(vmx, MSR_KERNEL_GS_BASE); 881 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
848#endif 882#endif
849 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); 883 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
884
885 if (cpu_has_vmx_msr_bitmap()) {
886 if (is_long_mode(&vmx->vcpu))
887 msr_bitmap = vmx_msr_bitmap_longmode;
888 else
889 msr_bitmap = vmx_msr_bitmap_legacy;
890
891 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
892 }
850} 893}
851 894
852/* 895/*
@@ -1034,13 +1077,6 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1034 return 0; 1077 return 0;
1035} 1078}
1036 1079
1037static int vmx_get_irq(struct kvm_vcpu *vcpu)
1038{
1039 if (!vcpu->arch.interrupt.pending)
1040 return -1;
1041 return vcpu->arch.interrupt.nr;
1042}
1043
1044static __init int cpu_has_kvm_support(void) 1080static __init int cpu_has_kvm_support(void)
1045{ 1081{
1046 return cpu_has_vmx(); 1082 return cpu_has_vmx();
@@ -1241,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
1241 struct page *pages; 1277 struct page *pages;
1242 struct vmcs *vmcs; 1278 struct vmcs *vmcs;
1243 1279
1244 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); 1280 pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
1245 if (!pages) 1281 if (!pages)
1246 return NULL; 1282 return NULL;
1247 vmcs = page_address(pages); 1283 vmcs = page_address(pages);
@@ -1294,6 +1330,18 @@ static __init int hardware_setup(void)
1294 if (boot_cpu_has(X86_FEATURE_NX)) 1330 if (boot_cpu_has(X86_FEATURE_NX))
1295 kvm_enable_efer_bits(EFER_NX); 1331 kvm_enable_efer_bits(EFER_NX);
1296 1332
1333 if (!cpu_has_vmx_vpid())
1334 enable_vpid = 0;
1335
1336 if (!cpu_has_vmx_ept())
1337 enable_ept = 0;
1338
1339 if (!cpu_has_vmx_flexpriority())
1340 flexpriority_enabled = 0;
1341
1342 if (!cpu_has_vmx_tpr_shadow())
1343 kvm_x86_ops->update_cr8_intercept = NULL;
1344
1297 return alloc_kvm_area(); 1345 return alloc_kvm_area();
1298} 1346}
1299 1347
@@ -1324,7 +1372,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1324 struct vcpu_vmx *vmx = to_vmx(vcpu); 1372 struct vcpu_vmx *vmx = to_vmx(vcpu);
1325 1373
1326 vmx->emulation_required = 1; 1374 vmx->emulation_required = 1;
1327 vcpu->arch.rmode.active = 0; 1375 vcpu->arch.rmode.vm86_active = 0;
1328 1376
1329 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); 1377 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1330 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); 1378 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
@@ -1386,7 +1434,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1386 struct vcpu_vmx *vmx = to_vmx(vcpu); 1434 struct vcpu_vmx *vmx = to_vmx(vcpu);
1387 1435
1388 vmx->emulation_required = 1; 1436 vmx->emulation_required = 1;
1389 vcpu->arch.rmode.active = 1; 1437 vcpu->arch.rmode.vm86_active = 1;
1390 1438
1391 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1439 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1392 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1440 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1485,7 +1533,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1485static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1533static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1486{ 1534{
1487 vpid_sync_vcpu_all(to_vmx(vcpu)); 1535 vpid_sync_vcpu_all(to_vmx(vcpu));
1488 if (vm_need_ept()) 1536 if (enable_ept)
1489 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1537 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1490} 1538}
1491 1539
@@ -1555,10 +1603,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1555 1603
1556 vmx_fpu_deactivate(vcpu); 1604 vmx_fpu_deactivate(vcpu);
1557 1605
1558 if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) 1606 if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
1559 enter_pmode(vcpu); 1607 enter_pmode(vcpu);
1560 1608
1561 if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) 1609 if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
1562 enter_rmode(vcpu); 1610 enter_rmode(vcpu);
1563 1611
1564#ifdef CONFIG_X86_64 1612#ifdef CONFIG_X86_64
@@ -1570,7 +1618,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1570 } 1618 }
1571#endif 1619#endif
1572 1620
1573 if (vm_need_ept()) 1621 if (enable_ept)
1574 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 1622 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1575 1623
1576 vmcs_writel(CR0_READ_SHADOW, cr0); 1624 vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -1599,7 +1647,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1599 u64 eptp; 1647 u64 eptp;
1600 1648
1601 guest_cr3 = cr3; 1649 guest_cr3 = cr3;
1602 if (vm_need_ept()) { 1650 if (enable_ept) {
1603 eptp = construct_eptp(cr3); 1651 eptp = construct_eptp(cr3);
1604 vmcs_write64(EPT_POINTER, eptp); 1652 vmcs_write64(EPT_POINTER, eptp);
1605 ept_sync_context(eptp); 1653 ept_sync_context(eptp);
@@ -1616,11 +1664,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1616 1664
1617static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1665static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1618{ 1666{
1619 unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ? 1667 unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
1620 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 1668 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1621 1669
1622 vcpu->arch.cr4 = cr4; 1670 vcpu->arch.cr4 = cr4;
1623 if (vm_need_ept()) 1671 if (enable_ept)
1624 ept_update_paging_mode_cr4(&hw_cr4, vcpu); 1672 ept_update_paging_mode_cr4(&hw_cr4, vcpu);
1625 1673
1626 vmcs_writel(CR4_READ_SHADOW, cr4); 1674 vmcs_writel(CR4_READ_SHADOW, cr4);
@@ -1699,7 +1747,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
1699 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 1747 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1700 u32 ar; 1748 u32 ar;
1701 1749
1702 if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { 1750 if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
1703 vcpu->arch.rmode.tr.selector = var->selector; 1751 vcpu->arch.rmode.tr.selector = var->selector;
1704 vcpu->arch.rmode.tr.base = var->base; 1752 vcpu->arch.rmode.tr.base = var->base;
1705 vcpu->arch.rmode.tr.limit = var->limit; 1753 vcpu->arch.rmode.tr.limit = var->limit;
@@ -1709,7 +1757,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
1709 vmcs_writel(sf->base, var->base); 1757 vmcs_writel(sf->base, var->base);
1710 vmcs_write32(sf->limit, var->limit); 1758 vmcs_write32(sf->limit, var->limit);
1711 vmcs_write16(sf->selector, var->selector); 1759 vmcs_write16(sf->selector, var->selector);
1712 if (vcpu->arch.rmode.active && var->s) { 1760 if (vcpu->arch.rmode.vm86_active && var->s) {
1713 /* 1761 /*
1714 * Hack real-mode segments into vm86 compatibility. 1762 * Hack real-mode segments into vm86 compatibility.
1715 */ 1763 */
@@ -1982,7 +2030,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
1982 pfn_t identity_map_pfn; 2030 pfn_t identity_map_pfn;
1983 u32 tmp; 2031 u32 tmp;
1984 2032
1985 if (!vm_need_ept()) 2033 if (!enable_ept)
1986 return 1; 2034 return 1;
1987 if (unlikely(!kvm->arch.ept_identity_pagetable)) { 2035 if (unlikely(!kvm->arch.ept_identity_pagetable)) {
1988 printk(KERN_ERR "EPT: identity-mapping pagetable " 2036 printk(KERN_ERR "EPT: identity-mapping pagetable "
@@ -2071,7 +2119,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
2071 int vpid; 2119 int vpid;
2072 2120
2073 vmx->vpid = 0; 2121 vmx->vpid = 0;
2074 if (!enable_vpid || !cpu_has_vmx_vpid()) 2122 if (!enable_vpid)
2075 return; 2123 return;
2076 spin_lock(&vmx_vpid_lock); 2124 spin_lock(&vmx_vpid_lock);
2077 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 2125 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
@@ -2082,9 +2130,9 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
2082 spin_unlock(&vmx_vpid_lock); 2130 spin_unlock(&vmx_vpid_lock);
2083} 2131}
2084 2132
2085static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) 2133static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
2086{ 2134{
2087 void *va; 2135 int f = sizeof(unsigned long);
2088 2136
2089 if (!cpu_has_vmx_msr_bitmap()) 2137 if (!cpu_has_vmx_msr_bitmap())
2090 return; 2138 return;
@@ -2094,16 +2142,21 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
2094 * have the write-low and read-high bitmap offsets the wrong way round. 2142 * have the write-low and read-high bitmap offsets the wrong way round.
2095 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 2143 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
2096 */ 2144 */
2097 va = kmap(msr_bitmap);
2098 if (msr <= 0x1fff) { 2145 if (msr <= 0x1fff) {
2099 __clear_bit(msr, va + 0x000); /* read-low */ 2146 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
2100 __clear_bit(msr, va + 0x800); /* write-low */ 2147 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
2101 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 2148 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2102 msr &= 0x1fff; 2149 msr &= 0x1fff;
2103 __clear_bit(msr, va + 0x400); /* read-high */ 2150 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
2104 __clear_bit(msr, va + 0xc00); /* write-high */ 2151 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
2105 } 2152 }
2106 kunmap(msr_bitmap); 2153}
2154
2155static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2156{
2157 if (!longmode_only)
2158 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
2159 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
2107} 2160}
2108 2161
2109/* 2162/*
@@ -2121,11 +2174,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2121 u32 exec_control; 2174 u32 exec_control;
2122 2175
2123 /* I/O */ 2176 /* I/O */
2124 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); 2177 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
2125 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); 2178 vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
2126 2179
2127 if (cpu_has_vmx_msr_bitmap()) 2180 if (cpu_has_vmx_msr_bitmap())
2128 vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap)); 2181 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
2129 2182
2130 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 2183 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
2131 2184
@@ -2141,7 +2194,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2141 CPU_BASED_CR8_LOAD_EXITING; 2194 CPU_BASED_CR8_LOAD_EXITING;
2142#endif 2195#endif
2143 } 2196 }
2144 if (!vm_need_ept()) 2197 if (!enable_ept)
2145 exec_control |= CPU_BASED_CR3_STORE_EXITING | 2198 exec_control |= CPU_BASED_CR3_STORE_EXITING |
2146 CPU_BASED_CR3_LOAD_EXITING | 2199 CPU_BASED_CR3_LOAD_EXITING |
2147 CPU_BASED_INVLPG_EXITING; 2200 CPU_BASED_INVLPG_EXITING;
@@ -2154,7 +2207,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2154 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2207 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2155 if (vmx->vpid == 0) 2208 if (vmx->vpid == 0)
2156 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 2209 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2157 if (!vm_need_ept()) 2210 if (!enable_ept)
2158 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 2211 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2159 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2212 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2160 } 2213 }
@@ -2273,7 +2326,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2273 goto out; 2326 goto out;
2274 } 2327 }
2275 2328
2276 vmx->vcpu.arch.rmode.active = 0; 2329 vmx->vcpu.arch.rmode.vm86_active = 0;
2277 2330
2278 vmx->soft_vnmi_blocked = 0; 2331 vmx->soft_vnmi_blocked = 0;
2279 2332
@@ -2402,14 +2455,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2402 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2455 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2403} 2456}
2404 2457
2405static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) 2458static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2406{ 2459{
2407 struct vcpu_vmx *vmx = to_vmx(vcpu); 2460 struct vcpu_vmx *vmx = to_vmx(vcpu);
2461 uint32_t intr;
2462 int irq = vcpu->arch.interrupt.nr;
2408 2463
2409 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); 2464 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
2410 2465
2411 ++vcpu->stat.irq_injections; 2466 ++vcpu->stat.irq_injections;
2412 if (vcpu->arch.rmode.active) { 2467 if (vcpu->arch.rmode.vm86_active) {
2413 vmx->rmode.irq.pending = true; 2468 vmx->rmode.irq.pending = true;
2414 vmx->rmode.irq.vector = irq; 2469 vmx->rmode.irq.vector = irq;
2415 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 2470 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2419,8 +2474,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2419 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); 2474 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2420 return; 2475 return;
2421 } 2476 }
2422 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2477 intr = irq | INTR_INFO_VALID_MASK;
2423 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 2478 if (vcpu->arch.interrupt.soft) {
2479 intr |= INTR_TYPE_SOFT_INTR;
2480 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2481 vmx->vcpu.arch.event_exit_inst_len);
2482 } else
2483 intr |= INTR_TYPE_EXT_INTR;
2484 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
2424} 2485}
2425 2486
2426static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 2487static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2441,7 +2502,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2441 } 2502 }
2442 2503
2443 ++vcpu->stat.nmi_injections; 2504 ++vcpu->stat.nmi_injections;
2444 if (vcpu->arch.rmode.active) { 2505 if (vcpu->arch.rmode.vm86_active) {
2445 vmx->rmode.irq.pending = true; 2506 vmx->rmode.irq.pending = true;
2446 vmx->rmode.irq.vector = NMI_VECTOR; 2507 vmx->rmode.irq.vector = NMI_VECTOR;
2447 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 2508 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2456,76 +2517,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2456 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2517 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2457} 2518}
2458 2519
2459static void vmx_update_window_states(struct kvm_vcpu *vcpu) 2520static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2460{ 2521{
2461 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2462
2463 vcpu->arch.nmi_window_open =
2464 !(guest_intr & (GUEST_INTR_STATE_STI |
2465 GUEST_INTR_STATE_MOV_SS |
2466 GUEST_INTR_STATE_NMI));
2467 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) 2522 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
2468 vcpu->arch.nmi_window_open = 0; 2523 return 0;
2469
2470 vcpu->arch.interrupt_window_open =
2471 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2472 !(guest_intr & (GUEST_INTR_STATE_STI |
2473 GUEST_INTR_STATE_MOV_SS)));
2474}
2475
2476static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2477{
2478 int word_index = __ffs(vcpu->arch.irq_summary);
2479 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
2480 int irq = word_index * BITS_PER_LONG + bit_index;
2481 2524
2482 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); 2525 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2483 if (!vcpu->arch.irq_pending[word_index]) 2526 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
2484 clear_bit(word_index, &vcpu->arch.irq_summary); 2527 GUEST_INTR_STATE_NMI));
2485 kvm_queue_interrupt(vcpu, irq);
2486} 2528}
2487 2529
2488static void do_interrupt_requests(struct kvm_vcpu *vcpu, 2530static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
2489 struct kvm_run *kvm_run)
2490{ 2531{
2491 vmx_update_window_states(vcpu); 2532 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2492 2533 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2493 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 2534 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
2494 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2495 GUEST_INTR_STATE_STI |
2496 GUEST_INTR_STATE_MOV_SS);
2497
2498 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
2499 if (vcpu->arch.interrupt.pending) {
2500 enable_nmi_window(vcpu);
2501 } else if (vcpu->arch.nmi_window_open) {
2502 vcpu->arch.nmi_pending = false;
2503 vcpu->arch.nmi_injected = true;
2504 } else {
2505 enable_nmi_window(vcpu);
2506 return;
2507 }
2508 }
2509 if (vcpu->arch.nmi_injected) {
2510 vmx_inject_nmi(vcpu);
2511 if (vcpu->arch.nmi_pending)
2512 enable_nmi_window(vcpu);
2513 else if (vcpu->arch.irq_summary
2514 || kvm_run->request_interrupt_window)
2515 enable_irq_window(vcpu);
2516 return;
2517 }
2518
2519 if (vcpu->arch.interrupt_window_open) {
2520 if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
2521 kvm_do_inject_irq(vcpu);
2522
2523 if (vcpu->arch.interrupt.pending)
2524 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2525 }
2526 if (!vcpu->arch.interrupt_window_open &&
2527 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
2528 enable_irq_window(vcpu);
2529} 2535}
2530 2536
2531static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 2537static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2585,6 +2591,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2585 return 0; 2591 return 0;
2586} 2592}
2587 2593
2594/*
2595 * Trigger machine check on the host. We assume all the MSRs are already set up
2596 * by the CPU and that we still run on the same CPU as the MCE occurred on.
2597 * We pass a fake environment to the machine check handler because we want
2598 * the guest to be always treated like user space, no matter what context
2599 * it used internally.
2600 */
2601static void kvm_machine_check(void)
2602{
2603#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
2604 struct pt_regs regs = {
2605 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
2606 .flags = X86_EFLAGS_IF,
2607 };
2608
2609 do_machine_check(&regs, 0);
2610#endif
2611}
2612
2613static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2614{
2615 /* already handled by vcpu_run */
2616 return 1;
2617}
2618
2588static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2619static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2589{ 2620{
2590 struct vcpu_vmx *vmx = to_vmx(vcpu); 2621 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2596,17 +2627,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2596 vect_info = vmx->idt_vectoring_info; 2627 vect_info = vmx->idt_vectoring_info;
2597 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2628 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2598 2629
2630 if (is_machine_check(intr_info))
2631 return handle_machine_check(vcpu, kvm_run);
2632
2599 if ((vect_info & VECTORING_INFO_VALID_MASK) && 2633 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2600 !is_page_fault(intr_info)) 2634 !is_page_fault(intr_info))
2601 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 2635 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
2602 "intr info 0x%x\n", __func__, vect_info, intr_info); 2636 "intr info 0x%x\n", __func__, vect_info, intr_info);
2603 2637
2604 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
2605 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2606 set_bit(irq, vcpu->arch.irq_pending);
2607 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
2608 }
2609
2610 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 2638 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2611 return 1; /* already handled by vmx_vcpu_run() */ 2639 return 1; /* already handled by vmx_vcpu_run() */
2612 2640
@@ -2628,17 +2656,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2628 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 2656 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2629 if (is_page_fault(intr_info)) { 2657 if (is_page_fault(intr_info)) {
2630 /* EPT won't cause page fault directly */ 2658 /* EPT won't cause page fault directly */
2631 if (vm_need_ept()) 2659 if (enable_ept)
2632 BUG(); 2660 BUG();
2633 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2661 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2634 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2662 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2635 (u32)((u64)cr2 >> 32), handler); 2663 (u32)((u64)cr2 >> 32), handler);
2636 if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) 2664 if (kvm_event_needs_reinjection(vcpu))
2637 kvm_mmu_unprotect_page_virt(vcpu, cr2); 2665 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2638 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2666 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2639 } 2667 }
2640 2668
2641 if (vcpu->arch.rmode.active && 2669 if (vcpu->arch.rmode.vm86_active &&
2642 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, 2670 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2643 error_code)) { 2671 error_code)) {
2644 if (vcpu->arch.halt_request) { 2672 if (vcpu->arch.halt_request) {
@@ -2753,13 +2781,18 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2753 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); 2781 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
2754 skip_emulated_instruction(vcpu); 2782 skip_emulated_instruction(vcpu);
2755 return 1; 2783 return 1;
2756 case 8: 2784 case 8: {
2757 kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); 2785 u8 cr8_prev = kvm_get_cr8(vcpu);
2758 skip_emulated_instruction(vcpu); 2786 u8 cr8 = kvm_register_read(vcpu, reg);
2759 if (irqchip_in_kernel(vcpu->kvm)) 2787 kvm_set_cr8(vcpu, cr8);
2760 return 1; 2788 skip_emulated_instruction(vcpu);
2761 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2789 if (irqchip_in_kernel(vcpu->kvm))
2762 return 0; 2790 return 1;
2791 if (cr8_prev <= cr8)
2792 return 1;
2793 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2794 return 0;
2795 }
2763 }; 2796 };
2764 break; 2797 break;
2765 case 2: /* clts */ 2798 case 2: /* clts */
@@ -2957,8 +2990,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2957 * If the user space waits to inject interrupts, exit as soon as 2990 * If the user space waits to inject interrupts, exit as soon as
2958 * possible 2991 * possible
2959 */ 2992 */
2960 if (kvm_run->request_interrupt_window && 2993 if (!irqchip_in_kernel(vcpu->kvm) &&
2961 !vcpu->arch.irq_summary) { 2994 kvm_run->request_interrupt_window &&
2995 !kvm_cpu_has_interrupt(vcpu)) {
2962 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2996 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2963 return 0; 2997 return 0;
2964 } 2998 }
@@ -2980,7 +3014,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2980 3014
2981static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3015static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2982{ 3016{
2983 u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 3017 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2984 3018
2985 kvm_mmu_invlpg(vcpu, exit_qualification); 3019 kvm_mmu_invlpg(vcpu, exit_qualification);
2986 skip_emulated_instruction(vcpu); 3020 skip_emulated_instruction(vcpu);
@@ -2996,11 +3030,11 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2996 3030
2997static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3031static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2998{ 3032{
2999 u64 exit_qualification; 3033 unsigned long exit_qualification;
3000 enum emulation_result er; 3034 enum emulation_result er;
3001 unsigned long offset; 3035 unsigned long offset;
3002 3036
3003 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 3037 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3004 offset = exit_qualification & 0xffful; 3038 offset = exit_qualification & 0xffful;
3005 3039
3006 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3040 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3019,22 +3053,41 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3019 struct vcpu_vmx *vmx = to_vmx(vcpu); 3053 struct vcpu_vmx *vmx = to_vmx(vcpu);
3020 unsigned long exit_qualification; 3054 unsigned long exit_qualification;
3021 u16 tss_selector; 3055 u16 tss_selector;
3022 int reason; 3056 int reason, type, idt_v;
3057
3058 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
3059 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
3023 3060
3024 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3061 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3025 3062
3026 reason = (u32)exit_qualification >> 30; 3063 reason = (u32)exit_qualification >> 30;
3027 if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && 3064 if (reason == TASK_SWITCH_GATE && idt_v) {
3028 (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 3065 switch (type) {
3029 (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) 3066 case INTR_TYPE_NMI_INTR:
3030 == INTR_TYPE_NMI_INTR) { 3067 vcpu->arch.nmi_injected = false;
3031 vcpu->arch.nmi_injected = false; 3068 if (cpu_has_virtual_nmis())
3032 if (cpu_has_virtual_nmis()) 3069 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3033 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3070 GUEST_INTR_STATE_NMI);
3034 GUEST_INTR_STATE_NMI); 3071 break;
3072 case INTR_TYPE_EXT_INTR:
3073 case INTR_TYPE_SOFT_INTR:
3074 kvm_clear_interrupt_queue(vcpu);
3075 break;
3076 case INTR_TYPE_HARD_EXCEPTION:
3077 case INTR_TYPE_SOFT_EXCEPTION:
3078 kvm_clear_exception_queue(vcpu);
3079 break;
3080 default:
3081 break;
3082 }
3035 } 3083 }
3036 tss_selector = exit_qualification; 3084 tss_selector = exit_qualification;
3037 3085
3086 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
3087 type != INTR_TYPE_EXT_INTR &&
3088 type != INTR_TYPE_NMI_INTR))
3089 skip_emulated_instruction(vcpu);
3090
3038 if (!kvm_task_switch(vcpu, tss_selector, reason)) 3091 if (!kvm_task_switch(vcpu, tss_selector, reason))
3039 return 0; 3092 return 0;
3040 3093
@@ -3051,11 +3104,11 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3051 3104
3052static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3105static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3053{ 3106{
3054 u64 exit_qualification; 3107 unsigned long exit_qualification;
3055 gpa_t gpa; 3108 gpa_t gpa;
3056 int gla_validity; 3109 int gla_validity;
3057 3110
3058 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 3111 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3059 3112
3060 if (exit_qualification & (1 << 6)) { 3113 if (exit_qualification & (1 << 6)) {
3061 printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); 3114 printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
@@ -3067,7 +3120,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3067 printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); 3120 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
3068 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", 3121 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
3069 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), 3122 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
3070 (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); 3123 vmcs_readl(GUEST_LINEAR_ADDRESS));
3071 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 3124 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3072 (long unsigned int)exit_qualification); 3125 (long unsigned int)exit_qualification);
3073 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3126 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -3150,6 +3203,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3150 [EXIT_REASON_WBINVD] = handle_wbinvd, 3203 [EXIT_REASON_WBINVD] = handle_wbinvd,
3151 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 3204 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
3152 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3205 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3206 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3153}; 3207};
3154 3208
3155static const int kvm_vmx_max_exit_handlers = 3209static const int kvm_vmx_max_exit_handlers =
@@ -3159,10 +3213,10 @@ static const int kvm_vmx_max_exit_handlers =
3159 * The guest has exited. See if we can fix it or if we need userspace 3213 * The guest has exited. See if we can fix it or if we need userspace
3160 * assistance. 3214 * assistance.
3161 */ 3215 */
3162static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3216static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3163{ 3217{
3164 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
3165 struct vcpu_vmx *vmx = to_vmx(vcpu); 3218 struct vcpu_vmx *vmx = to_vmx(vcpu);
3219 u32 exit_reason = vmx->exit_reason;
3166 u32 vectoring_info = vmx->idt_vectoring_info; 3220 u32 vectoring_info = vmx->idt_vectoring_info;
3167 3221
3168 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), 3222 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
@@ -3178,7 +3232,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3178 3232
3179 /* Access CR3 don't cause VMExit in paging mode, so we need 3233 /* Access CR3 don't cause VMExit in paging mode, so we need
3180 * to sync with guest real CR3. */ 3234 * to sync with guest real CR3. */
3181 if (vm_need_ept() && is_paging(vcpu)) { 3235 if (enable_ept && is_paging(vcpu)) {
3182 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3236 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3183 ept_load_pdptrs(vcpu); 3237 ept_load_pdptrs(vcpu);
3184 } 3238 }
@@ -3199,9 +3253,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3199 __func__, vectoring_info, exit_reason); 3253 __func__, vectoring_info, exit_reason);
3200 3254
3201 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { 3255 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
3202 if (vcpu->arch.interrupt_window_open) { 3256 if (vmx_interrupt_allowed(vcpu)) {
3203 vmx->soft_vnmi_blocked = 0; 3257 vmx->soft_vnmi_blocked = 0;
3204 vcpu->arch.nmi_window_open = 1;
3205 } else if (vmx->vnmi_blocked_time > 1000000000LL && 3258 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
3206 vcpu->arch.nmi_pending) { 3259 vcpu->arch.nmi_pending) {
3207 /* 3260 /*
@@ -3214,7 +3267,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3214 "state on VCPU %d after 1 s timeout\n", 3267 "state on VCPU %d after 1 s timeout\n",
3215 __func__, vcpu->vcpu_id); 3268 __func__, vcpu->vcpu_id);
3216 vmx->soft_vnmi_blocked = 0; 3269 vmx->soft_vnmi_blocked = 0;
3217 vmx->vcpu.arch.nmi_window_open = 1;
3218 } 3270 }
3219 } 3271 }
3220 3272
@@ -3228,122 +3280,107 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3228 return 0; 3280 return 0;
3229} 3281}
3230 3282
3231static void update_tpr_threshold(struct kvm_vcpu *vcpu) 3283static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3232{ 3284{
3233 int max_irr, tpr; 3285 if (irr == -1 || tpr < irr) {
3234
3235 if (!vm_need_tpr_shadow(vcpu->kvm))
3236 return;
3237
3238 if (!kvm_lapic_enabled(vcpu) ||
3239 ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
3240 vmcs_write32(TPR_THRESHOLD, 0); 3286 vmcs_write32(TPR_THRESHOLD, 0);
3241 return; 3287 return;
3242 } 3288 }
3243 3289
3244 tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; 3290 vmcs_write32(TPR_THRESHOLD, irr);
3245 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
3246} 3291}
3247 3292
3248static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 3293static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3249{ 3294{
3250 u32 exit_intr_info; 3295 u32 exit_intr_info;
3251 u32 idt_vectoring_info; 3296 u32 idt_vectoring_info = vmx->idt_vectoring_info;
3252 bool unblock_nmi; 3297 bool unblock_nmi;
3253 u8 vector; 3298 u8 vector;
3254 int type; 3299 int type;
3255 bool idtv_info_valid; 3300 bool idtv_info_valid;
3256 u32 error;
3257 3301
3258 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3302 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3303
3304 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
3305
3306 /* Handle machine checks before interrupts are enabled */
3307 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
3308 || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
3309 && is_machine_check(exit_intr_info)))
3310 kvm_machine_check();
3311
3312 /* We need to handle NMIs before interrupts are enabled */
3313 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3314 (exit_intr_info & INTR_INFO_VALID_MASK)) {
3315 KVMTRACE_0D(NMI, &vmx->vcpu, handler);
3316 asm("int $2");
3317 }
3318
3319 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3320
3259 if (cpu_has_virtual_nmis()) { 3321 if (cpu_has_virtual_nmis()) {
3260 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 3322 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3261 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 3323 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3262 /* 3324 /*
3263 * SDM 3: 25.7.1.2 3325 * SDM 3: 27.7.1.2 (September 2008)
3264 * Re-set bit "block by NMI" before VM entry if vmexit caused by 3326 * Re-set bit "block by NMI" before VM entry if vmexit caused by
3265 * a guest IRET fault. 3327 * a guest IRET fault.
3328 * SDM 3: 23.2.2 (September 2008)
3329 * Bit 12 is undefined in any of the following cases:
3330 * If the VM exit sets the valid bit in the IDT-vectoring
3331 * information field.
3332 * If the VM exit is due to a double fault.
3266 */ 3333 */
3267 if (unblock_nmi && vector != DF_VECTOR) 3334 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
3335 vector != DF_VECTOR && !idtv_info_valid)
3268 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3336 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3269 GUEST_INTR_STATE_NMI); 3337 GUEST_INTR_STATE_NMI);
3270 } else if (unlikely(vmx->soft_vnmi_blocked)) 3338 } else if (unlikely(vmx->soft_vnmi_blocked))
3271 vmx->vnmi_blocked_time += 3339 vmx->vnmi_blocked_time +=
3272 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 3340 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
3273 3341
3274 idt_vectoring_info = vmx->idt_vectoring_info; 3342 vmx->vcpu.arch.nmi_injected = false;
3275 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3343 kvm_clear_exception_queue(&vmx->vcpu);
3344 kvm_clear_interrupt_queue(&vmx->vcpu);
3345
3346 if (!idtv_info_valid)
3347 return;
3348
3276 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 3349 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3277 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 3350 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3278 if (vmx->vcpu.arch.nmi_injected) { 3351
3352 switch (type) {
3353 case INTR_TYPE_NMI_INTR:
3354 vmx->vcpu.arch.nmi_injected = true;
3279 /* 3355 /*
3280 * SDM 3: 25.7.1.2 3356 * SDM 3: 27.7.1.2 (September 2008)
3281 * Clear bit "block by NMI" before VM entry if a NMI delivery 3357 * Clear bit "block by NMI" before VM entry if a NMI
3282 * faulted. 3358 * delivery faulted.
3283 */ 3359 */
3284 if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) 3360 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3285 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 3361 GUEST_INTR_STATE_NMI);
3286 GUEST_INTR_STATE_NMI); 3362 break;
3287 else 3363 case INTR_TYPE_SOFT_EXCEPTION:
3288 vmx->vcpu.arch.nmi_injected = false; 3364 vmx->vcpu.arch.event_exit_inst_len =
3289 } 3365 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3290 kvm_clear_exception_queue(&vmx->vcpu); 3366 /* fall through */
3291 if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION || 3367 case INTR_TYPE_HARD_EXCEPTION:
3292 type == INTR_TYPE_SOFT_EXCEPTION)) {
3293 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 3368 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3294 error = vmcs_read32(IDT_VECTORING_ERROR_CODE); 3369 u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
3295 kvm_queue_exception_e(&vmx->vcpu, vector, error); 3370 kvm_queue_exception_e(&vmx->vcpu, vector, err);
3296 } else 3371 } else
3297 kvm_queue_exception(&vmx->vcpu, vector); 3372 kvm_queue_exception(&vmx->vcpu, vector);
3298 vmx->idt_vectoring_info = 0; 3373 break;
3299 } 3374 case INTR_TYPE_SOFT_INTR:
3300 kvm_clear_interrupt_queue(&vmx->vcpu); 3375 vmx->vcpu.arch.event_exit_inst_len =
3301 if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { 3376 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3302 kvm_queue_interrupt(&vmx->vcpu, vector); 3377 /* fall through */
3303 vmx->idt_vectoring_info = 0; 3378 case INTR_TYPE_EXT_INTR:
3304 } 3379 kvm_queue_interrupt(&vmx->vcpu, vector,
3305} 3380 type == INTR_TYPE_SOFT_INTR);
3306 3381 break;
3307static void vmx_intr_assist(struct kvm_vcpu *vcpu) 3382 default:
3308{ 3383 break;
3309 update_tpr_threshold(vcpu);
3310
3311 vmx_update_window_states(vcpu);
3312
3313 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3314 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3315 GUEST_INTR_STATE_STI |
3316 GUEST_INTR_STATE_MOV_SS);
3317
3318 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
3319 if (vcpu->arch.interrupt.pending) {
3320 enable_nmi_window(vcpu);
3321 } else if (vcpu->arch.nmi_window_open) {
3322 vcpu->arch.nmi_pending = false;
3323 vcpu->arch.nmi_injected = true;
3324 } else {
3325 enable_nmi_window(vcpu);
3326 return;
3327 }
3328 }
3329 if (vcpu->arch.nmi_injected) {
3330 vmx_inject_nmi(vcpu);
3331 if (vcpu->arch.nmi_pending)
3332 enable_nmi_window(vcpu);
3333 else if (kvm_cpu_has_interrupt(vcpu))
3334 enable_irq_window(vcpu);
3335 return;
3336 }
3337 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
3338 if (vcpu->arch.interrupt_window_open)
3339 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
3340 else
3341 enable_irq_window(vcpu);
3342 }
3343 if (vcpu->arch.interrupt.pending) {
3344 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
3345 if (kvm_cpu_has_interrupt(vcpu))
3346 enable_irq_window(vcpu);
3347 } 3384 }
3348} 3385}
3349 3386
@@ -3381,7 +3418,6 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3381static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3418static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3382{ 3419{
3383 struct vcpu_vmx *vmx = to_vmx(vcpu); 3420 struct vcpu_vmx *vmx = to_vmx(vcpu);
3384 u32 intr_info;
3385 3421
3386 /* Record the guest's net vcpu time for enforced NMI injections. */ 3422 /* Record the guest's net vcpu time for enforced NMI injections. */
3387 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 3423 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
@@ -3505,20 +3541,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3505 if (vmx->rmode.irq.pending) 3541 if (vmx->rmode.irq.pending)
3506 fixup_rmode_irq(vmx); 3542 fixup_rmode_irq(vmx);
3507 3543
3508 vmx_update_window_states(vcpu);
3509
3510 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 3544 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3511 vmx->launched = 1; 3545 vmx->launched = 1;
3512 3546
3513 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3514
3515 /* We need to handle NMIs before interrupts are enabled */
3516 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3517 (intr_info & INTR_INFO_VALID_MASK)) {
3518 KVMTRACE_0D(NMI, vcpu, handler);
3519 asm("int $2");
3520 }
3521
3522 vmx_complete_interrupts(vmx); 3547 vmx_complete_interrupts(vmx);
3523} 3548}
3524 3549
@@ -3593,7 +3618,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3593 if (alloc_apic_access_page(kvm) != 0) 3618 if (alloc_apic_access_page(kvm) != 0)
3594 goto free_vmcs; 3619 goto free_vmcs;
3595 3620
3596 if (vm_need_ept()) 3621 if (enable_ept)
3597 if (alloc_identity_pagetable(kvm) != 0) 3622 if (alloc_identity_pagetable(kvm) != 0)
3598 goto free_vmcs; 3623 goto free_vmcs;
3599 3624
@@ -3631,9 +3656,32 @@ static int get_ept_level(void)
3631 return VMX_EPT_DEFAULT_GAW + 1; 3656 return VMX_EPT_DEFAULT_GAW + 1;
3632} 3657}
3633 3658
3634static int vmx_get_mt_mask_shift(void) 3659static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3635{ 3660{
3636 return VMX_EPT_MT_EPTE_SHIFT; 3661 u64 ret;
3662
3663 /* For VT-d and EPT combination
3664 * 1. MMIO: always map as UC
3665 * 2. EPT with VT-d:
3666 * a. VT-d without snooping control feature: can't guarantee the
3667 * result, try to trust guest.
3668 * b. VT-d with snooping control feature: snooping control feature of
3669 * VT-d engine can guarantee the cache correctness. Just set it
3670 * to WB to keep consistent with host. So the same as item 3.
3671 * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep
3672 * consistent with host MTRR
3673 */
3674 if (is_mmio)
3675 ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
3676 else if (vcpu->kvm->arch.iommu_domain &&
3677 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
3678 ret = kvm_get_guest_memory_type(vcpu, gfn) <<
3679 VMX_EPT_MT_EPTE_SHIFT;
3680 else
3681 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
3682 | VMX_EPT_IGMT_BIT;
3683
3684 return ret;
3637} 3685}
3638 3686
3639static struct kvm_x86_ops vmx_x86_ops = { 3687static struct kvm_x86_ops vmx_x86_ops = {
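The vmx_get_mt_mask() hunk above encodes a small decision tree for the EPT memory type. A minimal user-space sketch of that decision, assuming the standard EPT PTE layout (memory type in bits 5:3, ignore-PAT in bit 6) and the usual MTRR type codes (UC=0, WB=6); the guest-MTRR lookup is stubbed out and all constant names here are local to the sketch, not kernel macros:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Assumed constants: EPT PTE memory-type field at bits 5:3, ignore-PAT at
 * bit 6; MTRR type codes UC=0, WB=6. */
#define EPT_MT_SHIFT	3
#define EPT_IGMT_BIT	(1ull << 6)
#define MTRR_UC		0
#define MTRR_WB		6

/* Stand-in for kvm_get_guest_memory_type(): the real code walks the guest
 * MTRRs; here we pretend everything the guest maps is write-back. */
static uint8_t guest_memory_type(uint64_t gfn)
{
	(void)gfn;
	return MTRR_WB;
}

/* Mirrors the three cases in vmx_get_mt_mask(): MMIO is always UC; with a
 * non-coherent IOMMU domain the guest's own type is trusted; otherwise WB
 * plus the ignore-PAT bit to stay consistent with the host. */
static uint64_t ept_mt_bits(uint64_t gfn, bool is_mmio, bool have_iommu,
			    bool iommu_cache_coherent)
{
	if (is_mmio)
		return (uint64_t)MTRR_UC << EPT_MT_SHIFT;
	if (have_iommu && !iommu_cache_coherent)
		return (uint64_t)guest_memory_type(gfn) << EPT_MT_SHIFT;
	return ((uint64_t)MTRR_WB << EPT_MT_SHIFT) | EPT_IGMT_BIT;
}

int main(void)
{
	printf("MMIO:           0x%llx\n",
	       (unsigned long long)ept_mt_bits(0x1000, true, false, false));
	printf("VT-d, no snoop: 0x%llx\n",
	       (unsigned long long)ept_mt_bits(0x1000, false, true, false));
	printf("plain EPT:      0x%llx\n",
	       (unsigned long long)ept_mt_bits(0x1000, false, false, true));
	return 0;
}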
@@ -3644,7 +3692,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3644 .check_processor_compatibility = vmx_check_processor_compat, 3692 .check_processor_compatibility = vmx_check_processor_compat,
3645 .hardware_enable = hardware_enable, 3693 .hardware_enable = hardware_enable,
3646 .hardware_disable = hardware_disable, 3694 .hardware_disable = hardware_disable,
3647 .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, 3695 .cpu_has_accelerated_tpr = report_flexpriority,
3648 3696
3649 .vcpu_create = vmx_create_vcpu, 3697 .vcpu_create = vmx_create_vcpu,
3650 .vcpu_free = vmx_free_vcpu, 3698 .vcpu_free = vmx_free_vcpu,
@@ -3678,78 +3726,82 @@ static struct kvm_x86_ops vmx_x86_ops = {
3678 .tlb_flush = vmx_flush_tlb, 3726 .tlb_flush = vmx_flush_tlb,
3679 3727
3680 .run = vmx_vcpu_run, 3728 .run = vmx_vcpu_run,
3681 .handle_exit = kvm_handle_exit, 3729 .handle_exit = vmx_handle_exit,
3682 .skip_emulated_instruction = skip_emulated_instruction, 3730 .skip_emulated_instruction = skip_emulated_instruction,
3731 .set_interrupt_shadow = vmx_set_interrupt_shadow,
3732 .get_interrupt_shadow = vmx_get_interrupt_shadow,
3683 .patch_hypercall = vmx_patch_hypercall, 3733 .patch_hypercall = vmx_patch_hypercall,
3684 .get_irq = vmx_get_irq,
3685 .set_irq = vmx_inject_irq, 3734 .set_irq = vmx_inject_irq,
3735 .set_nmi = vmx_inject_nmi,
3686 .queue_exception = vmx_queue_exception, 3736 .queue_exception = vmx_queue_exception,
3687 .exception_injected = vmx_exception_injected, 3737 .interrupt_allowed = vmx_interrupt_allowed,
3688 .inject_pending_irq = vmx_intr_assist, 3738 .nmi_allowed = vmx_nmi_allowed,
3689 .inject_pending_vectors = do_interrupt_requests, 3739 .enable_nmi_window = enable_nmi_window,
3740 .enable_irq_window = enable_irq_window,
3741 .update_cr8_intercept = update_cr8_intercept,
3690 3742
3691 .set_tss_addr = vmx_set_tss_addr, 3743 .set_tss_addr = vmx_set_tss_addr,
3692 .get_tdp_level = get_ept_level, 3744 .get_tdp_level = get_ept_level,
3693 .get_mt_mask_shift = vmx_get_mt_mask_shift, 3745 .get_mt_mask = vmx_get_mt_mask,
3694}; 3746};
3695 3747
3696static int __init vmx_init(void) 3748static int __init vmx_init(void)
3697{ 3749{
3698 void *va;
3699 int r; 3750 int r;
3700 3751
3701 vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 3752 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
3702 if (!vmx_io_bitmap_a) 3753 if (!vmx_io_bitmap_a)
3703 return -ENOMEM; 3754 return -ENOMEM;
3704 3755
3705 vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 3756 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
3706 if (!vmx_io_bitmap_b) { 3757 if (!vmx_io_bitmap_b) {
3707 r = -ENOMEM; 3758 r = -ENOMEM;
3708 goto out; 3759 goto out;
3709 } 3760 }
3710 3761
3711 vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 3762 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
3712 if (!vmx_msr_bitmap) { 3763 if (!vmx_msr_bitmap_legacy) {
3713 r = -ENOMEM; 3764 r = -ENOMEM;
3714 goto out1; 3765 goto out1;
3715 } 3766 }
3716 3767
3768 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
3769 if (!vmx_msr_bitmap_longmode) {
3770 r = -ENOMEM;
3771 goto out2;
3772 }
3773
3717 /* 3774 /*
3718 * Allow direct access to the PC debug port (it is often used for I/O 3775 * Allow direct access to the PC debug port (it is often used for I/O
3719 * delays, but the vmexits simply slow things down). 3776 * delays, but the vmexits simply slow things down).
3720 */ 3777 */
3721 va = kmap(vmx_io_bitmap_a); 3778 memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
3722 memset(va, 0xff, PAGE_SIZE); 3779 clear_bit(0x80, vmx_io_bitmap_a);
3723 clear_bit(0x80, va);
3724 kunmap(vmx_io_bitmap_a);
3725 3780
3726 va = kmap(vmx_io_bitmap_b); 3781 memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
3727 memset(va, 0xff, PAGE_SIZE);
3728 kunmap(vmx_io_bitmap_b);
3729 3782
3730 va = kmap(vmx_msr_bitmap); 3783 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
3731 memset(va, 0xff, PAGE_SIZE); 3784 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
3732 kunmap(vmx_msr_bitmap);
3733 3785
3734 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 3786 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
3735 3787
3736 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); 3788 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
3737 if (r) 3789 if (r)
3738 goto out2; 3790 goto out3;
3739 3791
3740 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE); 3792 vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
3741 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE); 3793 vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
3742 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS); 3794 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
3743 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); 3795 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
3744 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); 3796 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
3797 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
3745 3798
3746 if (vm_need_ept()) { 3799 if (enable_ept) {
3747 bypass_guest_pf = 0; 3800 bypass_guest_pf = 0;
3748 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | 3801 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3749 VMX_EPT_WRITABLE_MASK); 3802 VMX_EPT_WRITABLE_MASK);
3750 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 3803 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3751 VMX_EPT_EXECUTABLE_MASK, 3804 VMX_EPT_EXECUTABLE_MASK);
3752 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3753 kvm_enable_tdp(); 3805 kvm_enable_tdp();
3754 } else 3806 } else
3755 kvm_disable_tdp(); 3807 kvm_disable_tdp();
@@ -3761,20 +3813,23 @@ static int __init vmx_init(void)
3761 3813
3762 return 0; 3814 return 0;
3763 3815
3816out3:
3817 free_page((unsigned long)vmx_msr_bitmap_longmode);
3764out2: 3818out2:
3765 __free_page(vmx_msr_bitmap); 3819 free_page((unsigned long)vmx_msr_bitmap_legacy);
3766out1: 3820out1:
3767 __free_page(vmx_io_bitmap_b); 3821 free_page((unsigned long)vmx_io_bitmap_b);
3768out: 3822out:
3769 __free_page(vmx_io_bitmap_a); 3823 free_page((unsigned long)vmx_io_bitmap_a);
3770 return r; 3824 return r;
3771} 3825}
3772 3826
3773static void __exit vmx_exit(void) 3827static void __exit vmx_exit(void)
3774{ 3828{
3775 __free_page(vmx_msr_bitmap); 3829 free_page((unsigned long)vmx_msr_bitmap_legacy);
3776 __free_page(vmx_io_bitmap_b); 3830 free_page((unsigned long)vmx_msr_bitmap_longmode);
3777 __free_page(vmx_io_bitmap_a); 3831 free_page((unsigned long)vmx_io_bitmap_b);
3832 free_page((unsigned long)vmx_io_bitmap_a);
3778 3833
3779 kvm_exit(); 3834 kvm_exit();
3780} 3835}
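For reference, the new __vmx_disable_intercept_for_msr() earlier in this file replaces kmap() plus raw byte offsets with direct indexing into the unsigned long bitmap (offset / sizeof(unsigned long)). A small user-space sketch of the same offset arithmetic, using the region layout from that hunk (read-low 0x000, read-high 0x400, write-low 0x800, write-high 0xc00 within one 4 KiB page); the sample MSR numbers are only illustrative:

#include <stdio.h>
#include <stdint.h>

/* Layout taken from the hunk: one 4 KiB page holding four 2K-bit regions --
 * read-low at byte 0x000, read-high at 0x400, write-low at 0x800,
 * write-high at 0xc00. "Low" covers MSRs 0x00000000-0x00001fff, "high"
 * covers 0xc0000000-0xc0001fff. */
static void msr_bitmap_offsets(uint32_t msr)
{
	uint32_t read_base, write_base, bit;

	if (msr <= 0x1fff) {
		read_base = 0x000;
		write_base = 0x800;
		bit = msr;
	} else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
		read_base = 0x400;
		write_base = 0xc00;
		bit = msr & 0x1fff;
	} else {
		printf("MSR 0x%08x is not covered by the bitmap\n", msr);
		return;
	}

	/* __clear_bit(bit, bitmap + base / sizeof(unsigned long)) in the
	 * kernel code clears byte (base + bit / 8), bit (bit % 8). */
	printf("MSR 0x%08x: read at byte 0x%03x bit %u, write at byte 0x%03x bit %u\n",
	       msr, read_base + bit / 8, bit % 8, write_base + bit / 8, bit % 8);
}

int main(void)
{
	msr_bitmap_offsets(0x174);	/* MSR_IA32_SYSENTER_CS (illustrative) */
	msr_bitmap_offsets(0xc0000100);	/* MSR_FS_BASE (illustrative) */
	return 0;
}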
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 49079a46687..249540f9851 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -91,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
91 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 91 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
92 { "hypercalls", VCPU_STAT(hypercalls) }, 92 { "hypercalls", VCPU_STAT(hypercalls) },
93 { "request_irq", VCPU_STAT(request_irq_exits) }, 93 { "request_irq", VCPU_STAT(request_irq_exits) },
94 { "request_nmi", VCPU_STAT(request_nmi_exits) },
95 { "irq_exits", VCPU_STAT(irq_exits) }, 94 { "irq_exits", VCPU_STAT(irq_exits) },
96 { "host_state_reload", VCPU_STAT(host_state_reload) }, 95 { "host_state_reload", VCPU_STAT(host_state_reload) },
97 { "efer_reload", VCPU_STAT(efer_reload) }, 96 { "efer_reload", VCPU_STAT(efer_reload) },
@@ -108,7 +107,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
108 { "mmu_recycled", VM_STAT(mmu_recycled) }, 107 { "mmu_recycled", VM_STAT(mmu_recycled) },
109 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 108 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
110 { "mmu_unsync", VM_STAT(mmu_unsync) }, 109 { "mmu_unsync", VM_STAT(mmu_unsync) },
111 { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
112 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 110 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
113 { "largepages", VM_STAT(lpages) }, 111 { "largepages", VM_STAT(lpages) },
114 { NULL } 112 { NULL }
@@ -234,7 +232,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
234 goto out; 232 goto out;
235 } 233 }
236 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 234 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
237 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { 235 if (is_present_pte(pdpte[i]) &&
236 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
238 ret = 0; 237 ret = 0;
239 goto out; 238 goto out;
240 } 239 }
@@ -321,7 +320,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
321 kvm_x86_ops->set_cr0(vcpu, cr0); 320 kvm_x86_ops->set_cr0(vcpu, cr0);
322 vcpu->arch.cr0 = cr0; 321 vcpu->arch.cr0 = cr0;
323 322
324 kvm_mmu_sync_global(vcpu);
325 kvm_mmu_reset_context(vcpu); 323 kvm_mmu_reset_context(vcpu);
326 return; 324 return;
327} 325}
@@ -338,6 +336,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw);
338 336
339void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 337void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
340{ 338{
339 unsigned long old_cr4 = vcpu->arch.cr4;
340 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
341
341 if (cr4 & CR4_RESERVED_BITS) { 342 if (cr4 & CR4_RESERVED_BITS) {
342 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 343 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
343 kvm_inject_gp(vcpu, 0); 344 kvm_inject_gp(vcpu, 0);
@@ -351,7 +352,8 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
351 kvm_inject_gp(vcpu, 0); 352 kvm_inject_gp(vcpu, 0);
352 return; 353 return;
353 } 354 }
354 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) 355 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
356 && ((cr4 ^ old_cr4) & pdptr_bits)
355 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 357 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
356 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 358 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
357 kvm_inject_gp(vcpu, 0); 359 kvm_inject_gp(vcpu, 0);
@@ -366,7 +368,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
366 kvm_x86_ops->set_cr4(vcpu, cr4); 368 kvm_x86_ops->set_cr4(vcpu, cr4);
367 vcpu->arch.cr4 = cr4; 369 vcpu->arch.cr4 = cr4;
368 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; 370 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
369 kvm_mmu_sync_global(vcpu);
370 kvm_mmu_reset_context(vcpu); 371 kvm_mmu_reset_context(vcpu);
371} 372}
372EXPORT_SYMBOL_GPL(kvm_set_cr4); 373EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -519,6 +520,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
519 efer |= vcpu->arch.shadow_efer & EFER_LMA; 520 efer |= vcpu->arch.shadow_efer & EFER_LMA;
520 521
521 vcpu->arch.shadow_efer = efer; 522 vcpu->arch.shadow_efer = efer;
523
524 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
525 kvm_mmu_reset_context(vcpu);
522} 526}
523 527
524void kvm_enable_efer_bits(u64 mask) 528void kvm_enable_efer_bits(u64 mask)
@@ -626,14 +630,17 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
626 unsigned long flags; 630 unsigned long flags;
627 struct kvm_vcpu_arch *vcpu = &v->arch; 631 struct kvm_vcpu_arch *vcpu = &v->arch;
628 void *shared_kaddr; 632 void *shared_kaddr;
633 unsigned long this_tsc_khz;
629 634
630 if ((!vcpu->time_page)) 635 if ((!vcpu->time_page))
631 return; 636 return;
632 637
633 if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) { 638 this_tsc_khz = get_cpu_var(cpu_tsc_khz);
634 kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock); 639 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
635 vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz); 640 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
641 vcpu->hv_clock_tsc_khz = this_tsc_khz;
636 } 642 }
643 put_cpu_var(cpu_tsc_khz);
637 644
638 /* Keep irq disabled to prevent changes to the clock */ 645 /* Keep irq disabled to prevent changes to the clock */
639 local_irq_save(flags); 646 local_irq_save(flags);
@@ -889,6 +896,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
889 case MSR_IA32_LASTINTFROMIP: 896 case MSR_IA32_LASTINTFROMIP:
890 case MSR_IA32_LASTINTTOIP: 897 case MSR_IA32_LASTINTTOIP:
891 case MSR_VM_HSAVE_PA: 898 case MSR_VM_HSAVE_PA:
899 case MSR_P6_EVNTSEL0:
900 case MSR_P6_EVNTSEL1:
892 data = 0; 901 data = 0;
893 break; 902 break;
894 case MSR_MTRRcap: 903 case MSR_MTRRcap:
@@ -1020,6 +1029,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1020 case KVM_CAP_SYNC_MMU: 1029 case KVM_CAP_SYNC_MMU:
1021 case KVM_CAP_REINJECT_CONTROL: 1030 case KVM_CAP_REINJECT_CONTROL:
1022 case KVM_CAP_IRQ_INJECT_STATUS: 1031 case KVM_CAP_IRQ_INJECT_STATUS:
1032 case KVM_CAP_ASSIGN_DEV_IRQ:
1023 r = 1; 1033 r = 1;
1024 break; 1034 break;
1025 case KVM_CAP_COALESCED_MMIO: 1035 case KVM_CAP_COALESCED_MMIO:
@@ -1237,41 +1247,53 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1237 entry->flags = 0; 1247 entry->flags = 0;
1238} 1248}
1239 1249
1250#define F(x) bit(X86_FEATURE_##x)
1251
1240static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1252static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1241 u32 index, int *nent, int maxnent) 1253 u32 index, int *nent, int maxnent)
1242{ 1254{
1243 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | 1255 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1244 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1245 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1246 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1247 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1248 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
1249 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1250 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
1251 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
1252 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
1253 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
1254 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1255 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1256 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1257 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1258 bit(X86_FEATURE_PGE) |
1259 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1260 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
1261 bit(X86_FEATURE_SYSCALL) |
1262 (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
1263#ifdef CONFIG_X86_64 1256#ifdef CONFIG_X86_64
1264 bit(X86_FEATURE_LM) | 1257 unsigned f_lm = F(LM);
1258#else
1259 unsigned f_lm = 0;
1265#endif 1260#endif
1266 bit(X86_FEATURE_FXSR_OPT) | 1261
1267 bit(X86_FEATURE_MMXEXT) | 1262 /* cpuid 1.edx */
1268 bit(X86_FEATURE_3DNOWEXT) | 1263 const u32 kvm_supported_word0_x86_features =
1269 bit(X86_FEATURE_3DNOW); 1264 F(FPU) | F(VME) | F(DE) | F(PSE) |
1270 const u32 kvm_supported_word3_x86_features = 1265 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1271 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); 1266 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
1267 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1268 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
1269 0 /* Reserved, DS, ACPI */ | F(MMX) |
1270 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
1271 0 /* HTT, TM, Reserved, PBE */;
1272 /* cpuid 0x80000001.edx */
1273 const u32 kvm_supported_word1_x86_features =
1274 F(FPU) | F(VME) | F(DE) | F(PSE) |
1275 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1276 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
1277 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1278 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1279 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1280 F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
1281 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1282 /* cpuid 1.ecx */
1283 const u32 kvm_supported_word4_x86_features =
1284 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
1285 0 /* DS-CPL, VMX, SMX, EST */ |
1286 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1287 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1288 0 /* Reserved, DCA */ | F(XMM4_1) |
1289 F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
1290 0 /* Reserved, XSAVE, OSXSAVE */;
1291 /* cpuid 0x80000001.ecx */
1272 const u32 kvm_supported_word6_x86_features = 1292 const u32 kvm_supported_word6_x86_features =
1273 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | 1293 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1274 bit(X86_FEATURE_SVM); 1294 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1295 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
1296 0 /* SKINIT */ | 0 /* WDT */;
1275 1297
1276 /* all calls to cpuid_count() should be made on the same cpu */ 1298 /* all calls to cpuid_count() should be made on the same cpu */
1277 get_cpu(); 1299 get_cpu();
@@ -1284,7 +1306,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1284 break; 1306 break;
1285 case 1: 1307 case 1:
1286 entry->edx &= kvm_supported_word0_x86_features; 1308 entry->edx &= kvm_supported_word0_x86_features;
1287 entry->ecx &= kvm_supported_word3_x86_features; 1309 entry->ecx &= kvm_supported_word4_x86_features;
1288 break; 1310 break;
1289 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1311 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1290 * may return different values. This forces us to get_cpu() before 1312 * may return different values. This forces us to get_cpu() before
@@ -1346,6 +1368,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1346 put_cpu(); 1368 put_cpu();
1347} 1369}
1348 1370
1371#undef F
1372
1349static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1373static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1350 struct kvm_cpuid_entry2 __user *entries) 1374 struct kvm_cpuid_entry2 __user *entries)
1351{ 1375{
@@ -1417,8 +1441,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1417 return -ENXIO; 1441 return -ENXIO;
1418 vcpu_load(vcpu); 1442 vcpu_load(vcpu);
1419 1443
1420 set_bit(irq->irq, vcpu->arch.irq_pending); 1444 kvm_queue_interrupt(vcpu, irq->irq, false);
1421 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1422 1445
1423 vcpu_put(vcpu); 1446 vcpu_put(vcpu);
1424 1447
@@ -1580,8 +1603,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1580 r = -EINVAL; 1603 r = -EINVAL;
1581 } 1604 }
1582out: 1605out:
1583 if (lapic) 1606 kfree(lapic);
1584 kfree(lapic);
1585 return r; 1607 return r;
1586} 1608}
1587 1609
@@ -1602,10 +1624,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1602 return -EINVAL; 1624 return -EINVAL;
1603 1625
1604 down_write(&kvm->slots_lock); 1626 down_write(&kvm->slots_lock);
1627 spin_lock(&kvm->mmu_lock);
1605 1628
1606 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1629 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1607 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1630 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1608 1631
1632 spin_unlock(&kvm->mmu_lock);
1609 up_write(&kvm->slots_lock); 1633 up_write(&kvm->slots_lock);
1610 return 0; 1634 return 0;
1611} 1635}
@@ -1781,7 +1805,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1781 1805
1782 /* If nothing is dirty, don't bother messing with page tables. */ 1806 /* If nothing is dirty, don't bother messing with page tables. */
1783 if (is_dirty) { 1807 if (is_dirty) {
1808 spin_lock(&kvm->mmu_lock);
1784 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1809 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1810 spin_unlock(&kvm->mmu_lock);
1785 kvm_flush_remote_tlbs(kvm); 1811 kvm_flush_remote_tlbs(kvm);
1786 memslot = &kvm->memslots[log->slot]; 1812 memslot = &kvm->memslots[log->slot];
1787 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1813 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
@@ -2356,7 +2382,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2356 u16 error_code, 2382 u16 error_code,
2357 int emulation_type) 2383 int emulation_type)
2358{ 2384{
2359 int r; 2385 int r, shadow_mask;
2360 struct decode_cache *c; 2386 struct decode_cache *c;
2361 2387
2362 kvm_clear_exception_queue(vcpu); 2388 kvm_clear_exception_queue(vcpu);
@@ -2404,7 +2430,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2404 } 2430 }
2405 } 2431 }
2406 2432
2433 if (emulation_type & EMULTYPE_SKIP) {
2434 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
2435 return EMULATE_DONE;
2436 }
2437
2407 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2438 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2439 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
2440
2441 if (r == 0)
2442 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
2408 2443
2409 if (vcpu->arch.pio.string) 2444 if (vcpu->arch.pio.string)
2410 return EMULATE_DO_MMIO; 2445 return EMULATE_DO_MMIO;
@@ -2757,7 +2792,7 @@ int kvm_arch_init(void *opaque)
2757 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2792 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2758 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2793 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2759 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2794 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2760 PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); 2795 PT_DIRTY_MASK, PT64_NX_MASK, 0);
2761 2796
2762 for_each_possible_cpu(cpu) 2797 for_each_possible_cpu(cpu)
2763 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 2798 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
@@ -3008,6 +3043,16 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3008 return best; 3043 return best;
3009} 3044}
3010 3045
3046int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3047{
3048 struct kvm_cpuid_entry2 *best;
3049
3050 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
3051 if (best)
3052 return best->eax & 0xff;
3053 return 36;
3054}
3055
3011void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 3056void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3012{ 3057{
3013 u32 function, index; 3058 u32 function, index;
@@ -3044,10 +3089,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3044static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3089static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
3045 struct kvm_run *kvm_run) 3090 struct kvm_run *kvm_run)
3046{ 3091{
3047 return (!vcpu->arch.irq_summary && 3092 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3048 kvm_run->request_interrupt_window && 3093 kvm_run->request_interrupt_window &&
3049 vcpu->arch.interrupt_window_open && 3094 kvm_arch_interrupt_allowed(vcpu));
3050 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
3051} 3095}
3052 3096
3053static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3097static void post_kvm_run_save(struct kvm_vcpu *vcpu,
@@ -3060,8 +3104,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
3060 kvm_run->ready_for_interrupt_injection = 1; 3104 kvm_run->ready_for_interrupt_injection = 1;
3061 else 3105 else
3062 kvm_run->ready_for_interrupt_injection = 3106 kvm_run->ready_for_interrupt_injection =
3063 (vcpu->arch.interrupt_window_open && 3107 kvm_arch_interrupt_allowed(vcpu) &&
3064 vcpu->arch.irq_summary == 0); 3108 !kvm_cpu_has_interrupt(vcpu) &&
3109 !kvm_event_needs_reinjection(vcpu);
3065} 3110}
3066 3111
3067static void vapic_enter(struct kvm_vcpu *vcpu) 3112static void vapic_enter(struct kvm_vcpu *vcpu)
@@ -3090,9 +3135,63 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
3090 up_read(&vcpu->kvm->slots_lock); 3135 up_read(&vcpu->kvm->slots_lock);
3091} 3136}
3092 3137
3138static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3139{
3140 int max_irr, tpr;
3141
3142 if (!kvm_x86_ops->update_cr8_intercept)
3143 return;
3144
3145 if (!vcpu->arch.apic->vapic_addr)
3146 max_irr = kvm_lapic_find_highest_irr(vcpu);
3147 else
3148 max_irr = -1;
3149
3150 if (max_irr != -1)
3151 max_irr >>= 4;
3152
3153 tpr = kvm_lapic_get_cr8(vcpu);
3154
3155 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3156}
3157
3158static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3159{
3160 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3161 kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
3162
3163 /* try to reinject previous events if any */
3164 if (vcpu->arch.nmi_injected) {
3165 kvm_x86_ops->set_nmi(vcpu);
3166 return;
3167 }
3168
3169 if (vcpu->arch.interrupt.pending) {
3170 kvm_x86_ops->set_irq(vcpu);
3171 return;
3172 }
3173
3174 /* try to inject new event if pending */
3175 if (vcpu->arch.nmi_pending) {
3176 if (kvm_x86_ops->nmi_allowed(vcpu)) {
3177 vcpu->arch.nmi_pending = false;
3178 vcpu->arch.nmi_injected = true;
3179 kvm_x86_ops->set_nmi(vcpu);
3180 }
3181 } else if (kvm_cpu_has_interrupt(vcpu)) {
3182 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
3183 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
3184 false);
3185 kvm_x86_ops->set_irq(vcpu);
3186 }
3187 }
3188}
3189
3093static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3190static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3094{ 3191{
3095 int r; 3192 int r;
3193 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3194 kvm_run->request_interrupt_window;
3096 3195
3097 if (vcpu->requests) 3196 if (vcpu->requests)
3098 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3197 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3124,9 +3223,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3124 } 3223 }
3125 } 3224 }
3126 3225
3127 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3128 kvm_inject_pending_timer_irqs(vcpu);
3129
3130 preempt_disable(); 3226 preempt_disable();
3131 3227
3132 kvm_x86_ops->prepare_guest_switch(vcpu); 3228 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -3134,6 +3230,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3134 3230
3135 local_irq_disable(); 3231 local_irq_disable();
3136 3232
3233 clear_bit(KVM_REQ_KICK, &vcpu->requests);
3234 smp_mb__after_clear_bit();
3235
3137 if (vcpu->requests || need_resched() || signal_pending(current)) { 3236 if (vcpu->requests || need_resched() || signal_pending(current)) {
3138 local_irq_enable(); 3237 local_irq_enable();
3139 preempt_enable(); 3238 preempt_enable();
@@ -3141,21 +3240,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3141 goto out; 3240 goto out;
3142 } 3241 }
3143 3242
3144 vcpu->guest_mode = 1;
3145 /*
3146 * Make sure that guest_mode assignment won't happen after
3147 * testing the pending IRQ vector bitmap.
3148 */
3149 smp_wmb();
3150
3151 if (vcpu->arch.exception.pending) 3243 if (vcpu->arch.exception.pending)
3152 __queue_exception(vcpu); 3244 __queue_exception(vcpu);
3153 else if (irqchip_in_kernel(vcpu->kvm))
3154 kvm_x86_ops->inject_pending_irq(vcpu);
3155 else 3245 else
3156 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); 3246 inject_pending_irq(vcpu, kvm_run);
3247
3248 /* enable NMI/IRQ window open exits if needed */
3249 if (vcpu->arch.nmi_pending)
3250 kvm_x86_ops->enable_nmi_window(vcpu);
3251 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
3252 kvm_x86_ops->enable_irq_window(vcpu);
3157 3253
3158 kvm_lapic_sync_to_vapic(vcpu); 3254 if (kvm_lapic_enabled(vcpu)) {
3255 update_cr8_intercept(vcpu);
3256 kvm_lapic_sync_to_vapic(vcpu);
3257 }
3159 3258
3160 up_read(&vcpu->kvm->slots_lock); 3259 up_read(&vcpu->kvm->slots_lock);
3161 3260
@@ -3189,7 +3288,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3189 set_debugreg(vcpu->arch.host_dr6, 6); 3288 set_debugreg(vcpu->arch.host_dr6, 6);
3190 set_debugreg(vcpu->arch.host_dr7, 7); 3289 set_debugreg(vcpu->arch.host_dr7, 7);
3191 3290
3192 vcpu->guest_mode = 0; 3291 set_bit(KVM_REQ_KICK, &vcpu->requests);
3193 local_irq_enable(); 3292 local_irq_enable();
3194 3293
3195 ++vcpu->stat.exits; 3294 ++vcpu->stat.exits;
@@ -3216,8 +3315,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3216 profile_hit(KVM_PROFILING, (void *)rip); 3315 profile_hit(KVM_PROFILING, (void *)rip);
3217 } 3316 }
3218 3317
3219 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
3220 vcpu->arch.exception.pending = false;
3221 3318
3222 kvm_lapic_sync_from_vapic(vcpu); 3319 kvm_lapic_sync_from_vapic(vcpu);
3223 3320
@@ -3226,6 +3323,7 @@ out:
3226 return r; 3323 return r;
3227} 3324}
3228 3325
3326
3229static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3327static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3230{ 3328{
3231 int r; 3329 int r;
@@ -3252,29 +3350,42 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3252 kvm_vcpu_block(vcpu); 3350 kvm_vcpu_block(vcpu);
3253 down_read(&vcpu->kvm->slots_lock); 3351 down_read(&vcpu->kvm->slots_lock);
3254 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 3352 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3255 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3353 {
3354 switch(vcpu->arch.mp_state) {
3355 case KVM_MP_STATE_HALTED:
3256 vcpu->arch.mp_state = 3356 vcpu->arch.mp_state =
3257 KVM_MP_STATE_RUNNABLE; 3357 KVM_MP_STATE_RUNNABLE;
3258 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) 3358 case KVM_MP_STATE_RUNNABLE:
3259 r = -EINTR; 3359 break;
3360 case KVM_MP_STATE_SIPI_RECEIVED:
3361 default:
3362 r = -EINTR;
3363 break;
3364 }
3365 }
3260 } 3366 }
3261 3367
3262 if (r > 0) { 3368 if (r <= 0)
3263 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 3369 break;
3264 r = -EINTR; 3370
3265 kvm_run->exit_reason = KVM_EXIT_INTR; 3371 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3266 ++vcpu->stat.request_irq_exits; 3372 if (kvm_cpu_has_pending_timer(vcpu))
3267 } 3373 kvm_inject_pending_timer_irqs(vcpu);
3268 if (signal_pending(current)) { 3374
3269 r = -EINTR; 3375 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3270 kvm_run->exit_reason = KVM_EXIT_INTR; 3376 r = -EINTR;
3271 ++vcpu->stat.signal_exits; 3377 kvm_run->exit_reason = KVM_EXIT_INTR;
3272 } 3378 ++vcpu->stat.request_irq_exits;
3273 if (need_resched()) { 3379 }
3274 up_read(&vcpu->kvm->slots_lock); 3380 if (signal_pending(current)) {
3275 kvm_resched(vcpu); 3381 r = -EINTR;
3276 down_read(&vcpu->kvm->slots_lock); 3382 kvm_run->exit_reason = KVM_EXIT_INTR;
3277 } 3383 ++vcpu->stat.signal_exits;
3384 }
3385 if (need_resched()) {
3386 up_read(&vcpu->kvm->slots_lock);
3387 kvm_resched(vcpu);
3388 down_read(&vcpu->kvm->slots_lock);
3278 } 3389 }
3279 } 3390 }
3280 3391
@@ -3438,7 +3549,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3438 struct kvm_sregs *sregs) 3549 struct kvm_sregs *sregs)
3439{ 3550{
3440 struct descriptor_table dt; 3551 struct descriptor_table dt;
3441 int pending_vec;
3442 3552
3443 vcpu_load(vcpu); 3553 vcpu_load(vcpu);
3444 3554
@@ -3468,16 +3578,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3468 sregs->efer = vcpu->arch.shadow_efer; 3578 sregs->efer = vcpu->arch.shadow_efer;
3469 sregs->apic_base = kvm_get_apic_base(vcpu); 3579 sregs->apic_base = kvm_get_apic_base(vcpu);
3470 3580
3471 if (irqchip_in_kernel(vcpu->kvm)) { 3581 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
3472 memset(sregs->interrupt_bitmap, 0, 3582
3473 sizeof sregs->interrupt_bitmap); 3583 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
3474 pending_vec = kvm_x86_ops->get_irq(vcpu); 3584 set_bit(vcpu->arch.interrupt.nr,
3475 if (pending_vec >= 0) 3585 (unsigned long *)sregs->interrupt_bitmap);
3476 set_bit(pending_vec,
3477 (unsigned long *)sregs->interrupt_bitmap);
3478 } else
3479 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
3480 sizeof sregs->interrupt_bitmap);
3481 3586
3482 vcpu_put(vcpu); 3587 vcpu_put(vcpu);
3483 3588
@@ -3684,7 +3789,6 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3684 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 3789 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3685 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 3790 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3686 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3791 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3687 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3688} 3792}
3689 3793
3690static int load_state_from_tss32(struct kvm_vcpu *vcpu, 3794static int load_state_from_tss32(struct kvm_vcpu *vcpu,
@@ -3781,8 +3885,8 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3781} 3885}
3782 3886
3783static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3887static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3784 u32 old_tss_base, 3888 u16 old_tss_sel, u32 old_tss_base,
3785 struct desc_struct *nseg_desc) 3889 struct desc_struct *nseg_desc)
3786{ 3890{
3787 struct tss_segment_16 tss_segment_16; 3891 struct tss_segment_16 tss_segment_16;
3788 int ret = 0; 3892 int ret = 0;
@@ -3801,6 +3905,16 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3801 &tss_segment_16, sizeof tss_segment_16)) 3905 &tss_segment_16, sizeof tss_segment_16))
3802 goto out; 3906 goto out;
3803 3907
3908 if (old_tss_sel != 0xffff) {
3909 tss_segment_16.prev_task_link = old_tss_sel;
3910
3911 if (kvm_write_guest(vcpu->kvm,
3912 get_tss_base_addr(vcpu, nseg_desc),
3913 &tss_segment_16.prev_task_link,
3914 sizeof tss_segment_16.prev_task_link))
3915 goto out;
3916 }
3917
3804 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3918 if (load_state_from_tss16(vcpu, &tss_segment_16))
3805 goto out; 3919 goto out;
3806 3920
@@ -3810,7 +3924,7 @@ out:
3810} 3924}
3811 3925
3812static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3926static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3813 u32 old_tss_base, 3927 u16 old_tss_sel, u32 old_tss_base,
3814 struct desc_struct *nseg_desc) 3928 struct desc_struct *nseg_desc)
3815{ 3929{
3816 struct tss_segment_32 tss_segment_32; 3930 struct tss_segment_32 tss_segment_32;
@@ -3830,6 +3944,16 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3830 &tss_segment_32, sizeof tss_segment_32)) 3944 &tss_segment_32, sizeof tss_segment_32))
3831 goto out; 3945 goto out;
3832 3946
3947 if (old_tss_sel != 0xffff) {
3948 tss_segment_32.prev_task_link = old_tss_sel;
3949
3950 if (kvm_write_guest(vcpu->kvm,
3951 get_tss_base_addr(vcpu, nseg_desc),
3952 &tss_segment_32.prev_task_link,
3953 sizeof tss_segment_32.prev_task_link))
3954 goto out;
3955 }
3956
3833 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3957 if (load_state_from_tss32(vcpu, &tss_segment_32))
3834 goto out; 3958 goto out;
3835 3959
@@ -3883,14 +4007,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3883 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4007 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
3884 } 4008 }
3885 4009
3886 kvm_x86_ops->skip_emulated_instruction(vcpu); 4010 /* set back link to prev task only if NT bit is set in eflags
 4011 note that old_tss_sel is not used after this point */
4012 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4013 old_tss_sel = 0xffff;
4014
4015 /* set back link to prev task only if NT bit is set in eflags
 4016 note that old_tss_sel is not used after this point */
4017 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4018 old_tss_sel = 0xffff;
3887 4019
3888 if (nseg_desc.type & 8) 4020 if (nseg_desc.type & 8)
3889 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, 4021 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
3890 &nseg_desc); 4022 old_tss_base, &nseg_desc);
3891 else 4023 else
3892 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, 4024 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
3893 &nseg_desc); 4025 old_tss_base, &nseg_desc);
3894 4026
3895 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4027 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
3896 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4028 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
@@ -3916,7 +4048,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3916 struct kvm_sregs *sregs) 4048 struct kvm_sregs *sregs)
3917{ 4049{
3918 int mmu_reset_needed = 0; 4050 int mmu_reset_needed = 0;
3919 int i, pending_vec, max_bits; 4051 int pending_vec, max_bits;
3920 struct descriptor_table dt; 4052 struct descriptor_table dt;
3921 4053
3922 vcpu_load(vcpu); 4054 vcpu_load(vcpu);
@@ -3930,7 +4062,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3930 4062
3931 vcpu->arch.cr2 = sregs->cr2; 4063 vcpu->arch.cr2 = sregs->cr2;
3932 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 4064 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
3933 vcpu->arch.cr3 = sregs->cr3; 4065
4066 down_read(&vcpu->kvm->slots_lock);
4067 if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
4068 vcpu->arch.cr3 = sregs->cr3;
4069 else
4070 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
4071 up_read(&vcpu->kvm->slots_lock);
3934 4072
3935 kvm_set_cr8(vcpu, sregs->cr8); 4073 kvm_set_cr8(vcpu, sregs->cr8);
3936 4074
@@ -3952,25 +4090,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3952 if (mmu_reset_needed) 4090 if (mmu_reset_needed)
3953 kvm_mmu_reset_context(vcpu); 4091 kvm_mmu_reset_context(vcpu);
3954 4092
3955 if (!irqchip_in_kernel(vcpu->kvm)) { 4093 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3956 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, 4094 pending_vec = find_first_bit(
3957 sizeof vcpu->arch.irq_pending); 4095 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
3958 vcpu->arch.irq_summary = 0; 4096 if (pending_vec < max_bits) {
3959 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) 4097 kvm_queue_interrupt(vcpu, pending_vec, false);
3960 if (vcpu->arch.irq_pending[i]) 4098 pr_debug("Set back pending irq %d\n", pending_vec);
3961 __set_bit(i, &vcpu->arch.irq_summary); 4099 if (irqchip_in_kernel(vcpu->kvm))
3962 } else { 4100 kvm_pic_clear_isr_ack(vcpu->kvm);
3963 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3964 pending_vec = find_first_bit(
3965 (const unsigned long *)sregs->interrupt_bitmap,
3966 max_bits);
3967 /* Only pending external irq is handled here */
3968 if (pending_vec < max_bits) {
3969 kvm_x86_ops->set_irq(vcpu, pending_vec);
3970 pr_debug("Set back pending irq %d\n",
3971 pending_vec);
3972 }
3973 kvm_pic_clear_isr_ack(vcpu->kvm);
3974 } 4101 }
3975 4102
3976 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 4103 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -4304,7 +4431,6 @@ struct kvm *kvm_arch_create_vm(void)
4304 return ERR_PTR(-ENOMEM); 4431 return ERR_PTR(-ENOMEM);
4305 4432
4306 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4433 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4307 INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
4308 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 4434 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4309 4435
4310 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 4436 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4407,12 +4533,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4407 } 4533 }
4408 } 4534 }
4409 4535
4536 spin_lock(&kvm->mmu_lock);
4410 if (!kvm->arch.n_requested_mmu_pages) { 4537 if (!kvm->arch.n_requested_mmu_pages) {
4411 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 4538 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4412 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 4539 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4413 } 4540 }
4414 4541
4415 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 4542 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4543 spin_unlock(&kvm->mmu_lock);
4416 kvm_flush_remote_tlbs(kvm); 4544 kvm_flush_remote_tlbs(kvm);
4417 4545
4418 return 0; 4546 return 0;
@@ -4421,6 +4549,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4421void kvm_arch_flush_shadow(struct kvm *kvm) 4549void kvm_arch_flush_shadow(struct kvm *kvm)
4422{ 4550{
4423 kvm_mmu_zap_all(kvm); 4551 kvm_mmu_zap_all(kvm);
4552 kvm_reload_remote_mmus(kvm);
4424} 4553}
4425 4554
4426int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4555int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
@@ -4430,28 +4559,24 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4430 || vcpu->arch.nmi_pending; 4559 || vcpu->arch.nmi_pending;
4431} 4560}
4432 4561
4433static void vcpu_kick_intr(void *info)
4434{
4435#ifdef DEBUG
4436 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
4437 printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
4438#endif
4439}
4440
4441void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 4562void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4442{ 4563{
4443 int ipi_pcpu = vcpu->cpu; 4564 int me;
4444 int cpu = get_cpu(); 4565 int cpu = vcpu->cpu;
4445 4566
4446 if (waitqueue_active(&vcpu->wq)) { 4567 if (waitqueue_active(&vcpu->wq)) {
4447 wake_up_interruptible(&vcpu->wq); 4568 wake_up_interruptible(&vcpu->wq);
4448 ++vcpu->stat.halt_wakeup; 4569 ++vcpu->stat.halt_wakeup;
4449 } 4570 }
4450 /* 4571
4451 * We may be called synchronously with irqs disabled in guest mode, 4572 me = get_cpu();
4452 * So need not to call smp_call_function_single() in that case. 4573 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
4453 */ 4574 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
4454 if (vcpu->guest_mode && vcpu->cpu != cpu) 4575 smp_send_reschedule(cpu);
4455 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
4456 put_cpu(); 4576 put_cpu();
4457} 4577}
4578
4579int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4580{
4581 return kvm_x86_ops->interrupt_allowed(vcpu);
4582}
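
The cpuid_maxphyaddr() helper added above reads the guest's physical-address width from CPUID leaf 0x80000008 (EAX bits 7:0) and falls back to 36 bits. A minimal sketch, in plain C, of how such a width becomes a reserved-bit mask for 64-bit page-table entries; rsvd_bits_mask() is an illustration only, not a function from this patch:

#include <stdint.h>
#include <stdio.h>

/* Bits [maxphyaddr, 51] of a 64-bit PTE must be zero; anything set there
 * is a reserved-bit violation. */
static uint64_t rsvd_bits_mask(int maxphyaddr)
{
	return ((1ULL << 52) - 1) & ~((1ULL << maxphyaddr) - 1);
}

int main(void)
{
	printf("maxphyaddr 36 -> reserved PTE bits %#llx\n",
	       (unsigned long long)rsvd_bits_mask(36));
	return 0;
}
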
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6a4be78a738..4c8e10af78e 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,9 +8,11 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
8 vcpu->arch.exception.pending = false; 8 vcpu->arch.exception.pending = false;
9} 9}
10 10
11static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) 11static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
12 bool soft)
12{ 13{
13 vcpu->arch.interrupt.pending = true; 14 vcpu->arch.interrupt.pending = true;
15 vcpu->arch.interrupt.soft = soft;
14 vcpu->arch.interrupt.nr = vector; 16 vcpu->arch.interrupt.nr = vector;
15} 17}
16 18
@@ -19,4 +21,14 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
19 vcpu->arch.interrupt.pending = false; 21 vcpu->arch.interrupt.pending = false;
20} 22}
21 23
24static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
25{
26 return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
27 vcpu->arch.nmi_injected;
28}
29
30static inline bool kvm_exception_is_soft(unsigned int nr)
31{
32 return (nr == BP_VECTOR) || (nr == OF_VECTOR);
33}
22#endif 34#endif
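
kvm_exception_is_soft() above singles out #BP and #OF because they are raised by an instruction the guest itself executed (int3 and into), so reinjecting them also requires supplying the length of that instruction, unlike a hardware exception such as a page fault. A toy classification sketch in plain C (the vector numbers are the architectural ones):

#include <stdio.h>

#define BP_VECTOR 3	/* #BP, raised by int3 */
#define OF_VECTOR 4	/* #OF, raised by into */
#define PF_VECTOR 14	/* #PF, a hardware exception */

static int needs_insn_len(unsigned int vector)
{
	/* software exceptions carry an instruction length on reinjection */
	return vector == BP_VECTOR || vector == OF_VECTOR;
}

int main(void)
{
	printf("#BP:%d #OF:%d #PF:%d\n",
	       needs_insn_len(BP_VECTOR), needs_insn_len(OF_VECTOR),
	       needs_insn_len(PF_VECTOR));
	return 0;
}
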
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ca91749d208..c1b6c232e02 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -59,13 +59,14 @@
59#define SrcImm (5<<4) /* Immediate operand. */ 59#define SrcImm (5<<4) /* Immediate operand. */
60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ 60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
61#define SrcOne (7<<4) /* Implied '1' */ 61#define SrcOne (7<<4) /* Implied '1' */
62#define SrcMask (7<<4) 62#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
63#define SrcMask (0xf<<4)
63/* Generic ModRM decode. */ 64/* Generic ModRM decode. */
64#define ModRM (1<<7) 65#define ModRM (1<<8)
65/* Destination is only written; never read. */ 66/* Destination is only written; never read. */
66#define Mov (1<<8) 67#define Mov (1<<9)
67#define BitOp (1<<9) 68#define BitOp (1<<10)
68#define MemAbs (1<<10) /* Memory operand is absolute displacement */ 69#define MemAbs (1<<11) /* Memory operand is absolute displacement */
69#define String (1<<12) /* String instruction (rep capable) */ 70#define String (1<<12) /* String instruction (rep capable) */
70#define Stack (1<<13) /* Stack instruction (push/pop) */ 71#define Stack (1<<13) /* Stack instruction (push/pop) */
71#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 72#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
@@ -76,6 +77,7 @@
76#define Src2CL (1<<29) 77#define Src2CL (1<<29)
77#define Src2ImmByte (2<<29) 78#define Src2ImmByte (2<<29)
78#define Src2One (3<<29) 79#define Src2One (3<<29)
80#define Src2Imm16 (4<<29)
79#define Src2Mask (7<<29) 81#define Src2Mask (7<<29)
80 82
81enum { 83enum {
@@ -135,11 +137,11 @@ static u32 opcode_table[256] = {
135 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 137 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
136 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 138 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
137 /* 0x70 - 0x77 */ 139 /* 0x70 - 0x77 */
138 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 140 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
139 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 141 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
140 /* 0x78 - 0x7F */ 142 /* 0x78 - 0x7F */
141 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 143 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
142 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 144 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
143 /* 0x80 - 0x87 */ 145 /* 0x80 - 0x87 */
144 Group | Group1_80, Group | Group1_81, 146 Group | Group1_80, Group | Group1_81,
145 Group | Group1_82, Group | Group1_83, 147 Group | Group1_82, Group | Group1_83,
@@ -153,7 +155,8 @@ static u32 opcode_table[256] = {
153 /* 0x90 - 0x97 */ 155 /* 0x90 - 0x97 */
154 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 156 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
155 /* 0x98 - 0x9F */ 157 /* 0x98 - 0x9F */
156 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 158 0, 0, SrcImm | Src2Imm16, 0,
159 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
157 /* 0xA0 - 0xA7 */ 160 /* 0xA0 - 0xA7 */
158 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 161 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
159 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 162 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
@@ -178,7 +181,8 @@ static u32 opcode_table[256] = {
178 0, ImplicitOps | Stack, 0, 0, 181 0, ImplicitOps | Stack, 0, 0,
179 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, 182 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
180 /* 0xC8 - 0xCF */ 183 /* 0xC8 - 0xCF */
181 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0, 184 0, 0, 0, ImplicitOps | Stack,
185 ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps,
182 /* 0xD0 - 0xD7 */ 186 /* 0xD0 - 0xD7 */
183 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 187 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
184 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 188 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -187,11 +191,11 @@ static u32 opcode_table[256] = {
187 0, 0, 0, 0, 0, 0, 0, 0, 191 0, 0, 0, 0, 0, 0, 0, 0,
188 /* 0xE0 - 0xE7 */ 192 /* 0xE0 - 0xE7 */
189 0, 0, 0, 0, 193 0, 0, 0, 0,
190 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 194 ByteOp | SrcImmUByte, SrcImmUByte,
191 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 195 ByteOp | SrcImmUByte, SrcImmUByte,
192 /* 0xE8 - 0xEF */ 196 /* 0xE8 - 0xEF */
193 ImplicitOps | Stack, SrcImm | ImplicitOps, 197 SrcImm | Stack, SrcImm | ImplicitOps,
194 ImplicitOps, SrcImmByte | ImplicitOps, 198 SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
195 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 199 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
196 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 200 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
197 /* 0xF0 - 0xF7 */ 201 /* 0xF0 - 0xF7 */
@@ -230,10 +234,8 @@ static u32 twobyte_table[256] = {
230 /* 0x70 - 0x7F */ 234 /* 0x70 - 0x7F */
231 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 235 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
232 /* 0x80 - 0x8F */ 236 /* 0x80 - 0x8F */
233 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 237 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
234 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 238 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
235 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
236 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
237 /* 0x90 - 0x9F */ 239 /* 0x90 - 0x9F */
238 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 240 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
239 /* 0xA0 - 0xA7 */ 241 /* 0xA0 - 0xA7 */
@@ -1044,10 +1046,14 @@ done_prefixes:
1044 } 1046 }
1045 break; 1047 break;
1046 case SrcImmByte: 1048 case SrcImmByte:
1049 case SrcImmUByte:
1047 c->src.type = OP_IMM; 1050 c->src.type = OP_IMM;
1048 c->src.ptr = (unsigned long *)c->eip; 1051 c->src.ptr = (unsigned long *)c->eip;
1049 c->src.bytes = 1; 1052 c->src.bytes = 1;
1050 c->src.val = insn_fetch(s8, 1, c->eip); 1053 if ((c->d & SrcMask) == SrcImmByte)
1054 c->src.val = insn_fetch(s8, 1, c->eip);
1055 else
1056 c->src.val = insn_fetch(u8, 1, c->eip);
1051 break; 1057 break;
1052 case SrcOne: 1058 case SrcOne:
1053 c->src.bytes = 1; 1059 c->src.bytes = 1;
@@ -1072,6 +1078,12 @@ done_prefixes:
1072 c->src2.bytes = 1; 1078 c->src2.bytes = 1;
1073 c->src2.val = insn_fetch(u8, 1, c->eip); 1079 c->src2.val = insn_fetch(u8, 1, c->eip);
1074 break; 1080 break;
1081 case Src2Imm16:
1082 c->src2.type = OP_IMM;
1083 c->src2.ptr = (unsigned long *)c->eip;
1084 c->src2.bytes = 2;
1085 c->src2.val = insn_fetch(u16, 2, c->eip);
1086 break;
1075 case Src2One: 1087 case Src2One:
1076 c->src2.bytes = 1; 1088 c->src2.bytes = 1;
1077 c->src2.val = 1; 1089 c->src2.val = 1;
@@ -1349,6 +1361,20 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1349 return 0; 1361 return 0;
1350} 1362}
1351 1363
1364void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
1365{
1366 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
1367 /*
 1368 * an sti; sti; sequence only disables interrupts for the first
1369 * instruction. So, if the last instruction, be it emulated or
1370 * not, left the system with the INT_STI flag enabled, it
1371 * means that the last instruction is an sti. We should not
1372 * leave the flag on in this case. The same goes for mov ss
1373 */
1374 if (!(int_shadow & mask))
1375 ctxt->interruptibility = mask;
1376}
1377
1352int 1378int
1353x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 1379x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1354{ 1380{
@@ -1360,6 +1386,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1360 int io_dir_in; 1386 int io_dir_in;
1361 int rc = 0; 1387 int rc = 0;
1362 1388
1389 ctxt->interruptibility = 0;
1390
1363 /* Shadow copy of register state. Committed on successful emulation. 1391 /* Shadow copy of register state. Committed on successful emulation.
1364 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't 1392 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
1365 * modify them. 1393 * modify them.
@@ -1531,13 +1559,10 @@ special_insn:
1531 return -1; 1559 return -1;
1532 } 1560 }
1533 return 0; 1561 return 0;
1534 case 0x70 ... 0x7f: /* jcc (short) */ { 1562 case 0x70 ... 0x7f: /* jcc (short) */
1535 int rel = insn_fetch(s8, 1, c->eip);
1536
1537 if (test_cc(c->b, ctxt->eflags)) 1563 if (test_cc(c->b, ctxt->eflags))
1538 jmp_rel(c, rel); 1564 jmp_rel(c, c->src.val);
1539 break; 1565 break;
1540 }
1541 case 0x80 ... 0x83: /* Grp1 */ 1566 case 0x80 ... 0x83: /* Grp1 */
1542 switch (c->modrm_reg) { 1567 switch (c->modrm_reg) {
1543 case 0: 1568 case 0:
@@ -1609,6 +1634,9 @@ special_insn:
1609 int err; 1634 int err;
1610 1635
1611 sel = c->src.val; 1636 sel = c->src.val;
1637 if (c->modrm_reg == VCPU_SREG_SS)
1638 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
1639
1612 if (c->modrm_reg <= 5) { 1640 if (c->modrm_reg <= 5) {
1613 type_bits = (c->modrm_reg == 1) ? 9 : 1; 1641 type_bits = (c->modrm_reg == 1) ? 9 : 1;
1614 err = kvm_load_segment_descriptor(ctxt->vcpu, sel, 1642 err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
@@ -1769,59 +1797,32 @@ special_insn:
1769 break; 1797 break;
1770 case 0xe4: /* inb */ 1798 case 0xe4: /* inb */
1771 case 0xe5: /* in */ 1799 case 0xe5: /* in */
1772 port = insn_fetch(u8, 1, c->eip); 1800 port = c->src.val;
1773 io_dir_in = 1; 1801 io_dir_in = 1;
1774 goto do_io; 1802 goto do_io;
1775 case 0xe6: /* outb */ 1803 case 0xe6: /* outb */
1776 case 0xe7: /* out */ 1804 case 0xe7: /* out */
1777 port = insn_fetch(u8, 1, c->eip); 1805 port = c->src.val;
1778 io_dir_in = 0; 1806 io_dir_in = 0;
1779 goto do_io; 1807 goto do_io;
1780 case 0xe8: /* call (near) */ { 1808 case 0xe8: /* call (near) */ {
1781 long int rel; 1809 long int rel = c->src.val;
1782 switch (c->op_bytes) {
1783 case 2:
1784 rel = insn_fetch(s16, 2, c->eip);
1785 break;
1786 case 4:
1787 rel = insn_fetch(s32, 4, c->eip);
1788 break;
1789 default:
1790 DPRINTF("Call: Invalid op_bytes\n");
1791 goto cannot_emulate;
1792 }
1793 c->src.val = (unsigned long) c->eip; 1810 c->src.val = (unsigned long) c->eip;
1794 jmp_rel(c, rel); 1811 jmp_rel(c, rel);
1795 c->op_bytes = c->ad_bytes;
1796 emulate_push(ctxt); 1812 emulate_push(ctxt);
1797 break; 1813 break;
1798 } 1814 }
1799 case 0xe9: /* jmp rel */ 1815 case 0xe9: /* jmp rel */
1800 goto jmp; 1816 goto jmp;
1801 case 0xea: /* jmp far */ { 1817 case 0xea: /* jmp far */
1802 uint32_t eip; 1818 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9,
1803 uint16_t sel; 1819 VCPU_SREG_CS) < 0) {
1804
1805 switch (c->op_bytes) {
1806 case 2:
1807 eip = insn_fetch(u16, 2, c->eip);
1808 break;
1809 case 4:
1810 eip = insn_fetch(u32, 4, c->eip);
1811 break;
1812 default:
1813 DPRINTF("jmp far: Invalid op_bytes\n");
1814 goto cannot_emulate;
1815 }
1816 sel = insn_fetch(u16, 2, c->eip);
1817 if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
1818 DPRINTF("jmp far: Failed to load CS descriptor\n"); 1820 DPRINTF("jmp far: Failed to load CS descriptor\n");
1819 goto cannot_emulate; 1821 goto cannot_emulate;
1820 } 1822 }
1821 1823
1822 c->eip = eip; 1824 c->eip = c->src.val;
1823 break; 1825 break;
1824 }
1825 case 0xeb: 1826 case 0xeb:
1826 jmp: /* jmp rel short */ 1827 jmp: /* jmp rel short */
1827 jmp_rel(c, c->src.val); 1828 jmp_rel(c, c->src.val);
@@ -1865,6 +1866,7 @@ special_insn:
1865 c->dst.type = OP_NONE; /* Disable writeback. */ 1866 c->dst.type = OP_NONE; /* Disable writeback. */
1866 break; 1867 break;
1867 case 0xfb: /* sti */ 1868 case 0xfb: /* sti */
1869 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
1868 ctxt->eflags |= X86_EFLAGS_IF; 1870 ctxt->eflags |= X86_EFLAGS_IF;
1869 c->dst.type = OP_NONE; /* Disable writeback. */ 1871 c->dst.type = OP_NONE; /* Disable writeback. */
1870 break; 1872 break;
@@ -2039,28 +2041,11 @@ twobyte_insn:
2039 if (!test_cc(c->b, ctxt->eflags)) 2041 if (!test_cc(c->b, ctxt->eflags))
2040 c->dst.type = OP_NONE; /* no writeback */ 2042 c->dst.type = OP_NONE; /* no writeback */
2041 break; 2043 break;
2042 case 0x80 ... 0x8f: /* jnz rel, etc*/ { 2044 case 0x80 ... 0x8f: /* jnz rel, etc*/
2043 long int rel;
2044
2045 switch (c->op_bytes) {
2046 case 2:
2047 rel = insn_fetch(s16, 2, c->eip);
2048 break;
2049 case 4:
2050 rel = insn_fetch(s32, 4, c->eip);
2051 break;
2052 case 8:
2053 rel = insn_fetch(s64, 8, c->eip);
2054 break;
2055 default:
2056 DPRINTF("jnz: Invalid op_bytes\n");
2057 goto cannot_emulate;
2058 }
2059 if (test_cc(c->b, ctxt->eflags)) 2045 if (test_cc(c->b, ctxt->eflags))
2060 jmp_rel(c, rel); 2046 jmp_rel(c, c->src.val);
2061 c->dst.type = OP_NONE; 2047 c->dst.type = OP_NONE;
2062 break; 2048 break;
2063 }
2064 case 0xa3: 2049 case 0xa3:
2065 bt: /* bt */ 2050 bt: /* bt */
2066 c->dst.type = OP_NONE; 2051 c->dst.type = OP_NONE;
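
The new SrcImmUByte decode above exists because one-byte immediates differ in signedness: a Jcc displacement must be sign-extended, while the port number of in/out must stay an unsigned byte. A standalone sketch in plain C (not emulator code) of the difference:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint8_t raw = 0x80;		/* the fetched immediate byte         */
	long jcc_disp = (int8_t)raw;	/* SrcImmByte:  sign-extended to -128 */
	long io_port  = (uint8_t)raw;	/* SrcImmUByte: zero-extended to  128 */

	printf("displacement=%ld port=%ld\n", jcc_disp, io_port);
	return 0;
}
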
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 8dab8f7844d..38718041efc 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@ config LGUEST_GUEST
2 bool "Lguest guest support" 2 bool "Lguest guest support"
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32 4 depends on X86_32
5 depends on !X86_PAE
6 select VIRTIO 5 select VIRTIO
7 select VIRTIO_RING 6 select VIRTIO_RING
8 select VIRTIO_CONSOLE 7 select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
index 27f0c9ed7f6..94e0e54056a 100644
--- a/arch/x86/lguest/Makefile
+++ b/arch/x86/lguest/Makefile
@@ -1 +1,2 @@
1obj-y := i386_head.o boot.o 1obj-y := i386_head.o boot.o
2CFLAGS_boot.o := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ca7ec44bafc..7bc65f0f62c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -67,6 +67,7 @@
67#include <asm/mce.h> 67#include <asm/mce.h>
68#include <asm/io.h> 68#include <asm/io.h>
69#include <asm/i387.h> 69#include <asm/i387.h>
70#include <asm/stackprotector.h>
70#include <asm/reboot.h> /* for struct machine_ops */ 71#include <asm/reboot.h> /* for struct machine_ops */
71 72
72/*G:010 Welcome to the Guest! 73/*G:010 Welcome to the Guest!
@@ -86,7 +87,7 @@ struct lguest_data lguest_data = {
86 87
87/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a 88/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a
 88 * ring buffer of stored hypercalls which the Host will run through next time we 89 * ring buffer of stored hypercalls which the Host will run through next time we
89 * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall 90 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
90 * arguments, and a "hcall_status" word which is 0 if the call is ready to go, 91 * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
91 * and 255 once the Host has finished with it. 92 * and 255 once the Host has finished with it.
92 * 93 *
@@ -95,7 +96,8 @@ struct lguest_data lguest_data = {
95 * effect of causing the Host to run all the stored calls in the ring buffer 96 * effect of causing the Host to run all the stored calls in the ring buffer
96 * which empties it for next time! */ 97 * which empties it for next time! */
97static void async_hcall(unsigned long call, unsigned long arg1, 98static void async_hcall(unsigned long call, unsigned long arg1,
98 unsigned long arg2, unsigned long arg3) 99 unsigned long arg2, unsigned long arg3,
100 unsigned long arg4)
99{ 101{
100 /* Note: This code assumes we're uniprocessor. */ 102 /* Note: This code assumes we're uniprocessor. */
101 static unsigned int next_call; 103 static unsigned int next_call;
@@ -107,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1,
107 local_irq_save(flags); 109 local_irq_save(flags);
108 if (lguest_data.hcall_status[next_call] != 0xFF) { 110 if (lguest_data.hcall_status[next_call] != 0xFF) {
109 /* Table full, so do normal hcall which will flush table. */ 111 /* Table full, so do normal hcall which will flush table. */
110 kvm_hypercall3(call, arg1, arg2, arg3); 112 kvm_hypercall4(call, arg1, arg2, arg3, arg4);
111 } else { 113 } else {
112 lguest_data.hcalls[next_call].arg0 = call; 114 lguest_data.hcalls[next_call].arg0 = call;
113 lguest_data.hcalls[next_call].arg1 = arg1; 115 lguest_data.hcalls[next_call].arg1 = arg1;
114 lguest_data.hcalls[next_call].arg2 = arg2; 116 lguest_data.hcalls[next_call].arg2 = arg2;
115 lguest_data.hcalls[next_call].arg3 = arg3; 117 lguest_data.hcalls[next_call].arg3 = arg3;
118 lguest_data.hcalls[next_call].arg4 = arg4;
116 /* Arguments must all be written before we mark it to go */ 119 /* Arguments must all be written before we mark it to go */
117 wmb(); 120 wmb();
118 lguest_data.hcall_status[next_call] = 0; 121 lguest_data.hcall_status[next_call] = 0;
@@ -140,7 +143,7 @@ static void lazy_hcall1(unsigned long call,
140 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 143 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
141 kvm_hypercall1(call, arg1); 144 kvm_hypercall1(call, arg1);
142 else 145 else
143 async_hcall(call, arg1, 0, 0); 146 async_hcall(call, arg1, 0, 0, 0);
144} 147}
145 148
146static void lazy_hcall2(unsigned long call, 149static void lazy_hcall2(unsigned long call,
@@ -150,7 +153,7 @@ static void lazy_hcall2(unsigned long call,
150 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 153 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
151 kvm_hypercall2(call, arg1, arg2); 154 kvm_hypercall2(call, arg1, arg2);
152 else 155 else
153 async_hcall(call, arg1, arg2, 0); 156 async_hcall(call, arg1, arg2, 0, 0);
154} 157}
155 158
156static void lazy_hcall3(unsigned long call, 159static void lazy_hcall3(unsigned long call,
@@ -161,18 +164,38 @@ static void lazy_hcall3(unsigned long call,
161 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 164 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
162 kvm_hypercall3(call, arg1, arg2, arg3); 165 kvm_hypercall3(call, arg1, arg2, arg3);
163 else 166 else
164 async_hcall(call, arg1, arg2, arg3); 167 async_hcall(call, arg1, arg2, arg3, 0);
165} 168}
166 169
170#ifdef CONFIG_X86_PAE
171static void lazy_hcall4(unsigned long call,
172 unsigned long arg1,
173 unsigned long arg2,
174 unsigned long arg3,
175 unsigned long arg4)
176{
177 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
178 kvm_hypercall4(call, arg1, arg2, arg3, arg4);
179 else
180 async_hcall(call, arg1, arg2, arg3, arg4);
181}
182#endif
183
167/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 184/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
168 * issue the do-nothing hypercall to flush any stored calls. */ 185 * issue the do-nothing hypercall to flush any stored calls. */
169static void lguest_leave_lazy_mode(void) 186static void lguest_leave_lazy_mmu_mode(void)
187{
188 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
189 paravirt_leave_lazy_mmu();
190}
191
192static void lguest_end_context_switch(struct task_struct *next)
170{ 193{
171 paravirt_leave_lazy(paravirt_get_lazy_mode());
172 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 194 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
195 paravirt_end_context_switch(next);
173} 196}
174 197
175/*G:033 198/*G:032
176 * After that diversion we return to our first native-instruction 199 * After that diversion we return to our first native-instruction
177 * replacements: four functions for interrupt control. 200 * replacements: four functions for interrupt control.
178 * 201 *
@@ -192,30 +215,28 @@ static unsigned long save_fl(void)
192{ 215{
193 return lguest_data.irq_enabled; 216 return lguest_data.irq_enabled;
194} 217}
195PV_CALLEE_SAVE_REGS_THUNK(save_fl);
196
197/* restore_flags() just sets the flags back to the value given. */
198static void restore_fl(unsigned long flags)
199{
200 lguest_data.irq_enabled = flags;
201}
202PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
203 218
204/* Interrupts go off... */ 219/* Interrupts go off... */
205static void irq_disable(void) 220static void irq_disable(void)
206{ 221{
207 lguest_data.irq_enabled = 0; 222 lguest_data.irq_enabled = 0;
208} 223}
224
225/* Let's pause a moment. Remember how I said these are called so often?
226 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
227 * break some rules. In particular, these functions are assumed to save their
228 * own registers if they need to: normal C functions assume they can trash the
229 * eax register. To use normal C functions, we use
230 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
231 * C function, then restores it. */
232PV_CALLEE_SAVE_REGS_THUNK(save_fl);
209PV_CALLEE_SAVE_REGS_THUNK(irq_disable); 233PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
234/*:*/
210 235
211/* Interrupts go on... */ 236/* These are in i386_head.S */
212static void irq_enable(void) 237extern void lg_irq_enable(void);
213{ 238extern void lg_restore_fl(unsigned long flags);
214 lguest_data.irq_enabled = X86_EFLAGS_IF;
215}
216PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
217 239
218/*:*/
219/*M:003 Note that we don't check for outstanding interrupts when we re-enable 240/*M:003 Note that we don't check for outstanding interrupts when we re-enable
220 * them (or when we unmask an interrupt). This seems to work for the moment, 241 * them (or when we unmask an interrupt). This seems to work for the moment,
221 * since interrupts are rare and we'll just get the interrupt on the next timer 242 * since interrupts are rare and we'll just get the interrupt on the next timer
@@ -361,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
361 case 1: /* Basic feature request. */ 382 case 1: /* Basic feature request. */
362 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 383 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
363 *cx &= 0x00002201; 384 *cx &= 0x00002201;
364 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ 385 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
365 *dx &= 0x07808111; 386 *dx &= 0x07808151;
366 /* The Host can do a nice optimization if it knows that the 387 /* The Host can do a nice optimization if it knows that the
367 * kernel mappings (addresses above 0xC0000000 or whatever 388 * kernel mappings (addresses above 0xC0000000 or whatever
368 * PAGE_OFFSET is set to) haven't changed. But Linux calls 389 * PAGE_OFFSET is set to) haven't changed. But Linux calls
@@ -381,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
381 if (*ax > 0x80000008) 402 if (*ax > 0x80000008)
382 *ax = 0x80000008; 403 *ax = 0x80000008;
383 break; 404 break;
405 case 0x80000001:
406 /* Here we should fix nx cap depending on host. */
407 /* For this version of PAE, we just clear NX bit. */
408 *dx &= ~(1 << 20);
409 break;
384 } 410 }
385} 411}
386 412
@@ -514,25 +540,52 @@ static void lguest_write_cr4(unsigned long val)
514static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 540static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
515 pte_t *ptep) 541 pte_t *ptep)
516{ 542{
543#ifdef CONFIG_X86_PAE
544 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
545 ptep->pte_low, ptep->pte_high);
546#else
517 lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); 547 lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
548#endif
518} 549}
519 550
520static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 551static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
521 pte_t *ptep, pte_t pteval) 552 pte_t *ptep, pte_t pteval)
522{ 553{
523 *ptep = pteval; 554 native_set_pte(ptep, pteval);
524 lguest_pte_update(mm, addr, ptep); 555 lguest_pte_update(mm, addr, ptep);
525} 556}
526 557
527/* The Guest calls this to set a top-level entry. Again, we set the entry then 558/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
528 * tell the Host which top-level page we changed, and the index of the entry we 559 * to set a middle-level entry when PAE is activated.
529 * changed. */ 560 * Again, we set the entry then tell the Host which page we changed,
561 * and the index of the entry we changed. */
562#ifdef CONFIG_X86_PAE
563static void lguest_set_pud(pud_t *pudp, pud_t pudval)
564{
565 native_set_pud(pudp, pudval);
566
567 /* 32 bytes aligned pdpt address and the index. */
568 lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
569 (__pa(pudp) & 0x1F) / sizeof(pud_t));
570}
571
530static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 572static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
531{ 573{
532 *pmdp = pmdval; 574 native_set_pmd(pmdp, pmdval);
533 lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, 575 lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
534 (__pa(pmdp) & (PAGE_SIZE - 1)) / 4); 576 (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
535} 577}
578#else
579
580/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
581 * activated. */
582static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
583{
584 native_set_pmd(pmdp, pmdval);
585 lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
586 (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
587}
588#endif
536 589
537/* There are a couple of legacy places where the kernel sets a PTE, but we 590/* There are a couple of legacy places where the kernel sets a PTE, but we
538 * don't know the top level any more. This is useless for us, since we don't 591 * don't know the top level any more. This is useless for us, since we don't
@@ -545,11 +598,31 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
545 * which brings boot back to 0.25 seconds. */ 598 * which brings boot back to 0.25 seconds. */
546static void lguest_set_pte(pte_t *ptep, pte_t pteval) 599static void lguest_set_pte(pte_t *ptep, pte_t pteval)
547{ 600{
548 *ptep = pteval; 601 native_set_pte(ptep, pteval);
549 if (cr3_changed) 602 if (cr3_changed)
550 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 603 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
551} 604}
552 605
606#ifdef CONFIG_X86_PAE
607static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
608{
609 native_set_pte_atomic(ptep, pte);
610 if (cr3_changed)
611 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
612}
613
614void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
615{
616 native_pte_clear(mm, addr, ptep);
617 lguest_pte_update(mm, addr, ptep);
618}
619
620void lguest_pmd_clear(pmd_t *pmdp)
621{
622 lguest_set_pmd(pmdp, __pmd(0));
623}
624#endif
625
553/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 626/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
554 * native page table operations. On native hardware you can set a new page 627 * native page table operations. On native hardware you can set a new page
555 * table entry whenever you want, but if you want to remove one you have to do 628 * table entry whenever you want, but if you want to remove one you have to do
@@ -621,13 +694,12 @@ static void __init lguest_init_IRQ(void)
621{ 694{
622 unsigned int i; 695 unsigned int i;
623 696
624 for (i = 0; i < LGUEST_IRQS; i++) { 697 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
625 int vector = FIRST_EXTERNAL_VECTOR + i;
626 /* Some systems map "vectors" to interrupts weirdly. Lguest has 698 /* Some systems map "vectors" to interrupts weirdly. Lguest has
627 * a straightforward 1 to 1 mapping, so force that here. */ 699 * a straightforward 1 to 1 mapping, so force that here. */
628 __get_cpu_var(vector_irq)[vector] = i; 700 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
629 if (vector != SYSCALL_VECTOR) 701 if (i != SYSCALL_VECTOR)
630 set_intr_gate(vector, interrupt[i]); 702 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
631 } 703 }
632 /* This call is required to set up for 4k stacks, where we have 704 /* This call is required to set up for 4k stacks, where we have
633 * separate stacks for hard and soft interrupts. */ 705 * separate stacks for hard and soft interrupts. */
@@ -636,7 +708,7 @@ static void __init lguest_init_IRQ(void)
636 708
637void lguest_setup_irq(unsigned int irq) 709void lguest_setup_irq(unsigned int irq)
638{ 710{
639 irq_to_desc_alloc_cpu(irq, 0); 711 irq_to_desc_alloc_node(irq, 0);
640 set_irq_chip_and_handler_name(irq, &lguest_irq_controller, 712 set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
641 handle_level_irq, "level"); 713 handle_level_irq, "level");
642} 714}
@@ -966,10 +1038,10 @@ static void lguest_restart(char *reason)
966 * 1038 *
967 * Our current solution is to allow the paravirt back end to optionally patch 1039 * Our current solution is to allow the paravirt back end to optionally patch
968 * over the indirect calls to replace them with something more efficient. We 1040 * over the indirect calls to replace them with something more efficient. We
969 * patch the four most commonly called functions: disable interrupts, enable 1041 * patch two of the simplest of the most commonly called functions: disable
970 * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 1042 * interrupts and save interrupts. We usually have 6 or 10 bytes to patch
971 * bytes to patch into: the Guest versions of these operations are small enough 1043 * into: the Guest versions of these operations are small enough that we can
972 * that we can fit comfortably. 1044 * fit comfortably.
973 * 1045 *
974 * First we need assembly templates of each of the patchable Guest operations, 1046 * First we need assembly templates of each of the patchable Guest operations,
975 * and these are in i386_head.S. */ 1047 * and these are in i386_head.S. */
@@ -980,8 +1052,6 @@ static const struct lguest_insns
980 const char *start, *end; 1052 const char *start, *end;
981} lguest_insns[] = { 1053} lguest_insns[] = {
982 [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, 1054 [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
983 [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
984 [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
985 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, 1055 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
986}; 1056};
987 1057
@@ -1019,6 +1089,7 @@ __init void lguest_init(void)
1019 pv_info.name = "lguest"; 1089 pv_info.name = "lguest";
1020 pv_info.paravirt_enabled = 1; 1090 pv_info.paravirt_enabled = 1;
1021 pv_info.kernel_rpl = 1; 1091 pv_info.kernel_rpl = 1;
1092 pv_info.shared_kernel_pmd = 1;
1022 1093
1023 /* We set up all the lguest overrides for sensitive operations. These 1094 /* We set up all the lguest overrides for sensitive operations. These
1024 * are detailed with the operations themselves. */ 1095 * are detailed with the operations themselves. */
@@ -1026,9 +1097,9 @@ __init void lguest_init(void)
1026 /* interrupt-related operations */ 1097 /* interrupt-related operations */
1027 pv_irq_ops.init_IRQ = lguest_init_IRQ; 1098 pv_irq_ops.init_IRQ = lguest_init_IRQ;
1028 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1099 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
1029 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl); 1100 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
1030 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); 1101 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
1031 pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable); 1102 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
1032 pv_irq_ops.safe_halt = lguest_safe_halt; 1103 pv_irq_ops.safe_halt = lguest_safe_halt;
1033 1104
1034 /* init-time operations */ 1105 /* init-time operations */
@@ -1053,8 +1124,8 @@ __init void lguest_init(void)
1053 pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; 1124 pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
1054 pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; 1125 pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
1055 pv_cpu_ops.wbinvd = lguest_wbinvd; 1126 pv_cpu_ops.wbinvd = lguest_wbinvd;
1056 pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; 1127 pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
1057 pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode; 1128 pv_cpu_ops.end_context_switch = lguest_end_context_switch;
1058 1129
1059 /* pagetable management */ 1130 /* pagetable management */
1060 pv_mmu_ops.write_cr3 = lguest_write_cr3; 1131 pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@ -1064,10 +1135,16 @@ __init void lguest_init(void)
1064 pv_mmu_ops.set_pte = lguest_set_pte; 1135 pv_mmu_ops.set_pte = lguest_set_pte;
1065 pv_mmu_ops.set_pte_at = lguest_set_pte_at; 1136 pv_mmu_ops.set_pte_at = lguest_set_pte_at;
1066 pv_mmu_ops.set_pmd = lguest_set_pmd; 1137 pv_mmu_ops.set_pmd = lguest_set_pmd;
1138#ifdef CONFIG_X86_PAE
1139 pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
1140 pv_mmu_ops.pte_clear = lguest_pte_clear;
1141 pv_mmu_ops.pmd_clear = lguest_pmd_clear;
1142 pv_mmu_ops.set_pud = lguest_set_pud;
1143#endif
1067 pv_mmu_ops.read_cr2 = lguest_read_cr2; 1144 pv_mmu_ops.read_cr2 = lguest_read_cr2;
1068 pv_mmu_ops.read_cr3 = lguest_read_cr3; 1145 pv_mmu_ops.read_cr3 = lguest_read_cr3;
1069 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; 1146 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
1070 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode; 1147 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
1071 pv_mmu_ops.pte_update = lguest_pte_update; 1148 pv_mmu_ops.pte_update = lguest_pte_update;
1072 pv_mmu_ops.pte_update_defer = lguest_pte_update; 1149 pv_mmu_ops.pte_update_defer = lguest_pte_update;
1073 1150
@@ -1088,13 +1165,21 @@ __init void lguest_init(void)
1088 * lguest_init() where the rest of the fairly chaotic boot setup 1165 * lguest_init() where the rest of the fairly chaotic boot setup
1089 * occurs. */ 1166 * occurs. */
1090 1167
1168 /* The stack protector is a weird thing where gcc places a canary
1169 * value on the stack and then checks it on return. This file is
 1170 * compiled with -fno-stack-protector, so we got this far without
1171 * problems. The value of the canary is kept at offset 20 from the
1172 * %gs register, so we need to set that up before calling C functions
1173 * in other files. */
1174 setup_stack_canary_segment(0);
 1175 /* We could just call load_stack_canary_segment(), but we might as well
1176 * call switch_to_new_gdt() which loads the whole table and sets up
1177 * the per-cpu segment descriptor register %fs as well. */
1178 switch_to_new_gdt(0);
1179
1091 /* As described in head_32.S, we map the first 128M of memory. */ 1180 /* As described in head_32.S, we map the first 128M of memory. */
1092 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1181 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1093 1182
1094 /* Load the %fs segment register (the per-cpu segment register) with
1095 * the normal data segment to get through booting. */
1096 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
1097
1098 /* The Host<->Guest Switcher lives at the top of our address space, and 1183 /* The Host<->Guest Switcher lives at the top of our address space, and
1099 * the Host told us how big it is when we made LGUEST_INIT hypercall: 1184 * the Host told us how big it is when we made LGUEST_INIT hypercall:
1100 * it put the answer in lguest_data.reserve_mem */ 1185 * it put the answer in lguest_data.reserve_mem */
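
The async_hcall() changes above only widen the existing ring-buffer protocol to carry a fifth argument. As a rough, self-contained illustration of that protocol (the names and the draining fallback are invented for the sketch, not part of the lguest ABI):

#include <stdio.h>

#define RING_SLOTS	4
#define SLOT_DONE	0xFF	/* consumer is finished with the slot */
#define SLOT_READY	0x00	/* arguments filled in, ready to run  */

struct slot { unsigned long call, a1, a2, a3, a4; };

static struct slot ring[RING_SLOTS];
static unsigned char status[RING_SLOTS] = { SLOT_DONE, SLOT_DONE, SLOT_DONE, SLOT_DONE };
static unsigned int next_slot;

/* Stand-in for an immediate call that also drains everything queued. */
static void sync_call(unsigned long call)
{
	printf("ring full, synchronous call %lu drains it\n", call);
	for (unsigned int i = 0; i < RING_SLOTS; i++)
		status[i] = SLOT_DONE;
}

static void queue_call(unsigned long call, unsigned long a1,
		       unsigned long a2, unsigned long a3, unsigned long a4)
{
	if (status[next_slot] != SLOT_DONE) {
		sync_call(call);
		return;
	}
	ring[next_slot] = (struct slot){ call, a1, a2, a3, a4 };
	__sync_synchronize();	/* arguments must be visible before the status flip */
	status[next_slot] = SLOT_READY;
	next_slot = (next_slot + 1) % RING_SLOTS;
}

int main(void)
{
	for (unsigned long i = 0; i < 6; i++)
		queue_call(i, i, 0, 0, 0);
	return 0;
}
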
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index f7954198947..a9c8cfe61cd 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -46,10 +46,64 @@ ENTRY(lguest_entry)
46 .globl lgstart_##name; .globl lgend_##name 46 .globl lgstart_##name; .globl lgend_##name
47 47
48LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) 48LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
49LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
50LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
51LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 49LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
52/*:*/ 50
51/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
52 * matter for save_fl and irq_disable later). If we write our routines
53 * carefully in assembler, we can avoid clobbering any registers and avoid
54 * jumping through the wrapper functions.
55 *
56 * I skipped over our first piece of assembler, but this one is worth studying
 57 * in a bit more detail so I'll describe it in easy stages. First, the routine
58 * to enable interrupts: */
59ENTRY(lg_irq_enable)
60 /* The reverse of irq_disable, this sets lguest_data.irq_enabled to
61 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
62 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
63 /* But now we need to check if the Host wants to know: there might have
64 * been interrupts waiting to be delivered, in which case it will have
65 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
66 * jump to send_interrupts, otherwise we're done. */
67 testl $0, lguest_data+LGUEST_DATA_irq_pending
68 jnz send_interrupts
69 /* One cool thing about x86 is that you can do many things without using
70 * a register. In this case, the normal path hasn't needed to save or
71 * restore any registers at all! */
72 ret
73send_interrupts:
74 /* OK, now we need a register: eax is used for the hypercall number,
75 * which is LHCALL_SEND_INTERRUPTS.
76 *
77 * We used not to bother with this pending detection at all, which was
78 * much simpler. Sooner or later the Host would realize it had to
79 * send us an interrupt. But that turns out to make performance 7
80 * times worse on a simple tcp benchmark. So now we do this the hard
81 * way. */
82 pushl %eax
83 movl $LHCALL_SEND_INTERRUPTS, %eax
84 /* This is a vmcall instruction (same thing that KVM uses). Older
85 * assembler versions might not know the "vmcall" instruction, so we
86 * create one manually here. */
87 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
88 popl %eax
89 ret
90
91/* Finally, the "popf" or "restore flags" routine. The %eax register holds the
92 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
93 * enabling interrupts again, if it's 0 we're leaving them off. */
94ENTRY(lg_restore_fl)
95 /* This is just "lguest_data.irq_enabled = flags;" */
96 movl %eax, lguest_data+LGUEST_DATA_irq_enabled
97 /* Now, if the %eax value has enabled interrupts and
98 * lguest_data.irq_pending is set, we want to tell the Host so it can
99 * deliver any outstanding interrupts. Fortunately, both values will
100 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
101 * instruction will AND them together for us. If both are set, we
102 * jump to send_interrupts. */
103 testl lguest_data+LGUEST_DATA_irq_pending, %eax
104 jnz send_interrupts
105 /* Again, the normal path has used no extra registers. Clever, huh? */
106 ret
53 107
54/* These demark the EIP range where host should never deliver interrupts. */ 108/* These demark the EIP range where host should never deliver interrupts. */
55.global lguest_noirq_start 109.global lguest_noirq_start
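A rough C equivalent of lg_irq_enable above, for readers following the commentary (sketch only; the hcall() wrapper used here is assumed, the lguest_data fields and the hypercall number appear in the patch):

static void lg_irq_enable_sketch(void)
{
	/* Tell the Host that interrupts are enabled again. */
	lguest_data.irq_enabled = X86_EFLAGS_IF;

	/* If the Host queued interrupts while they were disabled, ask it
	 * to deliver them now rather than waiting for it to notice. */
	if (lguest_data.irq_pending)
		hcall(LHCALL_SEND_INTERRUPTS, 0, 0, 0);
}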
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 55e11aa6d66..f9d35632666 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,7 +2,7 @@
2# Makefile for x86 specific library files. 2# Makefile for x86 specific library files.
3# 3#
4 4
5obj-$(CONFIG_SMP) := msr-on-cpu.o 5obj-$(CONFIG_SMP) := msr.o
6 6
7lib-y := delay.o 7lib-y := delay.o
8lib-y += thunk_$(BITS).o 8lib-y += thunk_$(BITS).o
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
deleted file mode 100644
index 321cf720dbb..00000000000
--- a/arch/x86/lib/msr-on-cpu.c
+++ /dev/null
@@ -1,97 +0,0 @@
1#include <linux/module.h>
2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h>
5
6struct msr_info {
7 u32 msr_no;
8 u32 l, h;
9 int err;
10};
11
12static void __rdmsr_on_cpu(void *info)
13{
14 struct msr_info *rv = info;
15
16 rdmsr(rv->msr_no, rv->l, rv->h);
17}
18
19static void __wrmsr_on_cpu(void *info)
20{
21 struct msr_info *rv = info;
22
23 wrmsr(rv->msr_no, rv->l, rv->h);
24}
25
26int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
27{
28 int err;
29 struct msr_info rv;
30
31 rv.msr_no = msr_no;
32 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
33 *l = rv.l;
34 *h = rv.h;
35
36 return err;
37}
38
39int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
40{
41 int err;
42 struct msr_info rv;
43
44 rv.msr_no = msr_no;
45 rv.l = l;
46 rv.h = h;
47 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
48
49 return err;
50}
51
52/* These "safe" variants are slower and should be used when the target MSR
53 may not actually exist. */
54static void __rdmsr_safe_on_cpu(void *info)
55{
56 struct msr_info *rv = info;
57
58 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
59}
60
61static void __wrmsr_safe_on_cpu(void *info)
62{
63 struct msr_info *rv = info;
64
65 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
66}
67
68int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
69{
70 int err;
71 struct msr_info rv;
72
73 rv.msr_no = msr_no;
74 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
75 *l = rv.l;
76 *h = rv.h;
77
78 return err ? err : rv.err;
79}
80
81int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
82{
83 int err;
84 struct msr_info rv;
85
86 rv.msr_no = msr_no;
87 rv.l = l;
88 rv.h = h;
89 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
90
91 return err ? err : rv.err;
92}
93
94EXPORT_SYMBOL(rdmsr_on_cpu);
95EXPORT_SYMBOL(wrmsr_on_cpu);
96EXPORT_SYMBOL(rdmsr_safe_on_cpu);
97EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
new file mode 100644
index 00000000000..1440b9c0547
--- /dev/null
+++ b/arch/x86/lib/msr.c
@@ -0,0 +1,183 @@
1#include <linux/module.h>
2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h>
5
6struct msr_info {
7 u32 msr_no;
8 struct msr reg;
9 struct msr *msrs;
10 int off;
11 int err;
12};
13
14static void __rdmsr_on_cpu(void *info)
15{
16 struct msr_info *rv = info;
17 struct msr *reg;
18 int this_cpu = raw_smp_processor_id();
19
20 if (rv->msrs)
21 reg = &rv->msrs[this_cpu - rv->off];
22 else
23 reg = &rv->reg;
24
25 rdmsr(rv->msr_no, reg->l, reg->h);
26}
27
28static void __wrmsr_on_cpu(void *info)
29{
30 struct msr_info *rv = info;
31 struct msr *reg;
32 int this_cpu = raw_smp_processor_id();
33
34 if (rv->msrs)
35 reg = &rv->msrs[this_cpu - rv->off];
36 else
37 reg = &rv->reg;
38
39 wrmsr(rv->msr_no, reg->l, reg->h);
40}
41
42int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
43{
44 int err;
45 struct msr_info rv;
46
47 memset(&rv, 0, sizeof(rv));
48
49 rv.msr_no = msr_no;
50 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
51 *l = rv.reg.l;
52 *h = rv.reg.h;
53
54 return err;
55}
56EXPORT_SYMBOL(rdmsr_on_cpu);
57
58int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
59{
60 int err;
61 struct msr_info rv;
62
63 memset(&rv, 0, sizeof(rv));
64
65 rv.msr_no = msr_no;
66 rv.reg.l = l;
67 rv.reg.h = h;
68 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
69
70 return err;
71}
72EXPORT_SYMBOL(wrmsr_on_cpu);
73
74/* rdmsr on a bunch of CPUs
75 *
76 * @mask: which CPUs
77 * @msr_no: which MSR
78 * @msrs: array of MSR values
79 *
80 */
81void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
82{
83 struct msr_info rv;
84 int this_cpu;
85
86 memset(&rv, 0, sizeof(rv));
87
88 rv.off = cpumask_first(mask);
89 rv.msrs = msrs;
90 rv.msr_no = msr_no;
91
92 preempt_disable();
93 /*
94 * FIXME: handle the CPU we're executing on separately for now until
95 * smp_call_function_many has been fixed to not skip it.
96 */
97 this_cpu = raw_smp_processor_id();
98 smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1);
99
100 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
101 preempt_enable();
102}
103EXPORT_SYMBOL(rdmsr_on_cpus);
104
105/*
106 * wrmsr on a bunch of CPUs
107 *
108 * @mask: which CPUs
109 * @msr_no: which MSR
110 * @msrs: array of MSR values
111 *
112 */
113void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
114{
115 struct msr_info rv;
116 int this_cpu;
117
118 memset(&rv, 0, sizeof(rv));
119
120 rv.off = cpumask_first(mask);
121 rv.msrs = msrs;
122 rv.msr_no = msr_no;
123
124 preempt_disable();
125 /*
126 * FIXME: handle the CPU we're executing on separately for now until
127 * smp_call_function_many has been fixed to not skip it.
128 */
129 this_cpu = raw_smp_processor_id();
130 smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1);
131
132 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
133 preempt_enable();
134}
135EXPORT_SYMBOL(wrmsr_on_cpus);
136
137/* These "safe" variants are slower and should be used when the target MSR
138 may not actually exist. */
139static void __rdmsr_safe_on_cpu(void *info)
140{
141 struct msr_info *rv = info;
142
143 rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
144}
145
146static void __wrmsr_safe_on_cpu(void *info)
147{
148 struct msr_info *rv = info;
149
150 rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
151}
152
153int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
154{
155 int err;
156 struct msr_info rv;
157
158 memset(&rv, 0, sizeof(rv));
159
160 rv.msr_no = msr_no;
161 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
162 *l = rv.reg.l;
163 *h = rv.reg.h;
164
165 return err ? err : rv.err;
166}
167EXPORT_SYMBOL(rdmsr_safe_on_cpu);
168
169int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
170{
171 int err;
172 struct msr_info rv;
173
174 memset(&rv, 0, sizeof(rv));
175
176 rv.msr_no = msr_no;
177 rv.reg.l = l;
178 rv.reg.h = h;
179 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
180
181 return err ? err : rv.err;
182}
183EXPORT_SYMBOL(wrmsr_safe_on_cpu);
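A hypothetical caller of the new helpers (the CPU number and MSR index are arbitrary; msrs[] must provide one slot per CPU covered by the mask, indexed from cpumask_first(mask)):

static void msr_helpers_example(struct msr *msrs)
{
	u32 lo, hi;

	/* Read MSR 0x10 (the TSC) on CPU 2, tolerating a #GP if the MSR
	 * turns out not to exist there. */
	if (!rdmsr_safe_on_cpu(2, 0x10, &lo, &hi))
		pr_debug("cpu2 tsc: %08x%08x\n", hi, lo);

	/* Batched read of the same MSR on every online CPU. */
	rdmsr_on_cpus(cpu_online_mask, 0x10, msrs);
}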
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fdd30d08ab5..eefdeee8a87 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
10 10
11obj-$(CONFIG_HIGHMEM) += highmem_32.o 11obj-$(CONFIG_HIGHMEM) += highmem_32.o
12 12
13obj-$(CONFIG_KMEMCHECK) += kmemcheck/
14
13obj-$(CONFIG_MMIOTRACE) += mmiotrace.o 15obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
14mmiotrace-y := kmmio.o pf_in.o mmio-mod.o 16mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
15obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 17obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e7277cbcfb4..a725b7f760a 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st,
161 st->current_address >= st->marker[1].start_address) { 161 st->current_address >= st->marker[1].start_address) {
162 const char *unit = units; 162 const char *unit = units;
163 unsigned long delta; 163 unsigned long delta;
164 int width = sizeof(unsigned long) * 2;
164 165
165 /* 166 /*
166 * Now print the actual finished series 167 * Now print the actual finished series
167 */ 168 */
168 seq_printf(m, "0x%p-0x%p ", 169 seq_printf(m, "0x%0*lx-0x%0*lx ",
169 (void *)st->start_address, 170 width, st->start_address,
170 (void *)st->current_address); 171 width, st->current_address);
171 172
172 delta = (st->current_address - st->start_address) >> 10; 173 delta = (st->current_address - st->start_address) >> 10;
173 while (!(delta & 1023) && unit[1]) { 174 while (!(delta & 1023) && unit[1]) {
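The new format string can be tried in isolation; a small userspace sketch with made-up addresses (width is 8 on 32-bit and 16 on 64-bit, so the address columns line up):

#include <stdio.h>

int main(void)
{
	unsigned long start = 0xc0000000UL, cur = 0xc0400000UL;
	int width = sizeof(unsigned long) * 2;

	/* Prints 0xc0000000-0xc0400000 on 32-bit,
	 * 0x00000000c0000000-0x00000000c0400000 on 64-bit. */
	printf("0x%0*lx-0x%0*lx\n", width, start, width, cur);
	return 0;
}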
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa..baa0e86adfb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -3,40 +3,18 @@
3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. 3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar 4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
5 */ 5 */
6#include <linux/interrupt.h> 6#include <linux/magic.h> /* STACK_END_MAGIC */
7#include <linux/mmiotrace.h> 7#include <linux/sched.h> /* test_thread_flag(), ... */
8#include <linux/bootmem.h> 8#include <linux/kdebug.h> /* oops_begin/end, ... */
9#include <linux/compiler.h> 9#include <linux/module.h> /* search_exception_table */
10#include <linux/highmem.h> 10#include <linux/bootmem.h> /* max_low_pfn */
11#include <linux/kprobes.h> 11#include <linux/kprobes.h> /* __kprobes, ... */
12#include <linux/uaccess.h> 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/vmalloc.h> 13#include <linux/perf_counter.h> /* perf_swcounter_event */
14#include <linux/vt_kern.h> 14
15#include <linux/signal.h> 15#include <asm/traps.h> /* dotraplinkage, ... */
16#include <linux/kernel.h> 16#include <asm/pgalloc.h> /* pgd_*(), ... */
17#include <linux/ptrace.h> 17#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
18#include <linux/string.h>
19#include <linux/module.h>
20#include <linux/kdebug.h>
21#include <linux/errno.h>
22#include <linux/magic.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/init.h>
26#include <linux/mman.h>
27#include <linux/tty.h>
28#include <linux/smp.h>
29#include <linux/mm.h>
30
31#include <asm-generic/sections.h>
32
33#include <asm/tlbflush.h>
34#include <asm/pgalloc.h>
35#include <asm/segment.h>
36#include <asm/system.h>
37#include <asm/proto.h>
38#include <asm/traps.h>
39#include <asm/desc.h>
40 18
41/* 19/*
42 * Page fault error code bits: 20 * Page fault error code bits:
@@ -225,12 +203,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
225 if (!pmd_present(*pmd_k)) 203 if (!pmd_present(*pmd_k))
226 return NULL; 204 return NULL;
227 205
228 if (!pmd_present(*pmd)) { 206 if (!pmd_present(*pmd))
229 set_pmd(pmd, *pmd_k); 207 set_pmd(pmd, *pmd_k);
230 arch_flush_lazy_mmu_mode(); 208 else
231 } else {
232 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); 209 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
233 }
234 210
235 return pmd_k; 211 return pmd_k;
236} 212}
@@ -538,8 +514,6 @@ bad:
538static int is_errata93(struct pt_regs *regs, unsigned long address) 514static int is_errata93(struct pt_regs *regs, unsigned long address)
539{ 515{
540#ifdef CONFIG_X86_64 516#ifdef CONFIG_X86_64
541 static int once;
542
543 if (address != regs->ip) 517 if (address != regs->ip)
544 return 0; 518 return 0;
545 519
@@ -549,10 +523,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
549 address |= 0xffffffffUL << 32; 523 address |= 0xffffffffUL << 32;
550 if ((address >= (u64)_stext && address <= (u64)_etext) || 524 if ((address >= (u64)_stext && address <= (u64)_etext) ||
551 (address >= MODULES_VADDR && address <= MODULES_END)) { 525 (address >= MODULES_VADDR && address <= MODULES_END)) {
552 if (!once) { 526 printk_once(errata93_warning);
553 printk(errata93_warning);
554 once = 1;
555 }
556 regs->ip = address; 527 regs->ip = address;
557 return 1; 528 return 1;
558 } 529 }
@@ -986,6 +957,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
986 /* Get the faulting address: */ 957 /* Get the faulting address: */
987 address = read_cr2(); 958 address = read_cr2();
988 959
960 /*
961 * Detect and handle instructions that would cause a page fault for
962 * both a tracked kernel page and a userspace page.
963 */
964 if (kmemcheck_active(regs))
965 kmemcheck_hide(regs);
966
989 if (unlikely(kmmio_fault(regs, address))) 967 if (unlikely(kmmio_fault(regs, address)))
990 return; 968 return;
991 969
@@ -1003,9 +981,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1003 * protection error (error_code & 9) == 0. 981 * protection error (error_code & 9) == 0.
1004 */ 982 */
1005 if (unlikely(fault_in_kernel_space(address))) { 983 if (unlikely(fault_in_kernel_space(address))) {
1006 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && 984 if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
1007 vmalloc_fault(address) >= 0) 985 if (vmalloc_fault(address) >= 0)
1008 return; 986 return;
987
988 if (kmemcheck_fault(regs, address, error_code))
989 return;
990 }
1009 991
1010 /* Can handle a stale RO->RW TLB: */ 992 /* Can handle a stale RO->RW TLB: */
1011 if (spurious_fault(error_code, address)) 993 if (spurious_fault(error_code, address))
@@ -1044,6 +1026,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1044 if (unlikely(error_code & PF_RSVD)) 1026 if (unlikely(error_code & PF_RSVD))
1045 pgtable_bad(regs, error_code, address); 1027 pgtable_bad(regs, error_code, address);
1046 1028
1029 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
1030
1047 /* 1031 /*
1048 * If we're in an interrupt, have no user context or are running 1032 * If we're in an interrupt, have no user context or are running
1049 * in an atomic region then we must not take the fault: 1033 * in an atomic region then we must not take the fault:
@@ -1137,10 +1121,15 @@ good_area:
1137 return; 1121 return;
1138 } 1122 }
1139 1123
1140 if (fault & VM_FAULT_MAJOR) 1124 if (fault & VM_FAULT_MAJOR) {
1141 tsk->maj_flt++; 1125 tsk->maj_flt++;
1142 else 1126 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1127 regs, address);
1128 } else {
1143 tsk->min_flt++; 1129 tsk->min_flt++;
1130 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1131 regs, address);
1132 }
1144 1133
1145 check_v8086_mode(regs, address, tsk); 1134 check_v8086_mode(regs, address, tsk);
1146 1135
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 8126e8d1a2a..58f621e8191 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -44,7 +44,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
45 BUG_ON(!pte_none(*(kmap_pte-idx))); 45 BUG_ON(!pte_none(*(kmap_pte-idx)));
46 set_pte(kmap_pte-idx, mk_pte(page, prot)); 46 set_pte(kmap_pte-idx, mk_pte(page, prot));
47 arch_flush_lazy_mmu_mode();
48 47
49 return (void *)vaddr; 48 return (void *)vaddr;
50} 49}
@@ -74,7 +73,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
74#endif 73#endif
75 } 74 }
76 75
77 arch_flush_lazy_mmu_mode();
78 pagefault_enable(); 76 pagefault_enable();
79} 77}
80 78
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 8f307d914c2..f46c340727b 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -26,12 +26,16 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
26 unsigned long sbase = saddr & PUD_MASK; 26 unsigned long sbase = saddr & PUD_MASK;
27 unsigned long s_end = sbase + PUD_SIZE; 27 unsigned long s_end = sbase + PUD_SIZE;
28 28
29 /* Allow segments to share if only one is marked locked */
30 unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
31 unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
32
29 /* 33 /*
30 * match the virtual addresses, permission and the alignment of the 34 * match the virtual addresses, permission and the alignment of the
31 * page table page. 35 * page table page.
32 */ 36 */
33 if (pmd_index(addr) != pmd_index(saddr) || 37 if (pmd_index(addr) != pmd_index(saddr) ||
34 vma->vm_flags != svma->vm_flags || 38 vm_flags != svm_flags ||
35 sbase < svma->vm_start || svma->vm_end < s_end) 39 sbase < svma->vm_start || svma->vm_end < s_end)
36 return 0; 40 return 0;
37 41
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ae4f7b5d710..f53b57e4086 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,3 +1,4 @@
1#include <linux/initrd.h>
1#include <linux/ioport.h> 2#include <linux/ioport.h>
2#include <linux/swap.h> 3#include <linux/swap.h>
3 4
@@ -10,6 +11,9 @@
10#include <asm/setup.h> 11#include <asm/setup.h>
11#include <asm/system.h> 12#include <asm/system.h>
12#include <asm/tlbflush.h> 13#include <asm/tlbflush.h>
14#include <asm/tlb.h>
15
16DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
13 17
14unsigned long __initdata e820_table_start; 18unsigned long __initdata e820_table_start;
15unsigned long __meminitdata e820_table_end; 19unsigned long __meminitdata e820_table_end;
@@ -23,6 +27,69 @@ int direct_gbpages
23#endif 27#endif
24; 28;
25 29
30int nx_enabled;
31
32#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
33static int disable_nx __cpuinitdata;
34
35/*
36 * noexec = on|off
37 *
38 * Control non-executable mappings for processes.
39 *
40 * on Enable
41 * off Disable
42 */
43static int __init noexec_setup(char *str)
44{
45 if (!str)
46 return -EINVAL;
47 if (!strncmp(str, "on", 2)) {
48 __supported_pte_mask |= _PAGE_NX;
49 disable_nx = 0;
50 } else if (!strncmp(str, "off", 3)) {
51 disable_nx = 1;
52 __supported_pte_mask &= ~_PAGE_NX;
53 }
54 return 0;
55}
56early_param("noexec", noexec_setup);
57#endif
58
59#ifdef CONFIG_X86_PAE
60static void __init set_nx(void)
61{
62 unsigned int v[4], l, h;
63
64 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
65 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
66
67 if ((v[3] & (1 << 20)) && !disable_nx) {
68 rdmsr(MSR_EFER, l, h);
69 l |= EFER_NX;
70 wrmsr(MSR_EFER, l, h);
71 nx_enabled = 1;
72 __supported_pte_mask |= _PAGE_NX;
73 }
74 }
75}
76#else
77static inline void set_nx(void)
78{
79}
80#endif
81
82#ifdef CONFIG_X86_64
83void __cpuinit check_efer(void)
84{
85 unsigned long efer;
86
87 rdmsrl(MSR_EFER, efer);
88 if (!(efer & EFER_NX) || disable_nx)
89 __supported_pte_mask &= ~_PAGE_NX;
90}
91#endif
92
26static void __init find_early_table_space(unsigned long end, int use_pse, 93static void __init find_early_table_space(unsigned long end, int use_pse,
27 int use_gbpages) 94 int use_gbpages)
28{ 95{
@@ -66,12 +133,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
66 */ 133 */
67#ifdef CONFIG_X86_32 134#ifdef CONFIG_X86_32
68 start = 0x7000; 135 start = 0x7000;
69 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, 136#else
70 tables, PAGE_SIZE);
71#else /* CONFIG_X86_64 */
72 start = 0x8000; 137 start = 0x8000;
73 e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
74#endif 138#endif
139 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
140 tables, PAGE_SIZE);
75 if (e820_table_start == -1UL) 141 if (e820_table_start == -1UL)
76 panic("Cannot find space for the kernel page tables"); 142 panic("Cannot find space for the kernel page tables");
77 143
@@ -147,7 +213,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
147 if (!after_bootmem) 213 if (!after_bootmem)
148 init_gbpages(); 214 init_gbpages();
149 215
150#ifdef CONFIG_DEBUG_PAGEALLOC 216#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
151 /* 217 /*
152 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 218 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
153 * This will simplify cpa(), which otherwise needs to support splitting 219 * This will simplify cpa(), which otherwise needs to support splitting
@@ -159,12 +225,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
159 use_gbpages = direct_gbpages; 225 use_gbpages = direct_gbpages;
160#endif 226#endif
161 227
162#ifdef CONFIG_X86_32
163#ifdef CONFIG_X86_PAE
164 set_nx(); 228 set_nx();
165 if (nx_enabled) 229 if (nx_enabled)
166 printk(KERN_INFO "NX (Execute Disable) protection: active\n"); 230 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
167#endif
168 231
169 /* Enable PSE if available */ 232 /* Enable PSE if available */
170 if (cpu_has_pse) 233 if (cpu_has_pse)
@@ -175,7 +238,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
175 set_in_cr4(X86_CR4_PGE); 238 set_in_cr4(X86_CR4_PGE);
176 __supported_pte_mask |= _PAGE_GLOBAL; 239 __supported_pte_mask |= _PAGE_GLOBAL;
177 } 240 }
178#endif
179 241
180 if (use_gbpages) 242 if (use_gbpages)
181 page_size_mask |= 1 << PG_LEVEL_1G; 243 page_size_mask |= 1 << PG_LEVEL_1G;
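Once set_nx() and check_efer() above have decided whether _PAGE_NX is usable, consumers only need to mask page protections with __supported_pte_mask; a minimal sketch of that idea (the helper name is invented for illustration):

static pgprot_t clamp_to_supported(pgprot_t prot)
{
	/* Bits the CPU cannot honour, e.g. _PAGE_NX without EFER.NX,
	 * are simply stripped before the PTE is built. */
	return __pgprot(pgprot_val(prot) & __supported_pte_mask);
}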
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 749559ed80f..3cd7711bb94 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,12 +49,9 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/page_types.h>
52#include <asm/init.h> 53#include <asm/init.h>
53 54
54unsigned long max_low_pfn_mapped;
55unsigned long max_pfn_mapped;
56
57DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
58unsigned long highstart_pfn, highend_pfn; 55unsigned long highstart_pfn, highend_pfn;
59 56
60static noinline int do_test_wp_bit(void); 57static noinline int do_test_wp_bit(void);
@@ -114,7 +111,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
114 pte_t *page_table = NULL; 111 pte_t *page_table = NULL;
115 112
116 if (after_bootmem) { 113 if (after_bootmem) {
117#ifdef CONFIG_DEBUG_PAGEALLOC 114#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
118 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 115 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
119#endif 116#endif
120 if (!page_table) 117 if (!page_table)
@@ -567,7 +564,7 @@ static inline void save_pg_dir(void)
567} 564}
568#endif /* !CONFIG_ACPI_SLEEP */ 565#endif /* !CONFIG_ACPI_SLEEP */
569 566
570void zap_low_mappings(void) 567void zap_low_mappings(bool early)
571{ 568{
572 int i; 569 int i;
573 570
@@ -584,64 +581,16 @@ void zap_low_mappings(void)
584 set_pgd(swapper_pg_dir+i, __pgd(0)); 581 set_pgd(swapper_pg_dir+i, __pgd(0));
585#endif 582#endif
586 } 583 }
587 flush_tlb_all();
588}
589 584
590int nx_enabled; 585 if (early)
586 __flush_tlb();
587 else
588 flush_tlb_all();
589}
591 590
592pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); 591pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
593EXPORT_SYMBOL_GPL(__supported_pte_mask); 592EXPORT_SYMBOL_GPL(__supported_pte_mask);
594 593
595#ifdef CONFIG_X86_PAE
596
597static int disable_nx __initdata;
598
599/*
600 * noexec = on|off
601 *
602 * Control non executable mappings.
603 *
604 * on Enable
605 * off Disable
606 */
607static int __init noexec_setup(char *str)
608{
609 if (!str || !strcmp(str, "on")) {
610 if (cpu_has_nx) {
611 __supported_pte_mask |= _PAGE_NX;
612 disable_nx = 0;
613 }
614 } else {
615 if (!strcmp(str, "off")) {
616 disable_nx = 1;
617 __supported_pte_mask &= ~_PAGE_NX;
618 } else {
619 return -EINVAL;
620 }
621 }
622
623 return 0;
624}
625early_param("noexec", noexec_setup);
626
627void __init set_nx(void)
628{
629 unsigned int v[4], l, h;
630
631 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
632 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
633
634 if ((v[3] & (1 << 20)) && !disable_nx) {
635 rdmsr(MSR_EFER, l, h);
636 l |= EFER_NX;
637 wrmsr(MSR_EFER, l, h);
638 nx_enabled = 1;
639 __supported_pte_mask |= _PAGE_NX;
640 }
641 }
642}
643#endif
644
645/* user-defined highmem size */ 594/* user-defined highmem size */
646static unsigned int highmem_pages = -1; 595static unsigned int highmem_pages = -1;
647 596
@@ -761,15 +710,15 @@ void __init initmem_init(unsigned long start_pfn,
761 highstart_pfn = highend_pfn = max_pfn; 710 highstart_pfn = highend_pfn = max_pfn;
762 if (max_pfn > max_low_pfn) 711 if (max_pfn > max_low_pfn)
763 highstart_pfn = max_low_pfn; 712 highstart_pfn = max_low_pfn;
764 memory_present(0, 0, highend_pfn);
765 e820_register_active_regions(0, 0, highend_pfn); 713 e820_register_active_regions(0, 0, highend_pfn);
714 sparse_memory_present_with_active_regions(0);
766 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 715 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
767 pages_to_mb(highend_pfn - highstart_pfn)); 716 pages_to_mb(highend_pfn - highstart_pfn));
768 num_physpages = highend_pfn; 717 num_physpages = highend_pfn;
769 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 718 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
770#else 719#else
771 memory_present(0, 0, max_low_pfn);
772 e820_register_active_regions(0, 0, max_low_pfn); 720 e820_register_active_regions(0, 0, max_low_pfn);
721 sparse_memory_present_with_active_regions(0);
773 num_physpages = max_low_pfn; 722 num_physpages = max_low_pfn;
774 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 723 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
775#endif 724#endif
@@ -1011,7 +960,7 @@ void __init mem_init(void)
1011 test_wp_bit(); 960 test_wp_bit();
1012 961
1013 save_pg_dir(); 962 save_pg_dir();
1014 zap_low_mappings(); 963 zap_low_mappings(true);
1015} 964}
1016 965
1017#ifdef CONFIG_MEMORY_HOTPLUG 966#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1753e8020df..9c543290a81 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -50,18 +50,8 @@
50#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/init.h> 51#include <asm/init.h>
52 52
53/*
54 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
55 * The direct mapping extends to max_pfn_mapped, so that we can directly access
56 * apertures, ACPI and other tables without having to play with fixmaps.
57 */
58unsigned long max_low_pfn_mapped;
59unsigned long max_pfn_mapped;
60
61static unsigned long dma_reserve __initdata; 53static unsigned long dma_reserve __initdata;
62 54
63DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
64
65static int __init parse_direct_gbpages_off(char *arg) 55static int __init parse_direct_gbpages_off(char *arg)
66{ 56{
67 direct_gbpages = 0; 57 direct_gbpages = 0;
@@ -85,39 +75,6 @@ early_param("gbpages", parse_direct_gbpages_on);
85pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; 75pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
86EXPORT_SYMBOL_GPL(__supported_pte_mask); 76EXPORT_SYMBOL_GPL(__supported_pte_mask);
87 77
88static int disable_nx __cpuinitdata;
89
90/*
91 * noexec=on|off
92 * Control non-executable mappings for 64-bit processes.
93 *
94 * on Enable (default)
95 * off Disable
96 */
97static int __init nonx_setup(char *str)
98{
99 if (!str)
100 return -EINVAL;
101 if (!strncmp(str, "on", 2)) {
102 __supported_pte_mask |= _PAGE_NX;
103 disable_nx = 0;
104 } else if (!strncmp(str, "off", 3)) {
105 disable_nx = 1;
106 __supported_pte_mask &= ~_PAGE_NX;
107 }
108 return 0;
109}
110early_param("noexec", nonx_setup);
111
112void __cpuinit check_efer(void)
113{
114 unsigned long efer;
115
116 rdmsrl(MSR_EFER, efer);
117 if (!(efer & EFER_NX) || disable_nx)
118 __supported_pte_mask &= ~_PAGE_NX;
119}
120
121int force_personality32; 78int force_personality32;
122 79
123/* 80/*
@@ -147,7 +104,7 @@ static __ref void *spp_getpage(void)
147 void *ptr; 104 void *ptr;
148 105
149 if (after_bootmem) 106 if (after_bootmem)
150 ptr = (void *) get_zeroed_page(GFP_ATOMIC); 107 ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
151 else 108 else
152 ptr = alloc_bootmem_pages(PAGE_SIZE); 109 ptr = alloc_bootmem_pages(PAGE_SIZE);
153 110
@@ -324,7 +281,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
324 void *adr; 281 void *adr;
325 282
326 if (after_bootmem) { 283 if (after_bootmem) {
327 adr = (void *)get_zeroed_page(GFP_ATOMIC); 284 adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
328 *phys = __pa(adr); 285 *phys = __pa(adr);
329 286
330 return adr; 287 return adr;
@@ -628,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
628 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); 585 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
629 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); 586 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
630} 587}
588#endif
631 589
632void __init paging_init(void) 590void __init paging_init(void)
633{ 591{
@@ -638,11 +596,10 @@ void __init paging_init(void)
638 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 596 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
639 max_zone_pfns[ZONE_NORMAL] = max_pfn; 597 max_zone_pfns[ZONE_NORMAL] = max_pfn;
640 598
641 memory_present(0, 0, max_pfn); 599 sparse_memory_present_with_active_regions(MAX_NUMNODES);
642 sparse_init(); 600 sparse_init();
643 free_area_init_nodes(max_zone_pfns); 601 free_area_init_nodes(max_zone_pfns);
644} 602}
645#endif
646 603
647/* 604/*
648 * Memory hotplug specific functions 605 * Memory hotplug specific functions
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 8056545e2d3..fe6f84ca121 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -82,7 +82,6 @@ iounmap_atomic(void *kvaddr, enum km_type type)
82 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 82 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
83 kpte_clear_flush(kmap_pte-idx, vaddr); 83 kpte_clear_flush(kmap_pte-idx, vaddr);
84 84
85 arch_flush_lazy_mmu_mode();
86 pagefault_enable(); 85 pagefault_enable();
87} 86}
88EXPORT_SYMBOL_GPL(iounmap_atomic); 87EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
new file mode 100644
index 00000000000..520b3bce409
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/Makefile
@@ -0,0 +1 @@
obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
new file mode 100644
index 00000000000..4901d0dafda
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -0,0 +1,228 @@
1#include <linux/interrupt.h>
2#include <linux/kdebug.h>
3#include <linux/kmemcheck.h>
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/ptrace.h>
7#include <linux/stacktrace.h>
8#include <linux/string.h>
9
10#include "error.h"
11#include "shadow.h"
12
13enum kmemcheck_error_type {
14 KMEMCHECK_ERROR_INVALID_ACCESS,
15 KMEMCHECK_ERROR_BUG,
16};
17
18#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
19
20struct kmemcheck_error {
21 enum kmemcheck_error_type type;
22
23 union {
24 /* KMEMCHECK_ERROR_INVALID_ACCESS */
25 struct {
26 /* Kind of access that caused the error */
27 enum kmemcheck_shadow state;
28 /* Address and size of the erroneous read */
29 unsigned long address;
30 unsigned int size;
31 };
32 };
33
34 struct pt_regs regs;
35 struct stack_trace trace;
36 unsigned long trace_entries[32];
37
38 /* We compress it to a char. */
39 unsigned char shadow_copy[SHADOW_COPY_SIZE];
40 unsigned char memory_copy[SHADOW_COPY_SIZE];
41};
42
43/*
44 * Create a ring queue of errors to output. We can't call printk() directly
45 * from the kmemcheck traps, since this may call the console drivers and
46 * result in a recursive fault.
47 */
48static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
49static unsigned int error_count;
50static unsigned int error_rd;
51static unsigned int error_wr;
52static unsigned int error_missed_count;
53
54static struct kmemcheck_error *error_next_wr(void)
55{
56 struct kmemcheck_error *e;
57
58 if (error_count == ARRAY_SIZE(error_fifo)) {
59 ++error_missed_count;
60 return NULL;
61 }
62
63 e = &error_fifo[error_wr];
64 if (++error_wr == ARRAY_SIZE(error_fifo))
65 error_wr = 0;
66 ++error_count;
67 return e;
68}
69
70static struct kmemcheck_error *error_next_rd(void)
71{
72 struct kmemcheck_error *e;
73
74 if (error_count == 0)
75 return NULL;
76
77 e = &error_fifo[error_rd];
78 if (++error_rd == ARRAY_SIZE(error_fifo))
79 error_rd = 0;
80 --error_count;
81 return e;
82}
83
84void kmemcheck_error_recall(void)
85{
86 static const char *desc[] = {
87 [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
88 [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
89 [KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
90 [KMEMCHECK_SHADOW_FREED] = "freed",
91 };
92
93 static const char short_desc[] = {
94 [KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
95 [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
96 [KMEMCHECK_SHADOW_INITIALIZED] = 'i',
97 [KMEMCHECK_SHADOW_FREED] = 'f',
98 };
99
100 struct kmemcheck_error *e;
101 unsigned int i;
102
103 e = error_next_rd();
104 if (!e)
105 return;
106
107 switch (e->type) {
108 case KMEMCHECK_ERROR_INVALID_ACCESS:
109 printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read "
110 "from %s memory (%p)\n",
111 8 * e->size, e->state < ARRAY_SIZE(desc) ?
112 desc[e->state] : "(invalid shadow state)",
113 (void *) e->address);
114
115 printk(KERN_INFO);
116 for (i = 0; i < SHADOW_COPY_SIZE; ++i)
117 printk("%02x", e->memory_copy[i]);
118 printk("\n");
119
120 printk(KERN_INFO);
121 for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
122 if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
123 printk(" %c", short_desc[e->shadow_copy[i]]);
124 else
125 printk(" ?");
126 }
127 printk("\n");
128 printk(KERN_INFO "%*c\n", 2 + 2
129 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
130 break;
131 case KMEMCHECK_ERROR_BUG:
132 printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
133 break;
134 }
135
136 __show_regs(&e->regs, 1);
137 print_stack_trace(&e->trace, 0);
138}
139
140static void do_wakeup(unsigned long data)
141{
142 while (error_count > 0)
143 kmemcheck_error_recall();
144
145 if (error_missed_count > 0) {
146 printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
147 "the queue was too small\n", error_missed_count);
148 error_missed_count = 0;
149 }
150}
151
152static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
153
154/*
155 * Save the context of an error report.
156 */
157void kmemcheck_error_save(enum kmemcheck_shadow state,
158 unsigned long address, unsigned int size, struct pt_regs *regs)
159{
160 static unsigned long prev_ip;
161
162 struct kmemcheck_error *e;
163 void *shadow_copy;
164 void *memory_copy;
165
166 /* Don't report several adjacent errors from the same EIP. */
167 if (regs->ip == prev_ip)
168 return;
169 prev_ip = regs->ip;
170
171 e = error_next_wr();
172 if (!e)
173 return;
174
175 e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
176
177 e->state = state;
178 e->address = address;
179 e->size = size;
180
181 /* Save regs */
182 memcpy(&e->regs, regs, sizeof(*regs));
183
184 /* Save stack trace */
185 e->trace.nr_entries = 0;
186 e->trace.entries = e->trace_entries;
187 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
188 e->trace.skip = 0;
189 save_stack_trace_bp(&e->trace, regs->bp);
190
191 /* Round address down to nearest 16 bytes */
192 shadow_copy = kmemcheck_shadow_lookup(address
193 & ~(SHADOW_COPY_SIZE - 1));
194 BUG_ON(!shadow_copy);
195
196 memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
197
198 kmemcheck_show_addr(address);
199 memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
200 memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
201 kmemcheck_hide_addr(address);
202
203 tasklet_hi_schedule_first(&kmemcheck_tasklet);
204}
205
206/*
207 * Save the context of a kmemcheck bug.
208 */
209void kmemcheck_error_save_bug(struct pt_regs *regs)
210{
211 struct kmemcheck_error *e;
212
213 e = error_next_wr();
214 if (!e)
215 return;
216
217 e->type = KMEMCHECK_ERROR_BUG;
218
219 memcpy(&e->regs, regs, sizeof(*regs));
220
221 e->trace.nr_entries = 0;
222 e->trace.entries = e->trace_entries;
223 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
224 e->trace.skip = 1;
225 save_stack_trace(&e->trace);
226
227 tasklet_hi_schedule_first(&kmemcheck_tasklet);
228}
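The "%*c" line in kmemcheck_error_recall() above places a caret under the offending byte of the per-byte dump; a small userspace sketch with made-up shadow states shows the arithmetic:

#include <stdio.h>

int main(void)
{
	const char shadow[] = { 'i', 'i', 'u', 'i' };	/* 'u' = uninitialized */
	int i, offset = 2;				/* error 2 bytes in    */

	for (i = 0; i < 4; i++)
		printf(" %c", shadow[i]);
	/* width = 2 + 2 * offset = 6, so '^' lands in column 6,
	 * directly under the third " %c" entry. */
	printf("\n%*c\n", 2 + 2 * offset, '^');
	return 0;
}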
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
new file mode 100644
index 00000000000..0efc2e8d0a2
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -0,0 +1,15 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
2#define ARCH__X86__MM__KMEMCHECK__ERROR_H
3
4#include <linux/ptrace.h>
5
6#include "shadow.h"
7
8void kmemcheck_error_save(enum kmemcheck_shadow state,
9 unsigned long address, unsigned int size, struct pt_regs *regs);
10
11void kmemcheck_error_save_bug(struct pt_regs *regs);
12
13void kmemcheck_error_recall(void);
14
15#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
new file mode 100644
index 00000000000..2c55ed09865
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -0,0 +1,640 @@
1/**
2 * kmemcheck - a heavyweight memory checker for the linux kernel
3 * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
4 * (With a lot of help from Ingo Molnar and Pekka Enberg.)
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2) as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/init.h>
12#include <linux/interrupt.h>
13#include <linux/kallsyms.h>
14#include <linux/kernel.h>
15#include <linux/kmemcheck.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <linux/page-flags.h>
19#include <linux/percpu.h>
20#include <linux/ptrace.h>
21#include <linux/string.h>
22#include <linux/types.h>
23
24#include <asm/cacheflush.h>
25#include <asm/kmemcheck.h>
26#include <asm/pgtable.h>
27#include <asm/tlbflush.h>
28
29#include "error.h"
30#include "opcode.h"
31#include "pte.h"
32#include "selftest.h"
33#include "shadow.h"
34
35
36#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
37# define KMEMCHECK_ENABLED 0
38#endif
39
40#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
41# define KMEMCHECK_ENABLED 1
42#endif
43
44#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
45# define KMEMCHECK_ENABLED 2
46#endif
47
48int kmemcheck_enabled = KMEMCHECK_ENABLED;
49
50int __init kmemcheck_init(void)
51{
52#ifdef CONFIG_SMP
53 /*
54 * Limit SMP to use a single CPU. We rely on the fact that this code
55 * runs before SMP is set up.
56 */
57 if (setup_max_cpus > 1) {
58 printk(KERN_INFO
59 "kmemcheck: Limiting number of CPUs to 1.\n");
60 setup_max_cpus = 1;
61 }
62#endif
63
64 if (!kmemcheck_selftest()) {
65 printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
66 kmemcheck_enabled = 0;
67 return -EINVAL;
68 }
69
70 printk(KERN_INFO "kmemcheck: Initialized\n");
71 return 0;
72}
73
74early_initcall(kmemcheck_init);
75
76/*
77 * We need to parse the kmemcheck= option before any memory is allocated.
78 */
79static int __init param_kmemcheck(char *str)
80{
81 if (!str)
82 return -EINVAL;
83
84 sscanf(str, "%d", &kmemcheck_enabled);
85 return 0;
86}
87
88early_param("kmemcheck", param_kmemcheck);
89
90int kmemcheck_show_addr(unsigned long address)
91{
92 pte_t *pte;
93
94 pte = kmemcheck_pte_lookup(address);
95 if (!pte)
96 return 0;
97
98 set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
99 __flush_tlb_one(address);
100 return 1;
101}
102
103int kmemcheck_hide_addr(unsigned long address)
104{
105 pte_t *pte;
106
107 pte = kmemcheck_pte_lookup(address);
108 if (!pte)
109 return 0;
110
111 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
112 __flush_tlb_one(address);
113 return 1;
114}
115
116struct kmemcheck_context {
117 bool busy;
118 int balance;
119
120 /*
121 * There can be at most two memory operands to an instruction, but
122 * each address can cross a page boundary -- so we may need up to
123 * four addresses that must be hidden/revealed for each fault.
124 */
125 unsigned long addr[4];
126 unsigned long n_addrs;
127 unsigned long flags;
128
129 /* Data size of the instruction that caused a fault. */
130 unsigned int size;
131};
132
133static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
134
135bool kmemcheck_active(struct pt_regs *regs)
136{
137 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
138
139 return data->balance > 0;
140}
141
142/* Save an address that needs to be shown/hidden */
143static void kmemcheck_save_addr(unsigned long addr)
144{
145 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
146
147 BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
148 data->addr[data->n_addrs++] = addr;
149}
150
151static unsigned int kmemcheck_show_all(void)
152{
153 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
154 unsigned int i;
155 unsigned int n;
156
157 n = 0;
158 for (i = 0; i < data->n_addrs; ++i)
159 n += kmemcheck_show_addr(data->addr[i]);
160
161 return n;
162}
163
164static unsigned int kmemcheck_hide_all(void)
165{
166 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
167 unsigned int i;
168 unsigned int n;
169
170 n = 0;
171 for (i = 0; i < data->n_addrs; ++i)
172 n += kmemcheck_hide_addr(data->addr[i]);
173
174 return n;
175}
176
177/*
178 * Called from the #PF handler.
179 */
180void kmemcheck_show(struct pt_regs *regs)
181{
182 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
183
184 BUG_ON(!irqs_disabled());
185
186 if (unlikely(data->balance != 0)) {
187 kmemcheck_show_all();
188 kmemcheck_error_save_bug(regs);
189 data->balance = 0;
190 return;
191 }
192
193 /*
194 * None of the addresses actually belonged to kmemcheck. Note that
195 * this is not an error.
196 */
197 if (kmemcheck_show_all() == 0)
198 return;
199
200 ++data->balance;
201
202 /*
203 * The IF needs to be cleared as well, so that the faulting
204 * instruction can run "uninterrupted". Otherwise, we might take
205 * an interrupt and start executing that before we've had a chance
206 * to hide the page again.
207 *
208 * NOTE: In the rare case of multiple faults, we must not override
209 * the original flags:
210 */
211 if (!(regs->flags & X86_EFLAGS_TF))
212 data->flags = regs->flags;
213
214 regs->flags |= X86_EFLAGS_TF;
215 regs->flags &= ~X86_EFLAGS_IF;
216}
217
218/*
219 * Called from the #DB handler.
220 */
221void kmemcheck_hide(struct pt_regs *regs)
222{
223 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
224 int n;
225
226 BUG_ON(!irqs_disabled());
227
228 if (data->balance == 0)
229 return;
230
231 if (unlikely(data->balance != 1)) {
232 kmemcheck_show_all();
233 kmemcheck_error_save_bug(regs);
234 data->n_addrs = 0;
235 data->balance = 0;
236
237 if (!(data->flags & X86_EFLAGS_TF))
238 regs->flags &= ~X86_EFLAGS_TF;
239 if (data->flags & X86_EFLAGS_IF)
240 regs->flags |= X86_EFLAGS_IF;
241 return;
242 }
243
244 if (kmemcheck_enabled)
245 n = kmemcheck_hide_all();
246 else
247 n = kmemcheck_show_all();
248
249 if (n == 0)
250 return;
251
252 --data->balance;
253
254 data->n_addrs = 0;
255
256 if (!(data->flags & X86_EFLAGS_TF))
257 regs->flags &= ~X86_EFLAGS_TF;
258 if (data->flags & X86_EFLAGS_IF)
259 regs->flags |= X86_EFLAGS_IF;
260}
261
262void kmemcheck_show_pages(struct page *p, unsigned int n)
263{
264 unsigned int i;
265
266 for (i = 0; i < n; ++i) {
267 unsigned long address;
268 pte_t *pte;
269 unsigned int level;
270
271 address = (unsigned long) page_address(&p[i]);
272 pte = lookup_address(address, &level);
273 BUG_ON(!pte);
274 BUG_ON(level != PG_LEVEL_4K);
275
276 set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
277 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
278 __flush_tlb_one(address);
279 }
280}
281
282bool kmemcheck_page_is_tracked(struct page *p)
283{
284 /* This will also check the "hidden" flag of the PTE. */
285 return kmemcheck_pte_lookup((unsigned long) page_address(p));
286}
287
288void kmemcheck_hide_pages(struct page *p, unsigned int n)
289{
290 unsigned int i;
291
292 for (i = 0; i < n; ++i) {
293 unsigned long address;
294 pte_t *pte;
295 unsigned int level;
296
297 address = (unsigned long) page_address(&p[i]);
298 pte = lookup_address(address, &level);
299 BUG_ON(!pte);
300 BUG_ON(level != PG_LEVEL_4K);
301
302 set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
303 set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
304 __flush_tlb_one(address);
305 }
306}
307
308/* Access may NOT cross page boundary */
309static void kmemcheck_read_strict(struct pt_regs *regs,
310 unsigned long addr, unsigned int size)
311{
312 void *shadow;
313 enum kmemcheck_shadow status;
314
315 shadow = kmemcheck_shadow_lookup(addr);
316 if (!shadow)
317 return;
318
319 kmemcheck_save_addr(addr);
320 status = kmemcheck_shadow_test(shadow, size);
321 if (status == KMEMCHECK_SHADOW_INITIALIZED)
322 return;
323
324 if (kmemcheck_enabled)
325 kmemcheck_error_save(status, addr, size, regs);
326
327 if (kmemcheck_enabled == 2)
328 kmemcheck_enabled = 0;
329
330 /* Don't warn about it again. */
331 kmemcheck_shadow_set(shadow, size);
332}
333
334/* Access may cross page boundary */
335static void kmemcheck_read(struct pt_regs *regs,
336 unsigned long addr, unsigned int size)
337{
338 unsigned long page = addr & PAGE_MASK;
339 unsigned long next_addr = addr + size - 1;
340 unsigned long next_page = next_addr & PAGE_MASK;
341
342 if (likely(page == next_page)) {
343 kmemcheck_read_strict(regs, addr, size);
344 return;
345 }
346
347 /*
348 * What we do is basically to split the access across the
349 * two pages and handle each part separately. Yes, this means
350 * that we may now see reads that are 3 + 5 bytes, for
351 * example (and if both are uninitialized, there will be two
352 * reports), but it makes the code a lot simpler.
353 */
354 kmemcheck_read_strict(regs, addr, next_page - addr);
355 kmemcheck_read_strict(regs, next_page, next_addr - next_page);
356}
357
358static void kmemcheck_write_strict(struct pt_regs *regs,
359 unsigned long addr, unsigned int size)
360{
361 void *shadow;
362
363 shadow = kmemcheck_shadow_lookup(addr);
364 if (!shadow)
365 return;
366
367 kmemcheck_save_addr(addr);
368 kmemcheck_shadow_set(shadow, size);
369}
370
371static void kmemcheck_write(struct pt_regs *regs,
372 unsigned long addr, unsigned int size)
373{
374 unsigned long page = addr & PAGE_MASK;
375 unsigned long next_addr = addr + size - 1;
376 unsigned long next_page = next_addr & PAGE_MASK;
377
378 if (likely(page == next_page)) {
379 kmemcheck_write_strict(regs, addr, size);
380 return;
381 }
382
383 /* See comment in kmemcheck_read(). */
384 kmemcheck_write_strict(regs, addr, next_page - addr);
385 kmemcheck_write_strict(regs, next_page, next_addr - next_page);
386}
387
388/*
389 * Copying is hard. We have two addresses, each of which may be split across
390 * a page (and each page will have different shadow addresses).
391 */
392static void kmemcheck_copy(struct pt_regs *regs,
393 unsigned long src_addr, unsigned long dst_addr, unsigned int size)
394{
395 uint8_t shadow[8];
396 enum kmemcheck_shadow status;
397
398 unsigned long page;
399 unsigned long next_addr;
400 unsigned long next_page;
401
402 uint8_t *x;
403 unsigned int i;
404 unsigned int n;
405
406 BUG_ON(size > sizeof(shadow));
407
408 page = src_addr & PAGE_MASK;
409 next_addr = src_addr + size - 1;
410 next_page = next_addr & PAGE_MASK;
411
412 if (likely(page == next_page)) {
413 /* Same page */
414 x = kmemcheck_shadow_lookup(src_addr);
415 if (x) {
416 kmemcheck_save_addr(src_addr);
417 for (i = 0; i < size; ++i)
418 shadow[i] = x[i];
419 } else {
420 for (i = 0; i < size; ++i)
421 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
422 }
423 } else {
424 n = next_page - src_addr;
425 BUG_ON(n > sizeof(shadow));
426
427 /* First page */
428 x = kmemcheck_shadow_lookup(src_addr);
429 if (x) {
430 kmemcheck_save_addr(src_addr);
431 for (i = 0; i < n; ++i)
432 shadow[i] = x[i];
433 } else {
434 /* Not tracked */
435 for (i = 0; i < n; ++i)
436 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
437 }
438
439 /* Second page */
440 x = kmemcheck_shadow_lookup(next_page);
441 if (x) {
442 kmemcheck_save_addr(next_page);
443 for (i = n; i < size; ++i)
444 shadow[i] = x[i - n];
445 } else {
446 /* Not tracked */
447 for (i = n; i < size; ++i)
448 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
449 }
450 }
451
452 page = dst_addr & PAGE_MASK;
453 next_addr = dst_addr + size - 1;
454 next_page = next_addr & PAGE_MASK;
455
456 if (likely(page == next_page)) {
457 /* Same page */
458 x = kmemcheck_shadow_lookup(dst_addr);
459 if (x) {
460 kmemcheck_save_addr(dst_addr);
461 for (i = 0; i < size; ++i) {
462 x[i] = shadow[i];
463 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
464 }
465 }
466 } else {
467 n = next_page - dst_addr;
468 BUG_ON(n > sizeof(shadow));
469
470 /* First page */
471 x = kmemcheck_shadow_lookup(dst_addr);
472 if (x) {
473 kmemcheck_save_addr(dst_addr);
474 for (i = 0; i < n; ++i) {
475 x[i] = shadow[i];
476 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
477 }
478 }
479
480 /* Second page */
481 x = kmemcheck_shadow_lookup(next_page);
482 if (x) {
483 kmemcheck_save_addr(next_page);
484 for (i = n; i < size; ++i) {
485 x[i - n] = shadow[i];
486 shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
487 }
488 }
489 }
490
491 status = kmemcheck_shadow_test(shadow, size);
492 if (status == KMEMCHECK_SHADOW_INITIALIZED)
493 return;
494
495 if (kmemcheck_enabled)
496 kmemcheck_error_save(status, src_addr, size, regs);
497
498 if (kmemcheck_enabled == 2)
499 kmemcheck_enabled = 0;
500}
501
502enum kmemcheck_method {
503 KMEMCHECK_READ,
504 KMEMCHECK_WRITE,
505};
506
507static void kmemcheck_access(struct pt_regs *regs,
508 unsigned long fallback_address, enum kmemcheck_method fallback_method)
509{
510 const uint8_t *insn;
511 const uint8_t *insn_primary;
512 unsigned int size;
513
514 struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
515
516 /* Recursive fault -- ouch. */
517 if (data->busy) {
518 kmemcheck_show_addr(fallback_address);
519 kmemcheck_error_save_bug(regs);
520 return;
521 }
522
523 data->busy = true;
524
525 insn = (const uint8_t *) regs->ip;
526 insn_primary = kmemcheck_opcode_get_primary(insn);
527
528 kmemcheck_opcode_decode(insn, &size);
529
530 switch (insn_primary[0]) {
531#ifdef CONFIG_KMEMCHECK_BITOPS_OK
532 /* AND, OR, XOR */
533 /*
534 * Unfortunately, these instructions have to be excluded from
535 * our regular checking since they access only some (and not
536 * all) bits. This clears out "bogus" bitfield-access warnings.
537 */
538 case 0x80:
539 case 0x81:
540 case 0x82:
541 case 0x83:
542 switch ((insn_primary[1] >> 3) & 7) {
543 /* OR */
544 case 1:
545 /* AND */
546 case 4:
547 /* XOR */
548 case 6:
549 kmemcheck_write(regs, fallback_address, size);
550 goto out;
551
552 /* ADD */
553 case 0:
554 /* ADC */
555 case 2:
556 /* SBB */
557 case 3:
558 /* SUB */
559 case 5:
560 /* CMP */
561 case 7:
562 break;
563 }
564 break;
565#endif
566
567 /* MOVS, MOVSB, MOVSW, MOVSD */
568 case 0xa4:
569 case 0xa5:
570 /*
571 * These instructions are special because they take two
572 * addresses, but we only get one page fault.
573 */
574 kmemcheck_copy(regs, regs->si, regs->di, size);
575 goto out;
576
577 /* CMPS, CMPSB, CMPSW, CMPSD */
578 case 0xa6:
579 case 0xa7:
580 kmemcheck_read(regs, regs->si, size);
581 kmemcheck_read(regs, regs->di, size);
582 goto out;
583 }
584
585 /*
586 * If the opcode isn't special in any way, we use the data from the
587 * page fault handler to determine the address and type of memory
588 * access.
589 */
590 switch (fallback_method) {
591 case KMEMCHECK_READ:
592 kmemcheck_read(regs, fallback_address, size);
593 goto out;
594 case KMEMCHECK_WRITE:
595 kmemcheck_write(regs, fallback_address, size);
596 goto out;
597 }
598
599out:
600 data->busy = false;
601}
602
603bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
604 unsigned long error_code)
605{
606 pte_t *pte;
607
608 /*
609 * XXX: Is it safe to assume that memory accesses from virtual 86
610 * mode or non-kernel code segments will _never_ access kernel
611 * memory (e.g. tracked pages)? For now, we need this to avoid
612 * invoking kmemcheck for PnP BIOS calls.
613 */
614 if (regs->flags & X86_VM_MASK)
615 return false;
616 if (regs->cs != __KERNEL_CS)
617 return false;
618
619 pte = kmemcheck_pte_lookup(address);
620 if (!pte)
621 return false;
622
623 if (error_code & 2)
624 kmemcheck_access(regs, address, KMEMCHECK_WRITE);
625 else
626 kmemcheck_access(regs, address, KMEMCHECK_READ);
627
628 kmemcheck_show(regs);
629 return true;
630}
631
632bool kmemcheck_trap(struct pt_regs *regs)
633{
634 if (!kmemcheck_active(regs))
635 return false;
636
637 /* We're done. */
638 kmemcheck_hide(regs);
639 return true;
640}
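The read/write/copy handlers above decide whether to report an error via kmemcheck_shadow_test(), which lives in shadow.c and is not part of this hunk; a minimal sketch of that check, assuming one shadow byte per data byte:

static enum kmemcheck_shadow
shadow_test_sketch(const uint8_t *shadow, unsigned int size)
{
	unsigned int i;

	/* The access is clean only if every byte it touches has been
	 * written before; otherwise return the first offending state. */
	for (i = 0; i < size; ++i) {
		if (shadow[i] != KMEMCHECK_SHADOW_INITIALIZED)
			return shadow[i];
	}
	return KMEMCHECK_SHADOW_INITIALIZED;
}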
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
new file mode 100644
index 00000000000..63c19e27aa6
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -0,0 +1,106 @@
1#include <linux/types.h>
2
3#include "opcode.h"
4
5static bool opcode_is_prefix(uint8_t b)
6{
7 return
8 /* Group 1 */
9 b == 0xf0 || b == 0xf2 || b == 0xf3
10 /* Group 2 */
11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
12 || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
13 /* Group 3 */
14 || b == 0x66
15 /* Group 4 */
16 || b == 0x67;
17}
18
19#ifdef CONFIG_X86_64
20static bool opcode_is_rex_prefix(uint8_t b)
21{
22 return (b & 0xf0) == 0x40;
23}
24#else
25static bool opcode_is_rex_prefix(uint8_t b)
26{
27 return false;
28}
29#endif
30
31#define REX_W (1 << 3)
32
33/*
34 * This is a VERY crude opcode decoder. We only need to find the size of the
35 * load/store that caused our #PF and this should work for all the opcodes
36 * that we care about. Moreover, the ones who invented this instruction set
37 * should be shot.
38 */
39void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
40{
41 /* Default operand size */
42 int operand_size_override = 4;
43
44 /* prefixes */
45 for (; opcode_is_prefix(*op); ++op) {
46 if (*op == 0x66)
47 operand_size_override = 2;
48 }
49
50 /* REX prefix */
51 if (opcode_is_rex_prefix(*op)) {
52 uint8_t rex = *op;
53
54 ++op;
55 if (rex & REX_W) {
56 switch (*op) {
57 case 0x63:
58 *size = 4;
59 return;
60 case 0x0f:
61 ++op;
62
63 switch (*op) {
64 case 0xb6:
65 case 0xbe:
66 *size = 1;
67 return;
68 case 0xb7:
69 case 0xbf:
70 *size = 2;
71 return;
72 }
73
74 break;
75 }
76
77 *size = 8;
78 return;
79 }
80 }
81
82 /* escape opcode */
83 if (*op == 0x0f) {
84 ++op;
85
86 /*
87 * This is move with zero-extend and sign-extend, respectively;
88 * we don't have to think about 0xb6/0xbe, because this is
89 * already handled in the conditional below.
90 */
91 if (*op == 0xb7 || *op == 0xbf)
92 operand_size_override = 2;
93 }
94
95 *size = (*op & 1) ? operand_size_override : 1;
96}
97
98const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
99{
100 /* skip prefixes */
101 while (opcode_is_prefix(*op))
102 ++op;
103 if (opcode_is_rex_prefix(*op))
104 ++op;
105 return op;
106}
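Stripped of the special cases, the decoder above follows a short set of rules: skip legacy prefixes, let a 0x66 prefix shrink the operand size to 2, let REX.W widen it to 8, and otherwise use bit 0 of the primary opcode to choose between a byte access and a full-width access. A compressed userspace sketch of that common-case rule (an approximation for illustration, not the patch's decoder):

#include <stdint.h>
#include <stdio.h>

/* Rough operand-size rule for common MOV-style encodings. */
static unsigned int insn_operand_size(const uint8_t *op, int x86_64)
{
	unsigned int size = 4;			/* default operand size */

	for (;;) {
		if (*op == 0x66) {		/* operand-size override */
			size = 2;
			++op;
		} else if (x86_64 && (*op & 0xf0) == 0x40) {	/* REX */
			if (*op & 0x08)		/* REX.W */
				size = 8;
			++op;
		} else {
			break;
		}
	}
	return (*op & 1) ? size : 1;		/* even opcodes are byte-sized */
}

int main(void)
{
	const uint8_t mov_r32[] = { 0x8b, 0x03 };	/* mov eax, [rbx] */
	const uint8_t mov_r16[] = { 0x66, 0x8b, 0x03 };	/* mov ax, [rbx] */
	const uint8_t mov_r64[] = { 0x48, 0x8b, 0x03 };	/* mov rax, [rbx] */

	printf("%u %u %u\n",
	       insn_operand_size(mov_r32, 1),
	       insn_operand_size(mov_r16, 1),
	       insn_operand_size(mov_r64, 1));		/* prints: 4 2 8 */
	return 0;
}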
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
new file mode 100644
index 00000000000..6956aad66b5
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -0,0 +1,9 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
2#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
3
4#include <linux/types.h>
5
6void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
7const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
8
9#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
new file mode 100644
index 00000000000..4ead26eeaf9
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -0,0 +1,22 @@
1#include <linux/mm.h>
2
3#include <asm/pgtable.h>
4
5#include "pte.h"
6
7pte_t *kmemcheck_pte_lookup(unsigned long address)
8{
9 pte_t *pte;
10 unsigned int level;
11
12 pte = lookup_address(address, &level);
13 if (!pte)
14 return NULL;
15 if (level != PG_LEVEL_4K)
16 return NULL;
17 if (!pte_hidden(*pte))
18 return NULL;
19
20 return pte;
21}
22
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
new file mode 100644
index 00000000000..9f596645649
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -0,0 +1,10 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
2#define ARCH__X86__MM__KMEMCHECK__PTE_H
3
4#include <linux/mm.h>
5
6#include <asm/pgtable.h>
7
8pte_t *kmemcheck_pte_lookup(unsigned long address);
9
10#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
new file mode 100644
index 00000000000..036efbea8b2
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -0,0 +1,69 @@
1#include <linux/kernel.h>
2
3#include "opcode.h"
4#include "selftest.h"
5
6struct selftest_opcode {
7 unsigned int expected_size;
8 const uint8_t *insn;
9 const char *desc;
10};
11
12static const struct selftest_opcode selftest_opcodes[] = {
13 /* REP MOVS */
14 {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
15 {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
16
17 /* MOVZX / MOVZXD */
18 {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
19 {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
20
21 /* MOVSX / MOVSXD */
22 {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
23 {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
24
25#ifdef CONFIG_X86_64
26 /* MOVZX / MOVZXD */
27 {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
28 {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
29
30 /* MOVSX / MOVSXD */
31 {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
32 {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
33 {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
34#endif
35};
36
37static bool selftest_opcode_one(const struct selftest_opcode *op)
38{
39 unsigned size;
40
41 kmemcheck_opcode_decode(op->insn, &size);
42
43 if (size == op->expected_size)
44 return true;
45
46 printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
47 op->desc, op->expected_size, size);
48 return false;
49}
50
51static bool selftest_opcodes_all(void)
52{
53 bool pass = true;
54 unsigned int i;
55
56 for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
57 pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
58
59 return pass;
60}
61
62bool kmemcheck_selftest(void)
63{
64 bool pass = true;
65
66 pass = pass && selftest_opcodes_all();
67
68 return pass;
69}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
new file mode 100644
index 00000000000..8fed4fe11f9
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -0,0 +1,6 @@
1#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
2#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
3
4bool kmemcheck_selftest(void);
5
6#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
new file mode 100644
index 00000000000..e773b6bd007
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -0,0 +1,162 @@
1#include <linux/kmemcheck.h>
2#include <linux/module.h>
3#include <linux/mm.h>
4#include <linux/module.h>
5
6#include <asm/page.h>
7#include <asm/pgtable.h>
8
9#include "pte.h"
10#include "shadow.h"
11
12/*
13 * Return the shadow address for the given address. Returns NULL if the
14 * address is not tracked.
15 *
16 * We need to be extremely careful not to follow any invalid pointers,
17 * because this function can be called for *any* possible address.
18 */
19void *kmemcheck_shadow_lookup(unsigned long address)
20{
21 pte_t *pte;
22 struct page *page;
23
24 if (!virt_addr_valid(address))
25 return NULL;
26
27 pte = kmemcheck_pte_lookup(address);
28 if (!pte)
29 return NULL;
30
31 page = virt_to_page(address);
32 if (!page->shadow)
33 return NULL;
34 return page->shadow + (address & (PAGE_SIZE - 1));
35}
36
37static void mark_shadow(void *address, unsigned int n,
38 enum kmemcheck_shadow status)
39{
40 unsigned long addr = (unsigned long) address;
41 unsigned long last_addr = addr + n - 1;
42 unsigned long page = addr & PAGE_MASK;
43 unsigned long last_page = last_addr & PAGE_MASK;
44 unsigned int first_n;
45 void *shadow;
46
47 /* If the memory range crosses a page boundary, stop there. */
48 if (page == last_page)
49 first_n = n;
50 else
51 first_n = page + PAGE_SIZE - addr;
52
53 shadow = kmemcheck_shadow_lookup(addr);
54 if (shadow)
55 memset(shadow, status, first_n);
56
57 addr += first_n;
58 n -= first_n;
59
60 /* Do full-page memset()s. */
61 while (n >= PAGE_SIZE) {
62 shadow = kmemcheck_shadow_lookup(addr);
63 if (shadow)
64 memset(shadow, status, PAGE_SIZE);
65
66 addr += PAGE_SIZE;
67 n -= PAGE_SIZE;
68 }
69
70 /* Do the remaining page, if any. */
71 if (n > 0) {
72 shadow = kmemcheck_shadow_lookup(addr);
73 if (shadow)
74 memset(shadow, status, n);
75 }
76}
77
78void kmemcheck_mark_unallocated(void *address, unsigned int n)
79{
80 mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
81}
82
83void kmemcheck_mark_uninitialized(void *address, unsigned int n)
84{
85 mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
86}
87
88/*
89 * Fill the shadow memory of the given address such that the memory at that
90 * address is marked as being initialized.
91 */
92void kmemcheck_mark_initialized(void *address, unsigned int n)
93{
94 mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
95}
96EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
97
98void kmemcheck_mark_freed(void *address, unsigned int n)
99{
100 mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
101}
102
103void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
104{
105 unsigned int i;
106
107 for (i = 0; i < n; ++i)
108 kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
109}
110
111void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
112{
113 unsigned int i;
114
115 for (i = 0; i < n; ++i)
116 kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
117}
118
119void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
120{
121 unsigned int i;
122
123 for (i = 0; i < n; ++i)
124 kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
125}
126
127enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
128{
129 uint8_t *x;
130 unsigned int i;
131
132 x = shadow;
133
134#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
135 /*
136 * Make sure _some_ bytes are initialized. Gcc frequently generates
137 * code to access neighboring bytes.
138 */
139 for (i = 0; i < size; ++i) {
140 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
141 return x[i];
142 }
143#else
144 /* All bytes must be initialized. */
145 for (i = 0; i < size; ++i) {
146 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
147 return x[i];
148 }
149#endif
150
151 return x[0];
152}
153
154void kmemcheck_shadow_set(void *shadow, unsigned int size)
155{
156 uint8_t *x;
157 unsigned int i;
158
159 x = shadow;
160 for (i = 0; i < size; ++i)
161 x[i] = KMEMCHECK_SHADOW_INITIALIZED;
162}
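mark_shadow() above walks a byte range as a partial first page, whole middle pages and a partial tail, so that every shadow lookup stays within a single page. The same splitting as a standalone sketch (the page size constant and mark_one() callback are illustrative):

#include <stdio.h>

#define PG_SIZE	4096UL
#define PG_MASK	(~(PG_SIZE - 1))

static void mark_one(unsigned long addr, unsigned long len)
{
	printf("mark %#lx..%#lx (%lu bytes)\n", addr, addr + len - 1, len);
}

static void mark_range(unsigned long addr, unsigned long n)
{
	unsigned long first_n;

	/* Partial (or whole) first page. */
	if ((addr & PG_MASK) == ((addr + n - 1) & PG_MASK))
		first_n = n;
	else
		first_n = (addr & PG_MASK) + PG_SIZE - addr;

	mark_one(addr, first_n);
	addr += first_n;
	n -= first_n;

	/* Whole middle pages. */
	while (n >= PG_SIZE) {
		mark_one(addr, PG_SIZE);
		addr += PG_SIZE;
		n -= PG_SIZE;
	}

	/* Partial tail, if any. */
	if (n)
		mark_one(addr, n);
}

int main(void)
{
	mark_range(0x1ff0, 0x3000);	/* crosses three page boundaries */
	return 0;
}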
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
new file mode 100644
index 00000000000..af46d9ab9d8
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -0,0 +1,16 @@
1#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
2#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
3
4enum kmemcheck_shadow {
5 KMEMCHECK_SHADOW_UNALLOCATED,
6 KMEMCHECK_SHADOW_UNINITIALIZED,
7 KMEMCHECK_SHADOW_INITIALIZED,
8 KMEMCHECK_SHADOW_FREED,
9};
10
11void *kmemcheck_shadow_lookup(unsigned long address);
12
13enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
14void kmemcheck_shadow_set(void *shadow, unsigned int size);
15
16#endif
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 605c8be0621..18d244f7020 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,23 +40,22 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
40 40
41static void __init memtest(u64 pattern, u64 start_phys, u64 size) 41static void __init memtest(u64 pattern, u64 start_phys, u64 size)
42{ 42{
43 u64 i, count; 43 u64 *p, *start, *end;
44 u64 *start;
45 u64 start_bad, last_bad; 44 u64 start_bad, last_bad;
46 u64 start_phys_aligned; 45 u64 start_phys_aligned;
47 size_t incr; 46 const size_t incr = sizeof(pattern);
48 47
49 incr = sizeof(pattern);
50 start_phys_aligned = ALIGN(start_phys, incr); 48 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned); 49 start = __va(start_phys_aligned);
50 end = start + (size - (start_phys_aligned - start_phys)) / incr;
53 start_bad = 0; 51 start_bad = 0;
54 last_bad = 0; 52 last_bad = 0;
55 53
56 for (i = 0; i < count; i++) 54 for (p = start; p < end; p++)
57 start[i] = pattern; 55 *p = pattern;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { 56
59 if (*start == pattern) 57 for (p = start; p < end; p++, start_phys_aligned += incr) {
58 if (*p == pattern)
60 continue; 59 continue;
61 if (start_phys_aligned == last_bad + incr) { 60 if (start_phys_aligned == last_bad + incr) {
62 last_bad += incr; 61 last_bad += incr;
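The memtest() rewrite above drops the index/count bookkeeping in favour of walking a pointer over [start, end). The same write-then-verify pattern loop as a small userspace program:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Fill a buffer with a pattern, then verify it, pointer-style. */
static size_t pattern_test(uint64_t *start, uint64_t *end, uint64_t pattern)
{
	uint64_t *p;
	size_t bad = 0;

	for (p = start; p < end; p++)
		*p = pattern;
	for (p = start; p < end; p++)
		if (*p != pattern)
			bad++;
	return bad;
}

int main(void)
{
	enum { WORDS = 1024 };
	uint64_t *buf = malloc(WORDS * sizeof(*buf));

	if (!buf)
		return 1;
	printf("bad words: %zu\n",
	       pattern_test(buf, buf + WORDS, 0x5555555555555555ULL));
	free(buf);
	return 0;
}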
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 2d05a12029d..459913beac7 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
179} 179}
180 180
181/* Initialize bootmem allocator for a node */ 181/* Initialize bootmem allocator for a node */
182void __init setup_node_bootmem(int nodeid, unsigned long start, 182void __init
183 unsigned long end) 183setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
184{ 184{
185 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; 185 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
186 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
186 unsigned long bootmap_start, nodedata_phys; 187 unsigned long bootmap_start, nodedata_phys;
187 void *bootmap; 188 void *bootmap;
188 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
189 int nid; 189 int nid;
190 190
191 if (!end) 191 if (!end)
192 return; 192 return;
193 193
194 /*
195 * Don't confuse VM with a node that doesn't have the
196 * minimum amount of memory:
197 */
198 if (end && (end - start) < NODE_MIN_SIZE)
199 return;
200
194 start = roundup(start, ZONE_ALIGN); 201 start = roundup(start, ZONE_ALIGN);
195 202
196 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 203 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
@@ -272,9 +279,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
272 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, 279 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
273 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); 280 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
274 281
275#ifdef CONFIG_ACPI_NUMA
276 srat_reserve_add_area(nodeid);
277#endif
278 node_set_online(nodeid); 282 node_set_online(nodeid);
279} 283}
280 284
@@ -578,21 +582,6 @@ unsigned long __init numa_free_all_bootmem(void)
578 return pages; 582 return pages;
579} 583}
580 584
581void __init paging_init(void)
582{
583 unsigned long max_zone_pfns[MAX_NR_ZONES];
584
585 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
586 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
587 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
588 max_zone_pfns[ZONE_NORMAL] = max_pfn;
589
590 sparse_memory_present_with_active_regions(MAX_NUMNODES);
591 sparse_init();
592
593 free_area_init_nodes(max_zone_pfns);
594}
595
596static __init int numa_setup(char *opt) 585static __init int numa_setup(char *opt)
597{ 586{
598 if (!opt) 587 if (!opt)
@@ -606,8 +595,6 @@ static __init int numa_setup(char *opt)
606#ifdef CONFIG_ACPI_NUMA 595#ifdef CONFIG_ACPI_NUMA
607 if (!strncmp(opt, "noacpi", 6)) 596 if (!strncmp(opt, "noacpi", 6))
608 acpi_numa = -1; 597 acpi_numa = -1;
609 if (!strncmp(opt, "hotadd=", 7))
610 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
611#endif 598#endif
612 return 0; 599 return 0;
613} 600}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 797f9f107cb..3cfe9ced8a4 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -153,7 +153,7 @@ static void __cpa_flush_all(void *arg)
153 */ 153 */
154 __flush_tlb_all(); 154 __flush_tlb_all();
155 155
156 if (cache && boot_cpu_data.x86_model >= 4) 156 if (cache && boot_cpu_data.x86 >= 4)
157 wbinvd(); 157 wbinvd();
158} 158}
159 159
@@ -208,20 +208,15 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
208 int in_flags, struct page **pages) 208 int in_flags, struct page **pages)
209{ 209{
210 unsigned int i, level; 210 unsigned int i, level;
211 unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
211 212
212 BUG_ON(irqs_disabled()); 213 BUG_ON(irqs_disabled());
213 214
214 on_each_cpu(__cpa_flush_range, NULL, 1); 215 on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
215 216
216 if (!cache) 217 if (!cache || do_wbinvd)
217 return; 218 return;
218 219
219 /* 4M threshold */
220 if (numpages >= 1024) {
221 if (boot_cpu_data.x86_model >= 4)
222 wbinvd();
223 return;
224 }
225 /* 220 /*
226 * We only need to flush on one CPU, 221 * We only need to flush on one CPU,
227 * clflush is a MESI-coherent instruction that 222 * clflush is a MESI-coherent instruction that
@@ -475,7 +470,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
475 470
476 if (!debug_pagealloc) 471 if (!debug_pagealloc)
477 spin_unlock(&cpa_lock); 472 spin_unlock(&cpa_lock);
478 base = alloc_pages(GFP_KERNEL, 0); 473 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
479 if (!debug_pagealloc) 474 if (!debug_pagealloc)
480 spin_lock(&cpa_lock); 475 spin_lock(&cpa_lock);
481 if (!base) 476 if (!base)
@@ -844,13 +839,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
844 839
845 vm_unmap_aliases(); 840 vm_unmap_aliases();
846 841
847 /*
848 * If we're called with lazy mmu updates enabled, the
849 * in-memory pte state may be stale. Flush pending updates to
850 * bring them up to date.
851 */
852 arch_flush_lazy_mmu_mode();
853
854 cpa.vaddr = addr; 842 cpa.vaddr = addr;
855 cpa.pages = pages; 843 cpa.pages = pages;
856 cpa.numpages = numpages; 844 cpa.numpages = numpages;
@@ -895,13 +883,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
895 } else 883 } else
896 cpa_flush_all(cache); 884 cpa_flush_all(cache);
897 885
898 /*
899 * If we've been called with lazy mmu updates enabled, then
900 * make sure that everything gets flushed out before we
901 * return.
902 */
903 arch_flush_lazy_mmu_mode();
904
905out: 886out:
906 return ret; 887 return ret;
907} 888}
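cpa_flush_array() now decides once, up front, whether the range is big enough (1024 4K pages, i.e. 4 MB) to justify a full wbinvd instead of a per-page clflush loop, and passes that decision to the IPI handler. A trivial sketch of the threshold choice (names are illustrative):

#include <stdbool.h>
#include <stdio.h>

/* Flush whole caches for big ranges, per-line flushes for small ones. */
static bool flush_whole_cache(int numpages, bool cache)
{
	return cache && numpages >= 1024;	/* 1024 * 4K = 4M threshold */
}

int main(void)
{
	printf("%d %d\n",
	       flush_whole_cache(256, true),	/* 0: clflush loop */
	       flush_whole_cache(2048, true));	/* 1: wbinvd */
	return 0;
}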
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7aa03a5389f..8e43bdd4545 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,9 +4,11 @@
4#include <asm/tlb.h> 4#include <asm/tlb.h>
5#include <asm/fixmap.h> 5#include <asm/fixmap.h>
6 6
7#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
8
7pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 9pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
8{ 10{
9 return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); 11 return (pte_t *)__get_free_page(PGALLOC_GFP);
10} 12}
11 13
12pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) 14pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -14,9 +16,9 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
14 struct page *pte; 16 struct page *pte;
15 17
16#ifdef CONFIG_HIGHPTE 18#ifdef CONFIG_HIGHPTE
17 pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); 19 pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
18#else 20#else
19 pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); 21 pte = alloc_pages(PGALLOC_GFP, 0);
20#endif 22#endif
21 if (pte) 23 if (pte)
22 pgtable_page_ctor(pte); 24 pgtable_page_ctor(pte);
@@ -161,7 +163,7 @@ static int preallocate_pmds(pmd_t *pmds[])
161 bool failed = false; 163 bool failed = false;
162 164
163 for(i = 0; i < PREALLOCATED_PMDS; i++) { 165 for(i = 0; i < PREALLOCATED_PMDS; i++) {
164 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); 166 pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
165 if (pmd == NULL) 167 if (pmd == NULL)
166 failed = true; 168 failed = true;
167 pmds[i] = pmd; 169 pmds[i] = pmd;
@@ -228,7 +230,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
228 pmd_t *pmds[PREALLOCATED_PMDS]; 230 pmd_t *pmds[PREALLOCATED_PMDS];
229 unsigned long flags; 231 unsigned long flags;
230 232
231 pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 233 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
232 234
233 if (pgd == NULL) 235 if (pgd == NULL)
234 goto out; 236 goto out;
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 01765955baa..2dfcbf9df2a 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -31,17 +31,11 @@ static nodemask_t nodes_parsed __initdata;
31static nodemask_t cpu_nodes_parsed __initdata; 31static nodemask_t cpu_nodes_parsed __initdata;
32static struct bootnode nodes[MAX_NUMNODES] __initdata; 32static struct bootnode nodes[MAX_NUMNODES] __initdata;
33static struct bootnode nodes_add[MAX_NUMNODES]; 33static struct bootnode nodes_add[MAX_NUMNODES];
34static int found_add_area __initdata;
35int hotadd_percent __initdata = 0;
36 34
37static int num_node_memblks __initdata; 35static int num_node_memblks __initdata;
38static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; 36static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
39static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; 37static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
40 38
41/* Too small nodes confuse the VM badly. Usually they result
42 from BIOS bugs. */
43#define NODE_MIN_SIZE (4*1024*1024)
44
45static __init int setup_node(int pxm) 39static __init int setup_node(int pxm)
46{ 40{
47 return acpi_map_pxm_to_node(pxm); 41 return acpi_map_pxm_to_node(pxm);
@@ -66,9 +60,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
66{ 60{
67 struct bootnode *nd = &nodes[i]; 61 struct bootnode *nd = &nodes[i];
68 62
69 if (found_add_area)
70 return;
71
72 if (nd->start < start) { 63 if (nd->start < start) {
73 nd->start = start; 64 nd->start = start;
74 if (nd->end < nd->start) 65 if (nd->end < nd->start)
@@ -86,7 +77,6 @@ static __init void bad_srat(void)
86 int i; 77 int i;
87 printk(KERN_ERR "SRAT: SRAT not used.\n"); 78 printk(KERN_ERR "SRAT: SRAT not used.\n");
88 acpi_numa = -1; 79 acpi_numa = -1;
89 found_add_area = 0;
90 for (i = 0; i < MAX_LOCAL_APIC; i++) 80 for (i = 0; i < MAX_LOCAL_APIC; i++)
91 apicid_to_node[i] = NUMA_NO_NODE; 81 apicid_to_node[i] = NUMA_NO_NODE;
92 for (i = 0; i < MAX_NUMNODES; i++) 82 for (i = 0; i < MAX_NUMNODES; i++)
@@ -182,24 +172,21 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
182 pxm, apic_id, node); 172 pxm, apic_id, node);
183} 173}
184 174
185static int update_end_of_memory(unsigned long end) {return -1;}
186static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
187#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 175#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
188static inline int save_add_info(void) {return 1;} 176static inline int save_add_info(void) {return 1;}
189#else 177#else
190static inline int save_add_info(void) {return 0;} 178static inline int save_add_info(void) {return 0;}
191#endif 179#endif
192/* 180/*
193 * Update nodes_add and decide if to include add are in the zone. 181 * Update nodes_add[]
194 * Both SPARSE and RESERVE need nodes_add information. 182 * This code supports one contiguous hot add area per node
195 * This code supports one contiguous hot add area per node.
196 */ 183 */
197static int __init 184static void __init
198reserve_hotadd(int node, unsigned long start, unsigned long end) 185update_nodes_add(int node, unsigned long start, unsigned long end)
199{ 186{
200 unsigned long s_pfn = start >> PAGE_SHIFT; 187 unsigned long s_pfn = start >> PAGE_SHIFT;
201 unsigned long e_pfn = end >> PAGE_SHIFT; 188 unsigned long e_pfn = end >> PAGE_SHIFT;
202 int ret = 0, changed = 0; 189 int changed = 0;
203 struct bootnode *nd = &nodes_add[node]; 190 struct bootnode *nd = &nodes_add[node];
204 191
205 /* I had some trouble with strange memory hotadd regions breaking 192 /* I had some trouble with strange memory hotadd regions breaking
@@ -210,7 +197,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
210 mistakes */ 197 mistakes */
211 if ((signed long)(end - start) < NODE_MIN_SIZE) { 198 if ((signed long)(end - start) < NODE_MIN_SIZE) {
212 printk(KERN_ERR "SRAT: Hotplug area too small\n"); 199 printk(KERN_ERR "SRAT: Hotplug area too small\n");
213 return -1; 200 return;
214 } 201 }
215 202
216 /* This check might be a bit too strict, but I'm keeping it for now. */ 203 /* This check might be a bit too strict, but I'm keeping it for now. */
@@ -218,12 +205,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
218 printk(KERN_ERR 205 printk(KERN_ERR
219 "SRAT: Hotplug area %lu -> %lu has existing memory\n", 206 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
220 s_pfn, e_pfn); 207 s_pfn, e_pfn);
221 return -1; 208 return;
222 }
223
224 if (!hotadd_enough_memory(&nodes_add[node])) {
225 printk(KERN_ERR "SRAT: Hotplug area too large\n");
226 return -1;
227 } 209 }
228 210
229 /* Looks good */ 211 /* Looks good */
@@ -245,11 +227,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
245 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); 227 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
246 } 228 }
247 229
248 ret = update_end_of_memory(nd->end);
249
250 if (changed) 230 if (changed)
251 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); 231 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
252 return ret; 232 nd->start, nd->end);
253} 233}
254 234
255/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 235/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -310,13 +290,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
310 start, end); 290 start, end);
311 e820_register_active_regions(node, start >> PAGE_SHIFT, 291 e820_register_active_regions(node, start >> PAGE_SHIFT,
312 end >> PAGE_SHIFT); 292 end >> PAGE_SHIFT);
313 push_node_boundaries(node, nd->start >> PAGE_SHIFT,
314 nd->end >> PAGE_SHIFT);
315 293
316 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && 294 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
317 (reserve_hotadd(node, start, end) < 0)) { 295 update_nodes_add(node, start, end);
318 /* Ignore hotadd region. Undo damage */ 296 /* restore nodes[node] */
319 printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
320 *nd = oldnode; 297 *nd = oldnode;
321 if ((nd->start | nd->end) == 0) 298 if ((nd->start | nd->end) == 0)
322 node_clear(node, nodes_parsed); 299 node_clear(node, nodes_parsed);
@@ -345,9 +322,9 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
345 pxmram = 0; 322 pxmram = 0;
346 } 323 }
347 324
348 e820ram = max_pfn - absent_pages_in_range(0, max_pfn); 325 e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
349 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 326 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
350 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 327 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
351 printk(KERN_ERR 328 printk(KERN_ERR
352 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", 329 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
353 (pxmram << PAGE_SHIFT) >> 20, 330 (pxmram << PAGE_SHIFT) >> 20,
@@ -357,17 +334,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
357 return 1; 334 return 1;
358} 335}
359 336
360static void __init unparse_node(int node)
361{
362 int i;
363 node_clear(node, nodes_parsed);
364 node_clear(node, cpu_nodes_parsed);
365 for (i = 0; i < MAX_LOCAL_APIC; i++) {
366 if (apicid_to_node[i] == node)
367 apicid_to_node[i] = NUMA_NO_NODE;
368 }
369}
370
371void __init acpi_numa_arch_fixup(void) {} 337void __init acpi_numa_arch_fixup(void) {}
372 338
373/* Use the information discovered above to actually set up the nodes. */ 339/* Use the information discovered above to actually set up the nodes. */
@@ -379,18 +345,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
379 return -1; 345 return -1;
380 346
381 /* First clean up the node list */ 347 /* First clean up the node list */
382 for (i = 0; i < MAX_NUMNODES; i++) { 348 for (i = 0; i < MAX_NUMNODES; i++)
383 cutoff_node(i, start, end); 349 cutoff_node(i, start, end);
384 /*
385 * don't confuse VM with a node that doesn't have the
386 * minimum memory.
387 */
388 if (nodes[i].end &&
389 (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
390 unparse_node(i);
391 node_set_offline(i);
392 }
393 }
394 350
395 if (!nodes_cover_memory(nodes)) { 351 if (!nodes_cover_memory(nodes)) {
396 bad_srat(); 352 bad_srat();
@@ -423,7 +379,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
423 379
424 if (node == NUMA_NO_NODE) 380 if (node == NUMA_NO_NODE)
425 continue; 381 continue;
426 if (!node_isset(node, node_possible_map)) 382 if (!node_online(node))
427 numa_clear_node(i); 383 numa_clear_node(i);
428 } 384 }
429 numa_init_array(); 385 numa_init_array();
@@ -510,26 +466,6 @@ static int null_slit_node_compare(int a, int b)
510} 466}
511#endif /* CONFIG_NUMA_EMU */ 467#endif /* CONFIG_NUMA_EMU */
512 468
513void __init srat_reserve_add_area(int nodeid)
514{
515 if (found_add_area && nodes_add[nodeid].end) {
516 u64 total_mb;
517
518 printk(KERN_INFO "SRAT: Reserving hot-add memory space "
519 "for node %d at %Lx-%Lx\n",
520 nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
521 total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
522 >> PAGE_SHIFT;
523 total_mb *= sizeof(struct page);
524 total_mb >>= 20;
525 printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
526 "pre-allocated memory.\n", (unsigned long long)total_mb);
527 reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
528 nodes_add[nodeid].end - nodes_add[nodeid].start,
529 BOOTMEM_DEFAULT);
530 }
531}
532
533int __node_distance(int a, int b) 469int __node_distance(int a, int b)
534{ 470{
535 int index; 471 int index;
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 04df67f8a7b..044897be021 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -76,9 +76,9 @@ void
76x86_backtrace(struct pt_regs * const regs, unsigned int depth) 76x86_backtrace(struct pt_regs * const regs, unsigned int depth)
77{ 77{
78 struct frame_head *head = (struct frame_head *)frame_pointer(regs); 78 struct frame_head *head = (struct frame_head *)frame_pointer(regs);
79 unsigned long stack = kernel_trap_sp(regs);
80 79
81 if (!user_mode_vm(regs)) { 80 if (!user_mode_vm(regs)) {
81 unsigned long stack = kernel_stack_pointer(regs);
82 if (depth) 82 if (depth)
83 dump_trace(NULL, regs, (unsigned long *)stack, 0, 83 dump_trace(NULL, regs, (unsigned long *)stack, 0,
84 &backtrace_ops, &depth); 84 &backtrace_ops, &depth);
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864ad49a..b07dd8d0b32 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
40 40
41 switch (val) { 41 switch (val) {
42 case DIE_NMI: 42 case DIE_NMI:
43 if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) 43 case DIE_NMI_IPI:
44 ret = NOTIFY_STOP; 44 model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
45 ret = NOTIFY_STOP;
45 break; 46 break;
46 default: 47 default:
47 break; 48 break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
134static struct notifier_block profile_exceptions_nb = { 135static struct notifier_block profile_exceptions_nb = {
135 .notifier_call = profile_exceptions_notify, 136 .notifier_call = profile_exceptions_notify,
136 .next = NULL, 137 .next = NULL,
137 .priority = 0 138 .priority = 2
138}; 139};
139 140
140static int nmi_setup(void) 141static int nmi_setup(void)
@@ -356,14 +357,11 @@ static void exit_sysfs(void)
356#define exit_sysfs() do { } while (0) 357#define exit_sysfs() do { } while (0)
357#endif /* CONFIG_PM */ 358#endif /* CONFIG_PM */
358 359
359static int p4force;
360module_param(p4force, int, 0);
361
362static int __init p4_init(char **cpu_type) 360static int __init p4_init(char **cpu_type)
363{ 361{
364 __u8 cpu_model = boot_cpu_data.x86_model; 362 __u8 cpu_model = boot_cpu_data.x86_model;
365 363
366 if (!p4force && (cpu_model > 6 || cpu_model == 5)) 364 if (cpu_model > 6 || cpu_model == 5)
367 return 0; 365 return 0;
368 366
369#ifndef CONFIG_SMP 367#ifndef CONFIG_SMP
@@ -389,10 +387,25 @@ static int __init p4_init(char **cpu_type)
389 return 0; 387 return 0;
390} 388}
391 389
390static int force_arch_perfmon;
391static int force_cpu_type(const char *str, struct kernel_param *kp)
392{
393 if (!strcmp(str, "archperfmon")) {
394 force_arch_perfmon = 1;
395 printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
396 }
397
398 return 0;
399}
400module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0);
401
392static int __init ppro_init(char **cpu_type) 402static int __init ppro_init(char **cpu_type)
393{ 403{
394 __u8 cpu_model = boot_cpu_data.x86_model; 404 __u8 cpu_model = boot_cpu_data.x86_model;
395 405
406 if (force_arch_perfmon && cpu_has_arch_perfmon)
407 return 0;
408
396 switch (cpu_model) { 409 switch (cpu_model) {
397 case 0 ... 2: 410 case 0 ... 2:
398 *cpu_type = "i386/ppro"; 411 *cpu_type = "i386/ppro";
@@ -414,6 +427,13 @@ static int __init ppro_init(char **cpu_type)
414 case 15: case 23: 427 case 15: case 23:
415 *cpu_type = "i386/core_2"; 428 *cpu_type = "i386/core_2";
416 break; 429 break;
430 case 26:
431 arch_perfmon_setup_counters();
432 *cpu_type = "i386/core_i7";
433 break;
434 case 28:
435 *cpu_type = "i386/atom";
436 break;
417 default: 437 default:
418 /* Unknown */ 438 /* Unknown */
419 return 0; 439 return 0;
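The new cpu_type option is a module_param_call() hook whose handler simply pattern-matches the string "archperfmon" and sets a flag that ppro_init() consults. A userspace sketch of that handler's behaviour (the kernel_param plumbing is omitted):

#include <stdio.h>
#include <string.h>

static int force_arch_perfmon;

/* Set the flag only when the parameter value matches. */
static int force_cpu_type(const char *str)
{
	if (!strcmp(str, "archperfmon")) {
		force_arch_perfmon = 1;
		printf("oprofile: forcing architectural perfmon\n");
	}
	return 0;
}

int main(void)
{
	force_cpu_type("core_2");	/* ignored */
	force_cpu_type("archperfmon");	/* sets the flag */
	printf("force_arch_perfmon=%d\n", force_arch_perfmon);
	return 0;
}

With oprofile built into the kernel this would presumably be selected as oprofile.cpu_type=archperfmon on the command line (or as a modprobe option when built as a module); check the oprofile documentation for the exact spelling.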
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 10131fbdaad..4da7230b3d1 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/apic.h> 19#include <asm/apic.h>
20#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h> 21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
@@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
136 u64 val; 136 u64 val;
137 int i; 137 int i;
138 138
139 /*
140 * This can happen if perf counters are in use when
141 * we steal the die notifier NMI.
142 */
143 if (unlikely(!reset_value))
144 goto out;
145
139 for (i = 0 ; i < num_counters; ++i) { 146 for (i = 0 ; i < num_counters; ++i) {
140 if (!reset_value[i]) 147 if (!reset_value[i])
141 continue; 148 continue;
@@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
146 } 153 }
147 } 154 }
148 155
156out:
149 /* Only P6 based Pentium M need to re-unmask the apic vector but it 157 /* Only P6 based Pentium M need to re-unmask the apic vector but it
150 * doesn't hurt other P6 variant */ 158 * doesn't hurt other P6 variant */
151 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); 159 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index fecbce6e7d7..0696d506c4a 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -889,6 +889,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
889 return 0; 889 return 0;
890 } 890 }
891 891
892 if (io_apic_assign_pci_irqs)
893 return 0;
894
892 /* Find IRQ routing entry */ 895 /* Find IRQ routing entry */
893 896
894 if (!pirq_table) 897 if (!pirq_table)
@@ -1039,56 +1042,15 @@ static void __init pcibios_fixup_irqs(void)
1039 pirq_penalty[dev->irq]++; 1042 pirq_penalty[dev->irq]++;
1040 } 1043 }
1041 1044
1045 if (io_apic_assign_pci_irqs)
1046 return;
1047
1042 dev = NULL; 1048 dev = NULL;
1043 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1049 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
1044 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 1050 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
1045 if (!pin) 1051 if (!pin)
1046 continue; 1052 continue;
1047 1053
1048#ifdef CONFIG_X86_IO_APIC
1049 /*
1050 * Recalculate IRQ numbers if we use the I/O APIC.
1051 */
1052 if (io_apic_assign_pci_irqs) {
1053 int irq;
1054
1055 /*
1056 * interrupt pins are numbered starting from 1
1057 */
1058 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
1059 PCI_SLOT(dev->devfn), pin - 1);
1060 /*
1061 * Busses behind bridges are typically not listed in the
1062 * MP-table. In this case we have to look up the IRQ
1063 * based on the parent bus, parent slot, and pin number.
1064 * The SMP code detects such bridged busses itself so we
1065 * should get into this branch reliably.
1066 */
1067 if (irq < 0 && dev->bus->parent) {
1068 /* go back to the bridge */
1069 struct pci_dev *bridge = dev->bus->self;
1070 int bus;
1071
1072 pin = pci_swizzle_interrupt_pin(dev, pin);
1073 bus = bridge->bus->number;
1074 irq = IO_APIC_get_PCI_irq_vector(bus,
1075 PCI_SLOT(bridge->devfn), pin - 1);
1076 if (irq >= 0)
1077 dev_warn(&dev->dev,
1078 "using bridge %s INT %c to "
1079 "get IRQ %d\n",
1080 pci_name(bridge),
1081 'A' + pin - 1, irq);
1082 }
1083 if (irq >= 0) {
1084 dev_info(&dev->dev,
1085 "PCI->APIC IRQ transform: INT %c "
1086 "-> IRQ %d\n",
1087 'A' + pin - 1, irq);
1088 dev->irq = irq;
1089 }
1090 }
1091#endif
1092 /* 1054 /*
1093 * Still no IRQ? Try to lookup one... 1055 * Still no IRQ? Try to lookup one...
1094 */ 1056 */
@@ -1183,6 +1145,19 @@ int __init pcibios_irq_init(void)
1183 pcibios_enable_irq = pirq_enable_irq; 1145 pcibios_enable_irq = pirq_enable_irq;
1184 1146
1185 pcibios_fixup_irqs(); 1147 pcibios_fixup_irqs();
1148
1149 if (io_apic_assign_pci_irqs && pci_routeirq) {
1150 struct pci_dev *dev = NULL;
1151 /*
1152 * PCI IRQ routing is set up by pci_enable_device(), but we
1153 * also do it here in case there are still broken drivers that
1154 * don't use pci_enable_device().
1155 */
1156 printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
1157 for_each_pci_dev(dev)
1158 pirq_enable_irq(dev);
1159 }
1160
1186 return 0; 1161 return 0;
1187} 1162}
1188 1163
@@ -1213,16 +1188,23 @@ void pcibios_penalize_isa_irq(int irq, int active)
1213static int pirq_enable_irq(struct pci_dev *dev) 1188static int pirq_enable_irq(struct pci_dev *dev)
1214{ 1189{
1215 u8 pin; 1190 u8 pin;
1216 struct pci_dev *temp_dev;
1217 1191
1218 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 1192 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
1219 if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { 1193 if (pin && !pcibios_lookup_irq(dev, 1)) {
1220 char *msg = ""; 1194 char *msg = "";
1221 1195
1196 if (!io_apic_assign_pci_irqs && dev->irq)
1197 return 0;
1198
1222 if (io_apic_assign_pci_irqs) { 1199 if (io_apic_assign_pci_irqs) {
1200#ifdef CONFIG_X86_IO_APIC
1201 struct pci_dev *temp_dev;
1223 int irq; 1202 int irq;
1203 struct io_apic_irq_attr irq_attr;
1224 1204
1225 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1); 1205 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
1206 PCI_SLOT(dev->devfn),
1207 pin - 1, &irq_attr);
1226 /* 1208 /*
1227 * Busses behind bridges are typically not listed in the MP-table. 1209 * Busses behind bridges are typically not listed in the MP-table.
1228 * In this case we have to look up the IRQ based on the parent bus, 1210 * In this case we have to look up the IRQ based on the parent bus,
@@ -1235,7 +1217,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
1235 1217
1236 pin = pci_swizzle_interrupt_pin(dev, pin); 1218 pin = pci_swizzle_interrupt_pin(dev, pin);
1237 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1219 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1238 PCI_SLOT(bridge->devfn), pin - 1); 1220 PCI_SLOT(bridge->devfn),
1221 pin - 1, &irq_attr);
1239 if (irq >= 0) 1222 if (irq >= 0)
1240 dev_warn(&dev->dev, "using bridge %s " 1223 dev_warn(&dev->dev, "using bridge %s "
1241 "INT %c to get IRQ %d\n", 1224 "INT %c to get IRQ %d\n",
@@ -1245,12 +1228,15 @@ static int pirq_enable_irq(struct pci_dev *dev)
1245 } 1228 }
1246 dev = temp_dev; 1229 dev = temp_dev;
1247 if (irq >= 0) { 1230 if (irq >= 0) {
1231 io_apic_set_pci_routing(&dev->dev, irq,
1232 &irq_attr);
1233 dev->irq = irq;
1248 dev_info(&dev->dev, "PCI->APIC IRQ transform: " 1234 dev_info(&dev->dev, "PCI->APIC IRQ transform: "
1249 "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); 1235 "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
1250 dev->irq = irq;
1251 return 0; 1236 return 0;
1252 } else 1237 } else
1253 msg = "; probably buggy MP table"; 1238 msg = "; probably buggy MP table";
1239#endif
1254 } else if (pci_probe & PCI_BIOS_IRQ_SCAN) 1240 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
1255 msg = ""; 1241 msg = "";
1256 else 1242 else
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 5fa10bb9604..8766b0e216c 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -375,7 +375,7 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
375 if (!fixmem32) 375 if (!fixmem32)
376 return AE_OK; 376 return AE_OK;
377 if ((mcfg_res->start >= fixmem32->address) && 377 if ((mcfg_res->start >= fixmem32->address) &&
378 (mcfg_res->end <= (fixmem32->address + 378 (mcfg_res->end < (fixmem32->address +
379 fixmem32->address_length))) { 379 fixmem32->address_length))) {
380 mcfg_res->flags = 1; 380 mcfg_res->flags = 1;
381 return AE_CTRL_TERMINATE; 381 return AE_CTRL_TERMINATE;
@@ -392,7 +392,7 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
392 return AE_OK; 392 return AE_OK;
393 393
394 if ((mcfg_res->start >= address.minimum) && 394 if ((mcfg_res->start >= address.minimum) &&
395 (mcfg_res->end <= (address.minimum + address.address_length))) { 395 (mcfg_res->end < (address.minimum + address.address_length))) {
396 mcfg_res->flags = 1; 396 mcfg_res->flags = 1;
397 return AE_CTRL_TERMINATE; 397 return AE_CTRL_TERMINATE;
398 } 398 }
@@ -418,7 +418,7 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
418 struct resource mcfg_res; 418 struct resource mcfg_res;
419 419
420 mcfg_res.start = start; 420 mcfg_res.start = start;
421 mcfg_res.end = end; 421 mcfg_res.end = end - 1;
422 mcfg_res.flags = 0; 422 mcfg_res.flags = 0;
423 423
424 acpi_get_devices("PNP0C01", find_mboard_resource, &mcfg_res, NULL); 424 acpi_get_devices("PNP0C01", find_mboard_resource, &mcfg_res, NULL);
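The mmconfig-shared.c hunk fixes an off-by-one: struct resource uses an inclusive ->end, so a region described by (base, length) has to be built as end = base + length - 1 and containment tested with end < base + length rather than <=. A standalone sketch of the inclusive-end containment test:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };	/* end is inclusive */

/* Does [r->start, r->end] fit inside the half-open [base, base + len)? */
static bool range_inside(const struct range *r, uint64_t base, uint64_t len)
{
	return r->start >= base && r->end < base + len;
}

int main(void)
{
	struct range r = { 0xe0000000, 0xe0000000 + 0x10000000 - 1 };

	printf("%d\n", range_inside(&r, 0xe0000000, 0x10000000));	/* 1 */
	printf("%d\n", range_inside(&r, 0xe0000000, 0x08000000));	/* 0 */
	return 0;
}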
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index 58b32db3312..de2abbd0754 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -3,5 +3,5 @@
3nostackp := $(call cc-option, -fno-stack-protector) 3nostackp := $(call cc-option, -fno-stack-protector)
4CFLAGS_cpu_$(BITS).o := $(nostackp) 4CFLAGS_cpu_$(BITS).o := $(nostackp)
5 5
6obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o 6obj-$(CONFIG_PM_SLEEP) += cpu.o
7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o 7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu.c
index 46866a13a93..394cbb88987 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Suspend and hibernation support for x86-64 2 * Suspend support specific for i386/x86-64.
3 * 3 *
4 * Distribute under GPLv2 4 * Distribute under GPLv2
5 * 5 *
@@ -8,19 +8,29 @@
8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> 8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
9 */ 9 */
10 10
11#include <linux/smp.h>
12#include <linux/suspend.h> 11#include <linux/suspend.h>
13#include <asm/proto.h> 12#include <linux/smp.h>
14#include <asm/page.h> 13
15#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include <asm/proto.h>
16#include <asm/mtrr.h> 16#include <asm/mtrr.h>
17#include <asm/page.h>
18#include <asm/mce.h>
17#include <asm/xcr.h> 19#include <asm/xcr.h>
18#include <asm/suspend.h> 20#include <asm/suspend.h>
19#include <asm/debugreg.h> 21#include <asm/debugreg.h>
20 22
21static void fix_processor_context(void); 23#ifdef CONFIG_X86_32
24static struct saved_context saved_context;
22 25
26unsigned long saved_context_ebx;
27unsigned long saved_context_esp, saved_context_ebp;
28unsigned long saved_context_esi, saved_context_edi;
29unsigned long saved_context_eflags;
30#else
31/* CONFIG_X86_64 */
23struct saved_context saved_context; 32struct saved_context saved_context;
33#endif
24 34
25/** 35/**
26 * __save_processor_state - save CPU registers before creating a 36 * __save_processor_state - save CPU registers before creating a
@@ -39,19 +49,35 @@ struct saved_context saved_context;
39 */ 49 */
40static void __save_processor_state(struct saved_context *ctxt) 50static void __save_processor_state(struct saved_context *ctxt)
41{ 51{
52#ifdef CONFIG_X86_32
53 mtrr_save_fixed_ranges(NULL);
54#endif
42 kernel_fpu_begin(); 55 kernel_fpu_begin();
43 56
44 /* 57 /*
45 * descriptor tables 58 * descriptor tables
46 */ 59 */
60#ifdef CONFIG_X86_32
61 store_gdt(&ctxt->gdt);
62 store_idt(&ctxt->idt);
63#else
64/* CONFIG_X86_64 */
47 store_gdt((struct desc_ptr *)&ctxt->gdt_limit); 65 store_gdt((struct desc_ptr *)&ctxt->gdt_limit);
48 store_idt((struct desc_ptr *)&ctxt->idt_limit); 66 store_idt((struct desc_ptr *)&ctxt->idt_limit);
67#endif
49 store_tr(ctxt->tr); 68 store_tr(ctxt->tr);
50 69
51 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ 70 /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
52 /* 71 /*
53 * segment registers 72 * segment registers
54 */ 73 */
74#ifdef CONFIG_X86_32
75 savesegment(es, ctxt->es);
76 savesegment(fs, ctxt->fs);
77 savesegment(gs, ctxt->gs);
78 savesegment(ss, ctxt->ss);
79#else
80/* CONFIG_X86_64 */
55 asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); 81 asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
56 asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); 82 asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
57 asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); 83 asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
@@ -63,31 +89,68 @@ static void __save_processor_state(struct saved_context *ctxt)
63 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); 89 rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
64 mtrr_save_fixed_ranges(NULL); 90 mtrr_save_fixed_ranges(NULL);
65 91
92 rdmsrl(MSR_EFER, ctxt->efer);
93#endif
94
66 /* 95 /*
67 * control registers 96 * control registers
68 */ 97 */
69 rdmsrl(MSR_EFER, ctxt->efer);
70 ctxt->cr0 = read_cr0(); 98 ctxt->cr0 = read_cr0();
71 ctxt->cr2 = read_cr2(); 99 ctxt->cr2 = read_cr2();
72 ctxt->cr3 = read_cr3(); 100 ctxt->cr3 = read_cr3();
101#ifdef CONFIG_X86_32
102 ctxt->cr4 = read_cr4_safe();
103#else
104/* CONFIG_X86_64 */
73 ctxt->cr4 = read_cr4(); 105 ctxt->cr4 = read_cr4();
74 ctxt->cr8 = read_cr8(); 106 ctxt->cr8 = read_cr8();
107#endif
75 hw_breakpoint_disable(); 108 hw_breakpoint_disable();
76} 109}
77 110
111/* Needed by apm.c */
78void save_processor_state(void) 112void save_processor_state(void)
79{ 113{
80 __save_processor_state(&saved_context); 114 __save_processor_state(&saved_context);
81} 115}
116#ifdef CONFIG_X86_32
117EXPORT_SYMBOL(save_processor_state);
118#endif
82 119
83static void do_fpu_end(void) 120static void do_fpu_end(void)
84{ 121{
85 /* 122 /*
86 * Restore FPU regs if necessary 123 * Restore FPU regs if necessary.
87 */ 124 */
88 kernel_fpu_end(); 125 kernel_fpu_end();
89} 126}
90 127
128static void fix_processor_context(void)
129{
130 int cpu = smp_processor_id();
131 struct tss_struct *t = &per_cpu(init_tss, cpu);
132
133 set_tss_desc(cpu, t); /*
134 * This just modifies memory; should not be
135 * necessary. But... This is necessary, because
136 * 386 hardware has concept of busy TSS or some
137 * similar stupidity.
138 */
139
140#ifdef CONFIG_X86_64
141 get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
142
143 syscall_init(); /* This sets MSR_*STAR and related */
144#endif
145 load_TR_desc(); /* This does ltr */
146 load_LDT(&current->active_mm->context); /* This does lldt */
147
148 /*
149 * Now maybe reload the debug registers
150 */
151 load_debug_registers();
152}
153
91/** 154/**
92 * __restore_processor_state - restore the contents of CPU registers saved 155 * __restore_processor_state - restore the contents of CPU registers saved
93 * by __save_processor_state() 156 * by __save_processor_state()
@@ -98,9 +161,16 @@ static void __restore_processor_state(struct saved_context *ctxt)
98 /* 161 /*
99 * control registers 162 * control registers
100 */ 163 */
164 /* cr4 was introduced in the Pentium CPU */
165#ifdef CONFIG_X86_32
166 if (ctxt->cr4)
167 write_cr4(ctxt->cr4);
168#else
169/* CONFIG X86_64 */
101 wrmsrl(MSR_EFER, ctxt->efer); 170 wrmsrl(MSR_EFER, ctxt->efer);
102 write_cr8(ctxt->cr8); 171 write_cr8(ctxt->cr8);
103 write_cr4(ctxt->cr4); 172 write_cr4(ctxt->cr4);
173#endif
104 write_cr3(ctxt->cr3); 174 write_cr3(ctxt->cr3);
105 write_cr2(ctxt->cr2); 175 write_cr2(ctxt->cr2);
106 write_cr0(ctxt->cr0); 176 write_cr0(ctxt->cr0);
@@ -109,13 +179,31 @@ static void __restore_processor_state(struct saved_context *ctxt)
109 * now restore the descriptor tables to their proper values 179 * now restore the descriptor tables to their proper values
 110 * ltr is done in fix_processor_context(). 180 * ltr is done in fix_processor_context().
111 */ 181 */
182#ifdef CONFIG_X86_32
183 load_gdt(&ctxt->gdt);
184 load_idt(&ctxt->idt);
185#else
186/* CONFIG_X86_64 */
112 load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); 187 load_gdt((const struct desc_ptr *)&ctxt->gdt_limit);
113 load_idt((const struct desc_ptr *)&ctxt->idt_limit); 188 load_idt((const struct desc_ptr *)&ctxt->idt_limit);
114 189#endif
115 190
116 /* 191 /*
117 * segment registers 192 * segment registers
118 */ 193 */
194#ifdef CONFIG_X86_32
195 loadsegment(es, ctxt->es);
196 loadsegment(fs, ctxt->fs);
197 loadsegment(gs, ctxt->gs);
198 loadsegment(ss, ctxt->ss);
199
200 /*
201 * sysenter MSRs
202 */
203 if (boot_cpu_has(X86_FEATURE_SEP))
204 enable_sep_cpu();
205#else
206/* CONFIG_X86_64 */
119 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); 207 asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
120 asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); 208 asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
121 asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); 209 asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
@@ -125,6 +213,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
125 wrmsrl(MSR_FS_BASE, ctxt->fs_base); 213 wrmsrl(MSR_FS_BASE, ctxt->fs_base);
126 wrmsrl(MSR_GS_BASE, ctxt->gs_base); 214 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
127 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); 215 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
216#endif
128 217
129 /* 218 /*
130 * restore XCR0 for xsave capable cpu's. 219 * restore XCR0 for xsave capable cpu's.
@@ -136,33 +225,17 @@ static void __restore_processor_state(struct saved_context *ctxt)
136 225
137 do_fpu_end(); 226 do_fpu_end();
138 mtrr_ap_init(); 227 mtrr_ap_init();
228
229#ifdef CONFIG_X86_32
230 mcheck_init(&boot_cpu_data);
231#endif
139} 232}
140 233
234/* Needed by apm.c */
141void restore_processor_state(void) 235void restore_processor_state(void)
142{ 236{
143 __restore_processor_state(&saved_context); 237 __restore_processor_state(&saved_context);
144} 238}
145 239#ifdef CONFIG_X86_32
146static void fix_processor_context(void) 240EXPORT_SYMBOL(restore_processor_state);
147{ 241#endif
148 int cpu = smp_processor_id();
149 struct tss_struct *t = &per_cpu(init_tss, cpu);
150
151 /*
152 * This just modifies memory; should not be necessary. But... This
153 * is necessary, because 386 hardware has concept of busy TSS or some
154 * similar stupidity.
155 */
156 set_tss_desc(cpu, t);
157
158 get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
159
160 syscall_init(); /* This sets MSR_*STAR and related */
161 load_TR_desc(); /* This does ltr */
162 load_LDT(&current->active_mm->context); /* This does lldt */
163
164 /*
165 * Now maybe reload the debug registers
166 */
167 load_debug_registers();
168}
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
deleted file mode 100644
index 2bc3b016de9..00000000000
--- a/arch/x86/power/cpu_32.c
+++ /dev/null
@@ -1,141 +0,0 @@
1/*
2 * Suspend support specific for i386.
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
8 */
9
10#include <linux/module.h>
11#include <linux/suspend.h>
12#include <asm/mtrr.h>
13#include <asm/mce.h>
14#include <asm/xcr.h>
15#include <asm/suspend.h>
16#include <asm/debugreg.h>
17
18static struct saved_context saved_context;
19
20unsigned long saved_context_ebx;
21unsigned long saved_context_esp, saved_context_ebp;
22unsigned long saved_context_esi, saved_context_edi;
23unsigned long saved_context_eflags;
24
25static void __save_processor_state(struct saved_context *ctxt)
26{
27 mtrr_save_fixed_ranges(NULL);
28 kernel_fpu_begin();
29
30 /*
31 * descriptor tables
32 */
33 store_gdt(&ctxt->gdt);
34 store_idt(&ctxt->idt);
35 store_tr(ctxt->tr);
36
37 /*
38 * segment registers
39 */
40 savesegment(es, ctxt->es);
41 savesegment(fs, ctxt->fs);
42 savesegment(gs, ctxt->gs);
43 savesegment(ss, ctxt->ss);
44
45 /*
46 * control registers
47 */
48 ctxt->cr0 = read_cr0();
49 ctxt->cr2 = read_cr2();
50 ctxt->cr3 = read_cr3();
51 ctxt->cr4 = read_cr4_safe();
52 hw_breakpoint_disable();
53}
54
55/* Needed by apm.c */
56void save_processor_state(void)
57{
58 __save_processor_state(&saved_context);
59}
60EXPORT_SYMBOL(save_processor_state);
61
62static void do_fpu_end(void)
63{
64 /*
65 * Restore FPU regs if necessary.
66 */
67 kernel_fpu_end();
68}
69
70static void fix_processor_context(void)
71{
72 int cpu = smp_processor_id();
73 struct tss_struct *t = &per_cpu(init_tss, cpu);
74
75 set_tss_desc(cpu, t); /*
76 * This just modifies memory; should not be
77 * necessary. But... This is necessary, because
78 * 386 hardware has concept of busy TSS or some
79 * similar stupidity.
80 */
81
82 load_TR_desc(); /* This does ltr */
83 load_LDT(&current->active_mm->context); /* This does lldt */
84
85 /*
86 * Now maybe reload the debug registers
87 */
88 load_debug_registers();
89}
90
91static void __restore_processor_state(struct saved_context *ctxt)
92{
93 /*
94 * control registers
95 */
96 /* cr4 was introduced in the Pentium CPU */
97 if (ctxt->cr4)
98 write_cr4(ctxt->cr4);
99 write_cr3(ctxt->cr3);
100 write_cr2(ctxt->cr2);
101 write_cr0(ctxt->cr0);
102
103 /*
104 * now restore the descriptor tables to their proper values
105 * ltr is done i fix_processor_context().
106 */
107 load_gdt(&ctxt->gdt);
108 load_idt(&ctxt->idt);
109
110 /*
111 * segment registers
112 */
113 loadsegment(es, ctxt->es);
114 loadsegment(fs, ctxt->fs);
115 loadsegment(gs, ctxt->gs);
116 loadsegment(ss, ctxt->ss);
117
118 /*
119 * sysenter MSRs
120 */
121 if (boot_cpu_has(X86_FEATURE_SEP))
122 enable_sep_cpu();
123
124 /*
125 * restore XCR0 for xsave capable cpu's.
126 */
127 if (cpu_has_xsave)
128 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
129
130 fix_processor_context();
131 do_fpu_end();
132 mtrr_ap_init();
133 mcheck_init(&boot_cpu_data);
134}
135
136/* Needed by apm.c */
137void restore_processor_state(void)
138{
139 __restore_processor_state(&saved_context);
140}
141EXPORT_SYMBOL(restore_processor_state);
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 1241f118ab5..58bc00f68b1 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
338 } 338 }
339 } 339 }
340 340
341 current->mm->context.vdso = (void *)addr;
342
341 if (compat_uses_vma || !compat) { 343 if (compat_uses_vma || !compat) {
342 /* 344 /*
343 * MAYWRITE to allow gdb to COW and set breakpoints 345 * MAYWRITE to allow gdb to COW and set breakpoints
@@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
358 goto up_fail; 360 goto up_fail;
359 } 361 }
360 362
361 current->mm->context.vdso = (void *)addr;
362 current_thread_info()->sysenter_return = 363 current_thread_info()->sysenter_return =
363 VDSO32_SYMBOL(addr, SYSENTER_RETURN); 364 VDSO32_SYMBOL(addr, SYSENTER_RETURN);
364 365
365 up_fail: 366 up_fail:
367 if (ret)
368 current->mm->context.vdso = NULL;
369
366 up_write(&mm->mmap_sem); 370 up_write(&mm->mmap_sem);
367 371
368 return ret; 372 return ret;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7133cdf9098..21e1aeb9f3e 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -8,6 +8,7 @@
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/random.h> 10#include <linux/random.h>
11#include <linux/elf.h>
11#include <asm/vsyscall.h> 12#include <asm/vsyscall.h>
12#include <asm/vgtod.h> 13#include <asm/vgtod.h>
13#include <asm/proto.h> 14#include <asm/proto.h>
@@ -115,15 +116,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		goto up_fail;
 	}
 
+	current->mm->context.vdso = (void *)addr;
+
 	ret = install_special_mapping(mm, addr, vdso_size,
 				      VM_READ|VM_EXEC|
 				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
 				      VM_ALWAYSDUMP,
 				      vdso_pages);
-	if (ret)
+	if (ret) {
+		current->mm->context.vdso = NULL;
 		goto up_fail;
+	}
 
-	current->mm->context.vdso = (void *)addr;
 up_fail:
 	up_write(&mm->mmap_sem);
 	return ret;
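Both vdso hunks above make the same change: mm->context.vdso is published before install_special_mapping() and cleared again if the mapping fails, so the pointer never outlives a mapping that was torn down on the error path. A simplified sketch of the pattern the two functions converge on (hypothetical helper; address selection, mmap_sem handling and the 32-bit sysenter details are elided):

/*
 * Simplified sketch of the resulting vdso-setup flow; not a verbatim copy
 * of vdso32-setup.c or vma.c.
 */
static int setup_vdso_sketch(struct mm_struct *mm, unsigned long addr,
			     struct page **pages, unsigned long size)
{
	int ret;

	mm->context.vdso = (void *)addr;	/* publish before mapping */

	ret = install_special_mapping(mm, addr, size,
				      VM_READ|VM_EXEC|
				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
				      VM_ALWAYSDUMP,
				      pages);
	if (ret)
		mm->context.vdso = NULL;	/* undo on failure */

	return ret;
}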
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3b767d03fd6..172438f86a0 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -9,5 +9,6 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
 			time.o xen-asm.o xen-asm_$(BITS).o \
 			grant-table.o suspend.o
 
-obj-$(CONFIG_SMP)		+= smp.o spinlock.o
-obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
\ No newline at end of file
+obj-$(CONFIG_SMP)		+= smp.o
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
+obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09e8c36ee8..0a1700a2be9 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -20,6 +20,7 @@
 #include <linux/delay.h>
 #include <linux/start_kernel.h>
 #include <linux/sched.h>
+#include <linux/kprobes.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/mm.h>
@@ -44,6 +45,7 @@
 #include <asm/processor.h>
 #include <asm/proto.h>
 #include <asm/msr-index.h>
+#include <asm/traps.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
@@ -240,10 +242,10 @@ static unsigned long xen_get_debugreg(int reg)
 	return HYPERVISOR_get_debugreg(reg);
 }
 
-void xen_leave_lazy(void)
+static void xen_end_context_switch(struct task_struct *next)
 {
-	paravirt_leave_lazy(paravirt_get_lazy_mode());
 	xen_mc_flush();
+	paravirt_end_context_switch(next);
 }
 
 static unsigned long xen_store_tr(void)
@@ -428,11 +430,44 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 static int cvt_gate_to_trap(int vector, const gate_desc *val,
 			    struct trap_info *info)
 {
+	unsigned long addr;
+
 	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
 		return 0;
 
 	info->vector = vector;
-	info->address = gate_offset(*val);
+
+	addr = gate_offset(*val);
+#ifdef CONFIG_X86_64
+	/*
+	 * Look for known traps using IST, and substitute them
+	 * appropriately.  The debugger ones are the only ones we care
+	 * about.  Xen will handle faults like double_fault and
+	 * machine_check, so we should never see them.  Warn if
+	 * there's an unexpected IST-using fault handler.
+	 */
+	if (addr == (unsigned long)debug)
+		addr = (unsigned long)xen_debug;
+	else if (addr == (unsigned long)int3)
+		addr = (unsigned long)xen_int3;
+	else if (addr == (unsigned long)stack_segment)
+		addr = (unsigned long)xen_stack_segment;
+	else if (addr == (unsigned long)double_fault ||
+		 addr == (unsigned long)nmi) {
+		/* Don't need to handle these */
+		return 0;
+#ifdef CONFIG_X86_MCE
+	} else if (addr == (unsigned long)machine_check) {
+		return 0;
+#endif
+	} else {
+		/* Some other trap using IST? */
+		if (WARN_ON(val->ist != 0))
+			return 0;
+	}
+#endif	/* CONFIG_X86_64 */
+	info->address = addr;
+
 	info->cs = gate_segment(*val);
 	info->flags = val->dpl;
 	/* interrupt gates clear IF */
@@ -623,10 +658,26 @@ static void xen_clts(void)
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
 
+static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
+
+static unsigned long xen_read_cr0(void)
+{
+	unsigned long cr0 = percpu_read(xen_cr0_value);
+
+	if (unlikely(cr0 == 0)) {
+		cr0 = native_read_cr0();
+		percpu_write(xen_cr0_value, cr0);
+	}
+
+	return cr0;
+}
+
 static void xen_write_cr0(unsigned long cr0)
 {
 	struct multicall_space mcs;
 
+	percpu_write(xen_cr0_value, cr0);
+
 	/* Only pay attention to cr0.TS; everything else is
 	   ignored. */
 	mcs = xen_mc_entry(0);
@@ -812,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 
 	.clts = xen_clts,
 
-	.read_cr0 = native_read_cr0,
+	.read_cr0 = xen_read_cr0,
 	.write_cr0 = xen_write_cr0,
 
 	.read_cr4 = native_read_cr4,
@@ -860,10 +911,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	/* Xen takes care of %gs when switching to usermode for us */
 	.swapgs = paravirt_nop,
 
-	.lazy_mode = {
-		.enter = paravirt_enter_lazy_cpu,
-		.leave = xen_leave_lazy,
-	},
+	.start_context_switch = paravirt_start_context_switch,
+	.end_context_switch = xen_end_context_switch,
 };
 
 static const struct pv_apic_ops xen_apic_ops __initdata = {
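The xen_read_cr0()/xen_write_cr0() pair added above keeps a per-CPU shadow of CR0 so that read_cr0() becomes an ordinary memory read instead of a trap-and-emulate round trip into the hypervisor; the write side keeps the shadow coherent. An illustrative (hypothetical) caller in the spirit of the lazy-FPU code, which tests CR0.TS on hot paths:

/*
 * Illustrative only: a caller pattern resembling the kernel's lazy-FPU
 * handling.  With the hunk above, read_cr0() is served from the per-CPU
 * xen_cr0_value shadow rather than trapping into Xen on every call.
 */
static void maybe_enable_fpu_sketch(void)
{
	if (read_cr0() & X86_CR0_TS)
		clts();		/* clear TS before touching FPU state */
}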
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index e25a78e1113..4ceb2858165 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -42,6 +42,7 @@
 #include <linux/highmem.h>
 #include <linux/debugfs.h>
 #include <linux/bug.h>
+#include <linux/module.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -451,10 +452,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval)
 {
-	/* updates to init_mm may be done without lock */
-	if (mm == &init_mm)
-		preempt_disable();
-
 	ADD_STATS(set_pte_at, 1);
 //	ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
 	ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -475,9 +472,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 	}
 	xen_set_pte(ptep, pteval);
 
-out:
-	if (mm == &init_mm)
-		preempt_enable();
+out:	return;
 }
 
 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
@@ -1151,10 +1146,8 @@ static void drop_other_mm_ref(void *info)
 
 	/* If this cpu still has a stale cr3 reference, then make sure
 	   it has been flushed. */
-	if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
+	if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
 		load_cr3(swapper_pg_dir);
-		arch_flush_lazy_cpu_mode();
-	}
 }
 
 static void xen_drop_mm_ref(struct mm_struct *mm)
@@ -1167,7 +1160,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
 			load_cr3(swapper_pg_dir);
 		else
 			leave_mm(smp_processor_id());
-		arch_flush_lazy_cpu_mode();
 	}
 
 	/* Get the "official" set of cpus referring to our pagetable. */
@@ -1875,6 +1867,14 @@ __init void xen_post_allocator_init(void)
 	xen_mark_init_mm_pinned();
 }
 
+static void xen_leave_lazy_mmu(void)
+{
+	preempt_disable();
+	xen_mc_flush();
+	paravirt_leave_lazy_mmu();
+	preempt_enable();
+}
+
 const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.pagetable_setup_start = xen_pagetable_setup_start,
 	.pagetable_setup_done = xen_pagetable_setup_done,
@@ -1948,7 +1948,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
 	.lazy_mode = {
 		.enter = paravirt_enter_lazy_mmu,
-		.leave = xen_leave_lazy,
+		.leave = xen_leave_lazy_mmu,
 	},
 
 	.set_fixmap = xen_set_fixmap,
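xen_leave_lazy_mmu() above flushes the per-CPU multicall buffer, and since xen_set_pte_at() no longer disables preemption itself, the flush is now bracketed by preempt_disable()/preempt_enable() so it drains the buffer that was filled on the same CPU. The generic batching pattern that ends in this hook looks roughly like the following sketch (the helper and loop are illustrative, not taken from any specific caller):

/*
 * Sketch of the generic lazy-MMU batching pattern that eventually invokes
 * pv_mmu_ops.lazy_mode.leave (xen_leave_lazy_mmu here).  Real users are
 * the page-table walkers in mm/; this loop only illustrates the shape.
 */
static void remap_range_sketch(struct mm_struct *mm, unsigned long addr,
			       pte_t *ptep, pte_t *vals, int nr)
{
	int i;

	arch_enter_lazy_mmu_mode();		/* start batching PTE updates */
	for (i = 0; i < nr; i++)
		set_pte_at(mm, addr + i * PAGE_SIZE, ptep + i, vals[i]);
	arch_leave_lazy_mmu_mode();		/* flush the batched multicalls */
}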
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 15c6c68db6a..ad0047f47cd 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -61,9 +61,9 @@ char * __init xen_memory_setup(void)
 	 *  - xen_start_info
 	 * See comment above "struct start_info" in <xen/interface/xen.h>
 	 */
-	e820_add_region(__pa(xen_start_info->mfn_list),
-			xen_start_info->pt_base - xen_start_info->mfn_list,
-			E820_RESERVED);
+	reserve_early(__pa(xen_start_info->mfn_list),
+		      __pa(xen_start_info->pt_base),
+		      "XEN START INFO");
 
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 
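In the setup.c hunk the second argument changes meaning along with the function: e820_add_region() takes a size, while reserve_early() takes an end address, which is why (pt_base - mfn_list) becomes __pa(pt_base). Signature sketch, from memory of the e820 API of this kernel generation (worth verifying against asm/e820.h before relying on it):

/* Assumed prototypes, for orientation only: */
void __init e820_add_region(u64 start, u64 size, int type);
void __init reserve_early(u64 start, u64 end, char *name);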
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 20139464943..22494fd4c9b 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
 void xen_ident_map_ISA(void);
 void xen_reserve_top(void);
 
-void xen_leave_lazy(void);
 void xen_post_allocator_init(void);
 
 char * __init xen_memory_setup(void);
@@ -62,15 +61,26 @@ void xen_setup_vcpu_info_placement(void);
 #ifdef CONFIG_SMP
 void xen_smp_init(void);
 
-void __init xen_init_spinlocks(void);
-__cpuinit void xen_init_lock_cpu(int cpu);
-void xen_uninit_lock_cpu(int cpu);
-
 extern cpumask_var_t xen_cpu_initialized_map;
 #else
 static inline void xen_smp_init(void) {}
 #endif
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init xen_init_spinlocks(void);
+__cpuinit void xen_init_lock_cpu(int cpu);
+void xen_uninit_lock_cpu(int cpu);
+#else
+static inline void xen_init_spinlocks(void)
+{
+}
+static inline void xen_init_lock_cpu(int cpu)
+{
+}
+static inline void xen_uninit_lock_cpu(int cpu)
+{
+}
+#endif
 
 /* Declare an asm function, along with symbols needed to make it
    inlineable */
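The CONFIG_PARAVIRT_SPINLOCKS stubs added above let common Xen code call the spinlock hooks unconditionally; when the option is off, the empty inlines compile away. A hypothetical call-site sketch (the real callers live in the Xen SMP bring-up code):

/*
 * Hypothetical call site: with the stubs in xen-ops.h, shared code can
 * invoke the hook unconditionally and the compiler elides the call when
 * CONFIG_PARAVIRT_SPINLOCKS is not set.
 */
static void __init xen_smp_prepare_sketch(void)
{
	xen_init_spinlocks();	/* no-op stub when PV spinlocks are off */
}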