aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-04-06 03:02:57 -0400
committerIngo Molnar <mingo@elte.hu>2009-04-06 03:02:57 -0400
commitf541ae326fa120fa5c57433e4d9a133df212ce41 (patch)
treebdbd94ec72cfc601118051cb35e8617d55510177 /arch/x86
parente255357764f92afcafafbd4879b222b8c752065a (diff)
parent0221c81b1b8eb0cbb6b30a0ced52ead32d2b4e4c (diff)
Merge branch 'linus' into perfcounters/core-v2
Merge reason: we have gathered quite a few conflicts, need to merge upstream Conflicts: arch/powerpc/kernel/Makefile arch/x86/ia32/ia32entry.S arch/x86/include/asm/hardirq.h arch/x86/include/asm/unistd_32.h arch/x86/include/asm/unistd_64.h arch/x86/kernel/cpu/common.c arch/x86/kernel/irq.c arch/x86/kernel/syscall_table_32.S arch/x86/mm/iomap_32.c include/linux/sched.h kernel/Makefile Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig40
-rw-r--r--arch/x86/Kconfig.cpu17
-rw-r--r--arch/x86/Kconfig.debug8
-rw-r--r--arch/x86/Makefile29
-rw-r--r--arch/x86/boot/Makefile56
-rw-r--r--arch/x86/boot/compressed/Makefile22
-rw-r--r--arch/x86/boot/compressed/misc.c118
-rw-r--r--arch/x86/boot/header.S29
-rw-r--r--arch/x86/boot/memory.c39
-rw-r--r--arch/x86/boot/pm.c44
-rw-r--r--arch/x86/boot/pmjump.S1
-rw-r--r--arch/x86/boot/setup.ld3
-rw-r--r--arch/x86/boot/tools/build.c9
-rw-r--r--arch/x86/boot/video-vga.c22
-rw-r--r--arch/x86/crypto/Makefile3
-rw-r--r--arch/x86/crypto/aes-i586-asm_32.S18
-rw-r--r--arch/x86/crypto/aes-x86_64-asm_64.S6
-rw-r--r--arch/x86/crypto/aes_glue.c20
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S896
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c461
-rw-r--r--arch/x86/ia32/ia32entry.S4
-rw-r--r--arch/x86/ia32/sys_ia32.c22
-rw-r--r--arch/x86/include/asm/aes.h11
-rw-r--r--arch/x86/include/asm/apic.h61
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/boot.h20
-rw-r--r--arch/x86/include/asm/cacheflush.h63
-rwxr-xr-xarch/x86/include/asm/cpu_debug.h226
-rw-r--r--arch/x86/include/asm/cpufeature.h1
-rw-r--r--arch/x86/include/asm/cpumask.h18
-rw-r--r--arch/x86/include/asm/desc.h3
-rw-r--r--arch/x86/include/asm/device.h2
-rw-r--r--arch/x86/include/asm/dma-mapping.h188
-rw-r--r--arch/x86/include/asm/dmi.h19
-rw-r--r--arch/x86/include/asm/e820.h2
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/fixmap.h141
-rw-r--r--arch/x86/include/asm/fixmap_32.h115
-rw-r--r--arch/x86/include/asm/fixmap_64.h79
-rw-r--r--arch/x86/include/asm/ftrace.h32
-rw-r--r--arch/x86/include/asm/hardirq.h1
-rw-r--r--arch/x86/include/asm/highmem.h1
-rw-r--r--arch/x86/include/asm/hw_irq.h1
-rw-r--r--arch/x86/include/asm/i387.h8
-rw-r--r--arch/x86/include/asm/ia32.h7
-rw-r--r--arch/x86/include/asm/init.h18
-rw-r--r--arch/x86/include/asm/io.h3
-rw-r--r--arch/x86/include/asm/io_apic.h5
-rw-r--r--arch/x86/include/asm/iomap.h5
-rw-r--r--arch/x86/include/asm/iommu.h2
-rw-r--r--arch/x86/include/asm/irq.h1
-rw-r--r--arch/x86/include/asm/irq_remapping.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h5
-rw-r--r--arch/x86/include/asm/kexec.h13
-rw-r--r--arch/x86/include/asm/kvm.h24
-rw-r--r--arch/x86/include/asm/kvm_host.h61
-rw-r--r--arch/x86/include/asm/lguest_hcall.h24
-rw-r--r--arch/x86/include/asm/linkage.h19
-rw-r--r--arch/x86/include/asm/mce.h35
-rw-r--r--arch/x86/include/asm/mmzone_32.h43
-rw-r--r--arch/x86/include/asm/msidef.h1
-rw-r--r--arch/x86/include/asm/msr-index.h14
-rw-r--r--arch/x86/include/asm/numa_32.h6
-rw-r--r--arch/x86/include/asm/page_32_types.h5
-rw-r--r--arch/x86/include/asm/page_types.h6
-rw-r--r--arch/x86/include/asm/paravirt.h19
-rw-r--r--arch/x86/include/asm/pat.h5
-rw-r--r--arch/x86/include/asm/pci.h44
-rw-r--r--arch/x86/include/asm/pci_32.h34
-rw-r--r--arch/x86/include/asm/pci_64.h22
-rw-r--r--arch/x86/include/asm/pgtable-2level.h7
-rw-r--r--arch/x86/include/asm/pgtable-3level.h17
-rw-r--r--arch/x86/include/asm/pgtable.h4
-rw-r--r--arch/x86/include/asm/pgtable_32.h3
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h5
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/include/asm/processor.h14
-rw-r--r--arch/x86/include/asm/ptrace-abi.h3
-rw-r--r--arch/x86/include/asm/sections.h7
-rw-r--r--arch/x86/include/asm/setup.h44
-rw-r--r--arch/x86/include/asm/smp.h13
-rw-r--r--arch/x86/include/asm/socket.h3
-rw-r--r--arch/x86/include/asm/spinlock.h3
-rw-r--r--arch/x86/include/asm/suspend_32.h24
-rw-r--r--arch/x86/include/asm/svm.h4
-rw-r--r--arch/x86/include/asm/sys_ia32.h2
-rw-r--r--arch/x86/include/asm/system.h3
-rw-r--r--arch/x86/include/asm/thread_info.h9
-rw-r--r--arch/x86/include/asm/timer.h2
-rw-r--r--arch/x86/include/asm/topology.h99
-rw-r--r--arch/x86/include/asm/uaccess_32.h4
-rw-r--r--arch/x86/include/asm/uaccess_64.h25
-rw-r--r--arch/x86/include/asm/unistd_32.h2
-rw-r--r--arch/x86/include/asm/unistd_64.h4
-rw-r--r--arch/x86/include/asm/uv/uv.h3
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h18
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h153
-rw-r--r--arch/x86/include/asm/virtext.h2
-rw-r--r--arch/x86/include/asm/vmx.h5
-rw-r--r--arch/x86/include/asm/xen/hypercall.h2
-rw-r--r--arch/x86/include/asm/xen/page.h1
-rw-r--r--arch/x86/kernel/Makefile9
-rw-r--r--arch/x86/kernel/acpi/boot.c65
-rw-r--r--arch/x86/kernel/alternative.c46
-rw-r--r--arch/x86/kernel/amd_iommu.c33
-rw-r--r--arch/x86/kernel/apic/apic.c35
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c20
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c53
-rw-r--r--arch/x86/kernel/apic/es7000_32.c238
-rw-r--r--arch/x86/kernel/apic/io_apic.c296
-rw-r--r--arch/x86/kernel/apic/nmi.c11
-rw-r--r--arch/x86/kernel/apic/numaq_32.c23
-rw-r--r--arch/x86/kernel/apic/probe_32.c18
-rw-r--r--arch/x86/kernel/apic/probe_64.c8
-rw-r--r--arch/x86/kernel/apic/summit_32.c70
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c53
-rw-r--r--arch/x86/kernel/apm_32.c263
-rw-r--r--arch/x86/kernel/asm-offsets_32.c1
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/check.c8
-rw-r--r--arch/x86/kernel/cpu/Makefile5
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c54
-rw-r--r--arch/x86/kernel/cpu/centaur.c36
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c37
-rw-r--r--arch/x86/kernel/cpu/common.c407
-rw-r--r--arch/x86/kernel/cpu/cpu.h25
-rwxr-xr-xarch/x86/kernel/cpu/cpu_debug.c901
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig19
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile8
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c54
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c54
-rw-r--r--arch/x86/kernel/cpu/cpufreq/e_powersaver.c21
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c105
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c199
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.h12
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c25
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c75
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c44
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c239
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c399
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h5
-rw-r--r--arch/x86/kernel/cpu/cpufreq/sc520_freq.c30
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c72
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c163
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.h18
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c166
-rw-r--r--arch/x86/kernel/cpu/cyrix.c16
-rw-r--r--arch/x86/kernel/cpu/intel.c40
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c32
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c540
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c68
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c207
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c29
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c1101
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c199
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c10
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c1069
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h4
-rw-r--r--arch/x86/kernel/cpu/proc.c26
-rw-r--r--arch/x86/kernel/cpu/transmeta.c2
-rw-r--r--arch/x86/kernel/cpu/umc.c2
-rw-r--r--arch/x86/kernel/ds.c3
-rw-r--r--arch/x86/kernel/dumpstack.c7
-rw-r--r--arch/x86/kernel/e820.c142
-rw-r--r--arch/x86/kernel/early_printk.c20
-rw-r--r--arch/x86/kernel/efi.c7
-rw-r--r--arch/x86/kernel/efi_64.c21
-rw-r--r--arch/x86/kernel/entry_32.S18
-rw-r--r--arch/x86/kernel/entry_64.S6
-rw-r--r--arch/x86/kernel/ftrace.c265
-rw-r--r--arch/x86/kernel/head32.c5
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_32.S76
-rw-r--r--arch/x86/kernel/hpet.c80
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/i8253.c68
-rw-r--r--arch/x86/kernel/io_delay.c27
-rw-r--r--arch/x86/kernel/ioport.c11
-rw-r--r--arch/x86/kernel/irq.c88
-rw-r--r--arch/x86/kernel/irq_32.c29
-rw-r--r--arch/x86/kernel/irqinit_32.c5
-rw-r--r--arch/x86/kernel/irqinit_64.c4
-rw-r--r--arch/x86/kernel/kdebugfs.c82
-rw-r--r--arch/x86/kernel/kprobes.c20
-rw-r--r--arch/x86/kernel/kvm.c7
-rw-r--r--arch/x86/kernel/machine_kexec_32.c17
-rw-r--r--arch/x86/kernel/machine_kexec_64.c99
-rw-r--r--arch/x86/kernel/mfgpt_32.c1
-rw-r--r--arch/x86/kernel/microcode_amd.c43
-rw-r--r--arch/x86/kernel/microcode_core.c160
-rw-r--r--arch/x86/kernel/microcode_intel.c83
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c2
-rw-r--r--arch/x86/kernel/mpparse.c384
-rw-r--r--arch/x86/kernel/paravirt.c1
-rw-r--r--arch/x86/kernel/pci-calgary_64.c38
-rw-r--r--arch/x86/kernel/pci-dma.c17
-rw-r--r--arch/x86/kernel/pci-gart_64.c34
-rw-r--r--arch/x86/kernel/pci-nommu.c39
-rw-r--r--arch/x86/kernel/pci-swiotlb.c (renamed from arch/x86/kernel/pci-swiotlb_64.c)19
-rw-r--r--arch/x86/kernel/process.c217
-rw-r--r--arch/x86/kernel/process_32.c192
-rw-r--r--arch/x86/kernel/process_64.c190
-rw-r--r--arch/x86/kernel/ptrace.c12
-rw-r--r--arch/x86/kernel/quirks.c6
-rw-r--r--arch/x86/kernel/reboot.c8
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S24
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S189
-rw-r--r--arch/x86/kernel/rtc.c20
-rw-r--r--arch/x86/kernel/setup.c80
-rw-r--r--arch/x86/kernel/setup_percpu.c341
-rw-r--r--arch/x86/kernel/signal.c147
-rw-r--r--arch/x86/kernel/smpboot.c157
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/time_64.c2
-rw-r--r--arch/x86/kernel/tlb_uv.c14
-rw-r--r--arch/x86/kernel/topology.c14
-rw-r--r--arch/x86/kernel/traps.c46
-rw-r--r--arch/x86/kernel/tsc.c122
-rw-r--r--arch/x86/kernel/uv_time.c393
-rw-r--r--arch/x86/kernel/visws_quirks.c2
-rw-r--r--arch/x86/kernel/vmi_32.c6
-rw-r--r--arch/x86/kernel/vmiclock_32.c1
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S21
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S101
-rw-r--r--arch/x86/kernel/vsmp_64.c2
-rw-r--r--arch/x86/kvm/Kconfig7
-rw-r--r--arch/x86/kvm/i8254.c21
-rw-r--r--arch/x86/kvm/i8254.h2
-rw-r--r--arch/x86/kvm/i8259.c25
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/kvm_svm.h16
-rw-r--r--arch/x86/kvm/mmu.c237
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/paging_tmpl.h219
-rw-r--r--arch/x86/kvm/svm.c916
-rw-r--r--arch/x86/kvm/vmx.c393
-rw-r--r--arch/x86/kvm/x86.c432
-rw-r--r--arch/x86/kvm/x86_emulate.c56
-rw-r--r--arch/x86/lguest/boot.c115
-rw-r--r--arch/x86/lguest/i386_head.S4
-rw-r--r--arch/x86/lib/memcpy_64.S143
-rw-r--r--arch/x86/math-emu/fpu_aux.c31
-rw-r--r--arch/x86/mm/Makefile4
-rw-r--r--arch/x86/mm/highmem_32.c85
-rw-r--r--arch/x86/mm/init.c393
-rw-r--r--arch/x86/mm/init_32.c361
-rw-r--r--arch/x86/mm/init_64.c416
-rw-r--r--arch/x86/mm/iomap_32.c30
-rw-r--r--arch/x86/mm/ioremap.c52
-rw-r--r--arch/x86/mm/kmmio.c17
-rw-r--r--arch/x86/mm/memtest.c3
-rw-r--r--arch/x86/mm/mmio-mod.c19
-rw-r--r--arch/x86/mm/numa.c67
-rw-r--r--arch/x86/mm/numa_32.c31
-rw-r--r--arch/x86/mm/numa_64.c111
-rw-r--r--arch/x86/mm/pageattr.c158
-rw-r--r--arch/x86/mm/pat.c5
-rw-r--r--arch/x86/mm/pgtable.c18
-rw-r--r--arch/x86/mm/pgtable_32.c20
-rw-r--r--arch/x86/mm/srat_64.c30
-rw-r--r--arch/x86/mm/tlb.c5
-rw-r--r--arch/x86/oprofile/op_model_p4.c2
-rw-r--r--arch/x86/pci/common.c4
-rw-r--r--arch/x86/pci/early.c19
-rw-r--r--arch/x86/pci/fixup.c24
-rw-r--r--arch/x86/pci/i386.c43
-rw-r--r--arch/x86/pci/legacy.c3
-rw-r--r--arch/x86/pci/mmconfig-shared.c227
-rw-r--r--arch/x86/pci/mmconfig_64.c17
-rw-r--r--arch/x86/power/Makefile5
-rw-r--r--arch/x86/power/cpu_32.c1
-rw-r--r--arch/x86/power/cpu_64.c1
-rw-r--r--arch/x86/power/hibernate_64.c1
-rw-r--r--arch/x86/xen/enlighten.c10
-rw-r--r--arch/x86/xen/mmu.c14
-rw-r--r--arch/x86/xen/smp.c12
284 files changed, 13105 insertions, 7546 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e18dfd3c3b7f..bcf4cae711b4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -34,12 +34,18 @@ config X86
34 select HAVE_FUNCTION_TRACER 34 select HAVE_FUNCTION_TRACER
35 select HAVE_FUNCTION_GRAPH_TRACER 35 select HAVE_FUNCTION_GRAPH_TRACER
36 select HAVE_FUNCTION_TRACE_MCOUNT_TEST 36 select HAVE_FUNCTION_TRACE_MCOUNT_TEST
37 select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
38 select HAVE_FTRACE_SYSCALLS
37 select HAVE_KVM 39 select HAVE_KVM
38 select HAVE_ARCH_KGDB 40 select HAVE_ARCH_KGDB
39 select HAVE_ARCH_TRACEHOOK 41 select HAVE_ARCH_TRACEHOOK
40 select HAVE_GENERIC_DMA_COHERENT if X86_32 42 select HAVE_GENERIC_DMA_COHERENT if X86_32
41 select HAVE_EFFICIENT_UNALIGNED_ACCESS 43 select HAVE_EFFICIENT_UNALIGNED_ACCESS
42 select USER_STACKTRACE_SUPPORT 44 select USER_STACKTRACE_SUPPORT
45 select HAVE_DMA_API_DEBUG
46 select HAVE_KERNEL_GZIP
47 select HAVE_KERNEL_BZIP2
48 select HAVE_KERNEL_LZMA
43 49
44config ARCH_DEFCONFIG 50config ARCH_DEFCONFIG
45 string 51 string
@@ -135,6 +141,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
135config HAVE_SETUP_PER_CPU_AREA 141config HAVE_SETUP_PER_CPU_AREA
136 def_bool y 142 def_bool y
137 143
144config HAVE_DYNAMIC_PER_CPU_AREA
145 def_bool y
146
138config HAVE_CPUMASK_OF_CPU_MAP 147config HAVE_CPUMASK_OF_CPU_MAP
139 def_bool X86_64_SMP 148 def_bool X86_64_SMP
140 149
@@ -158,11 +167,17 @@ config AUDIT_ARCH
158config ARCH_SUPPORTS_OPTIMIZED_INLINING 167config ARCH_SUPPORTS_OPTIMIZED_INLINING
159 def_bool y 168 def_bool y
160 169
170config ARCH_SUPPORTS_DEBUG_PAGEALLOC
171 def_bool y
172
161# Use the generic interrupt handling code in kernel/irq/: 173# Use the generic interrupt handling code in kernel/irq/:
162config GENERIC_HARDIRQS 174config GENERIC_HARDIRQS
163 bool 175 bool
164 default y 176 default y
165 177
178config GENERIC_HARDIRQS_NO__DO_IRQ
179 def_bool y
180
166config GENERIC_IRQ_PROBE 181config GENERIC_IRQ_PROBE
167 bool 182 bool
168 default y 183 default y
@@ -778,6 +793,11 @@ config X86_MCE_AMD
778 Additional support for AMD specific MCE features such as 793 Additional support for AMD specific MCE features such as
779 the DRAM Error Threshold. 794 the DRAM Error Threshold.
780 795
796config X86_MCE_THRESHOLD
797 depends on X86_MCE_AMD || X86_MCE_INTEL
798 bool
799 default y
800
781config X86_MCE_NONFATAL 801config X86_MCE_NONFATAL
782 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" 802 tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
783 depends on X86_32 && X86_MCE 803 depends on X86_32 && X86_MCE
@@ -921,6 +941,12 @@ config X86_CPUID
921 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to 941 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
922 /dev/cpu/31/cpuid. 942 /dev/cpu/31/cpuid.
923 943
944config X86_CPU_DEBUG
945 tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
946 ---help---
947 If you select this option, this will provide various x86 CPUs
948 information through debugfs.
949
924choice 950choice
925 prompt "High Memory Support" 951 prompt "High Memory Support"
926 default HIGHMEM4G if !X86_NUMAQ 952 default HIGHMEM4G if !X86_NUMAQ
@@ -1113,7 +1139,7 @@ config NUMA_EMU
1113 1139
1114config NODES_SHIFT 1140config NODES_SHIFT
1115 int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP 1141 int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
1116 range 1 9 if X86_64 1142 range 1 9
1117 default "9" if MAXSMP 1143 default "9" if MAXSMP
1118 default "6" if X86_64 1144 default "6" if X86_64
1119 default "4" if X86_NUMAQ 1145 default "4" if X86_NUMAQ
@@ -1121,9 +1147,9 @@ config NODES_SHIFT
1121 depends on NEED_MULTIPLE_NODES 1147 depends on NEED_MULTIPLE_NODES
1122 ---help--- 1148 ---help---
1123 Specify the maximum number of NUMA Nodes available on the target 1149 Specify the maximum number of NUMA Nodes available on the target
1124 system. Increases memory reserved to accomodate various tables. 1150 system. Increases memory reserved to accommodate various tables.
1125 1151
1126config HAVE_ARCH_BOOTMEM_NODE 1152config HAVE_ARCH_BOOTMEM
1127 def_bool y 1153 def_bool y
1128 depends on X86_32 && NUMA 1154 depends on X86_32 && NUMA
1129 1155
@@ -1299,7 +1325,7 @@ config MTRR_SANITIZER
1299 add writeback entries. 1325 add writeback entries.
1300 1326
1301 Can be disabled with disable_mtrr_cleanup on the kernel command line. 1327 Can be disabled with disable_mtrr_cleanup on the kernel command line.
1302 The largest mtrr entry size for a continous block can be set with 1328 The largest mtrr entry size for a continuous block can be set with
1303 mtrr_chunk_size. 1329 mtrr_chunk_size.
1304 1330
1305 If unsure, say Y. 1331 If unsure, say Y.
@@ -1421,7 +1447,7 @@ config CRASH_DUMP
1421config KEXEC_JUMP 1447config KEXEC_JUMP
1422 bool "kexec jump (EXPERIMENTAL)" 1448 bool "kexec jump (EXPERIMENTAL)"
1423 depends on EXPERIMENTAL 1449 depends on EXPERIMENTAL
1424 depends on KEXEC && HIBERNATION && X86_32 1450 depends on KEXEC && HIBERNATION
1425 ---help--- 1451 ---help---
1426 Jump between original kernel and kexeced kernel and invoke 1452 Jump between original kernel and kexeced kernel and invoke
1427 code in physical address mode via KEXEC 1453 code in physical address mode via KEXEC
@@ -1814,8 +1840,8 @@ config PCI_MMCONFIG
1814 1840
1815config DMAR 1841config DMAR
1816 bool "Support for DMA Remapping Devices (EXPERIMENTAL)" 1842 bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
1817 depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL 1843 depends on PCI_MSI && ACPI && EXPERIMENTAL
1818 ---help--- 1844 help
1819 DMA remapping (DMAR) devices support enables independent address 1845 DMA remapping (DMAR) devices support enables independent address
1820 translations for Direct Memory Access (DMA) from devices. 1846 translations for Direct Memory Access (DMA) from devices.
1821 These DMA remapping devices are reported via ACPI tables 1847 These DMA remapping devices are reported via ACPI tables
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a95eaf0e582a..924e156a85ab 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -456,24 +456,9 @@ config CPU_SUP_AMD
456 456
457 If unsure, say N. 457 If unsure, say N.
458 458
459config CPU_SUP_CENTAUR_32 459config CPU_SUP_CENTAUR
460 default y 460 default y
461 bool "Support Centaur processors" if PROCESSOR_SELECT 461 bool "Support Centaur processors" if PROCESSOR_SELECT
462 depends on !64BIT
463 ---help---
464 This enables detection, tunings and quirks for Centaur processors
465
466 You need this enabled if you want your kernel to run on a
467 Centaur CPU. Disabling this option on other types of CPUs
468 makes the kernel a tiny bit smaller. Disabling it on a Centaur
469 CPU might render the kernel unbootable.
470
471 If unsure, say N.
472
473config CPU_SUP_CENTAUR_64
474 default y
475 bool "Support Centaur processors" if PROCESSOR_SELECT
476 depends on 64BIT
477 ---help--- 462 ---help---
478 This enables detection, tunings and quirks for Centaur processors 463 This enables detection, tunings and quirks for Centaur processors
479 464
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index fdb45df608b6..d8359e73317f 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -72,14 +72,6 @@ config DEBUG_STACK_USAGE
72 72
73 This option will slow down process creation somewhat. 73 This option will slow down process creation somewhat.
74 74
75config DEBUG_PAGEALLOC
76 bool "Debug page memory allocations"
77 depends on DEBUG_KERNEL
78 ---help---
79 Unmap pages from the kernel linear mapping after free_pages().
80 This results in a large slowdown, but helps to find certain types
81 of memory corruptions.
82
83config DEBUG_PER_CPU_MAPS 75config DEBUG_PER_CPU_MAPS
84 bool "Debug access to per_cpu maps" 76 bool "Debug access to per_cpu maps"
85 depends on DEBUG_KERNEL 77 depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1836191839ee..f05d8c91d9e5 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -153,34 +153,23 @@ endif
153 153
154boot := arch/x86/boot 154boot := arch/x86/boot
155 155
156PHONY += zImage bzImage compressed zlilo bzlilo \ 156BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage install
157 zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install 157
158PHONY += bzImage $(BOOT_TARGETS)
158 159
159# Default kernel to build 160# Default kernel to build
160all: bzImage 161all: bzImage
161 162
162# KBUILD_IMAGE specify target image being built 163# KBUILD_IMAGE specify target image being built
163 KBUILD_IMAGE := $(boot)/bzImage 164KBUILD_IMAGE := $(boot)/bzImage
164zImage zlilo zdisk: KBUILD_IMAGE := $(boot)/zImage
165 165
166zImage bzImage: vmlinux 166bzImage: vmlinux
167 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) 167 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
168 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot 168 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
169 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ 169 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
170 170
171compressed: zImage 171$(BOOT_TARGETS): vmlinux
172 172 $(Q)$(MAKE) $(build)=$(boot) $@
173zlilo bzlilo: vmlinux
174 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
175
176zdisk bzdisk: vmlinux
177 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
178
179fdimage fdimage144 fdimage288 isoimage: vmlinux
180 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
181
182install:
183 $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
184 173
185PHONY += vdso_install 174PHONY += vdso_install
186vdso_install: 175vdso_install:
@@ -205,7 +194,3 @@ define archhelp
205 echo ' FDARGS="..." arguments for the booted kernel' 194 echo ' FDARGS="..." arguments for the booted kernel'
206 echo ' FDINITRD=file initrd for the booted kernel' 195 echo ' FDINITRD=file initrd for the booted kernel'
207endef 196endef
208
209CLEAN_FILES += arch/x86/boot/fdimage \
210 arch/x86/boot/image.iso \
211 arch/x86/boot/mtools.conf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index c70eff69a1fb..6633b6e7505a 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -6,26 +6,24 @@
6# for more details. 6# for more details.
7# 7#
8# Copyright (C) 1994 by Linus Torvalds 8# Copyright (C) 1994 by Linus Torvalds
9# Changed by many, many contributors over the years.
9# 10#
10 11
11# ROOT_DEV specifies the default root-device when making the image. 12# ROOT_DEV specifies the default root-device when making the image.
12# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case 13# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
13# the default of FLOPPY is used by 'build'. 14# the default of FLOPPY is used by 'build'.
14 15
15ROOT_DEV := CURRENT 16ROOT_DEV := CURRENT
16 17
17# If you want to preset the SVGA mode, uncomment the next line and 18# If you want to preset the SVGA mode, uncomment the next line and
18# set SVGA_MODE to whatever number you want. 19# set SVGA_MODE to whatever number you want.
19# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. 20# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
20# The number is the same as you would ordinarily press at bootup. 21# The number is the same as you would ordinarily press at bootup.
21 22
22SVGA_MODE := -DSVGA_MODE=NORMAL_VGA 23SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
23 24
24# If you want the RAM disk device, define this to be the size in blocks. 25targets := vmlinux.bin setup.bin setup.elf bzImage
25 26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
26#RAMDISK := -DRAMDISK=512
27
28targets := vmlinux.bin setup.bin setup.elf zImage bzImage
29subdir- := compressed 27subdir- := compressed
30 28
31setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o 29setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
@@ -59,6 +57,7 @@ $(obj)/cpustr.h: $(obj)/mkcpustr FORCE
59# How to compile the 16-bit code. Note we always compile for -march=i386, 57# How to compile the 16-bit code. Note we always compile for -march=i386,
60# that way we can complain to the user if the CPU is insufficient. 58# that way we can complain to the user if the CPU is insufficient.
61KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ 59KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
60 -DDISABLE_BRANCH_PROFILING \
62 -Wall -Wstrict-prototypes \ 61 -Wall -Wstrict-prototypes \
63 -march=i386 -mregparm=3 \ 62 -march=i386 -mregparm=3 \
64 -include $(srctree)/$(src)/code16gcc.h \ 63 -include $(srctree)/$(src)/code16gcc.h \
@@ -68,20 +67,16 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
68 $(call cc-option, -fno-unit-at-a-time)) \ 67 $(call cc-option, -fno-unit-at-a-time)) \
69 $(call cc-option, -fno-stack-protector) \ 68 $(call cc-option, -fno-stack-protector) \
70 $(call cc-option, -mpreferred-stack-boundary=2) 69 $(call cc-option, -mpreferred-stack-boundary=2)
71KBUILD_CFLAGS += $(call cc-option,-m32) 70KBUILD_CFLAGS += $(call cc-option, -m32)
72KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 71KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
73 72
74$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) 73$(obj)/bzImage: asflags-y := $(SVGA_MODE)
75$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
76$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
77$(obj)/bzImage: BUILDFLAGS := -b
78 74
79quiet_cmd_image = BUILD $@ 75quiet_cmd_image = BUILD $@
80cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \ 76cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
81 $(obj)/vmlinux.bin $(ROOT_DEV) > $@ 77 $(ROOT_DEV) > $@
82 78
83$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \ 79$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
84 $(obj)/vmlinux.bin $(obj)/tools/build FORCE
85 $(call if_changed,image) 80 $(call if_changed,image)
86 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' 81 @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
87 82
@@ -116,9 +111,11 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
116$(obj)/compressed/vmlinux: FORCE 111$(obj)/compressed/vmlinux: FORCE
117 $(Q)$(MAKE) $(build)=$(obj)/compressed $@ 112 $(Q)$(MAKE) $(build)=$(obj)/compressed $@
118 113
119# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel 114# Set this if you want to pass append arguments to the
115# bzdisk/fdimage/isoimage kernel
120FDARGS = 116FDARGS =
121# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel 117# Set this if you want an initrd included with the
118# bzdisk/fdimage/isoimage kernel
122FDINITRD = 119FDINITRD =
123 120
124image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,) 121image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
@@ -127,7 +124,7 @@ $(obj)/mtools.conf: $(src)/mtools.conf.in
127 sed -e 's|@OBJ@|$(obj)|g' < $< > $@ 124 sed -e 's|@OBJ@|$(obj)|g' < $< > $@
128 125
129# This requires write access to /dev/fd0 126# This requires write access to /dev/fd0
130zdisk: $(BOOTIMAGE) $(obj)/mtools.conf 127bzdisk: $(obj)/bzImage $(obj)/mtools.conf
131 MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync 128 MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync
132 syslinux /dev/fd0 ; sync 129 syslinux /dev/fd0 ; sync
133 echo '$(image_cmdline)' | \ 130 echo '$(image_cmdline)' | \
@@ -135,10 +132,10 @@ zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
135 if [ -f '$(FDINITRD)' ] ; then \ 132 if [ -f '$(FDINITRD)' ] ; then \
136 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \ 133 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
137 fi 134 fi
138 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync 135 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux ; sync
139 136
140# These require being root or having syslinux 2.02 or higher installed 137# These require being root or having syslinux 2.02 or higher installed
141fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf 138fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
142 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440 139 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
143 MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync 140 MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync
144 syslinux $(obj)/fdimage ; sync 141 syslinux $(obj)/fdimage ; sync
@@ -147,9 +144,9 @@ fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
147 if [ -f '$(FDINITRD)' ] ; then \ 144 if [ -f '$(FDINITRD)' ] ; then \
148 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \ 145 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
149 fi 146 fi
150 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync 147 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux ; sync
151 148
152fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf 149fdimage288: $(obj)/bzImage $(obj)/mtools.conf
153 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880 150 dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
154 MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync 151 MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync
155 syslinux $(obj)/fdimage ; sync 152 syslinux $(obj)/fdimage ; sync
@@ -158,9 +155,9 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
158 if [ -f '$(FDINITRD)' ] ; then \ 155 if [ -f '$(FDINITRD)' ] ; then \
159 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \ 156 MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
160 fi 157 fi
161 MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync 158 MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux ; sync
162 159
163isoimage: $(BOOTIMAGE) 160isoimage: $(obj)/bzImage
164 -rm -rf $(obj)/isoimage 161 -rm -rf $(obj)/isoimage
165 mkdir $(obj)/isoimage 162 mkdir $(obj)/isoimage
166 for i in lib lib64 share end ; do \ 163 for i in lib lib64 share end ; do \
@@ -170,7 +167,7 @@ isoimage: $(BOOTIMAGE)
170 fi ; \ 167 fi ; \
171 if [ $$i = end ] ; then exit 1 ; fi ; \ 168 if [ $$i = end ] ; then exit 1 ; fi ; \
172 done 169 done
173 cp $(BOOTIMAGE) $(obj)/isoimage/linux 170 cp $(obj)/bzImage $(obj)/isoimage/linux
174 echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg 171 echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
175 if [ -f '$(FDINITRD)' ] ; then \ 172 if [ -f '$(FDINITRD)' ] ; then \
176 cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \ 173 cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
@@ -181,12 +178,13 @@ isoimage: $(BOOTIMAGE)
181 isohybrid $(obj)/image.iso 2>/dev/null || true 178 isohybrid $(obj)/image.iso 2>/dev/null || true
182 rm -rf $(obj)/isoimage 179 rm -rf $(obj)/isoimage
183 180
184zlilo: $(BOOTIMAGE) 181bzlilo: $(obj)/bzImage
185 if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi 182 if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
186 if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi 183 if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
187 cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz 184 cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz
188 cp System.map $(INSTALL_PATH)/ 185 cp System.map $(INSTALL_PATH)/
189 if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi 186 if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
190 187
191install: 188install:
192 sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)" 189 sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
190 System.map "$(INSTALL_PATH)"
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 1771c804e02f..65551c9f8571 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,10 +4,11 @@
4# create a compressed vmlinux image from the original vmlinux 4# create a compressed vmlinux image from the original vmlinux
5# 5#
6 6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head_$(BITS).o misc.o piggy.o 7targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o
8 8
9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC 10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
11KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
11cflags-$(CONFIG_X86_64) := -mcmodel=small 12cflags-$(CONFIG_X86_64) := -mcmodel=small
12KBUILD_CFLAGS += $(cflags-y) 13KBUILD_CFLAGS += $(cflags-y)
13KBUILD_CFLAGS += $(call cc-option,-ffreestanding) 14KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
@@ -47,18 +48,35 @@ ifeq ($(CONFIG_X86_32),y)
47ifdef CONFIG_RELOCATABLE 48ifdef CONFIG_RELOCATABLE
48$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE 49$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
49 $(call if_changed,gzip) 50 $(call if_changed,gzip)
51$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE
52 $(call if_changed,bzip2)
53$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE
54 $(call if_changed,lzma)
50else 55else
51$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE 56$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
52 $(call if_changed,gzip) 57 $(call if_changed,gzip)
58$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
59 $(call if_changed,bzip2)
60$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
61 $(call if_changed,lzma)
53endif 62endif
54LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T 63LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
55 64
56else 65else
66
57$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE 67$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
58 $(call if_changed,gzip) 68 $(call if_changed,gzip)
69$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
70 $(call if_changed,bzip2)
71$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
72 $(call if_changed,lzma)
59 73
60LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T 74LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
61endif 75endif
62 76
63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE 77suffix_$(CONFIG_KERNEL_GZIP) = gz
78suffix_$(CONFIG_KERNEL_BZIP2) = bz2
79suffix_$(CONFIG_KERNEL_LZMA) = lzma
80
81$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
64 $(call if_changed,ld) 82 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index da062216948a..e45be73684ff 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -116,71 +116,13 @@
116/* 116/*
117 * gzip declarations 117 * gzip declarations
118 */ 118 */
119
120#define OF(args) args
121#define STATIC static 119#define STATIC static
122 120
123#undef memset 121#undef memset
124#undef memcpy 122#undef memcpy
125#define memzero(s, n) memset((s), 0, (n)) 123#define memzero(s, n) memset((s), 0, (n))
126 124
127typedef unsigned char uch;
128typedef unsigned short ush;
129typedef unsigned long ulg;
130
131/*
132 * Window size must be at least 32k, and a power of two.
133 * We don't actually have a window just a huge output buffer,
134 * so we report a 2G window size, as that should always be
135 * larger than our output buffer:
136 */
137#define WSIZE 0x80000000
138
139/* Input buffer: */
140static unsigned char *inbuf;
141
142/* Sliding window buffer (and final output buffer): */
143static unsigned char *window;
144
145/* Valid bytes in inbuf: */
146static unsigned insize;
147
148/* Index of next byte to be processed in inbuf: */
149static unsigned inptr;
150
151/* Bytes in output buffer: */
152static unsigned outcnt;
153
154/* gzip flag byte */
155#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
156#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gz file */
157#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
158#define ORIG_NAM 0x08 /* bit 3 set: original file name present */
159#define COMMENT 0x10 /* bit 4 set: file comment present */
160#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */
161#define RESERVED 0xC0 /* bit 6, 7: reserved */
162
163#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf())
164
165/* Diagnostic functions */
166#ifdef DEBUG
167# define Assert(cond, msg) do { if (!(cond)) error(msg); } while (0)
168# define Trace(x) do { fprintf x; } while (0)
169# define Tracev(x) do { if (verbose) fprintf x ; } while (0)
170# define Tracevv(x) do { if (verbose > 1) fprintf x ; } while (0)
171# define Tracec(c, x) do { if (verbose && (c)) fprintf x ; } while (0)
172# define Tracecv(c, x) do { if (verbose > 1 && (c)) fprintf x ; } while (0)
173#else
174# define Assert(cond, msg)
175# define Trace(x)
176# define Tracev(x)
177# define Tracevv(x)
178# define Tracec(c, x)
179# define Tracecv(c, x)
180#endif
181 125
182static int fill_inbuf(void);
183static void flush_window(void);
184static void error(char *m); 126static void error(char *m);
185 127
186/* 128/*
@@ -189,13 +131,8 @@ static void error(char *m);
189static struct boot_params *real_mode; /* Pointer to real-mode data */ 131static struct boot_params *real_mode; /* Pointer to real-mode data */
190static int quiet; 132static int quiet;
191 133
192extern unsigned char input_data[];
193extern int input_len;
194
195static long bytes_out;
196
197static void *memset(void *s, int c, unsigned n); 134static void *memset(void *s, int c, unsigned n);
198static void *memcpy(void *dest, const void *src, unsigned n); 135void *memcpy(void *dest, const void *src, unsigned n);
199 136
200static void __putstr(int, const char *); 137static void __putstr(int, const char *);
201#define putstr(__x) __putstr(0, __x) 138#define putstr(__x) __putstr(0, __x)
@@ -213,7 +150,17 @@ static char *vidmem;
213static int vidport; 150static int vidport;
214static int lines, cols; 151static int lines, cols;
215 152
216#include "../../../../lib/inflate.c" 153#ifdef CONFIG_KERNEL_GZIP
154#include "../../../../lib/decompress_inflate.c"
155#endif
156
157#ifdef CONFIG_KERNEL_BZIP2
158#include "../../../../lib/decompress_bunzip2.c"
159#endif
160
161#ifdef CONFIG_KERNEL_LZMA
162#include "../../../../lib/decompress_unlzma.c"
163#endif
217 164
218static void scroll(void) 165static void scroll(void)
219{ 166{
@@ -282,7 +229,7 @@ static void *memset(void *s, int c, unsigned n)
282 return s; 229 return s;
283} 230}
284 231
285static void *memcpy(void *dest, const void *src, unsigned n) 232void *memcpy(void *dest, const void *src, unsigned n)
286{ 233{
287 int i; 234 int i;
288 const char *s = src; 235 const char *s = src;
@@ -293,38 +240,6 @@ static void *memcpy(void *dest, const void *src, unsigned n)
293 return dest; 240 return dest;
294} 241}
295 242
296/* ===========================================================================
297 * Fill the input buffer. This is called only when the buffer is empty
298 * and at least one byte is really needed.
299 */
300static int fill_inbuf(void)
301{
302 error("ran out of input data");
303 return 0;
304}
305
306/* ===========================================================================
307 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
308 * (Used for the decompressed data only.)
309 */
310static void flush_window(void)
311{
312 /* With my window equal to my output buffer
313 * I only need to compute the crc here.
314 */
315 unsigned long c = crc; /* temporary variable */
316 unsigned n;
317 unsigned char *in, ch;
318
319 in = window;
320 for (n = 0; n < outcnt; n++) {
321 ch = *in++;
322 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
323 }
324 crc = c;
325 bytes_out += (unsigned long)outcnt;
326 outcnt = 0;
327}
328 243
329static void error(char *x) 244static void error(char *x)
330{ 245{
@@ -407,12 +322,8 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
407 lines = real_mode->screen_info.orig_video_lines; 322 lines = real_mode->screen_info.orig_video_lines;
408 cols = real_mode->screen_info.orig_video_cols; 323 cols = real_mode->screen_info.orig_video_cols;
409 324
410 window = output; /* Output buffer (Normally at 1M) */
411 free_mem_ptr = heap; /* Heap */ 325 free_mem_ptr = heap; /* Heap */
412 free_mem_end_ptr = heap + BOOT_HEAP_SIZE; 326 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
413 inbuf = input_data; /* Input buffer */
414 insize = input_len;
415 inptr = 0;
416 327
417#ifdef CONFIG_X86_64 328#ifdef CONFIG_X86_64
418 if ((unsigned long)output & (__KERNEL_ALIGN - 1)) 329 if ((unsigned long)output & (__KERNEL_ALIGN - 1))
@@ -430,10 +341,9 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
430#endif 341#endif
431#endif 342#endif
432 343
433 makecrc();
434 if (!quiet) 344 if (!quiet)
435 putstr("\nDecompressing Linux... "); 345 putstr("\nDecompressing Linux... ");
436 gunzip(); 346 decompress(input_data, input_len, NULL, NULL, output, NULL, error);
437 parse_elf(output); 347 parse_elf(output);
438 if (!quiet) 348 if (!quiet)
439 putstr("done.\nBooting the kernel.\n"); 349 putstr("done.\nBooting the kernel.\n");
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 7ccff4884a23..5d84d1c74e4c 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -24,12 +24,8 @@
24#include "boot.h" 24#include "boot.h"
25#include "offsets.h" 25#include "offsets.h"
26 26
27SETUPSECTS = 4 /* default nr of setup-sectors */
28BOOTSEG = 0x07C0 /* original address of boot-sector */ 27BOOTSEG = 0x07C0 /* original address of boot-sector */
29SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ 28SYSSEG = 0x1000 /* historical load address >> 4 */
30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
31 /* to be loaded */
32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
33 29
34#ifndef SVGA_MODE 30#ifndef SVGA_MODE
35#define SVGA_MODE ASK_VGA 31#define SVGA_MODE ASK_VGA
@@ -97,12 +93,12 @@ bugger_off_msg:
97 .section ".header", "a" 93 .section ".header", "a"
98 .globl hdr 94 .globl hdr
99hdr: 95hdr:
100setup_sects: .byte SETUPSECTS 96setup_sects: .byte 0 /* Filled in by build.c */
101root_flags: .word ROOT_RDONLY 97root_flags: .word ROOT_RDONLY
102syssize: .long SYSSIZE 98syssize: .long 0 /* Filled in by build.c */
103ram_size: .word RAMDISK 99ram_size: .word 0 /* Obsolete */
104vid_mode: .word SVGA_MODE 100vid_mode: .word SVGA_MODE
105root_dev: .word ROOT_DEV 101root_dev: .word 0 /* Filled in by build.c */
106boot_flag: .word 0xAA55 102boot_flag: .word 0xAA55
107 103
108 # offset 512, entry point 104 # offset 512, entry point
@@ -123,14 +119,15 @@ _start:
123 # or else old loadlin-1.5 will fail) 119 # or else old loadlin-1.5 will fail)
124 .globl realmode_swtch 120 .globl realmode_swtch
125realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 121realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
126start_sys_seg: .word SYSSEG 122start_sys_seg: .word SYSSEG # obsolete and meaningless, but just
123 # in case something decided to "use" it
127 .word kernel_version-512 # pointing to kernel version string 124 .word kernel_version-512 # pointing to kernel version string
128 # above section of header is compatible 125 # above section of header is compatible
129 # with loadlin-1.5 (header v1.5). Don't 126 # with loadlin-1.5 (header v1.5). Don't
130 # change it. 127 # change it.
131 128
132type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin, 129type_of_loader: .byte 0 # 0 means ancient bootloader, newer
133 # Bootlin, SYSLX, bootsect...) 130 # bootloaders know to change this.
134 # See Documentation/i386/boot.txt for 131 # See Documentation/i386/boot.txt for
135 # assigned ids 132 # assigned ids
136 133
@@ -142,11 +139,7 @@ CAN_USE_HEAP = 0x80 # If set, the loader also has set
142 # space behind setup.S can be used for 139 # space behind setup.S can be used for
143 # heap purposes. 140 # heap purposes.
144 # Only the loader knows what is free 141 # Only the loader knows what is free
145#ifndef __BIG_KERNEL__
146 .byte 0
147#else
148 .byte LOADED_HIGH 142 .byte LOADED_HIGH
149#endif
150 143
151setup_move_size: .word 0x8000 # size to move, when setup is not 144setup_move_size: .word 0x8000 # size to move, when setup is not
152 # loaded at 0x90000. We will move setup 145 # loaded at 0x90000. We will move setup
@@ -157,11 +150,7 @@ setup_move_size: .word 0x8000 # size to move, when setup is not
157 150
158code32_start: # here loaders can put a different 151code32_start: # here loaders can put a different
159 # start address for 32-bit code. 152 # start address for 32-bit code.
160#ifndef __BIG_KERNEL__
161 .long 0x1000 # 0x1000 = default for zImage
162#else
163 .long 0x100000 # 0x100000 = default for big kernel 153 .long 0x100000 # 0x100000 = default for big kernel
164#endif
165 154
166ramdisk_image: .long 0 # address of loaded ramdisk image 155ramdisk_image: .long 0 # address of loaded ramdisk image
167 # Here the loader puts the 32-bit 156 # Here the loader puts the 32-bit
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 8c3c25f35578..5054c2ddd1a0 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -16,24 +17,38 @@
16 17
17#define SMAP 0x534d4150 /* ASCII "SMAP" */ 18#define SMAP 0x534d4150 /* ASCII "SMAP" */
18 19
20struct e820_ext_entry {
21 struct e820entry std;
22 u32 ext_flags;
23} __attribute__((packed));
24
19static int detect_memory_e820(void) 25static int detect_memory_e820(void)
20{ 26{
21 int count = 0; 27 int count = 0;
22 u32 next = 0; 28 u32 next = 0;
23 u32 size, id; 29 u32 size, id, edi;
24 u8 err; 30 u8 err;
25 struct e820entry *desc = boot_params.e820_map; 31 struct e820entry *desc = boot_params.e820_map;
32 static struct e820_ext_entry buf; /* static so it is zeroed */
33
34 /*
35 * Set this here so that if the BIOS doesn't change this field
36 * but still doesn't change %ecx, we're still okay...
37 */
38 buf.ext_flags = 1;
26 39
27 do { 40 do {
28 size = sizeof(struct e820entry); 41 size = sizeof buf;
29 42
30 /* Important: %edx is clobbered by some BIOSes, 43 /* Important: %edx and %esi are clobbered by some BIOSes,
31 so it must be either used for the error output 44 so they must be either used for the error output
32 or explicitly marked clobbered. */ 45 or explicitly marked clobbered. Given that, assume there
33 asm("int $0x15; setc %0" 46 is something out there clobbering %ebp and %edi, too. */
47 asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
34 : "=d" (err), "+b" (next), "=a" (id), "+c" (size), 48 : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
35 "=m" (*desc) 49 "=D" (edi), "+m" (buf)
36 : "D" (desc), "d" (SMAP), "a" (0xe820)); 50 : "D" (&buf), "d" (SMAP), "a" (0xe820)
51 : "esi");
37 52
38 /* BIOSes which terminate the chain with CF = 1 as opposed 53 /* BIOSes which terminate the chain with CF = 1 as opposed
39 to %ebx = 0 don't always report the SMAP signature on 54 to %ebx = 0 don't always report the SMAP signature on
@@ -51,8 +66,14 @@ static int detect_memory_e820(void)
51 break; 66 break;
52 } 67 }
53 68
69 /* ACPI 3.0 added the extended flags support. If bit 0
70 in the extended flags is zero, we're supposed to simply
71 ignore the entry -- a backwards incompatible change! */
72 if (size > 20 && !(buf.ext_flags & 1))
73 continue;
74
75 *desc++ = buf.std;
54 count++; 76 count++;
55 desc++;
56 } while (next && count < ARRAY_SIZE(boot_params.e820_map)); 77 } while (next && count < ARRAY_SIZE(boot_params.e820_map));
57 78
58 return boot_params.e820_entries = count; 79 return boot_params.e820_entries = count;
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 85a1cd8a8ff8..8062f8915250 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -33,47 +33,6 @@ static void realmode_switch_hook(void)
33} 33}
34 34
35/* 35/*
36 * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000.
37 * A bzImage kernel is loaded and runs at 0x100000.
38 */
39static void move_kernel_around(void)
40{
41 /* Note: rely on the compile-time option here rather than
42 the LOADED_HIGH flag. The Qemu kernel loader unconditionally
43 sets the loadflags to zero. */
44#ifndef __BIG_KERNEL__
45 u16 dst_seg, src_seg;
46 u32 syssize;
47
48 dst_seg = 0x1000 >> 4;
49 src_seg = 0x10000 >> 4;
50 syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */
51
52 while (syssize) {
53 int paras = (syssize >= 0x1000) ? 0x1000 : syssize;
54 int dwords = paras << 2;
55
56 asm volatile("pushw %%es ; "
57 "pushw %%ds ; "
58 "movw %1,%%es ; "
59 "movw %2,%%ds ; "
60 "xorw %%di,%%di ; "
61 "xorw %%si,%%si ; "
62 "rep;movsl ; "
63 "popw %%ds ; "
64 "popw %%es"
65 : "+c" (dwords)
66 : "r" (dst_seg), "r" (src_seg)
67 : "esi", "edi");
68
69 syssize -= paras;
70 dst_seg += paras;
71 src_seg += paras;
72 }
73#endif
74}
75
76/*
77 * Disable all interrupts at the legacy PIC. 36 * Disable all interrupts at the legacy PIC.
78 */ 37 */
79static void mask_all_interrupts(void) 38static void mask_all_interrupts(void)
@@ -147,9 +106,6 @@ void go_to_protected_mode(void)
147 /* Hook before leaving real mode, also disables interrupts */ 106 /* Hook before leaving real mode, also disables interrupts */
148 realmode_switch_hook(); 107 realmode_switch_hook();
149 108
150 /* Move the kernel/setup to their final resting places */
151 move_kernel_around();
152
153 /* Enable the A20 gate */ 109 /* Enable the A20 gate */
154 if (enable_a20()) { 110 if (enable_a20()) {
155 puts("A20 gate not responding, unable to boot...\n"); 111 puts("A20 gate not responding, unable to boot...\n");
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
index 019c17a75851..3e0edc6d2a20 100644
--- a/arch/x86/boot/pmjump.S
+++ b/arch/x86/boot/pmjump.S
@@ -47,6 +47,7 @@ GLOBAL(protected_mode_jump)
47ENDPROC(protected_mode_jump) 47ENDPROC(protected_mode_jump)
48 48
49 .code32 49 .code32
50 .section ".text32","ax"
50GLOBAL(in_pm32) 51GLOBAL(in_pm32)
51 # Set up data segments for flat 32-bit mode 52 # Set up data segments for flat 32-bit mode
52 movl %ecx, %ds 53 movl %ecx, %ds
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index df9234b3a5e0..bb8dc2de7969 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -17,7 +17,8 @@ SECTIONS
17 .header : { *(.header) } 17 .header : { *(.header) }
18 .inittext : { *(.inittext) } 18 .inittext : { *(.inittext) }
19 .initdata : { *(.initdata) } 19 .initdata : { *(.initdata) }
20 .text : { *(.text*) } 20 .text : { *(.text) }
21 .text32 : { *(.text32) }
21 22
22 . = ALIGN(16); 23 . = ALIGN(16);
23 .rodata : { *(.rodata*) } 24 .rodata : { *(.rodata*) }
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 44dc1923c0e3..ee3a4ea923ac 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -130,7 +130,7 @@ static void die(const char * str, ...)
130 130
131static void usage(void) 131static void usage(void)
132{ 132{
133 die("Usage: build [-b] setup system [rootdev] [> image]"); 133 die("Usage: build setup system [rootdev] [> image]");
134} 134}
135 135
136int main(int argc, char ** argv) 136int main(int argc, char ** argv)
@@ -145,11 +145,6 @@ int main(int argc, char ** argv)
145 void *kernel; 145 void *kernel;
146 u32 crc = 0xffffffffUL; 146 u32 crc = 0xffffffffUL;
147 147
148 if (argc > 2 && !strcmp(argv[1], "-b"))
149 {
150 is_big_kernel = 1;
151 argc--, argv++;
152 }
153 if ((argc < 3) || (argc > 4)) 148 if ((argc < 3) || (argc > 4))
154 usage(); 149 usage();
155 if (argc > 3) { 150 if (argc > 3) {
@@ -216,8 +211,6 @@ int main(int argc, char ** argv)
216 die("Unable to mmap '%s': %m", argv[2]); 211 die("Unable to mmap '%s': %m", argv[2]);
217 /* Number of 16-byte paragraphs, including space for a 4-byte CRC */ 212 /* Number of 16-byte paragraphs, including space for a 4-byte CRC */
218 sys_size = (sz + 15 + 4) / 16; 213 sys_size = (sz + 15 + 4) / 16;
219 if (!is_big_kernel && sys_size > DEF_SYSSIZE)
220 die("System is too big. Try using bzImage or modules.");
221 214
222 /* Patch the setup code with the appropriate size parameters */ 215 /* Patch the setup code with the appropriate size parameters */
223 buf[0x1f1] = setup_sectors-1; 216 buf[0x1f1] = setup_sectors-1;
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 5d4742ed4aa2..95d86ce0421c 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -129,41 +129,45 @@ u16 vga_crtc(void)
129 return (inb(0x3cc) & 1) ? 0x3d4 : 0x3b4; 129 return (inb(0x3cc) & 1) ? 0x3d4 : 0x3b4;
130} 130}
131 131
132static void vga_set_480_scanlines(int end) 132static void vga_set_480_scanlines(int lines)
133{ 133{
134 u16 crtc; 134 u16 crtc; /* CRTC base address */
135 u8 csel; 135 u8 csel; /* CRTC miscellaneous output register */
136 u8 ovfw; /* CRTC overflow register */
137 int end = lines-1;
136 138
137 crtc = vga_crtc(); 139 crtc = vga_crtc();
138 140
141 ovfw = 0x3c | ((end >> (8-1)) & 0x02) | ((end >> (9-6)) & 0x40);
142
139 out_idx(0x0c, crtc, 0x11); /* Vertical sync end, unlock CR0-7 */ 143 out_idx(0x0c, crtc, 0x11); /* Vertical sync end, unlock CR0-7 */
140 out_idx(0x0b, crtc, 0x06); /* Vertical total */ 144 out_idx(0x0b, crtc, 0x06); /* Vertical total */
141 out_idx(0x3e, crtc, 0x07); /* Vertical overflow */ 145 out_idx(ovfw, crtc, 0x07); /* Vertical overflow */
142 out_idx(0xea, crtc, 0x10); /* Vertical sync start */ 146 out_idx(0xea, crtc, 0x10); /* Vertical sync start */
143 out_idx(end, crtc, 0x12); /* Vertical display end */ 147 out_idx(end, crtc, 0x12); /* Vertical display end */
144 out_idx(0xe7, crtc, 0x15); /* Vertical blank start */ 148 out_idx(0xe7, crtc, 0x15); /* Vertical blank start */
145 out_idx(0x04, crtc, 0x16); /* Vertical blank end */ 149 out_idx(0x04, crtc, 0x16); /* Vertical blank end */
146 csel = inb(0x3cc); 150 csel = inb(0x3cc);
147 csel &= 0x0d; 151 csel &= 0x0d;
148 csel |= 0xe2; 152 csel |= 0xe2;
149 outb(csel, 0x3cc); 153 outb(csel, 0x3c2);
150} 154}
151 155
152static void vga_set_80x30(void) 156static void vga_set_80x30(void)
153{ 157{
154 vga_set_480_scanlines(0xdf); 158 vga_set_480_scanlines(30*16);
155} 159}
156 160
157static void vga_set_80x34(void) 161static void vga_set_80x34(void)
158{ 162{
159 vga_set_14font(); 163 vga_set_14font();
160 vga_set_480_scanlines(0xdb); 164 vga_set_480_scanlines(34*14);
161} 165}
162 166
163static void vga_set_80x60(void) 167static void vga_set_80x60(void)
164{ 168{
165 vga_set_8font(); 169 vga_set_8font();
166 vga_set_480_scanlines(0xdf); 170 vga_set_480_scanlines(60*8);
167} 171}
168 172
169static int vga_set_mode(struct mode_info *mode) 173static int vga_set_mode(struct mode_info *mode)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 903de4aa5094..ebe7deedd5b4 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
9obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o 9obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
10obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 10obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
11obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 11obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
12obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
12 13
13obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o 14obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
14 15
@@ -19,3 +20,5 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
19aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o 20aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
20twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o 21twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
21salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 22salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
23
24aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S
index e41b147f4509..b949ec2f9af4 100644
--- a/arch/x86/crypto/aes-i586-asm_32.S
+++ b/arch/x86/crypto/aes-i586-asm_32.S
@@ -41,14 +41,14 @@
41#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words) 41#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words)
42 42
43/* offsets to parameters with one register pushed onto stack */ 43/* offsets to parameters with one register pushed onto stack */
44#define tfm 8 44#define ctx 8
45#define out_blk 12 45#define out_blk 12
46#define in_blk 16 46#define in_blk 16
47 47
48/* offsets in crypto_tfm structure */ 48/* offsets in crypto_aes_ctx structure */
49#define klen (crypto_tfm_ctx_offset + 0) 49#define klen (480)
50#define ekey (crypto_tfm_ctx_offset + 4) 50#define ekey (0)
51#define dkey (crypto_tfm_ctx_offset + 244) 51#define dkey (240)
52 52
53// register mapping for encrypt and decrypt subroutines 53// register mapping for encrypt and decrypt subroutines
54 54
@@ -217,7 +217,7 @@
217 do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */ 217 do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */
218 218
219// AES (Rijndael) Encryption Subroutine 219// AES (Rijndael) Encryption Subroutine
220/* void aes_enc_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */ 220/* void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
221 221
222.global aes_enc_blk 222.global aes_enc_blk
223 223
@@ -228,7 +228,7 @@
228 228
229aes_enc_blk: 229aes_enc_blk:
230 push %ebp 230 push %ebp
231 mov tfm(%esp),%ebp 231 mov ctx(%esp),%ebp
232 232
233// CAUTION: the order and the values used in these assigns 233// CAUTION: the order and the values used in these assigns
234// rely on the register mappings 234// rely on the register mappings
@@ -292,7 +292,7 @@ aes_enc_blk:
292 ret 292 ret
293 293
294// AES (Rijndael) Decryption Subroutine 294// AES (Rijndael) Decryption Subroutine
295/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */ 295/* void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
296 296
297.global aes_dec_blk 297.global aes_dec_blk
298 298
@@ -303,7 +303,7 @@ aes_enc_blk:
303 303
304aes_dec_blk: 304aes_dec_blk:
305 push %ebp 305 push %ebp
306 mov tfm(%esp),%ebp 306 mov ctx(%esp),%ebp
307 307
308// CAUTION: the order and the values used in these assigns 308// CAUTION: the order and the values used in these assigns
309// rely on the register mappings 309// rely on the register mappings
diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S
index a120f526c3df..5b577d5a059b 100644
--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -17,8 +17,6 @@
17 17
18#include <asm/asm-offsets.h> 18#include <asm/asm-offsets.h>
19 19
20#define BASE crypto_tfm_ctx_offset
21
22#define R1 %rax 20#define R1 %rax
23#define R1E %eax 21#define R1E %eax
24#define R1X %ax 22#define R1X %ax
@@ -56,13 +54,13 @@
56 .align 8; \ 54 .align 8; \
57FUNC: movq r1,r2; \ 55FUNC: movq r1,r2; \
58 movq r3,r4; \ 56 movq r3,r4; \
59 leaq BASE+KEY+48+4(r8),r9; \ 57 leaq KEY+48(r8),r9; \
60 movq r10,r11; \ 58 movq r10,r11; \
61 movl (r7),r5 ## E; \ 59 movl (r7),r5 ## E; \
62 movl 4(r7),r1 ## E; \ 60 movl 4(r7),r1 ## E; \
63 movl 8(r7),r6 ## E; \ 61 movl 8(r7),r6 ## E; \
64 movl 12(r7),r7 ## E; \ 62 movl 12(r7),r7 ## E; \
65 movl BASE+0(r8),r10 ## E; \ 63 movl 480(r8),r10 ## E; \
66 xorl -48(r9),r5 ## E; \ 64 xorl -48(r9),r5 ## E; \
67 xorl -44(r9),r1 ## E; \ 65 xorl -44(r9),r1 ## E; \
68 xorl -40(r9),r6 ## E; \ 66 xorl -40(r9),r6 ## E; \
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
index 71f457827116..49ae9fe32b22 100644
--- a/arch/x86/crypto/aes_glue.c
+++ b/arch/x86/crypto/aes_glue.c
@@ -5,17 +5,29 @@
5 5
6#include <crypto/aes.h> 6#include <crypto/aes.h>
7 7
8asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); 8asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
9asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); 9asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
10
11void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
12{
13 aes_enc_blk(ctx, dst, src);
14}
15EXPORT_SYMBOL_GPL(crypto_aes_encrypt_x86);
16
17void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
18{
19 aes_dec_blk(ctx, dst, src);
20}
21EXPORT_SYMBOL_GPL(crypto_aes_decrypt_x86);
10 22
11static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) 23static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
12{ 24{
13 aes_enc_blk(tfm, dst, src); 25 aes_enc_blk(crypto_tfm_ctx(tfm), dst, src);
14} 26}
15 27
16static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) 28static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
17{ 29{
18 aes_dec_blk(tfm, dst, src); 30 aes_dec_blk(crypto_tfm_ctx(tfm), dst, src);
19} 31}
20 32
21static struct crypto_alg aes_alg = { 33static struct crypto_alg aes_alg = {
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
new file mode 100644
index 000000000000..caba99601703
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -0,0 +1,896 @@
1/*
2 * Implement AES algorithm in Intel AES-NI instructions.
3 *
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
6 *
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 */
17
18#include <linux/linkage.h>
19
20.text
21
22#define STATE1 %xmm0
23#define STATE2 %xmm4
24#define STATE3 %xmm5
25#define STATE4 %xmm6
26#define STATE STATE1
27#define IN1 %xmm1
28#define IN2 %xmm7
29#define IN3 %xmm8
30#define IN4 %xmm9
31#define IN IN1
32#define KEY %xmm2
33#define IV %xmm3
34
35#define KEYP %rdi
36#define OUTP %rsi
37#define INP %rdx
38#define LEN %rcx
39#define IVP %r8
40#define KLEN %r9d
41#define T1 %r10
42#define TKEYP T1
43#define T2 %r11
44
45_key_expansion_128:
46_key_expansion_256a:
47 pshufd $0b11111111, %xmm1, %xmm1
48 shufps $0b00010000, %xmm0, %xmm4
49 pxor %xmm4, %xmm0
50 shufps $0b10001100, %xmm0, %xmm4
51 pxor %xmm4, %xmm0
52 pxor %xmm1, %xmm0
53 movaps %xmm0, (%rcx)
54 add $0x10, %rcx
55 ret
56
57_key_expansion_192a:
58 pshufd $0b01010101, %xmm1, %xmm1
59 shufps $0b00010000, %xmm0, %xmm4
60 pxor %xmm4, %xmm0
61 shufps $0b10001100, %xmm0, %xmm4
62 pxor %xmm4, %xmm0
63 pxor %xmm1, %xmm0
64
65 movaps %xmm2, %xmm5
66 movaps %xmm2, %xmm6
67 pslldq $4, %xmm5
68 pshufd $0b11111111, %xmm0, %xmm3
69 pxor %xmm3, %xmm2
70 pxor %xmm5, %xmm2
71
72 movaps %xmm0, %xmm1
73 shufps $0b01000100, %xmm0, %xmm6
74 movaps %xmm6, (%rcx)
75 shufps $0b01001110, %xmm2, %xmm1
76 movaps %xmm1, 16(%rcx)
77 add $0x20, %rcx
78 ret
79
80_key_expansion_192b:
81 pshufd $0b01010101, %xmm1, %xmm1
82 shufps $0b00010000, %xmm0, %xmm4
83 pxor %xmm4, %xmm0
84 shufps $0b10001100, %xmm0, %xmm4
85 pxor %xmm4, %xmm0
86 pxor %xmm1, %xmm0
87
88 movaps %xmm2, %xmm5
89 pslldq $4, %xmm5
90 pshufd $0b11111111, %xmm0, %xmm3
91 pxor %xmm3, %xmm2
92 pxor %xmm5, %xmm2
93
94 movaps %xmm0, (%rcx)
95 add $0x10, %rcx
96 ret
97
98_key_expansion_256b:
99 pshufd $0b10101010, %xmm1, %xmm1
100 shufps $0b00010000, %xmm2, %xmm4
101 pxor %xmm4, %xmm2
102 shufps $0b10001100, %xmm2, %xmm4
103 pxor %xmm4, %xmm2
104 pxor %xmm1, %xmm2
105 movaps %xmm2, (%rcx)
106 add $0x10, %rcx
107 ret
108
109/*
110 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
111 * unsigned int key_len)
112 */
113ENTRY(aesni_set_key)
114 movups (%rsi), %xmm0 # user key (first 16 bytes)
115 movaps %xmm0, (%rdi)
116 lea 0x10(%rdi), %rcx # key addr
117 movl %edx, 480(%rdi)
118 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
119 cmp $24, %dl
120 jb .Lenc_key128
121 je .Lenc_key192
122 movups 0x10(%rsi), %xmm2 # other user key
123 movaps %xmm2, (%rcx)
124 add $0x10, %rcx
125 # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
126 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
127 call _key_expansion_256a
128 # aeskeygenassist $0x1, %xmm0, %xmm1
129 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
130 call _key_expansion_256b
131 # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
132 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
133 call _key_expansion_256a
134 # aeskeygenassist $0x2, %xmm0, %xmm1
135 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
136 call _key_expansion_256b
137 # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
138 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
139 call _key_expansion_256a
140 # aeskeygenassist $0x4, %xmm0, %xmm1
141 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
142 call _key_expansion_256b
143 # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
144 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
145 call _key_expansion_256a
146 # aeskeygenassist $0x8, %xmm0, %xmm1
147 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
148 call _key_expansion_256b
149 # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
150 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
151 call _key_expansion_256a
152 # aeskeygenassist $0x10, %xmm0, %xmm1
153 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
154 call _key_expansion_256b
155 # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
156 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
157 call _key_expansion_256a
158 # aeskeygenassist $0x20, %xmm0, %xmm1
159 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
160 call _key_expansion_256b
161 # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
162 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
163 call _key_expansion_256a
164 jmp .Ldec_key
165.Lenc_key192:
166 movq 0x10(%rsi), %xmm2 # other user key
167 # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
168 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
169 call _key_expansion_192a
170 # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
171 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
172 call _key_expansion_192b
173 # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
174 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
175 call _key_expansion_192a
176 # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
177 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
178 call _key_expansion_192b
179 # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
180 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
181 call _key_expansion_192a
182 # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
183 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
184 call _key_expansion_192b
185 # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
186 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
187 call _key_expansion_192a
188 # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
189 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80
190 call _key_expansion_192b
191 jmp .Ldec_key
192.Lenc_key128:
193 # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
194 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
195 call _key_expansion_128
196 # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
197 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
198 call _key_expansion_128
199 # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
200 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
201 call _key_expansion_128
202 # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
203 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
204 call _key_expansion_128
205 # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
206 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
207 call _key_expansion_128
208 # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
209 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
210 call _key_expansion_128
211 # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
212 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40
213 call _key_expansion_128
214 # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
215 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80
216 call _key_expansion_128
217 # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
218 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b
219 call _key_expansion_128
220 # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
221 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36
222 call _key_expansion_128
223.Ldec_key:
224 sub $0x10, %rcx
225 movaps (%rdi), %xmm0
226 movaps (%rcx), %xmm1
227 movaps %xmm0, 240(%rcx)
228 movaps %xmm1, 240(%rdi)
229 add $0x10, %rdi
230 lea 240-16(%rcx), %rsi
231.align 4
232.Ldec_key_loop:
233 movaps (%rdi), %xmm0
234 # aesimc %xmm0, %xmm1
235 .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8
236 movaps %xmm1, (%rsi)
237 add $0x10, %rdi
238 sub $0x10, %rsi
239 cmp %rcx, %rdi
240 jb .Ldec_key_loop
241 xor %rax, %rax
242 ret
243
244/*
245 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
246 */
247ENTRY(aesni_enc)
248 movl 480(KEYP), KLEN # key length
249 movups (INP), STATE # input
250 call _aesni_enc1
251 movups STATE, (OUTP) # output
252 ret
253
254/*
255 * _aesni_enc1: internal ABI
256 * input:
257 * KEYP: key struct pointer
258 * KLEN: round count
259 * STATE: initial state (input)
260 * output:
261 * STATE: finial state (output)
262 * changed:
263 * KEY
264 * TKEYP (T1)
265 */
266_aesni_enc1:
267 movaps (KEYP), KEY # key
268 mov KEYP, TKEYP
269 pxor KEY, STATE # round 0
270 add $0x30, TKEYP
271 cmp $24, KLEN
272 jb .Lenc128
273 lea 0x20(TKEYP), TKEYP
274 je .Lenc192
275 add $0x20, TKEYP
276 movaps -0x60(TKEYP), KEY
277 # aesenc KEY, STATE
278 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
279 movaps -0x50(TKEYP), KEY
280 # aesenc KEY, STATE
281 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
282.align 4
283.Lenc192:
284 movaps -0x40(TKEYP), KEY
285 # aesenc KEY, STATE
286 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
287 movaps -0x30(TKEYP), KEY
288 # aesenc KEY, STATE
289 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
290.align 4
291.Lenc128:
292 movaps -0x20(TKEYP), KEY
293 # aesenc KEY, STATE
294 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
295 movaps -0x10(TKEYP), KEY
296 # aesenc KEY, STATE
297 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
298 movaps (TKEYP), KEY
299 # aesenc KEY, STATE
300 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
301 movaps 0x10(TKEYP), KEY
302 # aesenc KEY, STATE
303 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
304 movaps 0x20(TKEYP), KEY
305 # aesenc KEY, STATE
306 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
307 movaps 0x30(TKEYP), KEY
308 # aesenc KEY, STATE
309 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
310 movaps 0x40(TKEYP), KEY
311 # aesenc KEY, STATE
312 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
313 movaps 0x50(TKEYP), KEY
314 # aesenc KEY, STATE
315 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
316 movaps 0x60(TKEYP), KEY
317 # aesenc KEY, STATE
318 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
319 movaps 0x70(TKEYP), KEY
320 # aesenclast KEY, STATE # last round
321 .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
322 ret
323
324/*
325 * _aesni_enc4: internal ABI
326 * input:
327 * KEYP: key struct pointer
328 * KLEN: round count
329 * STATE1: initial state (input)
330 * STATE2
331 * STATE3
332 * STATE4
333 * output:
334 * STATE1: finial state (output)
335 * STATE2
336 * STATE3
337 * STATE4
338 * changed:
339 * KEY
340 * TKEYP (T1)
341 */
342_aesni_enc4:
343 movaps (KEYP), KEY # key
344 mov KEYP, TKEYP
345 pxor KEY, STATE1 # round 0
346 pxor KEY, STATE2
347 pxor KEY, STATE3
348 pxor KEY, STATE4
349 add $0x30, TKEYP
350 cmp $24, KLEN
351 jb .L4enc128
352 lea 0x20(TKEYP), TKEYP
353 je .L4enc192
354 add $0x20, TKEYP
355 movaps -0x60(TKEYP), KEY
356 # aesenc KEY, STATE1
357 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
358 # aesenc KEY, STATE2
359 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
360 # aesenc KEY, STATE3
361 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
362 # aesenc KEY, STATE4
363 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
364 movaps -0x50(TKEYP), KEY
365 # aesenc KEY, STATE1
366 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
367 # aesenc KEY, STATE2
368 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
369 # aesenc KEY, STATE3
370 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
371 # aesenc KEY, STATE4
372 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
373#.align 4
374.L4enc192:
375 movaps -0x40(TKEYP), KEY
376 # aesenc KEY, STATE1
377 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
378 # aesenc KEY, STATE2
379 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
380 # aesenc KEY, STATE3
381 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
382 # aesenc KEY, STATE4
383 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
384 movaps -0x30(TKEYP), KEY
385 # aesenc KEY, STATE1
386 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
387 # aesenc KEY, STATE2
388 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
389 # aesenc KEY, STATE3
390 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
391 # aesenc KEY, STATE4
392 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
393#.align 4
394.L4enc128:
395 movaps -0x20(TKEYP), KEY
396 # aesenc KEY, STATE1
397 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
398 # aesenc KEY, STATE2
399 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
400 # aesenc KEY, STATE3
401 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
402 # aesenc KEY, STATE4
403 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
404 movaps -0x10(TKEYP), KEY
405 # aesenc KEY, STATE1
406 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
407 # aesenc KEY, STATE2
408 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
409 # aesenc KEY, STATE3
410 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
411 # aesenc KEY, STATE4
412 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
413 movaps (TKEYP), KEY
414 # aesenc KEY, STATE1
415 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
416 # aesenc KEY, STATE2
417 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
418 # aesenc KEY, STATE3
419 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
420 # aesenc KEY, STATE4
421 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
422 movaps 0x10(TKEYP), KEY
423 # aesenc KEY, STATE1
424 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
425 # aesenc KEY, STATE2
426 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
427 # aesenc KEY, STATE3
428 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
429 # aesenc KEY, STATE4
430 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
431 movaps 0x20(TKEYP), KEY
432 # aesenc KEY, STATE1
433 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
434 # aesenc KEY, STATE2
435 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
436 # aesenc KEY, STATE3
437 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
438 # aesenc KEY, STATE4
439 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
440 movaps 0x30(TKEYP), KEY
441 # aesenc KEY, STATE1
442 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
443 # aesenc KEY, STATE2
444 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
445 # aesenc KEY, STATE3
446 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
447 # aesenc KEY, STATE4
448 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
449 movaps 0x40(TKEYP), KEY
450 # aesenc KEY, STATE1
451 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
452 # aesenc KEY, STATE2
453 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
454 # aesenc KEY, STATE3
455 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
456 # aesenc KEY, STATE4
457 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
458 movaps 0x50(TKEYP), KEY
459 # aesenc KEY, STATE1
460 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
461 # aesenc KEY, STATE2
462 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
463 # aesenc KEY, STATE3
464 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
465 # aesenc KEY, STATE4
466 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
467 movaps 0x60(TKEYP), KEY
468 # aesenc KEY, STATE1
469 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
470 # aesenc KEY, STATE2
471 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2
472 # aesenc KEY, STATE3
473 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
474 # aesenc KEY, STATE4
475 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
476 movaps 0x70(TKEYP), KEY
477 # aesenclast KEY, STATE1 # last round
478 .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
479 # aesenclast KEY, STATE2
480 .byte 0x66, 0x0f, 0x38, 0xdd, 0xe2
481 # aesenclast KEY, STATE3
482 .byte 0x66, 0x0f, 0x38, 0xdd, 0xea
483 # aesenclast KEY, STATE4
484 .byte 0x66, 0x0f, 0x38, 0xdd, 0xf2
485 ret
486
487/*
488 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
489 */
490ENTRY(aesni_dec)
491 mov 480(KEYP), KLEN # key length
492 add $240, KEYP
493 movups (INP), STATE # input
494 call _aesni_dec1
495 movups STATE, (OUTP) #output
496 ret
497
498/*
499 * _aesni_dec1: internal ABI
500 * input:
501 * KEYP: key struct pointer
502 * KLEN: key length
503 * STATE: initial state (input)
504 * output:
505 * STATE: finial state (output)
506 * changed:
507 * KEY
508 * TKEYP (T1)
509 */
510_aesni_dec1:
511 movaps (KEYP), KEY # key
512 mov KEYP, TKEYP
513 pxor KEY, STATE # round 0
514 add $0x30, TKEYP
515 cmp $24, KLEN
516 jb .Ldec128
517 lea 0x20(TKEYP), TKEYP
518 je .Ldec192
519 add $0x20, TKEYP
520 movaps -0x60(TKEYP), KEY
521 # aesdec KEY, STATE
522 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
523 movaps -0x50(TKEYP), KEY
524 # aesdec KEY, STATE
525 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
526.align 4
527.Ldec192:
528 movaps -0x40(TKEYP), KEY
529 # aesdec KEY, STATE
530 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
531 movaps -0x30(TKEYP), KEY
532 # aesdec KEY, STATE
533 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
534.align 4
535.Ldec128:
536 movaps -0x20(TKEYP), KEY
537 # aesdec KEY, STATE
538 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
539 movaps -0x10(TKEYP), KEY
540 # aesdec KEY, STATE
541 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
542 movaps (TKEYP), KEY
543 # aesdec KEY, STATE
544 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
545 movaps 0x10(TKEYP), KEY
546 # aesdec KEY, STATE
547 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
548 movaps 0x20(TKEYP), KEY
549 # aesdec KEY, STATE
550 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
551 movaps 0x30(TKEYP), KEY
552 # aesdec KEY, STATE
553 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
554 movaps 0x40(TKEYP), KEY
555 # aesdec KEY, STATE
556 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
557 movaps 0x50(TKEYP), KEY
558 # aesdec KEY, STATE
559 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
560 movaps 0x60(TKEYP), KEY
561 # aesdec KEY, STATE
562 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
563 movaps 0x70(TKEYP), KEY
564 # aesdeclast KEY, STATE # last round
565 .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
566 ret
567
568/*
569 * _aesni_dec4: internal ABI
570 * input:
571 * KEYP: key struct pointer
572 * KLEN: key length
573 * STATE1: initial state (input)
574 * STATE2
575 * STATE3
576 * STATE4
577 * output:
578 * STATE1: finial state (output)
579 * STATE2
580 * STATE3
581 * STATE4
582 * changed:
583 * KEY
584 * TKEYP (T1)
585 */
586_aesni_dec4:
587 movaps (KEYP), KEY # key
588 mov KEYP, TKEYP
589 pxor KEY, STATE1 # round 0
590 pxor KEY, STATE2
591 pxor KEY, STATE3
592 pxor KEY, STATE4
593 add $0x30, TKEYP
594 cmp $24, KLEN
595 jb .L4dec128
596 lea 0x20(TKEYP), TKEYP
597 je .L4dec192
598 add $0x20, TKEYP
599 movaps -0x60(TKEYP), KEY
600 # aesdec KEY, STATE1
601 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
602 # aesdec KEY, STATE2
603 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
604 # aesdec KEY, STATE3
605 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
606 # aesdec KEY, STATE4
607 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
608 movaps -0x50(TKEYP), KEY
609 # aesdec KEY, STATE1
610 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
611 # aesdec KEY, STATE2
612 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
613 # aesdec KEY, STATE3
614 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
615 # aesdec KEY, STATE4
616 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
617.align 4
618.L4dec192:
619 movaps -0x40(TKEYP), KEY
620 # aesdec KEY, STATE1
621 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
622 # aesdec KEY, STATE2
623 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
624 # aesdec KEY, STATE3
625 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
626 # aesdec KEY, STATE4
627 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
628 movaps -0x30(TKEYP), KEY
629 # aesdec KEY, STATE1
630 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
631 # aesdec KEY, STATE2
632 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
633 # aesdec KEY, STATE3
634 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
635 # aesdec KEY, STATE4
636 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
637.align 4
638.L4dec128:
639 movaps -0x20(TKEYP), KEY
640 # aesdec KEY, STATE1
641 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
642 # aesdec KEY, STATE2
643 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
644 # aesdec KEY, STATE3
645 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
646 # aesdec KEY, STATE4
647 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
648 movaps -0x10(TKEYP), KEY
649 # aesdec KEY, STATE1
650 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
651 # aesdec KEY, STATE2
652 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
653 # aesdec KEY, STATE3
654 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
655 # aesdec KEY, STATE4
656 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
657 movaps (TKEYP), KEY
658 # aesdec KEY, STATE1
659 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
660 # aesdec KEY, STATE2
661 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
662 # aesdec KEY, STATE3
663 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
664 # aesdec KEY, STATE4
665 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
666 movaps 0x10(TKEYP), KEY
667 # aesdec KEY, STATE1
668 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
669 # aesdec KEY, STATE2
670 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
671 # aesdec KEY, STATE3
672 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
673 # aesdec KEY, STATE4
674 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
675 movaps 0x20(TKEYP), KEY
676 # aesdec KEY, STATE1
677 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
678 # aesdec KEY, STATE2
679 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
680 # aesdec KEY, STATE3
681 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
682 # aesdec KEY, STATE4
683 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
684 movaps 0x30(TKEYP), KEY
685 # aesdec KEY, STATE1
686 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
687 # aesdec KEY, STATE2
688 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
689 # aesdec KEY, STATE3
690 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
691 # aesdec KEY, STATE4
692 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
693 movaps 0x40(TKEYP), KEY
694 # aesdec KEY, STATE1
695 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
696 # aesdec KEY, STATE2
697 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
698 # aesdec KEY, STATE3
699 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
700 # aesdec KEY, STATE4
701 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
702 movaps 0x50(TKEYP), KEY
703 # aesdec KEY, STATE1
704 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
705 # aesdec KEY, STATE2
706 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
707 # aesdec KEY, STATE3
708 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
709 # aesdec KEY, STATE4
710 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
711 movaps 0x60(TKEYP), KEY
712 # aesdec KEY, STATE1
713 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
714 # aesdec KEY, STATE2
715 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2
716 # aesdec KEY, STATE3
717 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
718 # aesdec KEY, STATE4
719 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
720 movaps 0x70(TKEYP), KEY
721 # aesdeclast KEY, STATE1 # last round
722 .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
723 # aesdeclast KEY, STATE2
724 .byte 0x66, 0x0f, 0x38, 0xdf, 0xe2
725 # aesdeclast KEY, STATE3
726 .byte 0x66, 0x0f, 0x38, 0xdf, 0xea
727 # aesdeclast KEY, STATE4
728 .byte 0x66, 0x0f, 0x38, 0xdf, 0xf2
729 ret
730
731/*
732 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
733 * size_t len)
734 */
735ENTRY(aesni_ecb_enc)
736 test LEN, LEN # check length
737 jz .Lecb_enc_ret
738 mov 480(KEYP), KLEN
739 cmp $16, LEN
740 jb .Lecb_enc_ret
741 cmp $64, LEN
742 jb .Lecb_enc_loop1
743.align 4
744.Lecb_enc_loop4:
745 movups (INP), STATE1
746 movups 0x10(INP), STATE2
747 movups 0x20(INP), STATE3
748 movups 0x30(INP), STATE4
749 call _aesni_enc4
750 movups STATE1, (OUTP)
751 movups STATE2, 0x10(OUTP)
752 movups STATE3, 0x20(OUTP)
753 movups STATE4, 0x30(OUTP)
754 sub $64, LEN
755 add $64, INP
756 add $64, OUTP
757 cmp $64, LEN
758 jge .Lecb_enc_loop4
759 cmp $16, LEN
760 jb .Lecb_enc_ret
761.align 4
762.Lecb_enc_loop1:
763 movups (INP), STATE1
764 call _aesni_enc1
765 movups STATE1, (OUTP)
766 sub $16, LEN
767 add $16, INP
768 add $16, OUTP
769 cmp $16, LEN
770 jge .Lecb_enc_loop1
771.Lecb_enc_ret:
772 ret
773
774/*
775 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
776 * size_t len);
777 */
778ENTRY(aesni_ecb_dec)
779 test LEN, LEN
780 jz .Lecb_dec_ret
781 mov 480(KEYP), KLEN
782 add $240, KEYP
783 cmp $16, LEN
784 jb .Lecb_dec_ret
785 cmp $64, LEN
786 jb .Lecb_dec_loop1
787.align 4
788.Lecb_dec_loop4:
789 movups (INP), STATE1
790 movups 0x10(INP), STATE2
791 movups 0x20(INP), STATE3
792 movups 0x30(INP), STATE4
793 call _aesni_dec4
794 movups STATE1, (OUTP)
795 movups STATE2, 0x10(OUTP)
796 movups STATE3, 0x20(OUTP)
797 movups STATE4, 0x30(OUTP)
798 sub $64, LEN
799 add $64, INP
800 add $64, OUTP
801 cmp $64, LEN
802 jge .Lecb_dec_loop4
803 cmp $16, LEN
804 jb .Lecb_dec_ret
805.align 4
806.Lecb_dec_loop1:
807 movups (INP), STATE1
808 call _aesni_dec1
809 movups STATE1, (OUTP)
810 sub $16, LEN
811 add $16, INP
812 add $16, OUTP
813 cmp $16, LEN
814 jge .Lecb_dec_loop1
815.Lecb_dec_ret:
816 ret
817
818/*
819 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
820 * size_t len, u8 *iv)
821 */
822ENTRY(aesni_cbc_enc)
823 cmp $16, LEN
824 jb .Lcbc_enc_ret
825 mov 480(KEYP), KLEN
826 movups (IVP), STATE # load iv as initial state
827.align 4
828.Lcbc_enc_loop:
829 movups (INP), IN # load input
830 pxor IN, STATE
831 call _aesni_enc1
832 movups STATE, (OUTP) # store output
833 sub $16, LEN
834 add $16, INP
835 add $16, OUTP
836 cmp $16, LEN
837 jge .Lcbc_enc_loop
838 movups STATE, (IVP)
839.Lcbc_enc_ret:
840 ret
841
842/*
843 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
844 * size_t len, u8 *iv)
845 */
846ENTRY(aesni_cbc_dec)
847 cmp $16, LEN
848 jb .Lcbc_dec_ret
849 mov 480(KEYP), KLEN
850 add $240, KEYP
851 movups (IVP), IV
852 cmp $64, LEN
853 jb .Lcbc_dec_loop1
854.align 4
855.Lcbc_dec_loop4:
856 movups (INP), IN1
857 movaps IN1, STATE1
858 movups 0x10(INP), IN2
859 movaps IN2, STATE2
860 movups 0x20(INP), IN3
861 movaps IN3, STATE3
862 movups 0x30(INP), IN4
863 movaps IN4, STATE4
864 call _aesni_dec4
865 pxor IV, STATE1
866 pxor IN1, STATE2
867 pxor IN2, STATE3
868 pxor IN3, STATE4
869 movaps IN4, IV
870 movups STATE1, (OUTP)
871 movups STATE2, 0x10(OUTP)
872 movups STATE3, 0x20(OUTP)
873 movups STATE4, 0x30(OUTP)
874 sub $64, LEN
875 add $64, INP
876 add $64, OUTP
877 cmp $64, LEN
878 jge .Lcbc_dec_loop4
879 cmp $16, LEN
880 jb .Lcbc_dec_ret
881.align 4
882.Lcbc_dec_loop1:
883 movups (INP), IN
884 movaps IN, STATE
885 call _aesni_dec1
886 pxor IV, STATE
887 movups STATE, (OUTP)
888 movaps IN, IV
889 sub $16, LEN
890 add $16, INP
891 add $16, OUTP
892 cmp $16, LEN
893 jge .Lcbc_dec_loop1
894 movups IV, (IVP)
895.Lcbc_dec_ret:
896 ret
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
new file mode 100644
index 000000000000..02af0af65497
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -0,0 +1,461 @@
1/*
2 * Support for Intel AES-NI instructions. This file contains glue
3 * code, the real AES implementation is in intel-aes_asm.S.
4 *
5 * Copyright (C) 2008, Intel Corp.
6 * Author: Huang Ying <ying.huang@intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 */
13
14#include <linux/hardirq.h>
15#include <linux/types.h>
16#include <linux/crypto.h>
17#include <linux/err.h>
18#include <crypto/algapi.h>
19#include <crypto/aes.h>
20#include <crypto/cryptd.h>
21#include <asm/i387.h>
22#include <asm/aes.h>
23
24struct async_aes_ctx {
25 struct cryptd_ablkcipher *cryptd_tfm;
26};
27
28#define AESNI_ALIGN 16
29#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1))
30
31asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
32 unsigned int key_len);
33asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out,
34 const u8 *in);
35asmlinkage void aesni_dec(struct crypto_aes_ctx *ctx, u8 *out,
36 const u8 *in);
37asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
38 const u8 *in, unsigned int len);
39asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
40 const u8 *in, unsigned int len);
41asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
42 const u8 *in, unsigned int len, u8 *iv);
43asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
44 const u8 *in, unsigned int len, u8 *iv);
45
46static inline int kernel_fpu_using(void)
47{
48 if (in_interrupt() && !(read_cr0() & X86_CR0_TS))
49 return 1;
50 return 0;
51}
52
53static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
54{
55 unsigned long addr = (unsigned long)raw_ctx;
56 unsigned long align = AESNI_ALIGN;
57
58 if (align <= crypto_tfm_ctx_alignment())
59 align = 1;
60 return (struct crypto_aes_ctx *)ALIGN(addr, align);
61}
62
63static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
64 const u8 *in_key, unsigned int key_len)
65{
66 struct crypto_aes_ctx *ctx = aes_ctx(raw_ctx);
67 u32 *flags = &tfm->crt_flags;
68 int err;
69
70 if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
71 key_len != AES_KEYSIZE_256) {
72 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
73 return -EINVAL;
74 }
75
76 if (kernel_fpu_using())
77 err = crypto_aes_expand_key(ctx, in_key, key_len);
78 else {
79 kernel_fpu_begin();
80 err = aesni_set_key(ctx, in_key, key_len);
81 kernel_fpu_end();
82 }
83
84 return err;
85}
86
87static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
88 unsigned int key_len)
89{
90 return aes_set_key_common(tfm, crypto_tfm_ctx(tfm), in_key, key_len);
91}
92
93static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
94{
95 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
96
97 if (kernel_fpu_using())
98 crypto_aes_encrypt_x86(ctx, dst, src);
99 else {
100 kernel_fpu_begin();
101 aesni_enc(ctx, dst, src);
102 kernel_fpu_end();
103 }
104}
105
106static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
107{
108 struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
109
110 if (kernel_fpu_using())
111 crypto_aes_decrypt_x86(ctx, dst, src);
112 else {
113 kernel_fpu_begin();
114 aesni_dec(ctx, dst, src);
115 kernel_fpu_end();
116 }
117}
118
119static struct crypto_alg aesni_alg = {
120 .cra_name = "aes",
121 .cra_driver_name = "aes-aesni",
122 .cra_priority = 300,
123 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
124 .cra_blocksize = AES_BLOCK_SIZE,
125 .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
126 .cra_alignmask = 0,
127 .cra_module = THIS_MODULE,
128 .cra_list = LIST_HEAD_INIT(aesni_alg.cra_list),
129 .cra_u = {
130 .cipher = {
131 .cia_min_keysize = AES_MIN_KEY_SIZE,
132 .cia_max_keysize = AES_MAX_KEY_SIZE,
133 .cia_setkey = aes_set_key,
134 .cia_encrypt = aes_encrypt,
135 .cia_decrypt = aes_decrypt
136 }
137 }
138};
139
140static int ecb_encrypt(struct blkcipher_desc *desc,
141 struct scatterlist *dst, struct scatterlist *src,
142 unsigned int nbytes)
143{
144 struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
145 struct blkcipher_walk walk;
146 int err;
147
148 blkcipher_walk_init(&walk, dst, src, nbytes);
149 err = blkcipher_walk_virt(desc, &walk);
150
151 kernel_fpu_begin();
152 while ((nbytes = walk.nbytes)) {
153 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
154 nbytes & AES_BLOCK_MASK);
155 nbytes &= AES_BLOCK_SIZE - 1;
156 err = blkcipher_walk_done(desc, &walk, nbytes);
157 }
158 kernel_fpu_end();
159
160 return err;
161}
162
163static int ecb_decrypt(struct blkcipher_desc *desc,
164 struct scatterlist *dst, struct scatterlist *src,
165 unsigned int nbytes)
166{
167 struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
168 struct blkcipher_walk walk;
169 int err;
170
171 blkcipher_walk_init(&walk, dst, src, nbytes);
172 err = blkcipher_walk_virt(desc, &walk);
173
174 kernel_fpu_begin();
175 while ((nbytes = walk.nbytes)) {
176 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
177 nbytes & AES_BLOCK_MASK);
178 nbytes &= AES_BLOCK_SIZE - 1;
179 err = blkcipher_walk_done(desc, &walk, nbytes);
180 }
181 kernel_fpu_end();
182
183 return err;
184}
185
186static struct crypto_alg blk_ecb_alg = {
187 .cra_name = "__ecb-aes-aesni",
188 .cra_driver_name = "__driver-ecb-aes-aesni",
189 .cra_priority = 0,
190 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
191 .cra_blocksize = AES_BLOCK_SIZE,
192 .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
193 .cra_alignmask = 0,
194 .cra_type = &crypto_blkcipher_type,
195 .cra_module = THIS_MODULE,
196 .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list),
197 .cra_u = {
198 .blkcipher = {
199 .min_keysize = AES_MIN_KEY_SIZE,
200 .max_keysize = AES_MAX_KEY_SIZE,
201 .setkey = aes_set_key,
202 .encrypt = ecb_encrypt,
203 .decrypt = ecb_decrypt,
204 },
205 },
206};
207
208static int cbc_encrypt(struct blkcipher_desc *desc,
209 struct scatterlist *dst, struct scatterlist *src,
210 unsigned int nbytes)
211{
212 struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
213 struct blkcipher_walk walk;
214 int err;
215
216 blkcipher_walk_init(&walk, dst, src, nbytes);
217 err = blkcipher_walk_virt(desc, &walk);
218
219 kernel_fpu_begin();
220 while ((nbytes = walk.nbytes)) {
221 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
222 nbytes & AES_BLOCK_MASK, walk.iv);
223 nbytes &= AES_BLOCK_SIZE - 1;
224 err = blkcipher_walk_done(desc, &walk, nbytes);
225 }
226 kernel_fpu_end();
227
228 return err;
229}
230
231static int cbc_decrypt(struct blkcipher_desc *desc,
232 struct scatterlist *dst, struct scatterlist *src,
233 unsigned int nbytes)
234{
235 struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
236 struct blkcipher_walk walk;
237 int err;
238
239 blkcipher_walk_init(&walk, dst, src, nbytes);
240 err = blkcipher_walk_virt(desc, &walk);
241
242 kernel_fpu_begin();
243 while ((nbytes = walk.nbytes)) {
244 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
245 nbytes & AES_BLOCK_MASK, walk.iv);
246 nbytes &= AES_BLOCK_SIZE - 1;
247 err = blkcipher_walk_done(desc, &walk, nbytes);
248 }
249 kernel_fpu_end();
250
251 return err;
252}
253
254static struct crypto_alg blk_cbc_alg = {
255 .cra_name = "__cbc-aes-aesni",
256 .cra_driver_name = "__driver-cbc-aes-aesni",
257 .cra_priority = 0,
258 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
259 .cra_blocksize = AES_BLOCK_SIZE,
260 .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
261 .cra_alignmask = 0,
262 .cra_type = &crypto_blkcipher_type,
263 .cra_module = THIS_MODULE,
264 .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list),
265 .cra_u = {
266 .blkcipher = {
267 .min_keysize = AES_MIN_KEY_SIZE,
268 .max_keysize = AES_MAX_KEY_SIZE,
269 .setkey = aes_set_key,
270 .encrypt = cbc_encrypt,
271 .decrypt = cbc_decrypt,
272 },
273 },
274};
275
276static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
277 unsigned int key_len)
278{
279 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
280
281 return crypto_ablkcipher_setkey(&ctx->cryptd_tfm->base, key, key_len);
282}
283
284static int ablk_encrypt(struct ablkcipher_request *req)
285{
286 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
287 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
288
289 if (kernel_fpu_using()) {
290 struct ablkcipher_request *cryptd_req =
291 ablkcipher_request_ctx(req);
292 memcpy(cryptd_req, req, sizeof(*req));
293 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
294 return crypto_ablkcipher_encrypt(cryptd_req);
295 } else {
296 struct blkcipher_desc desc;
297 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
298 desc.info = req->info;
299 desc.flags = 0;
300 return crypto_blkcipher_crt(desc.tfm)->encrypt(
301 &desc, req->dst, req->src, req->nbytes);
302 }
303}
304
305static int ablk_decrypt(struct ablkcipher_request *req)
306{
307 struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
308 struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
309
310 if (kernel_fpu_using()) {
311 struct ablkcipher_request *cryptd_req =
312 ablkcipher_request_ctx(req);
313 memcpy(cryptd_req, req, sizeof(*req));
314 ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
315 return crypto_ablkcipher_decrypt(cryptd_req);
316 } else {
317 struct blkcipher_desc desc;
318 desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
319 desc.info = req->info;
320 desc.flags = 0;
321 return crypto_blkcipher_crt(desc.tfm)->decrypt(
322 &desc, req->dst, req->src, req->nbytes);
323 }
324}
325
326static void ablk_exit(struct crypto_tfm *tfm)
327{
328 struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
329
330 cryptd_free_ablkcipher(ctx->cryptd_tfm);
331}
332
333static void ablk_init_common(struct crypto_tfm *tfm,
334 struct cryptd_ablkcipher *cryptd_tfm)
335{
336 struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
337
338 ctx->cryptd_tfm = cryptd_tfm;
339 tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
340 crypto_ablkcipher_reqsize(&cryptd_tfm->base);
341}
342
343static int ablk_ecb_init(struct crypto_tfm *tfm)
344{
345 struct cryptd_ablkcipher *cryptd_tfm;
346
347 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-aes-aesni", 0, 0);
348 if (IS_ERR(cryptd_tfm))
349 return PTR_ERR(cryptd_tfm);
350 ablk_init_common(tfm, cryptd_tfm);
351 return 0;
352}
353
354static struct crypto_alg ablk_ecb_alg = {
355 .cra_name = "ecb(aes)",
356 .cra_driver_name = "ecb-aes-aesni",
357 .cra_priority = 400,
358 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
359 .cra_blocksize = AES_BLOCK_SIZE,
360 .cra_ctxsize = sizeof(struct async_aes_ctx),
361 .cra_alignmask = 0,
362 .cra_type = &crypto_ablkcipher_type,
363 .cra_module = THIS_MODULE,
364 .cra_list = LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
365 .cra_init = ablk_ecb_init,
366 .cra_exit = ablk_exit,
367 .cra_u = {
368 .ablkcipher = {
369 .min_keysize = AES_MIN_KEY_SIZE,
370 .max_keysize = AES_MAX_KEY_SIZE,
371 .setkey = ablk_set_key,
372 .encrypt = ablk_encrypt,
373 .decrypt = ablk_decrypt,
374 },
375 },
376};
377
378static int ablk_cbc_init(struct crypto_tfm *tfm)
379{
380 struct cryptd_ablkcipher *cryptd_tfm;
381
382 cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-aes-aesni", 0, 0);
383 if (IS_ERR(cryptd_tfm))
384 return PTR_ERR(cryptd_tfm);
385 ablk_init_common(tfm, cryptd_tfm);
386 return 0;
387}
388
389static struct crypto_alg ablk_cbc_alg = {
390 .cra_name = "cbc(aes)",
391 .cra_driver_name = "cbc-aes-aesni",
392 .cra_priority = 400,
393 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
394 .cra_blocksize = AES_BLOCK_SIZE,
395 .cra_ctxsize = sizeof(struct async_aes_ctx),
396 .cra_alignmask = 0,
397 .cra_type = &crypto_ablkcipher_type,
398 .cra_module = THIS_MODULE,
399 .cra_list = LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
400 .cra_init = ablk_cbc_init,
401 .cra_exit = ablk_exit,
402 .cra_u = {
403 .ablkcipher = {
404 .min_keysize = AES_MIN_KEY_SIZE,
405 .max_keysize = AES_MAX_KEY_SIZE,
406 .ivsize = AES_BLOCK_SIZE,
407 .setkey = ablk_set_key,
408 .encrypt = ablk_encrypt,
409 .decrypt = ablk_decrypt,
410 },
411 },
412};
413
414static int __init aesni_init(void)
415{
416 int err;
417
418 if (!cpu_has_aes) {
419 printk(KERN_ERR "Intel AES-NI instructions are not detected.\n");
420 return -ENODEV;
421 }
422 if ((err = crypto_register_alg(&aesni_alg)))
423 goto aes_err;
424 if ((err = crypto_register_alg(&blk_ecb_alg)))
425 goto blk_ecb_err;
426 if ((err = crypto_register_alg(&blk_cbc_alg)))
427 goto blk_cbc_err;
428 if ((err = crypto_register_alg(&ablk_ecb_alg)))
429 goto ablk_ecb_err;
430 if ((err = crypto_register_alg(&ablk_cbc_alg)))
431 goto ablk_cbc_err;
432
433 return err;
434
435ablk_cbc_err:
436 crypto_unregister_alg(&ablk_ecb_alg);
437ablk_ecb_err:
438 crypto_unregister_alg(&blk_cbc_alg);
439blk_cbc_err:
440 crypto_unregister_alg(&blk_ecb_alg);
441blk_ecb_err:
442 crypto_unregister_alg(&aesni_alg);
443aes_err:
444 return err;
445}
446
447static void __exit aesni_exit(void)
448{
449 crypto_unregister_alg(&ablk_cbc_alg);
450 crypto_unregister_alg(&ablk_ecb_alg);
451 crypto_unregister_alg(&blk_cbc_alg);
452 crypto_unregister_alg(&blk_ecb_alg);
453 crypto_unregister_alg(&aesni_alg);
454}
455
456module_init(aesni_init);
457module_exit(aesni_exit);
458
459MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized");
460MODULE_LICENSE("GPL");
461MODULE_ALIAS("aes");
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e4baa06bbceb..19c61ef6ab57 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -557,7 +557,7 @@ ia32_sys_call_table:
557 .quad sys32_olduname 557 .quad sys32_olduname
558 .quad sys_umask /* 60 */ 558 .quad sys_umask /* 60 */
559 .quad sys_chroot 559 .quad sys_chroot
560 .quad sys32_ustat 560 .quad compat_sys_ustat
561 .quad sys_dup2 561 .quad sys_dup2
562 .quad sys_getppid 562 .quad sys_getppid
563 .quad sys_getpgrp /* 65 */ 563 .quad sys_getpgrp /* 65 */
@@ -828,5 +828,7 @@ ia32_sys_call_table:
828 .quad sys_dup3 /* 330 */ 828 .quad sys_dup3 /* 330 */
829 .quad sys_pipe2 829 .quad sys_pipe2
830 .quad sys_inotify_init1 830 .quad sys_inotify_init1
831 .quad compat_sys_preadv
832 .quad compat_sys_pwritev
831 .quad sys_perf_counter_open 833 .quad sys_perf_counter_open
832ia32_syscall_end: 834ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 6c0d7f6231af..efac92fd1efb 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -638,28 +638,6 @@ long sys32_uname(struct old_utsname __user *name)
638 return err ? -EFAULT : 0; 638 return err ? -EFAULT : 0;
639} 639}
640 640
641long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
642{
643 struct ustat u;
644 mm_segment_t seg;
645 int ret;
646
647 seg = get_fs();
648 set_fs(KERNEL_DS);
649 ret = sys_ustat(dev, (struct ustat __user *)&u);
650 set_fs(seg);
651 if (ret < 0)
652 return ret;
653
654 if (!access_ok(VERIFY_WRITE, u32p, sizeof(struct ustat32)) ||
655 __put_user((__u32) u.f_tfree, &u32p->f_tfree) ||
656 __put_user((__u32) u.f_tinode, &u32p->f_tfree) ||
657 __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) ||
658 __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack)))
659 ret = -EFAULT;
660 return ret;
661}
662
663asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, 641asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
664 compat_uptr_t __user *envp, struct pt_regs *regs) 642 compat_uptr_t __user *envp, struct pt_regs *regs)
665{ 643{
diff --git a/arch/x86/include/asm/aes.h b/arch/x86/include/asm/aes.h
new file mode 100644
index 000000000000..80545a1cbe39
--- /dev/null
+++ b/arch/x86/include/asm/aes.h
@@ -0,0 +1,11 @@
1#ifndef ASM_X86_AES_H
2#define ASM_X86_AES_H
3
4#include <linux/crypto.h>
5#include <crypto/aes.h>
6
7void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst,
8 const u8 *src);
9void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst,
10 const u8 *src);
11#endif
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index a6208dc74633..df8a300dfe6c 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -75,7 +75,14 @@ static inline void default_inquire_remote_apic(int apicid)
75#define setup_secondary_clock setup_secondary_APIC_clock 75#define setup_secondary_clock setup_secondary_APIC_clock
76#endif 76#endif
77 77
78#ifdef CONFIG_X86_64
78extern int is_vsmp_box(void); 79extern int is_vsmp_box(void);
80#else
81static inline int is_vsmp_box(void)
82{
83 return 0;
84}
85#endif
79extern void xapic_wait_icr_idle(void); 86extern void xapic_wait_icr_idle(void);
80extern u32 safe_xapic_wait_icr_idle(void); 87extern u32 safe_xapic_wait_icr_idle(void);
81extern void xapic_icr_write(u32, u32); 88extern void xapic_icr_write(u32, u32);
@@ -101,6 +108,16 @@ extern void native_apic_icr_write(u32 low, u32 id);
101extern u64 native_apic_icr_read(void); 108extern u64 native_apic_icr_read(void);
102 109
103#ifdef CONFIG_X86_X2APIC 110#ifdef CONFIG_X86_X2APIC
111/*
112 * Make previous memory operations globally visible before
113 * sending the IPI through x2apic wrmsr. We need a serializing instruction or
114 * mfence for this.
115 */
116static inline void x2apic_wrmsr_fence(void)
117{
118 asm volatile("mfence" : : : "memory");
119}
120
104static inline void native_apic_msr_write(u32 reg, u32 v) 121static inline void native_apic_msr_write(u32 reg, u32 v)
105{ 122{
106 if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || 123 if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
@@ -177,6 +194,9 @@ static inline int x2apic_enabled(void)
177{ 194{
178 return 0; 195 return 0;
179} 196}
197
198#define x2apic 0
199
180#endif 200#endif
181 201
182extern int get_physical_broadcast(void); 202extern int get_physical_broadcast(void);
@@ -306,7 +326,7 @@ struct apic {
306 void (*send_IPI_self)(int vector); 326 void (*send_IPI_self)(int vector);
307 327
308 /* wakeup_secondary_cpu */ 328 /* wakeup_secondary_cpu */
309 int (*wakeup_cpu)(int apicid, unsigned long start_eip); 329 int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
310 330
311 int trampoline_phys_low; 331 int trampoline_phys_low;
312 int trampoline_phys_high; 332 int trampoline_phys_high;
@@ -324,8 +344,21 @@ struct apic {
324 u32 (*safe_wait_icr_idle)(void); 344 u32 (*safe_wait_icr_idle)(void);
325}; 345};
326 346
347/*
348 * Pointer to the local APIC driver in use on this system (there's
349 * always just one such driver in use - the kernel decides via an
350 * early probing process which one it picks - and then sticks to it):
351 */
327extern struct apic *apic; 352extern struct apic *apic;
328 353
354/*
355 * APIC functionality to boot other CPUs - only used on SMP:
356 */
357#ifdef CONFIG_SMP
358extern atomic_t init_deasserted;
359extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
360#endif
361
329static inline u32 apic_read(u32 reg) 362static inline u32 apic_read(u32 reg)
330{ 363{
331 return apic->read(reg); 364 return apic->read(reg);
@@ -359,6 +392,7 @@ static inline u32 safe_apic_wait_icr_idle(void)
359 392
360static inline void ack_APIC_irq(void) 393static inline void ack_APIC_irq(void)
361{ 394{
395#ifdef CONFIG_X86_LOCAL_APIC
362 /* 396 /*
363 * ack_APIC_irq() actually gets compiled as a single instruction 397 * ack_APIC_irq() actually gets compiled as a single instruction
364 * ... yummie. 398 * ... yummie.
@@ -366,6 +400,7 @@ static inline void ack_APIC_irq(void)
366 400
367 /* Docs say use 0 for future compatibility */ 401 /* Docs say use 0 for future compatibility */
368 apic_write(APIC_EOI, 0); 402 apic_write(APIC_EOI, 0);
403#endif
369} 404}
370 405
371static inline unsigned default_get_apic_id(unsigned long x) 406static inline unsigned default_get_apic_id(unsigned long x)
@@ -384,9 +419,7 @@ static inline unsigned default_get_apic_id(unsigned long x)
384#define DEFAULT_TRAMPOLINE_PHYS_LOW 0x467 419#define DEFAULT_TRAMPOLINE_PHYS_LOW 0x467
385#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469 420#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469
386 421
387#ifdef CONFIG_X86_32 422#ifdef CONFIG_X86_64
388extern void es7000_update_apic_to_cluster(void);
389#else
390extern struct apic apic_flat; 423extern struct apic apic_flat;
391extern struct apic apic_physflat; 424extern struct apic apic_physflat;
392extern struct apic apic_x2apic_cluster; 425extern struct apic apic_x2apic_cluster;
@@ -456,10 +489,19 @@ static inline int default_apic_id_registered(void)
456 return physid_isset(read_apic_id(), phys_cpu_present_map); 489 return physid_isset(read_apic_id(), phys_cpu_present_map);
457} 490}
458 491
492static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
493{
494 return cpuid_apic >> index_msb;
495}
496
497extern int default_apicid_to_node(int logical_apicid);
498
499#endif
500
459static inline unsigned int 501static inline unsigned int
460default_cpu_mask_to_apicid(const struct cpumask *cpumask) 502default_cpu_mask_to_apicid(const struct cpumask *cpumask)
461{ 503{
462 return cpumask_bits(cpumask)[0]; 504 return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
463} 505}
464 506
465static inline unsigned int 507static inline unsigned int
@@ -473,15 +515,6 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
473 return (unsigned int)(mask1 & mask2 & mask3); 515 return (unsigned int)(mask1 & mask2 & mask3);
474} 516}
475 517
476static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
477{
478 return cpuid_apic >> index_msb;
479}
480
481extern int default_apicid_to_node(int logical_apicid);
482
483#endif
484
485static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid) 518static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid)
486{ 519{
487 return physid_isset(apicid, bitmap); 520 return physid_isset(apicid, bitmap);
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b9..bc9514fb3b13 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
53#define APIC_ESR_SENDILL 0x00020 53#define APIC_ESR_SENDILL 0x00020
54#define APIC_ESR_RECVILL 0x00040 54#define APIC_ESR_RECVILL 0x00040
55#define APIC_ESR_ILLREGA 0x00080 55#define APIC_ESR_ILLREGA 0x00080
56#define APIC_LVTCMCI 0x2f0
56#define APIC_ICR 0x300 57#define APIC_ICR 0x300
57#define APIC_DEST_SELF 0x40000 58#define APIC_DEST_SELF 0x40000
58#define APIC_DEST_ALLINC 0x80000 59#define APIC_DEST_ALLINC 0x80000
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index dd61616cb73d..6ba23dd9fc92 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -1,26 +1,36 @@
1#ifndef _ASM_X86_BOOT_H 1#ifndef _ASM_X86_BOOT_H
2#define _ASM_X86_BOOT_H 2#define _ASM_X86_BOOT_H
3 3
4/* Don't touch these, unless you really know what you're doing. */
5#define DEF_SYSSEG 0x1000
6#define DEF_SYSSIZE 0x7F00
7
8/* Internal svga startup constants */ 4/* Internal svga startup constants */
9#define NORMAL_VGA 0xffff /* 80x25 mode */ 5#define NORMAL_VGA 0xffff /* 80x25 mode */
10#define EXTENDED_VGA 0xfffe /* 80x50 mode */ 6#define EXTENDED_VGA 0xfffe /* 80x50 mode */
11#define ASK_VGA 0xfffd /* ask for it at bootup */ 7#define ASK_VGA 0xfffd /* ask for it at bootup */
12 8
9#ifdef __KERNEL__
10
13/* Physical address where kernel should be loaded. */ 11/* Physical address where kernel should be loaded. */
14#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ 12#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
15 + (CONFIG_PHYSICAL_ALIGN - 1)) \ 13 + (CONFIG_PHYSICAL_ALIGN - 1)) \
16 & ~(CONFIG_PHYSICAL_ALIGN - 1)) 14 & ~(CONFIG_PHYSICAL_ALIGN - 1))
17 15
16#ifdef CONFIG_KERNEL_BZIP2
17#define BOOT_HEAP_SIZE 0x400000
18#else /* !CONFIG_KERNEL_BZIP2 */
19
18#ifdef CONFIG_X86_64 20#ifdef CONFIG_X86_64
19#define BOOT_HEAP_SIZE 0x7000 21#define BOOT_HEAP_SIZE 0x7000
20#define BOOT_STACK_SIZE 0x4000
21#else 22#else
22#define BOOT_HEAP_SIZE 0x4000 23#define BOOT_HEAP_SIZE 0x4000
24#endif
25
26#endif /* !CONFIG_KERNEL_BZIP2 */
27
28#ifdef CONFIG_X86_64
29#define BOOT_STACK_SIZE 0x4000
30#else
23#define BOOT_STACK_SIZE 0x1000 31#define BOOT_STACK_SIZE 0x1000
24#endif 32#endif
25 33
34#endif /* __KERNEL__ */
35
26#endif /* _ASM_X86_BOOT_H */ 36#endif /* _ASM_X86_BOOT_H */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 2f8466540fb5..e55dfc1ad453 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -5,24 +5,43 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6 6
7/* Caches aren't brain-dead on the intel. */ 7/* Caches aren't brain-dead on the intel. */
8#define flush_cache_all() do { } while (0) 8static inline void flush_cache_all(void) { }
9#define flush_cache_mm(mm) do { } while (0) 9static inline void flush_cache_mm(struct mm_struct *mm) { }
10#define flush_cache_dup_mm(mm) do { } while (0) 10static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
11#define flush_cache_range(vma, start, end) do { } while (0) 11static inline void flush_cache_range(struct vm_area_struct *vma,
12#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) 12 unsigned long start, unsigned long end) { }
13#define flush_dcache_page(page) do { } while (0) 13static inline void flush_cache_page(struct vm_area_struct *vma,
14#define flush_dcache_mmap_lock(mapping) do { } while (0) 14 unsigned long vmaddr, unsigned long pfn) { }
15#define flush_dcache_mmap_unlock(mapping) do { } while (0) 15static inline void flush_dcache_page(struct page *page) { }
16#define flush_icache_range(start, end) do { } while (0) 16static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
17#define flush_icache_page(vma, pg) do { } while (0) 17static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
18#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) 18static inline void flush_icache_range(unsigned long start,
19#define flush_cache_vmap(start, end) do { } while (0) 19 unsigned long end) { }
20#define flush_cache_vunmap(start, end) do { } while (0) 20static inline void flush_icache_page(struct vm_area_struct *vma,
21 21 struct page *page) { }
22#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ 22static inline void flush_icache_user_range(struct vm_area_struct *vma,
23 memcpy((dst), (src), (len)) 23 struct page *page,
24#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ 24 unsigned long addr,
25 memcpy((dst), (src), (len)) 25 unsigned long len) { }
26static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
27static inline void flush_cache_vunmap(unsigned long start,
28 unsigned long end) { }
29
30static inline void copy_to_user_page(struct vm_area_struct *vma,
31 struct page *page, unsigned long vaddr,
32 void *dst, const void *src,
33 unsigned long len)
34{
35 memcpy(dst, src, len);
36}
37
38static inline void copy_from_user_page(struct vm_area_struct *vma,
39 struct page *page, unsigned long vaddr,
40 void *dst, const void *src,
41 unsigned long len)
42{
43 memcpy(dst, src, len);
44}
26 45
27#define PG_non_WB PG_arch_1 46#define PG_non_WB PG_arch_1
28PAGEFLAG(NonWB, non_WB) 47PAGEFLAG(NonWB, non_WB)
@@ -71,6 +90,9 @@ int set_memory_4k(unsigned long addr, int numpages);
71int set_memory_array_uc(unsigned long *addr, int addrinarray); 90int set_memory_array_uc(unsigned long *addr, int addrinarray);
72int set_memory_array_wb(unsigned long *addr, int addrinarray); 91int set_memory_array_wb(unsigned long *addr, int addrinarray);
73 92
93int set_pages_array_uc(struct page **pages, int addrinarray);
94int set_pages_array_wb(struct page **pages, int addrinarray);
95
74/* 96/*
75 * For legacy compatibility with the old APIs, a few functions 97 * For legacy compatibility with the old APIs, a few functions
76 * are provided that work on a "struct page". 98 * are provided that work on a "struct page".
@@ -104,6 +126,11 @@ void clflush_cache_range(void *addr, unsigned int size);
104#ifdef CONFIG_DEBUG_RODATA 126#ifdef CONFIG_DEBUG_RODATA
105void mark_rodata_ro(void); 127void mark_rodata_ro(void);
106extern const int rodata_test_data; 128extern const int rodata_test_data;
129void set_kernel_text_rw(void);
130void set_kernel_text_ro(void);
131#else
132static inline void set_kernel_text_rw(void) { }
133static inline void set_kernel_text_ro(void) { }
107#endif 134#endif
108 135
109#ifdef CONFIG_DEBUG_RODATA_TEST 136#ifdef CONFIG_DEBUG_RODATA_TEST
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
new file mode 100755
index 000000000000..222802029fa6
--- /dev/null
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -0,0 +1,226 @@
1#ifndef _ASM_X86_CPU_DEBUG_H
2#define _ASM_X86_CPU_DEBUG_H
3
4/*
5 * CPU x86 architecture debug
6 *
7 * Copyright(C) 2009 Jaswinder Singh Rajput
8 */
9
10/* Register flags */
11enum cpu_debug_bit {
12/* Model Specific Registers (MSRs) */
13 CPU_MC_BIT, /* Machine Check */
14 CPU_MONITOR_BIT, /* Monitor */
15 CPU_TIME_BIT, /* Time */
16 CPU_PMC_BIT, /* Performance Monitor */
17 CPU_PLATFORM_BIT, /* Platform */
18 CPU_APIC_BIT, /* APIC */
19 CPU_POWERON_BIT, /* Power-on */
20 CPU_CONTROL_BIT, /* Control */
21 CPU_FEATURES_BIT, /* Features control */
22 CPU_LBRANCH_BIT, /* Last Branch */
23 CPU_BIOS_BIT, /* BIOS */
24 CPU_FREQ_BIT, /* Frequency */
25 CPU_MTTR_BIT, /* MTRR */
26 CPU_PERF_BIT, /* Performance */
27 CPU_CACHE_BIT, /* Cache */
28 CPU_SYSENTER_BIT, /* Sysenter */
29 CPU_THERM_BIT, /* Thermal */
30 CPU_MISC_BIT, /* Miscellaneous */
31 CPU_DEBUG_BIT, /* Debug */
32 CPU_PAT_BIT, /* PAT */
33 CPU_VMX_BIT, /* VMX */
34 CPU_CALL_BIT, /* System Call */
35 CPU_BASE_BIT, /* BASE Address */
36 CPU_VER_BIT, /* Version ID */
37 CPU_CONF_BIT, /* Configuration */
38 CPU_SMM_BIT, /* System mgmt mode */
39 CPU_SVM_BIT, /*Secure Virtual Machine*/
40 CPU_OSVM_BIT, /* OS-Visible Workaround*/
41/* Standard Registers */
42 CPU_TSS_BIT, /* Task Stack Segment */
43 CPU_CR_BIT, /* Control Registers */
44 CPU_DT_BIT, /* Descriptor Table */
45/* End of Registers flags */
46 CPU_REG_ALL_BIT, /* Select all Registers */
47};
48
49#define CPU_REG_ALL (~0) /* Select all Registers */
50
51#define CPU_MC (1 << CPU_MC_BIT)
52#define CPU_MONITOR (1 << CPU_MONITOR_BIT)
53#define CPU_TIME (1 << CPU_TIME_BIT)
54#define CPU_PMC (1 << CPU_PMC_BIT)
55#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT)
56#define CPU_APIC (1 << CPU_APIC_BIT)
57#define CPU_POWERON (1 << CPU_POWERON_BIT)
58#define CPU_CONTROL (1 << CPU_CONTROL_BIT)
59#define CPU_FEATURES (1 << CPU_FEATURES_BIT)
60#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT)
61#define CPU_BIOS (1 << CPU_BIOS_BIT)
62#define CPU_FREQ (1 << CPU_FREQ_BIT)
63#define CPU_MTRR (1 << CPU_MTTR_BIT)
64#define CPU_PERF (1 << CPU_PERF_BIT)
65#define CPU_CACHE (1 << CPU_CACHE_BIT)
66#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT)
67#define CPU_THERM (1 << CPU_THERM_BIT)
68#define CPU_MISC (1 << CPU_MISC_BIT)
69#define CPU_DEBUG (1 << CPU_DEBUG_BIT)
70#define CPU_PAT (1 << CPU_PAT_BIT)
71#define CPU_VMX (1 << CPU_VMX_BIT)
72#define CPU_CALL (1 << CPU_CALL_BIT)
73#define CPU_BASE (1 << CPU_BASE_BIT)
74#define CPU_VER (1 << CPU_VER_BIT)
75#define CPU_CONF (1 << CPU_CONF_BIT)
76#define CPU_SMM (1 << CPU_SMM_BIT)
77#define CPU_SVM (1 << CPU_SVM_BIT)
78#define CPU_OSVM (1 << CPU_OSVM_BIT)
79#define CPU_TSS (1 << CPU_TSS_BIT)
80#define CPU_CR (1 << CPU_CR_BIT)
81#define CPU_DT (1 << CPU_DT_BIT)
82
83/* Register file flags */
84enum cpu_file_bit {
85 CPU_INDEX_BIT, /* index */
86 CPU_VALUE_BIT, /* value */
87};
88
89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
90
91/*
92 * DisplayFamily_DisplayModel Processor Families/Processor Number Series
93 * -------------------------- ------------------------------------------
94 * 05_01, 05_02, 05_04 Pentium, Pentium with MMX
95 *
96 * 06_01 Pentium Pro
97 * 06_03, 06_05 Pentium II Xeon, Pentium II
98 * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III
99 *
100 * 06_09, 060D Pentium M
101 *
102 * 06_0E Core Duo, Core Solo
103 *
104 * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series,
105 * Core 2 Quad, Core 2 Extreme, Core 2 Duo,
106 * Pentium dual-core
107 * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650
108 *
109 * 06_1C Atom
110 *
111 * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4
112 * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D
113 *
114 * 0F_06 Xeon 7100, 5000 Series, Xeon MP,
115 * Pentium 4, Pentium D
116 */
117
118/* Register processors bits */
119enum cpu_processor_bit {
120 CPU_NONE,
121/* Intel */
122 CPU_INTEL_PENTIUM_BIT,
123 CPU_INTEL_P6_BIT,
124 CPU_INTEL_PENTIUM_M_BIT,
125 CPU_INTEL_CORE_BIT,
126 CPU_INTEL_CORE2_BIT,
127 CPU_INTEL_ATOM_BIT,
128 CPU_INTEL_XEON_P4_BIT,
129 CPU_INTEL_XEON_MP_BIT,
130/* AMD */
131 CPU_AMD_K6_BIT,
132 CPU_AMD_K7_BIT,
133 CPU_AMD_K8_BIT,
134 CPU_AMD_0F_BIT,
135 CPU_AMD_10_BIT,
136 CPU_AMD_11_BIT,
137};
138
139#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT)
140#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT)
141#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT)
142#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT)
143#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT)
144#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT)
145#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT)
146#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT)
147
148#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
149#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2)
150#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
151#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM)
152#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
153#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM)
154#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON)
155#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON)
156#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
157#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON)
158#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON)
159#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT)
160#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX)
161#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE)
162#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
163#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
164#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
165#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT)
166#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE)
167
168/* Select all supported Intel CPUs */
169#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
170
171#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT)
172#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT)
173#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT)
174#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT)
175#define CPU_AMD_10 (1 << CPU_AMD_10_BIT)
176#define CPU_AMD_11 (1 << CPU_AMD_11_BIT)
177
178#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11)
179#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS)
180#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS)
181#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS)
182
183/* Select all supported AMD CPUs */
184#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS)
185
186/* Select all supported CPUs */
187#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL)
188
189#define MAX_CPU_FILES 512
190
191struct cpu_private {
192 unsigned cpu;
193 unsigned type;
194 unsigned reg;
195 unsigned file;
196};
197
198struct cpu_debug_base {
199 char *name; /* Register name */
200 unsigned flag; /* Register flag */
201 unsigned write; /* Register write flag */
202};
203
204/*
205 * Currently it looks similar to cpu_debug_base but once we add more files
206 * cpu_file_base will go in different direction
207 */
208struct cpu_file_base {
209 char *name; /* Register file name */
210 unsigned flag; /* Register file flag */
211 unsigned write; /* Register write flag */
212};
213
214struct cpu_cpuX_base {
215 struct dentry *dentry; /* Register dentry */
216 int init; /* Register index file */
217};
218
219struct cpu_debug_range {
220 unsigned min; /* Register range min */
221 unsigned max; /* Register range max */
222 unsigned flag; /* Supported flags */
223 unsigned model; /* Supported models */
224};
225
226#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 7301e60dc4a8..0beba0d1468d 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -213,6 +213,7 @@ extern const char * const x86_power_flags[32];
213#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) 213#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM)
214#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) 214#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2)
215#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3) 215#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3)
216#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES)
216#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) 217#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
217#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) 218#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
218#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) 219#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
index a7f3c75f8ad7..61c852fa346b 100644
--- a/arch/x86/include/asm/cpumask.h
+++ b/arch/x86/include/asm/cpumask.h
@@ -3,8 +3,6 @@
3#ifndef __ASSEMBLY__ 3#ifndef __ASSEMBLY__
4#include <linux/cpumask.h> 4#include <linux/cpumask.h>
5 5
6#ifdef CONFIG_X86_64
7
8extern cpumask_var_t cpu_callin_mask; 6extern cpumask_var_t cpu_callin_mask;
9extern cpumask_var_t cpu_callout_mask; 7extern cpumask_var_t cpu_callout_mask;
10extern cpumask_var_t cpu_initialized_mask; 8extern cpumask_var_t cpu_initialized_mask;
@@ -12,21 +10,5 @@ extern cpumask_var_t cpu_sibling_setup_mask;
12 10
13extern void setup_cpu_local_masks(void); 11extern void setup_cpu_local_masks(void);
14 12
15#else /* CONFIG_X86_32 */
16
17extern cpumask_t cpu_callin_map;
18extern cpumask_t cpu_callout_map;
19extern cpumask_t cpu_initialized;
20extern cpumask_t cpu_sibling_setup_map;
21
22#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map)
23#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map)
24#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized)
25#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map)
26
27static inline void setup_cpu_local_masks(void) { }
28
29#endif /* CONFIG_X86_32 */
30
31#endif /* __ASSEMBLY__ */ 13#endif /* __ASSEMBLY__ */
32#endif /* _ASM_X86_CPUMASK_H */ 14#endif /* _ASM_X86_CPUMASK_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index dc27705f5443..5623c50d67b2 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -91,7 +91,6 @@ static inline int desc_empty(const void *ptr)
91#define store_gdt(dtr) native_store_gdt(dtr) 91#define store_gdt(dtr) native_store_gdt(dtr)
92#define store_idt(dtr) native_store_idt(dtr) 92#define store_idt(dtr) native_store_idt(dtr)
93#define store_tr(tr) (tr = native_store_tr()) 93#define store_tr(tr) (tr = native_store_tr())
94#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
95 94
96#define load_TLS(t, cpu) native_load_tls(t, cpu) 95#define load_TLS(t, cpu) native_load_tls(t, cpu)
97#define set_ldt native_set_ldt 96#define set_ldt native_set_ldt
@@ -112,6 +111,8 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
112} 111}
113#endif /* CONFIG_PARAVIRT */ 112#endif /* CONFIG_PARAVIRT */
114 113
114#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
115
115static inline void native_write_idt_entry(gate_desc *idt, int entry, 116static inline void native_write_idt_entry(gate_desc *idt, int entry,
116 const gate_desc *gate) 117 const gate_desc *gate)
117{ 118{
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 3c034f48fdb0..4994a20acbcb 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -6,7 +6,7 @@ struct dev_archdata {
6 void *acpi_handle; 6 void *acpi_handle;
7#endif 7#endif
8#ifdef CONFIG_X86_64 8#ifdef CONFIG_X86_64
9struct dma_mapping_ops *dma_ops; 9struct dma_map_ops *dma_ops;
10#endif 10#endif
11#ifdef CONFIG_DMAR 11#ifdef CONFIG_DMAR
12 void *iommu; /* hook for IOMMU specific extension */ 12 void *iommu; /* hook for IOMMU specific extension */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 132a134d12f2..cea7b74963e9 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -7,6 +7,8 @@
7 */ 7 */
8 8
9#include <linux/scatterlist.h> 9#include <linux/scatterlist.h>
10#include <linux/dma-debug.h>
11#include <linux/dma-attrs.h>
10#include <asm/io.h> 12#include <asm/io.h>
11#include <asm/swiotlb.h> 13#include <asm/swiotlb.h>
12#include <asm-generic/dma-coherent.h> 14#include <asm-generic/dma-coherent.h>
@@ -16,47 +18,9 @@ extern int iommu_merge;
16extern struct device x86_dma_fallback_dev; 18extern struct device x86_dma_fallback_dev;
17extern int panic_on_overflow; 19extern int panic_on_overflow;
18 20
19struct dma_mapping_ops { 21extern struct dma_map_ops *dma_ops;
20 int (*mapping_error)(struct device *dev, 22
21 dma_addr_t dma_addr); 23static inline struct dma_map_ops *get_dma_ops(struct device *dev)
22 void* (*alloc_coherent)(struct device *dev, size_t size,
23 dma_addr_t *dma_handle, gfp_t gfp);
24 void (*free_coherent)(struct device *dev, size_t size,
25 void *vaddr, dma_addr_t dma_handle);
26 dma_addr_t (*map_single)(struct device *hwdev, phys_addr_t ptr,
27 size_t size, int direction);
28 void (*unmap_single)(struct device *dev, dma_addr_t addr,
29 size_t size, int direction);
30 void (*sync_single_for_cpu)(struct device *hwdev,
31 dma_addr_t dma_handle, size_t size,
32 int direction);
33 void (*sync_single_for_device)(struct device *hwdev,
34 dma_addr_t dma_handle, size_t size,
35 int direction);
36 void (*sync_single_range_for_cpu)(struct device *hwdev,
37 dma_addr_t dma_handle, unsigned long offset,
38 size_t size, int direction);
39 void (*sync_single_range_for_device)(struct device *hwdev,
40 dma_addr_t dma_handle, unsigned long offset,
41 size_t size, int direction);
42 void (*sync_sg_for_cpu)(struct device *hwdev,
43 struct scatterlist *sg, int nelems,
44 int direction);
45 void (*sync_sg_for_device)(struct device *hwdev,
46 struct scatterlist *sg, int nelems,
47 int direction);
48 int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
49 int nents, int direction);
50 void (*unmap_sg)(struct device *hwdev,
51 struct scatterlist *sg, int nents,
52 int direction);
53 int (*dma_supported)(struct device *hwdev, u64 mask);
54 int is_phys;
55};
56
57extern struct dma_mapping_ops *dma_ops;
58
59static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
60{ 24{
61#ifdef CONFIG_X86_32 25#ifdef CONFIG_X86_32
62 return dma_ops; 26 return dma_ops;
@@ -71,7 +35,7 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
71/* Make sure we keep the same behaviour */ 35/* Make sure we keep the same behaviour */
72static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) 36static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
73{ 37{
74 struct dma_mapping_ops *ops = get_dma_ops(dev); 38 struct dma_map_ops *ops = get_dma_ops(dev);
75 if (ops->mapping_error) 39 if (ops->mapping_error)
76 return ops->mapping_error(dev, dma_addr); 40 return ops->mapping_error(dev, dma_addr);
77 41
@@ -90,137 +54,167 @@ extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
90 54
91static inline dma_addr_t 55static inline dma_addr_t
92dma_map_single(struct device *hwdev, void *ptr, size_t size, 56dma_map_single(struct device *hwdev, void *ptr, size_t size,
93 int direction) 57 enum dma_data_direction dir)
94{ 58{
95 struct dma_mapping_ops *ops = get_dma_ops(hwdev); 59 struct dma_map_ops *ops = get_dma_ops(hwdev);
96 60 dma_addr_t addr;
97 BUG_ON(!valid_dma_direction(direction)); 61
98 return ops->map_single(hwdev, virt_to_phys(ptr), size, direction); 62 BUG_ON(!valid_dma_direction(dir));
63 addr = ops->map_page(hwdev, virt_to_page(ptr),
64 (unsigned long)ptr & ~PAGE_MASK, size,
65 dir, NULL);
66 debug_dma_map_page(hwdev, virt_to_page(ptr),
67 (unsigned long)ptr & ~PAGE_MASK, size,
68 dir, addr, true);
69 return addr;
99} 70}
100 71
101static inline void 72static inline void
102dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size, 73dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
103 int direction) 74 enum dma_data_direction dir)
104{ 75{
105 struct dma_mapping_ops *ops = get_dma_ops(dev); 76 struct dma_map_ops *ops = get_dma_ops(dev);
106 77
107 BUG_ON(!valid_dma_direction(direction)); 78 BUG_ON(!valid_dma_direction(dir));
108 if (ops->unmap_single) 79 if (ops->unmap_page)
109 ops->unmap_single(dev, addr, size, direction); 80 ops->unmap_page(dev, addr, size, dir, NULL);
81 debug_dma_unmap_page(dev, addr, size, dir, true);
110} 82}
111 83
112static inline int 84static inline int
113dma_map_sg(struct device *hwdev, struct scatterlist *sg, 85dma_map_sg(struct device *hwdev, struct scatterlist *sg,
114 int nents, int direction) 86 int nents, enum dma_data_direction dir)
115{ 87{
116 struct dma_mapping_ops *ops = get_dma_ops(hwdev); 88 struct dma_map_ops *ops = get_dma_ops(hwdev);
89 int ents;
90
91 BUG_ON(!valid_dma_direction(dir));
92 ents = ops->map_sg(hwdev, sg, nents, dir, NULL);
93 debug_dma_map_sg(hwdev, sg, nents, ents, dir);
117 94
118 BUG_ON(!valid_dma_direction(direction)); 95 return ents;
119 return ops->map_sg(hwdev, sg, nents, direction);
120} 96}
121 97
122static inline void 98static inline void
123dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, 99dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
124 int direction) 100 enum dma_data_direction dir)
125{ 101{
126 struct dma_mapping_ops *ops = get_dma_ops(hwdev); 102 struct dma_map_ops *ops = get_dma_ops(hwdev);
127 103
128 BUG_ON(!valid_dma_direction(direction)); 104 BUG_ON(!valid_dma_direction(dir));
105 debug_dma_unmap_sg(hwdev, sg, nents, dir);
129 if (ops->unmap_sg) 106 if (ops->unmap_sg)
130 ops->unmap_sg(hwdev, sg, nents, direction); 107 ops->unmap_sg(hwdev, sg, nents, dir, NULL);
131} 108}
132 109
133static inline void 110static inline void
134dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle, 111dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
135 size_t size, int direction) 112 size_t size, enum dma_data_direction dir)
136{ 113{
137 struct dma_mapping_ops *ops = get_dma_ops(hwdev); 114 struct dma_map_ops *ops = get_dma_ops(hwdev);
138 115
139 BUG_ON(!valid_dma_direction(direction)); 116 BUG_ON(!valid_dma_direction(dir));
140 if (ops->sync_single_for_cpu) 117 if (ops->sync_single_for_cpu)
141 ops->sync_single_for_cpu(hwdev, dma_handle, size, direction); 118 ops->sync_single_for_cpu(hwdev, dma_handle, size, dir);
119 debug_dma_sync_single_for_cpu(hwdev, dma_handle, size, dir);
142 flush_write_buffers(); 120 flush_write_buffers();
143} 121}
144 122
145static inline void 123static inline void
146dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle, 124dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
147 size_t size, int direction) 125 size_t size, enum dma_data_direction dir)
148{ 126{
149 struct dma_mapping_ops *ops = get_dma_ops(hwdev); 127 struct dma_map_ops *ops = get_dma_ops(hwdev);
150 128
151 BUG_ON(!valid_dma_direction(direction)); 129 BUG_ON(!valid_dma_direction(dir));
152 if (ops->sync_single_for_device) 130 if (ops->sync_single_for_device)
153 ops->sync_single_for_device(hwdev, dma_handle, size, direction); 131 ops->sync_single_for_device(hwdev, dma_handle, size, dir);
132 debug_dma_sync_single_for_device(hwdev, dma_handle, size, dir);
154 flush_write_buffers(); 133 flush_write_buffers();
155} 134}
156 135
157static inline void 136static inline void
158dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle, 137dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
159 unsigned long offset, size_t size, int direction) 138 unsigned long offset, size_t size,
139 enum dma_data_direction dir)
160{ 140{
161 struct dma_mapping_ops *ops = get_dma_ops(hwdev); 141 struct dma_map_ops *ops = get_dma_ops(hwdev);
162 142
163 BUG_ON(!valid_dma_direction(direction)); 143 BUG_ON(!valid_dma_direction(dir));
164 if (ops->sync_single_range_for_cpu) 144 if (ops->sync_single_range_for_cpu)
165 ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, 145 ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
166 size, direction); 146 size, dir);
147 debug_dma_sync_single_range_for_cpu(hwdev, dma_handle,
148 offset, size, dir);
167 flush_write_buffers(); 149 flush_write_buffers();
168} 150}
169 151
170static inline void 152static inline void
171dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle, 153dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
172 unsigned long offset, size_t size, 154 unsigned long offset, size_t size,
173 int direction) 155 enum dma_data_direction dir)
174{ 156{
175 struct dma_mapping_ops *ops = get_dma_ops(hwdev); 157 struct dma_map_ops *ops = get_dma_ops(hwdev);
176 158
177 BUG_ON(!valid_dma_direction(direction)); 159 BUG_ON(!valid_dma_direction(dir));
178 if (ops->sync_single_range_for_device) 160 if (ops->sync_single_range_for_device)
179 ops->sync_single_range_for_device(hwdev, dma_handle, 161 ops->sync_single_range_for_device(hwdev, dma_handle,
180 offset, size, direction); 162 offset, size, dir);
163 debug_dma_sync_single_range_for_device(hwdev, dma_handle,
164 offset, size, dir);
181 flush_write_buffers(); 165 flush_write_buffers();
182} 166}
183 167
184static inline void 168static inline void
185dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, 169dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
186 int nelems, int direction) 170 int nelems, enum dma_data_direction dir)
187{ 171{
188 struct dma_mapping_ops *ops = get_dma_ops(hwdev); 172 struct dma_map_ops *ops = get_dma_ops(hwdev);
189 173
190 BUG_ON(!valid_dma_direction(direction)); 174 BUG_ON(!valid_dma_direction(dir));
191 if (ops->sync_sg_for_cpu) 175 if (ops->sync_sg_for_cpu)
192 ops->sync_sg_for_cpu(hwdev, sg, nelems, direction); 176 ops->sync_sg_for_cpu(hwdev, sg, nelems, dir);
177 debug_dma_sync_sg_for_cpu(hwdev, sg, nelems, dir);
193 flush_write_buffers(); 178 flush_write_buffers();
194} 179}
195 180
196static inline void 181static inline void
197dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, 182dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
198 int nelems, int direction) 183 int nelems, enum dma_data_direction dir)
199{ 184{
200 struct dma_mapping_ops *ops = get_dma_ops(hwdev); 185 struct dma_map_ops *ops = get_dma_ops(hwdev);
201 186
202 BUG_ON(!valid_dma_direction(direction)); 187 BUG_ON(!valid_dma_direction(dir));
203 if (ops->sync_sg_for_device) 188 if (ops->sync_sg_for_device)
204 ops->sync_sg_for_device(hwdev, sg, nelems, direction); 189 ops->sync_sg_for_device(hwdev, sg, nelems, dir);
190 debug_dma_sync_sg_for_device(hwdev, sg, nelems, dir);
205 191
206 flush_write_buffers(); 192 flush_write_buffers();
207} 193}
208 194
209static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, 195static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
210 size_t offset, size_t size, 196 size_t offset, size_t size,
211 int direction) 197 enum dma_data_direction dir)
212{ 198{
213 struct dma_mapping_ops *ops = get_dma_ops(dev); 199 struct dma_map_ops *ops = get_dma_ops(dev);
200 dma_addr_t addr;
214 201
215 BUG_ON(!valid_dma_direction(direction)); 202 BUG_ON(!valid_dma_direction(dir));
216 return ops->map_single(dev, page_to_phys(page) + offset, 203 addr = ops->map_page(dev, page, offset, size, dir, NULL);
217 size, direction); 204 debug_dma_map_page(dev, page, offset, size, dir, addr, false);
205
206 return addr;
218} 207}
219 208
220static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, 209static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
221 size_t size, int direction) 210 size_t size, enum dma_data_direction dir)
222{ 211{
223 dma_unmap_single(dev, addr, size, direction); 212 struct dma_map_ops *ops = get_dma_ops(dev);
213
214 BUG_ON(!valid_dma_direction(dir));
215 if (ops->unmap_page)
216 ops->unmap_page(dev, addr, size, dir, NULL);
217 debug_dma_unmap_page(dev, addr, size, dir, false);
224} 218}
225 219
226static inline void 220static inline void
@@ -266,7 +260,7 @@ static inline void *
266dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, 260dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
267 gfp_t gfp) 261 gfp_t gfp)
268{ 262{
269 struct dma_mapping_ops *ops = get_dma_ops(dev); 263 struct dma_map_ops *ops = get_dma_ops(dev);
270 void *memory; 264 void *memory;
271 265
272 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 266 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
@@ -285,20 +279,24 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
285 if (!ops->alloc_coherent) 279 if (!ops->alloc_coherent)
286 return NULL; 280 return NULL;
287 281
288 return ops->alloc_coherent(dev, size, dma_handle, 282 memory = ops->alloc_coherent(dev, size, dma_handle,
289 dma_alloc_coherent_gfp_flags(dev, gfp)); 283 dma_alloc_coherent_gfp_flags(dev, gfp));
284 debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
285
286 return memory;
290} 287}
291 288
292static inline void dma_free_coherent(struct device *dev, size_t size, 289static inline void dma_free_coherent(struct device *dev, size_t size,
293 void *vaddr, dma_addr_t bus) 290 void *vaddr, dma_addr_t bus)
294{ 291{
295 struct dma_mapping_ops *ops = get_dma_ops(dev); 292 struct dma_map_ops *ops = get_dma_ops(dev);
296 293
297 WARN_ON(irqs_disabled()); /* for portability */ 294 WARN_ON(irqs_disabled()); /* for portability */
298 295
299 if (dma_release_from_coherent(dev, get_order(size), vaddr)) 296 if (dma_release_from_coherent(dev, get_order(size), vaddr))
300 return; 297 return;
301 298
299 debug_dma_free_coherent(dev, size, vaddr, bus);
302 if (ops->free_coherent) 300 if (ops->free_coherent)
303 ops->free_coherent(dev, size, vaddr, bus); 301 ops->free_coherent(dev, size, vaddr, bus);
304} 302}
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index bc68212c6bc0..fd8f9e2ca35f 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -1,22 +1,15 @@
1#ifndef _ASM_X86_DMI_H 1#ifndef _ASM_X86_DMI_H
2#define _ASM_X86_DMI_H 2#define _ASM_X86_DMI_H
3 3
4#include <asm/io.h> 4#include <linux/compiler.h>
5 5#include <linux/init.h>
6#define DMI_MAX_DATA 2048
7 6
8extern int dmi_alloc_index; 7#include <asm/io.h>
9extern char dmi_alloc_data[DMI_MAX_DATA]; 8#include <asm/setup.h>
10 9
11/* This is so early that there is no good way to allocate dynamic memory. 10static __always_inline __init void *dmi_alloc(unsigned len)
12 Allocate data in an BSS array. */
13static inline void *dmi_alloc(unsigned len)
14{ 11{
15 int idx = dmi_alloc_index; 12 return extend_brk(len, sizeof(int));
16 if ((dmi_alloc_index + len) > DMI_MAX_DATA)
17 return NULL;
18 dmi_alloc_index += len;
19 return dmi_alloc_data + idx;
20} 13}
21 14
22/* Use early IO mappings for DMI because it's initialized early */ 15/* Use early IO mappings for DMI because it's initialized early */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 00d41ce4c844..7ecba4d85089 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -72,7 +72,7 @@ extern int e820_all_mapped(u64 start, u64 end, unsigned type);
72extern void e820_add_region(u64 start, u64 size, int type); 72extern void e820_add_region(u64 start, u64 size, int type);
73extern void e820_print_map(char *who); 73extern void e820_print_map(char *who);
74extern int 74extern int
75sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map); 75sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map);
76extern u64 e820_update_range(u64 start, u64 size, unsigned old_type, 76extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
77 unsigned new_type); 77 unsigned new_type);
78extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type, 78extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index ca5ffb2856b6..edc90f23e708 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
37 37
38#else /* !CONFIG_X86_32 */ 38#else /* !CONFIG_X86_32 */
39 39
40#define MAX_EFI_IO_PAGES 100
41
42extern u64 efi_call0(void *fp); 40extern u64 efi_call0(void *fp);
43extern u64 efi_call1(void *fp, u64 arg1); 41extern u64 efi_call1(void *fp, u64 arg1);
44extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); 42extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 854d538ae857..c2e6bedaf258 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -33,6 +33,8 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
33 smp_invalidate_interrupt) 33 smp_invalidate_interrupt)
34#endif 34#endif
35 35
36BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR)
37
36/* 38/*
37 * every pentium local APIC has two 'local interrupts', with a 39 * every pentium local APIC has two 'local interrupts', with a
38 * soft-definable vector attached to both interrupts, one of 40 * soft-definable vector attached to both interrupts, one of
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 23696d44a0af..81937a5dc77c 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -1,11 +1,147 @@
1/*
2 * fixmap.h: compile-time virtual memory allocation
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1998 Ingo Molnar
9 *
10 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11 * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
12 */
13
1#ifndef _ASM_X86_FIXMAP_H 14#ifndef _ASM_X86_FIXMAP_H
2#define _ASM_X86_FIXMAP_H 15#define _ASM_X86_FIXMAP_H
3 16
17#ifndef __ASSEMBLY__
18#include <linux/kernel.h>
19#include <asm/acpi.h>
20#include <asm/apicdef.h>
21#include <asm/page.h>
22#ifdef CONFIG_X86_32
23#include <linux/threads.h>
24#include <asm/kmap_types.h>
25#else
26#include <asm/vsyscall.h>
27#endif
28
29/*
30 * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall
31 * uses fixmaps that relies on FIXADDR_TOP for proper address calculation.
32 * Because of this, FIXADDR_TOP x86 integration was left as later work.
33 */
34#ifdef CONFIG_X86_32
35/* used by vmalloc.c, vsyscall.lds.S.
36 *
37 * Leave one empty page between vmalloc'ed areas and
38 * the start of the fixmap.
39 */
40extern unsigned long __FIXADDR_TOP;
41#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
42
43#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
44#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
45#else
46#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
47
48/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
49#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
50#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
51#endif
52
53
54/*
55 * Here we define all the compile-time 'special' virtual
56 * addresses. The point is to have a constant address at
57 * compile time, but to set the physical address only
58 * in the boot process.
59 * for x86_32: We allocate these special addresses
60 * from the end of virtual memory (0xfffff000) backwards.
61 * Also this lets us do fail-safe vmalloc(), we
62 * can guarantee that these special addresses and
63 * vmalloc()-ed addresses never overlap.
64 *
65 * These 'compile-time allocated' memory buffers are
66 * fixed-size 4k pages (or larger if used with an increment
67 * higher than 1). Use set_fixmap(idx,phys) to associate
68 * physical memory with fixmap indices.
69 *
70 * TLB entries of such buffers will not be flushed across
71 * task switches.
72 */
73enum fixed_addresses {
4#ifdef CONFIG_X86_32 74#ifdef CONFIG_X86_32
5# include "fixmap_32.h" 75 FIX_HOLE,
76 FIX_VDSO,
6#else 77#else
7# include "fixmap_64.h" 78 VSYSCALL_LAST_PAGE,
79 VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
80 + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
81 VSYSCALL_HPET,
8#endif 82#endif
83 FIX_DBGP_BASE,
84 FIX_EARLYCON_MEM_BASE,
85#ifdef CONFIG_X86_LOCAL_APIC
86 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
87#endif
88#ifdef CONFIG_X86_IO_APIC
89 FIX_IO_APIC_BASE_0,
90 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
91#endif
92#ifdef CONFIG_X86_VISWS_APIC
93 FIX_CO_CPU, /* Cobalt timer */
94 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
95 FIX_LI_PCIA, /* Lithium PCI Bridge A */
96 FIX_LI_PCIB, /* Lithium PCI Bridge B */
97#endif
98#ifdef CONFIG_X86_F00F_BUG
99 FIX_F00F_IDT, /* Virtual mapping for IDT */
100#endif
101#ifdef CONFIG_X86_CYCLONE_TIMER
102 FIX_CYCLONE_TIMER, /*cyclone timer register*/
103#endif
104#ifdef CONFIG_X86_32
105 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
106 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
107#ifdef CONFIG_PCI_MMCONFIG
108 FIX_PCIE_MCFG,
109#endif
110#endif
111#ifdef CONFIG_PARAVIRT
112 FIX_PARAVIRT_BOOTMAP,
113#endif
114 FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */
115 FIX_TEXT_POKE1,
116 __end_of_permanent_fixed_addresses,
117#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
118 FIX_OHCI1394_BASE,
119#endif
120 /*
121 * 256 temporary boot-time mappings, used by early_ioremap(),
122 * before ioremap() is functional.
123 *
124 * We round it up to the next 256 pages boundary so that we
125 * can have a single pgd entry and a single pte table:
126 */
127#define NR_FIX_BTMAPS 64
128#define FIX_BTMAPS_SLOTS 4
129 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
130 (__end_of_permanent_fixed_addresses & 255),
131 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
132#ifdef CONFIG_X86_32
133 FIX_WP_TEST,
134#endif
135 __end_of_fixed_addresses
136};
137
138
139extern void reserve_top_address(unsigned long reserve);
140
141#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
142#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
143#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
144#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE)
9 145
10extern int fixmaps_set; 146extern int fixmaps_set;
11 147
@@ -69,4 +205,5 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
69 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); 205 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
70 return __virt_to_fix(vaddr); 206 return __virt_to_fix(vaddr);
71} 207}
208#endif /* !__ASSEMBLY__ */
72#endif /* _ASM_X86_FIXMAP_H */ 209#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/fixmap_32.h b/arch/x86/include/asm/fixmap_32.h
deleted file mode 100644
index 047d9bab2b31..000000000000
--- a/arch/x86/include/asm/fixmap_32.h
+++ /dev/null
@@ -1,115 +0,0 @@
1/*
2 * fixmap.h: compile-time virtual memory allocation
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1998 Ingo Molnar
9 *
10 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11 */
12
13#ifndef _ASM_X86_FIXMAP_32_H
14#define _ASM_X86_FIXMAP_32_H
15
16
17/* used by vmalloc.c, vsyscall.lds.S.
18 *
19 * Leave one empty page between vmalloc'ed areas and
20 * the start of the fixmap.
21 */
22extern unsigned long __FIXADDR_TOP;
23#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
24#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
25
26#ifndef __ASSEMBLY__
27#include <linux/kernel.h>
28#include <asm/acpi.h>
29#include <asm/apicdef.h>
30#include <asm/page.h>
31#include <linux/threads.h>
32#include <asm/kmap_types.h>
33
34/*
35 * Here we define all the compile-time 'special' virtual
36 * addresses. The point is to have a constant address at
37 * compile time, but to set the physical address only
38 * in the boot process. We allocate these special addresses
39 * from the end of virtual memory (0xfffff000) backwards.
40 * Also this lets us do fail-safe vmalloc(), we
41 * can guarantee that these special addresses and
42 * vmalloc()-ed addresses never overlap.
43 *
44 * these 'compile-time allocated' memory buffers are
45 * fixed-size 4k pages. (or larger if used with an increment
46 * highger than 1) use fixmap_set(idx,phys) to associate
47 * physical memory with fixmap indices.
48 *
49 * TLB entries of such buffers will not be flushed across
50 * task switches.
51 */
52enum fixed_addresses {
53 FIX_HOLE,
54 FIX_VDSO,
55 FIX_DBGP_BASE,
56 FIX_EARLYCON_MEM_BASE,
57#ifdef CONFIG_X86_LOCAL_APIC
58 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
59#endif
60#ifdef CONFIG_X86_IO_APIC
61 FIX_IO_APIC_BASE_0,
62 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
63#endif
64#ifdef CONFIG_X86_VISWS_APIC
65 FIX_CO_CPU, /* Cobalt timer */
66 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
67 FIX_LI_PCIA, /* Lithium PCI Bridge A */
68 FIX_LI_PCIB, /* Lithium PCI Bridge B */
69#endif
70#ifdef CONFIG_X86_F00F_BUG
71 FIX_F00F_IDT, /* Virtual mapping for IDT */
72#endif
73#ifdef CONFIG_X86_CYCLONE_TIMER
74 FIX_CYCLONE_TIMER, /*cyclone timer register*/
75#endif
76 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
77 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
78#ifdef CONFIG_PCI_MMCONFIG
79 FIX_PCIE_MCFG,
80#endif
81#ifdef CONFIG_PARAVIRT
82 FIX_PARAVIRT_BOOTMAP,
83#endif
84 __end_of_permanent_fixed_addresses,
85 /*
86 * 256 temporary boot-time mappings, used by early_ioremap(),
87 * before ioremap() is functional.
88 *
89 * We round it up to the next 256 pages boundary so that we
90 * can have a single pgd entry and a single pte table:
91 */
92#define NR_FIX_BTMAPS 64
93#define FIX_BTMAPS_SLOTS 4
94 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
95 (__end_of_permanent_fixed_addresses & 255),
96 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
97 FIX_WP_TEST,
98#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
99 FIX_OHCI1394_BASE,
100#endif
101 __end_of_fixed_addresses
102};
103
104extern void reserve_top_address(unsigned long reserve);
105
106
107#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
108
109#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
110#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
111#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
112#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
113
114#endif /* !__ASSEMBLY__ */
115#endif /* _ASM_X86_FIXMAP_32_H */
diff --git a/arch/x86/include/asm/fixmap_64.h b/arch/x86/include/asm/fixmap_64.h
deleted file mode 100644
index 298d9ba3faeb..000000000000
--- a/arch/x86/include/asm/fixmap_64.h
+++ /dev/null
@@ -1,79 +0,0 @@
1/*
2 * fixmap.h: compile-time virtual memory allocation
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1998 Ingo Molnar
9 */
10
11#ifndef _ASM_X86_FIXMAP_64_H
12#define _ASM_X86_FIXMAP_64_H
13
14#include <linux/kernel.h>
15#include <asm/acpi.h>
16#include <asm/apicdef.h>
17#include <asm/page.h>
18#include <asm/vsyscall.h>
19#include <asm/efi.h>
20
21/*
22 * Here we define all the compile-time 'special' virtual
23 * addresses. The point is to have a constant address at
24 * compile time, but to set the physical address only
25 * in the boot process.
26 *
27 * These 'compile-time allocated' memory buffers are
28 * fixed-size 4k pages (or larger if used with an increment
29 * higher than 1). Use set_fixmap(idx,phys) to associate
30 * physical memory with fixmap indices.
31 *
32 * TLB entries of such buffers will not be flushed across
33 * task switches.
34 */
35
36enum fixed_addresses {
37 VSYSCALL_LAST_PAGE,
38 VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
39 + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
40 VSYSCALL_HPET,
41 FIX_DBGP_BASE,
42 FIX_EARLYCON_MEM_BASE,
43 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
44 FIX_IO_APIC_BASE_0,
45 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
46 FIX_EFI_IO_MAP_LAST_PAGE,
47 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
48 + MAX_EFI_IO_PAGES - 1,
49#ifdef CONFIG_PARAVIRT
50 FIX_PARAVIRT_BOOTMAP,
51#endif
52 __end_of_permanent_fixed_addresses,
53#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
54 FIX_OHCI1394_BASE,
55#endif
56 /*
57 * 256 temporary boot-time mappings, used by early_ioremap(),
58 * before ioremap() is functional.
59 *
60 * We round it up to the next 256 pages boundary so that we
61 * can have a single pgd entry and a single pte table:
62 */
63#define NR_FIX_BTMAPS 64
64#define FIX_BTMAPS_SLOTS 4
65 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
66 (__end_of_permanent_fixed_addresses & 255),
67 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
68 __end_of_fixed_addresses
69};
70
71#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
72#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
73#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
74
75/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
76#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
77#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
78
79#endif /* _ASM_X86_FIXMAP_64_H */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index b55b4a7fbefd..bd2c6511c887 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -28,6 +28,13 @@
28 28
29#endif 29#endif
30 30
31/* FIXME: I don't want to stay hardcoded */
32#ifdef CONFIG_X86_64
33# define FTRACE_SYSCALL_MAX 296
34#else
35# define FTRACE_SYSCALL_MAX 333
36#endif
37
31#ifdef CONFIG_FUNCTION_TRACER 38#ifdef CONFIG_FUNCTION_TRACER
32#define MCOUNT_ADDR ((long)(mcount)) 39#define MCOUNT_ADDR ((long)(mcount))
33#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ 40#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
@@ -55,29 +62,4 @@ struct dyn_arch_ftrace {
55#endif /* __ASSEMBLY__ */ 62#endif /* __ASSEMBLY__ */
56#endif /* CONFIG_FUNCTION_TRACER */ 63#endif /* CONFIG_FUNCTION_TRACER */
57 64
58#ifdef CONFIG_FUNCTION_GRAPH_TRACER
59
60#ifndef __ASSEMBLY__
61
62/*
63 * Stack of return addresses for functions
64 * of a thread.
65 * Used in struct thread_info
66 */
67struct ftrace_ret_stack {
68 unsigned long ret;
69 unsigned long func;
70 unsigned long long calltime;
71};
72
73/*
74 * Primary handler of a function return.
75 * It relays on ftrace_return_to_handler.
76 * Defined in entry_32/64.S
77 */
78extern void return_to_handler(void);
79
80#endif /* __ASSEMBLY__ */
81#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
82
83#endif /* _ASM_X86_FTRACE_H */ 65#endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 46ebed797e4f..25454427ceea 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,6 +12,7 @@ typedef struct {
12 unsigned int apic_timer_irqs; /* arch dependent */ 12 unsigned int apic_timer_irqs; /* arch dependent */
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */
15 unsigned int apic_perf_irqs; 16 unsigned int apic_perf_irqs;
16#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
17 unsigned int irq_resched_count; 18 unsigned int irq_resched_count;
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index bf9276bea660..014c2b85ae45 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -63,6 +63,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
63void *kmap_atomic(struct page *page, enum km_type type); 63void *kmap_atomic(struct page *page, enum km_type type);
64void kunmap_atomic(void *kvaddr, enum km_type type); 64void kunmap_atomic(void *kvaddr, enum km_type type);
65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); 65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
66struct page *kmap_atomic_to_page(void *ptr); 67struct page *kmap_atomic_to_page(void *ptr);
67 68
68#ifndef CONFIG_PARAVIRT 69#ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index f39881b6b68b..ae80f64973e0 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,6 +27,7 @@
27 27
28/* Interrupt handlers registered during init_IRQ */ 28/* Interrupt handlers registered during init_IRQ */
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void);
30extern void error_interrupt(void); 31extern void error_interrupt(void);
31extern void perf_counter_interrupt(void); 32extern void perf_counter_interrupt(void);
32 33
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 48f0004db8c9..71c9e5183982 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk)
172 172
173#else /* CONFIG_X86_32 */ 173#else /* CONFIG_X86_32 */
174 174
175extern void finit(void); 175#ifdef CONFIG_MATH_EMULATION
176extern void finit_task(struct task_struct *tsk);
177#else
178static inline void finit_task(struct task_struct *tsk)
179{
180}
181#endif
176 182
177static inline void tolerant_fwait(void) 183static inline void tolerant_fwait(void)
178{ 184{
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 50ca486fd88c..1f7e62517284 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -129,13 +129,6 @@ typedef struct compat_siginfo {
129 } _sifields; 129 } _sifields;
130} compat_siginfo_t; 130} compat_siginfo_t;
131 131
132struct ustat32 {
133 __u32 f_tfree;
134 compat_ino_t f_tinode;
135 char f_fname[6];
136 char f_fpack[6];
137};
138
139#define IA32_STACK_TOP IA32_PAGE_OFFSET 132#define IA32_STACK_TOP IA32_PAGE_OFFSET
140 133
141#ifdef __KERNEL__ 134#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644
index 000000000000..36fb1a6a5109
--- /dev/null
+++ b/arch/x86/include/asm/init.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_INIT_32_H
2#define _ASM_X86_INIT_32_H
3
4#ifdef CONFIG_X86_32
5extern void __init early_ioremap_page_table_range_init(void);
6#endif
7
8extern unsigned long __init
9kernel_physical_mapping_init(unsigned long start,
10 unsigned long end,
11 unsigned long page_size_mask);
12
13
14extern unsigned long __initdata e820_table_start;
15extern unsigned long __meminitdata e820_table_end;
16extern unsigned long __meminitdata e820_table_top;
17
18#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 683d0b4c00fc..e5383e3d2f8c 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -172,8 +172,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
172 172
173extern void iounmap(volatile void __iomem *addr); 173extern void iounmap(volatile void __iomem *addr);
174 174
175extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
176
177 175
178#ifdef CONFIG_X86_32 176#ifdef CONFIG_X86_32
179# include "io_32.h" 177# include "io_32.h"
@@ -198,7 +196,6 @@ extern void early_ioremap_reset(void);
198extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); 196extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
199extern void __iomem *early_memremap(unsigned long offset, unsigned long size); 197extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
200extern void early_iounmap(void __iomem *addr, unsigned long size); 198extern void early_iounmap(void __iomem *addr, unsigned long size);
201extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
202 199
203#define IO_SPACE_LIMIT 0xffff 200#define IO_SPACE_LIMIT 0xffff
204 201
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 59cb4a1317b7..373cc2bbcad2 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -162,7 +162,8 @@ extern int (*ioapic_renumber_irq)(int ioapic, int irq);
162extern void ioapic_init_mappings(void); 162extern void ioapic_init_mappings(void);
163 163
164#ifdef CONFIG_X86_64 164#ifdef CONFIG_X86_64
165extern int save_mask_IO_APIC_setup(void); 165extern int save_IO_APIC_setup(void);
166extern void mask_IO_APIC_setup(void);
166extern void restore_IO_APIC_setup(void); 167extern void restore_IO_APIC_setup(void);
167extern void reinit_intr_remapped_IO_APIC(int); 168extern void reinit_intr_remapped_IO_APIC(int);
168#endif 169#endif
@@ -172,7 +173,7 @@ extern void probe_nr_irqs_gsi(void);
172extern int setup_ioapic_entry(int apic, int irq, 173extern int setup_ioapic_entry(int apic, int irq,
173 struct IO_APIC_route_entry *entry, 174 struct IO_APIC_route_entry *entry,
174 unsigned int destination, int trigger, 175 unsigned int destination, int trigger,
175 int polarity, int vector); 176 int polarity, int vector, int pin);
176extern void ioapic_write_entry(int apic, int pin, 177extern void ioapic_write_entry(int apic, int pin,
177 struct IO_APIC_route_entry e); 178 struct IO_APIC_route_entry e);
178#else /* !CONFIG_X86_IO_APIC */ 179#else /* !CONFIG_X86_IO_APIC */
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index bd46495ff7de..86af26091d6c 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -24,10 +24,7 @@
24#include <asm/tlbflush.h> 24#include <asm/tlbflush.h>
25 25
26int 26int
27reserve_io_memtype_wc(u64 base, unsigned long size, pgprot_t *prot); 27is_io_mapping_possible(resource_size_t base, unsigned long size);
28
29void
30free_io_memtype(u64 base, unsigned long size);
31 28
32void * 29void *
33iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 30iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index a6ee9e6f530f..af326a2975b5 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -3,7 +3,7 @@
3 3
4extern void pci_iommu_shutdown(void); 4extern void pci_iommu_shutdown(void);
5extern void no_iommu_init(void); 5extern void no_iommu_init(void);
6extern struct dma_mapping_ops nommu_dma_ops; 6extern struct dma_map_ops nommu_dma_ops;
7extern int force_iommu, no_iommu; 7extern int force_iommu, no_iommu;
8extern int iommu_detected; 8extern int iommu_detected;
9 9
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 107eb2196691..f38481bcd455 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,6 +36,7 @@ static inline int irq_canonicalize(int irq)
36extern void fixup_irqs(void); 36extern void fixup_irqs(void);
37#endif 37#endif
38 38
39extern void (*generic_interrupt_extension)(void);
39extern void init_IRQ(void); 40extern void init_IRQ(void);
40extern void native_init_IRQ(void); 41extern void native_init_IRQ(void);
41extern bool handle_irq(unsigned irq, struct pt_regs *regs); 42extern bool handle_irq(unsigned irq, struct pt_regs *regs);
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 20e1fd588dbf..0396760fccb8 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,8 +1,6 @@
1#ifndef _ASM_X86_IRQ_REMAPPING_H 1#ifndef _ASM_X86_IRQ_REMAPPING_H
2#define _ASM_X86_IRQ_REMAPPING_H 2#define _ASM_X86_IRQ_REMAPPING_H
3 3
4extern int x2apic;
5
6#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8) 4#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
7 5
8#endif /* _ASM_X86_IRQ_REMAPPING_H */ 6#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8a285f356f8a..3cbd79bbb47c 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -112,6 +112,11 @@
112#define LOCAL_PERF_VECTOR 0xee 112#define LOCAL_PERF_VECTOR 0xee
113 113
114/* 114/*
115 * Generic system vector for platform specific use
116 */
117#define GENERIC_INTERRUPT_VECTOR 0xed
118
119/*
115 * First APIC vector available to drivers: (vectors 0x30-0xee) we 120 * First APIC vector available to drivers: (vectors 0x30-0xee) we
116 * start at 0x31(0x41) to spread out vectors evenly between priority 121 * start at 0x31(0x41) to spread out vectors evenly between priority
117 * levels. (0x80 is the syscall vector) 122 * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 0ceb6d19ed30..317ff1703d0b 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -9,13 +9,13 @@
9# define PAGES_NR 4 9# define PAGES_NR 4
10#else 10#else
11# define PA_CONTROL_PAGE 0 11# define PA_CONTROL_PAGE 0
12# define PA_TABLE_PAGE 1 12# define VA_CONTROL_PAGE 1
13# define PAGES_NR 2 13# define PA_TABLE_PAGE 2
14# define PA_SWAP_PAGE 3
15# define PAGES_NR 4
14#endif 16#endif
15 17
16#ifdef CONFIG_X86_32
17# define KEXEC_CONTROL_CODE_MAX_SIZE 2048 18# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
18#endif
19 19
20#ifndef __ASSEMBLY__ 20#ifndef __ASSEMBLY__
21 21
@@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page,
136 unsigned int has_pae, 136 unsigned int has_pae,
137 unsigned int preserve_context); 137 unsigned int preserve_context);
138#else 138#else
139NORET_TYPE void 139unsigned long
140relocate_kernel(unsigned long indirection_page, 140relocate_kernel(unsigned long indirection_page,
141 unsigned long page_list, 141 unsigned long page_list,
142 unsigned long start_address) ATTRIB_NORET; 142 unsigned long start_address,
143 unsigned int preserve_context);
143#endif 144#endif
144 145
145#define ARCH_HAS_KIMAGE_ARCH 146#define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 886c9402ec45..dc3f6cf11704 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -15,6 +15,7 @@
15#define __KVM_HAVE_DEVICE_ASSIGNMENT 15#define __KVM_HAVE_DEVICE_ASSIGNMENT
16#define __KVM_HAVE_MSI 16#define __KVM_HAVE_MSI
17#define __KVM_HAVE_USER_NMI 17#define __KVM_HAVE_USER_NMI
18#define __KVM_HAVE_GUEST_DEBUG
18 19
19/* Architectural interrupt line count. */ 20/* Architectural interrupt line count. */
20#define KVM_NR_INTERRUPTS 256 21#define KVM_NR_INTERRUPTS 256
@@ -212,7 +213,30 @@ struct kvm_pit_channel_state {
212 __s64 count_load_time; 213 __s64 count_load_time;
213}; 214};
214 215
216struct kvm_debug_exit_arch {
217 __u32 exception;
218 __u32 pad;
219 __u64 pc;
220 __u64 dr6;
221 __u64 dr7;
222};
223
224#define KVM_GUESTDBG_USE_SW_BP 0x00010000
225#define KVM_GUESTDBG_USE_HW_BP 0x00020000
226#define KVM_GUESTDBG_INJECT_DB 0x00040000
227#define KVM_GUESTDBG_INJECT_BP 0x00080000
228
229/* for KVM_SET_GUEST_DEBUG */
230struct kvm_guest_debug_arch {
231 __u64 debugreg[8];
232};
233
215struct kvm_pit_state { 234struct kvm_pit_state {
216 struct kvm_pit_channel_state channels[3]; 235 struct kvm_pit_channel_state channels[3];
217}; 236};
237
238struct kvm_reinject_control {
239 __u8 pit_reinject;
240 __u8 reserved[31];
241};
218#endif /* _ASM_X86_KVM_H */ 242#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 730843d1d2fb..f0faf58044ff 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -22,6 +22,7 @@
22#include <asm/pvclock-abi.h> 22#include <asm/pvclock-abi.h>
23#include <asm/desc.h> 23#include <asm/desc.h>
24#include <asm/mtrr.h> 24#include <asm/mtrr.h>
25#include <asm/msr-index.h>
25 26
26#define KVM_MAX_VCPUS 16 27#define KVM_MAX_VCPUS 16
27#define KVM_MEMORY_SLOTS 32 28#define KVM_MEMORY_SLOTS 32
@@ -134,11 +135,18 @@ enum {
134 135
135#define KVM_NR_MEM_OBJS 40 136#define KVM_NR_MEM_OBJS 40
136 137
137struct kvm_guest_debug { 138#define KVM_NR_DB_REGS 4
138 int enabled; 139
139 unsigned long bp[4]; 140#define DR6_BD (1 << 13)
140 int singlestep; 141#define DR6_BS (1 << 14)
141}; 142#define DR6_FIXED_1 0xffff0ff0
143#define DR6_VOLATILE 0x0000e00f
144
145#define DR7_BP_EN_MASK 0x000000ff
146#define DR7_GE (1 << 9)
147#define DR7_GD (1 << 13)
148#define DR7_FIXED_1 0x00000400
149#define DR7_VOLATILE 0xffff23ff
142 150
143/* 151/*
144 * We don't want allocation failures within the mmu code, so we preallocate 152 * We don't want allocation failures within the mmu code, so we preallocate
@@ -162,7 +170,8 @@ struct kvm_pte_chain {
162 * bits 0:3 - total guest paging levels (2-4, or zero for real mode) 170 * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
163 * bits 4:7 - page table level for this shadow (1-4) 171 * bits 4:7 - page table level for this shadow (1-4)
164 * bits 8:9 - page table quadrant for 2-level guests 172 * bits 8:9 - page table quadrant for 2-level guests
165 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) 173 * bit 16 - direct mapping of virtual to physical mapping at gfn
174 * used for real mode and two-dimensional paging
166 * bits 17:19 - common access permissions for all ptes in this shadow page 175 * bits 17:19 - common access permissions for all ptes in this shadow page
167 */ 176 */
168union kvm_mmu_page_role { 177union kvm_mmu_page_role {
@@ -172,9 +181,10 @@ union kvm_mmu_page_role {
172 unsigned level:4; 181 unsigned level:4;
173 unsigned quadrant:2; 182 unsigned quadrant:2;
174 unsigned pad_for_nice_hex_output:6; 183 unsigned pad_for_nice_hex_output:6;
175 unsigned metaphysical:1; 184 unsigned direct:1;
176 unsigned access:3; 185 unsigned access:3;
177 unsigned invalid:1; 186 unsigned invalid:1;
187 unsigned cr4_pge:1;
178 }; 188 };
179}; 189};
180 190
@@ -218,6 +228,18 @@ struct kvm_pv_mmu_op_buffer {
218 char buf[512] __aligned(sizeof(long)); 228 char buf[512] __aligned(sizeof(long));
219}; 229};
220 230
231struct kvm_pio_request {
232 unsigned long count;
233 int cur_count;
234 gva_t guest_gva;
235 int in;
236 int port;
237 int size;
238 int string;
239 int down;
240 int rep;
241};
242
221/* 243/*
222 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level 244 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
223 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu 245 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
@@ -236,6 +258,7 @@ struct kvm_mmu {
236 hpa_t root_hpa; 258 hpa_t root_hpa;
237 int root_level; 259 int root_level;
238 int shadow_root_level; 260 int shadow_root_level;
261 union kvm_mmu_page_role base_role;
239 262
240 u64 *pae_root; 263 u64 *pae_root;
241}; 264};
@@ -258,6 +281,7 @@ struct kvm_vcpu_arch {
258 unsigned long cr3; 281 unsigned long cr3;
259 unsigned long cr4; 282 unsigned long cr4;
260 unsigned long cr8; 283 unsigned long cr8;
284 u32 hflags;
261 u64 pdptrs[4]; /* pae */ 285 u64 pdptrs[4]; /* pae */
262 u64 shadow_efer; 286 u64 shadow_efer;
263 u64 apic_base; 287 u64 apic_base;
@@ -338,6 +362,15 @@ struct kvm_vcpu_arch {
338 362
339 struct mtrr_state_type mtrr_state; 363 struct mtrr_state_type mtrr_state;
340 u32 pat; 364 u32 pat;
365
366 int switch_db_regs;
367 unsigned long host_db[KVM_NR_DB_REGS];
368 unsigned long host_dr6;
369 unsigned long host_dr7;
370 unsigned long db[KVM_NR_DB_REGS];
371 unsigned long dr6;
372 unsigned long dr7;
373 unsigned long eff_db[KVM_NR_DB_REGS];
341}; 374};
342 375
343struct kvm_mem_alias { 376struct kvm_mem_alias {
@@ -378,6 +411,7 @@ struct kvm_arch{
378 411
379 unsigned long irq_sources_bitmap; 412 unsigned long irq_sources_bitmap;
380 unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; 413 unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
414 u64 vm_init_tsc;
381}; 415};
382 416
383struct kvm_vm_stat { 417struct kvm_vm_stat {
@@ -446,8 +480,7 @@ struct kvm_x86_ops {
446 void (*vcpu_put)(struct kvm_vcpu *vcpu); 480 void (*vcpu_put)(struct kvm_vcpu *vcpu);
447 481
448 int (*set_guest_debug)(struct kvm_vcpu *vcpu, 482 int (*set_guest_debug)(struct kvm_vcpu *vcpu,
449 struct kvm_debug_guest *dbg); 483 struct kvm_guest_debug *dbg);
450 void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
451 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 484 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
452 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 485 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
453 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 486 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -583,16 +616,12 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
583void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 616void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
584 u32 error_code); 617 u32 error_code);
585 618
586void kvm_pic_set_irq(void *opaque, int irq, int level); 619int kvm_pic_set_irq(void *opaque, int irq, int level);
587 620
588void kvm_inject_nmi(struct kvm_vcpu *vcpu); 621void kvm_inject_nmi(struct kvm_vcpu *vcpu);
589 622
590void fx_init(struct kvm_vcpu *vcpu); 623void fx_init(struct kvm_vcpu *vcpu);
591 624
592int emulator_read_std(unsigned long addr,
593 void *val,
594 unsigned int bytes,
595 struct kvm_vcpu *vcpu);
596int emulator_write_emulated(unsigned long addr, 625int emulator_write_emulated(unsigned long addr,
597 const void *val, 626 const void *val,
598 unsigned int bytes, 627 unsigned int bytes,
@@ -737,6 +766,10 @@ enum {
737 TASK_SWITCH_GATE = 3, 766 TASK_SWITCH_GATE = 3,
738}; 767};
739 768
769#define HF_GIF_MASK (1 << 0)
770#define HF_HIF_MASK (1 << 1)
771#define HF_VINTR_MASK (1 << 2)
772
740/* 773/*
741 * Hardware virtualization extension instructions may fault if a 774 * Hardware virtualization extension instructions may fault if a
742 * reboot turns off virtualization while processes are running. 775 * reboot turns off virtualization while processes are running.
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index 43894428c3c2..0f4ee7148afe 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -26,36 +26,20 @@
26 26
27#ifndef __ASSEMBLY__ 27#ifndef __ASSEMBLY__
28#include <asm/hw_irq.h> 28#include <asm/hw_irq.h>
29#include <asm/kvm_para.h>
29 30
30/*G:031 But first, how does our Guest contact the Host to ask for privileged 31/*G:031 But first, how does our Guest contact the Host to ask for privileged
31 * operations? There are two ways: the direct way is to make a "hypercall", 32 * operations? There are two ways: the direct way is to make a "hypercall",
32 * to make requests of the Host Itself. 33 * to make requests of the Host Itself.
33 * 34 *
34 * Our hypercall mechanism uses the highest unused trap code (traps 32 and 35 * We use the KVM hypercall mechanism. Eighteen hypercalls are
35 * above are used by real hardware interrupts). Fifteen hypercalls are
36 * available: the hypercall number is put in the %eax register, and the 36 * available: the hypercall number is put in the %eax register, and the
37 * arguments (when required) are placed in %edx, %ebx and %ecx. If a return 37 * arguments (when required) are placed in %ebx, %ecx and %edx. If a return
38 * value makes sense, it's returned in %eax. 38 * value makes sense, it's returned in %eax.
39 * 39 *
40 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 40 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
41 * Host, rather than returning failure. This reflects Winston Churchill's 41 * Host, rather than returning failure. This reflects Winston Churchill's
42 * definition of a gentleman: "someone who is only rude intentionally". */ 42 * definition of a gentleman: "someone who is only rude intentionally". */
43static inline unsigned long
44hcall(unsigned long call,
45 unsigned long arg1, unsigned long arg2, unsigned long arg3)
46{
47 /* "int" is the Intel instruction to trigger a trap. */
48 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
49 /* The call in %eax (aka "a") might be overwritten */
50 : "=a"(call)
51 /* The arguments are in %eax, %edx, %ebx & %ecx */
52 : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
53 /* "memory" means this might write somewhere in memory.
54 * This isn't true for all calls, but it's safe to tell
55 * gcc that it might happen so it doesn't get clever. */
56 : "memory");
57 return call;
58}
59/*:*/ 43/*:*/
60 44
61/* Can't use our min() macro here: needs to be a constant */ 45/* Can't use our min() macro here: needs to be a constant */
@@ -64,7 +48,7 @@ hcall(unsigned long call,
64#define LHCALL_RING_SIZE 64 48#define LHCALL_RING_SIZE 64
65struct hcall_args { 49struct hcall_args {
66 /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ 50 /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
67 unsigned long arg0, arg2, arg3, arg1; 51 unsigned long arg0, arg1, arg2, arg3;
68}; 52};
69 53
70#endif /* !__ASSEMBLY__ */ 54#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 9320e2a8a26a..12d55e773eb6 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -1,14 +1,11 @@
1#ifndef _ASM_X86_LINKAGE_H 1#ifndef _ASM_X86_LINKAGE_H
2#define _ASM_X86_LINKAGE_H 2#define _ASM_X86_LINKAGE_H
3 3
4#include <linux/stringify.h>
5
4#undef notrace 6#undef notrace
5#define notrace __attribute__((no_instrument_function)) 7#define notrace __attribute__((no_instrument_function))
6 8
7#ifdef CONFIG_X86_64
8#define __ALIGN .p2align 4,,15
9#define __ALIGN_STR ".p2align 4,,15"
10#endif
11
12#ifdef CONFIG_X86_32 9#ifdef CONFIG_X86_32
13#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) 10#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
14/* 11/*
@@ -50,16 +47,20 @@
50 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ 47 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
51 "g" (arg4), "g" (arg5), "g" (arg6)) 48 "g" (arg4), "g" (arg5), "g" (arg6))
52 49
53#endif 50#endif /* CONFIG_X86_32 */
51
52#ifdef __ASSEMBLY__
54 53
55#define GLOBAL(name) \ 54#define GLOBAL(name) \
56 .globl name; \ 55 .globl name; \
57 name: 56 name:
58 57
59#ifdef CONFIG_X86_ALIGNMENT_16 58#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16)
60#define __ALIGN .align 16,0x90 59#define __ALIGN .p2align 4, 0x90
61#define __ALIGN_STR ".align 16,0x90" 60#define __ALIGN_STR __stringify(__ALIGN)
62#endif 61#endif
63 62
63#endif /* __ASSEMBLY__ */
64
64#endif /* _ASM_X86_LINKAGE_H */ 65#endif /* _ASM_X86_LINKAGE_H */
65 66
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17b960b..563933e06a35 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
11 */ 11 */
12 12
13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ 13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
14#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
15#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
14 16
15#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ 17#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
16#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ 18#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
90 92
91#include <asm/atomic.h> 93#include <asm/atomic.h>
92 94
95void mce_setup(struct mce *m);
93void mce_log(struct mce *m); 96void mce_log(struct mce *m);
94DECLARE_PER_CPU(struct sys_device, device_mce); 97DECLARE_PER_CPU(struct sys_device, device_mce);
95extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 98extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
96 99
100/*
101 * To support more than 128 would need to escape the predefined
102 * Linux defined extended banks first.
103 */
104#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
105
97#ifdef CONFIG_X86_MCE_INTEL 106#ifdef CONFIG_X86_MCE_INTEL
98void mce_intel_feature_init(struct cpuinfo_x86 *c); 107void mce_intel_feature_init(struct cpuinfo_x86 *c);
108void cmci_clear(void);
109void cmci_reenable(void);
110void cmci_rediscover(int dying);
111void cmci_recheck(void);
99#else 112#else
100static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } 113static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
114static inline void cmci_clear(void) {}
115static inline void cmci_reenable(void) {}
116static inline void cmci_rediscover(int dying) {}
117static inline void cmci_recheck(void) {}
101#endif 118#endif
102 119
103#ifdef CONFIG_X86_MCE_AMD 120#ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
106static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } 123static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
107#endif 124#endif
108 125
109void mce_log_therm_throt_event(unsigned int cpu, __u64 status); 126extern int mce_available(struct cpuinfo_x86 *c);
127
128void mce_log_therm_throt_event(__u64 status);
110 129
111extern atomic_t mce_entry; 130extern atomic_t mce_entry;
112 131
113extern void do_machine_check(struct pt_regs *, long); 132extern void do_machine_check(struct pt_regs *, long);
133
134typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
135DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
136
137enum mcp_flags {
138 MCP_TIMESTAMP = (1 << 0), /* log time stamp */
139 MCP_UC = (1 << 1), /* log uncorrected errors */
140};
141extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
142
114extern int mce_notify_user(void); 143extern int mce_notify_user(void);
115 144
116#endif /* !CONFIG_X86_32 */ 145#endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
120#else 149#else
121#define mcheck_init(c) do { } while (0) 150#define mcheck_init(c) do { } while (0)
122#endif 151#endif
123extern void stop_mce(void); 152
124extern void restart_mce(void); 153extern void (*mce_threshold_vector)(void);
125 154
126#endif /* __KERNEL__ */ 155#endif /* __KERNEL__ */
127#endif /* _ASM_X86_MCE_H */ 156#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 105fb90a0635..ede6998bd92c 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -91,46 +91,9 @@ static inline int pfn_valid(int pfn)
91#endif /* CONFIG_DISCONTIGMEM */ 91#endif /* CONFIG_DISCONTIGMEM */
92 92
93#ifdef CONFIG_NEED_MULTIPLE_NODES 93#ifdef CONFIG_NEED_MULTIPLE_NODES
94 94/* always use node 0 for bootmem on this numa platform */
95/* 95#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \
96 * Following are macros that are specific to this numa platform. 96 (NODE_DATA(0)->bdata)
97 */
98#define reserve_bootmem(addr, size, flags) \
99 reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
100#define alloc_bootmem(x) \
101 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
102#define alloc_bootmem_nopanic(x) \
103 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
104 __pa(MAX_DMA_ADDRESS))
105#define alloc_bootmem_low(x) \
106 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
107#define alloc_bootmem_pages(x) \
108 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
109#define alloc_bootmem_pages_nopanic(x) \
110 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
111 __pa(MAX_DMA_ADDRESS))
112#define alloc_bootmem_low_pages(x) \
113 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
114#define alloc_bootmem_node(pgdat, x) \
115({ \
116 struct pglist_data __maybe_unused \
117 *__alloc_bootmem_node__pgdat = (pgdat); \
118 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
119 __pa(MAX_DMA_ADDRESS)); \
120})
121#define alloc_bootmem_pages_node(pgdat, x) \
122({ \
123 struct pglist_data __maybe_unused \
124 *__alloc_bootmem_node__pgdat = (pgdat); \
125 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
126 __pa(MAX_DMA_ADDRESS)); \
127})
128#define alloc_bootmem_low_pages_node(pgdat, x) \
129({ \
130 struct pglist_data __maybe_unused \
131 *__alloc_bootmem_node__pgdat = (pgdat); \
132 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
133})
134#endif /* CONFIG_NEED_MULTIPLE_NODES */ 97#endif /* CONFIG_NEED_MULTIPLE_NODES */
135 98
136#endif /* _ASM_X86_MMZONE_32_H */ 99#endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
index 6706b3006f13..4cc48af23fef 100644
--- a/arch/x86/include/asm/msidef.h
+++ b/arch/x86/include/asm/msidef.h
@@ -47,6 +47,7 @@
47#define MSI_ADDR_DEST_ID_MASK 0x00ffff0 47#define MSI_ADDR_DEST_ID_MASK 0x00ffff0
48#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \ 48#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
49 MSI_ADDR_DEST_ID_MASK) 49 MSI_ADDR_DEST_ID_MASK)
50#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00)
50 51
51#define MSI_ADDR_IR_EXT_INT (1 << 4) 52#define MSI_ADDR_IR_EXT_INT (1 << 4)
52#define MSI_ADDR_IR_SHV (1 << 3) 53#define MSI_ADDR_IR_SHV (1 << 3)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 358acc59ae04..ec41fc16c167 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -18,11 +18,15 @@
18#define _EFER_LME 8 /* Long mode enable */ 18#define _EFER_LME 8 /* Long mode enable */
19#define _EFER_LMA 10 /* Long mode active (read-only) */ 19#define _EFER_LMA 10 /* Long mode active (read-only) */
20#define _EFER_NX 11 /* No execute enable */ 20#define _EFER_NX 11 /* No execute enable */
21#define _EFER_SVME 12 /* Enable virtualization */
22#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
21 23
22#define EFER_SCE (1<<_EFER_SCE) 24#define EFER_SCE (1<<_EFER_SCE)
23#define EFER_LME (1<<_EFER_LME) 25#define EFER_LME (1<<_EFER_LME)
24#define EFER_LMA (1<<_EFER_LMA) 26#define EFER_LMA (1<<_EFER_LMA)
25#define EFER_NX (1<<_EFER_NX) 27#define EFER_NX (1<<_EFER_NX)
28#define EFER_SVME (1<<_EFER_SVME)
29#define EFER_FFXSR (1<<_EFER_FFXSR)
26 30
27/* Intel MSRs. Some also available on other CPUs */ 31/* Intel MSRs. Some also available on other CPUs */
28#define MSR_IA32_PERFCTR0 0x000000c1 32#define MSR_IA32_PERFCTR0 0x000000c1
@@ -77,6 +81,11 @@
77#define MSR_IA32_MC0_ADDR 0x00000402 81#define MSR_IA32_MC0_ADDR 0x00000402
78#define MSR_IA32_MC0_MISC 0x00000403 82#define MSR_IA32_MC0_MISC 0x00000403
79 83
84/* These are consecutive and not in the normal 4er MCE bank block */
85#define MSR_IA32_MC0_CTL2 0x00000280
86#define CMCI_EN (1ULL << 30)
87#define CMCI_THRESHOLD_MASK 0xffffULL
88
80#define MSR_P6_PERFCTR0 0x000000c1 89#define MSR_P6_PERFCTR0 0x000000c1
81#define MSR_P6_PERFCTR1 0x000000c2 90#define MSR_P6_PERFCTR1 0x000000c2
82#define MSR_P6_EVNTSEL0 0x00000186 91#define MSR_P6_EVNTSEL0 0x00000186
@@ -360,4 +369,9 @@
360#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b 369#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
361#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c 370#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
362 371
372/* AMD-V MSRs */
373
374#define MSR_VM_CR 0xc0010114
375#define MSR_VM_HSAVE_PA 0xc0010117
376
363#endif /* _ASM_X86_MSR_INDEX_H */ 377#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index e9f5db796244..a37229011b56 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -4,8 +4,12 @@
4extern int pxm_to_nid(int pxm); 4extern int pxm_to_nid(int pxm);
5extern void numa_remove_cpu(int cpu); 5extern void numa_remove_cpu(int cpu);
6 6
7#ifdef CONFIG_NUMA 7#ifdef CONFIG_HIGHMEM
8extern void set_highmem_pages_init(void); 8extern void set_highmem_pages_init(void);
9#else
10static inline void set_highmem_pages_init(void)
11{
12}
9#endif 13#endif
10 14
11#endif /* _ASM_X86_NUMA_32_H */ 15#endif /* _ASM_X86_NUMA_32_H */
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index f1e4a79a6e41..0f915ae649a7 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -39,6 +39,11 @@
39#define __VIRTUAL_MASK_SHIFT 32 39#define __VIRTUAL_MASK_SHIFT 32
40#endif /* CONFIG_X86_PAE */ 40#endif /* CONFIG_X86_PAE */
41 41
42/*
43 * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S)
44 */
45#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
46
42#ifndef __ASSEMBLY__ 47#ifndef __ASSEMBLY__
43 48
44/* 49/*
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 2d625da6603c..826ad37006ab 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,14 +40,8 @@
40 40
41#ifndef __ASSEMBLY__ 41#ifndef __ASSEMBLY__
42 42
43struct pgprot;
44
45extern int page_is_ram(unsigned long pagenr); 43extern int page_is_ram(unsigned long pagenr);
46extern int devmem_is_allowed(unsigned long pagenr); 44extern int devmem_is_allowed(unsigned long pagenr);
47extern void map_devmem(unsigned long pfn, unsigned long size,
48 struct pgprot vma_prot);
49extern void unmap_devmem(unsigned long pfn, unsigned long size,
50 struct pgprot vma_prot);
51 45
52extern unsigned long max_low_pfn_mapped; 46extern unsigned long max_low_pfn_mapped;
53extern unsigned long max_pfn_mapped; 47extern unsigned long max_pfn_mapped;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 0617d5cc9712..7727aa8b7dda 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -317,8 +317,6 @@ struct pv_mmu_ops {
317#if PAGETABLE_LEVELS >= 3 317#if PAGETABLE_LEVELS >= 3
318#ifdef CONFIG_X86_PAE 318#ifdef CONFIG_X86_PAE
319 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); 319 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
320 void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
321 pte_t *ptep, pte_t pte);
322 void (*pte_clear)(struct mm_struct *mm, unsigned long addr, 320 void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
323 pte_t *ptep); 321 pte_t *ptep);
324 void (*pmd_clear)(pmd_t *pmdp); 322 void (*pmd_clear)(pmd_t *pmdp);
@@ -389,7 +387,7 @@ extern struct pv_lock_ops pv_lock_ops;
389 387
390#define paravirt_type(op) \ 388#define paravirt_type(op) \
391 [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ 389 [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
392 [paravirt_opptr] "m" (op) 390 [paravirt_opptr] "i" (&(op))
393#define paravirt_clobber(clobber) \ 391#define paravirt_clobber(clobber) \
394 [paravirt_clobber] "i" (clobber) 392 [paravirt_clobber] "i" (clobber)
395 393
@@ -443,7 +441,7 @@ int paravirt_disable_iospace(void);
443 * offset into the paravirt_patch_template structure, and can therefore be 441 * offset into the paravirt_patch_template structure, and can therefore be
444 * freely converted back into a structure offset. 442 * freely converted back into a structure offset.
445 */ 443 */
446#define PARAVIRT_CALL "call *%[paravirt_opptr];" 444#define PARAVIRT_CALL "call *%c[paravirt_opptr];"
447 445
448/* 446/*
449 * These macros are intended to wrap calls through one of the paravirt 447 * These macros are intended to wrap calls through one of the paravirt
@@ -1365,13 +1363,6 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
1365 pte.pte, pte.pte >> 32); 1363 pte.pte, pte.pte >> 32);
1366} 1364}
1367 1365
1368static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
1369 pte_t *ptep, pte_t pte)
1370{
1371 /* 5 arg words */
1372 pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
1373}
1374
1375static inline void pte_clear(struct mm_struct *mm, unsigned long addr, 1366static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
1376 pte_t *ptep) 1367 pte_t *ptep)
1377{ 1368{
@@ -1388,12 +1379,6 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
1388 set_pte(ptep, pte); 1379 set_pte(ptep, pte);
1389} 1380}
1390 1381
1391static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
1392 pte_t *ptep, pte_t pte)
1393{
1394 set_pte(ptep, pte);
1395}
1396
1397static inline void pte_clear(struct mm_struct *mm, unsigned long addr, 1382static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
1398 pte_t *ptep) 1383 pte_t *ptep)
1399{ 1384{
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index b0e70056838e..2cd07b9422f4 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_PAT_H 2#define _ASM_X86_PAT_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <asm/pgtable_types.h>
5 6
6#ifdef CONFIG_X86_PAT 7#ifdef CONFIG_X86_PAT
7extern int pat_enabled; 8extern int pat_enabled;
@@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end);
17 18
18extern int kernel_map_sync_memtype(u64 base, unsigned long size, 19extern int kernel_map_sync_memtype(u64 base, unsigned long size,
19 unsigned long flag); 20 unsigned long flag);
21extern void map_devmem(unsigned long pfn, unsigned long size,
22 struct pgprot vma_prot);
23extern void unmap_devmem(unsigned long pfn, unsigned long size,
24 struct pgprot vma_prot);
20 25
21#endif /* _ASM_X86_PAT_H */ 26#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index a977de23cb4d..b51a1e8b0baf 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -86,12 +86,43 @@ static inline void early_quirks(void) { }
86 86
87extern void pci_iommu_alloc(void); 87extern void pci_iommu_alloc(void);
88 88
89#endif /* __KERNEL__ */ 89/* MSI arch hook */
90#define arch_setup_msi_irqs arch_setup_msi_irqs
91
92#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
93
94#if defined(CONFIG_X86_64) || defined(CONFIG_DMA_API_DEBUG)
95
96#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
97 dma_addr_t ADDR_NAME;
98#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
99 __u32 LEN_NAME;
100#define pci_unmap_addr(PTR, ADDR_NAME) \
101 ((PTR)->ADDR_NAME)
102#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
103 (((PTR)->ADDR_NAME) = (VAL))
104#define pci_unmap_len(PTR, LEN_NAME) \
105 ((PTR)->LEN_NAME)
106#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
107 (((PTR)->LEN_NAME) = (VAL))
90 108
91#ifdef CONFIG_X86_32
92# include "pci_32.h"
93#else 109#else
94# include "pci_64.h" 110
111#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
112#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
113#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
114#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
115 do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
116#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
117#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
118 do { break; } while (pci_unmap_len(PTR, LEN_NAME))
119
120#endif
121
122#endif /* __KERNEL__ */
123
124#ifdef CONFIG_X86_64
125#include "pci_64.h"
95#endif 126#endif
96 127
97/* implement the pci_ DMA API in terms of the generic device dma_ one */ 128/* implement the pci_ DMA API in terms of the generic device dma_ one */
@@ -109,11 +140,6 @@ static inline int __pcibus_to_node(const struct pci_bus *bus)
109 return sd->node; 140 return sd->node;
110} 141}
111 142
112static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
113{
114 return node_to_cpumask(__pcibus_to_node(bus));
115}
116
117static inline const struct cpumask * 143static inline const struct cpumask *
118cpumask_of_pcibus(const struct pci_bus *bus) 144cpumask_of_pcibus(const struct pci_bus *bus)
119{ 145{
diff --git a/arch/x86/include/asm/pci_32.h b/arch/x86/include/asm/pci_32.h
deleted file mode 100644
index 6f1213a6ef4f..000000000000
--- a/arch/x86/include/asm/pci_32.h
+++ /dev/null
@@ -1,34 +0,0 @@
1#ifndef _ASM_X86_PCI_32_H
2#define _ASM_X86_PCI_32_H
3
4
5#ifdef __KERNEL__
6
7
8/* Dynamic DMA mapping stuff.
9 * i386 has everything mapped statically.
10 */
11
12struct pci_dev;
13
14/* The PCI address space does equal the physical memory
15 * address space. The networking and block device layers use
16 * this boolean for bounce buffer decisions.
17 */
18#define PCI_DMA_BUS_IS_PHYS (1)
19
20/* pci_unmap_{page,single} is a nop so... */
21#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
22#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
23#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
24#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
25 do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
26#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
27#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
28 do { break; } while (pci_unmap_len(PTR, LEN_NAME))
29
30
31#endif /* __KERNEL__ */
32
33
34#endif /* _ASM_X86_PCI_32_H */
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index 4da207982777..ae5e40f67daf 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -24,28 +24,6 @@ extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
24 24
25extern void dma32_reserve_bootmem(void); 25extern void dma32_reserve_bootmem(void);
26 26
27/* The PCI address space does equal the physical memory
28 * address space. The networking and block device layers use
29 * this boolean for bounce buffer decisions
30 *
31 * On AMD64 it mostly equals, but we set it to zero if a hardware
32 * IOMMU (gart) of sotware IOMMU (swiotlb) is available.
33 */
34#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
35
36#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
37 dma_addr_t ADDR_NAME;
38#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
39 __u32 LEN_NAME;
40#define pci_unmap_addr(PTR, ADDR_NAME) \
41 ((PTR)->ADDR_NAME)
42#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
43 (((PTR)->ADDR_NAME) = (VAL))
44#define pci_unmap_len(PTR, LEN_NAME) \
45 ((PTR)->LEN_NAME)
46#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
47 (((PTR)->LEN_NAME) = (VAL))
48
49#endif /* __KERNEL__ */ 27#endif /* __KERNEL__ */
50 28
51#endif /* _ASM_X86_PCI_64_H */ 29#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index c1774ac9da7a..2334982b339e 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -26,13 +26,6 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
26 native_set_pte(ptep, pte); 26 native_set_pte(ptep, pte);
27} 27}
28 28
29static inline void native_set_pte_present(struct mm_struct *mm,
30 unsigned long addr,
31 pte_t *ptep, pte_t pte)
32{
33 native_set_pte(ptep, pte);
34}
35
36static inline void native_pmd_clear(pmd_t *pmdp) 29static inline void native_pmd_clear(pmd_t *pmdp)
37{ 30{
38 native_set_pmd(pmdp, __pmd(0)); 31 native_set_pmd(pmdp, __pmd(0));
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 3f13cdf61156..177b0165ea01 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -31,23 +31,6 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
31 ptep->pte_low = pte.pte_low; 31 ptep->pte_low = pte.pte_low;
32} 32}
33 33
34/*
35 * Since this is only called on user PTEs, and the page fault handler
36 * must handle the already racy situation of simultaneous page faults,
37 * we are justified in merely clearing the PTE present bit, followed
38 * by a set. The ordering here is important.
39 */
40static inline void native_set_pte_present(struct mm_struct *mm,
41 unsigned long addr,
42 pte_t *ptep, pte_t pte)
43{
44 ptep->pte_low = 0;
45 smp_wmb();
46 ptep->pte_high = pte.pte_high;
47 smp_wmb();
48 ptep->pte_low = pte.pte_low;
49}
50
51static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) 34static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
52{ 35{
53 set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); 36 set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c097a3a6669..29d96d168bc0 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -31,8 +31,6 @@ extern struct list_head pgd_list;
31#define set_pte(ptep, pte) native_set_pte(ptep, pte) 31#define set_pte(ptep, pte) native_set_pte(ptep, pte)
32#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) 32#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
33 33
34#define set_pte_present(mm, addr, ptep, pte) \
35 native_set_pte_present(mm, addr, ptep, pte)
36#define set_pte_atomic(ptep, pte) \ 34#define set_pte_atomic(ptep, pte) \
37 native_set_pte_atomic(ptep, pte) 35 native_set_pte_atomic(ptep, pte)
38 36
@@ -288,6 +286,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
288 return 1; 286 return 1;
289} 287}
290 288
289pmd_t *populate_extra_pmd(unsigned long vaddr);
290pte_t *populate_extra_pte(unsigned long vaddr);
291#endif /* __ASSEMBLY__ */ 291#endif /* __ASSEMBLY__ */
292 292
293#ifdef CONFIG_X86_32 293#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 97612fc7632f..31bd120cf2a2 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -42,9 +42,6 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
42 */ 42 */
43#undef TEST_ACCESS_OK 43#undef TEST_ACCESS_OK
44 44
45/* The boot page tables (all created as a single array) */
46extern unsigned long pg0[];
47
48#ifdef CONFIG_X86_PAE 45#ifdef CONFIG_X86_PAE
49# include <asm/pgtable-3level.h> 46# include <asm/pgtable-3level.h>
50#else 47#else
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index bd8df3b2fe04..2733fad45f98 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -25,6 +25,11 @@
25 * area for the same reason. ;) 25 * area for the same reason. ;)
26 */ 26 */
27#define VMALLOC_OFFSET (8 * 1024 * 1024) 27#define VMALLOC_OFFSET (8 * 1024 * 1024)
28
29#ifndef __ASSEMBLER__
30extern bool __vmalloc_start_set; /* set once high_memory is set */
31#endif
32
28#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) 33#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
29#ifdef CONFIG_X86_PAE 34#ifdef CONFIG_X86_PAE
30#define LAST_PKMAP 512 35#define LAST_PKMAP 512
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0f..b8238dc8786d 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,6 +273,7 @@ typedef struct page *pgtable_t;
273 273
274extern pteval_t __supported_pte_mask; 274extern pteval_t __supported_pte_mask;
275extern int nx_enabled; 275extern int nx_enabled;
276extern void set_nx(void);
276 277
277#define pgprot_writecombine pgprot_writecombine 278#define pgprot_writecombine pgprot_writecombine
278extern pgprot_t pgprot_writecombine(pgprot_t prot); 279extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c7a98f738210..34c52370f2fe 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -75,9 +75,9 @@ struct cpuinfo_x86 {
75#else 75#else
76 /* Number of 4K pages in DTLB/ITLB combined(in pages): */ 76 /* Number of 4K pages in DTLB/ITLB combined(in pages): */
77 int x86_tlbsize; 77 int x86_tlbsize;
78#endif
78 __u8 x86_virt_bits; 79 __u8 x86_virt_bits;
79 __u8 x86_phys_bits; 80 __u8 x86_phys_bits;
80#endif
81 /* CPUID returned core id bits: */ 81 /* CPUID returned core id bits: */
82 __u8 x86_coreid_bits; 82 __u8 x86_coreid_bits;
83 /* Max extended CPUID function supported: */ 83 /* Max extended CPUID function supported: */
@@ -94,7 +94,7 @@ struct cpuinfo_x86 {
94 unsigned long loops_per_jiffy; 94 unsigned long loops_per_jiffy;
95#ifdef CONFIG_SMP 95#ifdef CONFIG_SMP
96 /* cpus sharing the last level cache: */ 96 /* cpus sharing the last level cache: */
97 cpumask_t llc_shared_map; 97 cpumask_var_t llc_shared_map;
98#endif 98#endif
99 /* cpuid returned max cores value: */ 99 /* cpuid returned max cores value: */
100 u16 x86_max_cores; 100 u16 x86_max_cores;
@@ -248,7 +248,6 @@ struct x86_hw_tss {
248#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) 248#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
249#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) 249#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
250#define INVALID_IO_BITMAP_OFFSET 0x8000 250#define INVALID_IO_BITMAP_OFFSET 0x8000
251#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
252 251
253struct tss_struct { 252struct tss_struct {
254 /* 253 /*
@@ -263,11 +262,6 @@ struct tss_struct {
263 * be within the limit. 262 * be within the limit.
264 */ 263 */
265 unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; 264 unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
266 /*
267 * Cache the current maximum and the last task that used the bitmap:
268 */
269 unsigned long io_bitmap_max;
270 struct thread_struct *io_bitmap_owner;
271 265
272 /* 266 /*
273 * .. and then another 0x100 bytes for the emergency kernel stack: 267 * .. and then another 0x100 bytes for the emergency kernel stack:
@@ -397,6 +391,9 @@ DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
397DECLARE_INIT_PER_CPU(irq_stack_union); 391DECLARE_INIT_PER_CPU(irq_stack_union);
398 392
399DECLARE_PER_CPU(char *, irq_stack_ptr); 393DECLARE_PER_CPU(char *, irq_stack_ptr);
394DECLARE_PER_CPU(unsigned int, irq_count);
395extern unsigned long kernel_eflags;
396extern asmlinkage void ignore_sysret(void);
400#else /* X86_64 */ 397#else /* X86_64 */
401#ifdef CONFIG_CC_STACKPROTECTOR 398#ifdef CONFIG_CC_STACKPROTECTOR
402DECLARE_PER_CPU(unsigned long, stack_canary); 399DECLARE_PER_CPU(unsigned long, stack_canary);
@@ -739,6 +736,7 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
739extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); 736extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
740 737
741extern void select_idle_routine(const struct cpuinfo_x86 *c); 738extern void select_idle_routine(const struct cpuinfo_x86 *c);
739extern void init_c1e_mask(void);
742 740
743extern unsigned long boot_option_idle_override; 741extern unsigned long boot_option_idle_override;
744extern unsigned long idle_halt; 742extern unsigned long idle_halt;
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 8e0f8d199e05..86723035a515 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -80,8 +80,6 @@
80 80
81#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */ 81#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */
82 82
83#ifdef CONFIG_X86_PTRACE_BTS
84
85#ifndef __ASSEMBLY__ 83#ifndef __ASSEMBLY__
86#include <linux/types.h> 84#include <linux/types.h>
87 85
@@ -140,6 +138,5 @@ struct ptrace_bts_config {
140 BTS records are read from oldest to newest. 138 BTS records are read from oldest to newest.
141 Returns number of BTS records drained. 139 Returns number of BTS records drained.
142*/ 140*/
143#endif /* CONFIG_X86_PTRACE_BTS */
144 141
145#endif /* _ASM_X86_PTRACE_ABI_H */ 142#endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 2b8c5160388f..1b7ee5d673c2 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -1 +1,8 @@
1#ifndef _ASM_X86_SECTIONS_H
2#define _ASM_X86_SECTIONS_H
3
1#include <asm-generic/sections.h> 4#include <asm-generic/sections.h>
5
6extern char __brk_base[], __brk_limit[];
7
8#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 66801cb72f69..bdc2ada05ae0 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -31,7 +31,6 @@ struct x86_quirks {
31 void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, 31 void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable,
32 unsigned short oemsize); 32 unsigned short oemsize);
33 int (*setup_ioapic_ids)(void); 33 int (*setup_ioapic_ids)(void);
34 int (*update_apic)(void);
35}; 34};
36 35
37extern void x86_quirk_pre_intr_init(void); 36extern void x86_quirk_pre_intr_init(void);
@@ -65,7 +64,11 @@ extern void x86_quirk_time_init(void);
65#include <asm/bootparam.h> 64#include <asm/bootparam.h>
66 65
67/* Interrupt control for vSMPowered x86_64 systems */ 66/* Interrupt control for vSMPowered x86_64 systems */
67#ifdef CONFIG_X86_64
68void vsmp_init(void); 68void vsmp_init(void);
69#else
70static inline void vsmp_init(void) { }
71#endif
69 72
70void setup_bios_corruption_check(void); 73void setup_bios_corruption_check(void);
71 74
@@ -77,8 +80,6 @@ static inline void visws_early_detect(void) { }
77static inline int is_visws_box(void) { return 0; } 80static inline int is_visws_box(void) { return 0; }
78#endif 81#endif
79 82
80extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
81extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip);
82extern struct x86_quirks *x86_quirks; 83extern struct x86_quirks *x86_quirks;
83extern unsigned long saved_video_mode; 84extern unsigned long saved_video_mode;
84 85
@@ -99,20 +100,51 @@ extern struct boot_params boot_params;
99 */ 100 */
100#define LOWMEMSIZE() (0x9f000) 101#define LOWMEMSIZE() (0x9f000)
101 102
103/* exceedingly early brk-like allocator */
104extern unsigned long _brk_end;
105void *extend_brk(size_t size, size_t align);
106
107/*
108 * Reserve space in the brk section. The name must be unique within
109 * the file, and somewhat descriptive. The size is in bytes. Must be
110 * used at file scope.
111 *
112 * (This uses a temp function to wrap the asm so we can pass it the
113 * size parameter; otherwise we wouldn't be able to. We can't use a
114 * "section" attribute on a normal variable because it always ends up
115 * being @progbits, which ends up allocating space in the vmlinux
116 * executable.)
117 */
118#define RESERVE_BRK(name,sz) \
119 static void __section(.discard) __used \
120 __brk_reservation_fn_##name##__(void) { \
121 asm volatile ( \
122 ".pushsection .brk_reservation,\"aw\",@nobits;" \
123 ".brk." #name ":" \
124 " 1:.skip %c0;" \
125 " .size .brk." #name ", . - 1b;" \
126 " .popsection" \
127 : : "i" (sz)); \
128 }
129
102#ifdef __i386__ 130#ifdef __i386__
103 131
104void __init i386_start_kernel(void); 132void __init i386_start_kernel(void);
105extern void probe_roms(void); 133extern void probe_roms(void);
106 134
107extern unsigned long init_pg_tables_start;
108extern unsigned long init_pg_tables_end;
109
110#else 135#else
111void __init x86_64_start_kernel(char *real_mode); 136void __init x86_64_start_kernel(char *real_mode);
112void __init x86_64_start_reservations(char *real_mode_data); 137void __init x86_64_start_reservations(char *real_mode_data);
113 138
114#endif /* __i386__ */ 139#endif /* __i386__ */
115#endif /* _SETUP */ 140#endif /* _SETUP */
141#else
142#define RESERVE_BRK(name,sz) \
143 .pushsection .brk_reservation,"aw",@nobits; \
144.brk.name: \
1451: .skip sz; \
146 .size .brk.name,.-1b; \
147 .popsection
116#endif /* __ASSEMBLY__ */ 148#endif /* __ASSEMBLY__ */
117#endif /* __KERNEL__ */ 149#endif /* __KERNEL__ */
118 150
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 47d0e21f2b9e..19e0d88b966d 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -21,19 +21,19 @@
21extern int smp_num_siblings; 21extern int smp_num_siblings;
22extern unsigned int num_processors; 22extern unsigned int num_processors;
23 23
24DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); 24DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
25DECLARE_PER_CPU(cpumask_t, cpu_core_map); 25DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
26DECLARE_PER_CPU(u16, cpu_llc_id); 26DECLARE_PER_CPU(u16, cpu_llc_id);
27DECLARE_PER_CPU(int, cpu_number); 27DECLARE_PER_CPU(int, cpu_number);
28 28
29static inline struct cpumask *cpu_sibling_mask(int cpu) 29static inline struct cpumask *cpu_sibling_mask(int cpu)
30{ 30{
31 return &per_cpu(cpu_sibling_map, cpu); 31 return per_cpu(cpu_sibling_map, cpu);
32} 32}
33 33
34static inline struct cpumask *cpu_core_mask(int cpu) 34static inline struct cpumask *cpu_core_mask(int cpu)
35{ 35{
36 return &per_cpu(cpu_core_map, cpu); 36 return per_cpu(cpu_core_map, cpu);
37} 37}
38 38
39DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); 39DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
@@ -121,9 +121,10 @@ static inline void arch_send_call_function_single_ipi(int cpu)
121 smp_ops.send_call_func_single_ipi(cpu); 121 smp_ops.send_call_func_single_ipi(cpu);
122} 122}
123 123
124static inline void arch_send_call_function_ipi(cpumask_t mask) 124#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
125static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
125{ 126{
126 smp_ops.send_call_func_ipi(&mask); 127 smp_ops.send_call_func_ipi(mask);
127} 128}
128 129
129void cpu_disable_common(void); 130void cpu_disable_common(void);
diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h
index 8ab9cc8b2ecc..ca8bf2cd0ba9 100644
--- a/arch/x86/include/asm/socket.h
+++ b/arch/x86/include/asm/socket.h
@@ -54,4 +54,7 @@
54 54
55#define SO_MARK 36 55#define SO_MARK 36
56 56
57#define SO_TIMESTAMPING 37
58#define SCM_TIMESTAMPING SO_TIMESTAMPING
59
57#endif /* _ASM_X86_SOCKET_H */ 60#endif /* _ASM_X86_SOCKET_H */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 3a5696656680..e5e6caffec87 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -295,6 +295,9 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
295 : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); 295 : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
296} 296}
297 297
298#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
299#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
300
298#define _raw_spin_relax(lock) cpu_relax() 301#define _raw_spin_relax(lock) cpu_relax()
299#define _raw_read_relax(lock) cpu_relax() 302#define _raw_read_relax(lock) cpu_relax()
300#define _raw_write_relax(lock) cpu_relax() 303#define _raw_write_relax(lock) cpu_relax()
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index a5074bd0f8be..48dcfa62ea07 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -24,28 +24,4 @@ struct saved_context {
24 unsigned long return_address; 24 unsigned long return_address;
25} __attribute__((packed)); 25} __attribute__((packed));
26 26
27#ifdef CONFIG_ACPI
28extern unsigned long saved_eip;
29extern unsigned long saved_esp;
30extern unsigned long saved_ebp;
31extern unsigned long saved_ebx;
32extern unsigned long saved_esi;
33extern unsigned long saved_edi;
34
35static inline void acpi_save_register_state(unsigned long return_point)
36{
37 saved_eip = return_point;
38 asm volatile("movl %%esp,%0" : "=m" (saved_esp));
39 asm volatile("movl %%ebp,%0" : "=m" (saved_ebp));
40 asm volatile("movl %%ebx,%0" : "=m" (saved_ebx));
41 asm volatile("movl %%edi,%0" : "=m" (saved_edi));
42 asm volatile("movl %%esi,%0" : "=m" (saved_esi));
43}
44
45#define acpi_restore_register_state() do {} while (0)
46
47/* routines for saving/restoring kernel state */
48extern int acpi_save_state_mem(void);
49#endif
50
51#endif /* _ASM_X86_SUSPEND_32_H */ 27#endif /* _ASM_X86_SUSPEND_32_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1b8afa78e869..82ada75f3ebf 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -174,10 +174,6 @@ struct __attribute__ ((__packed__)) vmcb {
174#define SVM_CPUID_FEATURE_SHIFT 2 174#define SVM_CPUID_FEATURE_SHIFT 2
175#define SVM_CPUID_FUNC 0x8000000a 175#define SVM_CPUID_FUNC 0x8000000a
176 176
177#define MSR_EFER_SVME_MASK (1ULL << 12)
178#define MSR_VM_CR 0xc0010114
179#define MSR_VM_HSAVE_PA 0xc0010117ULL
180
181#define SVM_VM_CR_SVM_DISABLE 4 177#define SVM_VM_CR_SVM_DISABLE 4
182 178
183#define SVM_SELECTOR_S_SHIFT 4 179#define SVM_SELECTOR_S_SHIFT 4
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index ffb08be2a530..72a6dcd1299b 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -70,8 +70,6 @@ struct old_utsname;
70asmlinkage long sys32_olduname(struct oldold_utsname __user *); 70asmlinkage long sys32_olduname(struct oldold_utsname __user *);
71long sys32_uname(struct old_utsname __user *); 71long sys32_uname(struct old_utsname __user *);
72 72
73long sys32_ustat(unsigned, struct ustat32 __user *);
74
75asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, 73asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
76 compat_uptr_t __user *, struct pt_regs *); 74 compat_uptr_t __user *, struct pt_regs *);
77asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); 75asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index c00bfdbdd456..643c59b4bc6e 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -20,6 +20,9 @@
20struct task_struct; /* one of the stranger aspects of C forward declarations */ 20struct task_struct; /* one of the stranger aspects of C forward declarations */
21struct task_struct *__switch_to(struct task_struct *prev, 21struct task_struct *__switch_to(struct task_struct *prev,
22 struct task_struct *next); 22 struct task_struct *next);
23struct tss_struct;
24void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
25 struct tss_struct *tss);
23 26
24#ifdef CONFIG_X86_32 27#ifdef CONFIG_X86_32
25 28
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ca7310e02446..3ffd5d2a3676 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,7 @@ struct thread_info {
95#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 95#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
96#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 96#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
97#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ 97#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
98#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */
98 99
99#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 100#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
100#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 101#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -117,15 +118,17 @@ struct thread_info {
117#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 118#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
118#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 119#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
119#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) 120#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
121#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE)
120 122
121/* work to do in syscall_trace_enter() */ 123/* work to do in syscall_trace_enter() */
122#define _TIF_WORK_SYSCALL_ENTRY \ 124#define _TIF_WORK_SYSCALL_ENTRY \
123 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \ 125 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \
124 _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) 126 _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
125 127
126/* work to do in syscall_trace_leave() */ 128/* work to do in syscall_trace_leave() */
127#define _TIF_WORK_SYSCALL_EXIT \ 129#define _TIF_WORK_SYSCALL_EXIT \
128 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP) 130 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
131 _TIF_SYSCALL_FTRACE)
129 132
130/* work to do on interrupt/exception return */ 133/* work to do on interrupt/exception return */
131#define _TIF_WORK_MASK \ 134#define _TIF_WORK_MASK \
@@ -134,7 +137,7 @@ struct thread_info {
134 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) 137 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
135 138
136/* work to do on any return to user space */ 139/* work to do on any return to user space */
137#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) 140#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE)
138 141
139/* Only used for 64 bit */ 142/* Only used for 64 bit */
140#define _TIF_DO_NOTIFY_MASK \ 143#define _TIF_DO_NOTIFY_MASK \
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index a81195eaa2b3..bd37ed444a21 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -12,9 +12,9 @@ unsigned long native_calibrate_tsc(void);
12 12
13#ifdef CONFIG_X86_32 13#ifdef CONFIG_X86_32
14extern int timer_ack; 14extern int timer_ack;
15extern int recalibrate_cpu_khz(void);
16extern irqreturn_t timer_interrupt(int irq, void *dev_id); 15extern irqreturn_t timer_interrupt(int irq, void *dev_id);
17#endif /* CONFIG_X86_32 */ 16#endif /* CONFIG_X86_32 */
17extern int recalibrate_cpu_khz(void);
18 18
19extern int no_timer_check; 19extern int no_timer_check;
20 20
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 77cfb2cfb386..892b119dba6f 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -44,9 +44,6 @@
44 44
45#ifdef CONFIG_X86_32 45#ifdef CONFIG_X86_32
46 46
47/* Mappings between node number and cpus on that node. */
48extern cpumask_t node_to_cpumask_map[];
49
50/* Mappings between logical cpu number and node number */ 47/* Mappings between logical cpu number and node number */
51extern int cpu_to_node_map[]; 48extern int cpu_to_node_map[];
52 49
@@ -57,30 +54,8 @@ static inline int cpu_to_node(int cpu)
57} 54}
58#define early_cpu_to_node(cpu) cpu_to_node(cpu) 55#define early_cpu_to_node(cpu) cpu_to_node(cpu)
59 56
60/* Returns a bitmask of CPUs on Node 'node'.
61 *
62 * Side note: this function creates the returned cpumask on the stack
63 * so with a high NR_CPUS count, excessive stack space is used. The
64 * cpumask_of_node function should be used whenever possible.
65 */
66static inline cpumask_t node_to_cpumask(int node)
67{
68 return node_to_cpumask_map[node];
69}
70
71/* Returns a bitmask of CPUs on Node 'node'. */
72static inline const struct cpumask *cpumask_of_node(int node)
73{
74 return &node_to_cpumask_map[node];
75}
76
77static inline void setup_node_to_cpumask_map(void) { }
78
79#else /* CONFIG_X86_64 */ 57#else /* CONFIG_X86_64 */
80 58
81/* Mappings between node number and cpus on that node. */
82extern cpumask_t *node_to_cpumask_map;
83
84/* Mappings between logical cpu number and node number */ 59/* Mappings between logical cpu number and node number */
85DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); 60DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
86 61
@@ -91,8 +66,6 @@ DECLARE_PER_CPU(int, node_number);
91#ifdef CONFIG_DEBUG_PER_CPU_MAPS 66#ifdef CONFIG_DEBUG_PER_CPU_MAPS
92extern int cpu_to_node(int cpu); 67extern int cpu_to_node(int cpu);
93extern int early_cpu_to_node(int cpu); 68extern int early_cpu_to_node(int cpu);
94extern const cpumask_t *cpumask_of_node(int node);
95extern cpumask_t node_to_cpumask(int node);
96 69
97#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ 70#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
98 71
@@ -108,42 +81,32 @@ static inline int early_cpu_to_node(int cpu)
108 return early_per_cpu(x86_cpu_to_node_map, cpu); 81 return early_per_cpu(x86_cpu_to_node_map, cpu);
109} 82}
110 83
111/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ 84#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
112static inline const cpumask_t *cpumask_of_node(int node) 85
113{ 86#endif /* CONFIG_X86_64 */
114 return &node_to_cpumask_map[node];
115}
116 87
117/* Returns a bitmask of CPUs on Node 'node'. */ 88/* Mappings between node number and cpus on that node. */
118static inline cpumask_t node_to_cpumask(int node) 89extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
90
91#ifdef CONFIG_DEBUG_PER_CPU_MAPS
92extern const struct cpumask *cpumask_of_node(int node);
93#else
94/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
95static inline const struct cpumask *cpumask_of_node(int node)
119{ 96{
120 return node_to_cpumask_map[node]; 97 return node_to_cpumask_map[node];
121} 98}
122 99#endif
123#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
124 100
125extern void setup_node_to_cpumask_map(void); 101extern void setup_node_to_cpumask_map(void);
126 102
127/* 103/*
128 * Replace default node_to_cpumask_ptr with optimized version
129 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
130 */
131#define node_to_cpumask_ptr(v, node) \
132 const cpumask_t *v = cpumask_of_node(node)
133
134#define node_to_cpumask_ptr_next(v, node) \
135 v = cpumask_of_node(node)
136
137#endif /* CONFIG_X86_64 */
138
139/*
140 * Returns the number of the node containing Node 'node'. This 104 * Returns the number of the node containing Node 'node'. This
141 * architecture is flat, so it is a pretty simple function! 105 * architecture is flat, so it is a pretty simple function!
142 */ 106 */
143#define parent_node(node) (node) 107#define parent_node(node) (node)
144 108
145#define pcibus_to_node(bus) __pcibus_to_node(bus) 109#define pcibus_to_node(bus) __pcibus_to_node(bus)
146#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)
147 110
148#ifdef CONFIG_X86_32 111#ifdef CONFIG_X86_32
149extern unsigned long node_start_pfn[]; 112extern unsigned long node_start_pfn[];
@@ -209,52 +172,24 @@ static inline int early_cpu_to_node(int cpu)
209 return 0; 172 return 0;
210} 173}
211 174
212static inline const cpumask_t *cpumask_of_node(int node) 175static inline const struct cpumask *cpumask_of_node(int node)
213{
214 return &cpu_online_map;
215}
216static inline cpumask_t node_to_cpumask(int node)
217{
218 return cpu_online_map;
219}
220static inline int node_to_first_cpu(int node)
221{ 176{
222 return first_cpu(cpu_online_map); 177 return cpu_online_mask;
223} 178}
224 179
225static inline void setup_node_to_cpumask_map(void) { } 180static inline void setup_node_to_cpumask_map(void) { }
226 181
227/*
228 * Replace default node_to_cpumask_ptr with optimized version
229 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
230 */
231#define node_to_cpumask_ptr(v, node) \
232 const cpumask_t *v = cpumask_of_node(node)
233
234#define node_to_cpumask_ptr_next(v, node) \
235 v = cpumask_of_node(node)
236#endif 182#endif
237 183
238#include <asm-generic/topology.h> 184#include <asm-generic/topology.h>
239 185
240#ifdef CONFIG_NUMA
241/* Returns the number of the first CPU on Node 'node'. */
242static inline int node_to_first_cpu(int node)
243{
244 return cpumask_first(cpumask_of_node(node));
245}
246#endif
247
248extern cpumask_t cpu_coregroup_map(int cpu);
249extern const struct cpumask *cpu_coregroup_mask(int cpu); 186extern const struct cpumask *cpu_coregroup_mask(int cpu);
250 187
251#ifdef ENABLE_TOPO_DEFINES 188#ifdef ENABLE_TOPO_DEFINES
252#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) 189#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
253#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) 190#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
254#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) 191#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
255#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) 192#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
256#define topology_core_cpumask(cpu) (&per_cpu(cpu_core_map, cpu))
257#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu))
258 193
259/* indicates that pointers to the topology cpumask_t maps are valid */ 194/* indicates that pointers to the topology cpumask_t maps are valid */
260#define arch_provides_topology_pointers yes 195#define arch_provides_topology_pointers yes
@@ -268,7 +203,7 @@ struct pci_bus;
268void set_pci_bus_resources_arch_default(struct pci_bus *b); 203void set_pci_bus_resources_arch_default(struct pci_bus *b);
269 204
270#ifdef CONFIG_SMP 205#ifdef CONFIG_SMP
271#define mc_capable() (cpus_weight(per_cpu(cpu_core_map, 0)) != nr_cpu_ids) 206#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)
272#define smt_capable() (smp_num_siblings > 1) 207#define smt_capable() (smp_num_siblings > 1)
273#endif 208#endif
274 209
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index a0ba61386972..5e06259e90e5 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -157,7 +157,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
157} 157}
158 158
159static __always_inline unsigned long __copy_from_user_nocache(void *to, 159static __always_inline unsigned long __copy_from_user_nocache(void *to,
160 const void __user *from, unsigned long n, unsigned long total) 160 const void __user *from, unsigned long n)
161{ 161{
162 might_fault(); 162 might_fault();
163 if (__builtin_constant_p(n)) { 163 if (__builtin_constant_p(n)) {
@@ -180,7 +180,7 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to,
180 180
181static __always_inline unsigned long 181static __always_inline unsigned long
182__copy_from_user_inatomic_nocache(void *to, const void __user *from, 182__copy_from_user_inatomic_nocache(void *to, const void __user *from,
183 unsigned long n, unsigned long total) 183 unsigned long n)
184{ 184{
185 return __copy_from_user_ll_nocache_nozero(to, from, n); 185 return __copy_from_user_ll_nocache_nozero(to, from, n);
186} 186}
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index dcaa0404cf7b..8cc687326eb8 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -188,29 +188,18 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
188extern long __copy_user_nocache(void *dst, const void __user *src, 188extern long __copy_user_nocache(void *dst, const void __user *src,
189 unsigned size, int zerorest); 189 unsigned size, int zerorest);
190 190
191static inline int __copy_from_user_nocache(void *dst, const void __user *src, 191static inline int
192 unsigned size, unsigned long total) 192__copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
193{ 193{
194 might_sleep(); 194 might_sleep();
195 /* 195 return __copy_user_nocache(dst, src, size, 1);
196 * In practice this limit means that large file write()s
197 * which get chunked to 4K copies get handled via
198 * non-temporal stores here. Smaller writes get handled
199 * via regular __copy_from_user():
200 */
201 if (likely(total >= PAGE_SIZE))
202 return __copy_user_nocache(dst, src, size, 1);
203 else
204 return __copy_from_user(dst, src, size);
205} 196}
206 197
207static inline int __copy_from_user_inatomic_nocache(void *dst, 198static inline int
208 const void __user *src, unsigned size, unsigned total) 199__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
200 unsigned size)
209{ 201{
210 if (likely(total >= PAGE_SIZE)) 202 return __copy_user_nocache(dst, src, size, 0);
211 return __copy_user_nocache(dst, src, size, 0);
212 else
213 return __copy_from_user_inatomic(dst, src, size);
214} 203}
215 204
216unsigned long 205unsigned long
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 7e47658b0a6f..0b4d8c2b157d 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,8 @@
338#define __NR_dup3 330 338#define __NR_dup3 330
339#define __NR_pipe2 331 339#define __NR_pipe2 331
340#define __NR_inotify_init1 332 340#define __NR_inotify_init1 332
341#define __NR_preadv 333
342#define __NR_pwritev 334
341#define __NR_perf_counter_open 333 343#define __NR_perf_counter_open 333
342 344
343#ifdef __KERNEL__ 345#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 53025feaf88d..d9aad876ad76 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,6 +653,10 @@ __SYSCALL(__NR_dup3, sys_dup3)
653__SYSCALL(__NR_pipe2, sys_pipe2) 653__SYSCALL(__NR_pipe2, sys_pipe2)
654#define __NR_inotify_init1 294 654#define __NR_inotify_init1 294
655__SYSCALL(__NR_inotify_init1, sys_inotify_init1) 655__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
656#define __NR_preadv 295
657__SYSCALL(__NR_preadv, sys_preadv)
658#define __NR_pwritev 296
659__SYSCALL(__NR_pwritev, sys_pwritev)
656#define __NR_perf_counter_open 295 660#define __NR_perf_counter_open 295
657__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) 661__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
658 662
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 8242bf965812..c0a01b5d985b 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -12,7 +12,6 @@ extern enum uv_system_type get_uv_system_type(void);
12extern int is_uv_system(void); 12extern int is_uv_system(void);
13extern void uv_cpu_init(void); 13extern void uv_cpu_init(void);
14extern void uv_system_init(void); 14extern void uv_system_init(void);
15extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
16extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 15extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
17 struct mm_struct *mm, 16 struct mm_struct *mm,
18 unsigned long va, 17 unsigned long va,
@@ -24,8 +23,6 @@ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
24static inline int is_uv_system(void) { return 0; } 23static inline int is_uv_system(void) { return 0; }
25static inline void uv_cpu_init(void) { } 24static inline void uv_cpu_init(void) { }
26static inline void uv_system_init(void) { } 25static inline void uv_system_init(void) { }
27static inline int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
28{ return 1; }
29static inline const struct cpumask * 26static inline const struct cpumask *
30uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, 27uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
31 unsigned long va, unsigned int cpu) 28 unsigned long va, unsigned int cpu)
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 777327ef05c1..d3a98ea1062e 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -11,11 +11,13 @@
11#ifndef _ASM_X86_UV_UV_HUB_H 11#ifndef _ASM_X86_UV_UV_HUB_H
12#define _ASM_X86_UV_UV_HUB_H 12#define _ASM_X86_UV_UV_HUB_H
13 13
14#ifdef CONFIG_X86_64
14#include <linux/numa.h> 15#include <linux/numa.h>
15#include <linux/percpu.h> 16#include <linux/percpu.h>
16#include <linux/timer.h> 17#include <linux/timer.h>
17#include <asm/types.h> 18#include <asm/types.h>
18#include <asm/percpu.h> 19#include <asm/percpu.h>
20#include <asm/uv/uv_mmrs.h>
19 21
20 22
21/* 23/*
@@ -199,6 +201,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
199#define SCIR_CPU_ACTIVITY 0x02 /* not idle */ 201#define SCIR_CPU_ACTIVITY 0x02 /* not idle */
200#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */ 202#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */
201 203
204/* Loop through all installed blades */
205#define for_each_possible_blade(bid) \
206 for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++)
207
202/* 208/*
203 * Macros for converting between kernel virtual addresses, socket local physical 209 * Macros for converting between kernel virtual addresses, socket local physical
204 * addresses, and UV global physical addresses. 210 * addresses, and UV global physical addresses.
@@ -393,6 +399,7 @@ static inline void uv_set_scir_bits(unsigned char value)
393 uv_write_local_mmr8(uv_hub_info->scir.offset, value); 399 uv_write_local_mmr8(uv_hub_info->scir.offset, value);
394 } 400 }
395} 401}
402
396static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) 403static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
397{ 404{
398 if (uv_cpu_hub_info(cpu)->scir.state != value) { 405 if (uv_cpu_hub_info(cpu)->scir.state != value) {
@@ -401,4 +408,15 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
401 } 408 }
402} 409}
403 410
411static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
412{
413 unsigned long val;
414
415 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
416 ((apicid & 0x3f) << UVH_IPI_INT_APIC_ID_SHFT) |
417 (vector << UVH_IPI_INT_VECTOR_SHFT);
418 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
419}
420
421#endif /* CONFIG_X86_64 */
404#endif /* _ASM_X86_UV_UV_HUB_H */ 422#endif /* _ASM_X86_UV_UV_HUB_H */
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index dd627793a234..db68ac8a5ac2 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -1,3 +1,4 @@
1
1/* 2/*
2 * This file is subject to the terms and conditions of the GNU General Public 3 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive 4 * License. See the file "COPYING" in the main directory of this archive
@@ -243,6 +244,158 @@ union uvh_event_occurred0_u {
243#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0 244#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0
244 245
245/* ========================================================================= */ 246/* ========================================================================= */
247/* UVH_GR0_TLB_INT0_CONFIG */
248/* ========================================================================= */
249#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
250
251#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
252#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
253#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
254#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
255#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11
256#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
257#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12
258#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
259#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13
260#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
261#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15
262#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
263#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16
264#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
265#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32
266#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
267
268union uvh_gr0_tlb_int0_config_u {
269 unsigned long v;
270 struct uvh_gr0_tlb_int0_config_s {
271 unsigned long vector_ : 8; /* RW */
272 unsigned long dm : 3; /* RW */
273 unsigned long destmode : 1; /* RW */
274 unsigned long status : 1; /* RO */
275 unsigned long p : 1; /* RO */
276 unsigned long rsvd_14 : 1; /* */
277 unsigned long t : 1; /* RO */
278 unsigned long m : 1; /* RW */
279 unsigned long rsvd_17_31: 15; /* */
280 unsigned long apic_id : 32; /* RW */
281 } s;
282};
283
284/* ========================================================================= */
285/* UVH_GR0_TLB_INT1_CONFIG */
286/* ========================================================================= */
287#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
288
289#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
290#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
291#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
292#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
293#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11
294#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
295#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12
296#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
297#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13
298#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
299#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15
300#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
301#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16
302#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
303#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32
304#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
305
306union uvh_gr0_tlb_int1_config_u {
307 unsigned long v;
308 struct uvh_gr0_tlb_int1_config_s {
309 unsigned long vector_ : 8; /* RW */
310 unsigned long dm : 3; /* RW */
311 unsigned long destmode : 1; /* RW */
312 unsigned long status : 1; /* RO */
313 unsigned long p : 1; /* RO */
314 unsigned long rsvd_14 : 1; /* */
315 unsigned long t : 1; /* RO */
316 unsigned long m : 1; /* RW */
317 unsigned long rsvd_17_31: 15; /* */
318 unsigned long apic_id : 32; /* RW */
319 } s;
320};
321
322/* ========================================================================= */
323/* UVH_GR1_TLB_INT0_CONFIG */
324/* ========================================================================= */
325#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL
326
327#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
328#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
329#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
330#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
331#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11
332#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
333#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12
334#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
335#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13
336#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
337#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15
338#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
339#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16
340#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
341#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32
342#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
343
344union uvh_gr1_tlb_int0_config_u {
345 unsigned long v;
346 struct uvh_gr1_tlb_int0_config_s {
347 unsigned long vector_ : 8; /* RW */
348 unsigned long dm : 3; /* RW */
349 unsigned long destmode : 1; /* RW */
350 unsigned long status : 1; /* RO */
351 unsigned long p : 1; /* RO */
352 unsigned long rsvd_14 : 1; /* */
353 unsigned long t : 1; /* RO */
354 unsigned long m : 1; /* RW */
355 unsigned long rsvd_17_31: 15; /* */
356 unsigned long apic_id : 32; /* RW */
357 } s;
358};
359
360/* ========================================================================= */
361/* UVH_GR1_TLB_INT1_CONFIG */
362/* ========================================================================= */
363#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL
364
365#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
366#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
367#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
368#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
369#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11
370#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
371#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12
372#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
373#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13
374#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
375#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15
376#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
377#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16
378#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
379#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32
380#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
381
382union uvh_gr1_tlb_int1_config_u {
383 unsigned long v;
384 struct uvh_gr1_tlb_int1_config_s {
385 unsigned long vector_ : 8; /* RW */
386 unsigned long dm : 3; /* RW */
387 unsigned long destmode : 1; /* RW */
388 unsigned long status : 1; /* RO */
389 unsigned long p : 1; /* RO */
390 unsigned long rsvd_14 : 1; /* */
391 unsigned long t : 1; /* RO */
392 unsigned long m : 1; /* RW */
393 unsigned long rsvd_17_31: 15; /* */
394 unsigned long apic_id : 32; /* RW */
395 } s;
396};
397
398/* ========================================================================= */
246/* UVH_INT_CMPB */ 399/* UVH_INT_CMPB */
247/* ========================================================================= */ 400/* ========================================================================= */
248#define UVH_INT_CMPB 0x22080UL 401#define UVH_INT_CMPB 0x22080UL
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 593636275238..e0f9aa16358b 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -118,7 +118,7 @@ static inline void cpu_svm_disable(void)
118 118
119 wrmsrl(MSR_VM_HSAVE_PA, 0); 119 wrmsrl(MSR_VM_HSAVE_PA, 0);
120 rdmsrl(MSR_EFER, efer); 120 rdmsrl(MSR_EFER, efer);
121 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); 121 wrmsrl(MSR_EFER, efer & ~EFER_SVME);
122} 122}
123 123
124/** Makes sure SVM is disabled, if it is supported on the CPU 124/** Makes sure SVM is disabled, if it is supported on the CPU
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index d0238e6151d8..498f944010b9 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -270,8 +270,9 @@ enum vmcs_field {
270 270
271#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ 271#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
272#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ 272#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */
273#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ 273#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */
274#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ 274#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
275#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */
275 276
276/* GUEST_INTERRUPTIBILITY_INFO flags. */ 277/* GUEST_INTERRUPTIBILITY_INFO flags. */
277#define GUEST_INTR_STATE_STI 0x00000001 278#define GUEST_INTR_STATE_STI 0x00000001
@@ -311,7 +312,7 @@ enum vmcs_field {
311#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ 312#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
312#define TYPE_MOV_TO_DR (0 << 4) 313#define TYPE_MOV_TO_DR (0 << 4)
313#define TYPE_MOV_FROM_DR (1 << 4) 314#define TYPE_MOV_FROM_DR (1 << 4)
314#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */ 315#define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */
315 316
316 317
317/* segment AR */ 318/* segment AR */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 5e79ca694326..9c371e4a9fa6 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -296,6 +296,8 @@ HYPERVISOR_get_debugreg(int reg)
296static inline int 296static inline int
297HYPERVISOR_update_descriptor(u64 ma, u64 desc) 297HYPERVISOR_update_descriptor(u64 ma, u64 desc)
298{ 298{
299 if (sizeof(u64) == sizeof(long))
300 return _hypercall2(int, update_descriptor, ma, desc);
299 return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); 301 return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
300} 302}
301 303
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 4bd990ee43df..1a918dde46b5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x)
164 164
165 165
166xmaddr_t arbitrary_virt_to_machine(void *address); 166xmaddr_t arbitrary_virt_to_machine(void *address);
167unsigned long arbitrary_virt_to_mfn(void *vaddr);
167void make_lowmem_page_readonly(void *vaddr); 168void make_lowmem_page_readonly(void *vaddr);
168void make_lowmem_page_readwrite(void *vaddr); 169void make_lowmem_page_readwrite(void *vaddr);
169 170
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index de5657c039e9..145cce75cda7 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -66,11 +66,11 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o
66obj-y += apic/ 66obj-y += apic/
67obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 67obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
68obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 68obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
69obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o 69obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
70obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
70obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 71obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
71obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 72obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
72obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 73obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
73obj-y += vsmp_64.o
74obj-$(CONFIG_KPROBES) += kprobes.o 74obj-$(CONFIG_KPROBES) += kprobes.o
75obj-$(CONFIG_MODULES) += module_$(BITS).o 75obj-$(CONFIG_MODULES) += module_$(BITS).o
76obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o 76obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
@@ -106,12 +106,12 @@ obj-$(CONFIG_MICROCODE) += microcode.o
106 106
107obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o 107obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
108 108
109obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64 109obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
110 110
111### 111###
112# 64 bit specific files 112# 64 bit specific files
113ifeq ($(CONFIG_X86_64),y) 113ifeq ($(CONFIG_X86_64),y)
114 obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o 114 obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
115 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 115 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
116 obj-$(CONFIG_AUDIT) += audit_64.o 116 obj-$(CONFIG_AUDIT) += audit_64.o
117 117
@@ -120,4 +120,5 @@ ifeq ($(CONFIG_X86_64),y)
120 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o 120 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
121 121
122 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o 122 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
123 obj-y += vsmp_64.o
123endif 124endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a18eb7ce2236..723989d7f802 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -230,6 +230,35 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
230} 230}
231 231
232static int __init 232static int __init
233acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
234{
235 struct acpi_madt_local_x2apic *processor = NULL;
236
237 processor = (struct acpi_madt_local_x2apic *)header;
238
239 if (BAD_MADT_ENTRY(processor, end))
240 return -EINVAL;
241
242 acpi_table_print_madt_entry(header);
243
244#ifdef CONFIG_X86_X2APIC
245 /*
246 * We need to register disabled CPU as well to permit
247 * counting disabled CPUs. This allows us to size
248 * cpus_possible_map more accurately, to permit
249 * to not preallocating memory for all NR_CPUS
250 * when we use CPU hotplug.
251 */
252 acpi_register_lapic(processor->local_apic_id, /* APIC ID */
253 processor->lapic_flags & ACPI_MADT_ENABLED);
254#else
255 printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
256#endif
257
258 return 0;
259}
260
261static int __init
233acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) 262acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
234{ 263{
235 struct acpi_madt_local_apic *processor = NULL; 264 struct acpi_madt_local_apic *processor = NULL;
@@ -289,6 +318,25 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
289} 318}
290 319
291static int __init 320static int __init
321acpi_parse_x2apic_nmi(struct acpi_subtable_header *header,
322 const unsigned long end)
323{
324 struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL;
325
326 x2apic_nmi = (struct acpi_madt_local_x2apic_nmi *)header;
327
328 if (BAD_MADT_ENTRY(x2apic_nmi, end))
329 return -EINVAL;
330
331 acpi_table_print_madt_entry(header);
332
333 if (x2apic_nmi->lint != 1)
334 printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
335
336 return 0;
337}
338
339static int __init
292acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end) 340acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end)
293{ 341{
294 struct acpi_madt_local_apic_nmi *lapic_nmi = NULL; 342 struct acpi_madt_local_apic_nmi *lapic_nmi = NULL;
@@ -793,6 +841,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
793static int __init acpi_parse_madt_lapic_entries(void) 841static int __init acpi_parse_madt_lapic_entries(void)
794{ 842{
795 int count; 843 int count;
844 int x2count = 0;
796 845
797 if (!cpu_has_apic) 846 if (!cpu_has_apic)
798 return -ENODEV; 847 return -ENODEV;
@@ -816,22 +865,28 @@ static int __init acpi_parse_madt_lapic_entries(void)
816 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, 865 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
817 acpi_parse_sapic, MAX_APICS); 866 acpi_parse_sapic, MAX_APICS);
818 867
819 if (!count) 868 if (!count) {
869 x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
870 acpi_parse_x2apic, MAX_APICS);
820 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, 871 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
821 acpi_parse_lapic, MAX_APICS); 872 acpi_parse_lapic, MAX_APICS);
822 if (!count) { 873 }
874 if (!count && !x2count) {
823 printk(KERN_ERR PREFIX "No LAPIC entries present\n"); 875 printk(KERN_ERR PREFIX "No LAPIC entries present\n");
824 /* TBD: Cleanup to allow fallback to MPS */ 876 /* TBD: Cleanup to allow fallback to MPS */
825 return -ENODEV; 877 return -ENODEV;
826 } else if (count < 0) { 878 } else if (count < 0 || x2count < 0) {
827 printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n"); 879 printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
828 /* TBD: Cleanup to allow fallback to MPS */ 880 /* TBD: Cleanup to allow fallback to MPS */
829 return count; 881 return count;
830 } 882 }
831 883
884 x2count =
885 acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
886 acpi_parse_x2apic_nmi, 0);
832 count = 887 count =
833 acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0); 888 acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0);
834 if (count < 0) { 889 if (count < 0 || x2count < 0) {
835 printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); 890 printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
836 /* TBD: Cleanup to allow fallback to MPS */ 891 /* TBD: Cleanup to allow fallback to MPS */
837 return count; 892 return count;
@@ -1470,7 +1525,7 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
1470 1525
1471/* 1526/*
1472 * If your system is blacklisted here, but you find that acpi=force 1527 * If your system is blacklisted here, but you find that acpi=force
1473 * works for you, please contact acpi-devel@sourceforge.net 1528 * works for you, please contact linux-acpi@vger.kernel.org
1474 */ 1529 */
1475static struct dmi_system_id __initdata acpi_dmi_table[] = { 1530static struct dmi_system_id __initdata acpi_dmi_table[] = {
1476 /* 1531 /*
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6907b8e85d52..f57658702571 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -5,6 +5,7 @@
5#include <linux/kprobes.h> 5#include <linux/kprobes.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/vmalloc.h> 7#include <linux/vmalloc.h>
8#include <linux/memory.h>
8#include <asm/alternative.h> 9#include <asm/alternative.h>
9#include <asm/sections.h> 10#include <asm/sections.h>
10#include <asm/pgtable.h> 11#include <asm/pgtable.h>
@@ -12,7 +13,9 @@
12#include <asm/nmi.h> 13#include <asm/nmi.h>
13#include <asm/vsyscall.h> 14#include <asm/vsyscall.h>
14#include <asm/cacheflush.h> 15#include <asm/cacheflush.h>
16#include <asm/tlbflush.h>
15#include <asm/io.h> 17#include <asm/io.h>
18#include <asm/fixmap.h>
16 19
17#define MAX_PATCH_LEN (255-1) 20#define MAX_PATCH_LEN (255-1)
18 21
@@ -226,6 +229,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
226{ 229{
227 u8 **ptr; 230 u8 **ptr;
228 231
232 mutex_lock(&text_mutex);
229 for (ptr = start; ptr < end; ptr++) { 233 for (ptr = start; ptr < end; ptr++) {
230 if (*ptr < text) 234 if (*ptr < text)
231 continue; 235 continue;
@@ -234,6 +238,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
234 /* turn DS segment override prefix into lock prefix */ 238 /* turn DS segment override prefix into lock prefix */
235 text_poke(*ptr, ((unsigned char []){0xf0}), 1); 239 text_poke(*ptr, ((unsigned char []){0xf0}), 1);
236 }; 240 };
241 mutex_unlock(&text_mutex);
237} 242}
238 243
239static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 244static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
@@ -243,6 +248,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
243 if (noreplace_smp) 248 if (noreplace_smp)
244 return; 249 return;
245 250
251 mutex_lock(&text_mutex);
246 for (ptr = start; ptr < end; ptr++) { 252 for (ptr = start; ptr < end; ptr++) {
247 if (*ptr < text) 253 if (*ptr < text)
248 continue; 254 continue;
@@ -251,6 +257,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
251 /* turn lock prefix into DS segment override prefix */ 257 /* turn lock prefix into DS segment override prefix */
252 text_poke(*ptr, ((unsigned char []){0x3E}), 1); 258 text_poke(*ptr, ((unsigned char []){0x3E}), 1);
253 }; 259 };
260 mutex_unlock(&text_mutex);
254} 261}
255 262
256struct smp_alt_module { 263struct smp_alt_module {
@@ -414,9 +421,17 @@ void __init alternative_instructions(void)
414 that might execute the to be patched code. 421 that might execute the to be patched code.
415 Other CPUs are not running. */ 422 Other CPUs are not running. */
416 stop_nmi(); 423 stop_nmi();
417#ifdef CONFIG_X86_MCE 424
418 stop_mce(); 425 /*
419#endif 426 * Don't stop machine check exceptions while patching.
427 * MCEs only happen when something got corrupted and in this
428 * case we must do something about the corruption.
429 * Ignoring it is worse than a unlikely patching race.
430 * Also machine checks tend to be broadcast and if one CPU
431 * goes into machine check the others follow quickly, so we don't
432 * expect a machine check to cause undue problems during to code
433 * patching.
434 */
420 435
421 apply_alternatives(__alt_instructions, __alt_instructions_end); 436 apply_alternatives(__alt_instructions, __alt_instructions_end);
422 437
@@ -456,9 +471,6 @@ void __init alternative_instructions(void)
456 (unsigned long)__smp_locks_end); 471 (unsigned long)__smp_locks_end);
457 472
458 restart_nmi(); 473 restart_nmi();
459#ifdef CONFIG_X86_MCE
460 restart_mce();
461#endif
462} 474}
463 475
464/** 476/**
@@ -495,15 +507,16 @@ void *text_poke_early(void *addr, const void *opcode, size_t len)
495 * It means the size must be writable atomically and the address must be aligned 507 * It means the size must be writable atomically and the address must be aligned
496 * in a way that permits an atomic write. It also makes sure we fit on a single 508 * in a way that permits an atomic write. It also makes sure we fit on a single
497 * page. 509 * page.
510 *
511 * Note: Must be called under text_mutex.
498 */ 512 */
499void *__kprobes text_poke(void *addr, const void *opcode, size_t len) 513void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
500{ 514{
515 unsigned long flags;
501 char *vaddr; 516 char *vaddr;
502 int nr_pages = 2;
503 struct page *pages[2]; 517 struct page *pages[2];
504 int i; 518 int i;
505 519
506 might_sleep();
507 if (!core_kernel_text((unsigned long)addr)) { 520 if (!core_kernel_text((unsigned long)addr)) {
508 pages[0] = vmalloc_to_page(addr); 521 pages[0] = vmalloc_to_page(addr);
509 pages[1] = vmalloc_to_page(addr + PAGE_SIZE); 522 pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
@@ -513,18 +526,21 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
513 pages[1] = virt_to_page(addr + PAGE_SIZE); 526 pages[1] = virt_to_page(addr + PAGE_SIZE);
514 } 527 }
515 BUG_ON(!pages[0]); 528 BUG_ON(!pages[0]);
516 if (!pages[1]) 529 local_irq_save(flags);
517 nr_pages = 1; 530 set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
518 vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); 531 if (pages[1])
519 BUG_ON(!vaddr); 532 set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
520 local_irq_disable(); 533 vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
521 memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); 534 memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
522 local_irq_enable(); 535 clear_fixmap(FIX_TEXT_POKE0);
523 vunmap(vaddr); 536 if (pages[1])
537 clear_fixmap(FIX_TEXT_POKE1);
538 local_flush_tlb();
524 sync_core(); 539 sync_core();
525 /* Could also do a CLFLUSH here to speed up CPU recovery; but 540 /* Could also do a CLFLUSH here to speed up CPU recovery; but
526 that causes hangs on some VIA CPUs. */ 541 that causes hangs on some VIA CPUs. */
527 for (i = 0; i < len; i++) 542 for (i = 0; i < len; i++)
528 BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); 543 BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
544 local_irq_restore(flags);
529 return addr; 545 return addr;
530} 546}
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 5113c080f0c4..a97db99dad52 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -22,10 +22,9 @@
22#include <linux/bitops.h> 22#include <linux/bitops.h>
23#include <linux/debugfs.h> 23#include <linux/debugfs.h>
24#include <linux/scatterlist.h> 24#include <linux/scatterlist.h>
25#include <linux/dma-mapping.h>
25#include <linux/iommu-helper.h> 26#include <linux/iommu-helper.h>
26#ifdef CONFIG_IOMMU_API
27#include <linux/iommu.h> 27#include <linux/iommu.h>
28#endif
29#include <asm/proto.h> 28#include <asm/proto.h>
30#include <asm/iommu.h> 29#include <asm/iommu.h>
31#include <asm/gart.h> 30#include <asm/gart.h>
@@ -1297,8 +1296,10 @@ static void __unmap_single(struct amd_iommu *iommu,
1297/* 1296/*
1298 * The exported map_single function for dma_ops. 1297 * The exported map_single function for dma_ops.
1299 */ 1298 */
1300static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, 1299static dma_addr_t map_page(struct device *dev, struct page *page,
1301 size_t size, int dir) 1300 unsigned long offset, size_t size,
1301 enum dma_data_direction dir,
1302 struct dma_attrs *attrs)
1302{ 1303{
1303 unsigned long flags; 1304 unsigned long flags;
1304 struct amd_iommu *iommu; 1305 struct amd_iommu *iommu;
@@ -1306,6 +1307,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
1306 u16 devid; 1307 u16 devid;
1307 dma_addr_t addr; 1308 dma_addr_t addr;
1308 u64 dma_mask; 1309 u64 dma_mask;
1310 phys_addr_t paddr = page_to_phys(page) + offset;
1309 1311
1310 INC_STATS_COUNTER(cnt_map_single); 1312 INC_STATS_COUNTER(cnt_map_single);
1311 1313
@@ -1340,8 +1342,8 @@ out:
1340/* 1342/*
1341 * The exported unmap_single function for dma_ops. 1343 * The exported unmap_single function for dma_ops.
1342 */ 1344 */
1343static void unmap_single(struct device *dev, dma_addr_t dma_addr, 1345static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
1344 size_t size, int dir) 1346 enum dma_data_direction dir, struct dma_attrs *attrs)
1345{ 1347{
1346 unsigned long flags; 1348 unsigned long flags;
1347 struct amd_iommu *iommu; 1349 struct amd_iommu *iommu;
@@ -1390,7 +1392,8 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
1390 * lists). 1392 * lists).
1391 */ 1393 */
1392static int map_sg(struct device *dev, struct scatterlist *sglist, 1394static int map_sg(struct device *dev, struct scatterlist *sglist,
1393 int nelems, int dir) 1395 int nelems, enum dma_data_direction dir,
1396 struct dma_attrs *attrs)
1394{ 1397{
1395 unsigned long flags; 1398 unsigned long flags;
1396 struct amd_iommu *iommu; 1399 struct amd_iommu *iommu;
@@ -1457,7 +1460,8 @@ unmap:
1457 * lists). 1460 * lists).
1458 */ 1461 */
1459static void unmap_sg(struct device *dev, struct scatterlist *sglist, 1462static void unmap_sg(struct device *dev, struct scatterlist *sglist,
1460 int nelems, int dir) 1463 int nelems, enum dma_data_direction dir,
1464 struct dma_attrs *attrs)
1461{ 1465{
1462 unsigned long flags; 1466 unsigned long flags;
1463 struct amd_iommu *iommu; 1467 struct amd_iommu *iommu;
@@ -1644,11 +1648,11 @@ static void prealloc_protection_domains(void)
1644 } 1648 }
1645} 1649}
1646 1650
1647static struct dma_mapping_ops amd_iommu_dma_ops = { 1651static struct dma_map_ops amd_iommu_dma_ops = {
1648 .alloc_coherent = alloc_coherent, 1652 .alloc_coherent = alloc_coherent,
1649 .free_coherent = free_coherent, 1653 .free_coherent = free_coherent,
1650 .map_single = map_single, 1654 .map_page = map_page,
1651 .unmap_single = unmap_single, 1655 .unmap_page = unmap_page,
1652 .map_sg = map_sg, 1656 .map_sg = map_sg,
1653 .unmap_sg = unmap_sg, 1657 .unmap_sg = unmap_sg,
1654 .dma_supported = amd_iommu_dma_supported, 1658 .dma_supported = amd_iommu_dma_supported,
@@ -1924,6 +1928,12 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
1924 return paddr; 1928 return paddr;
1925} 1929}
1926 1930
1931static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
1932 unsigned long cap)
1933{
1934 return 0;
1935}
1936
1927static struct iommu_ops amd_iommu_ops = { 1937static struct iommu_ops amd_iommu_ops = {
1928 .domain_init = amd_iommu_domain_init, 1938 .domain_init = amd_iommu_domain_init,
1929 .domain_destroy = amd_iommu_domain_destroy, 1939 .domain_destroy = amd_iommu_domain_destroy,
@@ -1932,5 +1942,6 @@ static struct iommu_ops amd_iommu_ops = {
1932 .map = amd_iommu_map_range, 1942 .map = amd_iommu_map_range,
1933 .unmap = amd_iommu_unmap_range, 1943 .unmap = amd_iommu_unmap_range,
1934 .iova_to_phys = amd_iommu_iova_to_phys, 1944 .iova_to_phys = amd_iommu_iova_to_phys,
1945 .domain_has_cap = amd_iommu_domain_has_cap,
1935}; 1946};
1936 1947
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 4732768c5348..b0e5e712a7af 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -47,6 +47,7 @@
47#include <asm/idle.h> 47#include <asm/idle.h>
48#include <asm/mtrr.h> 48#include <asm/mtrr.h>
49#include <asm/smp.h> 49#include <asm/smp.h>
50#include <asm/mce.h>
50 51
51unsigned int num_processors; 52unsigned int num_processors;
52 53
@@ -811,7 +812,7 @@ void clear_local_APIC(void)
811 u32 v; 812 u32 v;
812 813
813 /* APIC hasn't been mapped yet */ 814 /* APIC hasn't been mapped yet */
814 if (!apic_phys) 815 if (!x2apic && !apic_phys)
815 return; 816 return;
816 817
817 maxlvt = lapic_get_maxlvt(); 818 maxlvt = lapic_get_maxlvt();
@@ -845,6 +846,14 @@ void clear_local_APIC(void)
845 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 846 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
846 } 847 }
847#endif 848#endif
849#ifdef CONFIG_X86_MCE_INTEL
850 if (maxlvt >= 6) {
851 v = apic_read(APIC_LVTCMCI);
852 if (!(v & APIC_LVT_MASKED))
853 apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
854 }
855#endif
856
848 /* 857 /*
849 * Clean APIC state for other OSs: 858 * Clean APIC state for other OSs:
850 */ 859 */
@@ -1245,6 +1254,12 @@ void __cpuinit setup_local_APIC(void)
1245 apic_write(APIC_LVT1, value); 1254 apic_write(APIC_LVT1, value);
1246 1255
1247 preempt_enable(); 1256 preempt_enable();
1257
1258#ifdef CONFIG_X86_MCE_INTEL
1259 /* Recheck CMCI information after local APIC is up on CPU #0 */
1260 if (smp_processor_id() == 0)
1261 cmci_recheck();
1262#endif
1248} 1263}
1249 1264
1250void __cpuinit end_local_APIC_setup(void) 1265void __cpuinit end_local_APIC_setup(void)
@@ -1323,15 +1338,16 @@ void __init enable_IR_x2apic(void)
1323 return; 1338 return;
1324 } 1339 }
1325 1340
1326 local_irq_save(flags); 1341 ret = save_IO_APIC_setup();
1327 mask_8259A();
1328
1329 ret = save_mask_IO_APIC_setup();
1330 if (ret) { 1342 if (ret) {
1331 pr_info("Saving IO-APIC state failed: %d\n", ret); 1343 pr_info("Saving IO-APIC state failed: %d\n", ret);
1332 goto end; 1344 goto end;
1333 } 1345 }
1334 1346
1347 local_irq_save(flags);
1348 mask_IO_APIC_setup();
1349 mask_8259A();
1350
1335 ret = enable_intr_remapping(1); 1351 ret = enable_intr_remapping(1);
1336 1352
1337 if (ret && x2apic_preenabled) { 1353 if (ret && x2apic_preenabled) {
@@ -1356,10 +1372,10 @@ end_restore:
1356 else 1372 else
1357 reinit_intr_remapped_IO_APIC(x2apic_preenabled); 1373 reinit_intr_remapped_IO_APIC(x2apic_preenabled);
1358 1374
1359end:
1360 unmask_8259A(); 1375 unmask_8259A();
1361 local_irq_restore(flags); 1376 local_irq_restore(flags);
1362 1377
1378end:
1363 if (!ret) { 1379 if (!ret) {
1364 if (!x2apic_preenabled) 1380 if (!x2apic_preenabled)
1365 pr_info("Enabled x2apic and interrupt-remapping\n"); 1381 pr_info("Enabled x2apic and interrupt-remapping\n");
@@ -1512,12 +1528,10 @@ void __init early_init_lapic_mapping(void)
1512 */ 1528 */
1513void __init init_apic_mappings(void) 1529void __init init_apic_mappings(void)
1514{ 1530{
1515#ifdef CONFIG_X86_X2APIC
1516 if (x2apic) { 1531 if (x2apic) {
1517 boot_cpu_physical_apicid = read_apic_id(); 1532 boot_cpu_physical_apicid = read_apic_id();
1518 return; 1533 return;
1519 } 1534 }
1520#endif
1521 1535
1522 /* 1536 /*
1523 * If no local APIC can be found then set up a fake all 1537 * If no local APIC can be found then set up a fake all
@@ -1961,12 +1975,9 @@ static int lapic_resume(struct sys_device *dev)
1961 1975
1962 local_irq_save(flags); 1976 local_irq_save(flags);
1963 1977
1964#ifdef CONFIG_X86_X2APIC
1965 if (x2apic) 1978 if (x2apic)
1966 enable_x2apic(); 1979 enable_x2apic();
1967 else 1980 else {
1968#endif
1969 {
1970 /* 1981 /*
1971 * Make sure the APICBASE points to the right address 1982 * Make sure the APICBASE points to the right address
1972 * 1983 *
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 3b002995e145..0014714ea97b 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -159,20 +159,6 @@ static int flat_apic_id_registered(void)
159 return physid_isset(read_xapic_id(), phys_cpu_present_map); 159 return physid_isset(read_xapic_id(), phys_cpu_present_map);
160} 160}
161 161
162static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
163{
164 return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
165}
166
167static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
168 const struct cpumask *andmask)
169{
170 unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
171 unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
172
173 return mask1 & mask2;
174}
175
176static int flat_phys_pkg_id(int initial_apic_id, int index_msb) 162static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
177{ 163{
178 return hard_smp_processor_id() >> index_msb; 164 return hard_smp_processor_id() >> index_msb;
@@ -213,8 +199,8 @@ struct apic apic_flat = {
213 .set_apic_id = set_apic_id, 199 .set_apic_id = set_apic_id,
214 .apic_id_mask = 0xFFu << 24, 200 .apic_id_mask = 0xFFu << 24,
215 201
216 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, 202 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
217 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, 203 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
218 204
219 .send_IPI_mask = flat_send_IPI_mask, 205 .send_IPI_mask = flat_send_IPI_mask,
220 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, 206 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
@@ -222,7 +208,6 @@ struct apic apic_flat = {
222 .send_IPI_all = flat_send_IPI_all, 208 .send_IPI_all = flat_send_IPI_all,
223 .send_IPI_self = apic_send_IPI_self, 209 .send_IPI_self = apic_send_IPI_self,
224 210
225 .wakeup_cpu = NULL,
226 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 211 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
227 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, 212 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
228 .wait_for_init_deassert = NULL, 213 .wait_for_init_deassert = NULL,
@@ -373,7 +358,6 @@ struct apic apic_physflat = {
373 .send_IPI_all = physflat_send_IPI_all, 358 .send_IPI_all = physflat_send_IPI_all,
374 .send_IPI_self = apic_send_IPI_self, 359 .send_IPI_self = apic_send_IPI_self,
375 360
376 .wakeup_cpu = NULL,
377 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 361 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
378 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, 362 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
379 .wait_for_init_deassert = NULL, 363 .wait_for_init_deassert = NULL,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 0b1093394fdf..676cdac385c0 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -16,32 +16,31 @@
16#include <asm/apic.h> 16#include <asm/apic.h>
17#include <asm/ipi.h> 17#include <asm/ipi.h>
18 18
19static inline unsigned bigsmp_get_apic_id(unsigned long x) 19static unsigned bigsmp_get_apic_id(unsigned long x)
20{ 20{
21 return (x >> 24) & 0xFF; 21 return (x >> 24) & 0xFF;
22} 22}
23 23
24static inline int bigsmp_apic_id_registered(void) 24static int bigsmp_apic_id_registered(void)
25{ 25{
26 return 1; 26 return 1;
27} 27}
28 28
29static inline const cpumask_t *bigsmp_target_cpus(void) 29static const struct cpumask *bigsmp_target_cpus(void)
30{ 30{
31#ifdef CONFIG_SMP 31#ifdef CONFIG_SMP
32 return &cpu_online_map; 32 return cpu_online_mask;
33#else 33#else
34 return &cpumask_of_cpu(0); 34 return cpumask_of(0);
35#endif 35#endif
36} 36}
37 37
38static inline unsigned long 38static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid)
39bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid)
40{ 39{
41 return 0; 40 return 0;
42} 41}
43 42
44static inline unsigned long bigsmp_check_apicid_present(int bit) 43static unsigned long bigsmp_check_apicid_present(int bit)
45{ 44{
46 return 1; 45 return 1;
47} 46}
@@ -64,7 +63,7 @@ static inline unsigned long calculate_ldr(int cpu)
64 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel 63 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
65 * document number 292116). So here it goes... 64 * document number 292116). So here it goes...
66 */ 65 */
67static inline void bigsmp_init_apic_ldr(void) 66static void bigsmp_init_apic_ldr(void)
68{ 67{
69 unsigned long val; 68 unsigned long val;
70 int cpu = smp_processor_id(); 69 int cpu = smp_processor_id();
@@ -74,19 +73,19 @@ static inline void bigsmp_init_apic_ldr(void)
74 apic_write(APIC_LDR, val); 73 apic_write(APIC_LDR, val);
75} 74}
76 75
77static inline void bigsmp_setup_apic_routing(void) 76static void bigsmp_setup_apic_routing(void)
78{ 77{
79 printk(KERN_INFO 78 printk(KERN_INFO
80 "Enabling APIC mode: Physflat. Using %d I/O APICs\n", 79 "Enabling APIC mode: Physflat. Using %d I/O APICs\n",
81 nr_ioapics); 80 nr_ioapics);
82} 81}
83 82
84static inline int bigsmp_apicid_to_node(int logical_apicid) 83static int bigsmp_apicid_to_node(int logical_apicid)
85{ 84{
86 return apicid_2_node[hard_smp_processor_id()]; 85 return apicid_2_node[hard_smp_processor_id()];
87} 86}
88 87
89static inline int bigsmp_cpu_present_to_apicid(int mps_cpu) 88static int bigsmp_cpu_present_to_apicid(int mps_cpu)
90{ 89{
91 if (mps_cpu < nr_cpu_ids) 90 if (mps_cpu < nr_cpu_ids)
92 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); 91 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
@@ -94,7 +93,7 @@ static inline int bigsmp_cpu_present_to_apicid(int mps_cpu)
94 return BAD_APICID; 93 return BAD_APICID;
95} 94}
96 95
97static inline physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid) 96static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
98{ 97{
99 return physid_mask_of_physid(phys_apicid); 98 return physid_mask_of_physid(phys_apicid);
100} 99}
@@ -107,29 +106,24 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu)
107 return cpu_physical_id(cpu); 106 return cpu_physical_id(cpu);
108} 107}
109 108
110static inline physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) 109static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
111{ 110{
112 /* For clustered we don't have a good way to do this yet - hack */ 111 /* For clustered we don't have a good way to do this yet - hack */
113 return physids_promote(0xFFL); 112 return physids_promote(0xFFL);
114} 113}
115 114
116static inline void bigsmp_setup_portio_remap(void) 115static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid)
117{
118}
119
120static inline int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid)
121{ 116{
122 return 1; 117 return 1;
123} 118}
124 119
125/* As we are using single CPU as destination, pick only one CPU here */ 120/* As we are using single CPU as destination, pick only one CPU here */
126static inline unsigned int bigsmp_cpu_mask_to_apicid(const cpumask_t *cpumask) 121static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
127{ 122{
128 return bigsmp_cpu_to_logical_apicid(first_cpu(*cpumask)); 123 return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask));
129} 124}
130 125
131static inline unsigned int 126static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
132bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
133 const struct cpumask *andmask) 127 const struct cpumask *andmask)
134{ 128{
135 int cpu; 129 int cpu;
@@ -148,7 +142,7 @@ bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
148 return BAD_APICID; 142 return BAD_APICID;
149} 143}
150 144
151static inline int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 145static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
152{ 146{
153 return cpuid_apic >> index_msb; 147 return cpuid_apic >> index_msb;
154} 148}
@@ -158,12 +152,12 @@ static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
158 default_send_IPI_mask_sequence_phys(mask, vector); 152 default_send_IPI_mask_sequence_phys(mask, vector);
159} 153}
160 154
161static inline void bigsmp_send_IPI_allbutself(int vector) 155static void bigsmp_send_IPI_allbutself(int vector)
162{ 156{
163 default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); 157 default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
164} 158}
165 159
166static inline void bigsmp_send_IPI_all(int vector) 160static void bigsmp_send_IPI_all(int vector)
167{ 161{
168 bigsmp_send_IPI_mask(cpu_online_mask, vector); 162 bigsmp_send_IPI_mask(cpu_online_mask, vector);
169} 163}
@@ -194,10 +188,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
194 { } /* NULL entry stops DMI scanning */ 188 { } /* NULL entry stops DMI scanning */
195}; 189};
196 190
197static void bigsmp_vector_allocation_domain(int cpu, cpumask_t *retmask) 191static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask)
198{ 192{
199 cpus_clear(*retmask); 193 cpumask_clear(retmask);
200 cpu_set(cpu, *retmask); 194 cpumask_set_cpu(cpu, retmask);
201} 195}
202 196
203static int probe_bigsmp(void) 197static int probe_bigsmp(void)
@@ -256,7 +250,6 @@ struct apic apic_bigsmp = {
256 .send_IPI_all = bigsmp_send_IPI_all, 250 .send_IPI_all = bigsmp_send_IPI_all,
257 .send_IPI_self = default_send_IPI_self, 251 .send_IPI_self = default_send_IPI_self,
258 252
259 .wakeup_cpu = NULL,
260 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 253 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
261 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, 254 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
262 255
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 320f2d2e4e54..1c11b819f245 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -163,22 +163,17 @@ static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
163 return 0; 163 return 0;
164} 164}
165 165
166static int __init es7000_update_apic(void) 166static int es7000_apic_is_cluster(void)
167{ 167{
168 apic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
169
170 /* MPENTIUMIII */ 168 /* MPENTIUMIII */
171 if (boot_cpu_data.x86 == 6 && 169 if (boot_cpu_data.x86 == 6 &&
172 (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) { 170 (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11))
173 es7000_update_apic_to_cluster(); 171 return 1;
174 apic->wait_for_init_deassert = NULL;
175 apic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
176 }
177 172
178 return 0; 173 return 0;
179} 174}
180 175
181static void __init setup_unisys(void) 176static void setup_unisys(void)
182{ 177{
183 /* 178 /*
184 * Determine the generation of the ES7000 currently running. 179 * Determine the generation of the ES7000 currently running.
@@ -192,14 +187,12 @@ static void __init setup_unisys(void)
192 else 187 else
193 es7000_plat = ES7000_CLASSIC; 188 es7000_plat = ES7000_CLASSIC;
194 ioapic_renumber_irq = es7000_rename_gsi; 189 ioapic_renumber_irq = es7000_rename_gsi;
195
196 x86_quirks->update_apic = es7000_update_apic;
197} 190}
198 191
199/* 192/*
200 * Parse the OEM Table: 193 * Parse the OEM Table:
201 */ 194 */
202static int __init parse_unisys_oem(char *oemptr) 195static int parse_unisys_oem(char *oemptr)
203{ 196{
204 int i; 197 int i;
205 int success = 0; 198 int success = 0;
@@ -261,7 +254,7 @@ static int __init parse_unisys_oem(char *oemptr)
261} 254}
262 255
263#ifdef CONFIG_ACPI 256#ifdef CONFIG_ACPI
264static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) 257static int find_unisys_acpi_oem_table(unsigned long *oem_addr)
265{ 258{
266 struct acpi_table_header *header = NULL; 259 struct acpi_table_header *header = NULL;
267 struct es7000_oem_table *table; 260 struct es7000_oem_table *table;
@@ -292,7 +285,7 @@ static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
292 return 0; 285 return 0;
293} 286}
294 287
295static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) 288static void unmap_unisys_acpi_oem_table(unsigned long oem_addr)
296{ 289{
297 if (!oem_addr) 290 if (!oem_addr)
298 return; 291 return;
@@ -310,8 +303,10 @@ static int es7000_check_dsdt(void)
310 return 0; 303 return 0;
311} 304}
312 305
306static int es7000_acpi_ret;
307
313/* Hook from generic ACPI tables.c */ 308/* Hook from generic ACPI tables.c */
314static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 309static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
315{ 310{
316 unsigned long oem_addr = 0; 311 unsigned long oem_addr = 0;
317 int check_dsdt; 312 int check_dsdt;
@@ -332,10 +327,26 @@ static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
332 */ 327 */
333 unmap_unisys_acpi_oem_table(oem_addr); 328 unmap_unisys_acpi_oem_table(oem_addr);
334 } 329 }
335 return ret; 330
331 es7000_acpi_ret = ret;
332
333 return ret && !es7000_apic_is_cluster();
336} 334}
335
336static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
337{
338 int ret = es7000_acpi_ret;
339
340 return ret && es7000_apic_is_cluster();
341}
342
337#else /* !CONFIG_ACPI: */ 343#else /* !CONFIG_ACPI: */
338static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 344static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
345{
346 return 0;
347}
348
349static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
339{ 350{
340 return 0; 351 return 0;
341} 352}
@@ -349,8 +360,7 @@ static void es7000_spin(int n)
349 rep_nop(); 360 rep_nop();
350} 361}
351 362
352static int __init 363static int es7000_mip_write(struct mip_reg *mip_reg)
353es7000_mip_write(struct mip_reg *mip_reg)
354{ 364{
355 int status = 0; 365 int status = 0;
356 int spin; 366 int spin;
@@ -383,7 +393,7 @@ es7000_mip_write(struct mip_reg *mip_reg)
383 return status; 393 return status;
384} 394}
385 395
386static void __init es7000_enable_apic_mode(void) 396static void es7000_enable_apic_mode(void)
387{ 397{
388 struct mip_reg es7000_mip_reg; 398 struct mip_reg es7000_mip_reg;
389 int mip_status; 399 int mip_status;
@@ -400,7 +410,7 @@ static void __init es7000_enable_apic_mode(void)
400 WARN(1, "Command failed, status = %x\n", mip_status); 410 WARN(1, "Command failed, status = %x\n", mip_status);
401} 411}
402 412
403static void es7000_vector_allocation_domain(int cpu, cpumask_t *retmask) 413static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask)
404{ 414{
405 /* Careful. Some cpus do not strictly honor the set of cpus 415 /* Careful. Some cpus do not strictly honor the set of cpus
406 * specified in the interrupt destination when using lowest 416 * specified in the interrupt destination when using lowest
@@ -410,17 +420,15 @@ static void es7000_vector_allocation_domain(int cpu, cpumask_t *retmask)
410 * deliver interrupts to the wrong hyperthread when only one 420 * deliver interrupts to the wrong hyperthread when only one
411 * hyperthread was specified in the interrupt desitination. 421 * hyperthread was specified in the interrupt desitination.
412 */ 422 */
413 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; 423 cpumask_clear(retmask);
424 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
414} 425}
415 426
416 427
417static void es7000_wait_for_init_deassert(atomic_t *deassert) 428static void es7000_wait_for_init_deassert(atomic_t *deassert)
418{ 429{
419#ifndef CONFIG_ES7000_CLUSTERED_APIC
420 while (!atomic_read(deassert)) 430 while (!atomic_read(deassert))
421 cpu_relax(); 431 cpu_relax();
422#endif
423 return;
424} 432}
425 433
426static unsigned int es7000_get_apic_id(unsigned long x) 434static unsigned int es7000_get_apic_id(unsigned long x)
@@ -448,14 +456,14 @@ static int es7000_apic_id_registered(void)
448 return 1; 456 return 1;
449} 457}
450 458
451static const cpumask_t *target_cpus_cluster(void) 459static const struct cpumask *target_cpus_cluster(void)
452{ 460{
453 return &CPU_MASK_ALL; 461 return cpu_all_mask;
454} 462}
455 463
456static const cpumask_t *es7000_target_cpus(void) 464static const struct cpumask *es7000_target_cpus(void)
457{ 465{
458 return &cpumask_of_cpu(smp_processor_id()); 466 return cpumask_of(smp_processor_id());
459} 467}
460 468
461static unsigned long 469static unsigned long
@@ -510,7 +518,7 @@ static void es7000_setup_apic_routing(void)
510 "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", 518 "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
511 (apic_version[apic] == 0x14) ? 519 (apic_version[apic] == 0x14) ?
512 "Physical Cluster" : "Logical Cluster", 520 "Physical Cluster" : "Logical Cluster",
513 nr_ioapics, cpus_addr(*es7000_target_cpus())[0]); 521 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
514} 522}
515 523
516static int es7000_apicid_to_node(int logical_apicid) 524static int es7000_apicid_to_node(int logical_apicid)
@@ -565,72 +573,24 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
565 return 1; 573 return 1;
566} 574}
567 575
568static unsigned int 576static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
569es7000_cpu_mask_to_apicid_cluster(const struct cpumask *cpumask)
570{
571 int cpus_found = 0;
572 int num_bits_set;
573 int apicid;
574 int cpu;
575
576 num_bits_set = cpumask_weight(cpumask);
577 /* Return id to all */
578 if (num_bits_set == nr_cpu_ids)
579 return 0xFF;
580 /*
581 * The cpus in the mask must all be on the apic cluster. If are not
582 * on the same apicid cluster return default value of target_cpus():
583 */
584 cpu = cpumask_first(cpumask);
585 apicid = es7000_cpu_to_logical_apicid(cpu);
586
587 while (cpus_found < num_bits_set) {
588 if (cpumask_test_cpu(cpu, cpumask)) {
589 int new_apicid = es7000_cpu_to_logical_apicid(cpu);
590
591 if (APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
592 WARN(1, "Not a valid mask!");
593
594 return 0xFF;
595 }
596 apicid = new_apicid;
597 cpus_found++;
598 }
599 cpu++;
600 }
601 return apicid;
602}
603
604static unsigned int es7000_cpu_mask_to_apicid(const cpumask_t *cpumask)
605{ 577{
606 int cpus_found = 0; 578 unsigned int round = 0;
607 int num_bits_set; 579 int cpu, uninitialized_var(apicid);
608 int apicid;
609 int cpu;
610 580
611 num_bits_set = cpus_weight(*cpumask);
612 /* Return id to all */
613 if (num_bits_set == nr_cpu_ids)
614 return es7000_cpu_to_logical_apicid(0);
615 /* 581 /*
616 * The cpus in the mask must all be on the apic cluster. If are not 582 * The cpus in the mask must all be on the apic cluster.
617 * on the same apicid cluster return default value of target_cpus():
618 */ 583 */
619 cpu = first_cpu(*cpumask); 584 for_each_cpu(cpu, cpumask) {
620 apicid = es7000_cpu_to_logical_apicid(cpu); 585 int new_apicid = es7000_cpu_to_logical_apicid(cpu);
621 while (cpus_found < num_bits_set) {
622 if (cpu_isset(cpu, *cpumask)) {
623 int new_apicid = es7000_cpu_to_logical_apicid(cpu);
624 586
625 if (APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 587 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
626 printk("%s: Not a valid mask!\n", __func__); 588 WARN(1, "Not a valid mask!");
627 589
628 return es7000_cpu_to_logical_apicid(0); 590 return BAD_APICID;
629 }
630 apicid = new_apicid;
631 cpus_found++;
632 } 591 }
633 cpu++; 592 apicid = new_apicid;
593 round++;
634 } 594 }
635 return apicid; 595 return apicid;
636} 596}
@@ -659,37 +619,103 @@ static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
659 return cpuid_apic >> index_msb; 619 return cpuid_apic >> index_msb;
660} 620}
661 621
662void __init es7000_update_apic_to_cluster(void)
663{
664 apic->target_cpus = target_cpus_cluster;
665 apic->irq_delivery_mode = dest_LowestPrio;
666 /* logical delivery broadcast to all procs: */
667 apic->irq_dest_mode = 1;
668
669 apic->init_apic_ldr = es7000_init_apic_ldr_cluster;
670
671 apic->cpu_mask_to_apicid = es7000_cpu_mask_to_apicid_cluster;
672}
673
674static int probe_es7000(void) 622static int probe_es7000(void)
675{ 623{
676 /* probed later in mptable/ACPI hooks */ 624 /* probed later in mptable/ACPI hooks */
677 return 0; 625 return 0;
678} 626}
679 627
680static __init int 628static int es7000_mps_ret;
681es7000_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) 629static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem,
630 char *productid)
682{ 631{
632 int ret = 0;
633
683 if (mpc->oemptr) { 634 if (mpc->oemptr) {
684 struct mpc_oemtable *oem_table = 635 struct mpc_oemtable *oem_table =
685 (struct mpc_oemtable *)mpc->oemptr; 636 (struct mpc_oemtable *)mpc->oemptr;
686 637
687 if (!strncmp(oem, "UNISYS", 6)) 638 if (!strncmp(oem, "UNISYS", 6))
688 return parse_unisys_oem((char *)oem_table); 639 ret = parse_unisys_oem((char *)oem_table);
689 } 640 }
690 return 0; 641
642 es7000_mps_ret = ret;
643
644 return ret && !es7000_apic_is_cluster();
691} 645}
692 646
647static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
648 char *productid)
649{
650 int ret = es7000_mps_ret;
651
652 return ret && es7000_apic_is_cluster();
653}
654
655struct apic apic_es7000_cluster = {
656
657 .name = "es7000",
658 .probe = probe_es7000,
659 .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster,
660 .apic_id_registered = es7000_apic_id_registered,
661
662 .irq_delivery_mode = dest_LowestPrio,
663 /* logical delivery broadcast to all procs: */
664 .irq_dest_mode = 1,
665
666 .target_cpus = target_cpus_cluster,
667 .disable_esr = 1,
668 .dest_logical = 0,
669 .check_apicid_used = es7000_check_apicid_used,
670 .check_apicid_present = es7000_check_apicid_present,
671
672 .vector_allocation_domain = es7000_vector_allocation_domain,
673 .init_apic_ldr = es7000_init_apic_ldr_cluster,
674
675 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
676 .setup_apic_routing = es7000_setup_apic_routing,
677 .multi_timer_check = NULL,
678 .apicid_to_node = es7000_apicid_to_node,
679 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
680 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
681 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
682 .setup_portio_remap = NULL,
683 .check_phys_apicid_present = es7000_check_phys_apicid_present,
684 .enable_apic_mode = es7000_enable_apic_mode,
685 .phys_pkg_id = es7000_phys_pkg_id,
686 .mps_oem_check = es7000_mps_oem_check_cluster,
687
688 .get_apic_id = es7000_get_apic_id,
689 .set_apic_id = NULL,
690 .apic_id_mask = 0xFF << 24,
691
692 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
693 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
694
695 .send_IPI_mask = es7000_send_IPI_mask,
696 .send_IPI_mask_allbutself = NULL,
697 .send_IPI_allbutself = es7000_send_IPI_allbutself,
698 .send_IPI_all = es7000_send_IPI_all,
699 .send_IPI_self = default_send_IPI_self,
700
701 .wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip,
702
703 .trampoline_phys_low = 0x467,
704 .trampoline_phys_high = 0x469,
705
706 .wait_for_init_deassert = NULL,
707
708 /* Nothing to do for most platforms, since cleared by the INIT cycle: */
709 .smp_callin_clear_local_apic = NULL,
710 .inquire_remote_apic = default_inquire_remote_apic,
711
712 .read = native_apic_mem_read,
713 .write = native_apic_mem_write,
714 .icr_read = native_apic_icr_read,
715 .icr_write = native_apic_icr_write,
716 .wait_icr_idle = native_apic_wait_icr_idle,
717 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
718};
693 719
694struct apic apic_es7000 = { 720struct apic apic_es7000 = {
695 721
@@ -737,8 +763,6 @@ struct apic apic_es7000 = {
737 .send_IPI_all = es7000_send_IPI_all, 763 .send_IPI_all = es7000_send_IPI_all,
738 .send_IPI_self = default_send_IPI_self, 764 .send_IPI_self = default_send_IPI_self,
739 765
740 .wakeup_cpu = NULL,
741
742 .trampoline_phys_low = 0x467, 766 .trampoline_phys_low = 0x467,
743 .trampoline_phys_high = 0x469, 767 .trampoline_phys_high = 0x469,
744 768
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 00e6071cefc4..1bb5c6cee3eb 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -389,6 +389,8 @@ struct io_apic {
389 unsigned int index; 389 unsigned int index;
390 unsigned int unused[3]; 390 unsigned int unused[3];
391 unsigned int data; 391 unsigned int data;
392 unsigned int unused2[11];
393 unsigned int eoi;
392}; 394};
393 395
394static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 396static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
@@ -397,6 +399,12 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
397 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); 399 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
398} 400}
399 401
402static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
403{
404 struct io_apic __iomem *io_apic = io_apic_base(apic);
405 writel(vector, &io_apic->eoi);
406}
407
400static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 408static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
401{ 409{
402 struct io_apic __iomem *io_apic = io_apic_base(apic); 410 struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -546,16 +554,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
546 554
547 apic = entry->apic; 555 apic = entry->apic;
548 pin = entry->pin; 556 pin = entry->pin;
549#ifdef CONFIG_INTR_REMAP
550 /* 557 /*
551 * With interrupt-remapping, destination information comes 558 * With interrupt-remapping, destination information comes
552 * from interrupt-remapping table entry. 559 * from interrupt-remapping table entry.
553 */ 560 */
554 if (!irq_remapped(irq)) 561 if (!irq_remapped(irq))
555 io_apic_write(apic, 0x11 + pin*2, dest); 562 io_apic_write(apic, 0x11 + pin*2, dest);
556#else
557 io_apic_write(apic, 0x11 + pin*2, dest);
558#endif
559 reg = io_apic_read(apic, 0x10 + pin*2); 563 reg = io_apic_read(apic, 0x10 + pin*2);
560 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 564 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
561 reg |= vector; 565 reg |= vector;
@@ -588,10 +592,12 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
588 if (assign_irq_vector(irq, cfg, mask)) 592 if (assign_irq_vector(irq, cfg, mask))
589 return BAD_APICID; 593 return BAD_APICID;
590 594
591 cpumask_and(desc->affinity, cfg->domain, mask); 595 /* check that before desc->addinity get updated */
592 set_extra_move_desc(desc, mask); 596 set_extra_move_desc(desc, mask);
593 597
594 return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask); 598 cpumask_copy(desc->affinity, mask);
599
600 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
595} 601}
596 602
597static void 603static void
@@ -849,9 +855,9 @@ __setup("pirq=", ioapic_pirq_setup);
849static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; 855static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
850 856
851/* 857/*
852 * Saves and masks all the unmasked IO-APIC RTE's 858 * Saves all the IO-APIC RTE's
853 */ 859 */
854int save_mask_IO_APIC_setup(void) 860int save_IO_APIC_setup(void)
855{ 861{
856 union IO_APIC_reg_01 reg_01; 862 union IO_APIC_reg_01 reg_01;
857 unsigned long flags; 863 unsigned long flags;
@@ -876,16 +882,9 @@ int save_mask_IO_APIC_setup(void)
876 } 882 }
877 883
878 for (apic = 0; apic < nr_ioapics; apic++) 884 for (apic = 0; apic < nr_ioapics; apic++)
879 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 885 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
880 struct IO_APIC_route_entry entry; 886 early_ioapic_entries[apic][pin] =
881
882 entry = early_ioapic_entries[apic][pin] =
883 ioapic_read_entry(apic, pin); 887 ioapic_read_entry(apic, pin);
884 if (!entry.mask) {
885 entry.mask = 1;
886 ioapic_write_entry(apic, pin, entry);
887 }
888 }
889 888
890 return 0; 889 return 0;
891 890
@@ -898,6 +897,25 @@ nomem:
898 return -ENOMEM; 897 return -ENOMEM;
899} 898}
900 899
900void mask_IO_APIC_setup(void)
901{
902 int apic, pin;
903
904 for (apic = 0; apic < nr_ioapics; apic++) {
905 if (!early_ioapic_entries[apic])
906 break;
907 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
908 struct IO_APIC_route_entry entry;
909
910 entry = early_ioapic_entries[apic][pin];
911 if (!entry.mask) {
912 entry.mask = 1;
913 ioapic_write_entry(apic, pin, entry);
914 }
915 }
916 }
917}
918
901void restore_IO_APIC_setup(void) 919void restore_IO_APIC_setup(void)
902{ 920{
903 int apic, pin; 921 int apic, pin;
@@ -1411,9 +1429,7 @@ void __setup_vector_irq(int cpu)
1411} 1429}
1412 1430
1413static struct irq_chip ioapic_chip; 1431static struct irq_chip ioapic_chip;
1414#ifdef CONFIG_INTR_REMAP
1415static struct irq_chip ir_ioapic_chip; 1432static struct irq_chip ir_ioapic_chip;
1416#endif
1417 1433
1418#define IOAPIC_AUTO -1 1434#define IOAPIC_AUTO -1
1419#define IOAPIC_EDGE 0 1435#define IOAPIC_EDGE 0
@@ -1452,7 +1468,6 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1452 else 1468 else
1453 desc->status &= ~IRQ_LEVEL; 1469 desc->status &= ~IRQ_LEVEL;
1454 1470
1455#ifdef CONFIG_INTR_REMAP
1456 if (irq_remapped(irq)) { 1471 if (irq_remapped(irq)) {
1457 desc->status |= IRQ_MOVE_PCNTXT; 1472 desc->status |= IRQ_MOVE_PCNTXT;
1458 if (trigger) 1473 if (trigger)
@@ -1464,7 +1479,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1464 handle_edge_irq, "edge"); 1479 handle_edge_irq, "edge");
1465 return; 1480 return;
1466 } 1481 }
1467#endif 1482
1468 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1483 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1469 trigger == IOAPIC_LEVEL) 1484 trigger == IOAPIC_LEVEL)
1470 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1485 set_irq_chip_and_handler_name(irq, &ioapic_chip,
@@ -1478,14 +1493,13 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1478int setup_ioapic_entry(int apic_id, int irq, 1493int setup_ioapic_entry(int apic_id, int irq,
1479 struct IO_APIC_route_entry *entry, 1494 struct IO_APIC_route_entry *entry,
1480 unsigned int destination, int trigger, 1495 unsigned int destination, int trigger,
1481 int polarity, int vector) 1496 int polarity, int vector, int pin)
1482{ 1497{
1483 /* 1498 /*
1484 * add it to the IO-APIC irq-routing table: 1499 * add it to the IO-APIC irq-routing table:
1485 */ 1500 */
1486 memset(entry,0,sizeof(*entry)); 1501 memset(entry,0,sizeof(*entry));
1487 1502
1488#ifdef CONFIG_INTR_REMAP
1489 if (intr_remapping_enabled) { 1503 if (intr_remapping_enabled) {
1490 struct intel_iommu *iommu = map_ioapic_to_ir(apic_id); 1504 struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
1491 struct irte irte; 1505 struct irte irte;
@@ -1504,7 +1518,14 @@ int setup_ioapic_entry(int apic_id, int irq,
1504 1518
1505 irte.present = 1; 1519 irte.present = 1;
1506 irte.dst_mode = apic->irq_dest_mode; 1520 irte.dst_mode = apic->irq_dest_mode;
1507 irte.trigger_mode = trigger; 1521 /*
1522 * Trigger mode in the IRTE will always be edge, and the
1523 * actual level or edge trigger will be setup in the IO-APIC
1524 * RTE. This will help simplify level triggered irq migration.
1525 * For more details, see the comments above explainig IO-APIC
1526 * irq migration in the presence of interrupt-remapping.
1527 */
1528 irte.trigger_mode = 0;
1508 irte.dlvry_mode = apic->irq_delivery_mode; 1529 irte.dlvry_mode = apic->irq_delivery_mode;
1509 irte.vector = vector; 1530 irte.vector = vector;
1510 irte.dest_id = IRTE_DEST(destination); 1531 irte.dest_id = IRTE_DEST(destination);
@@ -1515,18 +1536,21 @@ int setup_ioapic_entry(int apic_id, int irq,
1515 ir_entry->zero = 0; 1536 ir_entry->zero = 0;
1516 ir_entry->format = 1; 1537 ir_entry->format = 1;
1517 ir_entry->index = (index & 0x7fff); 1538 ir_entry->index = (index & 0x7fff);
1518 } else 1539 /*
1519#endif 1540 * IO-APIC RTE will be configured with virtual vector.
1520 { 1541 * irq handler will do the explicit EOI to the io-apic.
1542 */
1543 ir_entry->vector = pin;
1544 } else {
1521 entry->delivery_mode = apic->irq_delivery_mode; 1545 entry->delivery_mode = apic->irq_delivery_mode;
1522 entry->dest_mode = apic->irq_dest_mode; 1546 entry->dest_mode = apic->irq_dest_mode;
1523 entry->dest = destination; 1547 entry->dest = destination;
1548 entry->vector = vector;
1524 } 1549 }
1525 1550
1526 entry->mask = 0; /* enable IRQ */ 1551 entry->mask = 0; /* enable IRQ */
1527 entry->trigger = trigger; 1552 entry->trigger = trigger;
1528 entry->polarity = polarity; 1553 entry->polarity = polarity;
1529 entry->vector = vector;
1530 1554
1531 /* Mask level triggered irqs. 1555 /* Mask level triggered irqs.
1532 * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 1556 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
@@ -1561,7 +1585,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1561 1585
1562 1586
1563 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, 1587 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
1564 dest, trigger, polarity, cfg->vector)) { 1588 dest, trigger, polarity, cfg->vector, pin)) {
1565 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1589 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1566 mp_ioapics[apic_id].apicid, pin); 1590 mp_ioapics[apic_id].apicid, pin);
1567 __clear_irq_vector(irq, cfg); 1591 __clear_irq_vector(irq, cfg);
@@ -1642,10 +1666,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1642{ 1666{
1643 struct IO_APIC_route_entry entry; 1667 struct IO_APIC_route_entry entry;
1644 1668
1645#ifdef CONFIG_INTR_REMAP
1646 if (intr_remapping_enabled) 1669 if (intr_remapping_enabled)
1647 return; 1670 return;
1648#endif
1649 1671
1650 memset(&entry, 0, sizeof(entry)); 1672 memset(&entry, 0, sizeof(entry));
1651 1673
@@ -2040,8 +2062,13 @@ void disable_IO_APIC(void)
2040 * If the i8259 is routed through an IOAPIC 2062 * If the i8259 is routed through an IOAPIC
2041 * Put that IOAPIC in virtual wire mode 2063 * Put that IOAPIC in virtual wire mode
2042 * so legacy interrupts can be delivered. 2064 * so legacy interrupts can be delivered.
2065 *
2066 * With interrupt-remapping, for now we will use virtual wire A mode,
2067 * as virtual wire B is little complex (need to configure both
2068 * IOAPIC RTE aswell as interrupt-remapping table entry).
2069 * As this gets called during crash dump, keep this simple for now.
2043 */ 2070 */
2044 if (ioapic_i8259.pin != -1) { 2071 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
2045 struct IO_APIC_route_entry entry; 2072 struct IO_APIC_route_entry entry;
2046 2073
2047 memset(&entry, 0, sizeof(entry)); 2074 memset(&entry, 0, sizeof(entry));
@@ -2061,7 +2088,10 @@ void disable_IO_APIC(void)
2061 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); 2088 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
2062 } 2089 }
2063 2090
2064 disconnect_bsp_APIC(ioapic_i8259.pin != -1); 2091 /*
2092 * Use virtual wire A mode when interrupt remapping is enabled.
2093 */
2094 disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
2065} 2095}
2066 2096
2067#ifdef CONFIG_X86_32 2097#ifdef CONFIG_X86_32
@@ -2303,37 +2333,24 @@ static int ioapic_retrigger_irq(unsigned int irq)
2303#ifdef CONFIG_SMP 2333#ifdef CONFIG_SMP
2304 2334
2305#ifdef CONFIG_INTR_REMAP 2335#ifdef CONFIG_INTR_REMAP
2306static void ir_irq_migration(struct work_struct *work);
2307
2308static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
2309 2336
2310/* 2337/*
2311 * Migrate the IO-APIC irq in the presence of intr-remapping. 2338 * Migrate the IO-APIC irq in the presence of intr-remapping.
2312 * 2339 *
2313 * For edge triggered, irq migration is a simple atomic update(of vector 2340 * For both level and edge triggered, irq migration is a simple atomic
2314 * and cpu destination) of IRTE and flush the hardware cache. 2341 * update(of vector and cpu destination) of IRTE and flush the hardware cache.
2315 *
2316 * For level triggered, we need to modify the io-apic RTE aswell with the update
2317 * vector information, along with modifying IRTE with vector and destination.
2318 * So irq migration for level triggered is little bit more complex compared to
2319 * edge triggered migration. But the good news is, we use the same algorithm
2320 * for level triggered migration as we have today, only difference being,
2321 * we now initiate the irq migration from process context instead of the
2322 * interrupt context.
2323 * 2342 *
2324 * In future, when we do a directed EOI (combined with cpu EOI broadcast 2343 * For level triggered, we eliminate the io-apic RTE modification (with the
2325 * suppression) to the IO-APIC, level triggered irq migration will also be 2344 * updated vector information), by using a virtual vector (io-apic pin number).
2326 * as simple as edge triggered migration and we can do the irq migration 2345 * Real vector that is used for interrupting cpu will be coming from
2327 * with a simple atomic update to IO-APIC RTE. 2346 * the interrupt-remapping table entry.
2328 */ 2347 */
2329static void 2348static void
2330migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2349migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2331{ 2350{
2332 struct irq_cfg *cfg; 2351 struct irq_cfg *cfg;
2333 struct irte irte; 2352 struct irte irte;
2334 int modify_ioapic_rte;
2335 unsigned int dest; 2353 unsigned int dest;
2336 unsigned long flags;
2337 unsigned int irq; 2354 unsigned int irq;
2338 2355
2339 if (!cpumask_intersects(mask, cpu_online_mask)) 2356 if (!cpumask_intersects(mask, cpu_online_mask))
@@ -2351,13 +2368,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2351 2368
2352 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); 2369 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2353 2370
2354 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2355 if (modify_ioapic_rte) {
2356 spin_lock_irqsave(&ioapic_lock, flags);
2357 __target_IO_APIC_irq(irq, dest, cfg);
2358 spin_unlock_irqrestore(&ioapic_lock, flags);
2359 }
2360
2361 irte.vector = cfg->vector; 2371 irte.vector = cfg->vector;
2362 irte.dest_id = IRTE_DEST(dest); 2372 irte.dest_id = IRTE_DEST(dest);
2363 2373
@@ -2372,73 +2382,12 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2372 cpumask_copy(desc->affinity, mask); 2382 cpumask_copy(desc->affinity, mask);
2373} 2383}
2374 2384
2375static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2376{
2377 int ret = -1;
2378 struct irq_cfg *cfg = desc->chip_data;
2379
2380 mask_IO_APIC_irq_desc(desc);
2381
2382 if (io_apic_level_ack_pending(cfg)) {
2383 /*
2384 * Interrupt in progress. Migrating irq now will change the
2385 * vector information in the IO-APIC RTE and that will confuse
2386 * the EOI broadcast performed by cpu.
2387 * So, delay the irq migration to the next instance.
2388 */
2389 schedule_delayed_work(&ir_migration_work, 1);
2390 goto unmask;
2391 }
2392
2393 /* everthing is clear. we have right of way */
2394 migrate_ioapic_irq_desc(desc, desc->pending_mask);
2395
2396 ret = 0;
2397 desc->status &= ~IRQ_MOVE_PENDING;
2398 cpumask_clear(desc->pending_mask);
2399
2400unmask:
2401 unmask_IO_APIC_irq_desc(desc);
2402
2403 return ret;
2404}
2405
2406static void ir_irq_migration(struct work_struct *work)
2407{
2408 unsigned int irq;
2409 struct irq_desc *desc;
2410
2411 for_each_irq_desc(irq, desc) {
2412 if (desc->status & IRQ_MOVE_PENDING) {
2413 unsigned long flags;
2414
2415 spin_lock_irqsave(&desc->lock, flags);
2416 if (!desc->chip->set_affinity ||
2417 !(desc->status & IRQ_MOVE_PENDING)) {
2418 desc->status &= ~IRQ_MOVE_PENDING;
2419 spin_unlock_irqrestore(&desc->lock, flags);
2420 continue;
2421 }
2422
2423 desc->chip->set_affinity(irq, desc->pending_mask);
2424 spin_unlock_irqrestore(&desc->lock, flags);
2425 }
2426 }
2427}
2428
2429/* 2385/*
2430 * Migrates the IRQ destination in the process context. 2386 * Migrates the IRQ destination in the process context.
2431 */ 2387 */
2432static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2388static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2433 const struct cpumask *mask) 2389 const struct cpumask *mask)
2434{ 2390{
2435 if (desc->status & IRQ_LEVEL) {
2436 desc->status |= IRQ_MOVE_PENDING;
2437 cpumask_copy(desc->pending_mask, mask);
2438 migrate_irq_remapped_level_desc(desc);
2439 return;
2440 }
2441
2442 migrate_ioapic_irq_desc(desc, mask); 2391 migrate_ioapic_irq_desc(desc, mask);
2443} 2392}
2444static void set_ir_ioapic_affinity_irq(unsigned int irq, 2393static void set_ir_ioapic_affinity_irq(unsigned int irq,
@@ -2448,6 +2397,11 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq,
2448 2397
2449 set_ir_ioapic_affinity_irq_desc(desc, mask); 2398 set_ir_ioapic_affinity_irq_desc(desc, mask);
2450} 2399}
2400#else
2401static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2402 const struct cpumask *mask)
2403{
2404}
2451#endif 2405#endif
2452 2406
2453asmlinkage void smp_irq_move_cleanup_interrupt(void) 2407asmlinkage void smp_irq_move_cleanup_interrupt(void)
@@ -2461,6 +2415,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2461 me = smp_processor_id(); 2415 me = smp_processor_id();
2462 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 2416 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
2463 unsigned int irq; 2417 unsigned int irq;
2418 unsigned int irr;
2464 struct irq_desc *desc; 2419 struct irq_desc *desc;
2465 struct irq_cfg *cfg; 2420 struct irq_cfg *cfg;
2466 irq = __get_cpu_var(vector_irq)[vector]; 2421 irq = __get_cpu_var(vector_irq)[vector];
@@ -2480,6 +2435,18 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2480 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2435 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2481 goto unlock; 2436 goto unlock;
2482 2437
2438 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
2439 /*
2440 * Check if the vector that needs to be cleanedup is
2441 * registered at the cpu's IRR. If so, then this is not
2442 * the best time to clean it up. Lets clean it up in the
2443 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
2444 * to myself.
2445 */
2446 if (irr & (1 << (vector % 32))) {
2447 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
2448 goto unlock;
2449 }
2483 __get_cpu_var(vector_irq)[vector] = -1; 2450 __get_cpu_var(vector_irq)[vector] = -1;
2484 cfg->move_cleanup_count--; 2451 cfg->move_cleanup_count--;
2485unlock: 2452unlock:
@@ -2529,9 +2496,44 @@ static inline void irq_complete_move(struct irq_desc **descp) {}
2529#endif 2496#endif
2530 2497
2531#ifdef CONFIG_INTR_REMAP 2498#ifdef CONFIG_INTR_REMAP
2499static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2500{
2501 int apic, pin;
2502 struct irq_pin_list *entry;
2503
2504 entry = cfg->irq_2_pin;
2505 for (;;) {
2506
2507 if (!entry)
2508 break;
2509
2510 apic = entry->apic;
2511 pin = entry->pin;
2512 io_apic_eoi(apic, pin);
2513 entry = entry->next;
2514 }
2515}
2516
2517static void
2518eoi_ioapic_irq(struct irq_desc *desc)
2519{
2520 struct irq_cfg *cfg;
2521 unsigned long flags;
2522 unsigned int irq;
2523
2524 irq = desc->irq;
2525 cfg = desc->chip_data;
2526
2527 spin_lock_irqsave(&ioapic_lock, flags);
2528 __eoi_ioapic_irq(irq, cfg);
2529 spin_unlock_irqrestore(&ioapic_lock, flags);
2530}
2531
2532static void ack_x2apic_level(unsigned int irq) 2532static void ack_x2apic_level(unsigned int irq)
2533{ 2533{
2534 struct irq_desc *desc = irq_to_desc(irq);
2534 ack_x2APIC_irq(); 2535 ack_x2APIC_irq();
2536 eoi_ioapic_irq(desc);
2535} 2537}
2536 2538
2537static void ack_x2apic_edge(unsigned int irq) 2539static void ack_x2apic_edge(unsigned int irq)
@@ -2662,20 +2664,20 @@ static struct irq_chip ioapic_chip __read_mostly = {
2662 .retrigger = ioapic_retrigger_irq, 2664 .retrigger = ioapic_retrigger_irq,
2663}; 2665};
2664 2666
2665#ifdef CONFIG_INTR_REMAP
2666static struct irq_chip ir_ioapic_chip __read_mostly = { 2667static struct irq_chip ir_ioapic_chip __read_mostly = {
2667 .name = "IR-IO-APIC", 2668 .name = "IR-IO-APIC",
2668 .startup = startup_ioapic_irq, 2669 .startup = startup_ioapic_irq,
2669 .mask = mask_IO_APIC_irq, 2670 .mask = mask_IO_APIC_irq,
2670 .unmask = unmask_IO_APIC_irq, 2671 .unmask = unmask_IO_APIC_irq,
2672#ifdef CONFIG_INTR_REMAP
2671 .ack = ack_x2apic_edge, 2673 .ack = ack_x2apic_edge,
2672 .eoi = ack_x2apic_level, 2674 .eoi = ack_x2apic_level,
2673#ifdef CONFIG_SMP 2675#ifdef CONFIG_SMP
2674 .set_affinity = set_ir_ioapic_affinity_irq, 2676 .set_affinity = set_ir_ioapic_affinity_irq,
2675#endif 2677#endif
2678#endif
2676 .retrigger = ioapic_retrigger_irq, 2679 .retrigger = ioapic_retrigger_irq,
2677}; 2680};
2678#endif
2679 2681
2680static inline void init_IO_APIC_traps(void) 2682static inline void init_IO_APIC_traps(void)
2681{ 2683{
@@ -2901,10 +2903,8 @@ static inline void __init check_timer(void)
2901 * 8259A. 2903 * 8259A.
2902 */ 2904 */
2903 if (pin1 == -1) { 2905 if (pin1 == -1) {
2904#ifdef CONFIG_INTR_REMAP
2905 if (intr_remapping_enabled) 2906 if (intr_remapping_enabled)
2906 panic("BIOS bug: timer not connected to IO-APIC"); 2907 panic("BIOS bug: timer not connected to IO-APIC");
2907#endif
2908 pin1 = pin2; 2908 pin1 = pin2;
2909 apic1 = apic2; 2909 apic1 = apic2;
2910 no_pin1 = 1; 2910 no_pin1 = 1;
@@ -2940,10 +2940,8 @@ static inline void __init check_timer(void)
2940 clear_IO_APIC_pin(0, pin1); 2940 clear_IO_APIC_pin(0, pin1);
2941 goto out; 2941 goto out;
2942 } 2942 }
2943#ifdef CONFIG_INTR_REMAP
2944 if (intr_remapping_enabled) 2943 if (intr_remapping_enabled)
2945 panic("timer doesn't work through Interrupt-remapped IO-APIC"); 2944 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2946#endif
2947 local_irq_disable(); 2945 local_irq_disable();
2948 clear_IO_APIC_pin(apic1, pin1); 2946 clear_IO_APIC_pin(apic1, pin1);
2949 if (!no_pin1) 2947 if (!no_pin1)
@@ -3237,9 +3235,7 @@ void destroy_irq(unsigned int irq)
3237 if (desc) 3235 if (desc)
3238 desc->chip_data = cfg; 3236 desc->chip_data = cfg;
3239 3237
3240#ifdef CONFIG_INTR_REMAP
3241 free_irte(irq); 3238 free_irte(irq);
3242#endif
3243 spin_lock_irqsave(&vector_lock, flags); 3239 spin_lock_irqsave(&vector_lock, flags);
3244 __clear_irq_vector(irq, cfg); 3240 __clear_irq_vector(irq, cfg);
3245 spin_unlock_irqrestore(&vector_lock, flags); 3241 spin_unlock_irqrestore(&vector_lock, flags);
@@ -3265,7 +3261,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3265 3261
3266 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3262 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3267 3263
3268#ifdef CONFIG_INTR_REMAP
3269 if (irq_remapped(irq)) { 3264 if (irq_remapped(irq)) {
3270 struct irte irte; 3265 struct irte irte;
3271 int ir_index; 3266 int ir_index;
@@ -3291,10 +3286,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3291 MSI_ADDR_IR_SHV | 3286 MSI_ADDR_IR_SHV |
3292 MSI_ADDR_IR_INDEX1(ir_index) | 3287 MSI_ADDR_IR_INDEX1(ir_index) |
3293 MSI_ADDR_IR_INDEX2(ir_index); 3288 MSI_ADDR_IR_INDEX2(ir_index);
3294 } else 3289 } else {
3295#endif 3290 if (x2apic_enabled())
3296 { 3291 msg->address_hi = MSI_ADDR_BASE_HI |
3297 msg->address_hi = MSI_ADDR_BASE_HI; 3292 MSI_ADDR_EXT_DEST_ID(dest);
3293 else
3294 msg->address_hi = MSI_ADDR_BASE_HI;
3295
3298 msg->address_lo = 3296 msg->address_lo =
3299 MSI_ADDR_BASE_LO | 3297 MSI_ADDR_BASE_LO |
3300 ((apic->irq_dest_mode == 0) ? 3298 ((apic->irq_dest_mode == 0) ?
@@ -3394,15 +3392,16 @@ static struct irq_chip msi_chip = {
3394 .retrigger = ioapic_retrigger_irq, 3392 .retrigger = ioapic_retrigger_irq,
3395}; 3393};
3396 3394
3397#ifdef CONFIG_INTR_REMAP
3398static struct irq_chip msi_ir_chip = { 3395static struct irq_chip msi_ir_chip = {
3399 .name = "IR-PCI-MSI", 3396 .name = "IR-PCI-MSI",
3400 .unmask = unmask_msi_irq, 3397 .unmask = unmask_msi_irq,
3401 .mask = mask_msi_irq, 3398 .mask = mask_msi_irq,
3399#ifdef CONFIG_INTR_REMAP
3402 .ack = ack_x2apic_edge, 3400 .ack = ack_x2apic_edge,
3403#ifdef CONFIG_SMP 3401#ifdef CONFIG_SMP
3404 .set_affinity = ir_set_msi_irq_affinity, 3402 .set_affinity = ir_set_msi_irq_affinity,
3405#endif 3403#endif
3404#endif
3406 .retrigger = ioapic_retrigger_irq, 3405 .retrigger = ioapic_retrigger_irq,
3407}; 3406};
3408 3407
@@ -3432,7 +3431,6 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3432 } 3431 }
3433 return index; 3432 return index;
3434} 3433}
3435#endif
3436 3434
3437static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3435static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3438{ 3436{
@@ -3446,7 +3444,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3446 set_irq_msi(irq, msidesc); 3444 set_irq_msi(irq, msidesc);
3447 write_msi_msg(irq, &msg); 3445 write_msi_msg(irq, &msg);
3448 3446
3449#ifdef CONFIG_INTR_REMAP
3450 if (irq_remapped(irq)) { 3447 if (irq_remapped(irq)) {
3451 struct irq_desc *desc = irq_to_desc(irq); 3448 struct irq_desc *desc = irq_to_desc(irq);
3452 /* 3449 /*
@@ -3455,7 +3452,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3455 desc->status |= IRQ_MOVE_PCNTXT; 3452 desc->status |= IRQ_MOVE_PCNTXT;
3456 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); 3453 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
3457 } else 3454 } else
3458#endif
3459 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 3455 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
3460 3456
3461 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); 3457 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
@@ -3469,11 +3465,12 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3469 int ret, sub_handle; 3465 int ret, sub_handle;
3470 struct msi_desc *msidesc; 3466 struct msi_desc *msidesc;
3471 unsigned int irq_want; 3467 unsigned int irq_want;
3472 3468 struct intel_iommu *iommu = NULL;
3473#ifdef CONFIG_INTR_REMAP
3474 struct intel_iommu *iommu = 0;
3475 int index = 0; 3469 int index = 0;
3476#endif 3470
3471 /* x86 doesn't support multiple MSI yet */
3472 if (type == PCI_CAP_ID_MSI && nvec > 1)
3473 return 1;
3477 3474
3478 irq_want = nr_irqs_gsi; 3475 irq_want = nr_irqs_gsi;
3479 sub_handle = 0; 3476 sub_handle = 0;
@@ -3482,7 +3479,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3482 if (irq == 0) 3479 if (irq == 0)
3483 return -1; 3480 return -1;
3484 irq_want = irq + 1; 3481 irq_want = irq + 1;
3485#ifdef CONFIG_INTR_REMAP
3486 if (!intr_remapping_enabled) 3482 if (!intr_remapping_enabled)
3487 goto no_ir; 3483 goto no_ir;
3488 3484
@@ -3510,7 +3506,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3510 set_irte_irq(irq, iommu, index, sub_handle); 3506 set_irte_irq(irq, iommu, index, sub_handle);
3511 } 3507 }
3512no_ir: 3508no_ir:
3513#endif
3514 ret = setup_msi_irq(dev, msidesc, irq); 3509 ret = setup_msi_irq(dev, msidesc, irq);
3515 if (ret < 0) 3510 if (ret < 0)
3516 goto error; 3511 goto error;
@@ -3528,7 +3523,7 @@ void arch_teardown_msi_irq(unsigned int irq)
3528 destroy_irq(irq); 3523 destroy_irq(irq);
3529} 3524}
3530 3525
3531#ifdef CONFIG_DMAR 3526#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
3532#ifdef CONFIG_SMP 3527#ifdef CONFIG_SMP
3533static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3528static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3534{ 3529{
@@ -3609,7 +3604,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3609 3604
3610#endif /* CONFIG_SMP */ 3605#endif /* CONFIG_SMP */
3611 3606
3612struct irq_chip hpet_msi_type = { 3607static struct irq_chip hpet_msi_type = {
3613 .name = "HPET_MSI", 3608 .name = "HPET_MSI",
3614 .unmask = hpet_msi_unmask, 3609 .unmask = hpet_msi_unmask,
3615 .mask = hpet_msi_mask, 3610 .mask = hpet_msi_mask,
@@ -4045,11 +4040,9 @@ void __init setup_ioapic_dest(void)
4045 else 4040 else
4046 mask = apic->target_cpus(); 4041 mask = apic->target_cpus();
4047 4042
4048#ifdef CONFIG_INTR_REMAP
4049 if (intr_remapping_enabled) 4043 if (intr_remapping_enabled)
4050 set_ir_ioapic_affinity_irq_desc(desc, mask); 4044 set_ir_ioapic_affinity_irq_desc(desc, mask);
4051 else 4045 else
4052#endif
4053 set_ioapic_affinity_irq_desc(desc, mask); 4046 set_ioapic_affinity_irq_desc(desc, mask);
4054 } 4047 }
4055 4048
@@ -4142,9 +4135,12 @@ static int __init ioapic_insert_resources(void)
4142 struct resource *r = ioapic_resources; 4135 struct resource *r = ioapic_resources;
4143 4136
4144 if (!r) { 4137 if (!r) {
4145 printk(KERN_ERR 4138 if (nr_ioapics > 0) {
4146 "IO APIC resources could be not be allocated.\n"); 4139 printk(KERN_ERR
4147 return -1; 4140 "IO APIC resources couldn't be allocated.\n");
4141 return -1;
4142 }
4143 return 0;
4148 } 4144 }
4149 4145
4150 for (i = 0; i < nr_ioapics; i++) { 4146 for (i = 0; i < nr_ioapics; i++) {
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index bdfad80c3cf1..d6bd62407152 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,7 @@
39int unknown_nmi_panic; 39int unknown_nmi_panic;
40int nmi_watchdog_enabled; 40int nmi_watchdog_enabled;
41 41
42static cpumask_t backtrace_mask = CPU_MASK_NONE; 42static cpumask_var_t backtrace_mask;
43 43
44/* nmi_active: 44/* nmi_active:
45 * >0: the lapic NMI watchdog is active, but can be disabled 45 * >0: the lapic NMI watchdog is active, but can be disabled
@@ -138,6 +138,7 @@ int __init check_nmi_watchdog(void)
138 if (!prev_nmi_count) 138 if (!prev_nmi_count)
139 goto error; 139 goto error;
140 140
141 alloc_cpumask_var(&backtrace_mask, GFP_KERNEL);
141 printk(KERN_INFO "Testing NMI watchdog ... "); 142 printk(KERN_INFO "Testing NMI watchdog ... ");
142 143
143#ifdef CONFIG_SMP 144#ifdef CONFIG_SMP
@@ -413,14 +414,14 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
413 touched = 1; 414 touched = 1;
414 } 415 }
415 416
416 if (cpu_isset(cpu, backtrace_mask)) { 417 if (cpumask_test_cpu(cpu, backtrace_mask)) {
417 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 418 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
418 419
419 spin_lock(&lock); 420 spin_lock(&lock);
420 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 421 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
421 dump_stack(); 422 dump_stack();
422 spin_unlock(&lock); 423 spin_unlock(&lock);
423 cpu_clear(cpu, backtrace_mask); 424 cpumask_clear_cpu(cpu, backtrace_mask);
424 } 425 }
425 426
426 /* Could check oops_in_progress here too, but it's safer not to */ 427 /* Could check oops_in_progress here too, but it's safer not to */
@@ -554,10 +555,10 @@ void __trigger_all_cpu_backtrace(void)
554{ 555{
555 int i; 556 int i;
556 557
557 backtrace_mask = cpu_online_map; 558 cpumask_copy(backtrace_mask, cpu_online_mask);
558 /* Wait for up to 10 seconds for all CPUs to do the backtrace */ 559 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
559 for (i = 0; i < 10 * 1000; i++) { 560 for (i = 0; i < 10 * 1000; i++) {
560 if (cpus_empty(backtrace_mask)) 561 if (cpumask_empty(backtrace_mask))
561 break; 562 break;
562 mdelay(1); 563 mdelay(1);
563 } 564 }
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index d9d6d61eed82..533e59c6fc82 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -69,7 +69,7 @@ struct mpc_trans {
69/* x86_quirks member */ 69/* x86_quirks member */
70static int mpc_record; 70static int mpc_record;
71 71
72static __cpuinitdata struct mpc_trans *translation_table[MAX_MPC_ENTRY]; 72static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
73 73
74int mp_bus_id_to_node[MAX_MP_BUSSES]; 74int mp_bus_id_to_node[MAX_MP_BUSSES];
75int mp_bus_id_to_local[MAX_MP_BUSSES]; 75int mp_bus_id_to_local[MAX_MP_BUSSES];
@@ -256,13 +256,6 @@ static int __init numaq_setup_ioapic_ids(void)
256 return 1; 256 return 1;
257} 257}
258 258
259static int __init numaq_update_apic(void)
260{
261 apic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
262
263 return 0;
264}
265
266static struct x86_quirks numaq_x86_quirks __initdata = { 259static struct x86_quirks numaq_x86_quirks __initdata = {
267 .arch_pre_time_init = numaq_pre_time_init, 260 .arch_pre_time_init = numaq_pre_time_init,
268 .arch_time_init = NULL, 261 .arch_time_init = NULL,
@@ -278,7 +271,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
278 .mpc_oem_pci_bus = mpc_oem_pci_bus, 271 .mpc_oem_pci_bus = mpc_oem_pci_bus,
279 .smp_read_mpc_oem = smp_read_mpc_oem, 272 .smp_read_mpc_oem = smp_read_mpc_oem,
280 .setup_ioapic_ids = numaq_setup_ioapic_ids, 273 .setup_ioapic_ids = numaq_setup_ioapic_ids,
281 .update_apic = numaq_update_apic,
282}; 274};
283 275
284static __init void early_check_numaq(void) 276static __init void early_check_numaq(void)
@@ -342,9 +334,9 @@ static inline void numaq_smp_callin_clear_local_apic(void)
342 clear_local_APIC(); 334 clear_local_APIC();
343} 335}
344 336
345static inline const cpumask_t *numaq_target_cpus(void) 337static inline const struct cpumask *numaq_target_cpus(void)
346{ 338{
347 return &CPU_MASK_ALL; 339 return cpu_all_mask;
348} 340}
349 341
350static inline unsigned long 342static inline unsigned long
@@ -435,7 +427,7 @@ static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid)
435 * We use physical apicids here, not logical, so just return the default 427 * We use physical apicids here, not logical, so just return the default
436 * physical broadcast to stop people from breaking us 428 * physical broadcast to stop people from breaking us
437 */ 429 */
438static inline unsigned int numaq_cpu_mask_to_apicid(const cpumask_t *cpumask) 430static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask)
439{ 431{
440 return 0x0F; 432 return 0x0F;
441} 433}
@@ -470,7 +462,7 @@ static int probe_numaq(void)
470 return found_numaq; 462 return found_numaq;
471} 463}
472 464
473static void numaq_vector_allocation_domain(int cpu, cpumask_t *retmask) 465static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask)
474{ 466{
475 /* Careful. Some cpus do not strictly honor the set of cpus 467 /* Careful. Some cpus do not strictly honor the set of cpus
476 * specified in the interrupt destination when using lowest 468 * specified in the interrupt destination when using lowest
@@ -480,7 +472,8 @@ static void numaq_vector_allocation_domain(int cpu, cpumask_t *retmask)
480 * deliver interrupts to the wrong hyperthread when only one 472 * deliver interrupts to the wrong hyperthread when only one
481 * hyperthread was specified in the interrupt desitination. 473 * hyperthread was specified in the interrupt desitination.
482 */ 474 */
483 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; 475 cpumask_clear(retmask);
476 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
484} 477}
485 478
486static void numaq_setup_portio_remap(void) 479static void numaq_setup_portio_remap(void)
@@ -546,7 +539,7 @@ struct apic apic_numaq = {
546 .send_IPI_all = numaq_send_IPI_all, 539 .send_IPI_all = numaq_send_IPI_all,
547 .send_IPI_self = default_send_IPI_self, 540 .send_IPI_self = default_send_IPI_self,
548 541
549 .wakeup_cpu = NULL, 542 .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi,
550 .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW, 543 .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW,
551 .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH, 544 .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH,
552 545
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 3a730fa574bb..01eda2ac65e4 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -83,7 +83,8 @@ static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
83 * deliver interrupts to the wrong hyperthread when only one 83 * deliver interrupts to the wrong hyperthread when only one
84 * hyperthread was specified in the interrupt desitination. 84 * hyperthread was specified in the interrupt desitination.
85 */ 85 */
86 *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } }; 86 cpumask_clear(retmask);
87 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
87} 88}
88 89
89/* should be called last. */ 90/* should be called last. */
@@ -138,7 +139,6 @@ struct apic apic_default = {
138 .send_IPI_all = default_send_IPI_all, 139 .send_IPI_all = default_send_IPI_all,
139 .send_IPI_self = default_send_IPI_self, 140 .send_IPI_self = default_send_IPI_self,
140 141
141 .wakeup_cpu = NULL,
142 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 142 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
143 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, 143 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
144 144
@@ -159,6 +159,7 @@ extern struct apic apic_numaq;
159extern struct apic apic_summit; 159extern struct apic apic_summit;
160extern struct apic apic_bigsmp; 160extern struct apic apic_bigsmp;
161extern struct apic apic_es7000; 161extern struct apic apic_es7000;
162extern struct apic apic_es7000_cluster;
162extern struct apic apic_default; 163extern struct apic apic_default;
163 164
164struct apic *apic = &apic_default; 165struct apic *apic = &apic_default;
@@ -176,6 +177,7 @@ static struct apic *apic_probe[] __initdata = {
176#endif 177#endif
177#ifdef CONFIG_X86_ES7000 178#ifdef CONFIG_X86_ES7000
178 &apic_es7000, 179 &apic_es7000,
180 &apic_es7000_cluster,
179#endif 181#endif
180 &apic_default, /* must be last */ 182 &apic_default, /* must be last */
181 NULL, 183 NULL,
@@ -197,9 +199,6 @@ static int __init parse_apic(char *arg)
197 } 199 }
198 } 200 }
199 201
200 if (x86_quirks->update_apic)
201 x86_quirks->update_apic();
202
203 /* Parsed again by __setup for debug/verbose */ 202 /* Parsed again by __setup for debug/verbose */
204 return 0; 203 return 0;
205} 204}
@@ -218,8 +217,6 @@ void __init generic_bigsmp_probe(void)
218 if (!cmdline_apic && apic == &apic_default) { 217 if (!cmdline_apic && apic == &apic_default) {
219 if (apic_bigsmp.probe()) { 218 if (apic_bigsmp.probe()) {
220 apic = &apic_bigsmp; 219 apic = &apic_bigsmp;
221 if (x86_quirks->update_apic)
222 x86_quirks->update_apic();
223 printk(KERN_INFO "Overriding APIC driver with %s\n", 220 printk(KERN_INFO "Overriding APIC driver with %s\n",
224 apic->name); 221 apic->name);
225 } 222 }
@@ -240,9 +237,6 @@ void __init generic_apic_probe(void)
240 /* Not visible without early console */ 237 /* Not visible without early console */
241 if (!apic_probe[i]) 238 if (!apic_probe[i])
242 panic("Didn't find an APIC driver"); 239 panic("Didn't find an APIC driver");
243
244 if (x86_quirks->update_apic)
245 x86_quirks->update_apic();
246 } 240 }
247 printk(KERN_INFO "Using APIC driver %s\n", apic->name); 241 printk(KERN_INFO "Using APIC driver %s\n", apic->name);
248} 242}
@@ -262,8 +256,6 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
262 256
263 if (!cmdline_apic) { 257 if (!cmdline_apic) {
264 apic = apic_probe[i]; 258 apic = apic_probe[i];
265 if (x86_quirks->update_apic)
266 x86_quirks->update_apic();
267 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 259 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
268 apic->name); 260 apic->name);
269 } 261 }
@@ -284,8 +276,6 @@ int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
284 276
285 if (!cmdline_apic) { 277 if (!cmdline_apic) {
286 apic = apic_probe[i]; 278 apic = apic_probe[i];
287 if (x86_quirks->update_apic)
288 x86_quirks->update_apic();
289 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 279 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
290 apic->name); 280 apic->name);
291 } 281 }
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index e7c163661c77..1783652bb0e5 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -69,8 +69,12 @@ void __init default_setup_apic_routing(void)
69 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 69 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
70 } 70 }
71 71
72 if (x86_quirks->update_apic) 72 /*
73 x86_quirks->update_apic(); 73 * Now that apic routing model is selected, configure the
74 * fault handling for intr remapping.
75 */
76 if (intr_remapping_enabled)
77 enable_drhd_fault_handling();
74} 78}
75 79
76/* Same for both flat and physical. */ 80/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 32838b57a945..9cfe1f415d81 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -53,23 +53,19 @@ static unsigned summit_get_apic_id(unsigned long x)
53 return (x >> 24) & 0xFF; 53 return (x >> 24) & 0xFF;
54} 54}
55 55
56static inline void summit_send_IPI_mask(const cpumask_t *mask, int vector) 56static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector)
57{ 57{
58 default_send_IPI_mask_sequence_logical(mask, vector); 58 default_send_IPI_mask_sequence_logical(mask, vector);
59} 59}
60 60
61static void summit_send_IPI_allbutself(int vector) 61static void summit_send_IPI_allbutself(int vector)
62{ 62{
63 cpumask_t mask = cpu_online_map; 63 default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
64 cpu_clear(smp_processor_id(), mask);
65
66 if (!cpus_empty(mask))
67 summit_send_IPI_mask(&mask, vector);
68} 64}
69 65
70static void summit_send_IPI_all(int vector) 66static void summit_send_IPI_all(int vector)
71{ 67{
72 summit_send_IPI_mask(&cpu_online_map, vector); 68 summit_send_IPI_mask(cpu_online_mask, vector);
73} 69}
74 70
75#include <asm/tsc.h> 71#include <asm/tsc.h>
@@ -77,9 +73,9 @@ static void summit_send_IPI_all(int vector)
77extern int use_cyclone; 73extern int use_cyclone;
78 74
79#ifdef CONFIG_X86_SUMMIT_NUMA 75#ifdef CONFIG_X86_SUMMIT_NUMA
80extern void setup_summit(void); 76static void setup_summit(void);
81#else 77#else
82#define setup_summit() {} 78static inline void setup_summit(void) {}
83#endif 79#endif
84 80
85static int summit_mps_oem_check(struct mpc_table *mpc, char *oem, 81static int summit_mps_oem_check(struct mpc_table *mpc, char *oem,
@@ -186,13 +182,13 @@ static inline int is_WPEG(struct rio_detail *rio){
186 182
187#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) 183#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
188 184
189static const cpumask_t *summit_target_cpus(void) 185static const struct cpumask *summit_target_cpus(void)
190{ 186{
191 /* CPU_MASK_ALL (0xff) has undefined behaviour with 187 /* CPU_MASK_ALL (0xff) has undefined behaviour with
192 * dest_LowestPrio mode logical clustered apic interrupt routing 188 * dest_LowestPrio mode logical clustered apic interrupt routing
193 * Just start on cpu 0. IRQ balancing will spread load 189 * Just start on cpu 0. IRQ balancing will spread load
194 */ 190 */
195 return &cpumask_of_cpu(0); 191 return cpumask_of(0);
196} 192}
197 193
198static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) 194static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid)
@@ -289,35 +285,23 @@ static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid)
289 return 1; 285 return 1;
290} 286}
291 287
292static unsigned int summit_cpu_mask_to_apicid(const cpumask_t *cpumask) 288static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
293{ 289{
294 int cpus_found = 0; 290 unsigned int round = 0;
295 int num_bits_set; 291 int cpu, apicid = 0;
296 int apicid;
297 int cpu;
298 292
299 num_bits_set = cpus_weight(*cpumask);
300 if (num_bits_set >= nr_cpu_ids)
301 return BAD_APICID;
302 /* 293 /*
303 * The cpus in the mask must all be on the apic cluster. 294 * The cpus in the mask must all be on the apic cluster.
304 */ 295 */
305 cpu = first_cpu(*cpumask); 296 for_each_cpu(cpu, cpumask) {
306 apicid = summit_cpu_to_logical_apicid(cpu); 297 int new_apicid = summit_cpu_to_logical_apicid(cpu);
307
308 while (cpus_found < num_bits_set) {
309 if (cpu_isset(cpu, *cpumask)) {
310 int new_apicid = summit_cpu_to_logical_apicid(cpu);
311 298
312 if (APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 299 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
313 printk("%s: Not a valid mask!\n", __func__); 300 printk("%s: Not a valid mask!\n", __func__);
314 301 return BAD_APICID;
315 return BAD_APICID;
316 }
317 apicid = apicid | new_apicid;
318 cpus_found++;
319 } 302 }
320 cpu++; 303 apicid |= new_apicid;
304 round++;
321 } 305 }
322 return apicid; 306 return apicid;
323} 307}
@@ -358,7 +342,7 @@ static int probe_summit(void)
358 return 0; 342 return 0;
359} 343}
360 344
361static void summit_vector_allocation_domain(int cpu, cpumask_t *retmask) 345static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask)
362{ 346{
363 /* Careful. Some cpus do not strictly honor the set of cpus 347 /* Careful. Some cpus do not strictly honor the set of cpus
364 * specified in the interrupt destination when using lowest 348 * specified in the interrupt destination when using lowest
@@ -368,19 +352,20 @@ static void summit_vector_allocation_domain(int cpu, cpumask_t *retmask)
368 * deliver interrupts to the wrong hyperthread when only one 352 * deliver interrupts to the wrong hyperthread when only one
369 * hyperthread was specified in the interrupt desitination. 353 * hyperthread was specified in the interrupt desitination.
370 */ 354 */
371 *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; 355 cpumask_clear(retmask);
356 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
372} 357}
373 358
374#ifdef CONFIG_X86_SUMMIT_NUMA 359#ifdef CONFIG_X86_SUMMIT_NUMA
375static struct rio_table_hdr *rio_table_hdr __initdata; 360static struct rio_table_hdr *rio_table_hdr;
376static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; 361static struct scal_detail *scal_devs[MAX_NUMNODES];
377static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata; 362static struct rio_detail *rio_devs[MAX_NUMNODES*4];
378 363
379#ifndef CONFIG_X86_NUMAQ 364#ifndef CONFIG_X86_NUMAQ
380static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata; 365static int mp_bus_id_to_node[MAX_MP_BUSSES];
381#endif 366#endif
382 367
383static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) 368static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
384{ 369{
385 int twister = 0, node = 0; 370 int twister = 0, node = 0;
386 int i, bus, num_buses; 371 int i, bus, num_buses;
@@ -442,7 +427,7 @@ static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
442 return bus; 427 return bus;
443} 428}
444 429
445static int __init build_detail_arrays(void) 430static int build_detail_arrays(void)
446{ 431{
447 unsigned long ptr; 432 unsigned long ptr;
448 int i, scal_detail_size, rio_detail_size; 433 int i, scal_detail_size, rio_detail_size;
@@ -476,7 +461,7 @@ static int __init build_detail_arrays(void)
476 return 1; 461 return 1;
477} 462}
478 463
479void __init setup_summit(void) 464void setup_summit(void)
480{ 465{
481 unsigned long ptr; 466 unsigned long ptr;
482 unsigned short offset; 467 unsigned short offset;
@@ -574,7 +559,6 @@ struct apic apic_summit = {
574 .send_IPI_all = summit_send_IPI_all, 559 .send_IPI_all = summit_send_IPI_all,
575 .send_IPI_self = default_send_IPI_self, 560 .send_IPI_self = default_send_IPI_self,
576 561
577 .wakeup_cpu = NULL,
578 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 562 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
579 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, 563 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
580 564
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 354b9c45601d..4a903e2f0d17 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -57,6 +57,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
57 unsigned long query_cpu; 57 unsigned long query_cpu;
58 unsigned long flags; 58 unsigned long flags;
59 59
60 x2apic_wrmsr_fence();
61
60 local_irq_save(flags); 62 local_irq_save(flags);
61 for_each_cpu(query_cpu, mask) { 63 for_each_cpu(query_cpu, mask) {
62 __x2apic_send_IPI_dest( 64 __x2apic_send_IPI_dest(
@@ -73,6 +75,8 @@ static void
73 unsigned long query_cpu; 75 unsigned long query_cpu;
74 unsigned long flags; 76 unsigned long flags;
75 77
78 x2apic_wrmsr_fence();
79
76 local_irq_save(flags); 80 local_irq_save(flags);
77 for_each_cpu(query_cpu, mask) { 81 for_each_cpu(query_cpu, mask) {
78 if (query_cpu == this_cpu) 82 if (query_cpu == this_cpu)
@@ -90,6 +94,8 @@ static void x2apic_send_IPI_allbutself(int vector)
90 unsigned long query_cpu; 94 unsigned long query_cpu;
91 unsigned long flags; 95 unsigned long flags;
92 96
97 x2apic_wrmsr_fence();
98
93 local_irq_save(flags); 99 local_irq_save(flags);
94 for_each_online_cpu(query_cpu) { 100 for_each_online_cpu(query_cpu) {
95 if (query_cpu == this_cpu) 101 if (query_cpu == this_cpu)
@@ -224,7 +230,6 @@ struct apic apic_x2apic_cluster = {
224 .send_IPI_all = x2apic_send_IPI_all, 230 .send_IPI_all = x2apic_send_IPI_all,
225 .send_IPI_self = x2apic_send_IPI_self, 231 .send_IPI_self = x2apic_send_IPI_self,
226 232
227 .wakeup_cpu = NULL,
228 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 233 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
229 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, 234 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
230 .wait_for_init_deassert = NULL, 235 .wait_for_init_deassert = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 5bcb174409bc..a284359627e7 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -58,6 +58,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
58 unsigned long query_cpu; 58 unsigned long query_cpu;
59 unsigned long flags; 59 unsigned long flags;
60 60
61 x2apic_wrmsr_fence();
62
61 local_irq_save(flags); 63 local_irq_save(flags);
62 for_each_cpu(query_cpu, mask) { 64 for_each_cpu(query_cpu, mask) {
63 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), 65 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
@@ -73,6 +75,8 @@ static void
73 unsigned long query_cpu; 75 unsigned long query_cpu;
74 unsigned long flags; 76 unsigned long flags;
75 77
78 x2apic_wrmsr_fence();
79
76 local_irq_save(flags); 80 local_irq_save(flags);
77 for_each_cpu(query_cpu, mask) { 81 for_each_cpu(query_cpu, mask) {
78 if (query_cpu != this_cpu) 82 if (query_cpu != this_cpu)
@@ -89,6 +93,8 @@ static void x2apic_send_IPI_allbutself(int vector)
89 unsigned long query_cpu; 93 unsigned long query_cpu;
90 unsigned long flags; 94 unsigned long flags;
91 95
96 x2apic_wrmsr_fence();
97
92 local_irq_save(flags); 98 local_irq_save(flags);
93 for_each_online_cpu(query_cpu) { 99 for_each_online_cpu(query_cpu) {
94 if (query_cpu == this_cpu) 100 if (query_cpu == this_cpu)
@@ -213,7 +219,6 @@ struct apic apic_x2apic_phys = {
213 .send_IPI_all = x2apic_send_IPI_all, 219 .send_IPI_all = x2apic_send_IPI_all,
214 .send_IPI_self = x2apic_send_IPI_self, 220 .send_IPI_self = x2apic_send_IPI_self,
215 221
216 .wakeup_cpu = NULL,
217 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 222 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
218 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, 223 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
219 .wait_for_init_deassert = NULL, 224 .wait_for_init_deassert = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 20b4ad07c3a1..1248318436e8 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -7,28 +7,28 @@
7 * 7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10
11#include <linux/kernel.h>
12#include <linux/threads.h>
13#include <linux/cpu.h>
14#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11#include <linux/hardirq.h>
12#include <linux/proc_fs.h>
13#include <linux/threads.h>
14#include <linux/kernel.h>
15#include <linux/module.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/ctype.h> 17#include <linux/ctype.h>
17#include <linux/init.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/module.h>
20#include <linux/hardirq.h>
21#include <linux/timer.h> 19#include <linux/timer.h>
22#include <linux/proc_fs.h> 20#include <linux/cpu.h>
23#include <asm/current.h> 21#include <linux/init.h>
24#include <asm/smp.h> 22
25#include <asm/apic.h>
26#include <asm/ipi.h>
27#include <asm/pgtable.h>
28#include <asm/uv/uv.h>
29#include <asm/uv/uv_mmrs.h> 23#include <asm/uv/uv_mmrs.h>
30#include <asm/uv/uv_hub.h> 24#include <asm/uv/uv_hub.h>
25#include <asm/current.h>
26#include <asm/pgtable.h>
31#include <asm/uv/bios.h> 27#include <asm/uv/bios.h>
28#include <asm/uv/uv.h>
29#include <asm/apic.h>
30#include <asm/ipi.h>
31#include <asm/smp.h>
32 32
33DEFINE_PER_CPU(int, x2apic_extra_bits); 33DEFINE_PER_CPU(int, x2apic_extra_bits);
34 34
@@ -91,40 +91,39 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
91 cpumask_set_cpu(cpu, retmask); 91 cpumask_set_cpu(cpu, retmask);
92} 92}
93 93
94int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) 94static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
95{ 95{
96#ifdef CONFIG_SMP
96 unsigned long val; 97 unsigned long val;
97 int pnode; 98 int pnode;
98 99
99 pnode = uv_apicid_to_pnode(phys_apicid); 100 pnode = uv_apicid_to_pnode(phys_apicid);
100 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 101 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
101 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 102 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
102 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 103 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
103 APIC_DM_INIT; 104 APIC_DM_INIT;
104 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 105 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
105 mdelay(10); 106 mdelay(10);
106 107
107 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 108 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
108 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 109 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
109 (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 110 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
110 APIC_DM_STARTUP; 111 APIC_DM_STARTUP;
111 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 112 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
113
114 atomic_set(&init_deasserted, 1);
115#endif
112 return 0; 116 return 0;
113} 117}
114 118
115static void uv_send_IPI_one(int cpu, int vector) 119static void uv_send_IPI_one(int cpu, int vector)
116{ 120{
117 unsigned long val, apicid; 121 unsigned long apicid;
118 int pnode; 122 int pnode;
119 123
120 apicid = per_cpu(x86_cpu_to_apicid, cpu); 124 apicid = per_cpu(x86_cpu_to_apicid, cpu);
121 pnode = uv_apicid_to_pnode(apicid); 125 pnode = uv_apicid_to_pnode(apicid);
122 126 uv_hub_send_ipi(pnode, apicid, vector);
123 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
124 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
125 (vector << UVH_IPI_INT_VECTOR_SHFT);
126
127 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
128} 127}
129 128
130static void uv_send_IPI_mask(const struct cpumask *mask, int vector) 129static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
@@ -285,7 +284,7 @@ struct apic apic_x2apic_uv_x = {
285 .send_IPI_all = uv_send_IPI_all, 284 .send_IPI_all = uv_send_IPI_all,
286 .send_IPI_self = uv_send_IPI_self, 285 .send_IPI_self = uv_send_IPI_self,
287 286
288 .wakeup_cpu = NULL, 287 .wakeup_secondary_cpu = uv_wakeup_secondary,
289 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 288 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
290 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, 289 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
291 .wait_for_init_deassert = NULL, 290 .wait_for_init_deassert = NULL,
@@ -365,7 +364,7 @@ static __init void map_high(char *id, unsigned long base, int shift,
365 paddr = base << shift; 364 paddr = base << shift;
366 bytes = (1UL << shift) * (max_pnode + 1); 365 bytes = (1UL << shift) * (max_pnode + 1);
367 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 366 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
368 paddr + bytes); 367 paddr + bytes);
369 if (map_type == map_uc) 368 if (map_type == map_uc)
370 init_extra_mapping_uc(paddr, bytes); 369 init_extra_mapping_uc(paddr, bytes);
371 else 370 else
@@ -528,7 +527,7 @@ late_initcall(uv_init_heartbeat);
528 527
529/* 528/*
530 * Called on each cpu to initialize the per_cpu UV data area. 529 * Called on each cpu to initialize the per_cpu UV data area.
531 * ZZZ hotplug not supported yet 530 * FIXME: hotplug not supported yet
532 */ 531 */
533void __cpuinit uv_cpu_init(void) 532void __cpuinit uv_cpu_init(void)
534{ 533{
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 10033fe718e0..49e0939bac42 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -466,7 +466,7 @@ static const lookup_t error_table[] = {
466 * @err: APM BIOS return code 466 * @err: APM BIOS return code
467 * 467 *
468 * Write a meaningful log entry to the kernel log in the event of 468 * Write a meaningful log entry to the kernel log in the event of
469 * an APM error. 469 * an APM error. Note that this also handles (negative) kernel errors.
470 */ 470 */
471 471
472static void apm_error(char *str, int err) 472static void apm_error(char *str, int err)
@@ -478,43 +478,14 @@ static void apm_error(char *str, int err)
478 break; 478 break;
479 if (i < ERROR_COUNT) 479 if (i < ERROR_COUNT)
480 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); 480 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
481 else if (err < 0)
482 printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err);
481 else 483 else
482 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", 484 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
483 str, err); 485 str, err);
484} 486}
485 487
486/* 488/*
487 * Lock APM functionality to physical CPU 0
488 */
489
490#ifdef CONFIG_SMP
491
492static cpumask_t apm_save_cpus(void)
493{
494 cpumask_t x = current->cpus_allowed;
495 /* Some bioses don't like being called from CPU != 0 */
496 set_cpus_allowed(current, cpumask_of_cpu(0));
497 BUG_ON(smp_processor_id() != 0);
498 return x;
499}
500
501static inline void apm_restore_cpus(cpumask_t mask)
502{
503 set_cpus_allowed(current, mask);
504}
505
506#else
507
508/*
509 * No CPU lockdown needed on a uniprocessor
510 */
511
512#define apm_save_cpus() (current->cpus_allowed)
513#define apm_restore_cpus(x) (void)(x)
514
515#endif
516
517/*
518 * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and 489 * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and
519 * apm_info.allow_ints, we are being really paranoid here! Not only 490 * apm_info.allow_ints, we are being really paranoid here! Not only
520 * are interrupts disabled, but all the segment registers (except SS) 491 * are interrupts disabled, but all the segment registers (except SS)
@@ -568,16 +539,23 @@ static inline void apm_irq_restore(unsigned long flags)
568# define APM_DO_RESTORE_SEGS 539# define APM_DO_RESTORE_SEGS
569#endif 540#endif
570 541
542struct apm_bios_call {
543 u32 func;
544 /* In and out */
545 u32 ebx;
546 u32 ecx;
547 /* Out only */
548 u32 eax;
549 u32 edx;
550 u32 esi;
551
552 /* Error: -ENOMEM, or bits 8-15 of eax */
553 int err;
554};
555
571/** 556/**
572 * apm_bios_call - Make an APM BIOS 32bit call 557 * __apm_bios_call - Make an APM BIOS 32bit call
573 * @func: APM function to execute 558 * @_call: pointer to struct apm_bios_call.
574 * @ebx_in: EBX register for call entry
575 * @ecx_in: ECX register for call entry
576 * @eax: EAX register return
577 * @ebx: EBX register return
578 * @ecx: ECX register return
579 * @edx: EDX register return
580 * @esi: ESI register return
581 * 559 *
582 * Make an APM call using the 32bit protected mode interface. The 560 * Make an APM call using the 32bit protected mode interface. The
583 * caller is responsible for knowing if APM BIOS is configured and 561 * caller is responsible for knowing if APM BIOS is configured and
@@ -586,80 +564,142 @@ static inline void apm_irq_restore(unsigned long flags)
586 * flag is loaded into AL. If there is an error, then the error 564 * flag is loaded into AL. If there is an error, then the error
587 * code is returned in AH (bits 8-15 of eax) and this function 565 * code is returned in AH (bits 8-15 of eax) and this function
588 * returns non-zero. 566 * returns non-zero.
567 *
568 * Note: this makes the call on the current CPU.
589 */ 569 */
590 570static long __apm_bios_call(void *_call)
591static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
592 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
593{ 571{
594 APM_DECL_SEGS 572 APM_DECL_SEGS
595 unsigned long flags; 573 unsigned long flags;
596 cpumask_t cpus;
597 int cpu; 574 int cpu;
598 struct desc_struct save_desc_40; 575 struct desc_struct save_desc_40;
599 struct desc_struct *gdt; 576 struct desc_struct *gdt;
600 577 struct apm_bios_call *call = _call;
601 cpus = apm_save_cpus();
602 578
603 cpu = get_cpu(); 579 cpu = get_cpu();
580 BUG_ON(cpu != 0);
604 gdt = get_cpu_gdt_table(cpu); 581 gdt = get_cpu_gdt_table(cpu);
605 save_desc_40 = gdt[0x40 / 8]; 582 save_desc_40 = gdt[0x40 / 8];
606 gdt[0x40 / 8] = bad_bios_desc; 583 gdt[0x40 / 8] = bad_bios_desc;
607 584
608 apm_irq_save(flags); 585 apm_irq_save(flags);
609 APM_DO_SAVE_SEGS; 586 APM_DO_SAVE_SEGS;
610 apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi); 587 apm_bios_call_asm(call->func, call->ebx, call->ecx,
588 &call->eax, &call->ebx, &call->ecx, &call->edx,
589 &call->esi);
611 APM_DO_RESTORE_SEGS; 590 APM_DO_RESTORE_SEGS;
612 apm_irq_restore(flags); 591 apm_irq_restore(flags);
613 gdt[0x40 / 8] = save_desc_40; 592 gdt[0x40 / 8] = save_desc_40;
614 put_cpu(); 593 put_cpu();
615 apm_restore_cpus(cpus);
616 594
617 return *eax & 0xff; 595 return call->eax & 0xff;
596}
597
598/* Run __apm_bios_call or __apm_bios_call_simple on CPU 0 */
599static int on_cpu0(long (*fn)(void *), struct apm_bios_call *call)
600{
601 int ret;
602
603 /* Don't bother with work_on_cpu in the common case, so we don't
604 * have to worry about OOM or overhead. */
605 if (get_cpu() == 0) {
606 ret = fn(call);
607 put_cpu();
608 } else {
609 put_cpu();
610 ret = work_on_cpu(0, fn, call);
611 }
612
613 /* work_on_cpu can fail with -ENOMEM */
614 if (ret < 0)
615 call->err = ret;
616 else
617 call->err = (call->eax >> 8) & 0xff;
618
619 return ret;
618} 620}
619 621
620/** 622/**
621 * apm_bios_call_simple - make a simple APM BIOS 32bit call 623 * apm_bios_call - Make an APM BIOS 32bit call (on CPU 0)
622 * @func: APM function to invoke 624 * @call: the apm_bios_call registers.
623 * @ebx_in: EBX register value for BIOS call 625 *
624 * @ecx_in: ECX register value for BIOS call 626 * If there is an error, it is returned in @call.err.
625 * @eax: EAX register on return from the BIOS call 627 */
628static int apm_bios_call(struct apm_bios_call *call)
629{
630 return on_cpu0(__apm_bios_call, call);
631}
632
633/**
634 * __apm_bios_call_simple - Make an APM BIOS 32bit call (on CPU 0)
635 * @_call: pointer to struct apm_bios_call.
626 * 636 *
627 * Make a BIOS call that returns one value only, or just status. 637 * Make a BIOS call that returns one value only, or just status.
628 * If there is an error, then the error code is returned in AH 638 * If there is an error, then the error code is returned in AH
629 * (bits 8-15 of eax) and this function returns non-zero. This is 639 * (bits 8-15 of eax) and this function returns non-zero (it can
630 * used for simpler BIOS operations. This call may hold interrupts 640 * also return -ENOMEM). This is used for simpler BIOS operations.
631 * off for a long time on some laptops. 641 * This call may hold interrupts off for a long time on some laptops.
642 *
643 * Note: this makes the call on the current CPU.
632 */ 644 */
633 645static long __apm_bios_call_simple(void *_call)
634static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
635{ 646{
636 u8 error; 647 u8 error;
637 APM_DECL_SEGS 648 APM_DECL_SEGS
638 unsigned long flags; 649 unsigned long flags;
639 cpumask_t cpus;
640 int cpu; 650 int cpu;
641 struct desc_struct save_desc_40; 651 struct desc_struct save_desc_40;
642 struct desc_struct *gdt; 652 struct desc_struct *gdt;
643 653 struct apm_bios_call *call = _call;
644 cpus = apm_save_cpus();
645 654
646 cpu = get_cpu(); 655 cpu = get_cpu();
656 BUG_ON(cpu != 0);
647 gdt = get_cpu_gdt_table(cpu); 657 gdt = get_cpu_gdt_table(cpu);
648 save_desc_40 = gdt[0x40 / 8]; 658 save_desc_40 = gdt[0x40 / 8];
649 gdt[0x40 / 8] = bad_bios_desc; 659 gdt[0x40 / 8] = bad_bios_desc;
650 660
651 apm_irq_save(flags); 661 apm_irq_save(flags);
652 APM_DO_SAVE_SEGS; 662 APM_DO_SAVE_SEGS;
653 error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax); 663 error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx,
664 &call->eax);
654 APM_DO_RESTORE_SEGS; 665 APM_DO_RESTORE_SEGS;
655 apm_irq_restore(flags); 666 apm_irq_restore(flags);
656 gdt[0x40 / 8] = save_desc_40; 667 gdt[0x40 / 8] = save_desc_40;
657 put_cpu(); 668 put_cpu();
658 apm_restore_cpus(cpus);
659 return error; 669 return error;
660} 670}
661 671
662/** 672/**
673 * apm_bios_call_simple - make a simple APM BIOS 32bit call
674 * @func: APM function to invoke
675 * @ebx_in: EBX register value for BIOS call
676 * @ecx_in: ECX register value for BIOS call
677 * @eax: EAX register on return from the BIOS call
678 * @err: bits
679 *
680 * Make a BIOS call that returns one value only, or just status.
681 * If there is an error, then the error code is returned in @err
682 * and this function returns non-zero. This is used for simpler
683 * BIOS operations. This call may hold interrupts off for a long
684 * time on some laptops.
685 */
686static int apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax,
687 int *err)
688{
689 struct apm_bios_call call;
690 int ret;
691
692 call.func = func;
693 call.ebx = ebx_in;
694 call.ecx = ecx_in;
695
696 ret = on_cpu0(__apm_bios_call_simple, &call);
697 *eax = call.eax;
698 *err = call.err;
699 return ret;
700}
701
702/**
663 * apm_driver_version - APM driver version 703 * apm_driver_version - APM driver version
664 * @val: loaded with the APM version on return 704 * @val: loaded with the APM version on return
665 * 705 *
@@ -678,9 +718,10 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
678static int apm_driver_version(u_short *val) 718static int apm_driver_version(u_short *val)
679{ 719{
680 u32 eax; 720 u32 eax;
721 int err;
681 722
682 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) 723 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax, &err))
683 return (eax >> 8) & 0xff; 724 return err;
684 *val = eax; 725 *val = eax;
685 return APM_SUCCESS; 726 return APM_SUCCESS;
686} 727}
@@ -701,22 +742,21 @@ static int apm_driver_version(u_short *val)
701 * that APM 1.2 is in use. If no messges are pending the value 0x80 742 * that APM 1.2 is in use. If no messges are pending the value 0x80
702 * is returned (No power management events pending). 743 * is returned (No power management events pending).
703 */ 744 */
704
705static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) 745static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
706{ 746{
707 u32 eax; 747 struct apm_bios_call call;
708 u32 ebx;
709 u32 ecx;
710 u32 dummy;
711 748
712 if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, 749 call.func = APM_FUNC_GET_EVENT;
713 &dummy, &dummy)) 750 call.ebx = call.ecx = 0;
714 return (eax >> 8) & 0xff; 751
715 *event = ebx; 752 if (apm_bios_call(&call))
753 return call.err;
754
755 *event = call.ebx;
716 if (apm_info.connection_version < 0x0102) 756 if (apm_info.connection_version < 0x0102)
717 *info = ~0; /* indicate info not valid */ 757 *info = ~0; /* indicate info not valid */
718 else 758 else
719 *info = ecx; 759 *info = call.ecx;
720 return APM_SUCCESS; 760 return APM_SUCCESS;
721} 761}
722 762
@@ -737,9 +777,10 @@ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
737static int set_power_state(u_short what, u_short state) 777static int set_power_state(u_short what, u_short state)
738{ 778{
739 u32 eax; 779 u32 eax;
780 int err;
740 781
741 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) 782 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax, &err))
742 return (eax >> 8) & 0xff; 783 return err;
743 return APM_SUCCESS; 784 return APM_SUCCESS;
744} 785}
745 786
@@ -770,6 +811,7 @@ static int apm_do_idle(void)
770 u8 ret = 0; 811 u8 ret = 0;
771 int idled = 0; 812 int idled = 0;
772 int polling; 813 int polling;
814 int err;
773 815
774 polling = !!(current_thread_info()->status & TS_POLLING); 816 polling = !!(current_thread_info()->status & TS_POLLING);
775 if (polling) { 817 if (polling) {
@@ -782,7 +824,7 @@ static int apm_do_idle(void)
782 } 824 }
783 if (!need_resched()) { 825 if (!need_resched()) {
784 idled = 1; 826 idled = 1;
785 ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); 827 ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
786 } 828 }
787 if (polling) 829 if (polling)
788 current_thread_info()->status |= TS_POLLING; 830 current_thread_info()->status |= TS_POLLING;
@@ -797,8 +839,7 @@ static int apm_do_idle(void)
797 * Only report the failure the first 5 times. 839 * Only report the failure the first 5 times.
798 */ 840 */
799 if (++t < 5) { 841 if (++t < 5) {
800 printk(KERN_DEBUG "apm_do_idle failed (%d)\n", 842 printk(KERN_DEBUG "apm_do_idle failed (%d)\n", err);
801 (eax >> 8) & 0xff);
802 t = jiffies; 843 t = jiffies;
803 } 844 }
804 return -1; 845 return -1;
@@ -816,9 +857,10 @@ static int apm_do_idle(void)
816static void apm_do_busy(void) 857static void apm_do_busy(void)
817{ 858{
818 u32 dummy; 859 u32 dummy;
860 int err;
819 861
820 if (clock_slowed || ALWAYS_CALL_BUSY) { 862 if (clock_slowed || ALWAYS_CALL_BUSY) {
821 (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); 863 (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy, &err);
822 clock_slowed = 0; 864 clock_slowed = 0;
823 } 865 }
824} 866}
@@ -937,7 +979,7 @@ static void apm_power_off(void)
937 979
938 /* Some bioses don't like being called from CPU != 0 */ 980 /* Some bioses don't like being called from CPU != 0 */
939 if (apm_info.realmode_power_off) { 981 if (apm_info.realmode_power_off) {
940 (void)apm_save_cpus(); 982 set_cpus_allowed_ptr(current, cpumask_of(0));
941 machine_real_restart(po_bios_call, sizeof(po_bios_call)); 983 machine_real_restart(po_bios_call, sizeof(po_bios_call));
942 } else { 984 } else {
943 (void)set_system_power_state(APM_STATE_OFF); 985 (void)set_system_power_state(APM_STATE_OFF);
@@ -956,12 +998,13 @@ static void apm_power_off(void)
956static int apm_enable_power_management(int enable) 998static int apm_enable_power_management(int enable)
957{ 999{
958 u32 eax; 1000 u32 eax;
1001 int err;
959 1002
960 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) 1003 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
961 return APM_NOT_ENGAGED; 1004 return APM_NOT_ENGAGED;
962 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, 1005 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
963 enable, &eax)) 1006 enable, &eax, &err))
964 return (eax >> 8) & 0xff; 1007 return err;
965 if (enable) 1008 if (enable)
966 apm_info.bios.flags &= ~APM_BIOS_DISABLED; 1009 apm_info.bios.flags &= ~APM_BIOS_DISABLED;
967 else 1010 else
@@ -986,24 +1029,23 @@ static int apm_enable_power_management(int enable)
986 1029
987static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) 1030static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
988{ 1031{
989 u32 eax; 1032 struct apm_bios_call call;
990 u32 ebx; 1033
991 u32 ecx; 1034 call.func = APM_FUNC_GET_STATUS;
992 u32 edx; 1035 call.ebx = APM_DEVICE_ALL;
993 u32 dummy; 1036 call.ecx = 0;
994 1037
995 if (apm_info.get_power_status_broken) 1038 if (apm_info.get_power_status_broken)
996 return APM_32_UNSUPPORTED; 1039 return APM_32_UNSUPPORTED;
997 if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, 1040 if (apm_bios_call(&call))
998 &eax, &ebx, &ecx, &edx, &dummy)) 1041 return call.err;
999 return (eax >> 8) & 0xff; 1042 *status = call.ebx;
1000 *status = ebx; 1043 *bat = call.ecx;
1001 *bat = ecx;
1002 if (apm_info.get_power_status_swabinminutes) { 1044 if (apm_info.get_power_status_swabinminutes) {
1003 *life = swab16((u16)edx); 1045 *life = swab16((u16)call.edx);
1004 *life |= 0x8000; 1046 *life |= 0x8000;
1005 } else 1047 } else
1006 *life = edx; 1048 *life = call.edx;
1007 return APM_SUCCESS; 1049 return APM_SUCCESS;
1008} 1050}
1009 1051
@@ -1048,12 +1090,14 @@ static int apm_get_battery_status(u_short which, u_short *status,
1048static int apm_engage_power_management(u_short device, int enable) 1090static int apm_engage_power_management(u_short device, int enable)
1049{ 1091{
1050 u32 eax; 1092 u32 eax;
1093 int err;
1051 1094
1052 if ((enable == 0) && (device == APM_DEVICE_ALL) 1095 if ((enable == 0) && (device == APM_DEVICE_ALL)
1053 && (apm_info.bios.flags & APM_BIOS_DISABLED)) 1096 && (apm_info.bios.flags & APM_BIOS_DISABLED))
1054 return APM_DISABLED; 1097 return APM_DISABLED;
1055 if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax)) 1098 if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable,
1056 return (eax >> 8) & 0xff; 1099 &eax, &err))
1100 return err;
1057 if (device == APM_DEVICE_ALL) { 1101 if (device == APM_DEVICE_ALL) {
1058 if (enable) 1102 if (enable)
1059 apm_info.bios.flags &= ~APM_BIOS_DISENGAGED; 1103 apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
@@ -1190,8 +1234,10 @@ static int suspend(int vetoable)
1190 struct apm_user *as; 1234 struct apm_user *as;
1191 1235
1192 device_suspend(PMSG_SUSPEND); 1236 device_suspend(PMSG_SUSPEND);
1193 local_irq_disable(); 1237
1194 device_power_down(PMSG_SUSPEND); 1238 device_power_down(PMSG_SUSPEND);
1239
1240 local_irq_disable();
1195 sysdev_suspend(PMSG_SUSPEND); 1241 sysdev_suspend(PMSG_SUSPEND);
1196 1242
1197 local_irq_enable(); 1243 local_irq_enable();
@@ -1209,9 +1255,12 @@ static int suspend(int vetoable)
1209 if (err != APM_SUCCESS) 1255 if (err != APM_SUCCESS)
1210 apm_error("suspend", err); 1256 apm_error("suspend", err);
1211 err = (err == APM_SUCCESS) ? 0 : -EIO; 1257 err = (err == APM_SUCCESS) ? 0 : -EIO;
1258
1212 sysdev_resume(); 1259 sysdev_resume();
1213 device_power_up(PMSG_RESUME);
1214 local_irq_enable(); 1260 local_irq_enable();
1261
1262 device_power_up(PMSG_RESUME);
1263
1215 device_resume(PMSG_RESUME); 1264 device_resume(PMSG_RESUME);
1216 queue_event(APM_NORMAL_RESUME, NULL); 1265 queue_event(APM_NORMAL_RESUME, NULL);
1217 spin_lock(&user_list_lock); 1266 spin_lock(&user_list_lock);
@@ -1228,8 +1277,9 @@ static void standby(void)
1228{ 1277{
1229 int err; 1278 int err;
1230 1279
1231 local_irq_disable();
1232 device_power_down(PMSG_SUSPEND); 1280 device_power_down(PMSG_SUSPEND);
1281
1282 local_irq_disable();
1233 sysdev_suspend(PMSG_SUSPEND); 1283 sysdev_suspend(PMSG_SUSPEND);
1234 local_irq_enable(); 1284 local_irq_enable();
1235 1285
@@ -1239,8 +1289,9 @@ static void standby(void)
1239 1289
1240 local_irq_disable(); 1290 local_irq_disable();
1241 sysdev_resume(); 1291 sysdev_resume();
1242 device_power_up(PMSG_RESUME);
1243 local_irq_enable(); 1292 local_irq_enable();
1293
1294 device_power_up(PMSG_RESUME);
1244} 1295}
1245 1296
1246static apm_event_t get_event(void) 1297static apm_event_t get_event(void)
@@ -1682,16 +1733,14 @@ static int apm(void *unused)
1682 char *power_stat; 1733 char *power_stat;
1683 char *bat_stat; 1734 char *bat_stat;
1684 1735
1685#ifdef CONFIG_SMP
1686 /* 2002/08/01 - WT 1736 /* 2002/08/01 - WT
1687 * This is to avoid random crashes at boot time during initialization 1737 * This is to avoid random crashes at boot time during initialization
1688 * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D. 1738 * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D.
1689 * Some bioses don't like being called from CPU != 0. 1739 * Some bioses don't like being called from CPU != 0.
1690 * Method suggested by Ingo Molnar. 1740 * Method suggested by Ingo Molnar.
1691 */ 1741 */
1692 set_cpus_allowed(current, cpumask_of_cpu(0)); 1742 set_cpus_allowed_ptr(current, cpumask_of(0));
1693 BUG_ON(smp_processor_id() != 0); 1743 BUG_ON(smp_processor_id() != 0);
1694#endif
1695 1744
1696 if (apm_info.connection_version == 0) { 1745 if (apm_info.connection_version == 0) {
1697 apm_info.connection_version = apm_info.bios.version; 1746 apm_info.connection_version = apm_info.bios.version;
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index fbf2f33e3080..5a6aa1c1162f 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -18,6 +18,7 @@
18#include <asm/thread_info.h> 18#include <asm/thread_info.h>
19#include <asm/bootparam.h> 19#include <asm/bootparam.h>
20#include <asm/elf.h> 20#include <asm/elf.h>
21#include <asm/suspend.h>
21 22
22#include <xen/interface/xen.h> 23#include <xen/interface/xen.h>
23 24
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 8793ab33e2c1..e72f062fb4b5 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -16,6 +16,7 @@
16#include <asm/thread_info.h> 16#include <asm/thread_info.h>
17#include <asm/ia32.h> 17#include <asm/ia32.h>
18#include <asm/bootparam.h> 18#include <asm/bootparam.h>
19#include <asm/suspend.h>
19 20
20#include <xen/interface/xen.h> 21#include <xen/interface/xen.h>
21 22
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 2ac0ab71412a..fc999e6fc46a 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -83,15 +83,15 @@ void __init setup_bios_corruption_check(void)
83 u64 size; 83 u64 size;
84 addr = find_e820_area_size(addr, &size, PAGE_SIZE); 84 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
85 85
86 if (addr == 0) 86 if (!(addr + 1))
87 break;
88
89 if (addr >= corruption_check_size)
87 break; 90 break;
88 91
89 if ((addr + size) > corruption_check_size) 92 if ((addr + size) > corruption_check_size)
90 size = corruption_check_size - addr; 93 size = corruption_check_size - addr;
91 94
92 if (size == 0)
93 break;
94
95 e820_update_range(addr, size, E820_RAM, E820_RESERVED); 95 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
96 scan_areas[num_scan_areas].addr = addr; 96 scan_areas[num_scan_areas].addr = addr;
97 scan_areas[num_scan_areas].size = size; 97 scan_areas[num_scan_areas].size = size;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c3813306e0b4..3efcb2b96a15 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,11 +14,12 @@ obj-y += vmware.o hypervisor.o
14obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 14obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
15obj-$(CONFIG_X86_64) += bugs_64.o 15obj-$(CONFIG_X86_64) += bugs_64.o
16 16
17obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
18
17obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 19obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
18obj-$(CONFIG_CPU_SUP_AMD) += amd.o 20obj-$(CONFIG_CPU_SUP_AMD) += amd.o
19obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 21obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
20obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o 22obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
21obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
22obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
23obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
24 25
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 6882a735d9c0..8220ae69849d 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -29,7 +29,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
29 u32 regs[4]; 29 u32 regs[4];
30 const struct cpuid_bit *cb; 30 const struct cpuid_bit *cb;
31 31
32 static const struct cpuid_bit cpuid_bits[] = { 32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, 33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { 0, 0, 0, 0 } 34 { 0, 0, 0, 0 }
35 }; 35 };
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index edcde52bd170..fd69c514ca2a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
5#include <asm/io.h> 5#include <asm/io.h>
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/apic.h> 7#include <asm/apic.h>
8#include <asm/cpu.h>
8 9
9#ifdef CONFIG_X86_64 10#ifdef CONFIG_X86_64
10# include <asm/numa_64.h> 11# include <asm/numa_64.h>
@@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
141 } 142 }
142} 143}
143 144
145static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
146{
147#ifdef CONFIG_SMP
148 /* calling is from identify_secondary_cpu() ? */
149 if (c->cpu_index == boot_cpu_id)
150 return;
151
152 /*
153 * Certain Athlons might work (for various values of 'work') in SMP
154 * but they are not certified as MP capable.
155 */
156 /* Athlon 660/661 is valid. */
157 if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
158 (c->x86_mask == 1)))
159 goto valid_k7;
160
161 /* Duron 670 is valid */
162 if ((c->x86_model == 7) && (c->x86_mask == 0))
163 goto valid_k7;
164
165 /*
166 * Athlon 662, Duron 671, and Athlon >model 7 have capability
167 * bit. It's worth noting that the A5 stepping (662) of some
168 * Athlon XP's have the MP bit set.
169 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
170 * more.
171 */
172 if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
173 ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
174 (c->x86_model > 7))
175 if (cpu_has_mp)
176 goto valid_k7;
177
178 /* If we get here, not a certified SMP capable AMD system. */
179
180 /*
181 * Don't taint if we are running SMP kernel on a single non-MP
182 * approved Athlon
183 */
184 WARN_ONCE(1, "WARNING: This combination of AMD"
185 "processors is not suitable for SMP.\n");
186 if (!test_taint(TAINT_UNSAFE_SMP))
187 add_taint(TAINT_UNSAFE_SMP);
188
189valid_k7:
190 ;
191#endif
192}
193
144static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) 194static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
145{ 195{
146 u32 l, h; 196 u32 l, h;
@@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
175 } 225 }
176 226
177 set_cpu_cap(c, X86_FEATURE_K7); 227 set_cpu_cap(c, X86_FEATURE_K7);
228
229 amd_k7_smp_check(c);
178} 230}
179#endif 231#endif
180 232
@@ -454,7 +506,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
454} 506}
455#endif 507#endif
456 508
457static struct cpu_dev amd_cpu_dev __cpuinitdata = { 509static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
458 .c_vendor = "AMD", 510 .c_vendor = "AMD",
459 .c_ident = { "AuthenticAMD" }, 511 .c_ident = { "AuthenticAMD" },
460#ifdef CONFIG_X86_32 512#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 89bfdd9cacc6..c95e831bb095 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,11 +1,11 @@
1#include <linux/bitops.h>
1#include <linux/kernel.h> 2#include <linux/kernel.h>
2#include <linux/init.h> 3#include <linux/init.h>
3#include <linux/bitops.h>
4 4
5#include <asm/processor.h> 5#include <asm/processor.h>
6#include <asm/msr.h>
7#include <asm/e820.h> 6#include <asm/e820.h>
8#include <asm/mtrr.h> 7#include <asm/mtrr.h>
8#include <asm/msr.h>
9 9
10#include "cpu.h" 10#include "cpu.h"
11 11
@@ -276,7 +276,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
276 */ 276 */
277 c->x86_capability[5] = cpuid_edx(0xC0000001); 277 c->x86_capability[5] = cpuid_edx(0xC0000001);
278 } 278 }
279 279#ifdef CONFIG_X86_32
280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ 280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */
281 if (c->x86_model >= 6 && c->x86_model <= 9) { 281 if (c->x86_model >= 6 && c->x86_model <= 9) {
282 rdmsr(MSR_VIA_FCR, lo, hi); 282 rdmsr(MSR_VIA_FCR, lo, hi);
@@ -288,6 +288,11 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
288 /* Before Nehemiah, the C3's had 3dNOW! */ 288 /* Before Nehemiah, the C3's had 3dNOW! */
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291#endif
292 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
293 c->x86_cache_alignment = c->x86_clflush_size * 2;
294 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
295 }
291 296
292 display_cacheinfo(c); 297 display_cacheinfo(c);
293} 298}
@@ -316,16 +321,25 @@ enum {
316static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) 321static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
317{ 322{
318 switch (c->x86) { 323 switch (c->x86) {
324#ifdef CONFIG_X86_32
319 case 5: 325 case 5:
320 /* Emulate MTRRs using Centaur's MCR. */ 326 /* Emulate MTRRs using Centaur's MCR. */
321 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); 327 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
322 break; 328 break;
329#endif
330 case 6:
331 if (c->x86_model >= 0xf)
332 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
333 break;
323 } 334 }
335#ifdef CONFIG_X86_64
336 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
337#endif
324} 338}
325 339
326static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 340static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
327{ 341{
328 342#ifdef CONFIG_X86_32
329 char *name; 343 char *name;
330 u32 fcr_set = 0; 344 u32 fcr_set = 0;
331 u32 fcr_clr = 0; 345 u32 fcr_clr = 0;
@@ -337,8 +351,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
337 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 351 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
338 */ 352 */
339 clear_cpu_cap(c, 0*32+31); 353 clear_cpu_cap(c, 0*32+31);
340 354#endif
355 early_init_centaur(c);
341 switch (c->x86) { 356 switch (c->x86) {
357#ifdef CONFIG_X86_32
342 case 5: 358 case 5:
343 switch (c->x86_model) { 359 switch (c->x86_model) {
344 case 4: 360 case 4:
@@ -442,16 +458,20 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
442 } 458 }
443 sprintf(c->x86_model_id, "WinChip %s", name); 459 sprintf(c->x86_model_id, "WinChip %s", name);
444 break; 460 break;
445 461#endif
446 case 6: 462 case 6:
447 init_c3(c); 463 init_c3(c);
448 break; 464 break;
449 } 465 }
466#ifdef CONFIG_X86_64
467 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
468#endif
450} 469}
451 470
452static unsigned int __cpuinit 471static unsigned int __cpuinit
453centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) 472centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
454{ 473{
474#ifdef CONFIG_X86_32
455 /* VIA C3 CPUs (670-68F) need further shifting. */ 475 /* VIA C3 CPUs (670-68F) need further shifting. */
456 if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) 476 if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
457 size >>= 8; 477 size >>= 8;
@@ -464,11 +484,11 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
464 if ((c->x86 == 6) && (c->x86_model == 9) && 484 if ((c->x86 == 6) && (c->x86_model == 9) &&
465 (c->x86_mask == 1) && (size == 65)) 485 (c->x86_mask == 1) && (size == 65))
466 size -= 1; 486 size -= 1;
467 487#endif
468 return size; 488 return size;
469} 489}
470 490
471static struct cpu_dev centaur_cpu_dev __cpuinitdata = { 491static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
472 .c_vendor = "Centaur", 492 .c_vendor = "Centaur",
473 .c_ident = { "CentaurHauls" }, 493 .c_ident = { "CentaurHauls" },
474 .c_early_init = early_init_centaur, 494 .c_early_init = early_init_centaur,
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
deleted file mode 100644
index a1625f5a1e78..000000000000
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ /dev/null
@@ -1,37 +0,0 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3
4#include <asm/cpufeature.h>
5#include <asm/processor.h>
6
7#include "cpu.h"
8
9static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
10{
11 if (c->x86 == 0x6 && c->x86_model >= 0xf)
12 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
13
14 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
15}
16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{
19 early_init_centaur(c);
20
21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
22 c->x86_cache_alignment = c->x86_clflush_size * 2;
23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
24 }
25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
26}
27
28static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
29 .c_vendor = "Centaur",
30 .c_ident = { "CentaurHauls" },
31 .c_early_init = early_init_centaur,
32 .c_init = init_centaur,
33 .c_x86_vendor = X86_VENDOR_CENTAUR,
34};
35
36cpu_dev_register(centaur_cpu_dev);
37
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b66af09a6c7d..a86769efe0df 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,53 +1,51 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/string.h>
5#include <linux/bootmem.h> 1#include <linux/bootmem.h>
2#include <linux/linkage.h>
6#include <linux/bitops.h> 3#include <linux/bitops.h>
4#include <linux/kernel.h>
7#include <linux/module.h> 5#include <linux/module.h>
8#include <linux/kgdb.h> 6#include <linux/percpu.h>
9#include <linux/topology.h> 7#include <linux/string.h>
10#include <linux/delay.h> 8#include <linux/delay.h>
9#include <linux/sched.h>
10#include <linux/init.h>
11#include <linux/kgdb.h>
11#include <linux/smp.h> 12#include <linux/smp.h>
12#include <linux/percpu.h> 13#include <linux/io.h>
13#include <asm/i387.h> 14
14#include <asm/msr.h> 15#include <asm/stackprotector.h>
15#include <asm/io.h> 16#include <asm/perf_counter.h>
16#include <asm/linkage.h>
17#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
18#include <asm/hypervisor.h>
19#include <asm/processor.h>
20#include <asm/sections.h>
21#include <asm/topology.h>
22#include <asm/cpumask.h>
23#include <asm/pgtable.h>
24#include <asm/atomic.h>
25#include <asm/proto.h>
26#include <asm/setup.h>
27#include <asm/apic.h>
28#include <asm/desc.h>
29#include <asm/i387.h>
18#include <asm/mtrr.h> 30#include <asm/mtrr.h>
31#include <asm/numa.h>
32#include <asm/asm.h>
33#include <asm/cpu.h>
19#include <asm/mce.h> 34#include <asm/mce.h>
20#include <asm/perf_counter.h> 35#include <asm/msr.h>
21#include <asm/pat.h> 36#include <asm/pat.h>
22#include <asm/asm.h>
23#include <asm/numa.h>
24#include <asm/smp.h> 37#include <asm/smp.h>
25#include <asm/cpu.h>
26#include <asm/cpumask.h>
27#include <asm/apic.h>
28 38
29#ifdef CONFIG_X86_LOCAL_APIC 39#ifdef CONFIG_X86_LOCAL_APIC
30#include <asm/uv/uv.h> 40#include <asm/uv/uv.h>
31#endif 41#endif
32 42
33#include <asm/pgtable.h>
34#include <asm/processor.h>
35#include <asm/desc.h>
36#include <asm/atomic.h>
37#include <asm/proto.h>
38#include <asm/sections.h>
39#include <asm/setup.h>
40#include <asm/hypervisor.h>
41#include <asm/stackprotector.h>
42
43#include "cpu.h" 43#include "cpu.h"
44 44
45#ifdef CONFIG_X86_64
46
47/* all of these masks are initialized in setup_cpu_local_masks() */ 45/* all of these masks are initialized in setup_cpu_local_masks() */
48cpumask_var_t cpu_callin_mask;
49cpumask_var_t cpu_callout_mask;
50cpumask_var_t cpu_initialized_mask; 46cpumask_var_t cpu_initialized_mask;
47cpumask_var_t cpu_callout_mask;
48cpumask_var_t cpu_callin_mask;
51 49
52/* representing cpus for which sibling maps can be computed */ 50/* representing cpus for which sibling maps can be computed */
53cpumask_var_t cpu_sibling_setup_mask; 51cpumask_var_t cpu_sibling_setup_mask;
@@ -61,17 +59,7 @@ void __init setup_cpu_local_masks(void)
61 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); 59 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
62} 60}
63 61
64#else /* CONFIG_X86_32 */ 62static const struct cpu_dev *this_cpu __cpuinitdata;
65
66cpumask_t cpu_callin_map;
67cpumask_t cpu_callout_map;
68cpumask_t cpu_initialized;
69cpumask_t cpu_sibling_setup_map;
70
71#endif /* CONFIG_X86_32 */
72
73
74static struct cpu_dev *this_cpu __cpuinitdata;
75 63
76DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { 64DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
77#ifdef CONFIG_X86_64 65#ifdef CONFIG_X86_64
@@ -80,48 +68,48 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
80 * IRET will check the segment types kkeil 2000/10/28 68 * IRET will check the segment types kkeil 2000/10/28
81 * Also sysret mandates a special GDT layout 69 * Also sysret mandates a special GDT layout
82 * 70 *
83 * The TLS descriptors are currently at a different place compared to i386. 71 * TLS descriptors are currently at a different place compared to i386.
84 * Hopefully nobody expects them at a fixed place (Wine?) 72 * Hopefully nobody expects them at a fixed place (Wine?)
85 */ 73 */
86 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, 74 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
87 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, 75 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
88 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, 76 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
89 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, 77 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
90 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, 78 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
91 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, 79 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
92#else 80#else
93 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 81 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
94 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 82 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
95 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 83 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
96 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, 84 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
97 /* 85 /*
98 * Segments used for calling PnP BIOS have byte granularity. 86 * Segments used for calling PnP BIOS have byte granularity.
99 * They code segments and data segments have fixed 64k limits, 87 * They code segments and data segments have fixed 64k limits,
100 * the transfer segment sizes are set at run time. 88 * the transfer segment sizes are set at run time.
101 */ 89 */
102 /* 32-bit code */ 90 /* 32-bit code */
103 [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, 91 [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
104 /* 16-bit code */ 92 /* 16-bit code */
105 [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, 93 [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
106 /* 16-bit data */ 94 /* 16-bit data */
107 [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, 95 [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
108 /* 16-bit data */ 96 /* 16-bit data */
109 [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, 97 [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
110 /* 16-bit data */ 98 /* 16-bit data */
111 [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, 99 [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
112 /* 100 /*
113 * The APM segments have byte granularity and their bases 101 * The APM segments have byte granularity and their bases
114 * are set at run time. All have 64k limits. 102 * are set at run time. All have 64k limits.
115 */ 103 */
116 /* 32-bit code */ 104 /* 32-bit code */
117 [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, 105 [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
118 /* 16-bit code */ 106 /* 16-bit code */
119 [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, 107 [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
120 /* data */ 108 /* data */
121 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, 109 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
122 110
123 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 111 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
124 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, 112 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
125 GDT_STACK_CANARY_INIT 113 GDT_STACK_CANARY_INIT
126#endif 114#endif
127} }; 115} };
@@ -165,16 +153,17 @@ static inline int flag_is_changeable_p(u32 flag)
165 * the CPUID. Add "volatile" to not allow gcc to 153 * the CPUID. Add "volatile" to not allow gcc to
166 * optimize the subsequent calls to this function. 154 * optimize the subsequent calls to this function.
167 */ 155 */
168 asm volatile ("pushfl\n\t" 156 asm volatile ("pushfl \n\t"
169 "pushfl\n\t" 157 "pushfl \n\t"
170 "popl %0\n\t" 158 "popl %0 \n\t"
171 "movl %0,%1\n\t" 159 "movl %0, %1 \n\t"
172 "xorl %2,%0\n\t" 160 "xorl %2, %0 \n\t"
173 "pushl %0\n\t" 161 "pushl %0 \n\t"
174 "popfl\n\t" 162 "popfl \n\t"
175 "pushfl\n\t" 163 "pushfl \n\t"
176 "popl %0\n\t" 164 "popl %0 \n\t"
177 "popfl\n\t" 165 "popfl \n\t"
166
178 : "=&r" (f1), "=&r" (f2) 167 : "=&r" (f1), "=&r" (f2)
179 : "ir" (flag)); 168 : "ir" (flag));
180 169
@@ -189,18 +178,22 @@ static int __cpuinit have_cpuid_p(void)
189 178
190static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) 179static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
191{ 180{
192 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { 181 unsigned long lo, hi;
193 /* Disable processor serial number */ 182
194 unsigned long lo, hi; 183 if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
195 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); 184 return;
196 lo |= 0x200000; 185
197 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); 186 /* Disable processor serial number: */
198 printk(KERN_NOTICE "CPU serial number disabled.\n"); 187
199 clear_cpu_cap(c, X86_FEATURE_PN); 188 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
200 189 lo |= 0x200000;
201 /* Disabling the serial number may affect the cpuid level */ 190 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
202 c->cpuid_level = cpuid_eax(0); 191
203 } 192 printk(KERN_NOTICE "CPU serial number disabled.\n");
193 clear_cpu_cap(c, X86_FEATURE_PN);
194
195 /* Disabling the serial number may affect the cpuid level */
196 c->cpuid_level = cpuid_eax(0);
204} 197}
205 198
206static int __init x86_serial_nr_setup(char *s) 199static int __init x86_serial_nr_setup(char *s)
@@ -233,6 +226,7 @@ struct cpuid_dependent_feature {
233 u32 feature; 226 u32 feature;
234 u32 level; 227 u32 level;
235}; 228};
229
236static const struct cpuid_dependent_feature __cpuinitconst 230static const struct cpuid_dependent_feature __cpuinitconst
237cpuid_dependent_features[] = { 231cpuid_dependent_features[] = {
238 { X86_FEATURE_MWAIT, 0x00000005 }, 232 { X86_FEATURE_MWAIT, 0x00000005 },
@@ -244,7 +238,11 @@ cpuid_dependent_features[] = {
244static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) 238static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
245{ 239{
246 const struct cpuid_dependent_feature *df; 240 const struct cpuid_dependent_feature *df;
241
247 for (df = cpuid_dependent_features; df->feature; df++) { 242 for (df = cpuid_dependent_features; df->feature; df++) {
243
244 if (!cpu_has(c, df->feature))
245 continue;
248 /* 246 /*
249 * Note: cpuid_level is set to -1 if unavailable, but 247 * Note: cpuid_level is set to -1 if unavailable, but
250 * extended_extended_level is set to 0 if unavailable 248 * extended_extended_level is set to 0 if unavailable
@@ -252,32 +250,32 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
252 * when signed; hence the weird messing around with 250 * when signed; hence the weird messing around with
253 * signs here... 251 * signs here...
254 */ 252 */
255 if (cpu_has(c, df->feature) && 253 if (!((s32)df->level < 0 ?
256 ((s32)df->level < 0 ?
257 (u32)df->level > (u32)c->extended_cpuid_level : 254 (u32)df->level > (u32)c->extended_cpuid_level :
258 (s32)df->level > (s32)c->cpuid_level)) { 255 (s32)df->level > (s32)c->cpuid_level))
259 clear_cpu_cap(c, df->feature); 256 continue;
260 if (warn) 257
261 printk(KERN_WARNING 258 clear_cpu_cap(c, df->feature);
262 "CPU: CPU feature %s disabled " 259 if (!warn)
263 "due to lack of CPUID level 0x%x\n", 260 continue;
264 x86_cap_flags[df->feature], 261
265 df->level); 262 printk(KERN_WARNING
266 } 263 "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
264 x86_cap_flags[df->feature], df->level);
267 } 265 }
268} 266}
269 267
270/* 268/*
271 * Naming convention should be: <Name> [(<Codename>)] 269 * Naming convention should be: <Name> [(<Codename>)]
272 * This table only is used unless init_<vendor>() below doesn't set it; 270 * This table only is used unless init_<vendor>() below doesn't set it;
273 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used 271 * in particular, if CPUID levels 0x80000002..4 are supported, this
274 * 272 * isn't used
275 */ 273 */
276 274
277/* Look up CPU names by table lookup. */ 275/* Look up CPU names by table lookup. */
278static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) 276static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
279{ 277{
280 struct cpu_model_info *info; 278 const struct cpu_model_info *info;
281 279
282 if (c->x86_model >= 16) 280 if (c->x86_model >= 16)
283 return NULL; /* Range check */ 281 return NULL; /* Range check */
@@ -308,8 +306,10 @@ void load_percpu_segment(int cpu)
308 load_stack_canary_segment(); 306 load_stack_canary_segment();
309} 307}
310 308
311/* Current gdt points %fs at the "master" per-cpu area: after this, 309/*
312 * it's on the real one. */ 310 * Current gdt points %fs at the "master" per-cpu area: after this,
311 * it's on the real one.
312 */
313void switch_to_new_gdt(int cpu) 313void switch_to_new_gdt(int cpu)
314{ 314{
315 struct desc_ptr gdt_descr; 315 struct desc_ptr gdt_descr;
@@ -322,7 +322,7 @@ void switch_to_new_gdt(int cpu)
322 load_percpu_segment(cpu); 322 load_percpu_segment(cpu);
323} 323}
324 324
325static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 325static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
326 326
327static void __cpuinit default_init(struct cpuinfo_x86 *c) 327static void __cpuinit default_init(struct cpuinfo_x86 *c)
328{ 328{
@@ -341,7 +341,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
341#endif 341#endif
342} 342}
343 343
344static struct cpu_dev __cpuinitdata default_cpu = { 344static const struct cpu_dev __cpuinitconst default_cpu = {
345 .c_init = default_init, 345 .c_init = default_init,
346 .c_vendor = "Unknown", 346 .c_vendor = "Unknown",
347 .c_x86_vendor = X86_VENDOR_UNKNOWN, 347 .c_x86_vendor = X86_VENDOR_UNKNOWN,
@@ -355,22 +355,24 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
355 if (c->extended_cpuid_level < 0x80000004) 355 if (c->extended_cpuid_level < 0x80000004)
356 return; 356 return;
357 357
358 v = (unsigned int *) c->x86_model_id; 358 v = (unsigned int *)c->x86_model_id;
359 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); 359 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
360 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); 360 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
361 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); 361 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
362 c->x86_model_id[48] = 0; 362 c->x86_model_id[48] = 0;
363 363
364 /* Intel chips right-justify this string for some dumb reason; 364 /*
365 undo that brain damage */ 365 * Intel chips right-justify this string for some dumb reason;
366 * undo that brain damage:
367 */
366 p = q = &c->x86_model_id[0]; 368 p = q = &c->x86_model_id[0];
367 while (*p == ' ') 369 while (*p == ' ')
368 p++; 370 p++;
369 if (p != q) { 371 if (p != q) {
370 while (*p) 372 while (*p)
371 *q++ = *p++; 373 *q++ = *p++;
372 while (q <= &c->x86_model_id[48]) 374 while (q <= &c->x86_model_id[48])
373 *q++ = '\0'; /* Zero-pad the rest */ 375 *q++ = '\0'; /* Zero-pad the rest */
374 } 376 }
375} 377}
376 378
@@ -439,27 +441,30 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
439 441
440 if (smp_num_siblings == 1) { 442 if (smp_num_siblings == 1) {
441 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 443 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
442 } else if (smp_num_siblings > 1) { 444 goto out;
445 }
443 446
444 if (smp_num_siblings > nr_cpu_ids) { 447 if (smp_num_siblings <= 1)
445 printk(KERN_WARNING "CPU: Unsupported number of siblings %d", 448 goto out;
446 smp_num_siblings);
447 smp_num_siblings = 1;
448 return;
449 }
450 449
451 index_msb = get_count_order(smp_num_siblings); 450 if (smp_num_siblings > nr_cpu_ids) {
452 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); 451 pr_warning("CPU: Unsupported number of siblings %d",
452 smp_num_siblings);
453 smp_num_siblings = 1;
454 return;
455 }
453 456
454 smp_num_siblings = smp_num_siblings / c->x86_max_cores; 457 index_msb = get_count_order(smp_num_siblings);
458 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
455 459
456 index_msb = get_count_order(smp_num_siblings); 460 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
457 461
458 core_bits = get_count_order(c->x86_max_cores); 462 index_msb = get_count_order(smp_num_siblings);
459 463
460 c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & 464 core_bits = get_count_order(c->x86_max_cores);
461 ((1 << core_bits) - 1); 465
462 } 466 c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
467 ((1 << core_bits) - 1);
463 468
464out: 469out:
465 if ((c->x86_max_cores * smp_num_siblings) > 1) { 470 if ((c->x86_max_cores * smp_num_siblings) > 1) {
@@ -474,8 +479,8 @@ out:
474static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) 479static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
475{ 480{
476 char *v = c->x86_vendor_id; 481 char *v = c->x86_vendor_id;
477 int i;
478 static int printed; 482 static int printed;
483 int i;
479 484
480 for (i = 0; i < X86_VENDOR_NUM; i++) { 485 for (i = 0; i < X86_VENDOR_NUM; i++) {
481 if (!cpu_devs[i]) 486 if (!cpu_devs[i])
@@ -484,6 +489,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
484 if (!strcmp(v, cpu_devs[i]->c_ident[0]) || 489 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
485 (cpu_devs[i]->c_ident[1] && 490 (cpu_devs[i]->c_ident[1] &&
486 !strcmp(v, cpu_devs[i]->c_ident[1]))) { 491 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
492
487 this_cpu = cpu_devs[i]; 493 this_cpu = cpu_devs[i];
488 c->x86_vendor = this_cpu->c_x86_vendor; 494 c->x86_vendor = this_cpu->c_x86_vendor;
489 return; 495 return;
@@ -492,7 +498,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
492 498
493 if (!printed) { 499 if (!printed) {
494 printed++; 500 printed++;
495 printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v); 501 printk(KERN_ERR
502 "CPU: vendor_id '%s' unknown, using generic init.\n", v);
503
496 printk(KERN_ERR "CPU: Your system may be unstable.\n"); 504 printk(KERN_ERR "CPU: Your system may be unstable.\n");
497 } 505 }
498 506
@@ -512,14 +520,17 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
512 /* Intel-defined flags: level 0x00000001 */ 520 /* Intel-defined flags: level 0x00000001 */
513 if (c->cpuid_level >= 0x00000001) { 521 if (c->cpuid_level >= 0x00000001) {
514 u32 junk, tfms, cap0, misc; 522 u32 junk, tfms, cap0, misc;
523
515 cpuid(0x00000001, &tfms, &misc, &junk, &cap0); 524 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
516 c->x86 = (tfms >> 8) & 0xf; 525 c->x86 = (tfms >> 8) & 0xf;
517 c->x86_model = (tfms >> 4) & 0xf; 526 c->x86_model = (tfms >> 4) & 0xf;
518 c->x86_mask = tfms & 0xf; 527 c->x86_mask = tfms & 0xf;
528
519 if (c->x86 == 0xf) 529 if (c->x86 == 0xf)
520 c->x86 += (tfms >> 20) & 0xff; 530 c->x86 += (tfms >> 20) & 0xff;
521 if (c->x86 >= 0x6) 531 if (c->x86 >= 0x6)
522 c->x86_model += ((tfms >> 16) & 0xf) << 4; 532 c->x86_model += ((tfms >> 16) & 0xf) << 4;
533
523 if (cap0 & (1<<19)) { 534 if (cap0 & (1<<19)) {
524 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 535 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
525 c->x86_cache_alignment = c->x86_clflush_size; 536 c->x86_cache_alignment = c->x86_clflush_size;
@@ -535,6 +546,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
535 /* Intel-defined flags: level 0x00000001 */ 546 /* Intel-defined flags: level 0x00000001 */
536 if (c->cpuid_level >= 0x00000001) { 547 if (c->cpuid_level >= 0x00000001) {
537 u32 capability, excap; 548 u32 capability, excap;
549
538 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 550 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
539 c->x86_capability[0] = capability; 551 c->x86_capability[0] = capability;
540 c->x86_capability[4] = excap; 552 c->x86_capability[4] = excap;
@@ -543,6 +555,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
543 /* AMD-defined flags: level 0x80000001 */ 555 /* AMD-defined flags: level 0x80000001 */
544 xlvl = cpuid_eax(0x80000000); 556 xlvl = cpuid_eax(0x80000000);
545 c->extended_cpuid_level = xlvl; 557 c->extended_cpuid_level = xlvl;
558
546 if ((xlvl & 0xffff0000) == 0x80000000) { 559 if ((xlvl & 0xffff0000) == 0x80000000) {
547 if (xlvl >= 0x80000001) { 560 if (xlvl >= 0x80000001) {
548 c->x86_capability[1] = cpuid_edx(0x80000001); 561 c->x86_capability[1] = cpuid_edx(0x80000001);
@@ -550,13 +563,15 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
550 } 563 }
551 } 564 }
552 565
553#ifdef CONFIG_X86_64
554 if (c->extended_cpuid_level >= 0x80000008) { 566 if (c->extended_cpuid_level >= 0x80000008) {
555 u32 eax = cpuid_eax(0x80000008); 567 u32 eax = cpuid_eax(0x80000008);
556 568
557 c->x86_virt_bits = (eax >> 8) & 0xff; 569 c->x86_virt_bits = (eax >> 8) & 0xff;
558 c->x86_phys_bits = eax & 0xff; 570 c->x86_phys_bits = eax & 0xff;
559 } 571 }
572#ifdef CONFIG_X86_32
573 else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
574 c->x86_phys_bits = 36;
560#endif 575#endif
561 576
562 if (c->extended_cpuid_level >= 0x80000007) 577 if (c->extended_cpuid_level >= 0x80000007)
@@ -603,8 +618,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
603{ 618{
604#ifdef CONFIG_X86_64 619#ifdef CONFIG_X86_64
605 c->x86_clflush_size = 64; 620 c->x86_clflush_size = 64;
621 c->x86_phys_bits = 36;
622 c->x86_virt_bits = 48;
606#else 623#else
607 c->x86_clflush_size = 32; 624 c->x86_clflush_size = 32;
625 c->x86_phys_bits = 32;
626 c->x86_virt_bits = 32;
608#endif 627#endif
609 c->x86_cache_alignment = c->x86_clflush_size; 628 c->x86_cache_alignment = c->x86_clflush_size;
610 629
@@ -635,12 +654,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
635 654
636void __init early_cpu_init(void) 655void __init early_cpu_init(void)
637{ 656{
638 struct cpu_dev **cdev; 657 const struct cpu_dev *const *cdev;
639 int count = 0; 658 int count = 0;
640 659
641 printk("KERNEL supported cpus:\n"); 660 printk(KERN_INFO "KERNEL supported cpus:\n");
642 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { 661 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
643 struct cpu_dev *cpudev = *cdev; 662 const struct cpu_dev *cpudev = *cdev;
644 unsigned int j; 663 unsigned int j;
645 664
646 if (count >= X86_VENDOR_NUM) 665 if (count >= X86_VENDOR_NUM)
@@ -651,7 +670,7 @@ void __init early_cpu_init(void)
651 for (j = 0; j < 2; j++) { 670 for (j = 0; j < 2; j++) {
652 if (!cpudev->c_ident[j]) 671 if (!cpudev->c_ident[j])
653 continue; 672 continue;
654 printk(" %s %s\n", cpudev->c_vendor, 673 printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
655 cpudev->c_ident[j]); 674 cpudev->c_ident[j]);
656 } 675 }
657 } 676 }
@@ -727,9 +746,13 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
727 c->x86_coreid_bits = 0; 746 c->x86_coreid_bits = 0;
728#ifdef CONFIG_X86_64 747#ifdef CONFIG_X86_64
729 c->x86_clflush_size = 64; 748 c->x86_clflush_size = 64;
749 c->x86_phys_bits = 36;
750 c->x86_virt_bits = 48;
730#else 751#else
731 c->cpuid_level = -1; /* CPUID not detected */ 752 c->cpuid_level = -1; /* CPUID not detected */
732 c->x86_clflush_size = 32; 753 c->x86_clflush_size = 32;
754 c->x86_phys_bits = 32;
755 c->x86_virt_bits = 32;
733#endif 756#endif
734 c->x86_cache_alignment = c->x86_clflush_size; 757 c->x86_cache_alignment = c->x86_clflush_size;
735 memset(&c->x86_capability, 0, sizeof c->x86_capability); 758 memset(&c->x86_capability, 0, sizeof c->x86_capability);
@@ -760,8 +783,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
760 squash_the_stupid_serial_number(c); 783 squash_the_stupid_serial_number(c);
761 784
762 /* 785 /*
763 * The vendor-specific functions might have changed features. Now 786 * The vendor-specific functions might have changed features.
764 * we do "generic changes." 787 * Now we do "generic changes."
765 */ 788 */
766 789
767 /* Filter out anything that depends on CPUID levels we don't have */ 790 /* Filter out anything that depends on CPUID levels we don't have */
@@ -769,7 +792,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
769 792
770 /* If the model name is still unset, do table lookup. */ 793 /* If the model name is still unset, do table lookup. */
771 if (!c->x86_model_id[0]) { 794 if (!c->x86_model_id[0]) {
772 char *p; 795 const char *p;
773 p = table_lookup_model(c); 796 p = table_lookup_model(c);
774 if (p) 797 if (p)
775 strcpy(c->x86_model_id, p); 798 strcpy(c->x86_model_id, p);
@@ -825,6 +848,7 @@ static void vgetcpu_set_mode(void)
825void __init identify_boot_cpu(void) 848void __init identify_boot_cpu(void)
826{ 849{
827 identify_cpu(&boot_cpu_data); 850 identify_cpu(&boot_cpu_data);
851 init_c1e_mask();
828#ifdef CONFIG_X86_32 852#ifdef CONFIG_X86_32
829 sysenter_setup(); 853 sysenter_setup();
830 enable_sep_cpu(); 854 enable_sep_cpu();
@@ -845,11 +869,11 @@ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
845} 869}
846 870
847struct msr_range { 871struct msr_range {
848 unsigned min; 872 unsigned min;
849 unsigned max; 873 unsigned max;
850}; 874};
851 875
852static struct msr_range msr_range_array[] __cpuinitdata = { 876static const struct msr_range msr_range_array[] __cpuinitconst = {
853 { 0x00000000, 0x00000418}, 877 { 0x00000000, 0x00000418},
854 { 0xc0000000, 0xc000040b}, 878 { 0xc0000000, 0xc000040b},
855 { 0xc0010000, 0xc0010142}, 879 { 0xc0010000, 0xc0010142},
@@ -858,14 +882,15 @@ static struct msr_range msr_range_array[] __cpuinitdata = {
858 882
859static void __cpuinit print_cpu_msr(void) 883static void __cpuinit print_cpu_msr(void)
860{ 884{
885 unsigned index_min, index_max;
861 unsigned index; 886 unsigned index;
862 u64 val; 887 u64 val;
863 int i; 888 int i;
864 unsigned index_min, index_max;
865 889
866 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { 890 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
867 index_min = msr_range_array[i].min; 891 index_min = msr_range_array[i].min;
868 index_max = msr_range_array[i].max; 892 index_max = msr_range_array[i].max;
893
869 for (index = index_min; index < index_max; index++) { 894 for (index = index_min; index < index_max; index++) {
870 if (rdmsrl_amd_safe(index, &val)) 895 if (rdmsrl_amd_safe(index, &val))
871 continue; 896 continue;
@@ -875,6 +900,7 @@ static void __cpuinit print_cpu_msr(void)
875} 900}
876 901
877static int show_msr __cpuinitdata; 902static int show_msr __cpuinitdata;
903
878static __init int setup_show_msr(char *arg) 904static __init int setup_show_msr(char *arg)
879{ 905{
880 int num; 906 int num;
@@ -896,12 +922,14 @@ __setup("noclflush", setup_noclflush);
896 922
897void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) 923void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
898{ 924{
899 char *vendor = NULL; 925 const char *vendor = NULL;
900 926
901 if (c->x86_vendor < X86_VENDOR_NUM) 927 if (c->x86_vendor < X86_VENDOR_NUM) {
902 vendor = this_cpu->c_vendor; 928 vendor = this_cpu->c_vendor;
903 else if (c->cpuid_level >= 0) 929 } else {
904 vendor = c->x86_vendor_id; 930 if (c->cpuid_level >= 0)
931 vendor = c->x86_vendor_id;
932 }
905 933
906 if (vendor && !strstr(c->x86_model_id, vendor)) 934 if (vendor && !strstr(c->x86_model_id, vendor))
907 printk(KERN_CONT "%s ", vendor); 935 printk(KERN_CONT "%s ", vendor);
@@ -928,10 +956,12 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
928static __init int setup_disablecpuid(char *arg) 956static __init int setup_disablecpuid(char *arg)
929{ 957{
930 int bit; 958 int bit;
959
931 if (get_option(&arg, &bit) && bit < NCAPINTS*32) 960 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
932 setup_clear_cpu_cap(bit); 961 setup_clear_cpu_cap(bit);
933 else 962 else
934 return 0; 963 return 0;
964
935 return 1; 965 return 1;
936} 966}
937__setup("clearcpuid=", setup_disablecpuid); 967__setup("clearcpuid=", setup_disablecpuid);
@@ -941,6 +971,7 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
941 971
942DEFINE_PER_CPU_FIRST(union irq_stack_union, 972DEFINE_PER_CPU_FIRST(union irq_stack_union,
943 irq_stack_union) __aligned(PAGE_SIZE); 973 irq_stack_union) __aligned(PAGE_SIZE);
974
944DEFINE_PER_CPU(char *, irq_stack_ptr) = 975DEFINE_PER_CPU(char *, irq_stack_ptr) =
945 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; 976 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
946 977
@@ -950,12 +981,21 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack);
950 981
951DEFINE_PER_CPU(unsigned int, irq_count) = -1; 982DEFINE_PER_CPU(unsigned int, irq_count) = -1;
952 983
984/*
985 * Special IST stacks which the CPU switches to when it calls
986 * an IST-marked descriptor entry. Up to 7 stacks (hardware
987 * limit), all of them are 4K, except the debug stack which
988 * is 8K.
989 */
990static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
991 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
992 [DEBUG_STACK - 1] = DEBUG_STKSZ
993};
994
953static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks 995static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
954 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) 996 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
955 __aligned(PAGE_SIZE); 997 __aligned(PAGE_SIZE);
956 998
957extern asmlinkage void ignore_sysret(void);
958
959/* May not be marked __init: used by software suspend */ 999/* May not be marked __init: used by software suspend */
960void syscall_init(void) 1000void syscall_init(void)
961{ 1001{
@@ -985,7 +1025,7 @@ unsigned long kernel_eflags;
985 */ 1025 */
986DEFINE_PER_CPU(struct orig_ist, orig_ist); 1026DEFINE_PER_CPU(struct orig_ist, orig_ist);
987 1027
988#else /* x86_64 */ 1028#else /* CONFIG_X86_64 */
989 1029
990#ifdef CONFIG_CC_STACKPROTECTOR 1030#ifdef CONFIG_CC_STACKPROTECTOR
991DEFINE_PER_CPU(unsigned long, stack_canary); 1031DEFINE_PER_CPU(unsigned long, stack_canary);
@@ -997,9 +1037,26 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
997 memset(regs, 0, sizeof(struct pt_regs)); 1037 memset(regs, 0, sizeof(struct pt_regs));
998 regs->fs = __KERNEL_PERCPU; 1038 regs->fs = __KERNEL_PERCPU;
999 regs->gs = __KERNEL_STACK_CANARY; 1039 regs->gs = __KERNEL_STACK_CANARY;
1040
1000 return regs; 1041 return regs;
1001} 1042}
1002#endif /* x86_64 */ 1043#endif /* CONFIG_X86_64 */
1044
1045/*
1046 * Clear all 6 debug registers:
1047 */
1048static void clear_all_debug_regs(void)
1049{
1050 int i;
1051
1052 for (i = 0; i < 8; i++) {
1053 /* Ignore db4, db5 */
1054 if ((i == 4) || (i == 5))
1055 continue;
1056
1057 set_debugreg(0, i);
1058 }
1059}
1003 1060
1004/* 1061/*
1005 * cpu_init() initializes state that is per-CPU. Some data is already 1062 * cpu_init() initializes state that is per-CPU. Some data is already
@@ -1009,15 +1066,20 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
1009 * A lot of state is already set up in PDA init for 64 bit 1066 * A lot of state is already set up in PDA init for 64 bit
1010 */ 1067 */
1011#ifdef CONFIG_X86_64 1068#ifdef CONFIG_X86_64
1069
1012void __cpuinit cpu_init(void) 1070void __cpuinit cpu_init(void)
1013{ 1071{
1014 int cpu = stack_smp_processor_id(); 1072 struct orig_ist *orig_ist;
1015 struct tss_struct *t = &per_cpu(init_tss, cpu);
1016 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
1017 unsigned long v;
1018 struct task_struct *me; 1073 struct task_struct *me;
1074 struct tss_struct *t;
1075 unsigned long v;
1076 int cpu;
1019 int i; 1077 int i;
1020 1078
1079 cpu = stack_smp_processor_id();
1080 t = &per_cpu(init_tss, cpu);
1081 orig_ist = &per_cpu(orig_ist, cpu);
1082
1021#ifdef CONFIG_NUMA 1083#ifdef CONFIG_NUMA
1022 if (cpu != 0 && percpu_read(node_number) == 0 && 1084 if (cpu != 0 && percpu_read(node_number) == 0 &&
1023 cpu_to_node(cpu) != NUMA_NO_NODE) 1085 cpu_to_node(cpu) != NUMA_NO_NODE)
@@ -1058,19 +1120,17 @@ void __cpuinit cpu_init(void)
1058 * set up and load the per-CPU TSS 1120 * set up and load the per-CPU TSS
1059 */ 1121 */
1060 if (!orig_ist->ist[0]) { 1122 if (!orig_ist->ist[0]) {
1061 static const unsigned int sizes[N_EXCEPTION_STACKS] = {
1062 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
1063 [DEBUG_STACK - 1] = DEBUG_STKSZ
1064 };
1065 char *estacks = per_cpu(exception_stacks, cpu); 1123 char *estacks = per_cpu(exception_stacks, cpu);
1124
1066 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 1125 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1067 estacks += sizes[v]; 1126 estacks += exception_stack_sizes[v];
1068 orig_ist->ist[v] = t->x86_tss.ist[v] = 1127 orig_ist->ist[v] = t->x86_tss.ist[v] =
1069 (unsigned long)estacks; 1128 (unsigned long)estacks;
1070 } 1129 }
1071 } 1130 }
1072 1131
1073 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); 1132 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1133
1074 /* 1134 /*
1075 * <= is required because the CPU will access up to 1135 * <= is required because the CPU will access up to
1076 * 8 bits beyond the end of the IO permission bitmap. 1136 * 8 bits beyond the end of the IO permission bitmap.
@@ -1080,8 +1140,7 @@ void __cpuinit cpu_init(void)
1080 1140
1081 atomic_inc(&init_mm.mm_count); 1141 atomic_inc(&init_mm.mm_count);
1082 me->active_mm = &init_mm; 1142 me->active_mm = &init_mm;
1083 if (me->mm) 1143 BUG_ON(me->mm);
1084 BUG();
1085 enter_lazy_tlb(&init_mm, me); 1144 enter_lazy_tlb(&init_mm, me);
1086 1145
1087 load_sp0(t, &current->thread); 1146 load_sp0(t, &current->thread);
@@ -1100,17 +1159,7 @@ void __cpuinit cpu_init(void)
1100 arch_kgdb_ops.correct_hw_break(); 1159 arch_kgdb_ops.correct_hw_break();
1101 else 1160 else
1102#endif 1161#endif
1103 { 1162 clear_all_debug_regs();
1104 /*
1105 * Clear all 6 debug registers:
1106 */
1107 set_debugreg(0UL, 0);
1108 set_debugreg(0UL, 1);
1109 set_debugreg(0UL, 2);
1110 set_debugreg(0UL, 3);
1111 set_debugreg(0UL, 6);
1112 set_debugreg(0UL, 7);
1113 }
1114 1163
1115 fpu_init(); 1164 fpu_init();
1116 1165
@@ -1131,7 +1180,8 @@ void __cpuinit cpu_init(void)
1131 1180
1132 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { 1181 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
1133 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); 1182 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
1134 for (;;) local_irq_enable(); 1183 for (;;)
1184 local_irq_enable();
1135 } 1185 }
1136 1186
1137 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 1187 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@ -1147,8 +1197,7 @@ void __cpuinit cpu_init(void)
1147 */ 1197 */
1148 atomic_inc(&init_mm.mm_count); 1198 atomic_inc(&init_mm.mm_count);
1149 curr->active_mm = &init_mm; 1199 curr->active_mm = &init_mm;
1150 if (curr->mm) 1200 BUG_ON(curr->mm);
1151 BUG();
1152 enter_lazy_tlb(&init_mm, curr); 1201 enter_lazy_tlb(&init_mm, curr);
1153 1202
1154 load_sp0(t, thread); 1203 load_sp0(t, thread);
@@ -1161,13 +1210,7 @@ void __cpuinit cpu_init(void)
1161 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); 1210 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
1162#endif 1211#endif
1163 1212
1164 /* Clear all 6 debug registers: */ 1213 clear_all_debug_regs();
1165 set_debugreg(0, 0);
1166 set_debugreg(0, 1);
1167 set_debugreg(0, 2);
1168 set_debugreg(0, 3);
1169 set_debugreg(0, 6);
1170 set_debugreg(0, 7);
1171 1214
1172 /* 1215 /*
1173 * Force FPU initialization: 1216 * Force FPU initialization:
@@ -1187,6 +1230,4 @@ void __cpuinit cpu_init(void)
1187 1230
1188 xsave_init(); 1231 xsave_init();
1189} 1232}
1190
1191
1192#endif 1233#endif
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index de4094a39210..6de9a908e400 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -3,33 +3,34 @@
3#define ARCH_X86_CPU_H 3#define ARCH_X86_CPU_H
4 4
5struct cpu_model_info { 5struct cpu_model_info {
6 int vendor; 6 int vendor;
7 int family; 7 int family;
8 char *model_names[16]; 8 const char *model_names[16];
9}; 9};
10 10
11/* attempt to consolidate cpu attributes */ 11/* attempt to consolidate cpu attributes */
12struct cpu_dev { 12struct cpu_dev {
13 char * c_vendor; 13 const char *c_vendor;
14 14
15 /* some have two possibilities for cpuid string */ 15 /* some have two possibilities for cpuid string */
16 char * c_ident[2]; 16 const char *c_ident[2];
17 17
18 struct cpu_model_info c_models[4]; 18 struct cpu_model_info c_models[4];
19 19
20 void (*c_early_init)(struct cpuinfo_x86 *c); 20 void (*c_early_init)(struct cpuinfo_x86 *);
21 void (*c_init)(struct cpuinfo_x86 * c); 21 void (*c_init)(struct cpuinfo_x86 *);
22 void (*c_identify)(struct cpuinfo_x86 * c); 22 void (*c_identify)(struct cpuinfo_x86 *);
23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); 23 unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
24 int c_x86_vendor; 24 int c_x86_vendor;
25}; 25};
26 26
27#define cpu_dev_register(cpu_devX) \ 27#define cpu_dev_register(cpu_devX) \
28 static struct cpu_dev *__cpu_dev_##cpu_devX __used \ 28 static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
29 __attribute__((__section__(".x86_cpu_dev.init"))) = \ 29 __attribute__((__section__(".x86_cpu_dev.init"))) = \
30 &cpu_devX; 30 &cpu_devX;
31 31
32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[];
33 34
34extern void display_cacheinfo(struct cpuinfo_x86 *c); 35extern void display_cacheinfo(struct cpuinfo_x86 *c);
35 36
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
new file mode 100755
index 000000000000..46e29ab96c6a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -0,0 +1,901 @@
1/*
2 * CPU x86 architecture debug code
3 *
4 * Copyright(C) 2009 Jaswinder Singh Rajput
5 *
6 * For licencing details see kernel-base/COPYING
7 */
8
9#include <linux/interrupt.h>
10#include <linux/compiler.h>
11#include <linux/seq_file.h>
12#include <linux/debugfs.h>
13#include <linux/kprobes.h>
14#include <linux/uaccess.h>
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/percpu.h>
18#include <linux/signal.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/types.h>
22#include <linux/init.h>
23#include <linux/slab.h>
24#include <linux/smp.h>
25
26#include <asm/cpu_debug.h>
27#include <asm/paravirt.h>
28#include <asm/system.h>
29#include <asm/traps.h>
30#include <asm/apic.h>
31#include <asm/desc.h>
32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
35static DEFINE_PER_CPU(unsigned, cpu_modelflag);
36static DEFINE_PER_CPU(int, cpu_priv_count);
37static DEFINE_PER_CPU(unsigned, cpu_model);
38
39static DEFINE_MUTEX(cpu_debug_lock);
40
41static struct dentry *cpu_debugfs_dir;
42
43static struct cpu_debug_base cpu_base[] = {
44 { "mc", CPU_MC, 0 },
45 { "monitor", CPU_MONITOR, 0 },
46 { "time", CPU_TIME, 0 },
47 { "pmc", CPU_PMC, 1 },
48 { "platform", CPU_PLATFORM, 0 },
49 { "apic", CPU_APIC, 0 },
50 { "poweron", CPU_POWERON, 0 },
51 { "control", CPU_CONTROL, 0 },
52 { "features", CPU_FEATURES, 0 },
53 { "lastbranch", CPU_LBRANCH, 0 },
54 { "bios", CPU_BIOS, 0 },
55 { "freq", CPU_FREQ, 0 },
56 { "mtrr", CPU_MTRR, 0 },
57 { "perf", CPU_PERF, 0 },
58 { "cache", CPU_CACHE, 0 },
59 { "sysenter", CPU_SYSENTER, 0 },
60 { "therm", CPU_THERM, 0 },
61 { "misc", CPU_MISC, 0 },
62 { "debug", CPU_DEBUG, 0 },
63 { "pat", CPU_PAT, 0 },
64 { "vmx", CPU_VMX, 0 },
65 { "call", CPU_CALL, 0 },
66 { "base", CPU_BASE, 0 },
67 { "ver", CPU_VER, 0 },
68 { "conf", CPU_CONF, 0 },
69 { "smm", CPU_SMM, 0 },
70 { "svm", CPU_SVM, 0 },
71 { "osvm", CPU_OSVM, 0 },
72 { "tss", CPU_TSS, 0 },
73 { "cr", CPU_CR, 0 },
74 { "dt", CPU_DT, 0 },
75 { "registers", CPU_REG_ALL, 0 },
76};
77
78static struct cpu_file_base cpu_file[] = {
79 { "index", CPU_REG_ALL, 0 },
80 { "value", CPU_REG_ALL, 1 },
81};
82
83/* Intel Registers Range */
84static struct cpu_debug_range cpu_intel_range[] = {
85 { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL },
86 { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE },
87 { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL },
88 { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM },
89 { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE },
90 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE },
91
92 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE },
93 { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON },
94 { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON },
95 { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE },
96
97 { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE },
98 { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT },
99 { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT },
100 { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM },
101
102 { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE },
103 { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 },
104 { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE },
105 { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON },
106
107 { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT },
108 { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT },
109 { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT },
110 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE },
111
112 { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 },
113 { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 },
114 { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX },
115 { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 },
116 { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT },
117
118 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE },
119 { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE },
120 { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE },
121 { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT },
122 { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE },
123 { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE },
124 { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE },
125 { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE },
126
127 { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT },
128 { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON },
129 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE },
130 { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON },
131 { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE },
132 { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 },
133 { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE },
134 { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 },
135
136 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE },
137 { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE },
138 { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE },
139 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE },
140 { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE },
141 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE },
142
143 { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON },
144 { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE },
145 { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON },
146 { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT },
147 { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON },
148 { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT },
149 { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON },
150 { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON },
151 { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON },
152 { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON },
153 { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE },
154 { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON },
155
156 { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE },
157 { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON },
158 { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE },
159 { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON },
160 { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE },
161 { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON },
162 { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE },
163 { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON },
164 { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE },
165 { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE },
166 { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
167
168 { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
169 { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
170 { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
171
172 { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
173
174 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
175 { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
176 { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
177 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
178};
179
180/* AMD Registers Range */
181static struct cpu_debug_range cpu_amd_range[] = {
182 { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
183 { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
184 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
185 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
186 { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
187 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
188
189 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
190 { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
191 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
192 { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
193
194 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
195 { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
196 { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
197 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
198 { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
199 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
200
201 { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
202
203 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
204 { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
205 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
206 { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
207
208 { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
209 { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
210 { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
211 { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
212 { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
213 { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
214 { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
215 { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
216 { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
217 { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
218 { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
219 { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
220 { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
221 { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
222 { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
223 { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
224 { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
225 { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
226};
227
228
229/* Intel */
230static int get_intel_modelflag(unsigned model)
231{
232 int flag;
233
234 switch (model) {
235 case 0x0501:
236 case 0x0502:
237 case 0x0504:
238 flag = CPU_INTEL_PENTIUM;
239 break;
240 case 0x0601:
241 case 0x0603:
242 case 0x0605:
243 case 0x0607:
244 case 0x0608:
245 case 0x060A:
246 case 0x060B:
247 flag = CPU_INTEL_P6;
248 break;
249 case 0x0609:
250 case 0x060D:
251 flag = CPU_INTEL_PENTIUM_M;
252 break;
253 case 0x060E:
254 flag = CPU_INTEL_CORE;
255 break;
256 case 0x060F:
257 case 0x0617:
258 flag = CPU_INTEL_CORE2;
259 break;
260 case 0x061C:
261 flag = CPU_INTEL_ATOM;
262 break;
263 case 0x0F00:
264 case 0x0F01:
265 case 0x0F02:
266 case 0x0F03:
267 case 0x0F04:
268 flag = CPU_INTEL_XEON_P4;
269 break;
270 case 0x0F06:
271 flag = CPU_INTEL_XEON_MP;
272 break;
273 default:
274 flag = CPU_NONE;
275 break;
276 }
277
278 return flag;
279}
280
281/* AMD */
282static int get_amd_modelflag(unsigned model)
283{
284 int flag;
285
286 switch (model >> 8) {
287 case 0x6:
288 flag = CPU_AMD_K6;
289 break;
290 case 0x7:
291 flag = CPU_AMD_K7;
292 break;
293 case 0x8:
294 flag = CPU_AMD_K8;
295 break;
296 case 0xf:
297 flag = CPU_AMD_0F;
298 break;
299 case 0x10:
300 flag = CPU_AMD_10;
301 break;
302 case 0x11:
303 flag = CPU_AMD_11;
304 break;
305 default:
306 flag = CPU_NONE;
307 break;
308 }
309
310 return flag;
311}
312
313static int get_cpu_modelflag(unsigned cpu)
314{
315 int flag;
316
317 flag = per_cpu(cpu_model, cpu);
318
319 switch (flag >> 16) {
320 case X86_VENDOR_INTEL:
321 flag = get_intel_modelflag(flag);
322 break;
323 case X86_VENDOR_AMD:
324 flag = get_amd_modelflag(flag & 0xffff);
325 break;
326 default:
327 flag = CPU_NONE;
328 break;
329 }
330
331 return flag;
332}
333
334static int get_cpu_range_count(unsigned cpu)
335{
336 int index;
337
338 switch (per_cpu(cpu_model, cpu) >> 16) {
339 case X86_VENDOR_INTEL:
340 index = ARRAY_SIZE(cpu_intel_range);
341 break;
342 case X86_VENDOR_AMD:
343 index = ARRAY_SIZE(cpu_amd_range);
344 break;
345 default:
346 index = 0;
347 break;
348 }
349
350 return index;
351}
352
353static int is_typeflag_valid(unsigned cpu, unsigned flag)
354{
355 unsigned vendor, modelflag;
356 int i, index;
357
358 /* Standard Registers should be always valid */
359 if (flag >= CPU_TSS)
360 return 1;
361
362 modelflag = per_cpu(cpu_modelflag, cpu);
363 vendor = per_cpu(cpu_model, cpu) >> 16;
364 index = get_cpu_range_count(cpu);
365
366 for (i = 0; i < index; i++) {
367 switch (vendor) {
368 case X86_VENDOR_INTEL:
369 if ((cpu_intel_range[i].model & modelflag) &&
370 (cpu_intel_range[i].flag & flag))
371 return 1;
372 break;
373 case X86_VENDOR_AMD:
374 if ((cpu_amd_range[i].model & modelflag) &&
375 (cpu_amd_range[i].flag & flag))
376 return 1;
377 break;
378 }
379 }
380
381 /* Invalid */
382 return 0;
383}
384
385static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
386 int index, unsigned flag)
387{
388 unsigned modelflag;
389
390 modelflag = per_cpu(cpu_modelflag, cpu);
391 *max = 0;
392 switch (per_cpu(cpu_model, cpu) >> 16) {
393 case X86_VENDOR_INTEL:
394 if ((cpu_intel_range[index].model & modelflag) &&
395 (cpu_intel_range[index].flag & flag)) {
396 *min = cpu_intel_range[index].min;
397 *max = cpu_intel_range[index].max;
398 }
399 break;
400 case X86_VENDOR_AMD:
401 if ((cpu_amd_range[index].model & modelflag) &&
402 (cpu_amd_range[index].flag & flag)) {
403 *min = cpu_amd_range[index].min;
404 *max = cpu_amd_range[index].max;
405 }
406 break;
407 }
408
409 return *max;
410}
411
412/* This function can also be called with seq = NULL for printk */
413static void print_cpu_data(struct seq_file *seq, unsigned type,
414 u32 low, u32 high)
415{
416 struct cpu_private *priv;
417 u64 val = high;
418
419 if (seq) {
420 priv = seq->private;
421 if (priv->file) {
422 val = (val << 32) | low;
423 seq_printf(seq, "0x%llx\n", val);
424 } else
425 seq_printf(seq, " %08x: %08x_%08x\n",
426 type, high, low);
427 } else
428 printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
429}
430
431/* This function can also be called with seq = NULL for printk */
432static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
433{
434 unsigned msr, msr_min, msr_max;
435 struct cpu_private *priv;
436 u32 low, high;
437 int i, range;
438
439 if (seq) {
440 priv = seq->private;
441 if (priv->file) {
442 if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
443 &low, &high))
444 print_cpu_data(seq, priv->reg, low, high);
445 return;
446 }
447 }
448
449 range = get_cpu_range_count(cpu);
450
451 for (i = 0; i < range; i++) {
452 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
453 continue;
454
455 for (msr = msr_min; msr <= msr_max; msr++) {
456 if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
457 continue;
458 print_cpu_data(seq, msr, low, high);
459 }
460 }
461}
462
463static void print_tss(void *arg)
464{
465 struct pt_regs *regs = task_pt_regs(current);
466 struct seq_file *seq = arg;
467 unsigned int seg;
468
469 seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
470 seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
471 seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
472 seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
473
474 seq_printf(seq, " RSI\t: %016lx\n", regs->si);
475 seq_printf(seq, " RDI\t: %016lx\n", regs->di);
476 seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
477 seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
478
479#ifdef CONFIG_X86_64
480 seq_printf(seq, " R08\t: %016lx\n", regs->r8);
481 seq_printf(seq, " R09\t: %016lx\n", regs->r9);
482 seq_printf(seq, " R10\t: %016lx\n", regs->r10);
483 seq_printf(seq, " R11\t: %016lx\n", regs->r11);
484 seq_printf(seq, " R12\t: %016lx\n", regs->r12);
485 seq_printf(seq, " R13\t: %016lx\n", regs->r13);
486 seq_printf(seq, " R14\t: %016lx\n", regs->r14);
487 seq_printf(seq, " R15\t: %016lx\n", regs->r15);
488#endif
489
490 asm("movl %%cs,%0" : "=r" (seg));
491 seq_printf(seq, " CS\t: %04x\n", seg);
492 asm("movl %%ds,%0" : "=r" (seg));
493 seq_printf(seq, " DS\t: %04x\n", seg);
494 seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
495 asm("movl %%es,%0" : "=r" (seg));
496 seq_printf(seq, " ES\t: %04x\n", seg);
497 asm("movl %%fs,%0" : "=r" (seg));
498 seq_printf(seq, " FS\t: %04x\n", seg);
499 asm("movl %%gs,%0" : "=r" (seg));
500 seq_printf(seq, " GS\t: %04x\n", seg);
501
502 seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
503
504 seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
505}
506
507static void print_cr(void *arg)
508{
509 struct seq_file *seq = arg;
510
511 seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
512 seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
513 seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
514 seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
515#ifdef CONFIG_X86_64
516 seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
517#endif
518}
519
520static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
521{
522 seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
523}
524
525static void print_dt(void *seq)
526{
527 struct desc_ptr dt;
528 unsigned long ldt;
529
530 /* IDT */
531 store_idt((struct desc_ptr *)&dt);
532 print_desc_ptr("IDT", seq, dt);
533
534 /* GDT */
535 store_gdt((struct desc_ptr *)&dt);
536 print_desc_ptr("GDT", seq, dt);
537
538 /* LDT */
539 store_ldt(ldt);
540 seq_printf(seq, " LDT\t: %016lx\n", ldt);
541
542 /* TR */
543 store_tr(ldt);
544 seq_printf(seq, " TR\t: %016lx\n", ldt);
545}
546
547static void print_dr(void *arg)
548{
549 struct seq_file *seq = arg;
550 unsigned long dr;
551 int i;
552
553 for (i = 0; i < 8; i++) {
554 /* Ignore db4, db5 */
555 if ((i == 4) || (i == 5))
556 continue;
557 get_debugreg(dr, i);
558 seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
559 }
560
561 seq_printf(seq, "\n MSR\t:\n");
562}
563
564static void print_apic(void *arg)
565{
566 struct seq_file *seq = arg;
567
568#ifdef CONFIG_X86_LOCAL_APIC
569 seq_printf(seq, " LAPIC\t:\n");
570 seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
571 seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
572 seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
573 seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
574 seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
575 seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
576 seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
577 seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
578 seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
579 seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
580 seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
581 seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
582 seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
583 seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
584 seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
585 seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
586 seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
587 seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
588 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
589 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
590 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
591#endif /* CONFIG_X86_LOCAL_APIC */
592
593 seq_printf(seq, "\n MSR\t:\n");
594}
595
596static int cpu_seq_show(struct seq_file *seq, void *v)
597{
598 struct cpu_private *priv = seq->private;
599
600 if (priv == NULL)
601 return -EINVAL;
602
603 switch (cpu_base[priv->type].flag) {
604 case CPU_TSS:
605 smp_call_function_single(priv->cpu, print_tss, seq, 1);
606 break;
607 case CPU_CR:
608 smp_call_function_single(priv->cpu, print_cr, seq, 1);
609 break;
610 case CPU_DT:
611 smp_call_function_single(priv->cpu, print_dt, seq, 1);
612 break;
613 case CPU_DEBUG:
614 if (priv->file == CPU_INDEX_BIT)
615 smp_call_function_single(priv->cpu, print_dr, seq, 1);
616 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
617 break;
618 case CPU_APIC:
619 if (priv->file == CPU_INDEX_BIT)
620 smp_call_function_single(priv->cpu, print_apic, seq, 1);
621 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
622 break;
623
624 default:
625 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
626 break;
627 }
628 seq_printf(seq, "\n");
629
630 return 0;
631}
632
633static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
634{
635 if (*pos == 0) /* One time is enough ;-) */
636 return seq;
637
638 return NULL;
639}
640
641static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
642{
643 (*pos)++;
644
645 return cpu_seq_start(seq, pos);
646}
647
648static void cpu_seq_stop(struct seq_file *seq, void *v)
649{
650}
651
652static const struct seq_operations cpu_seq_ops = {
653 .start = cpu_seq_start,
654 .next = cpu_seq_next,
655 .stop = cpu_seq_stop,
656 .show = cpu_seq_show,
657};
658
659static int cpu_seq_open(struct inode *inode, struct file *file)
660{
661 struct cpu_private *priv = inode->i_private;
662 struct seq_file *seq;
663 int err;
664
665 err = seq_open(file, &cpu_seq_ops);
666 if (!err) {
667 seq = file->private_data;
668 seq->private = priv;
669 }
670
671 return err;
672}
673
674static int write_msr(struct cpu_private *priv, u64 val)
675{
676 u32 low, high;
677
678 high = (val >> 32) & 0xffffffff;
679 low = val & 0xffffffff;
680
681 if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
682 return 0;
683
684 return -EPERM;
685}
686
687static int write_cpu_register(struct cpu_private *priv, const char *buf)
688{
689 int ret = -EPERM;
690 u64 val;
691
692 ret = strict_strtoull(buf, 0, &val);
693 if (ret < 0)
694 return ret;
695
696 /* Supporting only MSRs */
697 if (priv->type < CPU_TSS_BIT)
698 return write_msr(priv, val);
699
700 return ret;
701}
702
703static ssize_t cpu_write(struct file *file, const char __user *ubuf,
704 size_t count, loff_t *off)
705{
706 struct seq_file *seq = file->private_data;
707 struct cpu_private *priv = seq->private;
708 char buf[19];
709
710 if ((priv == NULL) || (count >= sizeof(buf)))
711 return -EINVAL;
712
713 if (copy_from_user(&buf, ubuf, count))
714 return -EFAULT;
715
716 buf[count] = 0;
717
718 if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
719 if (!write_cpu_register(priv, buf))
720 return count;
721
722 return -EACCES;
723}
724
725static const struct file_operations cpu_fops = {
726 .owner = THIS_MODULE,
727 .open = cpu_seq_open,
728 .read = seq_read,
729 .write = cpu_write,
730 .llseek = seq_lseek,
731 .release = seq_release,
732};
733
734static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
735 unsigned file, struct dentry *dentry)
736{
737 struct cpu_private *priv = NULL;
738
739 /* Already intialized */
740 if (file == CPU_INDEX_BIT)
741 if (per_cpu(cpu_arr[type].init, cpu))
742 return 0;
743
744 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
745 if (priv == NULL)
746 return -ENOMEM;
747
748 priv->cpu = cpu;
749 priv->type = type;
750 priv->reg = reg;
751 priv->file = file;
752 mutex_lock(&cpu_debug_lock);
753 per_cpu(priv_arr[type], cpu) = priv;
754 per_cpu(cpu_priv_count, cpu)++;
755 mutex_unlock(&cpu_debug_lock);
756
757 if (file)
758 debugfs_create_file(cpu_file[file].name, S_IRUGO,
759 dentry, (void *)priv, &cpu_fops);
760 else {
761 debugfs_create_file(cpu_base[type].name, S_IRUGO,
762 per_cpu(cpu_arr[type].dentry, cpu),
763 (void *)priv, &cpu_fops);
764 mutex_lock(&cpu_debug_lock);
765 per_cpu(cpu_arr[type].init, cpu) = 1;
766 mutex_unlock(&cpu_debug_lock);
767 }
768
769 return 0;
770}
771
772static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
773 struct dentry *dentry)
774{
775 unsigned file;
776 int err = 0;
777
778 for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
779 err = cpu_create_file(cpu, type, reg, file, dentry);
780 if (err)
781 return err;
782 }
783
784 return err;
785}
786
787static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
788{
789 struct dentry *cpu_dentry = NULL;
790 unsigned reg, reg_min, reg_max;
791 int i, range, err = 0;
792 char reg_dir[12];
793 u32 low, high;
794
795 range = get_cpu_range_count(cpu);
796
797 for (i = 0; i < range; i++) {
798 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
799 cpu_base[type].flag))
800 continue;
801
802 for (reg = reg_min; reg <= reg_max; reg++) {
803 if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
804 continue;
805
806 sprintf(reg_dir, "0x%x", reg);
807 cpu_dentry = debugfs_create_dir(reg_dir, dentry);
808 err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
809 if (err)
810 return err;
811 }
812 }
813
814 return err;
815}
816
817static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
818{
819 struct dentry *cpu_dentry = NULL;
820 unsigned type;
821 int err = 0;
822
823 for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
824 if (!is_typeflag_valid(cpu, cpu_base[type].flag))
825 continue;
826 cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
827 per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
828
829 if (type < CPU_TSS_BIT)
830 err = cpu_init_msr(cpu, type, cpu_dentry);
831 else
832 err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
833 cpu_dentry);
834 if (err)
835 return err;
836 }
837
838 return err;
839}
840
841static int cpu_init_cpu(void)
842{
843 struct dentry *cpu_dentry = NULL;
844 struct cpuinfo_x86 *cpui;
845 char cpu_dir[12];
846 unsigned cpu;
847 int err = 0;
848
849 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
850 cpui = &cpu_data(cpu);
851 if (!cpu_has(cpui, X86_FEATURE_MSR))
852 continue;
853 per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
854 (cpui->x86 << 8) |
855 (cpui->x86_model));
856 per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
857
858 sprintf(cpu_dir, "cpu%d", cpu);
859 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
860 err = cpu_init_allreg(cpu, cpu_dentry);
861
862 pr_info("cpu%d(%d) debug files %d\n",
863 cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
864 if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
865 pr_err("Register files count %d exceeds limit %d\n",
866 per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
867 per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
868 err = -ENFILE;
869 }
870 if (err)
871 return err;
872 }
873
874 return err;
875}
876
877static int __init cpu_debug_init(void)
878{
879 cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
880
881 return cpu_init_cpu();
882}
883
884static void __exit cpu_debug_exit(void)
885{
886 int i, cpu;
887
888 if (cpu_debugfs_dir)
889 debugfs_remove_recursive(cpu_debugfs_dir);
890
891 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
892 for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
893 kfree(per_cpu(priv_arr[i], cpu));
894}
895
896module_init(cpu_debug_init);
897module_exit(cpu_debug_exit);
898
899MODULE_AUTHOR("Jaswinder Singh Rajput");
900MODULE_DESCRIPTION("CPU Debug module");
901MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 65792c2cc462..52c839875478 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -87,30 +87,15 @@ config X86_POWERNOW_K7_ACPI
87config X86_POWERNOW_K8 87config X86_POWERNOW_K8
88 tristate "AMD Opteron/Athlon64 PowerNow!" 88 tristate "AMD Opteron/Athlon64 PowerNow!"
89 select CPU_FREQ_TABLE 89 select CPU_FREQ_TABLE
90 depends on ACPI && ACPI_PROCESSOR
90 help 91 help
91 This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors. 92 This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors.
92 93
93 To compile this driver as a module, choose M here: the 94 To compile this driver as a module, choose M here: the
94 module will be called powernow-k8. 95 module will be called powernow-k8.
95 96
96 For details, take a look at <file:Documentation/cpu-freq/>. 97 For details, take a look at <file:Documentation/cpu-freq/>.
97 98
98 If in doubt, say N.
99
100config X86_POWERNOW_K8_ACPI
101 bool
102 prompt "ACPI Support" if X86_32
103 depends on ACPI && X86_POWERNOW_K8 && ACPI_PROCESSOR
104 depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m)
105 default y
106 help
107 This provides access to the K8s Processor Performance States via ACPI.
108 This driver is probably required for CPUFreq to work with multi-socket and
109 SMP systems. It is not required on at least some single-socket yet
110 multi-core systems, even if SMP is enabled.
111
112 It is safe to say Y here.
113
114config X86_GX_SUSPMOD 99config X86_GX_SUSPMOD
115 tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation" 100 tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
116 depends on X86_32 && PCI 101 depends on X86_32 && PCI
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 560f7760dae5..509296df294d 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -1,6 +1,11 @@
1# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
2# K8 systems. ACPI is preferred to all other hardware-specific drivers.
3# speedstep-* is preferred over p4-clockmod.
4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
1obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 7obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
2obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 8obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
3obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
4obj-$(CONFIG_X86_LONGHAUL) += longhaul.o 9obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
5obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o 10obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
6obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o 11obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
@@ -10,7 +15,6 @@ obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
10obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o 15obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
11obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o 16obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
12obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o 17obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
13obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
14obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o 18obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
15obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o 19obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
16obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o 20obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c3..19f6b9d27e83 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * acpi-cpufreq.c - ACPI Processor P-States Driver ($Revision: 1.4 $) 2 * acpi-cpufreq.c - ACPI Processor P-States Driver
3 * 3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> 4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> 5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
@@ -33,19 +33,21 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/ftrace.h> 36#include <trace/power.h>
37 37
38#include <linux/acpi.h> 38#include <linux/acpi.h>
39#include <linux/io.h>
40#include <linux/delay.h>
41#include <linux/uaccess.h>
42
39#include <acpi/processor.h> 43#include <acpi/processor.h>
40 44
41#include <asm/io.h>
42#include <asm/msr.h> 45#include <asm/msr.h>
43#include <asm/processor.h> 46#include <asm/processor.h>
44#include <asm/cpufeature.h> 47#include <asm/cpufeature.h>
45#include <asm/delay.h>
46#include <asm/uaccess.h>
47 48
48#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg) 49#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
50 "acpi-cpufreq", msg)
49 51
50MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); 52MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
51MODULE_DESCRIPTION("ACPI Processor P-States Driver"); 53MODULE_DESCRIPTION("ACPI Processor P-States Driver");
@@ -70,6 +72,8 @@ struct acpi_cpufreq_data {
70 72
71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); 73static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
72 74
75DEFINE_TRACE(power_mark);
76
73/* acpi_perf_data is a pointer to percpu data. */ 77/* acpi_perf_data is a pointer to percpu data. */
74static struct acpi_processor_performance *acpi_perf_data; 78static struct acpi_processor_performance *acpi_perf_data;
75 79
@@ -95,7 +99,7 @@ static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
95 99
96 perf = data->acpi_data; 100 perf = data->acpi_data;
97 101
98 for (i=0; i<perf->state_count; i++) { 102 for (i = 0; i < perf->state_count; i++) {
99 if (value == perf->states[i].status) 103 if (value == perf->states[i].status)
100 return data->freq_table[i].frequency; 104 return data->freq_table[i].frequency;
101 } 105 }
@@ -110,7 +114,7 @@ static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
110 msr &= INTEL_MSR_RANGE; 114 msr &= INTEL_MSR_RANGE;
111 perf = data->acpi_data; 115 perf = data->acpi_data;
112 116
113 for (i=0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) { 117 for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
114 if (msr == perf->states[data->freq_table[i].index].status) 118 if (msr == perf->states[data->freq_table[i].index].status)
115 return data->freq_table[i].frequency; 119 return data->freq_table[i].frequency;
116 } 120 }
@@ -138,15 +142,13 @@ struct io_addr {
138 u8 bit_width; 142 u8 bit_width;
139}; 143};
140 144
141typedef union {
142 struct msr_addr msr;
143 struct io_addr io;
144} drv_addr_union;
145
146struct drv_cmd { 145struct drv_cmd {
147 unsigned int type; 146 unsigned int type;
148 const struct cpumask *mask; 147 const struct cpumask *mask;
149 drv_addr_union addr; 148 union {
149 struct msr_addr msr;
150 struct io_addr io;
151 } addr;
150 u32 val; 152 u32 val;
151}; 153};
152 154
@@ -369,7 +371,7 @@ static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
369 unsigned int cur_freq; 371 unsigned int cur_freq;
370 unsigned int i; 372 unsigned int i;
371 373
372 for (i=0; i<100; i++) { 374 for (i = 0; i < 100; i++) {
373 cur_freq = extract_freq(get_cur_val(mask), data); 375 cur_freq = extract_freq(get_cur_val(mask), data);
374 if (cur_freq == freq) 376 if (cur_freq == freq)
375 return 1; 377 return 1;
@@ -494,7 +496,7 @@ acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
494 unsigned long freq; 496 unsigned long freq;
495 unsigned long freqn = perf->states[0].core_frequency * 1000; 497 unsigned long freqn = perf->states[0].core_frequency * 1000;
496 498
497 for (i=0; i<(perf->state_count-1); i++) { 499 for (i = 0; i < (perf->state_count-1); i++) {
498 freq = freqn; 500 freq = freqn;
499 freqn = perf->states[i+1].core_frequency * 1000; 501 freqn = perf->states[i+1].core_frequency * 1000;
500 if ((2 * cpu_khz) > (freqn + freq)) { 502 if ((2 * cpu_khz) > (freqn + freq)) {
@@ -601,7 +603,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
601 if (!data) 603 if (!data)
602 return -ENOMEM; 604 return -ENOMEM;
603 605
604 data->acpi_data = percpu_ptr(acpi_perf_data, cpu); 606 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
605 per_cpu(drv_data, cpu) = data; 607 per_cpu(drv_data, cpu) = data;
606 608
607 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 609 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
@@ -673,17 +675,29 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
673 675
674 /* detect transition latency */ 676 /* detect transition latency */
675 policy->cpuinfo.transition_latency = 0; 677 policy->cpuinfo.transition_latency = 0;
676 for (i=0; i<perf->state_count; i++) { 678 for (i = 0; i < perf->state_count; i++) {
677 if ((perf->states[i].transition_latency * 1000) > 679 if ((perf->states[i].transition_latency * 1000) >
678 policy->cpuinfo.transition_latency) 680 policy->cpuinfo.transition_latency)
679 policy->cpuinfo.transition_latency = 681 policy->cpuinfo.transition_latency =
680 perf->states[i].transition_latency * 1000; 682 perf->states[i].transition_latency * 1000;
681 } 683 }
682 684
685 /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
686 if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
687 policy->cpuinfo.transition_latency > 20 * 1000) {
688 static int print_once;
689 policy->cpuinfo.transition_latency = 20 * 1000;
690 if (!print_once) {
691 print_once = 1;
692 printk(KERN_INFO "Capping off P-state tranision latency"
693 " at 20 uS\n");
694 }
695 }
696
683 data->max_freq = perf->states[0].core_frequency * 1000; 697 data->max_freq = perf->states[0].core_frequency * 1000;
684 /* table init */ 698 /* table init */
685 for (i=0; i<perf->state_count; i++) { 699 for (i = 0; i < perf->state_count; i++) {
686 if (i>0 && perf->states[i].core_frequency >= 700 if (i > 0 && perf->states[i].core_frequency >=
687 data->freq_table[valid_states-1].frequency / 1000) 701 data->freq_table[valid_states-1].frequency / 1000)
688 continue; 702 continue;
689 703
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
index 965ea52767ac..733093d60436 100644
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -32,7 +32,7 @@
32 * nforce2_chipset: 32 * nforce2_chipset:
33 * FSB is changed using the chipset 33 * FSB is changed using the chipset
34 */ 34 */
35static struct pci_dev *nforce2_chipset_dev; 35static struct pci_dev *nforce2_dev;
36 36
37/* fid: 37/* fid:
38 * multiplier * 10 38 * multiplier * 10
@@ -56,7 +56,9 @@ MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
56MODULE_PARM_DESC(min_fsb, 56MODULE_PARM_DESC(min_fsb,
57 "Minimum FSB to use, if not defined: current FSB - 50"); 57 "Minimum FSB to use, if not defined: current FSB - 50");
58 58
59#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg) 59#define PFX "cpufreq-nforce2: "
60#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
61 "cpufreq-nforce2", msg)
60 62
61/** 63/**
62 * nforce2_calc_fsb - calculate FSB 64 * nforce2_calc_fsb - calculate FSB
@@ -118,11 +120,11 @@ static void nforce2_write_pll(int pll)
118 int temp; 120 int temp;
119 121
120 /* Set the pll addr. to 0x00 */ 122 /* Set the pll addr. to 0x00 */
121 pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLADR, 0); 123 pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0);
122 124
123 /* Now write the value in all 64 registers */ 125 /* Now write the value in all 64 registers */
124 for (temp = 0; temp <= 0x3f; temp++) 126 for (temp = 0; temp <= 0x3f; temp++)
125 pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, pll); 127 pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll);
126 128
127 return; 129 return;
128} 130}
@@ -139,8 +141,8 @@ static unsigned int nforce2_fsb_read(int bootfsb)
139 u32 fsb, temp = 0; 141 u32 fsb, temp = 0;
140 142
141 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ 143 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
142 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 144 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF,
143 0x01EF, PCI_ANY_ID, PCI_ANY_ID, NULL); 145 PCI_ANY_ID, PCI_ANY_ID, NULL);
144 if (!nforce2_sub5) 146 if (!nforce2_sub5)
145 return 0; 147 return 0;
146 148
@@ -148,13 +150,13 @@ static unsigned int nforce2_fsb_read(int bootfsb)
148 fsb /= 1000000; 150 fsb /= 1000000;
149 151
150 /* Check if PLL register is already set */ 152 /* Check if PLL register is already set */
151 pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp); 153 pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
152 154
153 if (bootfsb || !temp) 155 if (bootfsb || !temp)
154 return fsb; 156 return fsb;
155 157
156 /* Use PLL register FSB value */ 158 /* Use PLL register FSB value */
157 pci_read_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, &temp); 159 pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp);
158 fsb = nforce2_calc_fsb(temp); 160 fsb = nforce2_calc_fsb(temp);
159 161
160 return fsb; 162 return fsb;
@@ -174,18 +176,18 @@ static int nforce2_set_fsb(unsigned int fsb)
174 int pll = 0; 176 int pll = 0;
175 177
176 if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) { 178 if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
177 printk(KERN_ERR "cpufreq: FSB %d is out of range!\n", fsb); 179 printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
178 return -EINVAL; 180 return -EINVAL;
179 } 181 }
180 182
181 tfsb = nforce2_fsb_read(0); 183 tfsb = nforce2_fsb_read(0);
182 if (!tfsb) { 184 if (!tfsb) {
183 printk(KERN_ERR "cpufreq: Error while reading the FSB\n"); 185 printk(KERN_ERR PFX "Error while reading the FSB\n");
184 return -EINVAL; 186 return -EINVAL;
185 } 187 }
186 188
187 /* First write? Then set actual value */ 189 /* First write? Then set actual value */
188 pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp); 190 pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
189 if (!temp) { 191 if (!temp) {
190 pll = nforce2_calc_pll(tfsb); 192 pll = nforce2_calc_pll(tfsb);
191 193
@@ -197,7 +199,7 @@ static int nforce2_set_fsb(unsigned int fsb)
197 199
198 /* Enable write access */ 200 /* Enable write access */
199 temp = 0x01; 201 temp = 0x01;
200 pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8)temp); 202 pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp);
201 203
202 diff = tfsb - fsb; 204 diff = tfsb - fsb;
203 205
@@ -222,7 +224,7 @@ static int nforce2_set_fsb(unsigned int fsb)
222 } 224 }
223 225
224 temp = 0x40; 226 temp = 0x40;
225 pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLADR, (u8)temp); 227 pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp);
226 228
227 return 0; 229 return 0;
228} 230}
@@ -244,7 +246,8 @@ static unsigned int nforce2_get(unsigned int cpu)
244 * nforce2_target - set a new CPUFreq policy 246 * nforce2_target - set a new CPUFreq policy
245 * @policy: new policy 247 * @policy: new policy
246 * @target_freq: the target frequency 248 * @target_freq: the target frequency
247 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 249 * @relation: how that frequency relates to achieved frequency
250 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
248 * 251 *
249 * Sets a new CPUFreq policy. 252 * Sets a new CPUFreq policy.
250 */ 253 */
@@ -276,7 +279,7 @@ static int nforce2_target(struct cpufreq_policy *policy,
276 /* local_irq_save(flags); */ 279 /* local_irq_save(flags); */
277 280
278 if (nforce2_set_fsb(target_fsb) < 0) 281 if (nforce2_set_fsb(target_fsb) < 0)
279 printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n", 282 printk(KERN_ERR PFX "Changing FSB to %d failed\n",
280 target_fsb); 283 target_fsb);
281 else 284 else
282 dprintk("Changed FSB successfully to %d\n", 285 dprintk("Changed FSB successfully to %d\n",
@@ -327,8 +330,8 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy)
327 /* FIX: Get FID from CPU */ 330 /* FIX: Get FID from CPU */
328 if (!fid) { 331 if (!fid) {
329 if (!cpu_khz) { 332 if (!cpu_khz) {
330 printk(KERN_WARNING 333 printk(KERN_WARNING PFX
331 "cpufreq: cpu_khz not set, can't calculate multiplier!\n"); 334 "cpu_khz not set, can't calculate multiplier!\n");
332 return -ENODEV; 335 return -ENODEV;
333 } 336 }
334 337
@@ -343,7 +346,7 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy)
343 } 346 }
344 } 347 }
345 348
346 printk(KERN_INFO "cpufreq: FSB currently at %i MHz, FID %d.%d\n", fsb, 349 printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb,
347 fid / 10, fid % 10); 350 fid / 10, fid % 10);
348 351
349 /* Set maximum FSB to FSB at boot time */ 352 /* Set maximum FSB to FSB at boot time */
@@ -392,17 +395,18 @@ static struct cpufreq_driver nforce2_driver = {
392 */ 395 */
393static unsigned int nforce2_detect_chipset(void) 396static unsigned int nforce2_detect_chipset(void)
394{ 397{
395 nforce2_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 398 nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
396 PCI_DEVICE_ID_NVIDIA_NFORCE2, 399 PCI_DEVICE_ID_NVIDIA_NFORCE2,
397 PCI_ANY_ID, PCI_ANY_ID, NULL); 400 PCI_ANY_ID, PCI_ANY_ID, NULL);
398 401
399 if (nforce2_chipset_dev == NULL) 402 if (nforce2_dev == NULL)
400 return -ENODEV; 403 return -ENODEV;
401 404
402 printk(KERN_INFO "cpufreq: Detected nForce2 chipset revision %X\n", 405 printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n",
403 nforce2_chipset_dev->revision); 406 nforce2_dev->revision);
404 printk(KERN_INFO 407 printk(KERN_INFO PFX
405 "cpufreq: FSB changing is maybe unstable and can lead to crashes and data loss.\n"); 408 "FSB changing is maybe unstable and can lead to "
409 "crashes and data loss.\n");
406 410
407 return 0; 411 return 0;
408} 412}
@@ -420,7 +424,7 @@ static int __init nforce2_init(void)
420 424
421 /* detect chipset */ 425 /* detect chipset */
422 if (nforce2_detect_chipset()) { 426 if (nforce2_detect_chipset()) {
423 printk(KERN_ERR "cpufreq: No nForce2 chipset.\n"); 427 printk(KERN_INFO PFX "No nForce2 chipset.\n");
424 return -ENODEV; 428 return -ENODEV;
425 } 429 }
426 430
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
index 41ab3f064cb1..35a257dd4bb7 100644
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
@@ -12,12 +12,12 @@
12#include <linux/cpufreq.h> 12#include <linux/cpufreq.h>
13#include <linux/ioport.h> 13#include <linux/ioport.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/timex.h>
16#include <linux/io.h>
17#include <linux/delay.h>
15 18
16#include <asm/msr.h> 19#include <asm/msr.h>
17#include <asm/tsc.h> 20#include <asm/tsc.h>
18#include <asm/timex.h>
19#include <asm/io.h>
20#include <asm/delay.h>
21 21
22#define EPS_BRAND_C7M 0 22#define EPS_BRAND_C7M 0
23#define EPS_BRAND_C7 1 23#define EPS_BRAND_C7 1
@@ -184,7 +184,7 @@ static int eps_cpu_init(struct cpufreq_policy *policy)
184 break; 184 break;
185 } 185 }
186 186
187 switch(brand) { 187 switch (brand) {
188 case EPS_BRAND_C7M: 188 case EPS_BRAND_C7M:
189 printk(KERN_CONT "C7-M\n"); 189 printk(KERN_CONT "C7-M\n");
190 break; 190 break;
@@ -218,17 +218,20 @@ static int eps_cpu_init(struct cpufreq_policy *policy)
218 /* Print voltage and multiplier */ 218 /* Print voltage and multiplier */
219 rdmsr(MSR_IA32_PERF_STATUS, lo, hi); 219 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
220 current_voltage = lo & 0xff; 220 current_voltage = lo & 0xff;
221 printk(KERN_INFO "eps: Current voltage = %dmV\n", current_voltage * 16 + 700); 221 printk(KERN_INFO "eps: Current voltage = %dmV\n",
222 current_voltage * 16 + 700);
222 current_multiplier = (lo >> 8) & 0xff; 223 current_multiplier = (lo >> 8) & 0xff;
223 printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier); 224 printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
224 225
225 /* Print limits */ 226 /* Print limits */
226 max_voltage = hi & 0xff; 227 max_voltage = hi & 0xff;
227 printk(KERN_INFO "eps: Highest voltage = %dmV\n", max_voltage * 16 + 700); 228 printk(KERN_INFO "eps: Highest voltage = %dmV\n",
229 max_voltage * 16 + 700);
228 max_multiplier = (hi >> 8) & 0xff; 230 max_multiplier = (hi >> 8) & 0xff;
229 printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier); 231 printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
230 min_voltage = (hi >> 16) & 0xff; 232 min_voltage = (hi >> 16) & 0xff;
231 printk(KERN_INFO "eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700); 233 printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
234 min_voltage * 16 + 700);
232 min_multiplier = (hi >> 24) & 0xff; 235 min_multiplier = (hi >> 24) & 0xff;
233 printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier); 236 printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
234 237
@@ -318,7 +321,7 @@ static int eps_cpu_exit(struct cpufreq_policy *policy)
318 return 0; 321 return 0;
319} 322}
320 323
321static struct freq_attr* eps_attr[] = { 324static struct freq_attr *eps_attr[] = {
322 &cpufreq_freq_attr_scaling_available_freqs, 325 &cpufreq_freq_attr_scaling_available_freqs,
323 NULL, 326 NULL,
324}; 327};
@@ -356,7 +359,7 @@ static void __exit eps_exit(void)
356 cpufreq_unregister_driver(&eps_driver); 359 cpufreq_unregister_driver(&eps_driver);
357} 360}
358 361
359MODULE_AUTHOR("Rafa³ Bilski <rafalbilski@interia.pl>"); 362MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
360MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's."); 363MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's.");
361MODULE_LICENSE("GPL"); 364MODULE_LICENSE("GPL");
362 365
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index fe613c93b366..006b278b0d5d 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -184,7 +184,8 @@ static int elanfreq_target(struct cpufreq_policy *policy,
184{ 184{
185 unsigned int newstate = 0; 185 unsigned int newstate = 0;
186 186
187 if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], target_freq, relation, &newstate)) 187 if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
188 target_freq, relation, &newstate))
188 return -EINVAL; 189 return -EINVAL;
189 190
190 elanfreq_set_cpu_state(newstate); 191 elanfreq_set_cpu_state(newstate);
@@ -301,7 +302,8 @@ static void __exit elanfreq_exit(void)
301module_param(max_freq, int, 0444); 302module_param(max_freq, int, 0444);
302 303
303MODULE_LICENSE("GPL"); 304MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); 305MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
306 "Sven Geggus <sven@geggus.net>");
305MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs"); 307MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
306 308
307module_init(elanfreq_init); 309module_init(elanfreq_init);
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index 9d9eae82e60f..ac27ec2264d5 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -79,8 +79,9 @@
79#include <linux/smp.h> 79#include <linux/smp.h>
80#include <linux/cpufreq.h> 80#include <linux/cpufreq.h>
81#include <linux/pci.h> 81#include <linux/pci.h>
82#include <linux/errno.h>
83
82#include <asm/processor-cyrix.h> 84#include <asm/processor-cyrix.h>
83#include <asm/errno.h>
84 85
85/* PCI config registers, all at F0 */ 86/* PCI config registers, all at F0 */
86#define PCI_PMER1 0x80 /* power management enable register 1 */ 87#define PCI_PMER1 0x80 /* power management enable register 1 */
@@ -122,8 +123,8 @@ static struct gxfreq_params *gx_params;
122static int stock_freq; 123static int stock_freq;
123 124
124/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */ 125/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
125static int pci_busclk = 0; 126static int pci_busclk;
126module_param (pci_busclk, int, 0444); 127module_param(pci_busclk, int, 0444);
127 128
128/* maximum duration for which the cpu may be suspended 129/* maximum duration for which the cpu may be suspended
129 * (32us * MAX_DURATION). If no parameter is given, this defaults 130 * (32us * MAX_DURATION). If no parameter is given, this defaults
@@ -132,7 +133,7 @@ module_param (pci_busclk, int, 0444);
132 * is suspended -- processing power is just 0.39% of what it used to be, 133 * is suspended -- processing power is just 0.39% of what it used to be,
133 * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */ 134 * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
134static int max_duration = 255; 135static int max_duration = 255;
135module_param (max_duration, int, 0444); 136module_param(max_duration, int, 0444);
136 137
137/* For the default policy, we want at least some processing power 138/* For the default policy, we want at least some processing power
138 * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV) 139 * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
@@ -140,7 +141,8 @@ module_param (max_duration, int, 0444);
140#define POLICY_MIN_DIV 20 141#define POLICY_MIN_DIV 20
141 142
142 143
143#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "gx-suspmod", msg) 144#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
145 "gx-suspmod", msg)
144 146
145/** 147/**
146 * we can detect a core multipiler from dir0_lsb 148 * we can detect a core multipiler from dir0_lsb
@@ -166,12 +168,20 @@ static int gx_freq_mult[16] = {
166 * Low Level chipset interface * 168 * Low Level chipset interface *
167 ****************************************************************/ 169 ****************************************************************/
168static struct pci_device_id gx_chipset_tbl[] __initdata = { 170static struct pci_device_id gx_chipset_tbl[] __initdata = {
169 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, PCI_ANY_ID, PCI_ANY_ID }, 171 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
170 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, PCI_ANY_ID, PCI_ANY_ID }, 172 PCI_ANY_ID, PCI_ANY_ID },
171 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, PCI_ANY_ID, PCI_ANY_ID }, 173 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520,
174 PCI_ANY_ID, PCI_ANY_ID },
175 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
176 PCI_ANY_ID, PCI_ANY_ID },
172 { 0, }, 177 { 0, },
173}; 178};
174 179
180static void gx_write_byte(int reg, int value)
181{
182 pci_write_config_byte(gx_params->cs55x0, reg, value);
183}
184
175/** 185/**
176 * gx_detect_chipset: 186 * gx_detect_chipset:
177 * 187 *
@@ -200,7 +210,8 @@ static __init struct pci_dev *gx_detect_chipset(void)
200/** 210/**
201 * gx_get_cpuspeed: 211 * gx_get_cpuspeed:
202 * 212 *
203 * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi Geode CPU runs. 213 * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi
214 * Geode CPU runs.
204 */ 215 */
205static unsigned int gx_get_cpuspeed(unsigned int cpu) 216static unsigned int gx_get_cpuspeed(unsigned int cpu)
206{ 217{
@@ -217,17 +228,18 @@ static unsigned int gx_get_cpuspeed(unsigned int cpu)
217 * 228 *
218 **/ 229 **/
219 230
220static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, u8 *off_duration) 231static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration,
232 u8 *off_duration)
221{ 233{
222 unsigned int i; 234 unsigned int i;
223 u8 tmp_on, tmp_off; 235 u8 tmp_on, tmp_off;
224 int old_tmp_freq = stock_freq; 236 int old_tmp_freq = stock_freq;
225 int tmp_freq; 237 int tmp_freq;
226 238
227 *off_duration=1; 239 *off_duration = 1;
228 *on_duration=0; 240 *on_duration = 0;
229 241
230 for (i=max_duration; i>0; i--) { 242 for (i = max_duration; i > 0; i--) {
231 tmp_off = ((khz * i) / stock_freq) & 0xff; 243 tmp_off = ((khz * i) / stock_freq) & 0xff;
232 tmp_on = i - tmp_off; 244 tmp_on = i - tmp_off;
233 tmp_freq = (stock_freq * tmp_off) / i; 245 tmp_freq = (stock_freq * tmp_off) / i;
@@ -259,26 +271,34 @@ static void gx_set_cpuspeed(unsigned int khz)
259 freqs.cpu = 0; 271 freqs.cpu = 0;
260 freqs.old = gx_get_cpuspeed(0); 272 freqs.old = gx_get_cpuspeed(0);
261 273
262 new_khz = gx_validate_speed(khz, &gx_params->on_duration, &gx_params->off_duration); 274 new_khz = gx_validate_speed(khz, &gx_params->on_duration,
275 &gx_params->off_duration);
263 276
264 freqs.new = new_khz; 277 freqs.new = new_khz;
265 278
266 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 279 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
267 local_irq_save(flags); 280 local_irq_save(flags);
268 281
269 if (new_khz != stock_freq) { /* if new khz == 100% of CPU speed, it is special case */ 282
283
284 if (new_khz != stock_freq) {
285 /* if new khz == 100% of CPU speed, it is special case */
270 switch (gx_params->cs55x0->device) { 286 switch (gx_params->cs55x0->device) {
271 case PCI_DEVICE_ID_CYRIX_5530_LEGACY: 287 case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
272 pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP; 288 pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
273 /* FIXME: need to test other values -- Zwane,Miura */ 289 /* FIXME: need to test other values -- Zwane,Miura */
274 pci_write_config_byte(gx_params->cs55x0, PCI_IRQTC, 4); /* typical 2 to 4ms */ 290 /* typical 2 to 4ms */
275 pci_write_config_byte(gx_params->cs55x0, PCI_VIDTC, 100);/* typical 50 to 100ms */ 291 gx_write_byte(PCI_IRQTC, 4);
276 pci_write_config_byte(gx_params->cs55x0, PCI_PMER1, pmer1); 292 /* typical 50 to 100ms */
277 293 gx_write_byte(PCI_VIDTC, 100);
278 if (gx_params->cs55x0->revision < 0x10) { /* CS5530(rev 1.2, 1.3) */ 294 gx_write_byte(PCI_PMER1, pmer1);
279 suscfg = gx_params->pci_suscfg | SUSMOD; 295
280 } else { /* CS5530A,B.. */ 296 if (gx_params->cs55x0->revision < 0x10) {
281 suscfg = gx_params->pci_suscfg | SUSMOD | PWRSVE; 297 /* CS5530(rev 1.2, 1.3) */
298 suscfg = gx_params->pci_suscfg|SUSMOD;
299 } else {
300 /* CS5530A,B.. */
301 suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE;
282 } 302 }
283 break; 303 break;
284 case PCI_DEVICE_ID_CYRIX_5520: 304 case PCI_DEVICE_ID_CYRIX_5520:
@@ -294,13 +314,13 @@ static void gx_set_cpuspeed(unsigned int khz)
294 suscfg = gx_params->pci_suscfg & ~(SUSMOD); 314 suscfg = gx_params->pci_suscfg & ~(SUSMOD);
295 gx_params->off_duration = 0; 315 gx_params->off_duration = 0;
296 gx_params->on_duration = 0; 316 gx_params->on_duration = 0;
297 dprintk("suspend modulation disabled: cpu runs 100 percent speed.\n"); 317 dprintk("suspend modulation disabled: cpu runs 100%% speed.\n");
298 } 318 }
299 319
300 pci_write_config_byte(gx_params->cs55x0, PCI_MODOFF, gx_params->off_duration); 320 gx_write_byte(PCI_MODOFF, gx_params->off_duration);
301 pci_write_config_byte(gx_params->cs55x0, PCI_MODON, gx_params->on_duration); 321 gx_write_byte(PCI_MODON, gx_params->on_duration);
302 322
303 pci_write_config_byte(gx_params->cs55x0, PCI_SUSCFG, suscfg); 323 gx_write_byte(PCI_SUSCFG, suscfg);
304 pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg); 324 pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
305 325
306 local_irq_restore(flags); 326 local_irq_restore(flags);
@@ -334,7 +354,8 @@ static int cpufreq_gx_verify(struct cpufreq_policy *policy)
334 return -EINVAL; 354 return -EINVAL;
335 355
336 policy->cpu = 0; 356 policy->cpu = 0;
337 cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq); 357 cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
358 stock_freq);
338 359
339 /* it needs to be assured that at least one supported frequency is 360 /* it needs to be assured that at least one supported frequency is
340 * within policy->min and policy->max. If it is not, policy->max 361 * within policy->min and policy->max. If it is not, policy->max
@@ -354,7 +375,8 @@ static int cpufreq_gx_verify(struct cpufreq_policy *policy)
354 policy->max = tmp_freq; 375 policy->max = tmp_freq;
355 if (policy->max < policy->min) 376 if (policy->max < policy->min)
356 policy->max = policy->min; 377 policy->max = policy->min;
357 cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq); 378 cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
379 stock_freq);
358 380
359 return 0; 381 return 0;
360} 382}
@@ -398,18 +420,18 @@ static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
398 return -ENODEV; 420 return -ENODEV;
399 421
400 /* determine maximum frequency */ 422 /* determine maximum frequency */
401 if (pci_busclk) { 423 if (pci_busclk)
402 maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; 424 maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
403 } else if (cpu_khz) { 425 else if (cpu_khz)
404 maxfreq = cpu_khz; 426 maxfreq = cpu_khz;
405 } else { 427 else
406 maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; 428 maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
407 } 429
408 stock_freq = maxfreq; 430 stock_freq = maxfreq;
409 curfreq = gx_get_cpuspeed(0); 431 curfreq = gx_get_cpuspeed(0);
410 432
411 dprintk("cpu max frequency is %d.\n", maxfreq); 433 dprintk("cpu max frequency is %d.\n", maxfreq);
412 dprintk("cpu current frequency is %dkHz.\n",curfreq); 434 dprintk("cpu current frequency is %dkHz.\n", curfreq);
413 435
414 /* setup basic struct for cpufreq API */ 436 /* setup basic struct for cpufreq API */
415 policy->cpu = 0; 437 policy->cpu = 0;
@@ -447,7 +469,8 @@ static int __init cpufreq_gx_init(void)
447 struct pci_dev *gx_pci; 469 struct pci_dev *gx_pci;
448 470
449 /* Test if we have the right hardware */ 471 /* Test if we have the right hardware */
450 if ((gx_pci = gx_detect_chipset()) == NULL) 472 gx_pci = gx_detect_chipset();
473 if (gx_pci == NULL)
451 return -ENODEV; 474 return -ENODEV;
452 475
453 /* check whether module parameters are sane */ 476 /* check whether module parameters are sane */
@@ -468,9 +491,11 @@ static int __init cpufreq_gx_init(void)
468 pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1)); 491 pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
469 pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2)); 492 pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
470 pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration)); 493 pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
471 pci_read_config_byte(params->cs55x0, PCI_MODOFF, &(params->off_duration)); 494 pci_read_config_byte(params->cs55x0, PCI_MODOFF,
495 &(params->off_duration));
472 496
473 if ((ret = cpufreq_register_driver(&gx_suspmod_driver))) { 497 ret = cpufreq_register_driver(&gx_suspmod_driver);
498 if (ret) {
474 kfree(params); 499 kfree(params);
475 return ret; /* register error! */ 500 return ret; /* register error! */
476 } 501 }
@@ -485,9 +510,9 @@ static void __exit cpufreq_gx_exit(void)
485 kfree(gx_params); 510 kfree(gx_params);
486} 511}
487 512
488MODULE_AUTHOR ("Hiroshi Miura <miura@da-cha.org>"); 513MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>");
489MODULE_DESCRIPTION ("Cpufreq driver for Cyrix MediaGX and NatSemi Geode"); 514MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
490MODULE_LICENSE ("GPL"); 515MODULE_LICENSE("GPL");
491 516
492module_init(cpufreq_gx_init); 517module_init(cpufreq_gx_init);
493module_exit(cpufreq_gx_exit); 518module_exit(cpufreq_gx_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index a4cff5d6e380..0bd48e65a0ca 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -30,12 +30,12 @@
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/timex.h>
34#include <linux/io.h>
35#include <linux/acpi.h>
36#include <linux/kernel.h>
33 37
34#include <asm/msr.h> 38#include <asm/msr.h>
35#include <asm/timex.h>
36#include <asm/io.h>
37#include <asm/acpi.h>
38#include <linux/acpi.h>
39#include <acpi/processor.h> 39#include <acpi/processor.h>
40 40
41#include "longhaul.h" 41#include "longhaul.h"
@@ -58,7 +58,7 @@
58#define USE_NORTHBRIDGE (1 << 2) 58#define USE_NORTHBRIDGE (1 << 2)
59 59
60static int cpu_model; 60static int cpu_model;
61static unsigned int numscales=16; 61static unsigned int numscales = 16;
62static unsigned int fsb; 62static unsigned int fsb;
63 63
64static const struct mV_pos *vrm_mV_table; 64static const struct mV_pos *vrm_mV_table;
@@ -67,8 +67,8 @@ static const unsigned char *mV_vrm_table;
67static unsigned int highest_speed, lowest_speed; /* kHz */ 67static unsigned int highest_speed, lowest_speed; /* kHz */
68static unsigned int minmult, maxmult; 68static unsigned int minmult, maxmult;
69static int can_scale_voltage; 69static int can_scale_voltage;
70static struct acpi_processor *pr = NULL; 70static struct acpi_processor *pr;
71static struct acpi_processor_cx *cx = NULL; 71static struct acpi_processor_cx *cx;
72static u32 acpi_regs_addr; 72static u32 acpi_regs_addr;
73static u8 longhaul_flags; 73static u8 longhaul_flags;
74static unsigned int longhaul_index; 74static unsigned int longhaul_index;
@@ -78,12 +78,13 @@ static int scale_voltage;
78static int disable_acpi_c3; 78static int disable_acpi_c3;
79static int revid_errata; 79static int revid_errata;
80 80
81#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg) 81#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
82 "longhaul", msg)
82 83
83 84
84/* Clock ratios multiplied by 10 */ 85/* Clock ratios multiplied by 10 */
85static int clock_ratio[32]; 86static int mults[32];
86static int eblcr_table[32]; 87static int eblcr[32];
87static int longhaul_version; 88static int longhaul_version;
88static struct cpufreq_frequency_table *longhaul_table; 89static struct cpufreq_frequency_table *longhaul_table;
89 90
@@ -93,7 +94,7 @@ static char speedbuffer[8];
93static char *print_speed(int speed) 94static char *print_speed(int speed)
94{ 95{
95 if (speed < 1000) { 96 if (speed < 1000) {
96 snprintf(speedbuffer, sizeof(speedbuffer),"%dMHz", speed); 97 snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed);
97 return speedbuffer; 98 return speedbuffer;
98 } 99 }
99 100
@@ -122,27 +123,28 @@ static unsigned int calc_speed(int mult)
122 123
123static int longhaul_get_cpu_mult(void) 124static int longhaul_get_cpu_mult(void)
124{ 125{
125 unsigned long invalue=0,lo, hi; 126 unsigned long invalue = 0, lo, hi;
126 127
127 rdmsr (MSR_IA32_EBL_CR_POWERON, lo, hi); 128 rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi);
128 invalue = (lo & (1<<22|1<<23|1<<24|1<<25)) >>22; 129 invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22;
129 if (longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) { 130 if (longhaul_version == TYPE_LONGHAUL_V2 ||
131 longhaul_version == TYPE_POWERSAVER) {
130 if (lo & (1<<27)) 132 if (lo & (1<<27))
131 invalue+=16; 133 invalue += 16;
132 } 134 }
133 return eblcr_table[invalue]; 135 return eblcr[invalue];
134} 136}
135 137
136/* For processor with BCR2 MSR */ 138/* For processor with BCR2 MSR */
137 139
138static void do_longhaul1(unsigned int clock_ratio_index) 140static void do_longhaul1(unsigned int mults_index)
139{ 141{
140 union msr_bcr2 bcr2; 142 union msr_bcr2 bcr2;
141 143
142 rdmsrl(MSR_VIA_BCR2, bcr2.val); 144 rdmsrl(MSR_VIA_BCR2, bcr2.val);
143 /* Enable software clock multiplier */ 145 /* Enable software clock multiplier */
144 bcr2.bits.ESOFTBF = 1; 146 bcr2.bits.ESOFTBF = 1;
145 bcr2.bits.CLOCKMUL = clock_ratio_index & 0xff; 147 bcr2.bits.CLOCKMUL = mults_index & 0xff;
146 148
147 /* Sync to timer tick */ 149 /* Sync to timer tick */
148 safe_halt(); 150 safe_halt();
@@ -161,7 +163,7 @@ static void do_longhaul1(unsigned int clock_ratio_index)
161 163
162/* For processor with Longhaul MSR */ 164/* For processor with Longhaul MSR */
163 165
164static void do_powersaver(int cx_address, unsigned int clock_ratio_index, 166static void do_powersaver(int cx_address, unsigned int mults_index,
165 unsigned int dir) 167 unsigned int dir)
166{ 168{
167 union msr_longhaul longhaul; 169 union msr_longhaul longhaul;
@@ -173,11 +175,11 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index,
173 longhaul.bits.RevisionKey = longhaul.bits.RevisionID; 175 longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
174 else 176 else
175 longhaul.bits.RevisionKey = 0; 177 longhaul.bits.RevisionKey = 0;
176 longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf; 178 longhaul.bits.SoftBusRatio = mults_index & 0xf;
177 longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4; 179 longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4;
178 /* Setup new voltage */ 180 /* Setup new voltage */
179 if (can_scale_voltage) 181 if (can_scale_voltage)
180 longhaul.bits.SoftVID = (clock_ratio_index >> 8) & 0x1f; 182 longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f;
181 /* Sync to timer tick */ 183 /* Sync to timer tick */
182 safe_halt(); 184 safe_halt();
183 /* Raise voltage if necessary */ 185 /* Raise voltage if necessary */
@@ -240,14 +242,14 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index,
240 242
241/** 243/**
242 * longhaul_set_cpu_frequency() 244 * longhaul_set_cpu_frequency()
243 * @clock_ratio_index : bitpattern of the new multiplier. 245 * @mults_index : bitpattern of the new multiplier.
244 * 246 *
245 * Sets a new clock ratio. 247 * Sets a new clock ratio.
246 */ 248 */
247 249
248static void longhaul_setstate(unsigned int table_index) 250static void longhaul_setstate(unsigned int table_index)
249{ 251{
250 unsigned int clock_ratio_index; 252 unsigned int mults_index;
251 int speed, mult; 253 int speed, mult;
252 struct cpufreq_freqs freqs; 254 struct cpufreq_freqs freqs;
253 unsigned long flags; 255 unsigned long flags;
@@ -256,9 +258,9 @@ static void longhaul_setstate(unsigned int table_index)
256 u32 bm_timeout = 1000; 258 u32 bm_timeout = 1000;
257 unsigned int dir = 0; 259 unsigned int dir = 0;
258 260
259 clock_ratio_index = longhaul_table[table_index].index; 261 mults_index = longhaul_table[table_index].index;
260 /* Safety precautions */ 262 /* Safety precautions */
261 mult = clock_ratio[clock_ratio_index & 0x1f]; 263 mult = mults[mults_index & 0x1f];
262 if (mult == -1) 264 if (mult == -1)
263 return; 265 return;
264 speed = calc_speed(mult); 266 speed = calc_speed(mult);
@@ -274,7 +276,7 @@ static void longhaul_setstate(unsigned int table_index)
274 276
275 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 277 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
276 278
277 dprintk ("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", 279 dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
278 fsb, mult/10, mult%10, print_speed(speed/1000)); 280 fsb, mult/10, mult%10, print_speed(speed/1000));
279retry_loop: 281retry_loop:
280 preempt_disable(); 282 preempt_disable();
@@ -282,8 +284,8 @@ retry_loop:
282 284
283 pic2_mask = inb(0xA1); 285 pic2_mask = inb(0xA1);
284 pic1_mask = inb(0x21); /* works on C3. save mask. */ 286 pic1_mask = inb(0x21); /* works on C3. save mask. */
285 outb(0xFF,0xA1); /* Overkill */ 287 outb(0xFF, 0xA1); /* Overkill */
286 outb(0xFE,0x21); /* TMR0 only */ 288 outb(0xFE, 0x21); /* TMR0 only */
287 289
288 /* Wait while PCI bus is busy. */ 290 /* Wait while PCI bus is busy. */
289 if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE 291 if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
@@ -303,7 +305,7 @@ retry_loop:
303 outb(3, 0x22); 305 outb(3, 0x22);
304 } else if ((pr != NULL) && pr->flags.bm_control) { 306 } else if ((pr != NULL) && pr->flags.bm_control) {
305 /* Disable bus master arbitration */ 307 /* Disable bus master arbitration */
306 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1); 308 acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
307 } 309 }
308 switch (longhaul_version) { 310 switch (longhaul_version) {
309 311
@@ -312,7 +314,7 @@ retry_loop:
312 * Software controlled multipliers only. 314 * Software controlled multipliers only.
313 */ 315 */
314 case TYPE_LONGHAUL_V1: 316 case TYPE_LONGHAUL_V1:
315 do_longhaul1(clock_ratio_index); 317 do_longhaul1(mults_index);
316 break; 318 break;
317 319
318 /* 320 /*
@@ -326,10 +328,10 @@ retry_loop:
326 case TYPE_POWERSAVER: 328 case TYPE_POWERSAVER:
327 if (longhaul_flags & USE_ACPI_C3) { 329 if (longhaul_flags & USE_ACPI_C3) {
328 /* Don't allow wakeup */ 330 /* Don't allow wakeup */
329 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0); 331 acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
330 do_powersaver(cx->address, clock_ratio_index, dir); 332 do_powersaver(cx->address, mults_index, dir);
331 } else { 333 } else {
332 do_powersaver(0, clock_ratio_index, dir); 334 do_powersaver(0, mults_index, dir);
333 } 335 }
334 break; 336 break;
335 } 337 }
@@ -339,10 +341,10 @@ retry_loop:
339 outb(0, 0x22); 341 outb(0, 0x22);
340 } else if ((pr != NULL) && pr->flags.bm_control) { 342 } else if ((pr != NULL) && pr->flags.bm_control) {
341 /* Enable bus master arbitration */ 343 /* Enable bus master arbitration */
342 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0); 344 acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
343 } 345 }
344 outb(pic2_mask,0xA1); /* restore mask */ 346 outb(pic2_mask, 0xA1); /* restore mask */
345 outb(pic1_mask,0x21); 347 outb(pic1_mask, 0x21);
346 348
347 local_irq_restore(flags); 349 local_irq_restore(flags);
348 preempt_enable(); 350 preempt_enable();
@@ -392,7 +394,8 @@ retry_loop:
392 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 394 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
393 395
394 if (!bm_timeout) 396 if (!bm_timeout)
395 printk(KERN_INFO PFX "Warning: Timeout while waiting for idle PCI bus.\n"); 397 printk(KERN_INFO PFX "Warning: Timeout while waiting for "
398 "idle PCI bus.\n");
396} 399}
397 400
398/* 401/*
@@ -458,31 +461,32 @@ static int __init longhaul_get_ranges(void)
458 break; 461 break;
459 } 462 }
460 463
461 dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n", 464 dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n",
462 minmult/10, minmult%10, maxmult/10, maxmult%10); 465 minmult/10, minmult%10, maxmult/10, maxmult%10);
463 466
464 highest_speed = calc_speed(maxmult); 467 highest_speed = calc_speed(maxmult);
465 lowest_speed = calc_speed(minmult); 468 lowest_speed = calc_speed(minmult);
466 dprintk ("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, 469 dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
467 print_speed(lowest_speed/1000), 470 print_speed(lowest_speed/1000),
468 print_speed(highest_speed/1000)); 471 print_speed(highest_speed/1000));
469 472
470 if (lowest_speed == highest_speed) { 473 if (lowest_speed == highest_speed) {
471 printk (KERN_INFO PFX "highestspeed == lowest, aborting.\n"); 474 printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n");
472 return -EINVAL; 475 return -EINVAL;
473 } 476 }
474 if (lowest_speed > highest_speed) { 477 if (lowest_speed > highest_speed) {
475 printk (KERN_INFO PFX "nonsense! lowest (%d > %d) !\n", 478 printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
476 lowest_speed, highest_speed); 479 lowest_speed, highest_speed);
477 return -EINVAL; 480 return -EINVAL;
478 } 481 }
479 482
480 longhaul_table = kmalloc((numscales + 1) * sizeof(struct cpufreq_frequency_table), GFP_KERNEL); 483 longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table),
481 if(!longhaul_table) 484 GFP_KERNEL);
485 if (!longhaul_table)
482 return -ENOMEM; 486 return -ENOMEM;
483 487
484 for (j = 0; j < numscales; j++) { 488 for (j = 0; j < numscales; j++) {
485 ratio = clock_ratio[j]; 489 ratio = mults[j];
486 if (ratio == -1) 490 if (ratio == -1)
487 continue; 491 continue;
488 if (ratio > maxmult || ratio < minmult) 492 if (ratio > maxmult || ratio < minmult)
@@ -507,13 +511,10 @@ static int __init longhaul_get_ranges(void)
507 } 511 }
508 } 512 }
509 if (min_i != j) { 513 if (min_i != j) {
510 unsigned int temp; 514 swap(longhaul_table[j].frequency,
511 temp = longhaul_table[j].frequency; 515 longhaul_table[min_i].frequency);
512 longhaul_table[j].frequency = longhaul_table[min_i].frequency; 516 swap(longhaul_table[j].index,
513 longhaul_table[min_i].frequency = temp; 517 longhaul_table[min_i].index);
514 temp = longhaul_table[j].index;
515 longhaul_table[j].index = longhaul_table[min_i].index;
516 longhaul_table[min_i].index = temp;
517 } 518 }
518 } 519 }
519 520
@@ -521,7 +522,7 @@ static int __init longhaul_get_ranges(void)
521 522
522 /* Find index we are running on */ 523 /* Find index we are running on */
523 for (j = 0; j < k; j++) { 524 for (j = 0; j < k; j++) {
524 if (clock_ratio[longhaul_table[j].index & 0x1f] == mult) { 525 if (mults[longhaul_table[j].index & 0x1f] == mult) {
525 longhaul_index = j; 526 longhaul_index = j;
526 break; 527 break;
527 } 528 }
@@ -559,20 +560,22 @@ static void __init longhaul_setup_voltagescaling(void)
559 maxvid = vrm_mV_table[longhaul.bits.MaximumVID]; 560 maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
560 561
561 if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) { 562 if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
562 printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " 563 printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
563 "Voltage scaling disabled.\n", 564 "Voltage scaling disabled.\n",
564 minvid.mV/1000, minvid.mV%1000, maxvid.mV/1000, maxvid.mV%1000); 565 minvid.mV/1000, minvid.mV%1000,
566 maxvid.mV/1000, maxvid.mV%1000);
565 return; 567 return;
566 } 568 }
567 569
568 if (minvid.mV == maxvid.mV) { 570 if (minvid.mV == maxvid.mV) {
569 printk (KERN_INFO PFX "Claims to support voltage scaling but min & max are " 571 printk(KERN_INFO PFX "Claims to support voltage scaling but "
570 "both %d.%03d. Voltage scaling disabled\n", 572 "min & max are both %d.%03d. "
573 "Voltage scaling disabled\n",
571 maxvid.mV/1000, maxvid.mV%1000); 574 maxvid.mV/1000, maxvid.mV%1000);
572 return; 575 return;
573 } 576 }
574 577
575 /* How many voltage steps */ 578 /* How many voltage steps*/
576 numvscales = maxvid.pos - minvid.pos + 1; 579 numvscales = maxvid.pos - minvid.pos + 1;
577 printk(KERN_INFO PFX 580 printk(KERN_INFO PFX
578 "Max VID=%d.%03d " 581 "Max VID=%d.%03d "
@@ -586,7 +589,7 @@ static void __init longhaul_setup_voltagescaling(void)
586 j = longhaul.bits.MinMHzBR; 589 j = longhaul.bits.MinMHzBR;
587 if (longhaul.bits.MinMHzBR4) 590 if (longhaul.bits.MinMHzBR4)
588 j += 16; 591 j += 16;
589 min_vid_speed = eblcr_table[j]; 592 min_vid_speed = eblcr[j];
590 if (min_vid_speed == -1) 593 if (min_vid_speed == -1)
591 return; 594 return;
592 switch (longhaul.bits.MinMHzFSB) { 595 switch (longhaul.bits.MinMHzFSB) {
@@ -617,7 +620,8 @@ static void __init longhaul_setup_voltagescaling(void)
617 pos = minvid.pos; 620 pos = minvid.pos;
618 longhaul_table[j].index |= mV_vrm_table[pos] << 8; 621 longhaul_table[j].index |= mV_vrm_table[pos] << 8;
619 vid = vrm_mV_table[mV_vrm_table[pos]]; 622 vid = vrm_mV_table[mV_vrm_table[pos]];
620 printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", speed, j, vid.mV); 623 printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
624 speed, j, vid.mV);
621 j++; 625 j++;
622 } 626 }
623 627
@@ -640,7 +644,8 @@ static int longhaul_target(struct cpufreq_policy *policy,
640 unsigned int dir = 0; 644 unsigned int dir = 0;
641 u8 vid, current_vid; 645 u8 vid, current_vid;
642 646
643 if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, relation, &table_index)) 647 if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq,
648 relation, &table_index))
644 return -EINVAL; 649 return -EINVAL;
645 650
646 /* Don't set same frequency again */ 651 /* Don't set same frequency again */
@@ -656,7 +661,8 @@ static int longhaul_target(struct cpufreq_policy *policy,
656 * this in hardware, C3 is old and we need to do this 661 * this in hardware, C3 is old and we need to do this
657 * in software. */ 662 * in software. */
658 i = longhaul_index; 663 i = longhaul_index;
659 current_vid = (longhaul_table[longhaul_index].index >> 8) & 0x1f; 664 current_vid = (longhaul_table[longhaul_index].index >> 8);
665 current_vid &= 0x1f;
660 if (table_index > longhaul_index) 666 if (table_index > longhaul_index)
661 dir = 1; 667 dir = 1;
662 while (i != table_index) { 668 while (i != table_index) {
@@ -691,9 +697,9 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
691{ 697{
692 struct acpi_device *d; 698 struct acpi_device *d;
693 699
694 if ( acpi_bus_get_device(obj_handle, &d) ) { 700 if (acpi_bus_get_device(obj_handle, &d))
695 return 0; 701 return 0;
696 } 702
697 *return_value = acpi_driver_data(d); 703 *return_value = acpi_driver_data(d);
698 return 1; 704 return 1;
699} 705}
@@ -750,7 +756,7 @@ static int longhaul_setup_southbridge(void)
750 /* Find VT8235 southbridge */ 756 /* Find VT8235 southbridge */
751 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL); 757 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
752 if (dev == NULL) 758 if (dev == NULL)
753 /* Find VT8237 southbridge */ 759 /* Find VT8237 southbridge */
754 dev = pci_get_device(PCI_VENDOR_ID_VIA, 760 dev = pci_get_device(PCI_VENDOR_ID_VIA,
755 PCI_DEVICE_ID_VIA_8237, NULL); 761 PCI_DEVICE_ID_VIA_8237, NULL);
756 if (dev != NULL) { 762 if (dev != NULL) {
@@ -769,7 +775,8 @@ static int longhaul_setup_southbridge(void)
769 if (pci_cmd & 1 << 7) { 775 if (pci_cmd & 1 << 7) {
770 pci_read_config_dword(dev, 0x88, &acpi_regs_addr); 776 pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
771 acpi_regs_addr &= 0xff00; 777 acpi_regs_addr &= 0xff00;
772 printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", acpi_regs_addr); 778 printk(KERN_INFO PFX "ACPI I/O at 0x%x\n",
779 acpi_regs_addr);
773 } 780 }
774 781
775 pci_dev_put(dev); 782 pci_dev_put(dev);
@@ -781,7 +788,7 @@ static int longhaul_setup_southbridge(void)
781static int __init longhaul_cpu_init(struct cpufreq_policy *policy) 788static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
782{ 789{
783 struct cpuinfo_x86 *c = &cpu_data(0); 790 struct cpuinfo_x86 *c = &cpu_data(0);
784 char *cpuname=NULL; 791 char *cpuname = NULL;
785 int ret; 792 int ret;
786 u32 lo, hi; 793 u32 lo, hi;
787 794
@@ -791,8 +798,8 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
791 cpu_model = CPU_SAMUEL; 798 cpu_model = CPU_SAMUEL;
792 cpuname = "C3 'Samuel' [C5A]"; 799 cpuname = "C3 'Samuel' [C5A]";
793 longhaul_version = TYPE_LONGHAUL_V1; 800 longhaul_version = TYPE_LONGHAUL_V1;
794 memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio)); 801 memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
795 memcpy (eblcr_table, samuel1_eblcr, sizeof(samuel1_eblcr)); 802 memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr));
796 break; 803 break;
797 804
798 case 7: 805 case 7:
@@ -803,10 +810,8 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
803 cpuname = "C3 'Samuel 2' [C5B]"; 810 cpuname = "C3 'Samuel 2' [C5B]";
804 /* Note, this is not a typo, early Samuel2's had 811 /* Note, this is not a typo, early Samuel2's had
805 * Samuel1 ratios. */ 812 * Samuel1 ratios. */
806 memcpy(clock_ratio, samuel1_clock_ratio, 813 memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
807 sizeof(samuel1_clock_ratio)); 814 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
808 memcpy(eblcr_table, samuel2_eblcr,
809 sizeof(samuel2_eblcr));
810 break; 815 break;
811 case 1 ... 15: 816 case 1 ... 15:
812 longhaul_version = TYPE_LONGHAUL_V1; 817 longhaul_version = TYPE_LONGHAUL_V1;
@@ -817,10 +822,8 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
817 cpu_model = CPU_EZRA; 822 cpu_model = CPU_EZRA;
818 cpuname = "C3 'Ezra' [C5C]"; 823 cpuname = "C3 'Ezra' [C5C]";
819 } 824 }
820 memcpy(clock_ratio, ezra_clock_ratio, 825 memcpy(mults, ezra_mults, sizeof(ezra_mults));
821 sizeof(ezra_clock_ratio)); 826 memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr));
822 memcpy(eblcr_table, ezra_eblcr,
823 sizeof(ezra_eblcr));
824 break; 827 break;
825 } 828 }
826 break; 829 break;
@@ -829,18 +832,16 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
829 cpu_model = CPU_EZRA_T; 832 cpu_model = CPU_EZRA_T;
830 cpuname = "C3 'Ezra-T' [C5M]"; 833 cpuname = "C3 'Ezra-T' [C5M]";
831 longhaul_version = TYPE_POWERSAVER; 834 longhaul_version = TYPE_POWERSAVER;
832 numscales=32; 835 numscales = 32;
833 memcpy (clock_ratio, ezrat_clock_ratio, sizeof(ezrat_clock_ratio)); 836 memcpy(mults, ezrat_mults, sizeof(ezrat_mults));
834 memcpy (eblcr_table, ezrat_eblcr, sizeof(ezrat_eblcr)); 837 memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr));
835 break; 838 break;
836 839
837 case 9: 840 case 9:
838 longhaul_version = TYPE_POWERSAVER; 841 longhaul_version = TYPE_POWERSAVER;
839 numscales = 32; 842 numscales = 32;
840 memcpy(clock_ratio, 843 memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
841 nehemiah_clock_ratio, 844 memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
842 sizeof(nehemiah_clock_ratio));
843 memcpy(eblcr_table, nehemiah_eblcr, sizeof(nehemiah_eblcr));
844 switch (c->x86_mask) { 845 switch (c->x86_mask) {
845 case 0 ... 1: 846 case 0 ... 1:
846 cpu_model = CPU_NEHEMIAH; 847 cpu_model = CPU_NEHEMIAH;
@@ -869,14 +870,14 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
869 longhaul_version = TYPE_LONGHAUL_V1; 870 longhaul_version = TYPE_LONGHAUL_V1;
870 } 871 }
871 872
872 printk (KERN_INFO PFX "VIA %s CPU detected. ", cpuname); 873 printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
873 switch (longhaul_version) { 874 switch (longhaul_version) {
874 case TYPE_LONGHAUL_V1: 875 case TYPE_LONGHAUL_V1:
875 case TYPE_LONGHAUL_V2: 876 case TYPE_LONGHAUL_V2:
876 printk ("Longhaul v%d supported.\n", longhaul_version); 877 printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version);
877 break; 878 break;
878 case TYPE_POWERSAVER: 879 case TYPE_POWERSAVER:
879 printk ("Powersaver supported.\n"); 880 printk(KERN_CONT "Powersaver supported.\n");
880 break; 881 break;
881 }; 882 };
882 883
@@ -940,7 +941,7 @@ static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
940 return 0; 941 return 0;
941} 942}
942 943
943static struct freq_attr* longhaul_attr[] = { 944static struct freq_attr *longhaul_attr[] = {
944 &cpufreq_freq_attr_scaling_available_freqs, 945 &cpufreq_freq_attr_scaling_available_freqs,
945 NULL, 946 NULL,
946}; 947};
@@ -966,13 +967,15 @@ static int __init longhaul_init(void)
966 967
967#ifdef CONFIG_SMP 968#ifdef CONFIG_SMP
968 if (num_online_cpus() > 1) { 969 if (num_online_cpus() > 1) {
969 printk(KERN_ERR PFX "More than 1 CPU detected, longhaul disabled.\n"); 970 printk(KERN_ERR PFX "More than 1 CPU detected, "
971 "longhaul disabled.\n");
970 return -ENODEV; 972 return -ENODEV;
971 } 973 }
972#endif 974#endif
973#ifdef CONFIG_X86_IO_APIC 975#ifdef CONFIG_X86_IO_APIC
974 if (cpu_has_apic) { 976 if (cpu_has_apic) {
975 printk(KERN_ERR PFX "APIC detected. Longhaul is currently broken in this configuration.\n"); 977 printk(KERN_ERR PFX "APIC detected. Longhaul is currently "
978 "broken in this configuration.\n");
976 return -ENODEV; 979 return -ENODEV;
977 } 980 }
978#endif 981#endif
@@ -993,8 +996,8 @@ static void __exit longhaul_exit(void)
993{ 996{
994 int i; 997 int i;
995 998
996 for (i=0; i < numscales; i++) { 999 for (i = 0; i < numscales; i++) {
997 if (clock_ratio[i] == maxmult) { 1000 if (mults[i] == maxmult) {
998 longhaul_setstate(i); 1001 longhaul_setstate(i);
999 break; 1002 break;
1000 } 1003 }
@@ -1007,11 +1010,11 @@ static void __exit longhaul_exit(void)
1007/* Even if BIOS is exporting ACPI C3 state, and it is used 1010/* Even if BIOS is exporting ACPI C3 state, and it is used
1008 * with success when CPU is idle, this state doesn't 1011 * with success when CPU is idle, this state doesn't
1009 * trigger frequency transition in some cases. */ 1012 * trigger frequency transition in some cases. */
1010module_param (disable_acpi_c3, int, 0644); 1013module_param(disable_acpi_c3, int, 0644);
1011MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support"); 1014MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
1012/* Change CPU voltage with frequency. Very usefull to save 1015/* Change CPU voltage with frequency. Very usefull to save
1013 * power, but most VIA C3 processors aren't supporting it. */ 1016 * power, but most VIA C3 processors aren't supporting it. */
1014module_param (scale_voltage, int, 0644); 1017module_param(scale_voltage, int, 0644);
1015MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); 1018MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
1016/* Force revision key to 0 for processors which doesn't 1019/* Force revision key to 0 for processors which doesn't
1017 * support voltage scaling, but are introducing itself as 1020 * support voltage scaling, but are introducing itself as
@@ -1019,9 +1022,9 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
1019module_param(revid_errata, int, 0644); 1022module_param(revid_errata, int, 0644);
1020MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); 1023MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
1021 1024
1022MODULE_AUTHOR ("Dave Jones <davej@redhat.com>"); 1025MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
1023MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); 1026MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors.");
1024MODULE_LICENSE ("GPL"); 1027MODULE_LICENSE("GPL");
1025 1028
1026late_initcall(longhaul_init); 1029late_initcall(longhaul_init);
1027module_exit(longhaul_exit); 1030module_exit(longhaul_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
index 4fcc320997df..e2360a469f79 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -49,14 +49,14 @@ union msr_longhaul {
49 49
50/* 50/*
51 * Clock ratio tables. Div/Mod by 10 to get ratio. 51 * Clock ratio tables. Div/Mod by 10 to get ratio.
52 * The eblcr ones specify the ratio read from the CPU. 52 * The eblcr values specify the ratio read from the CPU.
53 * The clock_ratio ones specify what to write to the CPU. 53 * The mults values specify what to write to the CPU.
54 */ 54 */
55 55
56/* 56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0) 57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */ 58 */
59static const int __initdata samuel1_clock_ratio[16] = { 59static const int __initdata samuel1_mults[16] = {
60 -1, /* 0000 -> RESERVED */ 60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */ 61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */ 62 40, /* 0010 -> 4.0x */
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = {
119/* 119/*
120 * VIA C3 Ezra 120 * VIA C3 Ezra
121 */ 121 */
122static const int __initdata ezra_clock_ratio[16] = { 122static const int __initdata ezra_mults[16] = {
123 100, /* 0000 -> 10.0x */ 123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */ 124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */ 125 40, /* 0010 -> 4.0x */
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = {
160/* 160/*
161 * VIA C3 (Ezra-T) [C5M]. 161 * VIA C3 (Ezra-T) [C5M].
162 */ 162 */
163static const int __initdata ezrat_clock_ratio[32] = { 163static const int __initdata ezrat_mults[32] = {
164 100, /* 0000 -> 10.0x */ 164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */ 165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */ 166 40, /* 0010 -> 4.0x */
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = {
235/* 235/*
236 * VIA C3 Nehemiah */ 236 * VIA C3 Nehemiah */
237 237
238static const int __initdata nehemiah_clock_ratio[32] = { 238static const int __initdata nehemiah_mults[32] = {
239 100, /* 0000 -> 10.0x */ 239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */ 240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */ 241 40, /* 0010 -> 4.0x */
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index 777a7ff075de..da5f70fcb766 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -11,12 +11,13 @@
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/cpufreq.h> 13#include <linux/cpufreq.h>
14#include <linux/timex.h>
14 15
15#include <asm/msr.h> 16#include <asm/msr.h>
16#include <asm/processor.h> 17#include <asm/processor.h>
17#include <asm/timex.h>
18 18
19#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longrun", msg) 19#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
20 "longrun", msg)
20 21
21static struct cpufreq_driver longrun_driver; 22static struct cpufreq_driver longrun_driver;
22 23
@@ -51,7 +52,7 @@ static void __init longrun_get_policy(struct cpufreq_policy *policy)
51 msr_lo &= 0x0000007F; 52 msr_lo &= 0x0000007F;
52 msr_hi &= 0x0000007F; 53 msr_hi &= 0x0000007F;
53 54
54 if ( longrun_high_freq <= longrun_low_freq ) { 55 if (longrun_high_freq <= longrun_low_freq) {
55 /* Assume degenerate Longrun table */ 56 /* Assume degenerate Longrun table */
56 policy->min = policy->max = longrun_high_freq; 57 policy->min = policy->max = longrun_high_freq;
57 } else { 58 } else {
@@ -79,7 +80,7 @@ static int longrun_set_policy(struct cpufreq_policy *policy)
79 if (!policy) 80 if (!policy)
80 return -EINVAL; 81 return -EINVAL;
81 82
82 if ( longrun_high_freq <= longrun_low_freq ) { 83 if (longrun_high_freq <= longrun_low_freq) {
83 /* Assume degenerate Longrun table */ 84 /* Assume degenerate Longrun table */
84 pctg_lo = pctg_hi = 100; 85 pctg_lo = pctg_hi = 100;
85 } else { 86 } else {
@@ -152,7 +153,7 @@ static unsigned int longrun_get(unsigned int cpu)
152 cpuid(0x80860007, &eax, &ebx, &ecx, &edx); 153 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
153 dprintk("cpuid eax is %u\n", eax); 154 dprintk("cpuid eax is %u\n", eax);
154 155
155 return (eax * 1000); 156 return eax * 1000;
156} 157}
157 158
158/** 159/**
@@ -196,7 +197,8 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
196 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); 197 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
197 *high_freq = msr_lo * 1000; /* to kHz */ 198 *high_freq = msr_lo * 1000; /* to kHz */
198 199
199 dprintk("longrun table interface told %u - %u kHz\n", *low_freq, *high_freq); 200 dprintk("longrun table interface told %u - %u kHz\n",
201 *low_freq, *high_freq);
200 202
201 if (*low_freq > *high_freq) 203 if (*low_freq > *high_freq)
202 *low_freq = *high_freq; 204 *low_freq = *high_freq;
@@ -219,7 +221,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
219 cpuid(0x80860007, &eax, &ebx, &ecx, &edx); 221 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
220 /* try decreasing in 10% steps, some processors react only 222 /* try decreasing in 10% steps, some processors react only
221 * on some barrier values */ 223 * on some barrier values */
222 for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -=10) { 224 for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) {
223 /* set to 0 to try_hi perf_pctg */ 225 /* set to 0 to try_hi perf_pctg */
224 msr_lo &= 0xFFFFFF80; 226 msr_lo &= 0xFFFFFF80;
225 msr_hi &= 0xFFFFFF80; 227 msr_hi &= 0xFFFFFF80;
@@ -236,7 +238,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
236 238
237 /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) 239 /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
238 * eqals 240 * eqals
239 * low_freq * ( 1 - perf_pctg) = (cur_freq - high_freq * perf_pctg) 241 * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
240 * 242 *
241 * high_freq * perf_pctg is stored tempoarily into "ebx". 243 * high_freq * perf_pctg is stored tempoarily into "ebx".
242 */ 244 */
@@ -317,9 +319,10 @@ static void __exit longrun_exit(void)
317} 319}
318 320
319 321
320MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>"); 322MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
321MODULE_DESCRIPTION ("LongRun driver for Transmeta Crusoe and Efficeon processors."); 323MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and "
322MODULE_LICENSE ("GPL"); 324 "Efficeon processors.");
325MODULE_LICENSE("GPL");
323 326
324module_init(longrun_init); 327module_init(longrun_init);
325module_exit(longrun_exit); 328module_exit(longrun_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index b585e04cbc9e..6ac55bd341ae 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -27,15 +27,17 @@
27#include <linux/cpufreq.h> 27#include <linux/cpufreq.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/cpumask.h> 29#include <linux/cpumask.h>
30#include <linux/timex.h>
30 31
31#include <asm/processor.h> 32#include <asm/processor.h>
32#include <asm/msr.h> 33#include <asm/msr.h>
33#include <asm/timex.h> 34#include <asm/timer.h>
34 35
35#include "speedstep-lib.h" 36#include "speedstep-lib.h"
36 37
37#define PFX "p4-clockmod: " 38#define PFX "p4-clockmod: "
38#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "p4-clockmod", msg) 39#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
40 "p4-clockmod", msg)
39 41
40/* 42/*
41 * Duty Cycle (3bits), note DC_DISABLE is not specified in 43 * Duty Cycle (3bits), note DC_DISABLE is not specified in
@@ -58,7 +60,8 @@ static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
58{ 60{
59 u32 l, h; 61 u32 l, h;
60 62
61 if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV)) 63 if (!cpu_online(cpu) ||
64 (newstate > DC_DISABLE) || (newstate == DC_RESV))
62 return -EINVAL; 65 return -EINVAL;
63 66
64 rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h); 67 rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
@@ -66,7 +69,8 @@ static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
66 if (l & 0x01) 69 if (l & 0x01)
67 dprintk("CPU#%d currently thermal throttled\n", cpu); 70 dprintk("CPU#%d currently thermal throttled\n", cpu);
68 71
69 if (has_N44_O17_errata[cpu] && (newstate == DC_25PT || newstate == DC_DFLT)) 72 if (has_N44_O17_errata[cpu] &&
73 (newstate == DC_25PT || newstate == DC_DFLT))
70 newstate = DC_38PT; 74 newstate = DC_38PT;
71 75
72 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); 76 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
@@ -112,7 +116,8 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
112 struct cpufreq_freqs freqs; 116 struct cpufreq_freqs freqs;
113 int i; 117 int i;
114 118
115 if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], target_freq, relation, &newstate)) 119 if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0],
120 target_freq, relation, &newstate))
116 return -EINVAL; 121 return -EINVAL;
117 122
118 freqs.old = cpufreq_p4_get(policy->cpu); 123 freqs.old = cpufreq_p4_get(policy->cpu);
@@ -127,7 +132,8 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 132 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128 } 133 }
129 134
130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software 135 /* run on each logical CPU,
136 * see section 13.15.3 of IA32 Intel Architecture Software
131 * Developer's Manual, Volume 3 137 * Developer's Manual, Volume 3
132 */ 138 */
133 for_each_cpu(i, policy->cpus) 139 for_each_cpu(i, policy->cpus)
@@ -153,28 +159,30 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
153{ 159{
154 if (c->x86 == 0x06) { 160 if (c->x86 == 0x06) {
155 if (cpu_has(c, X86_FEATURE_EST)) 161 if (cpu_has(c, X86_FEATURE_EST))
156 printk(KERN_WARNING PFX "Warning: EST-capable CPU detected. " 162 printk(KERN_WARNING PFX "Warning: EST-capable CPU "
157 "The acpi-cpufreq module offers voltage scaling" 163 "detected. The acpi-cpufreq module offers "
158 " in addition of frequency scaling. You should use " 164 "voltage scaling in addition of frequency "
159 "that instead of p4-clockmod, if possible.\n"); 165 "scaling. You should use that instead of "
166 "p4-clockmod, if possible.\n");
160 switch (c->x86_model) { 167 switch (c->x86_model) {
161 case 0x0E: /* Core */ 168 case 0x0E: /* Core */
162 case 0x0F: /* Core Duo */ 169 case 0x0F: /* Core Duo */
163 case 0x16: /* Celeron Core */ 170 case 0x16: /* Celeron Core */
164 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; 171 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
165 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PCORE); 172 return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
166 case 0x0D: /* Pentium M (Dothan) */ 173 case 0x0D: /* Pentium M (Dothan) */
167 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; 174 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
168 /* fall through */ 175 /* fall through */
169 case 0x09: /* Pentium M (Banias) */ 176 case 0x09: /* Pentium M (Banias) */
170 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PM); 177 return speedstep_get_frequency(SPEEDSTEP_CPU_PM);
171 } 178 }
172 } 179 }
173 180
174 if (c->x86 != 0xF) { 181 if (c->x86 != 0xF) {
175 if (!cpu_has(c, X86_FEATURE_EST)) 182 if (!cpu_has(c, X86_FEATURE_EST))
176 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. " 183 printk(KERN_WARNING PFX "Unknown CPU. "
177 "Please send an e-mail to <cpufreq@vger.kernel.org>\n"); 184 "Please send an e-mail to "
185 "<cpufreq@vger.kernel.org>\n");
178 return 0; 186 return 0;
179 } 187 }
180 188
@@ -182,16 +190,16 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
182 * throttling is active or not. */ 190 * throttling is active or not. */
183 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; 191 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
184 192
185 if (speedstep_detect_processor() == SPEEDSTEP_PROCESSOR_P4M) { 193 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) {
186 printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. " 194 printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
187 "The speedstep-ich or acpi cpufreq modules offer " 195 "The speedstep-ich or acpi cpufreq modules offer "
188 "voltage scaling in addition of frequency scaling. " 196 "voltage scaling in addition of frequency scaling. "
189 "You should use either one instead of p4-clockmod, " 197 "You should use either one instead of p4-clockmod, "
190 "if possible.\n"); 198 "if possible.\n");
191 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4M); 199 return speedstep_get_frequency(SPEEDSTEP_CPU_P4M);
192 } 200 }
193 201
194 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4D); 202 return speedstep_get_frequency(SPEEDSTEP_CPU_P4D);
195} 203}
196 204
197 205
@@ -203,7 +211,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
203 unsigned int i; 211 unsigned int i;
204 212
205#ifdef CONFIG_SMP 213#ifdef CONFIG_SMP
206 cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu)); 214 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
207#endif 215#endif
208 216
209 /* Errata workaround */ 217 /* Errata workaround */
@@ -217,14 +225,20 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
217 dprintk("has errata -- disabling low frequencies\n"); 225 dprintk("has errata -- disabling low frequencies\n");
218 } 226 }
219 227
228 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D &&
229 c->x86_model < 2) {
230 /* switch to maximum frequency and measure result */
231 cpufreq_p4_setdc(policy->cpu, DC_DISABLE);
232 recalibrate_cpu_khz();
233 }
220 /* get max frequency */ 234 /* get max frequency */
221 stock_freq = cpufreq_p4_get_frequency(c); 235 stock_freq = cpufreq_p4_get_frequency(c);
222 if (!stock_freq) 236 if (!stock_freq)
223 return -EINVAL; 237 return -EINVAL;
224 238
225 /* table init */ 239 /* table init */
226 for (i=1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) { 240 for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
227 if ((i<2) && (has_N44_O17_errata[policy->cpu])) 241 if ((i < 2) && (has_N44_O17_errata[policy->cpu]))
228 p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; 242 p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
229 else 243 else
230 p4clockmod_table[i].frequency = (stock_freq * i)/8; 244 p4clockmod_table[i].frequency = (stock_freq * i)/8;
@@ -232,7 +246,10 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
232 cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu); 246 cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
233 247
234 /* cpuinfo and default policy values */ 248 /* cpuinfo and default policy values */
235 policy->cpuinfo.transition_latency = 1000000; /* assumed */ 249
250 /* the transition latency is set to be 1 higher than the maximum
251 * transition latency of the ondemand governor */
252 policy->cpuinfo.transition_latency = 10000001;
236 policy->cur = stock_freq; 253 policy->cur = stock_freq;
237 254
238 return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]); 255 return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
@@ -258,12 +275,12 @@ static unsigned int cpufreq_p4_get(unsigned int cpu)
258 l = DC_DISABLE; 275 l = DC_DISABLE;
259 276
260 if (l != DC_DISABLE) 277 if (l != DC_DISABLE)
261 return (stock_freq * l / 8); 278 return stock_freq * l / 8;
262 279
263 return stock_freq; 280 return stock_freq;
264} 281}
265 282
266static struct freq_attr* p4clockmod_attr[] = { 283static struct freq_attr *p4clockmod_attr[] = {
267 &cpufreq_freq_attr_scaling_available_freqs, 284 &cpufreq_freq_attr_scaling_available_freqs,
268 NULL, 285 NULL,
269}; 286};
@@ -277,7 +294,6 @@ static struct cpufreq_driver p4clockmod_driver = {
277 .name = "p4-clockmod", 294 .name = "p4-clockmod",
278 .owner = THIS_MODULE, 295 .owner = THIS_MODULE,
279 .attr = p4clockmod_attr, 296 .attr = p4clockmod_attr,
280 .hide_interface = 1,
281}; 297};
282 298
283 299
@@ -299,9 +315,10 @@ static int __init cpufreq_p4_init(void)
299 315
300 ret = cpufreq_register_driver(&p4clockmod_driver); 316 ret = cpufreq_register_driver(&p4clockmod_driver);
301 if (!ret) 317 if (!ret)
302 printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock Modulation available\n"); 318 printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock "
319 "Modulation available\n");
303 320
304 return (ret); 321 return ret;
305} 322}
306 323
307 324
@@ -311,9 +328,9 @@ static void __exit cpufreq_p4_exit(void)
311} 328}
312 329
313 330
314MODULE_AUTHOR ("Zwane Mwaikambo <zwane@commfireservices.com>"); 331MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
315MODULE_DESCRIPTION ("cpufreq driver for Pentium(TM) 4/Xeon(TM)"); 332MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
316MODULE_LICENSE ("GPL"); 333MODULE_LICENSE("GPL");
317 334
318late_initcall(cpufreq_p4_init); 335late_initcall(cpufreq_p4_init);
319module_exit(cpufreq_p4_exit); 336module_exit(cpufreq_p4_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index c1ac5790c63e..f10dea409f40 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * This file was based upon code in Powertweak Linux (http://powertweak.sf.net) 2 * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
3 * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, Dominik Brodowski. 3 * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä,
4 * Dominik Brodowski.
4 * 5 *
5 * Licensed under the terms of the GNU GPL License version 2. 6 * Licensed under the terms of the GNU GPL License version 2.
6 * 7 *
@@ -13,14 +14,15 @@
13#include <linux/cpufreq.h> 14#include <linux/cpufreq.h>
14#include <linux/ioport.h> 15#include <linux/ioport.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16
17#include <asm/msr.h>
18#include <linux/timex.h> 17#include <linux/timex.h>
19#include <linux/io.h> 18#include <linux/io.h>
20 19
20#include <asm/msr.h>
21
21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long 22#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22 as it is unused */ 23 as it is unused */
23 24
25#define PFX "powernow-k6: "
24static unsigned int busfreq; /* FSB, in 10 kHz */ 26static unsigned int busfreq; /* FSB, in 10 kHz */
25static unsigned int max_multiplier; 27static unsigned int max_multiplier;
26 28
@@ -47,8 +49,8 @@ static struct cpufreq_frequency_table clock_ratio[] = {
47 */ 49 */
48static int powernow_k6_get_cpu_multiplier(void) 50static int powernow_k6_get_cpu_multiplier(void)
49{ 51{
50 u64 invalue = 0; 52 u64 invalue = 0;
51 u32 msrval; 53 u32 msrval;
52 54
53 msrval = POWERNOW_IOPORT + 0x1; 55 msrval = POWERNOW_IOPORT + 0x1;
54 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 56 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
@@ -68,12 +70,12 @@ static int powernow_k6_get_cpu_multiplier(void)
68 */ 70 */
69static void powernow_k6_set_state(unsigned int best_i) 71static void powernow_k6_set_state(unsigned int best_i)
70{ 72{
71 unsigned long outvalue = 0, invalue = 0; 73 unsigned long outvalue = 0, invalue = 0;
72 unsigned long msrval; 74 unsigned long msrval;
73 struct cpufreq_freqs freqs; 75 struct cpufreq_freqs freqs;
74 76
75 if (clock_ratio[best_i].index > max_multiplier) { 77 if (clock_ratio[best_i].index > max_multiplier) {
76 printk(KERN_ERR "cpufreq: invalid target frequency\n"); 78 printk(KERN_ERR PFX "invalid target frequency\n");
77 return; 79 return;
78 } 80 }
79 81
@@ -119,7 +121,8 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
119 * powernow_k6_setpolicy - sets a new CPUFreq policy 121 * powernow_k6_setpolicy - sets a new CPUFreq policy
120 * @policy: new policy 122 * @policy: new policy
121 * @target_freq: the target frequency 123 * @target_freq: the target frequency
122 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 124 * @relation: how that frequency relates to achieved frequency
125 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
123 * 126 *
124 * sets a new CPUFreq policy 127 * sets a new CPUFreq policy
125 */ 128 */
@@ -127,9 +130,10 @@ static int powernow_k6_target(struct cpufreq_policy *policy,
127 unsigned int target_freq, 130 unsigned int target_freq,
128 unsigned int relation) 131 unsigned int relation)
129{ 132{
130 unsigned int newstate = 0; 133 unsigned int newstate = 0;
131 134
132 if (cpufreq_frequency_table_target(policy, &clock_ratio[0], target_freq, relation, &newstate)) 135 if (cpufreq_frequency_table_target(policy, &clock_ratio[0],
136 target_freq, relation, &newstate))
133 return -EINVAL; 137 return -EINVAL;
134 138
135 powernow_k6_set_state(newstate); 139 powernow_k6_set_state(newstate);
@@ -140,7 +144,7 @@ static int powernow_k6_target(struct cpufreq_policy *policy,
140 144
141static int powernow_k6_cpu_init(struct cpufreq_policy *policy) 145static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
142{ 146{
143 unsigned int i; 147 unsigned int i, f;
144 int result; 148 int result;
145 149
146 if (policy->cpu != 0) 150 if (policy->cpu != 0)
@@ -152,10 +156,11 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
152 156
153 /* table init */ 157 /* table init */
154 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { 158 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
155 if (clock_ratio[i].index > max_multiplier) 159 f = clock_ratio[i].index;
160 if (f > max_multiplier)
156 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; 161 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
157 else 162 else
158 clock_ratio[i].frequency = busfreq * clock_ratio[i].index; 163 clock_ratio[i].frequency = busfreq * f;
159 } 164 }
160 165
161 /* cpuinfo and default policy values */ 166 /* cpuinfo and default policy values */
@@ -185,7 +190,9 @@ static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
185 190
186static unsigned int powernow_k6_get(unsigned int cpu) 191static unsigned int powernow_k6_get(unsigned int cpu)
187{ 192{
188 return busfreq * powernow_k6_get_cpu_multiplier(); 193 unsigned int ret;
194 ret = (busfreq * powernow_k6_get_cpu_multiplier());
195 return ret;
189} 196}
190 197
191static struct freq_attr *powernow_k6_attr[] = { 198static struct freq_attr *powernow_k6_attr[] = {
@@ -221,7 +228,7 @@ static int __init powernow_k6_init(void)
221 return -ENODEV; 228 return -ENODEV;
222 229
223 if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) { 230 if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
224 printk("cpufreq: PowerNow IOPORT region already used.\n"); 231 printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n");
225 return -EIO; 232 return -EIO;
226 } 233 }
227 234
@@ -246,7 +253,8 @@ static void __exit powernow_k6_exit(void)
246} 253}
247 254
248 255
249MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>"); 256MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, "
257 "Dominik Brodowski <linux@brodo.de>");
250MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); 258MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
251MODULE_LICENSE("GPL"); 259MODULE_LICENSE("GPL");
252 260
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 1b446d79a8fd..3c28ccd49742 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -6,10 +6,12 @@
6 * Licensed under the terms of the GNU GPL License version 2. 6 * Licensed under the terms of the GNU GPL License version 2.
7 * Based upon datasheets & sample CPUs kindly provided by AMD. 7 * Based upon datasheets & sample CPUs kindly provided by AMD.
8 * 8 *
9 * Errata 5: Processor may fail to execute a FID/VID change in presence of interrupt. 9 * Errata 5:
10 * - We cli/sti on stepping A0 CPUs around the FID/VID transition. 10 * CPU may fail to execute a FID/VID change in presence of interrupt.
11 * Errata 15: Processors with half frequency multipliers may hang upon wakeup from disconnect. 11 * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
12 * - We disable half multipliers if ACPI is used on A0 stepping CPUs. 12 * Errata 15:
13 * CPU with half frequency multipliers may hang upon wakeup from disconnect.
14 * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
13 */ 15 */
14 16
15#include <linux/kernel.h> 17#include <linux/kernel.h>
@@ -20,11 +22,11 @@
20#include <linux/slab.h> 22#include <linux/slab.h>
21#include <linux/string.h> 23#include <linux/string.h>
22#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <linux/timex.h>
26#include <linux/io.h>
23 27
28#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */
24#include <asm/msr.h> 29#include <asm/msr.h>
25#include <asm/timer.h>
26#include <asm/timex.h>
27#include <asm/io.h>
28#include <asm/system.h> 30#include <asm/system.h>
29 31
30#ifdef CONFIG_X86_POWERNOW_K7_ACPI 32#ifdef CONFIG_X86_POWERNOW_K7_ACPI
@@ -58,9 +60,9 @@ struct pst_s {
58union powernow_acpi_control_t { 60union powernow_acpi_control_t {
59 struct { 61 struct {
60 unsigned long fid:5, 62 unsigned long fid:5,
61 vid:5, 63 vid:5,
62 sgtc:20, 64 sgtc:20,
63 res1:2; 65 res1:2;
64 } bits; 66 } bits;
65 unsigned long val; 67 unsigned long val;
66}; 68};
@@ -94,14 +96,15 @@ static struct cpufreq_frequency_table *powernow_table;
94 96
95static unsigned int can_scale_bus; 97static unsigned int can_scale_bus;
96static unsigned int can_scale_vid; 98static unsigned int can_scale_vid;
97static unsigned int minimum_speed=-1; 99static unsigned int minimum_speed = -1;
98static unsigned int maximum_speed; 100static unsigned int maximum_speed;
99static unsigned int number_scales; 101static unsigned int number_scales;
100static unsigned int fsb; 102static unsigned int fsb;
101static unsigned int latency; 103static unsigned int latency;
102static char have_a0; 104static char have_a0;
103 105
104#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k7", msg) 106#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
107 "powernow-k7", msg)
105 108
106static int check_fsb(unsigned int fsbspeed) 109static int check_fsb(unsigned int fsbspeed)
107{ 110{
@@ -109,7 +112,7 @@ static int check_fsb(unsigned int fsbspeed)
109 unsigned int f = fsb / 1000; 112 unsigned int f = fsb / 1000;
110 113
111 delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed; 114 delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
112 return (delta < 5); 115 return delta < 5;
113} 116}
114 117
115static int check_powernow(void) 118static int check_powernow(void)
@@ -117,24 +120,26 @@ static int check_powernow(void)
117 struct cpuinfo_x86 *c = &cpu_data(0); 120 struct cpuinfo_x86 *c = &cpu_data(0);
118 unsigned int maxei, eax, ebx, ecx, edx; 121 unsigned int maxei, eax, ebx, ecx, edx;
119 122
120 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 !=6)) { 123 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) {
121#ifdef MODULE 124#ifdef MODULE
122 printk (KERN_INFO PFX "This module only works with AMD K7 CPUs\n"); 125 printk(KERN_INFO PFX "This module only works with "
126 "AMD K7 CPUs\n");
123#endif 127#endif
124 return 0; 128 return 0;
125 } 129 }
126 130
127 /* Get maximum capabilities */ 131 /* Get maximum capabilities */
128 maxei = cpuid_eax (0x80000000); 132 maxei = cpuid_eax(0x80000000);
129 if (maxei < 0x80000007) { /* Any powernow info ? */ 133 if (maxei < 0x80000007) { /* Any powernow info ? */
130#ifdef MODULE 134#ifdef MODULE
131 printk (KERN_INFO PFX "No powernow capabilities detected\n"); 135 printk(KERN_INFO PFX "No powernow capabilities detected\n");
132#endif 136#endif
133 return 0; 137 return 0;
134 } 138 }
135 139
136 if ((c->x86_model == 6) && (c->x86_mask == 0)) { 140 if ((c->x86_model == 6) && (c->x86_mask == 0)) {
137 printk (KERN_INFO PFX "K7 660[A0] core detected, enabling errata workarounds\n"); 141 printk(KERN_INFO PFX "K7 660[A0] core detected, "
142 "enabling errata workarounds\n");
138 have_a0 = 1; 143 have_a0 = 1;
139 } 144 }
140 145
@@ -144,37 +149,42 @@ static int check_powernow(void)
144 if (!(edx & (1 << 1 | 1 << 2))) 149 if (!(edx & (1 << 1 | 1 << 2)))
145 return 0; 150 return 0;
146 151
147 printk (KERN_INFO PFX "PowerNOW! Technology present. Can scale: "); 152 printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
148 153
149 if (edx & 1 << 1) { 154 if (edx & 1 << 1) {
150 printk ("frequency"); 155 printk("frequency");
151 can_scale_bus=1; 156 can_scale_bus = 1;
152 } 157 }
153 158
154 if ((edx & (1 << 1 | 1 << 2)) == 0x6) 159 if ((edx & (1 << 1 | 1 << 2)) == 0x6)
155 printk (" and "); 160 printk(" and ");
156 161
157 if (edx & 1 << 2) { 162 if (edx & 1 << 2) {
158 printk ("voltage"); 163 printk("voltage");
159 can_scale_vid=1; 164 can_scale_vid = 1;
160 } 165 }
161 166
162 printk (".\n"); 167 printk(".\n");
163 return 1; 168 return 1;
164} 169}
165 170
171static void invalidate_entry(unsigned int entry)
172{
173 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
174}
166 175
167static int get_ranges (unsigned char *pst) 176static int get_ranges(unsigned char *pst)
168{ 177{
169 unsigned int j; 178 unsigned int j;
170 unsigned int speed; 179 unsigned int speed;
171 u8 fid, vid; 180 u8 fid, vid;
172 181
173 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * (number_scales + 1)), GFP_KERNEL); 182 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
183 (number_scales + 1)), GFP_KERNEL);
174 if (!powernow_table) 184 if (!powernow_table)
175 return -ENOMEM; 185 return -ENOMEM;
176 186
177 for (j=0 ; j < number_scales; j++) { 187 for (j = 0 ; j < number_scales; j++) {
178 fid = *pst++; 188 fid = *pst++;
179 189
180 powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10; 190 powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
@@ -182,10 +192,10 @@ static int get_ranges (unsigned char *pst)
182 192
183 speed = powernow_table[j].frequency; 193 speed = powernow_table[j].frequency;
184 194
185 if ((fid_codes[fid] % 10)==5) { 195 if ((fid_codes[fid] % 10) == 5) {
186#ifdef CONFIG_X86_POWERNOW_K7_ACPI 196#ifdef CONFIG_X86_POWERNOW_K7_ACPI
187 if (have_a0 == 1) 197 if (have_a0 == 1)
188 powernow_table[j].frequency = CPUFREQ_ENTRY_INVALID; 198 invalidate_entry(j);
189#endif 199#endif
190 } 200 }
191 201
@@ -197,7 +207,7 @@ static int get_ranges (unsigned char *pst)
197 vid = *pst++; 207 vid = *pst++;
198 powernow_table[j].index |= (vid << 8); /* upper 8 bits */ 208 powernow_table[j].index |= (vid << 8); /* upper 8 bits */
199 209
200 dprintk (" FID: 0x%x (%d.%dx [%dMHz]) " 210 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
201 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, 211 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
202 fid_codes[fid] % 10, speed/1000, vid, 212 fid_codes[fid] % 10, speed/1000, vid,
203 mobile_vid_table[vid]/1000, 213 mobile_vid_table[vid]/1000,
@@ -214,13 +224,13 @@ static void change_FID(int fid)
214{ 224{
215 union msr_fidvidctl fidvidctl; 225 union msr_fidvidctl fidvidctl;
216 226
217 rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val); 227 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
218 if (fidvidctl.bits.FID != fid) { 228 if (fidvidctl.bits.FID != fid) {
219 fidvidctl.bits.SGTC = latency; 229 fidvidctl.bits.SGTC = latency;
220 fidvidctl.bits.FID = fid; 230 fidvidctl.bits.FID = fid;
221 fidvidctl.bits.VIDC = 0; 231 fidvidctl.bits.VIDC = 0;
222 fidvidctl.bits.FIDC = 1; 232 fidvidctl.bits.FIDC = 1;
223 wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val); 233 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
224 } 234 }
225} 235}
226 236
@@ -229,18 +239,18 @@ static void change_VID(int vid)
229{ 239{
230 union msr_fidvidctl fidvidctl; 240 union msr_fidvidctl fidvidctl;
231 241
232 rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val); 242 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
233 if (fidvidctl.bits.VID != vid) { 243 if (fidvidctl.bits.VID != vid) {
234 fidvidctl.bits.SGTC = latency; 244 fidvidctl.bits.SGTC = latency;
235 fidvidctl.bits.VID = vid; 245 fidvidctl.bits.VID = vid;
236 fidvidctl.bits.FIDC = 0; 246 fidvidctl.bits.FIDC = 0;
237 fidvidctl.bits.VIDC = 1; 247 fidvidctl.bits.VIDC = 1;
238 wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val); 248 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
239 } 249 }
240} 250}
241 251
242 252
243static void change_speed (unsigned int index) 253static void change_speed(unsigned int index)
244{ 254{
245 u8 fid, vid; 255 u8 fid, vid;
246 struct cpufreq_freqs freqs; 256 struct cpufreq_freqs freqs;
@@ -257,7 +267,7 @@ static void change_speed (unsigned int index)
257 267
258 freqs.cpu = 0; 268 freqs.cpu = 0;
259 269
260 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val); 270 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
261 cfid = fidvidstatus.bits.CFID; 271 cfid = fidvidstatus.bits.CFID;
262 freqs.old = fsb * fid_codes[cfid] / 10; 272 freqs.old = fsb * fid_codes[cfid] / 10;
263 273
@@ -321,12 +331,14 @@ static int powernow_acpi_init(void)
321 goto err1; 331 goto err1;
322 } 332 }
323 333
324 if (acpi_processor_perf->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) { 334 if (acpi_processor_perf->control_register.space_id !=
335 ACPI_ADR_SPACE_FIXED_HARDWARE) {
325 retval = -ENODEV; 336 retval = -ENODEV;
326 goto err2; 337 goto err2;
327 } 338 }
328 339
329 if (acpi_processor_perf->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) { 340 if (acpi_processor_perf->status_register.space_id !=
341 ACPI_ADR_SPACE_FIXED_HARDWARE) {
330 retval = -ENODEV; 342 retval = -ENODEV;
331 goto err2; 343 goto err2;
332 } 344 }
@@ -338,7 +350,8 @@ static int powernow_acpi_init(void)
338 goto err2; 350 goto err2;
339 } 351 }
340 352
341 powernow_table = kzalloc((number_scales + 1) * (sizeof(struct cpufreq_frequency_table)), GFP_KERNEL); 353 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
354 (number_scales + 1)), GFP_KERNEL);
342 if (!powernow_table) { 355 if (!powernow_table) {
343 retval = -ENOMEM; 356 retval = -ENOMEM;
344 goto err2; 357 goto err2;
@@ -352,7 +365,7 @@ static int powernow_acpi_init(void)
352 unsigned int speed, speed_mhz; 365 unsigned int speed, speed_mhz;
353 366
354 pc.val = (unsigned long) state->control; 367 pc.val = (unsigned long) state->control;
355 dprintk ("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", 368 dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
356 i, 369 i,
357 (u32) state->core_frequency, 370 (u32) state->core_frequency,
358 (u32) state->power, 371 (u32) state->power,
@@ -381,12 +394,12 @@ static int powernow_acpi_init(void)
381 if (speed % 1000 > 0) 394 if (speed % 1000 > 0)
382 speed_mhz++; 395 speed_mhz++;
383 396
384 if ((fid_codes[fid] % 10)==5) { 397 if ((fid_codes[fid] % 10) == 5) {
385 if (have_a0 == 1) 398 if (have_a0 == 1)
386 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 399 invalidate_entry(i);
387 } 400 }
388 401
389 dprintk (" FID: 0x%x (%d.%dx [%dMHz]) " 402 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
390 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, 403 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
391 fid_codes[fid] % 10, speed_mhz, vid, 404 fid_codes[fid] % 10, speed_mhz, vid,
392 mobile_vid_table[vid]/1000, 405 mobile_vid_table[vid]/1000,
@@ -422,7 +435,8 @@ err1:
422err05: 435err05:
423 kfree(acpi_processor_perf); 436 kfree(acpi_processor_perf);
424err0: 437err0:
425 printk(KERN_WARNING PFX "ACPI perflib can not be used in this platform\n"); 438 printk(KERN_WARNING PFX "ACPI perflib can not be used on "
439 "this platform\n");
426 acpi_processor_perf = NULL; 440 acpi_processor_perf = NULL;
427 return retval; 441 return retval;
428} 442}
@@ -435,7 +449,14 @@ static int powernow_acpi_init(void)
435} 449}
436#endif 450#endif
437 451
438static int powernow_decode_bios (int maxfid, int startvid) 452static void print_pst_entry(struct pst_s *pst, unsigned int j)
453{
454 dprintk("PST:%d (@%p)\n", j, pst);
455 dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
456 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
457}
458
459static int powernow_decode_bios(int maxfid, int startvid)
439{ 460{
440 struct psb_s *psb; 461 struct psb_s *psb;
441 struct pst_s *pst; 462 struct pst_s *pst;
@@ -446,61 +467,67 @@ static int powernow_decode_bios (int maxfid, int startvid)
446 467
447 etuple = cpuid_eax(0x80000001); 468 etuple = cpuid_eax(0x80000001);
448 469
449 for (i=0xC0000; i < 0xffff0 ; i+=16) { 470 for (i = 0xC0000; i < 0xffff0 ; i += 16) {
450 471
451 p = phys_to_virt(i); 472 p = phys_to_virt(i);
452 473
453 if (memcmp(p, "AMDK7PNOW!", 10) == 0){ 474 if (memcmp(p, "AMDK7PNOW!", 10) == 0) {
454 dprintk ("Found PSB header at %p\n", p); 475 dprintk("Found PSB header at %p\n", p);
455 psb = (struct psb_s *) p; 476 psb = (struct psb_s *) p;
456 dprintk ("Table version: 0x%x\n", psb->tableversion); 477 dprintk("Table version: 0x%x\n", psb->tableversion);
457 if (psb->tableversion != 0x12) { 478 if (psb->tableversion != 0x12) {
458 printk (KERN_INFO PFX "Sorry, only v1.2 tables supported right now\n"); 479 printk(KERN_INFO PFX "Sorry, only v1.2 tables"
480 " supported right now\n");
459 return -ENODEV; 481 return -ENODEV;
460 } 482 }
461 483
462 dprintk ("Flags: 0x%x\n", psb->flags); 484 dprintk("Flags: 0x%x\n", psb->flags);
463 if ((psb->flags & 1)==0) { 485 if ((psb->flags & 1) == 0)
464 dprintk ("Mobile voltage regulator\n"); 486 dprintk("Mobile voltage regulator\n");
465 } else { 487 else
466 dprintk ("Desktop voltage regulator\n"); 488 dprintk("Desktop voltage regulator\n");
467 }
468 489
469 latency = psb->settlingtime; 490 latency = psb->settlingtime;
470 if (latency < 100) { 491 if (latency < 100) {
471 printk(KERN_INFO PFX "BIOS set settling time to %d microseconds. " 492 printk(KERN_INFO PFX "BIOS set settling time "
472 "Should be at least 100. Correcting.\n", latency); 493 "to %d microseconds. "
494 "Should be at least 100. "
495 "Correcting.\n", latency);
473 latency = 100; 496 latency = 100;
474 } 497 }
475 dprintk ("Settling Time: %d microseconds.\n", psb->settlingtime); 498 dprintk("Settling Time: %d microseconds.\n",
476 dprintk ("Has %d PST tables. (Only dumping ones relevant to this CPU).\n", psb->numpst); 499 psb->settlingtime);
500 dprintk("Has %d PST tables. (Only dumping ones "
501 "relevant to this CPU).\n",
502 psb->numpst);
477 503
478 p += sizeof (struct psb_s); 504 p += sizeof(struct psb_s);
479 505
480 pst = (struct pst_s *) p; 506 pst = (struct pst_s *) p;
481 507
482 for (j=0; j<psb->numpst; j++) { 508 for (j = 0; j < psb->numpst; j++) {
483 pst = (struct pst_s *) p; 509 pst = (struct pst_s *) p;
484 number_scales = pst->numpstates; 510 number_scales = pst->numpstates;
485 511
486 if ((etuple == pst->cpuid) && check_fsb(pst->fsbspeed) && 512 if ((etuple == pst->cpuid) &&
487 (maxfid==pst->maxfid) && (startvid==pst->startvid)) 513 check_fsb(pst->fsbspeed) &&
488 { 514 (maxfid == pst->maxfid) &&
489 dprintk ("PST:%d (@%p)\n", j, pst); 515 (startvid == pst->startvid)) {
490 dprintk (" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", 516 print_pst_entry(pst, j);
491 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid); 517 p = (char *)pst + sizeof(struct pst_s);
492 518 ret = get_ranges(p);
493 ret = get_ranges ((char *) pst + sizeof (struct pst_s));
494 return ret; 519 return ret;
495 } else { 520 } else {
496 unsigned int k; 521 unsigned int k;
497 p = (char *) pst + sizeof (struct pst_s); 522 p = (char *)pst + sizeof(struct pst_s);
498 for (k=0; k<number_scales; k++) 523 for (k = 0; k < number_scales; k++)
499 p+=2; 524 p += 2;
500 } 525 }
501 } 526 }
502 printk (KERN_INFO PFX "No PST tables match this cpuid (0x%x)\n", etuple); 527 printk(KERN_INFO PFX "No PST tables match this cpuid "
503 printk (KERN_INFO PFX "This is indicative of a broken BIOS.\n"); 528 "(0x%x)\n", etuple);
529 printk(KERN_INFO PFX "This is indicative of a broken "
530 "BIOS.\n");
504 531
505 return -EINVAL; 532 return -EINVAL;
506 } 533 }
@@ -511,13 +538,14 @@ static int powernow_decode_bios (int maxfid, int startvid)
511} 538}
512 539
513 540
514static int powernow_target (struct cpufreq_policy *policy, 541static int powernow_target(struct cpufreq_policy *policy,
515 unsigned int target_freq, 542 unsigned int target_freq,
516 unsigned int relation) 543 unsigned int relation)
517{ 544{
518 unsigned int newstate; 545 unsigned int newstate;
519 546
520 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, relation, &newstate)) 547 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
548 relation, &newstate))
521 return -EINVAL; 549 return -EINVAL;
522 550
523 change_speed(newstate); 551 change_speed(newstate);
@@ -526,7 +554,7 @@ static int powernow_target (struct cpufreq_policy *policy,
526} 554}
527 555
528 556
529static int powernow_verify (struct cpufreq_policy *policy) 557static int powernow_verify(struct cpufreq_policy *policy)
530{ 558{
531 return cpufreq_frequency_table_verify(policy, powernow_table); 559 return cpufreq_frequency_table_verify(policy, powernow_table);
532} 560}
@@ -566,18 +594,23 @@ static unsigned int powernow_get(unsigned int cpu)
566 594
567 if (cpu) 595 if (cpu)
568 return 0; 596 return 0;
569 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val); 597 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
570 cfid = fidvidstatus.bits.CFID; 598 cfid = fidvidstatus.bits.CFID;
571 599
572 return (fsb * fid_codes[cfid] / 10); 600 return fsb * fid_codes[cfid] / 10;
573} 601}
574 602
575 603
576static int __init acer_cpufreq_pst(const struct dmi_system_id *d) 604static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
577{ 605{
578 printk(KERN_WARNING "%s laptop with broken PST tables in BIOS detected.\n", d->ident); 606 printk(KERN_WARNING PFX
579 printk(KERN_WARNING "You need to downgrade to 3A21 (09/09/2002), or try a newer BIOS than 3A71 (01/20/2003)\n"); 607 "%s laptop with broken PST tables in BIOS detected.\n",
580 printk(KERN_WARNING "cpufreq scaling has been disabled as a result of this.\n"); 608 d->ident);
609 printk(KERN_WARNING PFX
610 "You need to downgrade to 3A21 (09/09/2002), or try a newer "
611 "BIOS than 3A71 (01/20/2003)\n");
612 printk(KERN_WARNING PFX
613 "cpufreq scaling has been disabled as a result of this.\n");
581 return 0; 614 return 0;
582} 615}
583 616
@@ -598,7 +631,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = {
598 { } 631 { }
599}; 632};
600 633
601static int __init powernow_cpu_init (struct cpufreq_policy *policy) 634static int __init powernow_cpu_init(struct cpufreq_policy *policy)
602{ 635{
603 union msr_fidvidstatus fidvidstatus; 636 union msr_fidvidstatus fidvidstatus;
604 int result; 637 int result;
@@ -606,7 +639,7 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy)
606 if (policy->cpu != 0) 639 if (policy->cpu != 0)
607 return -ENODEV; 640 return -ENODEV;
608 641
609 rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val); 642 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
610 643
611 recalibrate_cpu_khz(); 644 recalibrate_cpu_khz();
612 645
@@ -618,19 +651,21 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy)
618 dprintk("FSB: %3dMHz\n", fsb/1000); 651 dprintk("FSB: %3dMHz\n", fsb/1000);
619 652
620 if (dmi_check_system(powernow_dmi_table) || acpi_force) { 653 if (dmi_check_system(powernow_dmi_table) || acpi_force) {
621 printk (KERN_INFO PFX "PSB/PST known to be broken. Trying ACPI instead\n"); 654 printk(KERN_INFO PFX "PSB/PST known to be broken. "
655 "Trying ACPI instead\n");
622 result = powernow_acpi_init(); 656 result = powernow_acpi_init();
623 } else { 657 } else {
624 result = powernow_decode_bios(fidvidstatus.bits.MFID, fidvidstatus.bits.SVID); 658 result = powernow_decode_bios(fidvidstatus.bits.MFID,
659 fidvidstatus.bits.SVID);
625 if (result) { 660 if (result) {
626 printk (KERN_INFO PFX "Trying ACPI perflib\n"); 661 printk(KERN_INFO PFX "Trying ACPI perflib\n");
627 maximum_speed = 0; 662 maximum_speed = 0;
628 minimum_speed = -1; 663 minimum_speed = -1;
629 latency = 0; 664 latency = 0;
630 result = powernow_acpi_init(); 665 result = powernow_acpi_init();
631 if (result) { 666 if (result) {
632 printk (KERN_INFO PFX "ACPI and legacy methods failed\n"); 667 printk(KERN_INFO PFX
633 printk (KERN_INFO PFX "See http://www.codemonkey.org.uk/projects/cpufreq/powernow-k7.html\n"); 668 "ACPI and legacy methods failed\n");
634 } 669 }
635 } else { 670 } else {
636 /* SGTC use the bus clock as timer */ 671 /* SGTC use the bus clock as timer */
@@ -642,10 +677,11 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy)
642 if (result) 677 if (result)
643 return result; 678 return result;
644 679
645 printk (KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n", 680 printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
646 minimum_speed/1000, maximum_speed/1000); 681 minimum_speed/1000, maximum_speed/1000);
647 682
648 policy->cpuinfo.transition_latency = cpufreq_scale(2000000UL, fsb, latency); 683 policy->cpuinfo.transition_latency =
684 cpufreq_scale(2000000UL, fsb, latency);
649 685
650 policy->cur = powernow_get(0); 686 policy->cur = powernow_get(0);
651 687
@@ -654,7 +690,8 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy)
654 return cpufreq_frequency_table_cpuinfo(policy, powernow_table); 690 return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
655} 691}
656 692
657static int powernow_cpu_exit (struct cpufreq_policy *policy) { 693static int powernow_cpu_exit(struct cpufreq_policy *policy)
694{
658 cpufreq_frequency_table_put_attr(policy->cpu); 695 cpufreq_frequency_table_put_attr(policy->cpu);
659 696
660#ifdef CONFIG_X86_POWERNOW_K7_ACPI 697#ifdef CONFIG_X86_POWERNOW_K7_ACPI
@@ -669,7 +706,7 @@ static int powernow_cpu_exit (struct cpufreq_policy *policy) {
669 return 0; 706 return 0;
670} 707}
671 708
672static struct freq_attr* powernow_table_attr[] = { 709static struct freq_attr *powernow_table_attr[] = {
673 &cpufreq_freq_attr_scaling_available_freqs, 710 &cpufreq_freq_attr_scaling_available_freqs,
674 NULL, 711 NULL,
675}; 712};
@@ -685,15 +722,15 @@ static struct cpufreq_driver powernow_driver = {
685 .attr = powernow_table_attr, 722 .attr = powernow_table_attr,
686}; 723};
687 724
688static int __init powernow_init (void) 725static int __init powernow_init(void)
689{ 726{
690 if (check_powernow()==0) 727 if (check_powernow() == 0)
691 return -ENODEV; 728 return -ENODEV;
692 return cpufreq_register_driver(&powernow_driver); 729 return cpufreq_register_driver(&powernow_driver);
693} 730}
694 731
695 732
696static void __exit powernow_exit (void) 733static void __exit powernow_exit(void)
697{ 734{
698 cpufreq_unregister_driver(&powernow_driver); 735 cpufreq_unregister_driver(&powernow_driver);
699} 736}
@@ -701,9 +738,9 @@ static void __exit powernow_exit (void)
701module_param(acpi_force, int, 0444); 738module_param(acpi_force, int, 0444);
702MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); 739MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
703 740
704MODULE_AUTHOR ("Dave Jones <davej@redhat.com>"); 741MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
705MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors."); 742MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
706MODULE_LICENSE ("GPL"); 743MODULE_LICENSE("GPL");
707 744
708late_initcall(powernow_init); 745late_initcall(powernow_init);
709module_exit(powernow_exit); 746module_exit(powernow_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6428aa17b40e..4709ead2db52 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -33,16 +33,14 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/cpumask.h> 34#include <linux/cpumask.h>
35#include <linux/sched.h> /* for current / set_cpus_allowed() */ 35#include <linux/sched.h> /* for current / set_cpus_allowed() */
36#include <linux/io.h>
37#include <linux/delay.h>
36 38
37#include <asm/msr.h> 39#include <asm/msr.h>
38#include <asm/io.h>
39#include <asm/delay.h>
40 40
41#ifdef CONFIG_X86_POWERNOW_K8_ACPI
42#include <linux/acpi.h> 41#include <linux/acpi.h>
43#include <linux/mutex.h> 42#include <linux/mutex.h>
44#include <acpi/processor.h> 43#include <acpi/processor.h>
45#endif
46 44
47#define PFX "powernow-k8: " 45#define PFX "powernow-k8: "
48#define VERSION "version 2.20.00" 46#define VERSION "version 2.20.00"
@@ -56,7 +54,10 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
56static int cpu_family = CPU_OPTERON; 54static int cpu_family = CPU_OPTERON;
57 55
58#ifndef CONFIG_SMP 56#ifndef CONFIG_SMP
59DEFINE_PER_CPU(cpumask_t, cpu_core_map); 57static inline const struct cpumask *cpu_core_mask(int cpu)
58{
59 return cpumask_of(0);
60}
60#endif 61#endif
61 62
62/* Return a frequency in MHz, given an input fid */ 63/* Return a frequency in MHz, given an input fid */
@@ -71,7 +72,8 @@ static u32 find_khz_freq_from_fid(u32 fid)
71 return 1000 * find_freq_from_fid(fid); 72 return 1000 * find_freq_from_fid(fid);
72} 73}
73 74
74static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 pstate) 75static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
76 u32 pstate)
75{ 77{
76 return data[pstate].frequency; 78 return data[pstate].frequency;
77} 79}
@@ -186,7 +188,9 @@ static int write_new_fid(struct powernow_k8_data *data, u32 fid)
186 return 1; 188 return 1;
187 } 189 }
188 190
189 lo = fid | (data->currvid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID; 191 lo = fid;
192 lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
193 lo |= MSR_C_LO_INIT_FID_VID;
190 194
191 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n", 195 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
192 fid, lo, data->plllock * PLL_LOCK_CONVERSION); 196 fid, lo, data->plllock * PLL_LOCK_CONVERSION);
@@ -194,7 +198,9 @@ static int write_new_fid(struct powernow_k8_data *data, u32 fid)
194 do { 198 do {
195 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); 199 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
196 if (i++ > 100) { 200 if (i++ > 100) {
197 printk(KERN_ERR PFX "Hardware error - pending bit very stuck - no further pstate changes possible\n"); 201 printk(KERN_ERR PFX
202 "Hardware error - pending bit very stuck - "
203 "no further pstate changes possible\n");
198 return 1; 204 return 1;
199 } 205 }
200 } while (query_current_values_with_pending_wait(data)); 206 } while (query_current_values_with_pending_wait(data));
@@ -202,14 +208,16 @@ static int write_new_fid(struct powernow_k8_data *data, u32 fid)
202 count_off_irt(data); 208 count_off_irt(data);
203 209
204 if (savevid != data->currvid) { 210 if (savevid != data->currvid) {
205 printk(KERN_ERR PFX "vid change on fid trans, old 0x%x, new 0x%x\n", 211 printk(KERN_ERR PFX
206 savevid, data->currvid); 212 "vid change on fid trans, old 0x%x, new 0x%x\n",
213 savevid, data->currvid);
207 return 1; 214 return 1;
208 } 215 }
209 216
210 if (fid != data->currfid) { 217 if (fid != data->currfid) {
211 printk(KERN_ERR PFX "fid trans failed, fid 0x%x, curr 0x%x\n", fid, 218 printk(KERN_ERR PFX
212 data->currfid); 219 "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
220 data->currfid);
213 return 1; 221 return 1;
214 } 222 }
215 223
@@ -228,7 +236,9 @@ static int write_new_vid(struct powernow_k8_data *data, u32 vid)
228 return 1; 236 return 1;
229 } 237 }
230 238
231 lo = data->currfid | (vid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID; 239 lo = data->currfid;
240 lo |= (vid << MSR_C_LO_VID_SHIFT);
241 lo |= MSR_C_LO_INIT_FID_VID;
232 242
233 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n", 243 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
234 vid, lo, STOP_GRANT_5NS); 244 vid, lo, STOP_GRANT_5NS);
@@ -236,20 +246,24 @@ static int write_new_vid(struct powernow_k8_data *data, u32 vid)
236 do { 246 do {
237 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS); 247 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
238 if (i++ > 100) { 248 if (i++ > 100) {
239 printk(KERN_ERR PFX "internal error - pending bit very stuck - no further pstate changes possible\n"); 249 printk(KERN_ERR PFX "internal error - pending bit "
250 "very stuck - no further pstate "
251 "changes possible\n");
240 return 1; 252 return 1;
241 } 253 }
242 } while (query_current_values_with_pending_wait(data)); 254 } while (query_current_values_with_pending_wait(data));
243 255
244 if (savefid != data->currfid) { 256 if (savefid != data->currfid) {
245 printk(KERN_ERR PFX "fid changed on vid trans, old 0x%x new 0x%x\n", 257 printk(KERN_ERR PFX "fid changed on vid trans, old "
258 "0x%x new 0x%x\n",
246 savefid, data->currfid); 259 savefid, data->currfid);
247 return 1; 260 return 1;
248 } 261 }
249 262
250 if (vid != data->currvid) { 263 if (vid != data->currvid) {
251 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, curr 0x%x\n", vid, 264 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
252 data->currvid); 265 "curr 0x%x\n",
266 vid, data->currvid);
253 return 1; 267 return 1;
254 } 268 }
255 269
@@ -261,7 +275,8 @@ static int write_new_vid(struct powernow_k8_data *data, u32 vid)
261 * Decreasing vid codes represent increasing voltages: 275 * Decreasing vid codes represent increasing voltages:
262 * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off. 276 * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
263 */ 277 */
264static int decrease_vid_code_by_step(struct powernow_k8_data *data, u32 reqvid, u32 step) 278static int decrease_vid_code_by_step(struct powernow_k8_data *data,
279 u32 reqvid, u32 step)
265{ 280{
266 if ((data->currvid - reqvid) > step) 281 if ((data->currvid - reqvid) > step)
267 reqvid = data->currvid - step; 282 reqvid = data->currvid - step;
@@ -283,7 +298,8 @@ static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
283} 298}
284 299
285/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */ 300/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
286static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid) 301static int transition_fid_vid(struct powernow_k8_data *data,
302 u32 reqfid, u32 reqvid)
287{ 303{
288 if (core_voltage_pre_transition(data, reqvid)) 304 if (core_voltage_pre_transition(data, reqvid))
289 return 1; 305 return 1;
@@ -298,7 +314,8 @@ static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 req
298 return 1; 314 return 1;
299 315
300 if ((reqfid != data->currfid) || (reqvid != data->currvid)) { 316 if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
301 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, curr 0x%x 0x%x\n", 317 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
318 "curr 0x%x 0x%x\n",
302 smp_processor_id(), 319 smp_processor_id(),
303 reqfid, reqvid, data->currfid, data->currvid); 320 reqfid, reqvid, data->currfid, data->currvid);
304 return 1; 321 return 1;
@@ -311,13 +328,15 @@ static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 req
311} 328}
312 329
313/* Phase 1 - core voltage transition ... setup voltage */ 330/* Phase 1 - core voltage transition ... setup voltage */
314static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid) 331static int core_voltage_pre_transition(struct powernow_k8_data *data,
332 u32 reqvid)
315{ 333{
316 u32 rvosteps = data->rvo; 334 u32 rvosteps = data->rvo;
317 u32 savefid = data->currfid; 335 u32 savefid = data->currfid;
318 u32 maxvid, lo; 336 u32 maxvid, lo;
319 337
320 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, reqvid 0x%x, rvo 0x%x\n", 338 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
339 "reqvid 0x%x, rvo 0x%x\n",
321 smp_processor_id(), 340 smp_processor_id(),
322 data->currfid, data->currvid, reqvid, data->rvo); 341 data->currfid, data->currvid, reqvid, data->rvo);
323 342
@@ -340,7 +359,7 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid
340 } else { 359 } else {
341 dprintk("ph1: changing vid for rvo, req 0x%x\n", 360 dprintk("ph1: changing vid for rvo, req 0x%x\n",
342 data->currvid - 1); 361 data->currvid - 1);
343 if (decrease_vid_code_by_step(data, data->currvid - 1, 1)) 362 if (decrease_vid_code_by_step(data, data->currvid-1, 1))
344 return 1; 363 return 1;
345 rvosteps--; 364 rvosteps--;
346 } 365 }
@@ -350,7 +369,8 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid
350 return 1; 369 return 1;
351 370
352 if (savefid != data->currfid) { 371 if (savefid != data->currfid) {
353 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", data->currfid); 372 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
373 data->currfid);
354 return 1; 374 return 1;
355 } 375 }
356 376
@@ -363,20 +383,24 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid
363/* Phase 2 - core frequency transition */ 383/* Phase 2 - core frequency transition */
364static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) 384static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
365{ 385{
366 u32 vcoreqfid, vcocurrfid, vcofiddiff, fid_interval, savevid = data->currvid; 386 u32 vcoreqfid, vcocurrfid, vcofiddiff;
387 u32 fid_interval, savevid = data->currvid;
367 388
368 if ((reqfid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) { 389 if ((reqfid < HI_FID_TABLE_BOTTOM) &&
369 printk(KERN_ERR PFX "ph2: illegal lo-lo transition 0x%x 0x%x\n", 390 (data->currfid < HI_FID_TABLE_BOTTOM)) {
370 reqfid, data->currfid); 391 printk(KERN_ERR PFX "ph2: illegal lo-lo transition "
392 "0x%x 0x%x\n", reqfid, data->currfid);
371 return 1; 393 return 1;
372 } 394 }
373 395
374 if (data->currfid == reqfid) { 396 if (data->currfid == reqfid) {
375 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid); 397 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
398 data->currfid);
376 return 0; 399 return 0;
377 } 400 }
378 401
379 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, reqfid 0x%x\n", 402 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
403 "reqfid 0x%x\n",
380 smp_processor_id(), 404 smp_processor_id(),
381 data->currfid, data->currvid, reqfid); 405 data->currfid, data->currvid, reqfid);
382 406
@@ -390,14 +414,14 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
390 414
391 if (reqfid > data->currfid) { 415 if (reqfid > data->currfid) {
392 if (data->currfid > LO_FID_TABLE_TOP) { 416 if (data->currfid > LO_FID_TABLE_TOP) {
393 if (write_new_fid(data, data->currfid + fid_interval)) { 417 if (write_new_fid(data,
418 data->currfid + fid_interval))
394 return 1; 419 return 1;
395 }
396 } else { 420 } else {
397 if (write_new_fid 421 if (write_new_fid
398 (data, 2 + convert_fid_to_vco_fid(data->currfid))) { 422 (data,
423 2 + convert_fid_to_vco_fid(data->currfid)))
399 return 1; 424 return 1;
400 }
401 } 425 }
402 } else { 426 } else {
403 if (write_new_fid(data, data->currfid - fid_interval)) 427 if (write_new_fid(data, data->currfid - fid_interval))
@@ -417,7 +441,8 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
417 441
418 if (data->currfid != reqfid) { 442 if (data->currfid != reqfid) {
419 printk(KERN_ERR PFX 443 printk(KERN_ERR PFX
420 "ph2: mismatch, failed fid transition, curr 0x%x, req 0x%x\n", 444 "ph2: mismatch, failed fid transition, "
445 "curr 0x%x, req 0x%x\n",
421 data->currfid, reqfid); 446 data->currfid, reqfid);
422 return 1; 447 return 1;
423 } 448 }
@@ -435,7 +460,8 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
435} 460}
436 461
437/* Phase 3 - core voltage transition flow ... jump to the final vid. */ 462/* Phase 3 - core voltage transition flow ... jump to the final vid. */
438static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid) 463static int core_voltage_post_transition(struct powernow_k8_data *data,
464 u32 reqvid)
439{ 465{
440 u32 savefid = data->currfid; 466 u32 savefid = data->currfid;
441 u32 savereqvid = reqvid; 467 u32 savereqvid = reqvid;
@@ -457,7 +483,8 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi
457 483
458 if (data->currvid != reqvid) { 484 if (data->currvid != reqvid) {
459 printk(KERN_ERR PFX 485 printk(KERN_ERR PFX
460 "ph3: failed vid transition\n, req 0x%x, curr 0x%x", 486 "ph3: failed vid transition\n, "
487 "req 0x%x, curr 0x%x",
461 reqvid, data->currvid); 488 reqvid, data->currvid);
462 return 1; 489 return 1;
463 } 490 }
@@ -508,7 +535,8 @@ static int check_supported_cpu(unsigned int cpu)
508 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { 535 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
509 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || 536 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
510 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { 537 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
511 printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax); 538 printk(KERN_INFO PFX
539 "Processor cpuid %x not supported\n", eax);
512 goto out; 540 goto out;
513 } 541 }
514 542
@@ -520,8 +548,10 @@ static int check_supported_cpu(unsigned int cpu)
520 } 548 }
521 549
522 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); 550 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
523 if ((edx & P_STATE_TRANSITION_CAPABLE) != P_STATE_TRANSITION_CAPABLE) { 551 if ((edx & P_STATE_TRANSITION_CAPABLE)
524 printk(KERN_INFO PFX "Power state transitions not supported\n"); 552 != P_STATE_TRANSITION_CAPABLE) {
553 printk(KERN_INFO PFX
554 "Power state transitions not supported\n");
525 goto out; 555 goto out;
526 } 556 }
527 } else { /* must be a HW Pstate capable processor */ 557 } else { /* must be a HW Pstate capable processor */
@@ -539,7 +569,8 @@ out:
539 return rc; 569 return rc;
540} 570}
541 571
542static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid) 572static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
573 u8 maxvid)
543{ 574{
544 unsigned int j; 575 unsigned int j;
545 u8 lastfid = 0xff; 576 u8 lastfid = 0xff;
@@ -550,12 +581,14 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8
550 j, pst[j].vid); 581 j, pst[j].vid);
551 return -EINVAL; 582 return -EINVAL;
552 } 583 }
553 if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */ 584 if (pst[j].vid < data->rvo) {
585 /* vid + rvo >= 0 */
554 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate" 586 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
555 " %d\n", j); 587 " %d\n", j);
556 return -ENODEV; 588 return -ENODEV;
557 } 589 }
558 if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */ 590 if (pst[j].vid < maxvid + data->rvo) {
591 /* vid + rvo >= maxvid */
559 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate" 592 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
560 " %d\n", j); 593 " %d\n", j);
561 return -ENODEV; 594 return -ENODEV;
@@ -579,23 +612,31 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8
579 return -EINVAL; 612 return -EINVAL;
580 } 613 }
581 if (lastfid > LO_FID_TABLE_TOP) 614 if (lastfid > LO_FID_TABLE_TOP)
582 printk(KERN_INFO FW_BUG PFX "first fid not from lo freq table\n"); 615 printk(KERN_INFO FW_BUG PFX
616 "first fid not from lo freq table\n");
583 617
584 return 0; 618 return 0;
585} 619}
586 620
621static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry)
622{
623 data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
624}
625
587static void print_basics(struct powernow_k8_data *data) 626static void print_basics(struct powernow_k8_data *data)
588{ 627{
589 int j; 628 int j;
590 for (j = 0; j < data->numps; j++) { 629 for (j = 0; j < data->numps; j++) {
591 if (data->powernow_table[j].frequency != CPUFREQ_ENTRY_INVALID) { 630 if (data->powernow_table[j].frequency !=
631 CPUFREQ_ENTRY_INVALID) {
592 if (cpu_family == CPU_HW_PSTATE) { 632 if (cpu_family == CPU_HW_PSTATE) {
593 printk(KERN_INFO PFX " %d : pstate %d (%d MHz)\n", 633 printk(KERN_INFO PFX
594 j, 634 " %d : pstate %d (%d MHz)\n", j,
595 data->powernow_table[j].index, 635 data->powernow_table[j].index,
596 data->powernow_table[j].frequency/1000); 636 data->powernow_table[j].frequency/1000);
597 } else { 637 } else {
598 printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x\n", 638 printk(KERN_INFO PFX
639 " %d : fid 0x%x (%d MHz), vid 0x%x\n",
599 j, 640 j,
600 data->powernow_table[j].index & 0xff, 641 data->powernow_table[j].index & 0xff,
601 data->powernow_table[j].frequency/1000, 642 data->powernow_table[j].frequency/1000,
@@ -604,20 +645,25 @@ static void print_basics(struct powernow_k8_data *data)
604 } 645 }
605 } 646 }
606 if (data->batps) 647 if (data->batps)
607 printk(KERN_INFO PFX "Only %d pstates on battery\n", data->batps); 648 printk(KERN_INFO PFX "Only %d pstates on battery\n",
649 data->batps);
608} 650}
609 651
610static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid) 652static int fill_powernow_table(struct powernow_k8_data *data,
653 struct pst_s *pst, u8 maxvid)
611{ 654{
612 struct cpufreq_frequency_table *powernow_table; 655 struct cpufreq_frequency_table *powernow_table;
613 unsigned int j; 656 unsigned int j;
614 657
615 if (data->batps) { /* use ACPI support to get full speed on mains power */ 658 if (data->batps) {
616 printk(KERN_WARNING PFX "Only %d pstates usable (use ACPI driver for full range\n", data->batps); 659 /* use ACPI support to get full speed on mains power */
660 printk(KERN_WARNING PFX
661 "Only %d pstates usable (use ACPI driver for full "
662 "range\n", data->batps);
617 data->numps = data->batps; 663 data->numps = data->batps;
618 } 664 }
619 665
620 for ( j=1; j<data->numps; j++ ) { 666 for (j = 1; j < data->numps; j++) {
621 if (pst[j-1].fid >= pst[j].fid) { 667 if (pst[j-1].fid >= pst[j].fid) {
622 printk(KERN_ERR PFX "PST out of sequence\n"); 668 printk(KERN_ERR PFX "PST out of sequence\n");
623 return -EINVAL; 669 return -EINVAL;
@@ -640,9 +686,11 @@ static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst,
640 } 686 }
641 687
642 for (j = 0; j < data->numps; j++) { 688 for (j = 0; j < data->numps; j++) {
689 int freq;
643 powernow_table[j].index = pst[j].fid; /* lower 8 bits */ 690 powernow_table[j].index = pst[j].fid; /* lower 8 bits */
644 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ 691 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
645 powernow_table[j].frequency = find_khz_freq_from_fid(pst[j].fid); 692 freq = find_khz_freq_from_fid(pst[j].fid);
693 powernow_table[j].frequency = freq;
646 } 694 }
647 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END; 695 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
648 powernow_table[data->numps].index = 0; 696 powernow_table[data->numps].index = 0;
@@ -654,11 +702,12 @@ static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst,
654 702
655 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); 703 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
656 data->powernow_table = powernow_table; 704 data->powernow_table = powernow_table;
657 if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) 705 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
658 print_basics(data); 706 print_basics(data);
659 707
660 for (j = 0; j < data->numps; j++) 708 for (j = 0; j < data->numps; j++)
661 if ((pst[j].fid==data->currfid) && (pst[j].vid==data->currvid)) 709 if ((pst[j].fid == data->currfid) &&
710 (pst[j].vid == data->currvid))
662 return 0; 711 return 0;
663 712
664 dprintk("currfid/vid do not match PST, ignoring\n"); 713 dprintk("currfid/vid do not match PST, ignoring\n");
@@ -698,7 +747,8 @@ static int find_psb_table(struct powernow_k8_data *data)
698 } 747 }
699 748
700 data->vstable = psb->vstable; 749 data->vstable = psb->vstable;
701 dprintk("voltage stabilization time: %d(*20us)\n", data->vstable); 750 dprintk("voltage stabilization time: %d(*20us)\n",
751 data->vstable);
702 752
703 dprintk("flags2: 0x%x\n", psb->flags2); 753 dprintk("flags2: 0x%x\n", psb->flags2);
704 data->rvo = psb->flags2 & 3; 754 data->rvo = psb->flags2 & 3;
@@ -713,11 +763,12 @@ static int find_psb_table(struct powernow_k8_data *data)
713 763
714 dprintk("numpst: 0x%x\n", psb->num_tables); 764 dprintk("numpst: 0x%x\n", psb->num_tables);
715 cpst = psb->num_tables; 765 cpst = psb->num_tables;
716 if ((psb->cpuid == 0x00000fc0) || (psb->cpuid == 0x00000fe0) ){ 766 if ((psb->cpuid == 0x00000fc0) ||
767 (psb->cpuid == 0x00000fe0)) {
717 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); 768 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
718 if ((thiscpuid == 0x00000fc0) || (thiscpuid == 0x00000fe0) ) { 769 if ((thiscpuid == 0x00000fc0) ||
770 (thiscpuid == 0x00000fe0))
719 cpst = 1; 771 cpst = 1;
720 }
721 } 772 }
722 if (cpst != 1) { 773 if (cpst != 1) {
723 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n"); 774 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
@@ -732,7 +783,8 @@ static int find_psb_table(struct powernow_k8_data *data)
732 783
733 data->numps = psb->numps; 784 data->numps = psb->numps;
734 dprintk("numpstates: 0x%x\n", data->numps); 785 dprintk("numpstates: 0x%x\n", data->numps);
735 return fill_powernow_table(data, (struct pst_s *)(psb+1), maxvid); 786 return fill_powernow_table(data,
787 (struct pst_s *)(psb+1), maxvid);
736 } 788 }
737 /* 789 /*
738 * If you see this message, complain to BIOS manufacturer. If 790 * If you see this message, complain to BIOS manufacturer. If
@@ -745,28 +797,31 @@ static int find_psb_table(struct powernow_k8_data *data)
745 * BIOS and Kernel Developer's Guide, which is available on 797 * BIOS and Kernel Developer's Guide, which is available on
746 * www.amd.com 798 * www.amd.com
747 */ 799 */
748 printk(KERN_ERR PFX "BIOS error - no PSB or ACPI _PSS objects\n"); 800 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
749 return -ENODEV; 801 return -ENODEV;
750} 802}
751 803
752#ifdef CONFIG_X86_POWERNOW_K8_ACPI 804static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
753static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) 805 unsigned int index)
754{ 806{
807 acpi_integer control;
808
755 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 809 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
756 return; 810 return;
757 811
758 data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK; 812 control = data->acpi_data.states[index].control; data->irt = (control
759 data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK; 813 >> IRT_SHIFT) & IRT_MASK; data->rvo = (control >>
760 data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; 814 RVO_SHIFT) & RVO_MASK; data->exttype = (control
761 data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; 815 >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
762 data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK); 816 data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; data->vidmvs = 1
763 data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK; 817 << ((control >> MVS_SHIFT) & MVS_MASK); data->vstable =
764} 818 (control >> VST_SHIFT) & VST_MASK; }
765 819
766static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) 820static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
767{ 821{
768 struct cpufreq_frequency_table *powernow_table; 822 struct cpufreq_frequency_table *powernow_table;
769 int ret_val = -ENODEV; 823 int ret_val = -ENODEV;
824 acpi_integer space_id;
770 825
771 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 826 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
772 dprintk("register performance failed: bad ACPI data\n"); 827 dprintk("register performance failed: bad ACPI data\n");
@@ -779,11 +834,12 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
779 goto err_out; 834 goto err_out;
780 } 835 }
781 836
782 if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || 837 space_id = data->acpi_data.control_register.space_id;
783 (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { 838 if ((space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
839 (space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
784 dprintk("Invalid control/status registers (%x - %x)\n", 840 dprintk("Invalid control/status registers (%x - %x)\n",
785 data->acpi_data.control_register.space_id, 841 data->acpi_data.control_register.space_id,
786 data->acpi_data.status_register.space_id); 842 space_id);
787 goto err_out; 843 goto err_out;
788 } 844 }
789 845
@@ -802,13 +858,14 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
802 if (ret_val) 858 if (ret_val)
803 goto err_out_mem; 859 goto err_out_mem;
804 860
805 powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END; 861 powernow_table[data->acpi_data.state_count].frequency =
862 CPUFREQ_TABLE_END;
806 powernow_table[data->acpi_data.state_count].index = 0; 863 powernow_table[data->acpi_data.state_count].index = 0;
807 data->powernow_table = powernow_table; 864 data->powernow_table = powernow_table;
808 865
809 /* fill in data */ 866 /* fill in data */
810 data->numps = data->acpi_data.state_count; 867 data->numps = data->acpi_data.state_count;
811 if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) 868 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
812 print_basics(data); 869 print_basics(data);
813 powernow_k8_acpi_pst_values(data, 0); 870 powernow_k8_acpi_pst_values(data, 0);
814 871
@@ -830,13 +887,15 @@ err_out_mem:
830err_out: 887err_out:
831 acpi_processor_unregister_performance(&data->acpi_data, data->cpu); 888 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
832 889
833 /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ 890 /* data->acpi_data.state_count informs us at ->exit()
891 * whether ACPI was used */
834 data->acpi_data.state_count = 0; 892 data->acpi_data.state_count = 0;
835 893
836 return ret_val; 894 return ret_val;
837} 895}
838 896
839static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) 897static int fill_powernow_table_pstate(struct powernow_k8_data *data,
898 struct cpufreq_frequency_table *powernow_table)
840{ 899{
841 int i; 900 int i;
842 u32 hi = 0, lo = 0; 901 u32 hi = 0, lo = 0;
@@ -848,84 +907,101 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf
848 907
849 index = data->acpi_data.states[i].control & HW_PSTATE_MASK; 908 index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
850 if (index > data->max_hw_pstate) { 909 if (index > data->max_hw_pstate) {
851 printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index); 910 printk(KERN_ERR PFX "invalid pstate %d - "
852 printk(KERN_ERR PFX "Please report to BIOS manufacturer\n"); 911 "bad value %d.\n", i, index);
853 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 912 printk(KERN_ERR PFX "Please report to BIOS "
913 "manufacturer\n");
914 invalidate_entry(data, i);
854 continue; 915 continue;
855 } 916 }
856 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); 917 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
857 if (!(hi & HW_PSTATE_VALID_MASK)) { 918 if (!(hi & HW_PSTATE_VALID_MASK)) {
858 dprintk("invalid pstate %d, ignoring\n", index); 919 dprintk("invalid pstate %d, ignoring\n", index);
859 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 920 invalidate_entry(data, i);
860 continue; 921 continue;
861 } 922 }
862 923
863 powernow_table[i].index = index; 924 powernow_table[i].index = index;
864 925
865 powernow_table[i].frequency = data->acpi_data.states[i].core_frequency * 1000; 926 powernow_table[i].frequency =
927 data->acpi_data.states[i].core_frequency * 1000;
866 } 928 }
867 return 0; 929 return 0;
868} 930}
869 931
870static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) 932static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
933 struct cpufreq_frequency_table *powernow_table)
871{ 934{
872 int i; 935 int i;
873 int cntlofreq = 0; 936 int cntlofreq = 0;
937
874 for (i = 0; i < data->acpi_data.state_count; i++) { 938 for (i = 0; i < data->acpi_data.state_count; i++) {
875 u32 fid; 939 u32 fid;
876 u32 vid; 940 u32 vid;
941 u32 freq, index;
942 acpi_integer status, control;
877 943
878 if (data->exttype) { 944 if (data->exttype) {
879 fid = data->acpi_data.states[i].status & EXT_FID_MASK; 945 status = data->acpi_data.states[i].status;
880 vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK; 946 fid = status & EXT_FID_MASK;
947 vid = (status >> VID_SHIFT) & EXT_VID_MASK;
881 } else { 948 } else {
882 fid = data->acpi_data.states[i].control & FID_MASK; 949 control = data->acpi_data.states[i].control;
883 vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK; 950 fid = control & FID_MASK;
951 vid = (control >> VID_SHIFT) & VID_MASK;
884 } 952 }
885 953
886 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); 954 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
887 955
888 powernow_table[i].index = fid; /* lower 8 bits */ 956 index = fid | (vid<<8);
889 powernow_table[i].index |= (vid << 8); /* upper 8 bits */ 957 powernow_table[i].index = index;
890 powernow_table[i].frequency = find_khz_freq_from_fid(fid); 958
959 freq = find_khz_freq_from_fid(fid);
960 powernow_table[i].frequency = freq;
891 961
892 /* verify frequency is OK */ 962 /* verify frequency is OK */
893 if ((powernow_table[i].frequency > (MAX_FREQ * 1000)) || 963 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
894 (powernow_table[i].frequency < (MIN_FREQ * 1000))) { 964 dprintk("invalid freq %u kHz, ignoring\n", freq);
895 dprintk("invalid freq %u kHz, ignoring\n", powernow_table[i].frequency); 965 invalidate_entry(data, i);
896 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
897 continue; 966 continue;
898 } 967 }
899 968
900 /* verify voltage is OK - BIOSs are using "off" to indicate invalid */ 969 /* verify voltage is OK -
970 * BIOSs are using "off" to indicate invalid */
901 if (vid == VID_OFF) { 971 if (vid == VID_OFF) {
902 dprintk("invalid vid %u, ignoring\n", vid); 972 dprintk("invalid vid %u, ignoring\n", vid);
903 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 973 invalidate_entry(data, i);
904 continue; 974 continue;
905 } 975 }
906 976
907 /* verify only 1 entry from the lo frequency table */ 977 /* verify only 1 entry from the lo frequency table */
908 if (fid < HI_FID_TABLE_BOTTOM) { 978 if (fid < HI_FID_TABLE_BOTTOM) {
909 if (cntlofreq) { 979 if (cntlofreq) {
910 /* if both entries are the same, ignore this one ... */ 980 /* if both entries are the same,
911 if ((powernow_table[i].frequency != powernow_table[cntlofreq].frequency) || 981 * ignore this one ... */
912 (powernow_table[i].index != powernow_table[cntlofreq].index)) { 982 if ((freq != powernow_table[cntlofreq].frequency) ||
913 printk(KERN_ERR PFX "Too many lo freq table entries\n"); 983 (index != powernow_table[cntlofreq].index)) {
984 printk(KERN_ERR PFX
985 "Too many lo freq table "
986 "entries\n");
914 return 1; 987 return 1;
915 } 988 }
916 989
917 dprintk("double low frequency table entry, ignoring it.\n"); 990 dprintk("double low frequency table entry, "
918 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 991 "ignoring it.\n");
992 invalidate_entry(data, i);
919 continue; 993 continue;
920 } else 994 } else
921 cntlofreq = i; 995 cntlofreq = i;
922 } 996 }
923 997
924 if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) { 998 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
925 printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n", 999 printk(KERN_INFO PFX "invalid freq entries "
926 powernow_table[i].frequency, 1000 "%u kHz vs. %u kHz\n", freq,
927 (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); 1001 (unsigned int)
928 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 1002 (data->acpi_data.states[i].core_frequency
1003 * 1000));
1004 invalidate_entry(data, i);
929 continue; 1005 continue;
930 } 1006 }
931 } 1007 }
@@ -935,7 +1011,8 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf
935static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) 1011static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
936{ 1012{
937 if (data->acpi_data.state_count) 1013 if (data->acpi_data.state_count)
938 acpi_processor_unregister_performance(&data->acpi_data, data->cpu); 1014 acpi_processor_unregister_performance(&data->acpi_data,
1015 data->cpu);
939 free_cpumask_var(data->acpi_data.shared_cpu_map); 1016 free_cpumask_var(data->acpi_data.shared_cpu_map);
940} 1017}
941 1018
@@ -953,15 +1030,9 @@ static int get_transition_latency(struct powernow_k8_data *data)
953 return 1000 * max_latency; 1030 return 1000 * max_latency;
954} 1031}
955 1032
956#else
957static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; }
958static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; }
959static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; }
960static int get_transition_latency(struct powernow_k8_data *data) { return 0; }
961#endif /* CONFIG_X86_POWERNOW_K8_ACPI */
962
963/* Take a frequency, and issue the fid/vid transition command */ 1033/* Take a frequency, and issue the fid/vid transition command */
964static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned int index) 1034static int transition_frequency_fidvid(struct powernow_k8_data *data,
1035 unsigned int index)
965{ 1036{
966 u32 fid = 0; 1037 u32 fid = 0;
967 u32 vid = 0; 1038 u32 vid = 0;
@@ -989,7 +1060,8 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
989 return 0; 1060 return 0;
990 } 1061 }
991 1062
992 if ((fid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) { 1063 if ((fid < HI_FID_TABLE_BOTTOM) &&
1064 (data->currfid < HI_FID_TABLE_BOTTOM)) {
993 printk(KERN_ERR PFX 1065 printk(KERN_ERR PFX
994 "ignoring illegal change in lo freq table-%x to 0x%x\n", 1066 "ignoring illegal change in lo freq table-%x to 0x%x\n",
995 data->currfid, fid); 1067 data->currfid, fid);
@@ -1017,7 +1089,8 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
1017} 1089}
1018 1090
1019/* Take a frequency, and issue the hardware pstate transition command */ 1091/* Take a frequency, and issue the hardware pstate transition command */
1020static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned int index) 1092static int transition_frequency_pstate(struct powernow_k8_data *data,
1093 unsigned int index)
1021{ 1094{
1022 u32 pstate = 0; 1095 u32 pstate = 0;
1023 int res, i; 1096 int res, i;
@@ -1029,7 +1102,8 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1029 pstate = index & HW_PSTATE_MASK; 1102 pstate = index & HW_PSTATE_MASK;
1030 if (pstate > data->max_hw_pstate) 1103 if (pstate > data->max_hw_pstate)
1031 return 0; 1104 return 0;
1032 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 1105 freqs.old = find_khz_freq_from_pstate(data->powernow_table,
1106 data->currpstate);
1033 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1107 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1034 1108
1035 for_each_cpu_mask_nr(i, *(data->available_cores)) { 1109 for_each_cpu_mask_nr(i, *(data->available_cores)) {
@@ -1048,7 +1122,8 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1048} 1122}
1049 1123
1050/* Driver entry point to switch to the target frequency */ 1124/* Driver entry point to switch to the target frequency */
1051static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) 1125static int powernowk8_target(struct cpufreq_policy *pol,
1126 unsigned targfreq, unsigned relation)
1052{ 1127{
1053 cpumask_t oldmask; 1128 cpumask_t oldmask;
1054 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1129 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
@@ -1087,14 +1162,18 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
1087 dprintk("targ: curr fid 0x%x, vid 0x%x\n", 1162 dprintk("targ: curr fid 0x%x, vid 0x%x\n",
1088 data->currfid, data->currvid); 1163 data->currfid, data->currvid);
1089 1164
1090 if ((checkvid != data->currvid) || (checkfid != data->currfid)) { 1165 if ((checkvid != data->currvid) ||
1166 (checkfid != data->currfid)) {
1091 printk(KERN_INFO PFX 1167 printk(KERN_INFO PFX
1092 "error - out of sync, fix 0x%x 0x%x, vid 0x%x 0x%x\n", 1168 "error - out of sync, fix 0x%x 0x%x, "
1093 checkfid, data->currfid, checkvid, data->currvid); 1169 "vid 0x%x 0x%x\n",
1170 checkfid, data->currfid,
1171 checkvid, data->currvid);
1094 } 1172 }
1095 } 1173 }
1096 1174
1097 if (cpufreq_frequency_table_target(pol, data->powernow_table, targfreq, relation, &newstate)) 1175 if (cpufreq_frequency_table_target(pol, data->powernow_table,
1176 targfreq, relation, &newstate))
1098 goto err_out; 1177 goto err_out;
1099 1178
1100 mutex_lock(&fidvid_mutex); 1179 mutex_lock(&fidvid_mutex);
@@ -1114,7 +1193,8 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
1114 mutex_unlock(&fidvid_mutex); 1193 mutex_unlock(&fidvid_mutex);
1115 1194
1116 if (cpu_family == CPU_HW_PSTATE) 1195 if (cpu_family == CPU_HW_PSTATE)
1117 pol->cur = find_khz_freq_from_pstate(data->powernow_table, newstate); 1196 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1197 newstate);
1118 else 1198 else
1119 pol->cur = find_khz_freq_from_fid(data->currfid); 1199 pol->cur = find_khz_freq_from_fid(data->currfid);
1120 ret = 0; 1200 ret = 0;
@@ -1141,6 +1221,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1141 struct powernow_k8_data *data; 1221 struct powernow_k8_data *data;
1142 cpumask_t oldmask; 1222 cpumask_t oldmask;
1143 int rc; 1223 int rc;
1224 static int print_once;
1144 1225
1145 if (!cpu_online(pol->cpu)) 1226 if (!cpu_online(pol->cpu))
1146 return -ENODEV; 1227 return -ENODEV;
@@ -1163,33 +1244,31 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1163 * an UP version, and is deprecated by AMD. 1244 * an UP version, and is deprecated by AMD.
1164 */ 1245 */
1165 if (num_online_cpus() != 1) { 1246 if (num_online_cpus() != 1) {
1166#ifndef CONFIG_ACPI_PROCESSOR 1247 /*
1167 printk(KERN_ERR PFX "ACPI Processor support is required " 1248 * Replace this one with print_once as soon as such a
1168 "for SMP systems but is absent. Please load the " 1249 * thing gets introduced
1169 "ACPI Processor module before starting this " 1250 */
1170 "driver.\n"); 1251 if (!print_once) {
1171#else 1252 WARN_ONCE(1, KERN_ERR FW_BUG PFX "Your BIOS "
1172 printk(KERN_ERR FW_BUG PFX "Your BIOS does not provide" 1253 "does not provide ACPI _PSS objects "
1173 " ACPI _PSS objects in a way that Linux " 1254 "in a way that Linux understands. "
1174 "understands. Please report this to the Linux " 1255 "Please report this to the Linux ACPI"
1175 "ACPI maintainers and complain to your BIOS " 1256 " maintainers and complain to your "
1176 "vendor.\n"); 1257 "BIOS vendor.\n");
1177#endif 1258 print_once++;
1178 kfree(data); 1259 }
1179 return -ENODEV; 1260 goto err_out;
1180 } 1261 }
1181 if (pol->cpu != 0) { 1262 if (pol->cpu != 0) {
1182 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " 1263 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
1183 "CPU other than CPU0. Complain to your BIOS " 1264 "CPU other than CPU0. Complain to your BIOS "
1184 "vendor.\n"); 1265 "vendor.\n");
1185 kfree(data); 1266 goto err_out;
1186 return -ENODEV;
1187 } 1267 }
1188 rc = find_psb_table(data); 1268 rc = find_psb_table(data);
1189 if (rc) { 1269 if (rc)
1190 kfree(data); 1270 goto err_out;
1191 return -ENODEV; 1271
1192 }
1193 /* Take a crude guess here. 1272 /* Take a crude guess here.
1194 * That guess was in microseconds, so multiply with 1000 */ 1273 * That guess was in microseconds, so multiply with 1000 */
1195 pol->cpuinfo.transition_latency = ( 1274 pol->cpuinfo.transition_latency = (
@@ -1204,16 +1283,16 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1204 1283
1205 if (smp_processor_id() != pol->cpu) { 1284 if (smp_processor_id() != pol->cpu) {
1206 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1285 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
1207 goto err_out; 1286 goto err_out_unmask;
1208 } 1287 }
1209 1288
1210 if (pending_bit_stuck()) { 1289 if (pending_bit_stuck()) {
1211 printk(KERN_ERR PFX "failing init, change pending bit set\n"); 1290 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1212 goto err_out; 1291 goto err_out_unmask;
1213 } 1292 }
1214 1293
1215 if (query_current_values_with_pending_wait(data)) 1294 if (query_current_values_with_pending_wait(data))
1216 goto err_out; 1295 goto err_out_unmask;
1217 1296
1218 if (cpu_family == CPU_OPTERON) 1297 if (cpu_family == CPU_OPTERON)
1219 fidvid_msr_init(); 1298 fidvid_msr_init();
@@ -1224,11 +1303,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1224 if (cpu_family == CPU_HW_PSTATE) 1303 if (cpu_family == CPU_HW_PSTATE)
1225 cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); 1304 cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
1226 else 1305 else
1227 cpumask_copy(pol->cpus, &per_cpu(cpu_core_map, pol->cpu)); 1306 cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
1228 data->available_cores = pol->cpus; 1307 data->available_cores = pol->cpus;
1229 1308
1230 if (cpu_family == CPU_HW_PSTATE) 1309 if (cpu_family == CPU_HW_PSTATE)
1231 pol->cur = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 1310 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1311 data->currpstate);
1232 else 1312 else
1233 pol->cur = find_khz_freq_from_fid(data->currfid); 1313 pol->cur = find_khz_freq_from_fid(data->currfid);
1234 dprintk("policy current frequency %d kHz\n", pol->cur); 1314 dprintk("policy current frequency %d kHz\n", pol->cur);
@@ -1245,7 +1325,8 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1245 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); 1325 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1246 1326
1247 if (cpu_family == CPU_HW_PSTATE) 1327 if (cpu_family == CPU_HW_PSTATE)
1248 dprintk("cpu_init done, current pstate 0x%x\n", data->currpstate); 1328 dprintk("cpu_init done, current pstate 0x%x\n",
1329 data->currpstate);
1249 else 1330 else
1250 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", 1331 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1251 data->currfid, data->currvid); 1332 data->currfid, data->currvid);
@@ -1254,15 +1335,16 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1254 1335
1255 return 0; 1336 return 0;
1256 1337
1257err_out: 1338err_out_unmask:
1258 set_cpus_allowed_ptr(current, &oldmask); 1339 set_cpus_allowed_ptr(current, &oldmask);
1259 powernow_k8_cpu_exit_acpi(data); 1340 powernow_k8_cpu_exit_acpi(data);
1260 1341
1342err_out:
1261 kfree(data); 1343 kfree(data);
1262 return -ENODEV; 1344 return -ENODEV;
1263} 1345}
1264 1346
1265static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol) 1347static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1266{ 1348{
1267 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1349 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1268 1350
@@ -1279,14 +1361,14 @@ static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
1279 return 0; 1361 return 0;
1280} 1362}
1281 1363
1282static unsigned int powernowk8_get (unsigned int cpu) 1364static unsigned int powernowk8_get(unsigned int cpu)
1283{ 1365{
1284 struct powernow_k8_data *data; 1366 struct powernow_k8_data *data;
1285 cpumask_t oldmask = current->cpus_allowed; 1367 cpumask_t oldmask = current->cpus_allowed;
1286 unsigned int khz = 0; 1368 unsigned int khz = 0;
1287 unsigned int first; 1369 unsigned int first;
1288 1370
1289 first = first_cpu(per_cpu(cpu_core_map, cpu)); 1371 first = cpumask_first(cpu_core_mask(cpu));
1290 data = per_cpu(powernow_data, first); 1372 data = per_cpu(powernow_data, first);
1291 1373
1292 if (!data) 1374 if (!data)
@@ -1315,7 +1397,7 @@ out:
1315 return khz; 1397 return khz;
1316} 1398}
1317 1399
1318static struct freq_attr* powernow_k8_attr[] = { 1400static struct freq_attr *powernow_k8_attr[] = {
1319 &cpufreq_freq_attr_scaling_available_freqs, 1401 &cpufreq_freq_attr_scaling_available_freqs,
1320 NULL, 1402 NULL,
1321}; 1403};
@@ -1360,7 +1442,8 @@ static void __exit powernowk8_exit(void)
1360 cpufreq_unregister_driver(&cpufreq_amd64_driver); 1442 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1361} 1443}
1362 1444
1363MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and Mark Langsdorf <mark.langsdorf@amd.com>"); 1445MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
1446 "Mark Langsdorf <mark.langsdorf@amd.com>");
1364MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver."); 1447MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
1365MODULE_LICENSE("GPL"); 1448MODULE_LICENSE("GPL");
1366 1449
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 8ecc75b6c7c3..6c6698feade1 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -45,11 +45,10 @@ struct powernow_k8_data {
45 * frequency is in kHz */ 45 * frequency is in kHz */
46 struct cpufreq_frequency_table *powernow_table; 46 struct cpufreq_frequency_table *powernow_table;
47 47
48#ifdef CONFIG_X86_POWERNOW_K8_ACPI
49 /* the acpi table needs to be kept. it's only available if ACPI was 48 /* the acpi table needs to be kept. it's only available if ACPI was
50 * used to determine valid frequency/vid/fid states */ 49 * used to determine valid frequency/vid/fid states */
51 struct acpi_processor_performance acpi_data; 50 struct acpi_processor_performance acpi_data;
52#endif 51
53 /* we need to keep track of associated cores, but let cpufreq 52 /* we need to keep track of associated cores, but let cpufreq
54 * handle hotplug events - so just point at cpufreq pol->cpus 53 * handle hotplug events - so just point at cpufreq pol->cpus
55 * structure */ 54 * structure */
@@ -222,10 +221,8 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
222 221
223static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index); 222static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
224 223
225#ifdef CONFIG_X86_POWERNOW_K8_ACPI
226static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); 224static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
227static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); 225static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
228#endif
229 226
230#ifdef CONFIG_SMP 227#ifdef CONFIG_SMP
231static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[]) 228static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
index 42da9bd677d6..435a996a613a 100644
--- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
+++ b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
@@ -19,17 +19,19 @@
19 19
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/cpufreq.h> 21#include <linux/cpufreq.h>
22#include <linux/timex.h>
23#include <linux/io.h>
22 24
23#include <asm/msr.h> 25#include <asm/msr.h>
24#include <asm/timex.h>
25#include <asm/io.h>
26 26
27#define MMCR_BASE 0xfffef000 /* The default base address */ 27#define MMCR_BASE 0xfffef000 /* The default base address */
28#define OFFS_CPUCTL 0x2 /* CPU Control Register */ 28#define OFFS_CPUCTL 0x2 /* CPU Control Register */
29 29
30static __u8 __iomem *cpuctl; 30static __u8 __iomem *cpuctl;
31 31
32#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "sc520_freq", msg) 32#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
33 "sc520_freq", msg)
34#define PFX "sc520_freq: "
33 35
34static struct cpufreq_frequency_table sc520_freq_table[] = { 36static struct cpufreq_frequency_table sc520_freq_table[] = {
35 {0x01, 100000}, 37 {0x01, 100000},
@@ -43,7 +45,8 @@ static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
43 45
44 switch (clockspeed_reg & 0x03) { 46 switch (clockspeed_reg & 0x03) {
45 default: 47 default:
46 printk(KERN_ERR "sc520_freq: error: cpuctl register has unexpected value %02x\n", clockspeed_reg); 48 printk(KERN_ERR PFX "error: cpuctl register has unexpected "
49 "value %02x\n", clockspeed_reg);
47 case 0x01: 50 case 0x01:
48 return 100000; 51 return 100000;
49 case 0x02: 52 case 0x02:
@@ -51,7 +54,7 @@ static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
51 } 54 }
52} 55}
53 56
54static void sc520_freq_set_cpu_state (unsigned int state) 57static void sc520_freq_set_cpu_state(unsigned int state)
55{ 58{
56 59
57 struct cpufreq_freqs freqs; 60 struct cpufreq_freqs freqs;
@@ -76,18 +79,19 @@ static void sc520_freq_set_cpu_state (unsigned int state)
76 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 79 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
77}; 80};
78 81
79static int sc520_freq_verify (struct cpufreq_policy *policy) 82static int sc520_freq_verify(struct cpufreq_policy *policy)
80{ 83{
81 return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]); 84 return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
82} 85}
83 86
84static int sc520_freq_target (struct cpufreq_policy *policy, 87static int sc520_freq_target(struct cpufreq_policy *policy,
85 unsigned int target_freq, 88 unsigned int target_freq,
86 unsigned int relation) 89 unsigned int relation)
87{ 90{
88 unsigned int newstate = 0; 91 unsigned int newstate = 0;
89 92
90 if (cpufreq_frequency_table_target(policy, sc520_freq_table, target_freq, relation, &newstate)) 93 if (cpufreq_frequency_table_target(policy, sc520_freq_table,
94 target_freq, relation, &newstate))
91 return -EINVAL; 95 return -EINVAL;
92 96
93 sc520_freq_set_cpu_state(newstate); 97 sc520_freq_set_cpu_state(newstate);
@@ -116,7 +120,7 @@ static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
116 120
117 result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table); 121 result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
118 if (result) 122 if (result)
119 return (result); 123 return result;
120 124
121 cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu); 125 cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
122 126
@@ -131,7 +135,7 @@ static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
131} 135}
132 136
133 137
134static struct freq_attr* sc520_freq_attr[] = { 138static struct freq_attr *sc520_freq_attr[] = {
135 &cpufreq_freq_attr_scaling_available_freqs, 139 &cpufreq_freq_attr_scaling_available_freqs,
136 NULL, 140 NULL,
137}; 141};
@@ -155,13 +159,13 @@ static int __init sc520_freq_init(void)
155 int err; 159 int err;
156 160
157 /* Test if we have the right hardware */ 161 /* Test if we have the right hardware */
158 if(c->x86_vendor != X86_VENDOR_AMD || 162 if (c->x86_vendor != X86_VENDOR_AMD ||
159 c->x86 != 4 || c->x86_model != 9) { 163 c->x86 != 4 || c->x86_model != 9) {
160 dprintk("no Elan SC520 processor found!\n"); 164 dprintk("no Elan SC520 processor found!\n");
161 return -ENODEV; 165 return -ENODEV;
162 } 166 }
163 cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1); 167 cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
164 if(!cpuctl) { 168 if (!cpuctl) {
165 printk(KERN_ERR "sc520_freq: error: failed to remap memory\n"); 169 printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
166 return -ENOMEM; 170 return -ENOMEM;
167 } 171 }
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index dedc1e98f168..016c1a4fa3fc 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -39,7 +39,7 @@ static struct pci_dev *speedstep_chipset_dev;
39 39
40/* speedstep_processor 40/* speedstep_processor
41 */ 41 */
42static unsigned int speedstep_processor = 0; 42static unsigned int speedstep_processor;
43 43
44static u32 pmbase; 44static u32 pmbase;
45 45
@@ -54,7 +54,8 @@ static struct cpufreq_frequency_table speedstep_freqs[] = {
54}; 54};
55 55
56 56
57#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-ich", msg) 57#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
58 "speedstep-ich", msg)
58 59
59 60
60/** 61/**
@@ -62,7 +63,7 @@ static struct cpufreq_frequency_table speedstep_freqs[] = {
62 * 63 *
63 * Returns: -ENODEV if no register could be found 64 * Returns: -ENODEV if no register could be found
64 */ 65 */
65static int speedstep_find_register (void) 66static int speedstep_find_register(void)
66{ 67{
67 if (!speedstep_chipset_dev) 68 if (!speedstep_chipset_dev)
68 return -ENODEV; 69 return -ENODEV;
@@ -90,7 +91,7 @@ static int speedstep_find_register (void)
90 * 91 *
91 * Tries to change the SpeedStep state. 92 * Tries to change the SpeedStep state.
92 */ 93 */
93static void speedstep_set_state (unsigned int state) 94static void speedstep_set_state(unsigned int state)
94{ 95{
95 u8 pm2_blk; 96 u8 pm2_blk;
96 u8 value; 97 u8 value;
@@ -133,11 +134,11 @@ static void speedstep_set_state (unsigned int state)
133 134
134 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); 135 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
135 136
136 if (state == (value & 0x1)) { 137 if (state == (value & 0x1))
137 dprintk("change to %u MHz succeeded\n", (speedstep_get_processor_frequency(speedstep_processor) / 1000)); 138 dprintk("change to %u MHz succeeded\n",
138 } else { 139 speedstep_get_frequency(speedstep_processor) / 1000);
139 printk (KERN_ERR "cpufreq: change failed - I/O error\n"); 140 else
140 } 141 printk(KERN_ERR "cpufreq: change failed - I/O error\n");
141 142
142 return; 143 return;
143} 144}
@@ -149,7 +150,7 @@ static void speedstep_set_state (unsigned int state)
149 * Tries to activate the SpeedStep status and control registers. 150 * Tries to activate the SpeedStep status and control registers.
150 * Returns -EINVAL on an unsupported chipset, and zero on success. 151 * Returns -EINVAL on an unsupported chipset, and zero on success.
151 */ 152 */
152static int speedstep_activate (void) 153static int speedstep_activate(void)
153{ 154{
154 u16 value = 0; 155 u16 value = 0;
155 156
@@ -175,20 +176,18 @@ static int speedstep_activate (void)
175 * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected 176 * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
176 * chipset, or zero on failure. 177 * chipset, or zero on failure.
177 */ 178 */
178static unsigned int speedstep_detect_chipset (void) 179static unsigned int speedstep_detect_chipset(void)
179{ 180{
180 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, 181 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
181 PCI_DEVICE_ID_INTEL_82801DB_12, 182 PCI_DEVICE_ID_INTEL_82801DB_12,
182 PCI_ANY_ID, 183 PCI_ANY_ID, PCI_ANY_ID,
183 PCI_ANY_ID,
184 NULL); 184 NULL);
185 if (speedstep_chipset_dev) 185 if (speedstep_chipset_dev)
186 return 4; /* 4-M */ 186 return 4; /* 4-M */
187 187
188 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, 188 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
189 PCI_DEVICE_ID_INTEL_82801CA_12, 189 PCI_DEVICE_ID_INTEL_82801CA_12,
190 PCI_ANY_ID, 190 PCI_ANY_ID, PCI_ANY_ID,
191 PCI_ANY_ID,
192 NULL); 191 NULL);
193 if (speedstep_chipset_dev) 192 if (speedstep_chipset_dev)
194 return 3; /* 3-M */ 193 return 3; /* 3-M */
@@ -196,8 +195,7 @@ static unsigned int speedstep_detect_chipset (void)
196 195
197 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, 196 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
198 PCI_DEVICE_ID_INTEL_82801BA_10, 197 PCI_DEVICE_ID_INTEL_82801BA_10,
199 PCI_ANY_ID, 198 PCI_ANY_ID, PCI_ANY_ID,
200 PCI_ANY_ID,
201 NULL); 199 NULL);
202 if (speedstep_chipset_dev) { 200 if (speedstep_chipset_dev) {
203 /* speedstep.c causes lockups on Dell Inspirons 8000 and 201 /* speedstep.c causes lockups on Dell Inspirons 8000 and
@@ -208,8 +206,7 @@ static unsigned int speedstep_detect_chipset (void)
208 206
209 hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL, 207 hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
210 PCI_DEVICE_ID_INTEL_82815_MC, 208 PCI_DEVICE_ID_INTEL_82815_MC,
211 PCI_ANY_ID, 209 PCI_ANY_ID, PCI_ANY_ID,
212 PCI_ANY_ID,
213 NULL); 210 NULL);
214 211
215 if (!hostbridge) 212 if (!hostbridge)
@@ -236,7 +233,7 @@ static unsigned int _speedstep_get(const struct cpumask *cpus)
236 233
237 cpus_allowed = current->cpus_allowed; 234 cpus_allowed = current->cpus_allowed;
238 set_cpus_allowed_ptr(current, cpus); 235 set_cpus_allowed_ptr(current, cpus);
239 speed = speedstep_get_processor_frequency(speedstep_processor); 236 speed = speedstep_get_frequency(speedstep_processor);
240 set_cpus_allowed_ptr(current, &cpus_allowed); 237 set_cpus_allowed_ptr(current, &cpus_allowed);
241 dprintk("detected %u kHz as current frequency\n", speed); 238 dprintk("detected %u kHz as current frequency\n", speed);
242 return speed; 239 return speed;
@@ -251,11 +248,12 @@ static unsigned int speedstep_get(unsigned int cpu)
251 * speedstep_target - set a new CPUFreq policy 248 * speedstep_target - set a new CPUFreq policy
252 * @policy: new policy 249 * @policy: new policy
253 * @target_freq: the target frequency 250 * @target_freq: the target frequency
254 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 251 * @relation: how that frequency relates to achieved frequency
252 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
255 * 253 *
256 * Sets a new CPUFreq policy. 254 * Sets a new CPUFreq policy.
257 */ 255 */
258static int speedstep_target (struct cpufreq_policy *policy, 256static int speedstep_target(struct cpufreq_policy *policy,
259 unsigned int target_freq, 257 unsigned int target_freq,
260 unsigned int relation) 258 unsigned int relation)
261{ 259{
@@ -264,7 +262,8 @@ static int speedstep_target (struct cpufreq_policy *policy,
264 cpumask_t cpus_allowed; 262 cpumask_t cpus_allowed;
265 int i; 263 int i;
266 264
267 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) 265 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
266 target_freq, relation, &newstate))
268 return -EINVAL; 267 return -EINVAL;
269 268
270 freqs.old = _speedstep_get(policy->cpus); 269 freqs.old = _speedstep_get(policy->cpus);
@@ -308,7 +307,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
308 * Limit must be within speedstep_low_freq and speedstep_high_freq, with 307 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
309 * at least one border included. 308 * at least one border included.
310 */ 309 */
311static int speedstep_verify (struct cpufreq_policy *policy) 310static int speedstep_verify(struct cpufreq_policy *policy)
312{ 311{
313 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); 312 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
314} 313}
@@ -322,7 +321,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
322 321
323 /* only run on CPU to be set, or on its sibling */ 322 /* only run on CPU to be set, or on its sibling */
324#ifdef CONFIG_SMP 323#ifdef CONFIG_SMP
325 cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu)); 324 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
326#endif 325#endif
327 326
328 cpus_allowed = current->cpus_allowed; 327 cpus_allowed = current->cpus_allowed;
@@ -344,7 +343,8 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
344 return -EIO; 343 return -EIO;
345 344
346 dprintk("currently at %s speed setting - %i MHz\n", 345 dprintk("currently at %s speed setting - %i MHz\n",
347 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high", 346 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
347 ? "low" : "high",
348 (speed / 1000)); 348 (speed / 1000));
349 349
350 /* cpuinfo and default policy values */ 350 /* cpuinfo and default policy values */
@@ -352,9 +352,9 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
352 352
353 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); 353 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
354 if (result) 354 if (result)
355 return (result); 355 return result;
356 356
357 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); 357 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
358 358
359 return 0; 359 return 0;
360} 360}
@@ -366,7 +366,7 @@ static int speedstep_cpu_exit(struct cpufreq_policy *policy)
366 return 0; 366 return 0;
367} 367}
368 368
369static struct freq_attr* speedstep_attr[] = { 369static struct freq_attr *speedstep_attr[] = {
370 &cpufreq_freq_attr_scaling_available_freqs, 370 &cpufreq_freq_attr_scaling_available_freqs,
371 NULL, 371 NULL,
372}; 372};
@@ -396,13 +396,15 @@ static int __init speedstep_init(void)
396 /* detect processor */ 396 /* detect processor */
397 speedstep_processor = speedstep_detect_processor(); 397 speedstep_processor = speedstep_detect_processor();
398 if (!speedstep_processor) { 398 if (!speedstep_processor) {
399 dprintk("Intel(R) SpeedStep(TM) capable processor not found\n"); 399 dprintk("Intel(R) SpeedStep(TM) capable processor "
400 "not found\n");
400 return -ENODEV; 401 return -ENODEV;
401 } 402 }
402 403
403 /* detect chipset */ 404 /* detect chipset */
404 if (!speedstep_detect_chipset()) { 405 if (!speedstep_detect_chipset()) {
405 dprintk("Intel(R) SpeedStep(TM) for this chipset not (yet) available.\n"); 406 dprintk("Intel(R) SpeedStep(TM) for this chipset not "
407 "(yet) available.\n");
406 return -ENODEV; 408 return -ENODEV;
407 } 409 }
408 410
@@ -431,9 +433,11 @@ static void __exit speedstep_exit(void)
431} 433}
432 434
433 435
434MODULE_AUTHOR ("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>"); 436MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
435MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges."); 437 "Dominik Brodowski <linux@brodo.de>");
436MODULE_LICENSE ("GPL"); 438MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
439 "with ICH-M southbridges.");
440MODULE_LICENSE("GPL");
437 441
438module_init(speedstep_init); 442module_init(speedstep_init);
439module_exit(speedstep_exit); 443module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index cdac7d62369b..2e3c6862657b 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -16,12 +16,16 @@
16#include <linux/slab.h> 16#include <linux/slab.h>
17 17
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/tsc.h>
19#include "speedstep-lib.h" 20#include "speedstep-lib.h"
20 21
21#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-lib", msg) 22#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
23 "speedstep-lib", msg)
24
25#define PFX "speedstep-lib: "
22 26
23#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK 27#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
24static int relaxed_check = 0; 28static int relaxed_check;
25#else 29#else
26#define relaxed_check 0 30#define relaxed_check 0
27#endif 31#endif
@@ -30,14 +34,14 @@ static int relaxed_check = 0;
30 * GET PROCESSOR CORE SPEED IN KHZ * 34 * GET PROCESSOR CORE SPEED IN KHZ *
31 *********************************************************************/ 35 *********************************************************************/
32 36
33static unsigned int pentium3_get_frequency (unsigned int processor) 37static unsigned int pentium3_get_frequency(unsigned int processor)
34{ 38{
35 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ 39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
36 struct { 40 struct {
37 unsigned int ratio; /* Frequency Multiplier (x10) */ 41 unsigned int ratio; /* Frequency Multiplier (x10) */
38 u8 bitmap; /* power on configuration bits 42 u8 bitmap; /* power on configuration bits
39 [27, 25:22] (in MSR 0x2a) */ 43 [27, 25:22] (in MSR 0x2a) */
40 } msr_decode_mult [] = { 44 } msr_decode_mult[] = {
41 { 30, 0x01 }, 45 { 30, 0x01 },
42 { 35, 0x05 }, 46 { 35, 0x05 },
43 { 40, 0x02 }, 47 { 40, 0x02 },
@@ -52,7 +56,7 @@ static unsigned int pentium3_get_frequency (unsigned int processor)
52 { 85, 0x26 }, 56 { 85, 0x26 },
53 { 90, 0x20 }, 57 { 90, 0x20 },
54 { 100, 0x2b }, 58 { 100, 0x2b },
55 { 0, 0xff } /* error or unknown value */ 59 { 0, 0xff } /* error or unknown value */
56 }; 60 };
57 61
58 /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */ 62 /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
@@ -60,7 +64,7 @@ static unsigned int pentium3_get_frequency (unsigned int processor)
60 unsigned int value; /* Front Side Bus speed in MHz */ 64 unsigned int value; /* Front Side Bus speed in MHz */
61 u8 bitmap; /* power on configuration bits [18: 19] 65 u8 bitmap; /* power on configuration bits [18: 19]
62 (in MSR 0x2a) */ 66 (in MSR 0x2a) */
63 } msr_decode_fsb [] = { 67 } msr_decode_fsb[] = {
64 { 66, 0x0 }, 68 { 66, 0x0 },
65 { 100, 0x2 }, 69 { 100, 0x2 },
66 { 133, 0x1 }, 70 { 133, 0x1 },
@@ -85,7 +89,7 @@ static unsigned int pentium3_get_frequency (unsigned int processor)
85 } 89 }
86 90
87 /* decode the multiplier */ 91 /* decode the multiplier */
88 if (processor == SPEEDSTEP_PROCESSOR_PIII_C_EARLY) { 92 if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
89 dprintk("workaround for early PIIIs\n"); 93 dprintk("workaround for early PIIIs\n");
90 msr_lo &= 0x03c00000; 94 msr_lo &= 0x03c00000;
91 } else 95 } else
@@ -97,9 +101,10 @@ static unsigned int pentium3_get_frequency (unsigned int processor)
97 j++; 101 j++;
98 } 102 }
99 103
100 dprintk("speed is %u\n", (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100)); 104 dprintk("speed is %u\n",
105 (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
101 106
102 return (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100); 107 return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100;
103} 108}
104 109
105 110
@@ -112,20 +117,23 @@ static unsigned int pentiumM_get_frequency(void)
112 117
113 /* see table B-2 of 24547212.pdf */ 118 /* see table B-2 of 24547212.pdf */
114 if (msr_lo & 0x00040000) { 119 if (msr_lo & 0x00040000) {
115 printk(KERN_DEBUG "speedstep-lib: PM - invalid FSB: 0x%x 0x%x\n", msr_lo, msr_tmp); 120 printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
121 msr_lo, msr_tmp);
116 return 0; 122 return 0;
117 } 123 }
118 124
119 msr_tmp = (msr_lo >> 22) & 0x1f; 125 msr_tmp = (msr_lo >> 22) & 0x1f;
120 dprintk("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * 100 * 1000)); 126 dprintk("bits 22-26 are 0x%x, speed is %u\n",
127 msr_tmp, (msr_tmp * 100 * 1000));
121 128
122 return (msr_tmp * 100 * 1000); 129 return msr_tmp * 100 * 1000;
123} 130}
124 131
125static unsigned int pentium_core_get_frequency(void) 132static unsigned int pentium_core_get_frequency(void)
126{ 133{
127 u32 fsb = 0; 134 u32 fsb = 0;
128 u32 msr_lo, msr_tmp; 135 u32 msr_lo, msr_tmp;
136 int ret;
129 137
130 rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp); 138 rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
131 /* see table B-2 of 25366920.pdf */ 139 /* see table B-2 of 25366920.pdf */
@@ -153,12 +161,15 @@ static unsigned int pentium_core_get_frequency(void)
153 } 161 }
154 162
155 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); 163 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
156 dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); 164 dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
165 msr_lo, msr_tmp);
157 166
158 msr_tmp = (msr_lo >> 22) & 0x1f; 167 msr_tmp = (msr_lo >> 22) & 0x1f;
159 dprintk("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * fsb)); 168 dprintk("bits 22-26 are 0x%x, speed is %u\n",
169 msr_tmp, (msr_tmp * fsb));
160 170
161 return (msr_tmp * fsb); 171 ret = (msr_tmp * fsb);
172 return ret;
162} 173}
163 174
164 175
@@ -167,6 +178,16 @@ static unsigned int pentium4_get_frequency(void)
167 struct cpuinfo_x86 *c = &boot_cpu_data; 178 struct cpuinfo_x86 *c = &boot_cpu_data;
168 u32 msr_lo, msr_hi, mult; 179 u32 msr_lo, msr_hi, mult;
169 unsigned int fsb = 0; 180 unsigned int fsb = 0;
181 unsigned int ret;
182 u8 fsb_code;
183
184 /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
185 * to System Bus Frequency Ratio Field in the Processor Frequency
186 * Configuration Register of the MSR. Therefore the current
187 * frequency cannot be calculated and has to be measured.
188 */
189 if (c->x86_model < 2)
190 return cpu_khz;
170 191
171 rdmsr(0x2c, msr_lo, msr_hi); 192 rdmsr(0x2c, msr_lo, msr_hi);
172 193
@@ -177,62 +198,61 @@ static unsigned int pentium4_get_frequency(void)
177 * revision #12 in Table B-1: MSRs in the Pentium 4 and 198 * revision #12 in Table B-1: MSRs in the Pentium 4 and
178 * Intel Xeon Processors, on page B-4 and B-5. 199 * Intel Xeon Processors, on page B-4 and B-5.
179 */ 200 */
180 if (c->x86_model < 2) 201 fsb_code = (msr_lo >> 16) & 0x7;
202 switch (fsb_code) {
203 case 0:
181 fsb = 100 * 1000; 204 fsb = 100 * 1000;
182 else { 205 break;
183 u8 fsb_code = (msr_lo >> 16) & 0x7; 206 case 1:
184 switch (fsb_code) { 207 fsb = 13333 * 10;
185 case 0: 208 break;
186 fsb = 100 * 1000; 209 case 2:
187 break; 210 fsb = 200 * 1000;
188 case 1: 211 break;
189 fsb = 13333 * 10;
190 break;
191 case 2:
192 fsb = 200 * 1000;
193 break;
194 }
195 } 212 }
196 213
197 if (!fsb) 214 if (!fsb)
198 printk(KERN_DEBUG "speedstep-lib: couldn't detect FSB speed. Please send an e-mail to <linux@brodo.de>\n"); 215 printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
216 "Please send an e-mail to <linux@brodo.de>\n");
199 217
200 /* Multiplier. */ 218 /* Multiplier. */
201 mult = msr_lo >> 24; 219 mult = msr_lo >> 24;
202 220
203 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult)); 221 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
222 fsb, mult, (fsb * mult));
204 223
205 return (fsb * mult); 224 ret = (fsb * mult);
225 return ret;
206} 226}
207 227
208 228
209unsigned int speedstep_get_processor_frequency(unsigned int processor) 229unsigned int speedstep_get_frequency(unsigned int processor)
210{ 230{
211 switch (processor) { 231 switch (processor) {
212 case SPEEDSTEP_PROCESSOR_PCORE: 232 case SPEEDSTEP_CPU_PCORE:
213 return pentium_core_get_frequency(); 233 return pentium_core_get_frequency();
214 case SPEEDSTEP_PROCESSOR_PM: 234 case SPEEDSTEP_CPU_PM:
215 return pentiumM_get_frequency(); 235 return pentiumM_get_frequency();
216 case SPEEDSTEP_PROCESSOR_P4D: 236 case SPEEDSTEP_CPU_P4D:
217 case SPEEDSTEP_PROCESSOR_P4M: 237 case SPEEDSTEP_CPU_P4M:
218 return pentium4_get_frequency(); 238 return pentium4_get_frequency();
219 case SPEEDSTEP_PROCESSOR_PIII_T: 239 case SPEEDSTEP_CPU_PIII_T:
220 case SPEEDSTEP_PROCESSOR_PIII_C: 240 case SPEEDSTEP_CPU_PIII_C:
221 case SPEEDSTEP_PROCESSOR_PIII_C_EARLY: 241 case SPEEDSTEP_CPU_PIII_C_EARLY:
222 return pentium3_get_frequency(processor); 242 return pentium3_get_frequency(processor);
223 default: 243 default:
224 return 0; 244 return 0;
225 }; 245 };
226 return 0; 246 return 0;
227} 247}
228EXPORT_SYMBOL_GPL(speedstep_get_processor_frequency); 248EXPORT_SYMBOL_GPL(speedstep_get_frequency);
229 249
230 250
231/********************************************************************* 251/*********************************************************************
232 * DETECT SPEEDSTEP-CAPABLE PROCESSOR * 252 * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
233 *********************************************************************/ 253 *********************************************************************/
234 254
235unsigned int speedstep_detect_processor (void) 255unsigned int speedstep_detect_processor(void)
236{ 256{
237 struct cpuinfo_x86 *c = &cpu_data(0); 257 struct cpuinfo_x86 *c = &cpu_data(0);
238 u32 ebx, msr_lo, msr_hi; 258 u32 ebx, msr_lo, msr_hi;
@@ -261,7 +281,7 @@ unsigned int speedstep_detect_processor (void)
261 * sample has ebx = 0x0f, production has 0x0e. 281 * sample has ebx = 0x0f, production has 0x0e.
262 */ 282 */
263 if ((ebx == 0x0e) || (ebx == 0x0f)) 283 if ((ebx == 0x0e) || (ebx == 0x0f))
264 return SPEEDSTEP_PROCESSOR_P4M; 284 return SPEEDSTEP_CPU_P4M;
265 break; 285 break;
266 case 7: 286 case 7:
267 /* 287 /*
@@ -272,7 +292,7 @@ unsigned int speedstep_detect_processor (void)
272 * samples are only of B-stepping... 292 * samples are only of B-stepping...
273 */ 293 */
274 if (ebx == 0x0e) 294 if (ebx == 0x0e)
275 return SPEEDSTEP_PROCESSOR_P4M; 295 return SPEEDSTEP_CPU_P4M;
276 break; 296 break;
277 case 9: 297 case 9:
278 /* 298 /*
@@ -288,10 +308,13 @@ unsigned int speedstep_detect_processor (void)
288 * M-P4-Ms may have either ebx=0xe or 0xf [see above] 308 * M-P4-Ms may have either ebx=0xe or 0xf [see above]
289 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf] 309 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
290 * also, M-P4M HTs have ebx=0x8, too 310 * also, M-P4M HTs have ebx=0x8, too
291 * For now, they are distinguished by the model_id string 311 * For now, they are distinguished by the model_id
312 * string
292 */ 313 */
293 if ((ebx == 0x0e) || (strstr(c->x86_model_id,"Mobile Intel(R) Pentium(R) 4") != NULL)) 314 if ((ebx == 0x0e) ||
294 return SPEEDSTEP_PROCESSOR_P4M; 315 (strstr(c->x86_model_id,
316 "Mobile Intel(R) Pentium(R) 4") != NULL))
317 return SPEEDSTEP_CPU_P4M;
295 break; 318 break;
296 default: 319 default:
297 break; 320 break;
@@ -301,7 +324,8 @@ unsigned int speedstep_detect_processor (void)
301 324
302 switch (c->x86_model) { 325 switch (c->x86_model) {
303 case 0x0B: /* Intel PIII [Tualatin] */ 326 case 0x0B: /* Intel PIII [Tualatin] */
304 /* cpuid_ebx(1) is 0x04 for desktop PIII, 0x06 for mobile PIII-M */ 327 /* cpuid_ebx(1) is 0x04 for desktop PIII,
328 * 0x06 for mobile PIII-M */
305 ebx = cpuid_ebx(0x00000001); 329 ebx = cpuid_ebx(0x00000001);
306 dprintk("ebx is %x\n", ebx); 330 dprintk("ebx is %x\n", ebx);
307 331
@@ -313,14 +337,15 @@ unsigned int speedstep_detect_processor (void)
313 /* So far all PIII-M processors support SpeedStep. See 337 /* So far all PIII-M processors support SpeedStep. See
314 * Intel's 24540640.pdf of June 2003 338 * Intel's 24540640.pdf of June 2003
315 */ 339 */
316 return SPEEDSTEP_PROCESSOR_PIII_T; 340 return SPEEDSTEP_CPU_PIII_T;
317 341
318 case 0x08: /* Intel PIII [Coppermine] */ 342 case 0x08: /* Intel PIII [Coppermine] */
319 343
320 /* all mobile PIII Coppermines have FSB 100 MHz 344 /* all mobile PIII Coppermines have FSB 100 MHz
321 * ==> sort out a few desktop PIIIs. */ 345 * ==> sort out a few desktop PIIIs. */
322 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi); 346 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
323 dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", msr_lo, msr_hi); 347 dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
348 msr_lo, msr_hi);
324 msr_lo &= 0x00c0000; 349 msr_lo &= 0x00c0000;
325 if (msr_lo != 0x0080000) 350 if (msr_lo != 0x0080000)
326 return 0; 351 return 0;
@@ -332,13 +357,15 @@ unsigned int speedstep_detect_processor (void)
332 * bit 56 or 57 is set 357 * bit 56 or 57 is set
333 */ 358 */
334 rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi); 359 rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
335 dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", msr_lo, msr_hi); 360 dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n",
336 if ((msr_hi & (1<<18)) && (relaxed_check ? 1 : (msr_hi & (3<<24)))) { 361 msr_lo, msr_hi);
362 if ((msr_hi & (1<<18)) &&
363 (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
337 if (c->x86_mask == 0x01) { 364 if (c->x86_mask == 0x01) {
338 dprintk("early PIII version\n"); 365 dprintk("early PIII version\n");
339 return SPEEDSTEP_PROCESSOR_PIII_C_EARLY; 366 return SPEEDSTEP_CPU_PIII_C_EARLY;
340 } else 367 } else
341 return SPEEDSTEP_PROCESSOR_PIII_C; 368 return SPEEDSTEP_CPU_PIII_C;
342 } 369 }
343 370
344 default: 371 default:
@@ -369,7 +396,7 @@ unsigned int speedstep_get_freqs(unsigned int processor,
369 dprintk("trying to determine both speeds\n"); 396 dprintk("trying to determine both speeds\n");
370 397
371 /* get current speed */ 398 /* get current speed */
372 prev_speed = speedstep_get_processor_frequency(processor); 399 prev_speed = speedstep_get_frequency(processor);
373 if (!prev_speed) 400 if (!prev_speed)
374 return -EIO; 401 return -EIO;
375 402
@@ -379,7 +406,7 @@ unsigned int speedstep_get_freqs(unsigned int processor,
379 406
380 /* switch to low state */ 407 /* switch to low state */
381 set_state(SPEEDSTEP_LOW); 408 set_state(SPEEDSTEP_LOW);
382 *low_speed = speedstep_get_processor_frequency(processor); 409 *low_speed = speedstep_get_frequency(processor);
383 if (!*low_speed) { 410 if (!*low_speed) {
384 ret = -EIO; 411 ret = -EIO;
385 goto out; 412 goto out;
@@ -398,7 +425,7 @@ unsigned int speedstep_get_freqs(unsigned int processor,
398 if (transition_latency) 425 if (transition_latency)
399 do_gettimeofday(&tv2); 426 do_gettimeofday(&tv2);
400 427
401 *high_speed = speedstep_get_processor_frequency(processor); 428 *high_speed = speedstep_get_frequency(processor);
402 if (!*high_speed) { 429 if (!*high_speed) {
403 ret = -EIO; 430 ret = -EIO;
404 goto out; 431 goto out;
@@ -426,9 +453,12 @@ unsigned int speedstep_get_freqs(unsigned int processor,
426 /* check if the latency measurement is too high or too low 453 /* check if the latency measurement is too high or too low
427 * and set it to a safe value (500uSec) in that case 454 * and set it to a safe value (500uSec) in that case
428 */ 455 */
429 if (*transition_latency > 10000000 || *transition_latency < 50000) { 456 if (*transition_latency > 10000000 ||
430 printk (KERN_WARNING "speedstep: frequency transition measured seems out of " 457 *transition_latency < 50000) {
431 "range (%u nSec), falling back to a safe one of %u nSec.\n", 458 printk(KERN_WARNING PFX "frequency transition "
459 "measured seems out of range (%u "
460 "nSec), falling back to a safe one of"
461 "%u nSec.\n",
432 *transition_latency, 500000); 462 *transition_latency, 500000);
433 *transition_latency = 500000; 463 *transition_latency = 500000;
434 } 464 }
@@ -436,15 +466,16 @@ unsigned int speedstep_get_freqs(unsigned int processor,
436 466
437out: 467out:
438 local_irq_restore(flags); 468 local_irq_restore(flags);
439 return (ret); 469 return ret;
440} 470}
441EXPORT_SYMBOL_GPL(speedstep_get_freqs); 471EXPORT_SYMBOL_GPL(speedstep_get_freqs);
442 472
443#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK 473#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
444module_param(relaxed_check, int, 0444); 474module_param(relaxed_check, int, 0444);
445MODULE_PARM_DESC(relaxed_check, "Don't do all checks for speedstep capability."); 475MODULE_PARM_DESC(relaxed_check,
476 "Don't do all checks for speedstep capability.");
446#endif 477#endif
447 478
448MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>"); 479MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
449MODULE_DESCRIPTION ("Library for Intel SpeedStep 1 or 2 cpufreq drivers."); 480MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
450MODULE_LICENSE ("GPL"); 481MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
index b11bcc608cac..2b6c04e5a304 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
@@ -12,17 +12,17 @@
12 12
13/* processors */ 13/* processors */
14 14
15#define SPEEDSTEP_PROCESSOR_PIII_C_EARLY 0x00000001 /* Coppermine core */ 15#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */
16#define SPEEDSTEP_PROCESSOR_PIII_C 0x00000002 /* Coppermine core */ 16#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */
17#define SPEEDSTEP_PROCESSOR_PIII_T 0x00000003 /* Tualatin core */ 17#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */
18#define SPEEDSTEP_PROCESSOR_P4M 0x00000004 /* P4-M */ 18#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */
19 19
20/* the following processors are not speedstep-capable and are not auto-detected 20/* the following processors are not speedstep-capable and are not auto-detected
21 * in speedstep_detect_processor(). However, their speed can be detected using 21 * in speedstep_detect_processor(). However, their speed can be detected using
22 * the speedstep_get_processor_frequency() call. */ 22 * the speedstep_get_frequency() call. */
23#define SPEEDSTEP_PROCESSOR_PM 0xFFFFFF03 /* Pentium M */ 23#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */
24#define SPEEDSTEP_PROCESSOR_P4D 0xFFFFFF04 /* desktop P4 */ 24#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */
25#define SPEEDSTEP_PROCESSOR_PCORE 0xFFFFFF05 /* Core */ 25#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */
26 26
27/* speedstep states -- only two of them */ 27/* speedstep states -- only two of them */
28 28
@@ -34,7 +34,7 @@
34extern unsigned int speedstep_detect_processor (void); 34extern unsigned int speedstep_detect_processor (void);
35 35
36/* detect the current speed (in khz) of the processor */ 36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_processor_frequency(unsigned int processor); 37extern unsigned int speedstep_get_frequency(unsigned int processor);
38 38
39 39
40/* detect the low and high speeds of the processor. The callback 40/* detect the low and high speeds of the processor. The callback
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index 8a85c93bd62a..befea088e4f5 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -19,8 +19,8 @@
19#include <linux/cpufreq.h> 19#include <linux/cpufreq.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/delay.h> 21#include <linux/delay.h>
22#include <linux/io.h>
22#include <asm/ist.h> 23#include <asm/ist.h>
23#include <asm/io.h>
24 24
25#include "speedstep-lib.h" 25#include "speedstep-lib.h"
26 26
@@ -30,12 +30,12 @@
30 * If user gives it, these are used. 30 * If user gives it, these are used.
31 * 31 *
32 */ 32 */
33static int smi_port = 0; 33static int smi_port;
34static int smi_cmd = 0; 34static int smi_cmd;
35static unsigned int smi_sig = 0; 35static unsigned int smi_sig;
36 36
37/* info about the processor */ 37/* info about the processor */
38static unsigned int speedstep_processor = 0; 38static unsigned int speedstep_processor;
39 39
40/* 40/*
41 * There are only two frequency states for each processor. Values 41 * There are only two frequency states for each processor. Values
@@ -56,12 +56,13 @@ static struct cpufreq_frequency_table speedstep_freqs[] = {
56 * of DMA activity going on? */ 56 * of DMA activity going on? */
57#define SMI_TRIES 5 57#define SMI_TRIES 5
58 58
59#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-smi", msg) 59#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
60 "speedstep-smi", msg)
60 61
61/** 62/**
62 * speedstep_smi_ownership 63 * speedstep_smi_ownership
63 */ 64 */
64static int speedstep_smi_ownership (void) 65static int speedstep_smi_ownership(void)
65{ 66{
66 u32 command, result, magic, dummy; 67 u32 command, result, magic, dummy;
67 u32 function = GET_SPEEDSTEP_OWNER; 68 u32 function = GET_SPEEDSTEP_OWNER;
@@ -70,16 +71,18 @@ static int speedstep_smi_ownership (void)
70 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); 71 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
71 magic = virt_to_phys(magic_data); 72 magic = virt_to_phys(magic_data);
72 73
73 dprintk("trying to obtain ownership with command %x at port %x\n", command, smi_port); 74 dprintk("trying to obtain ownership with command %x at port %x\n",
75 command, smi_port);
74 76
75 __asm__ __volatile__( 77 __asm__ __volatile__(
76 "push %%ebp\n" 78 "push %%ebp\n"
77 "out %%al, (%%dx)\n" 79 "out %%al, (%%dx)\n"
78 "pop %%ebp\n" 80 "pop %%ebp\n"
79 : "=D" (result), "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy), 81 : "=D" (result),
80 "=S" (dummy) 82 "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
83 "=S" (dummy)
81 : "a" (command), "b" (function), "c" (0), "d" (smi_port), 84 : "a" (command), "b" (function), "c" (0), "d" (smi_port),
82 "D" (0), "S" (magic) 85 "D" (0), "S" (magic)
83 : "memory" 86 : "memory"
84 ); 87 );
85 88
@@ -97,10 +100,10 @@ static int speedstep_smi_ownership (void)
97 * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing 100 * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
98 * shows that the latter occurs if !(ist_info.event & 0xFFFF). 101 * shows that the latter occurs if !(ist_info.event & 0xFFFF).
99 */ 102 */
100static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high) 103static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
101{ 104{
102 u32 command, result = 0, edi, high_mhz, low_mhz, dummy; 105 u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
103 u32 state=0; 106 u32 state = 0;
104 u32 function = GET_SPEEDSTEP_FREQS; 107 u32 function = GET_SPEEDSTEP_FREQS;
105 108
106 if (!(ist_info.event & 0xFFFF)) { 109 if (!(ist_info.event & 0xFFFF)) {
@@ -110,17 +113,25 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
110 113
111 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); 114 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
112 115
113 dprintk("trying to determine frequencies with command %x at port %x\n", command, smi_port); 116 dprintk("trying to determine frequencies with command %x at port %x\n",
117 command, smi_port);
114 118
115 __asm__ __volatile__( 119 __asm__ __volatile__(
116 "push %%ebp\n" 120 "push %%ebp\n"
117 "out %%al, (%%dx)\n" 121 "out %%al, (%%dx)\n"
118 "pop %%ebp" 122 "pop %%ebp"
119 : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi), "=S" (dummy) 123 : "=a" (result),
120 : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0) 124 "=b" (high_mhz),
125 "=c" (low_mhz),
126 "=d" (state), "=D" (edi), "=S" (dummy)
127 : "a" (command),
128 "b" (function),
129 "c" (state),
130 "d" (smi_port), "S" (0), "D" (0)
121 ); 131 );
122 132
123 dprintk("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz); 133 dprintk("result %x, low_freq %u, high_freq %u\n",
134 result, low_mhz, high_mhz);
124 135
125 /* abort if results are obviously incorrect... */ 136 /* abort if results are obviously incorrect... */
126 if ((high_mhz + low_mhz) < 600) 137 if ((high_mhz + low_mhz) < 600)
@@ -137,26 +148,30 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
137 * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) 148 * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
138 * 149 *
139 */ 150 */
140static int speedstep_get_state (void) 151static int speedstep_get_state(void)
141{ 152{
142 u32 function=GET_SPEEDSTEP_STATE; 153 u32 function = GET_SPEEDSTEP_STATE;
143 u32 result, state, edi, command, dummy; 154 u32 result, state, edi, command, dummy;
144 155
145 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); 156 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
146 157
147 dprintk("trying to determine current setting with command %x at port %x\n", command, smi_port); 158 dprintk("trying to determine current setting with command %x "
159 "at port %x\n", command, smi_port);
148 160
149 __asm__ __volatile__( 161 __asm__ __volatile__(
150 "push %%ebp\n" 162 "push %%ebp\n"
151 "out %%al, (%%dx)\n" 163 "out %%al, (%%dx)\n"
152 "pop %%ebp\n" 164 "pop %%ebp\n"
153 : "=a" (result), "=b" (state), "=D" (edi), "=c" (dummy), "=d" (dummy), "=S" (dummy) 165 : "=a" (result),
154 : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0), "D" (0) 166 "=b" (state), "=D" (edi),
167 "=c" (dummy), "=d" (dummy), "=S" (dummy)
168 : "a" (command), "b" (function), "c" (0),
169 "d" (smi_port), "S" (0), "D" (0)
155 ); 170 );
156 171
157 dprintk("state is %x, result is %x\n", state, result); 172 dprintk("state is %x, result is %x\n", state, result);
158 173
159 return (state & 1); 174 return state & 1;
160} 175}
161 176
162 177
@@ -165,11 +180,11 @@ static int speedstep_get_state (void)
165 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) 180 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
166 * 181 *
167 */ 182 */
168static void speedstep_set_state (unsigned int state) 183static void speedstep_set_state(unsigned int state)
169{ 184{
170 unsigned int result = 0, command, new_state, dummy; 185 unsigned int result = 0, command, new_state, dummy;
171 unsigned long flags; 186 unsigned long flags;
172 unsigned int function=SET_SPEEDSTEP_STATE; 187 unsigned int function = SET_SPEEDSTEP_STATE;
173 unsigned int retry = 0; 188 unsigned int retry = 0;
174 189
175 if (state > 0x1) 190 if (state > 0x1)
@@ -180,11 +195,14 @@ static void speedstep_set_state (unsigned int state)
180 195
181 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); 196 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
182 197
183 dprintk("trying to set frequency to state %u with command %x at port %x\n", state, command, smi_port); 198 dprintk("trying to set frequency to state %u "
199 "with command %x at port %x\n",
200 state, command, smi_port);
184 201
185 do { 202 do {
186 if (retry) { 203 if (retry) {
187 dprintk("retry %u, previous result %u, waiting...\n", retry, result); 204 dprintk("retry %u, previous result %u, waiting...\n",
205 retry, result);
188 mdelay(retry * 50); 206 mdelay(retry * 50);
189 } 207 }
190 retry++; 208 retry++;
@@ -192,20 +210,26 @@ static void speedstep_set_state (unsigned int state)
192 "push %%ebp\n" 210 "push %%ebp\n"
193 "out %%al, (%%dx)\n" 211 "out %%al, (%%dx)\n"
194 "pop %%ebp" 212 "pop %%ebp"
195 : "=b" (new_state), "=D" (result), "=c" (dummy), "=a" (dummy), 213 : "=b" (new_state), "=D" (result),
196 "=d" (dummy), "=S" (dummy) 214 "=c" (dummy), "=a" (dummy),
197 : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0) 215 "=d" (dummy), "=S" (dummy)
216 : "a" (command), "b" (function), "c" (state),
217 "d" (smi_port), "S" (0), "D" (0)
198 ); 218 );
199 } while ((new_state != state) && (retry <= SMI_TRIES)); 219 } while ((new_state != state) && (retry <= SMI_TRIES));
200 220
201 /* enable IRQs */ 221 /* enable IRQs */
202 local_irq_restore(flags); 222 local_irq_restore(flags);
203 223
204 if (new_state == state) { 224 if (new_state == state)
205 dprintk("change to %u MHz succeeded after %u tries with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result); 225 dprintk("change to %u MHz succeeded after %u tries "
206 } else { 226 "with result %u\n",
207 printk(KERN_ERR "cpufreq: change to state %u failed with new_state %u and result %u\n", state, new_state, result); 227 (speedstep_freqs[new_state].frequency / 1000),
208 } 228 retry, result);
229 else
230 printk(KERN_ERR "cpufreq: change to state %u "
231 "failed with new_state %u and result %u\n",
232 state, new_state, result);
209 233
210 return; 234 return;
211} 235}
@@ -219,13 +243,14 @@ static void speedstep_set_state (unsigned int state)
219 * 243 *
220 * Sets a new CPUFreq policy/freq. 244 * Sets a new CPUFreq policy/freq.
221 */ 245 */
222static int speedstep_target (struct cpufreq_policy *policy, 246static int speedstep_target(struct cpufreq_policy *policy,
223 unsigned int target_freq, unsigned int relation) 247 unsigned int target_freq, unsigned int relation)
224{ 248{
225 unsigned int newstate = 0; 249 unsigned int newstate = 0;
226 struct cpufreq_freqs freqs; 250 struct cpufreq_freqs freqs;
227 251
228 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) 252 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
253 target_freq, relation, &newstate))
229 return -EINVAL; 254 return -EINVAL;
230 255
231 freqs.old = speedstep_freqs[speedstep_get_state()].frequency; 256 freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
@@ -250,7 +275,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
250 * Limit must be within speedstep_low_freq and speedstep_high_freq, with 275 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
251 * at least one border included. 276 * at least one border included.
252 */ 277 */
253static int speedstep_verify (struct cpufreq_policy *policy) 278static int speedstep_verify(struct cpufreq_policy *policy)
254{ 279{
255 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); 280 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
256} 281}
@@ -259,7 +284,8 @@ static int speedstep_verify (struct cpufreq_policy *policy)
259static int speedstep_cpu_init(struct cpufreq_policy *policy) 284static int speedstep_cpu_init(struct cpufreq_policy *policy)
260{ 285{
261 int result; 286 int result;
262 unsigned int speed,state; 287 unsigned int speed, state;
288 unsigned int *low, *high;
263 289
264 /* capability check */ 290 /* capability check */
265 if (policy->cpu != 0) 291 if (policy->cpu != 0)
@@ -272,19 +298,23 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
272 } 298 }
273 299
274 /* detect low and high frequency */ 300 /* detect low and high frequency */
275 result = speedstep_smi_get_freqs(&speedstep_freqs[SPEEDSTEP_LOW].frequency, 301 low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
276 &speedstep_freqs[SPEEDSTEP_HIGH].frequency); 302 high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
303
304 result = speedstep_smi_get_freqs(low, high);
277 if (result) { 305 if (result) {
278 /* fall back to speedstep_lib.c dection mechanism: try both states out */ 306 /* fall back to speedstep_lib.c dection mechanism:
279 dprintk("could not detect low and high frequencies by SMI call.\n"); 307 * try both states out */
308 dprintk("could not detect low and high frequencies "
309 "by SMI call.\n");
280 result = speedstep_get_freqs(speedstep_processor, 310 result = speedstep_get_freqs(speedstep_processor,
281 &speedstep_freqs[SPEEDSTEP_LOW].frequency, 311 low, high,
282 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
283 NULL, 312 NULL,
284 &speedstep_set_state); 313 &speedstep_set_state);
285 314
286 if (result) { 315 if (result) {
287 dprintk("could not detect two different speeds -- aborting.\n"); 316 dprintk("could not detect two different speeds"
317 " -- aborting.\n");
288 return result; 318 return result;
289 } else 319 } else
290 dprintk("workaround worked.\n"); 320 dprintk("workaround worked.\n");
@@ -295,7 +325,8 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
295 speed = speedstep_freqs[state].frequency; 325 speed = speedstep_freqs[state].frequency;
296 326
297 dprintk("currently at %s speed setting - %i MHz\n", 327 dprintk("currently at %s speed setting - %i MHz\n",
298 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high", 328 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
329 ? "low" : "high",
299 (speed / 1000)); 330 (speed / 1000));
300 331
301 /* cpuinfo and default policy values */ 332 /* cpuinfo and default policy values */
@@ -304,7 +335,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
304 335
305 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); 336 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
306 if (result) 337 if (result)
307 return (result); 338 return result;
308 339
309 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); 340 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
310 341
@@ -321,7 +352,7 @@ static unsigned int speedstep_get(unsigned int cpu)
321{ 352{
322 if (cpu) 353 if (cpu)
323 return -ENODEV; 354 return -ENODEV;
324 return speedstep_get_processor_frequency(speedstep_processor); 355 return speedstep_get_frequency(speedstep_processor);
325} 356}
326 357
327 358
@@ -335,7 +366,7 @@ static int speedstep_resume(struct cpufreq_policy *policy)
335 return result; 366 return result;
336} 367}
337 368
338static struct freq_attr* speedstep_attr[] = { 369static struct freq_attr *speedstep_attr[] = {
339 &cpufreq_freq_attr_scaling_available_freqs, 370 &cpufreq_freq_attr_scaling_available_freqs,
340 NULL, 371 NULL,
341}; 372};
@@ -364,21 +395,23 @@ static int __init speedstep_init(void)
364 speedstep_processor = speedstep_detect_processor(); 395 speedstep_processor = speedstep_detect_processor();
365 396
366 switch (speedstep_processor) { 397 switch (speedstep_processor) {
367 case SPEEDSTEP_PROCESSOR_PIII_T: 398 case SPEEDSTEP_CPU_PIII_T:
368 case SPEEDSTEP_PROCESSOR_PIII_C: 399 case SPEEDSTEP_CPU_PIII_C:
369 case SPEEDSTEP_PROCESSOR_PIII_C_EARLY: 400 case SPEEDSTEP_CPU_PIII_C_EARLY:
370 break; 401 break;
371 default: 402 default:
372 speedstep_processor = 0; 403 speedstep_processor = 0;
373 } 404 }
374 405
375 if (!speedstep_processor) { 406 if (!speedstep_processor) {
376 dprintk ("No supported Intel CPU detected.\n"); 407 dprintk("No supported Intel CPU detected.\n");
377 return -ENODEV; 408 return -ENODEV;
378 } 409 }
379 410
380 dprintk("signature:0x%.8lx, command:0x%.8lx, event:0x%.8lx, perf_level:0x%.8lx.\n", 411 dprintk("signature:0x%.8lx, command:0x%.8lx, "
381 ist_info.signature, ist_info.command, ist_info.event, ist_info.perf_level); 412 "event:0x%.8lx, perf_level:0x%.8lx.\n",
413 ist_info.signature, ist_info.command,
414 ist_info.event, ist_info.perf_level);
382 415
383 /* Error if no IST-SMI BIOS or no PARM 416 /* Error if no IST-SMI BIOS or no PARM
384 sig= 'ISGE' aka 'Intel Speedstep Gate E' */ 417 sig= 'ISGE' aka 'Intel Speedstep Gate E' */
@@ -416,17 +449,20 @@ static void __exit speedstep_exit(void)
416 cpufreq_unregister_driver(&speedstep_driver); 449 cpufreq_unregister_driver(&speedstep_driver);
417} 450}
418 451
419module_param(smi_port, int, 0444); 452module_param(smi_port, int, 0444);
420module_param(smi_cmd, int, 0444); 453module_param(smi_cmd, int, 0444);
421module_param(smi_sig, uint, 0444); 454module_param(smi_sig, uint, 0444);
422 455
423MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value -- Intel's default setting is 0xb2"); 456MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
424MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value -- Intel's default setting is 0x82"); 457 "-- Intel's default setting is 0xb2");
425MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the SMI interface."); 458MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
459 "-- Intel's default setting is 0x82");
460MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
461 "SMI interface.");
426 462
427MODULE_AUTHOR ("Hiroshi Miura"); 463MODULE_AUTHOR("Hiroshi Miura");
428MODULE_DESCRIPTION ("Speedstep driver for IST applet SMI interface."); 464MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
429MODULE_LICENSE ("GPL"); 465MODULE_LICENSE("GPL");
430 466
431module_init(speedstep_init); 467module_init(speedstep_init);
432module_exit(speedstep_exit); 468module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index ffd0f5ed071a..593171e967ef 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -61,23 +61,23 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
61 */ 61 */
62static unsigned char Cx86_dir0_msb __cpuinitdata = 0; 62static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
63 63
64static char Cx86_model[][9] __cpuinitdata = { 64static const char __cpuinitconst Cx86_model[][9] = {
65 "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", 65 "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
66 "M II ", "Unknown" 66 "M II ", "Unknown"
67}; 67};
68static char Cx486_name[][5] __cpuinitdata = { 68static const char __cpuinitconst Cx486_name[][5] = {
69 "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", 69 "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
70 "SRx2", "DRx2" 70 "SRx2", "DRx2"
71}; 71};
72static char Cx486S_name[][4] __cpuinitdata = { 72static const char __cpuinitconst Cx486S_name[][4] = {
73 "S", "S2", "Se", "S2e" 73 "S", "S2", "Se", "S2e"
74}; 74};
75static char Cx486D_name[][4] __cpuinitdata = { 75static const char __cpuinitconst Cx486D_name[][4] = {
76 "DX", "DX2", "?", "?", "?", "DX4" 76 "DX", "DX2", "?", "?", "?", "DX4"
77}; 77};
78static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock"; 78static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
79static char cyrix_model_mult1[] __cpuinitdata = "12??43"; 79static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
80static char cyrix_model_mult2[] __cpuinitdata = "12233445"; 80static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
81 81
82/* 82/*
83 * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old 83 * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -435,7 +435,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
435 } 435 }
436} 436}
437 437
438static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { 438static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
439 .c_vendor = "Cyrix", 439 .c_vendor = "Cyrix",
440 .c_ident = { "CyrixInstead" }, 440 .c_ident = { "CyrixInstead" },
441 .c_early_init = early_init_cyrix, 441 .c_early_init = early_init_cyrix,
@@ -446,7 +446,7 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
446 446
447cpu_dev_register(cyrix_cpu_dev); 447cpu_dev_register(cyrix_cpu_dev);
448 448
449static struct cpu_dev nsc_cpu_dev __cpuinitdata = { 449static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
450 .c_vendor = "NSC", 450 .c_vendor = "NSC",
451 .c_ident = { "Geode by NSC" }, 451 .c_ident = { "Geode by NSC" },
452 .c_init = init_nsc, 452 .c_init = init_nsc,
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 25c559ba8d54..7437fa133c02 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -4,6 +4,7 @@
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/bitops.h> 5#include <linux/bitops.h>
6#include <linux/smp.h> 6#include <linux/smp.h>
7#include <linux/sched.h>
7#include <linux/thread_info.h> 8#include <linux/thread_info.h>
8#include <linux/module.h> 9#include <linux/module.h>
9 10
@@ -13,6 +14,7 @@
13#include <asm/uaccess.h> 14#include <asm/uaccess.h>
14#include <asm/ds.h> 15#include <asm/ds.h>
15#include <asm/bugs.h> 16#include <asm/bugs.h>
17#include <asm/cpu.h>
16 18
17#ifdef CONFIG_X86_64 19#ifdef CONFIG_X86_64
18#include <asm/topology.h> 20#include <asm/topology.h>
@@ -53,13 +55,23 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
53 c->x86_cache_alignment = 128; 55 c->x86_cache_alignment = 128;
54#endif 56#endif
55 57
58 /* CPUID workaround for 0F33/0F34 CPU */
59 if (c->x86 == 0xF && c->x86_model == 0x3
60 && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
61 c->x86_phys_bits = 36;
62
56 /* 63 /*
57 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate 64 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
58 * with P/T states and does not stop in deep C-states 65 * with P/T states and does not stop in deep C-states.
66 *
67 * It is also reliable across cores and sockets. (but not across
68 * cabinets - we turn it off in that case explicitly.)
59 */ 69 */
60 if (c->x86_power & (1 << 8)) { 70 if (c->x86_power & (1 << 8)) {
61 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
62 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
73 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
74 sched_clock_stable = 1;
63 } 75 }
64 76
65 /* 77 /*
@@ -110,6 +122,28 @@ static void __cpuinit trap_init_f00f_bug(void)
110} 122}
111#endif 123#endif
112 124
125static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
126{
127#ifdef CONFIG_SMP
128 /* calling is from identify_secondary_cpu() ? */
129 if (c->cpu_index == boot_cpu_id)
130 return;
131
132 /*
133 * Mask B, Pentium, but not Pentium MMX
134 */
135 if (c->x86 == 5 &&
136 c->x86_mask >= 1 && c->x86_mask <= 4 &&
137 c->x86_model <= 3) {
138 /*
139 * Remember we have B step Pentia with bugs
140 */
141 WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
142 "with B stepping processors.\n");
143 }
144#endif
145}
146
113static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 147static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
114{ 148{
115 unsigned long lo, hi; 149 unsigned long lo, hi;
@@ -186,6 +220,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
186#ifdef CONFIG_X86_NUMAQ 220#ifdef CONFIG_X86_NUMAQ
187 numaq_tsc_disable(); 221 numaq_tsc_disable();
188#endif 222#endif
223
224 intel_smp_check(c);
189} 225}
190#else 226#else
191static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 227static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
@@ -385,7 +421,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
385} 421}
386#endif 422#endif
387 423
388static struct cpu_dev intel_cpu_dev __cpuinitdata = { 424static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
389 .c_vendor = "Intel", 425 .c_vendor = "Intel",
390 .c_ident = { "GenuineIntel" }, 426 .c_ident = { "GenuineIntel" },
391#ifdef CONFIG_X86_32 427#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7293508d8f5c..483eda96e102 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -32,7 +32,7 @@ struct _cache_table
32}; 32};
33 33
34/* all the cache descriptor types we care about (no TLB or trace cache entries) */ 34/* all the cache descriptor types we care about (no TLB or trace cache entries) */
35static struct _cache_table cache_table[] __cpuinitdata = 35static const struct _cache_table __cpuinitconst cache_table[] =
36{ 36{
37 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ 37 { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
38 { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ 38 { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
@@ -159,7 +159,7 @@ struct _cpuid4_info_regs {
159 unsigned long can_disable; 159 unsigned long can_disable;
160}; 160};
161 161
162#ifdef CONFIG_PCI 162#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
163static struct pci_device_id k8_nb_id[] = { 163static struct pci_device_id k8_nb_id[] = {
164 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, 164 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
165 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, 165 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
@@ -206,15 +206,15 @@ union l3_cache {
206 unsigned val; 206 unsigned val;
207}; 207};
208 208
209static unsigned short assocs[] __cpuinitdata = { 209static const unsigned short __cpuinitconst assocs[] = {
210 [1] = 1, [2] = 2, [4] = 4, [6] = 8, 210 [1] = 1, [2] = 2, [4] = 4, [6] = 8,
211 [8] = 16, [0xa] = 32, [0xb] = 48, 211 [8] = 16, [0xa] = 32, [0xb] = 48,
212 [0xc] = 64, 212 [0xc] = 64,
213 [0xf] = 0xffff // ?? 213 [0xf] = 0xffff // ??
214}; 214};
215 215
216static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; 216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
217static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; 217static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
218 218
219static void __cpuinit 219static void __cpuinit
220amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, 220amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
@@ -324,15 +324,6 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
324 return 0; 324 return 0;
325} 325}
326 326
327static int
328__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
329{
330 struct _cpuid4_info_regs *leaf_regs =
331 (struct _cpuid4_info_regs *)this_leaf;
332
333 return cpuid4_cache_lookup_regs(index, leaf_regs);
334}
335
336static int __cpuinit find_num_cache_leaves(void) 327static int __cpuinit find_num_cache_leaves(void)
337{ 328{
338 unsigned int eax, ebx, ecx, edx; 329 unsigned int eax, ebx, ecx, edx;
@@ -508,6 +499,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
508 return l2; 499 return l2;
509} 500}
510 501
502#ifdef CONFIG_SYSFS
503
511/* pointer to _cpuid4_info array (for each cache leaf) */ 504/* pointer to _cpuid4_info array (for each cache leaf) */
512static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); 505static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
513#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) 506#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
@@ -571,6 +564,15 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
571 per_cpu(cpuid4_info, cpu) = NULL; 564 per_cpu(cpuid4_info, cpu) = NULL;
572} 565}
573 566
567static int
568__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
569{
570 struct _cpuid4_info_regs *leaf_regs =
571 (struct _cpuid4_info_regs *)this_leaf;
572
573 return cpuid4_cache_lookup_regs(index, leaf_regs);
574}
575
574static void __cpuinit get_cpu_leaves(void *_retval) 576static void __cpuinit get_cpu_leaves(void *_retval)
575{ 577{
576 int j, *retval = _retval, cpu = smp_processor_id(); 578 int j, *retval = _retval, cpu = smp_processor_id();
@@ -612,8 +614,6 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
612 return retval; 614 return retval;
613} 615}
614 616
615#ifdef CONFIG_SYSFS
616
617#include <linux/kobject.h> 617#include <linux/kobject.h>
618#include <linux/sysfs.h> 618#include <linux/sysfs.h>
619 619
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb69..b2f89829bbe8 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o 4obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o 5obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o 6obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
7obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633e..3552119b091d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
60 } 60 }
61} 61}
62 62
63static unsigned long old_cr4 __initdata;
64
65void __init stop_mce(void)
66{
67 old_cr4 = read_cr4();
68 clear_in_cr4(X86_CR4_MCE);
69}
70
71void __init restart_mce(void)
72{
73 if (old_cr4 & X86_CR4_MCE)
74 set_in_cr4(X86_CR4_MCE);
75}
76
77static int __init mcheck_disable(char *str) 63static int __init mcheck_disable(char *str)
78{ 64{
79 mce_disabled = 1; 65 mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f2..863f89568b1a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s). 4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it. 5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
6 */ 8 */
7 9
8#include <linux/init.h> 10#include <linux/init.h>
@@ -24,6 +26,9 @@
24#include <linux/ctype.h> 26#include <linux/ctype.h>
25#include <linux/kmod.h> 27#include <linux/kmod.h>
26#include <linux/kdebug.h> 28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
27#include <asm/processor.h> 32#include <asm/processor.h>
28#include <asm/msr.h> 33#include <asm/msr.h>
29#include <asm/mce.h> 34#include <asm/mce.h>
@@ -32,7 +37,6 @@
32#include <asm/idle.h> 37#include <asm/idle.h>
33 38
34#define MISC_MCELOG_MINOR 227 39#define MISC_MCELOG_MINOR 227
35#define NR_SYSFS_BANKS 6
36 40
37atomic_t mce_entry; 41atomic_t mce_entry;
38 42
@@ -47,7 +51,7 @@ static int mce_dont_init;
47 */ 51 */
48static int tolerant = 1; 52static int tolerant = 1;
49static int banks; 53static int banks;
50static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; 54static u64 *bank;
51static unsigned long notify_user; 55static unsigned long notify_user;
52static int rip_msr; 56static int rip_msr;
53static int mce_bootlog = -1; 57static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
58 62
59static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
60 64
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
61/* 78/*
62 * Lockless MCE logging infrastructure. 79 * Lockless MCE logging infrastructure.
63 * This avoids deadlocks on printk locks without having to break locks. Also 80 * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
119 print_symbol("{%s}", m->ip); 136 print_symbol("{%s}", m->ip);
120 printk("\n"); 137 printk("\n");
121 } 138 }
122 printk(KERN_EMERG "TSC %Lx ", m->tsc); 139 printk(KERN_EMERG "TSC %llx ", m->tsc);
123 if (m->addr) 140 if (m->addr)
124 printk("ADDR %Lx ", m->addr); 141 printk("ADDR %llx ", m->addr);
125 if (m->misc) 142 if (m->misc)
126 printk("MISC %Lx ", m->misc); 143 printk("MISC %llx ", m->misc);
127 printk("\n"); 144 printk("\n");
128 printk(KERN_EMERG "This is not a software problem!\n"); 145 printk(KERN_EMERG "This is not a software problem!\n");
129 printk(KERN_EMERG "Run through mcelog --ascii to decode " 146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
149 panic(msg); 166 panic(msg);
150} 167}
151 168
152static int mce_available(struct cpuinfo_x86 *c) 169int mce_available(struct cpuinfo_x86 *c)
153{ 170{
171 if (mce_dont_init)
172 return 0;
154 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
155} 174}
156 175
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
172} 191}
173 192
174/* 193/*
175 * The actual machine check handler 194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245
246 /*
247 * Clear state for this bank.
248 */
249 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
250 }
251
252 /*
253 * Don't clear MCG_STATUS here because it's only defined for
254 * exceptions.
255 */
256}
257
258/*
259 * The actual machine check handler. This only handles real
260 * exceptions when something got corrupted coming in through int 18.
261 *
262 * This is executed in NMI context not subject to normal locking rules. This
263 * implies that most kernel services cannot be safely used. Don't even
264 * think about putting a printk in there!
176 */ 265 */
177void do_machine_check(struct pt_regs * regs, long error_code) 266void do_machine_check(struct pt_regs * regs, long error_code)
178{ 267{
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
190 * error. 279 * error.
191 */ 280 */
192 int kill_it = 0; 281 int kill_it = 0;
282 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
193 283
194 atomic_inc(&mce_entry); 284 atomic_inc(&mce_entry);
195 285
196 if ((regs 286 if (notify_die(DIE_NMI, "machine check", regs, error_code,
197 && notify_die(DIE_NMI, "machine check", regs, error_code,
198 18, SIGKILL) == NOTIFY_STOP) 287 18, SIGKILL) == NOTIFY_STOP)
199 || !banks) 288 goto out2;
289 if (!banks)
200 goto out2; 290 goto out2;
201 291
202 memset(&m, 0, sizeof(struct mce)); 292 mce_setup(&m);
203 m.cpu = smp_processor_id(); 293
204 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); 294 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
205 /* if the restart IP is not valid, we're done for */ 295 /* if the restart IP is not valid, we're done for */
206 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 296 if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
210 barrier(); 300 barrier();
211 301
212 for (i = 0; i < banks; i++) { 302 for (i = 0; i < banks; i++) {
213 if (i < NR_SYSFS_BANKS && !bank[i]) 303 __clear_bit(i, toclear);
304 if (!bank[i])
214 continue; 305 continue;
215 306
216 m.misc = 0; 307 m.misc = 0;
217 m.addr = 0; 308 m.addr = 0;
218 m.bank = i; 309 m.bank = i;
219 m.tsc = 0;
220 310
221 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); 311 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
222 if ((m.status & MCI_STATUS_VAL) == 0) 312 if ((m.status & MCI_STATUS_VAL) == 0)
223 continue; 313 continue;
224 314
315 /*
316 * Non uncorrected errors are handled by machine_check_poll
317 * Leave them alone.
318 */
319 if ((m.status & MCI_STATUS_UC) == 0)
320 continue;
321
322 /*
323 * Set taint even when machine check was not enabled.
324 */
325 add_taint(TAINT_MACHINE_CHECK);
326
327 __set_bit(i, toclear);
328
225 if (m.status & MCI_STATUS_EN) { 329 if (m.status & MCI_STATUS_EN) {
226 /* if PCC was set, there's no way out */ 330 /* if PCC was set, there's no way out */
227 no_way_out |= !!(m.status & MCI_STATUS_PCC); 331 no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
235 no_way_out = 1; 339 no_way_out = 1;
236 kill_it = 1; 340 kill_it = 1;
237 } 341 }
342 } else {
343 /*
344 * Machine check event was not enabled. Clear, but
345 * ignore.
346 */
347 continue;
238 } 348 }
239 349
240 if (m.status & MCI_STATUS_MISCV) 350 if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
243 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 353 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
244 354
245 mce_get_rip(&m, regs); 355 mce_get_rip(&m, regs);
246 if (error_code >= 0) 356 mce_log(&m);
247 rdtscll(m.tsc);
248 if (error_code != -2)
249 mce_log(&m);
250 357
251 /* Did this bank cause the exception? */ 358 /* Did this bank cause the exception? */
252 /* Assume that the bank with uncorrectable errors did it, 359 /* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
255 panicm = m; 362 panicm = m;
256 panicm_found = 1; 363 panicm_found = 1;
257 } 364 }
258
259 add_taint(TAINT_MACHINE_CHECK);
260 } 365 }
261 366
262 /* Never do anything final in the polling timer */
263 if (!regs)
264 goto out;
265
266 /* If we didn't find an uncorrectable error, pick 367 /* If we didn't find an uncorrectable error, pick
267 the last one (shouldn't happen, just being safe). */ 368 the last one (shouldn't happen, just being safe). */
268 if (!panicm_found) 369 if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
309 /* notify userspace ASAP */ 410 /* notify userspace ASAP */
310 set_thread_flag(TIF_MCE_NOTIFY); 411 set_thread_flag(TIF_MCE_NOTIFY);
311 412
312 out:
313 /* the last thing we do is clear state */ 413 /* the last thing we do is clear state */
314 for (i = 0; i < banks; i++) 414 for (i = 0; i < banks; i++) {
315 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 415 if (test_bit(i, toclear))
416 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
417 }
316 wrmsrl(MSR_IA32_MCG_STATUS, 0); 418 wrmsrl(MSR_IA32_MCG_STATUS, 0);
317 out2: 419 out2:
318 atomic_dec(&mce_entry); 420 atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
332 * and historically has been the register value of the 434 * and historically has been the register value of the
333 * MSR_IA32_THERMAL_STATUS (Intel) msr. 435 * MSR_IA32_THERMAL_STATUS (Intel) msr.
334 */ 436 */
335void mce_log_therm_throt_event(unsigned int cpu, __u64 status) 437void mce_log_therm_throt_event(__u64 status)
336{ 438{
337 struct mce m; 439 struct mce m;
338 440
339 memset(&m, 0, sizeof(m)); 441 mce_setup(&m);
340 m.cpu = cpu;
341 m.bank = MCE_THERMAL_BANK; 442 m.bank = MCE_THERMAL_BANK;
342 m.status = status; 443 m.status = status;
343 rdtscll(m.tsc);
344 mce_log(&m); 444 mce_log(&m);
345} 445}
346#endif /* CONFIG_X86_MCE_INTEL */ 446#endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
353 453
354static int check_interval = 5 * 60; /* 5 minutes */ 454static int check_interval = 5 * 60; /* 5 minutes */
355static int next_interval; /* in jiffies */ 455static int next_interval; /* in jiffies */
356static void mcheck_timer(struct work_struct *work); 456static void mcheck_timer(unsigned long);
357static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); 457static DEFINE_PER_CPU(struct timer_list, mce_timer);
358 458
359static void mcheck_check_cpu(void *info) 459static void mcheck_timer(unsigned long data)
360{ 460{
361 if (mce_available(&current_cpu_data)) 461 struct timer_list *t = &per_cpu(mce_timer, data);
362 do_machine_check(NULL, 0);
363}
364 462
365static void mcheck_timer(struct work_struct *work) 463 WARN_ON(smp_processor_id() != data);
366{ 464
367 on_each_cpu(mcheck_check_cpu, NULL, 1); 465 if (mce_available(&current_cpu_data))
466 machine_check_poll(MCP_TIMESTAMP,
467 &__get_cpu_var(mce_poll_banks));
368 468
369 /* 469 /*
370 * Alert userspace if needed. If we logged an MCE, reduce the 470 * Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
377 (int)round_jiffies_relative(check_interval*HZ)); 477 (int)round_jiffies_relative(check_interval*HZ));
378 } 478 }
379 479
380 schedule_delayed_work(&mcheck_work, next_interval); 480 t->expires = jiffies + next_interval;
481 add_timer(t);
482}
483
484static void mce_do_trigger(struct work_struct *work)
485{
486 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
381} 487}
382 488
489static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
490
383/* 491/*
384 * This is only called from process context. This is where we do 492 * Notify the user(s) about new machine check events.
385 * anything we need to alert userspace about new MCEs. This is called 493 * Can be called from interrupt context, but not from machine check/NMI
386 * directly from the poller and also from entry.S and idle, thanks to 494 * context.
387 * TIF_MCE_NOTIFY.
388 */ 495 */
389int mce_notify_user(void) 496int mce_notify_user(void)
390{ 497{
498 /* Not more than two messages every minute */
499 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
500
391 clear_thread_flag(TIF_MCE_NOTIFY); 501 clear_thread_flag(TIF_MCE_NOTIFY);
392 if (test_and_clear_bit(0, &notify_user)) { 502 if (test_and_clear_bit(0, &notify_user)) {
393 static unsigned long last_print;
394 unsigned long now = jiffies;
395
396 wake_up_interruptible(&mce_wait); 503 wake_up_interruptible(&mce_wait);
397 if (trigger[0])
398 call_usermodehelper(trigger, trigger_argv, NULL,
399 UMH_NO_WAIT);
400 504
401 if (time_after_eq(now, last_print + (check_interval*HZ))) { 505 /*
402 last_print = now; 506 * There is no risk of missing notifications because
507 * work_pending is always cleared before the function is
508 * executed.
509 */
510 if (trigger[0] && !work_pending(&mce_trigger_work))
511 schedule_work(&mce_trigger_work);
512
513 if (__ratelimit(&ratelimit))
403 printk(KERN_INFO "Machine check events logged\n"); 514 printk(KERN_INFO "Machine check events logged\n");
404 }
405 515
406 return 1; 516 return 1;
407 } 517 }
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
425 535
426static __init int periodic_mcheck_init(void) 536static __init int periodic_mcheck_init(void)
427{ 537{
428 next_interval = check_interval * HZ; 538 idle_notifier_register(&mce_idle_notifier);
429 if (next_interval) 539 return 0;
430 schedule_delayed_work(&mcheck_work,
431 round_jiffies_relative(next_interval));
432 idle_notifier_register(&mce_idle_notifier);
433 return 0;
434} 540}
435__initcall(periodic_mcheck_init); 541__initcall(periodic_mcheck_init);
436 542
437
438/* 543/*
439 * Initialize Machine Checks for a CPU. 544 * Initialize Machine Checks for a CPU.
440 */ 545 */
441static void mce_init(void *dummy) 546static int mce_cap_init(void)
442{ 547{
443 u64 cap; 548 u64 cap;
444 int i; 549 unsigned b;
445 550
446 rdmsrl(MSR_IA32_MCG_CAP, cap); 551 rdmsrl(MSR_IA32_MCG_CAP, cap);
447 banks = cap & 0xff; 552 b = cap & 0xff;
448 if (banks > MCE_EXTENDED_BANK) { 553 if (b > MAX_NR_BANKS) {
449 banks = MCE_EXTENDED_BANK; 554 printk(KERN_WARNING
450 printk(KERN_INFO "MCE: warning: using only %d banks\n", 555 "MCE: Using only %u machine check banks out of %u\n",
451 MCE_EXTENDED_BANK); 556 MAX_NR_BANKS, b);
557 b = MAX_NR_BANKS;
452 } 558 }
559
560 /* Don't support asymmetric configurations today */
561 WARN_ON(banks != 0 && b != banks);
562 banks = b;
563 if (!bank) {
564 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
565 if (!bank)
566 return -ENOMEM;
567 memset(bank, 0xff, banks * sizeof(u64));
568 }
569
453 /* Use accurate RIP reporting if available. */ 570 /* Use accurate RIP reporting if available. */
454 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) 571 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
455 rip_msr = MSR_IA32_MCG_EIP; 572 rip_msr = MSR_IA32_MCG_EIP;
456 573
457 /* Log the machine checks left over from the previous reset. 574 return 0;
458 This also clears all registers */ 575}
459 do_machine_check(NULL, mce_bootlog ? -1 : -2); 576
577static void mce_init(void *dummy)
578{
579 u64 cap;
580 int i;
581 mce_banks_t all_banks;
582
583 /*
584 * Log the machine checks left over from the previous reset.
585 */
586 bitmap_fill(all_banks, MAX_NR_BANKS);
587 machine_check_poll(MCP_UC, &all_banks);
460 588
461 set_in_cr4(X86_CR4_MCE); 589 set_in_cr4(X86_CR4_MCE);
462 590
591 rdmsrl(MSR_IA32_MCG_CAP, cap);
463 if (cap & MCG_CTL_P) 592 if (cap & MCG_CTL_P)
464 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 593 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
465 594
466 for (i = 0; i < banks; i++) { 595 for (i = 0; i < banks; i++) {
467 if (i < NR_SYSFS_BANKS) 596 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
468 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
469 else
470 wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
471
472 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 597 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
473 } 598 }
474} 599}
475 600
476/* Add per CPU specific workarounds here */ 601/* Add per CPU specific workarounds here */
477static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 602static void mce_cpu_quirks(struct cpuinfo_x86 *c)
478{ 603{
479 /* This should be disabled by the BIOS, but isn't always */ 604 /* This should be disabled by the BIOS, but isn't always */
480 if (c->x86_vendor == X86_VENDOR_AMD) { 605 if (c->x86_vendor == X86_VENDOR_AMD) {
481 if(c->x86 == 15) 606 if (c->x86 == 15 && banks > 4)
482 /* disable GART TBL walk error reporting, which trips off 607 /* disable GART TBL walk error reporting, which trips off
483 incorrectly with the IOMMU & 3ware & Cerberus. */ 608 incorrectly with the IOMMU & 3ware & Cerberus. */
484 clear_bit(10, &bank[4]); 609 clear_bit(10, (unsigned long *)&bank[4]);
485 if(c->x86 <= 17 && mce_bootlog < 0) 610 if(c->x86 <= 17 && mce_bootlog < 0)
486 /* Lots of broken BIOS around that don't clear them 611 /* Lots of broken BIOS around that don't clear them
487 by default and leave crap in there. Don't log. */ 612 by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
504 } 629 }
505} 630}
506 631
632static void mce_init_timer(void)
633{
634 struct timer_list *t = &__get_cpu_var(mce_timer);
635
636 /* data race harmless because everyone sets to the same value */
637 if (!next_interval)
638 next_interval = check_interval * HZ;
639 if (!next_interval)
640 return;
641 setup_timer(t, mcheck_timer, smp_processor_id());
642 t->expires = round_jiffies(jiffies + next_interval);
643 add_timer(t);
644}
645
507/* 646/*
508 * Called for each booted CPU to set up machine checks. 647 * Called for each booted CPU to set up machine checks.
509 * Must be called with preempt off. 648 * Must be called with preempt off.
510 */ 649 */
511void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 650void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
512{ 651{
513 mce_cpu_quirks(c); 652 if (!mce_available(c))
653 return;
514 654
515 if (mce_dont_init || 655 if (mce_cap_init() < 0) {
516 !mce_available(c)) 656 mce_dont_init = 1;
517 return; 657 return;
658 }
659 mce_cpu_quirks(c);
518 660
519 mce_init(NULL); 661 mce_init(NULL);
520 mce_cpu_features(c); 662 mce_cpu_features(c);
663 mce_init_timer();
521} 664}
522 665
523/* 666/*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
573{ 716{
574 unsigned long *cpu_tsc; 717 unsigned long *cpu_tsc;
575 static DEFINE_MUTEX(mce_read_mutex); 718 static DEFINE_MUTEX(mce_read_mutex);
576 unsigned next; 719 unsigned prev, next;
577 char __user *buf = ubuf; 720 char __user *buf = ubuf;
578 int i, err; 721 int i, err;
579 722
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
592 } 735 }
593 736
594 err = 0; 737 err = 0;
595 for (i = 0; i < next; i++) { 738 prev = 0;
596 unsigned long start = jiffies; 739 do {
597 740 for (i = prev; i < next; i++) {
598 while (!mcelog.entry[i].finished) { 741 unsigned long start = jiffies;
599 if (time_after_eq(jiffies, start + 2)) { 742
600 memset(mcelog.entry + i,0, sizeof(struct mce)); 743 while (!mcelog.entry[i].finished) {
601 goto timeout; 744 if (time_after_eq(jiffies, start + 2)) {
745 memset(mcelog.entry + i, 0,
746 sizeof(struct mce));
747 goto timeout;
748 }
749 cpu_relax();
602 } 750 }
603 cpu_relax(); 751 smp_rmb();
752 err |= copy_to_user(buf, mcelog.entry + i,
753 sizeof(struct mce));
754 buf += sizeof(struct mce);
755timeout:
756 ;
604 } 757 }
605 smp_rmb();
606 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
607 buf += sizeof(struct mce);
608 timeout:
609 ;
610 }
611 758
612 memset(mcelog.entry, 0, next * sizeof(struct mce)); 759 memset(mcelog.entry + prev, 0,
613 mcelog.next = 0; 760 (next - prev) * sizeof(struct mce));
761 prev = next;
762 next = cmpxchg(&mcelog.next, prev, 0);
763 } while (next != prev);
614 764
615 synchronize_sched(); 765 synchronize_sched();
616 766
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
680 &mce_chrdev_ops, 830 &mce_chrdev_ops,
681}; 831};
682 832
683static unsigned long old_cr4 __initdata;
684
685void __init stop_mce(void)
686{
687 old_cr4 = read_cr4();
688 clear_in_cr4(X86_CR4_MCE);
689}
690
691void __init restart_mce(void)
692{
693 if (old_cr4 & X86_CR4_MCE)
694 set_in_cr4(X86_CR4_MCE);
695}
696
697/* 833/*
698 * Old style boot options parsing. Only for compatibility. 834 * Old style boot options parsing. Only for compatibility.
699 */ 835 */
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
703 return 1; 839 return 1;
704} 840}
705 841
706/* mce=off disables machine check. Note you can re-enable it later 842/* mce=off disables machine check.
707 using sysfs.
708 mce=TOLERANCELEVEL (number, see above) 843 mce=TOLERANCELEVEL (number, see above)
709 mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 844 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
710 mce=nobootlog Don't log MCEs from before booting. */ 845 mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
728 * Sysfs support 863 * Sysfs support
729 */ 864 */
730 865
866/*
867 * Disable machine checks on suspend and shutdown. We can't really handle
868 * them later.
869 */
870static int mce_disable(void)
871{
872 int i;
873
874 for (i = 0; i < banks; i++)
875 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
876 return 0;
877}
878
879static int mce_suspend(struct sys_device *dev, pm_message_t state)
880{
881 return mce_disable();
882}
883
884static int mce_shutdown(struct sys_device *dev)
885{
886 return mce_disable();
887}
888
731/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. 889/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
732 Only one CPU is active at this time, the others get readded later using 890 Only one CPU is active at this time, the others get readded later using
733 CPU hotplug. */ 891 CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
738 return 0; 896 return 0;
739} 897}
740 898
899static void mce_cpu_restart(void *data)
900{
901 del_timer_sync(&__get_cpu_var(mce_timer));
902 if (mce_available(&current_cpu_data))
903 mce_init(NULL);
904 mce_init_timer();
905}
906
741/* Reinit MCEs after user configuration changes */ 907/* Reinit MCEs after user configuration changes */
742static void mce_restart(void) 908static void mce_restart(void)
743{ 909{
744 if (next_interval)
745 cancel_delayed_work(&mcheck_work);
746 /* Timer race is harmless here */
747 on_each_cpu(mce_init, NULL, 1);
748 next_interval = check_interval * HZ; 910 next_interval = check_interval * HZ;
749 if (next_interval) 911 on_each_cpu(mce_cpu_restart, NULL, 1);
750 schedule_delayed_work(&mcheck_work,
751 round_jiffies_relative(next_interval));
752} 912}
753 913
754static struct sysdev_class mce_sysclass = { 914static struct sysdev_class mce_sysclass = {
915 .suspend = mce_suspend,
916 .shutdown = mce_shutdown,
755 .resume = mce_resume, 917 .resume = mce_resume,
756 .name = "machinecheck", 918 .name = "machinecheck",
757}; 919};
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
778 } \ 940 } \
779 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 941 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
780 942
781/* 943static struct sysdev_attribute *bank_attrs;
782 * TBD should generate these dynamically based on number of available banks. 944
783 * Have only 6 contol banks in /sysfs until then. 945static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
784 */ 946 char *buf)
785ACCESSOR(bank0ctl,bank[0],mce_restart()) 947{
786ACCESSOR(bank1ctl,bank[1],mce_restart()) 948 u64 b = bank[attr - bank_attrs];
787ACCESSOR(bank2ctl,bank[2],mce_restart()) 949 return sprintf(buf, "%llx\n", b);
788ACCESSOR(bank3ctl,bank[3],mce_restart()) 950}
789ACCESSOR(bank4ctl,bank[4],mce_restart()) 951
790ACCESSOR(bank5ctl,bank[5],mce_restart()) 952static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
953 const char *buf, size_t siz)
954{
955 char *end;
956 u64 new = simple_strtoull(buf, &end, 0);
957 if (end == buf)
958 return -EINVAL;
959 bank[attr - bank_attrs] = new;
960 mce_restart();
961 return end-buf;
962}
791 963
792static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, 964static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
793 char *buf) 965 char *buf)
@@ -814,13 +986,11 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
814static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 986static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
815ACCESSOR(check_interval,check_interval,mce_restart()) 987ACCESSOR(check_interval,check_interval,mce_restart())
816static struct sysdev_attribute *mce_attributes[] = { 988static struct sysdev_attribute *mce_attributes[] = {
817 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
818 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
819 &attr_tolerant.attr, &attr_check_interval, &attr_trigger, 989 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
820 NULL 990 NULL
821}; 991};
822 992
823static cpumask_t mce_device_initialized = CPU_MASK_NONE; 993static cpumask_var_t mce_device_initialized;
824 994
825/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ 995/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
826static __cpuinit int mce_create_device(unsigned int cpu) 996static __cpuinit int mce_create_device(unsigned int cpu)
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
845 if (err) 1015 if (err)
846 goto error; 1016 goto error;
847 } 1017 }
848 cpu_set(cpu, mce_device_initialized); 1018 for (i = 0; i < banks; i++) {
1019 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1020 &bank_attrs[i]);
1021 if (err)
1022 goto error2;
1023 }
1024 cpumask_set_cpu(cpu, mce_device_initialized);
849 1025
850 return 0; 1026 return 0;
1027error2:
1028 while (--i >= 0) {
1029 sysdev_remove_file(&per_cpu(device_mce, cpu),
1030 &bank_attrs[i]);
1031 }
851error: 1032error:
852 while (i--) { 1033 while (--i >= 0) {
853 sysdev_remove_file(&per_cpu(device_mce,cpu), 1034 sysdev_remove_file(&per_cpu(device_mce,cpu),
854 mce_attributes[i]); 1035 mce_attributes[i]);
855 } 1036 }
@@ -862,14 +1043,44 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
862{ 1043{
863 int i; 1044 int i;
864 1045
865 if (!cpu_isset(cpu, mce_device_initialized)) 1046 if (!cpumask_test_cpu(cpu, mce_device_initialized))
866 return; 1047 return;
867 1048
868 for (i = 0; mce_attributes[i]; i++) 1049 for (i = 0; mce_attributes[i]; i++)
869 sysdev_remove_file(&per_cpu(device_mce,cpu), 1050 sysdev_remove_file(&per_cpu(device_mce,cpu),
870 mce_attributes[i]); 1051 mce_attributes[i]);
1052 for (i = 0; i < banks; i++)
1053 sysdev_remove_file(&per_cpu(device_mce, cpu),
1054 &bank_attrs[i]);
871 sysdev_unregister(&per_cpu(device_mce,cpu)); 1055 sysdev_unregister(&per_cpu(device_mce,cpu));
872 cpu_clear(cpu, mce_device_initialized); 1056 cpumask_clear_cpu(cpu, mce_device_initialized);
1057}
1058
1059/* Make sure there are no machine checks on offlined CPUs. */
1060static void mce_disable_cpu(void *h)
1061{
1062 int i;
1063 unsigned long action = *(unsigned long *)h;
1064
1065 if (!mce_available(&current_cpu_data))
1066 return;
1067 if (!(action & CPU_TASKS_FROZEN))
1068 cmci_clear();
1069 for (i = 0; i < banks; i++)
1070 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1071}
1072
1073static void mce_reenable_cpu(void *h)
1074{
1075 int i;
1076 unsigned long action = *(unsigned long *)h;
1077
1078 if (!mce_available(&current_cpu_data))
1079 return;
1080 if (!(action & CPU_TASKS_FROZEN))
1081 cmci_reenable();
1082 for (i = 0; i < banks; i++)
1083 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
873} 1084}
874 1085
875/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 1086/* Get notified when a cpu comes on/off. Be hotplug friendly. */
@@ -877,6 +1088,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
877 unsigned long action, void *hcpu) 1088 unsigned long action, void *hcpu)
878{ 1089{
879 unsigned int cpu = (unsigned long)hcpu; 1090 unsigned int cpu = (unsigned long)hcpu;
1091 struct timer_list *t = &per_cpu(mce_timer, cpu);
880 1092
881 switch (action) { 1093 switch (action) {
882 case CPU_ONLINE: 1094 case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
891 threshold_cpu_callback(action, cpu); 1103 threshold_cpu_callback(action, cpu);
892 mce_remove_device(cpu); 1104 mce_remove_device(cpu);
893 break; 1105 break;
1106 case CPU_DOWN_PREPARE:
1107 case CPU_DOWN_PREPARE_FROZEN:
1108 del_timer_sync(t);
1109 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1110 break;
1111 case CPU_DOWN_FAILED:
1112 case CPU_DOWN_FAILED_FROZEN:
1113 t->expires = round_jiffies(jiffies + next_interval);
1114 add_timer_on(t, cpu);
1115 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1116 break;
1117 case CPU_POST_DEAD:
1118 /* intentionally ignoring frozen here */
1119 cmci_rediscover(cpu);
1120 break;
894 } 1121 }
895 return NOTIFY_OK; 1122 return NOTIFY_OK;
896} 1123}
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
899 .notifier_call = mce_cpu_callback, 1126 .notifier_call = mce_cpu_callback,
900}; 1127};
901 1128
1129static __init int mce_init_banks(void)
1130{
1131 int i;
1132
1133 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1134 GFP_KERNEL);
1135 if (!bank_attrs)
1136 return -ENOMEM;
1137
1138 for (i = 0; i < banks; i++) {
1139 struct sysdev_attribute *a = &bank_attrs[i];
1140 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1141 if (!a->attr.name)
1142 goto nomem;
1143 a->attr.mode = 0644;
1144 a->show = show_bank;
1145 a->store = set_bank;
1146 }
1147 return 0;
1148
1149nomem:
1150 while (--i >= 0)
1151 kfree(bank_attrs[i].attr.name);
1152 kfree(bank_attrs);
1153 bank_attrs = NULL;
1154 return -ENOMEM;
1155}
1156
902static __init int mce_init_device(void) 1157static __init int mce_init_device(void)
903{ 1158{
904 int err; 1159 int err;
@@ -906,6 +1161,13 @@ static __init int mce_init_device(void)
906 1161
907 if (!mce_available(&boot_cpu_data)) 1162 if (!mce_available(&boot_cpu_data))
908 return -EIO; 1163 return -EIO;
1164
1165 alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
1166
1167 err = mce_init_banks();
1168 if (err)
1169 return err;
1170
909 err = sysdev_class_register(&mce_sysclass); 1171 err = sysdev_class_register(&mce_sysclass);
910 if (err) 1172 if (err)
911 return err; 1173 return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 9817506dd469..56dde9c4bc96 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
79 79
80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ 80static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
81 81
82static void amd_threshold_interrupt(void);
83
82/* 84/*
83 * CPU Initialization 85 * CPU Initialization
84 */ 86 */
@@ -90,7 +92,8 @@ struct thresh_restart {
90}; 92};
91 93
92/* must be called with correct cpu affinity */ 94/* must be called with correct cpu affinity */
93static long threshold_restart_bank(void *_tr) 95/* Called via smp_call_function_single() */
96static void threshold_restart_bank(void *_tr)
94{ 97{
95 struct thresh_restart *tr = _tr; 98 struct thresh_restart *tr = _tr;
96 u32 mci_misc_hi, mci_misc_lo; 99 u32 mci_misc_hi, mci_misc_lo;
@@ -117,7 +120,6 @@ static long threshold_restart_bank(void *_tr)
117 120
118 mci_misc_hi |= MASK_COUNT_EN_HI; 121 mci_misc_hi |= MASK_COUNT_EN_HI;
119 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 122 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
120 return 0;
121} 123}
122 124
123/* cpu init entry point, called from mce.c with preempt off */ 125/* cpu init entry point, called from mce.c with preempt off */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
174 tr.reset = 0; 176 tr.reset = 0;
175 tr.old_limit = 0; 177 tr.old_limit = 0;
176 threshold_restart_bank(&tr); 178 threshold_restart_bank(&tr);
179
180 mce_threshold_vector = amd_threshold_interrupt;
177 } 181 }
178 } 182 }
179} 183}
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
187 * the interrupt goes off when error_count reaches threshold_limit. 191 * the interrupt goes off when error_count reaches threshold_limit.
188 * the handler will simply log mcelog w/ software defined bank number. 192 * the handler will simply log mcelog w/ software defined bank number.
189 */ 193 */
190asmlinkage void mce_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
191{ 195{
192 unsigned int bank, block; 196 unsigned int bank, block;
193 struct mce m; 197 struct mce m;
194 u32 low = 0, high = 0, address = 0; 198 u32 low = 0, high = 0, address = 0;
195 199
196 ack_APIC_irq(); 200 mce_setup(&m);
197 exit_idle();
198 irq_enter();
199
200 memset(&m, 0, sizeof(m));
201 rdtscll(m.tsc);
202 m.cpu = smp_processor_id();
203 201
204 /* assume first bank caused it */ 202 /* assume first bank caused it */
205 for (bank = 0; bank < NR_BANKS; ++bank) { 203 for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
233 231
234 /* Log the machine check that caused the threshold 232 /* Log the machine check that caused the threshold
235 event. */ 233 event. */
236 do_machine_check(NULL, 0); 234 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks));
237 236
238 if (high & MASK_OVERFLOW_HI) { 237 if (high & MASK_OVERFLOW_HI) {
239 rdmsrl(address, m.misc); 238 rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
243 + bank * NR_BLOCKS 242 + bank * NR_BLOCKS
244 + block; 243 + block;
245 mce_log(&m); 244 mce_log(&m);
246 goto out; 245 return;
247 } 246 }
248 } 247 }
249 } 248 }
250out:
251 inc_irq_stat(irq_threshold_count);
252 irq_exit();
253} 249}
254 250
255/* 251/*
@@ -283,7 +279,7 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
283 tr.b = b; 279 tr.b = b;
284 tr.reset = 0; 280 tr.reset = 0;
285 tr.old_limit = 0; 281 tr.old_limit = 0;
286 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 282 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
287 283
288 return end - buf; 284 return end - buf;
289} 285}
@@ -305,23 +301,32 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
305 tr.b = b; 301 tr.b = b;
306 tr.reset = 0; 302 tr.reset = 0;
307 303
308 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 304 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
309 305
310 return end - buf; 306 return end - buf;
311} 307}
312 308
313static long local_error_count(void *_b) 309struct threshold_block_cross_cpu {
310 struct threshold_block *tb;
311 long retval;
312};
313
314static void local_error_count_handler(void *_tbcc)
314{ 315{
315 struct threshold_block *b = _b; 316 struct threshold_block_cross_cpu *tbcc = _tbcc;
317 struct threshold_block *b = tbcc->tb;
316 u32 low, high; 318 u32 low, high;
317 319
318 rdmsr(b->address, low, high); 320 rdmsr(b->address, low, high);
319 return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); 321 tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
320} 322}
321 323
322static ssize_t show_error_count(struct threshold_block *b, char *buf) 324static ssize_t show_error_count(struct threshold_block *b, char *buf)
323{ 325{
324 return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); 326 struct threshold_block_cross_cpu tbcc = { .tb = b, };
327
328 smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
329 return sprintf(buf, "%lx\n", tbcc.retval);
325} 330}
326 331
327static ssize_t store_error_count(struct threshold_block *b, 332static ssize_t store_error_count(struct threshold_block *b,
@@ -329,7 +334,7 @@ static ssize_t store_error_count(struct threshold_block *b,
329{ 334{
330 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; 335 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
331 336
332 work_on_cpu(b->cpu, threshold_restart_bank, &tr); 337 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
333 return 1; 338 return 1;
334} 339}
335 340
@@ -398,7 +403,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
398 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) 403 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
399 return 0; 404 return 0;
400 405
401 if (rdmsr_safe(address, &low, &high)) 406 if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
402 return 0; 407 return 0;
403 408
404 if (!(high & MASK_VALID_HI)) { 409 if (!(high & MASK_VALID_HI)) {
@@ -462,12 +467,11 @@ out_free:
462 return err; 467 return err;
463} 468}
464 469
465static __cpuinit long local_allocate_threshold_blocks(void *_bank) 470static __cpuinit long
471local_allocate_threshold_blocks(int cpu, unsigned int bank)
466{ 472{
467 unsigned int *bank = _bank; 473 return allocate_threshold_blocks(cpu, bank, 0,
468 474 MSR_IA32_MC0_MISC + bank * 4);
469 return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
470 MSR_IA32_MC0_MISC + *bank * 4);
471} 475}
472 476
473/* symlinks sibling shared banks to first core. first core owns dir/files. */ 477/* symlinks sibling shared banks to first core. first core owns dir/files. */
@@ -481,7 +485,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
481 485
482#ifdef CONFIG_SMP 486#ifdef CONFIG_SMP
483 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 487 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
484 i = cpumask_first(&per_cpu(cpu_core_map, cpu)); 488 i = cpumask_first(cpu_core_mask(cpu));
485 489
486 /* first core not up yet */ 490 /* first core not up yet */
487 if (cpu_data(i).cpu_core_id) 491 if (cpu_data(i).cpu_core_id)
@@ -501,7 +505,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
501 if (err) 505 if (err)
502 goto out; 506 goto out;
503 507
504 cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu)); 508 cpumask_copy(b->cpus, cpu_core_mask(cpu));
505 per_cpu(threshold_banks, cpu)[bank] = b; 509 per_cpu(threshold_banks, cpu)[bank] = b;
506 goto out; 510 goto out;
507 } 511 }
@@ -525,12 +529,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
525#ifndef CONFIG_SMP 529#ifndef CONFIG_SMP
526 cpumask_setall(b->cpus); 530 cpumask_setall(b->cpus);
527#else 531#else
528 cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu)); 532 cpumask_copy(b->cpus, cpu_core_mask(cpu));
529#endif 533#endif
530 534
531 per_cpu(threshold_banks, cpu)[bank] = b; 535 per_cpu(threshold_banks, cpu)[bank] = b;
532 536
533 err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); 537 err = local_allocate_threshold_blocks(cpu, bank);
534 if (err) 538 if (err)
535 goto out_free; 539 goto out_free;
536 540
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index aa5e287c98e0..d6b72df89d69 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
1/* 1/*
2 * Intel specific MCE features. 2 * Intel specific MCE features.
3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> 3 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
4 * Copyright (C) 2008, 2009 Intel Corporation
5 * Author: Andi Kleen
4 */ 6 */
5 7
6#include <linux/init.h> 8#include <linux/init.h>
@@ -13,6 +15,7 @@
13#include <asm/hw_irq.h> 15#include <asm/hw_irq.h>
14#include <asm/idle.h> 16#include <asm/idle.h>
15#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18#include <asm/apic.h>
16 19
17asmlinkage void smp_thermal_interrupt(void) 20asmlinkage void smp_thermal_interrupt(void)
18{ 21{
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
25 28
26 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 29 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
27 if (therm_throt_process(msr_val & 1)) 30 if (therm_throt_process(msr_val & 1))
28 mce_log_therm_throt_event(smp_processor_id(), msr_val); 31 mce_log_therm_throt_event(msr_val);
29 32
30 inc_irq_stat(irq_thermal_count); 33 inc_irq_stat(irq_thermal_count);
31 irq_exit(); 34 irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
85 return; 88 return;
86} 89}
87 90
91/*
92 * Support for Intel Correct Machine Check Interrupts. This allows
93 * the CPU to raise an interrupt when a corrected machine check happened.
94 * Normally we pick those up using a regular polling timer.
95 * Also supports reliable discovery of shared banks.
96 */
97
98static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
99
100/*
101 * cmci_discover_lock protects against parallel discovery attempts
102 * which could race against each other.
103 */
104static DEFINE_SPINLOCK(cmci_discover_lock);
105
106#define CMCI_THRESHOLD 1
107
108static int cmci_supported(int *banks)
109{
110 u64 cap;
111
112 /*
113 * Vendor check is not strictly needed, but the initial
114 * initialization is vendor keyed and this
115 * makes sure none of the backdoors are entered otherwise.
116 */
117 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
118 return 0;
119 if (!cpu_has_apic || lapic_get_maxlvt() < 6)
120 return 0;
121 rdmsrl(MSR_IA32_MCG_CAP, cap);
122 *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
123 return !!(cap & MCG_CMCI_P);
124}
125
126/*
127 * The interrupt handler. This is called on every event.
128 * Just call the poller directly to log any events.
129 * This could in theory increase the threshold under high load,
130 * but doesn't for now.
131 */
132static void intel_threshold_interrupt(void)
133{
134 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
135 mce_notify_user();
136}
137
138static void print_update(char *type, int *hdr, int num)
139{
140 if (*hdr == 0)
141 printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
142 *hdr = 1;
143 printk(KERN_CONT " %s:%d", type, num);
144}
145
146/*
147 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
148 * on this CPU. Use the algorithm recommended in the SDM to discover shared
149 * banks.
150 */
151static void cmci_discover(int banks, int boot)
152{
153 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
154 int hdr = 0;
155 int i;
156
157 spin_lock(&cmci_discover_lock);
158 for (i = 0; i < banks; i++) {
159 u64 val;
160
161 if (test_bit(i, owned))
162 continue;
163
164 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
165
166 /* Already owned by someone else? */
167 if (val & CMCI_EN) {
168 if (test_and_clear_bit(i, owned) || boot)
169 print_update("SHD", &hdr, i);
170 __clear_bit(i, __get_cpu_var(mce_poll_banks));
171 continue;
172 }
173
174 val |= CMCI_EN | CMCI_THRESHOLD;
175 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
176 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
177
178 /* Did the enable bit stick? -- the bank supports CMCI */
179 if (val & CMCI_EN) {
180 if (!test_and_set_bit(i, owned) || boot)
181 print_update("CMCI", &hdr, i);
182 __clear_bit(i, __get_cpu_var(mce_poll_banks));
183 } else {
184 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
185 }
186 }
187 spin_unlock(&cmci_discover_lock);
188 if (hdr)
189 printk(KERN_CONT "\n");
190}
191
192/*
193 * Just in case we missed an event during initialization check
194 * all the CMCI owned banks.
195 */
196void cmci_recheck(void)
197{
198 unsigned long flags;
199 int banks;
200
201 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
202 return;
203 local_irq_save(flags);
204 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
205 local_irq_restore(flags);
206}
207
208/*
209 * Disable CMCI on this CPU for all banks it owns when it goes down.
210 * This allows other CPUs to claim the banks on rediscovery.
211 */
212void cmci_clear(void)
213{
214 int i;
215 int banks;
216 u64 val;
217
218 if (!cmci_supported(&banks))
219 return;
220 spin_lock(&cmci_discover_lock);
221 for (i = 0; i < banks; i++) {
222 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
223 continue;
224 /* Disable CMCI */
225 rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
226 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
227 wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
228 __clear_bit(i, __get_cpu_var(mce_banks_owned));
229 }
230 spin_unlock(&cmci_discover_lock);
231}
232
233/*
234 * After a CPU went down cycle through all the others and rediscover
235 * Must run in process context.
236 */
237void cmci_rediscover(int dying)
238{
239 int banks;
240 int cpu;
241 cpumask_var_t old;
242
243 if (!cmci_supported(&banks))
244 return;
245 if (!alloc_cpumask_var(&old, GFP_KERNEL))
246 return;
247 cpumask_copy(old, &current->cpus_allowed);
248
249 for_each_online_cpu (cpu) {
250 if (cpu == dying)
251 continue;
252 if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
253 continue;
254 /* Recheck banks in case CPUs don't all have the same */
255 if (cmci_supported(&banks))
256 cmci_discover(banks, 0);
257 }
258
259 set_cpus_allowed_ptr(current, old);
260 free_cpumask_var(old);
261}
262
263/*
264 * Reenable CMCI on this CPU in case a CPU down failed.
265 */
266void cmci_reenable(void)
267{
268 int banks;
269 if (cmci_supported(&banks))
270 cmci_discover(banks, 0);
271}
272
273static void intel_init_cmci(void)
274{
275 int banks;
276
277 if (!cmci_supported(&banks))
278 return;
279
280 mce_threshold_vector = intel_threshold_interrupt;
281 cmci_discover(banks, 1);
282 /*
283 * For CPU #0 this runs with still disabled APIC, but that's
284 * ok because only the vector is set up. We still do another
285 * check for the banks later for CPU #0 just to make sure
286 * to not miss any events.
287 */
288 apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
289 cmci_recheck();
290}
291
88void mce_intel_feature_init(struct cpuinfo_x86 *c) 292void mce_intel_feature_init(struct cpuinfo_x86 *c)
89{ 293{
90 intel_init_thermal(c); 294 intel_init_thermal(c);
295 intel_init_cmci();
91} 296}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 000000000000..23ee9e730f78
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
1/*
2 * Common corrected MCE threshold handler code:
3 */
4#include <linux/interrupt.h>
5#include <linux/kernel.h>
6
7#include <asm/irq_vectors.h>
8#include <asm/apic.h>
9#include <asm/idle.h>
10#include <asm/mce.h>
11
12static void default_threshold_interrupt(void)
13{
14 printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
15 THRESHOLD_APIC_VECTOR);
16}
17
18void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19
20asmlinkage void mce_threshold_interrupt(void)
21{
22 exit_idle();
23 irq_enter();
24 inc_irq_stat(irq_threshold_count);
25 mce_threshold_vector();
26 irq_exit();
27 /* Ack only at the end to avoid potential reentry */
28 ack_APIC_irq();
29}
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index 191fc0533649..f4361b56f8e9 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
1obj-y := main.o if.o generic.o state.o 1obj-y := main.o if.o generic.o state.o cleanup.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3 3
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
new file mode 100644
index 000000000000..ce0fe4b5c04f
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -0,0 +1,1101 @@
1/* MTRR (Memory Type Range Register) cleanup
2
3 Copyright (C) 2009 Yinghai Lu
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public
7 License as published by the Free Software Foundation; either
8 version 2 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with this library; if not, write to the Free
17 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18*/
19
20#include <linux/module.h>
21#include <linux/init.h>
22#include <linux/pci.h>
23#include <linux/smp.h>
24#include <linux/cpu.h>
25#include <linux/mutex.h>
26#include <linux/sort.h>
27
28#include <asm/e820.h>
29#include <asm/mtrr.h>
30#include <asm/uaccess.h>
31#include <asm/processor.h>
32#include <asm/msr.h>
33#include <asm/kvm_para.h>
34#include "mtrr.h"
35
36/* should be related to MTRR_VAR_RANGES nums */
37#define RANGE_NUM 256
38
39struct res_range {
40 unsigned long start;
41 unsigned long end;
42};
43
44static int __init
45add_range(struct res_range *range, int nr_range, unsigned long start,
46 unsigned long end)
47{
48 /* out of slots */
49 if (nr_range >= RANGE_NUM)
50 return nr_range;
51
52 range[nr_range].start = start;
53 range[nr_range].end = end;
54
55 nr_range++;
56
57 return nr_range;
58}
59
60static int __init
61add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
62 unsigned long end)
63{
64 int i;
65
66 /* try to merge it with old one */
67 for (i = 0; i < nr_range; i++) {
68 unsigned long final_start, final_end;
69 unsigned long common_start, common_end;
70
71 if (!range[i].end)
72 continue;
73
74 common_start = max(range[i].start, start);
75 common_end = min(range[i].end, end);
76 if (common_start > common_end + 1)
77 continue;
78
79 final_start = min(range[i].start, start);
80 final_end = max(range[i].end, end);
81
82 range[i].start = final_start;
83 range[i].end = final_end;
84 return nr_range;
85 }
86
87 /* need to add that */
88 return add_range(range, nr_range, start, end);
89}
90
91static void __init
92subtract_range(struct res_range *range, unsigned long start, unsigned long end)
93{
94 int i, j;
95
96 for (j = 0; j < RANGE_NUM; j++) {
97 if (!range[j].end)
98 continue;
99
100 if (start <= range[j].start && end >= range[j].end) {
101 range[j].start = 0;
102 range[j].end = 0;
103 continue;
104 }
105
106 if (start <= range[j].start && end < range[j].end &&
107 range[j].start < end + 1) {
108 range[j].start = end + 1;
109 continue;
110 }
111
112
113 if (start > range[j].start && end >= range[j].end &&
114 range[j].end > start - 1) {
115 range[j].end = start - 1;
116 continue;
117 }
118
119 if (start > range[j].start && end < range[j].end) {
120 /* find the new spare */
121 for (i = 0; i < RANGE_NUM; i++) {
122 if (range[i].end == 0)
123 break;
124 }
125 if (i < RANGE_NUM) {
126 range[i].end = range[j].end;
127 range[i].start = end + 1;
128 } else {
129 printk(KERN_ERR "run of slot in ranges\n");
130 }
131 range[j].end = start - 1;
132 continue;
133 }
134 }
135}
136
137static int __init cmp_range(const void *x1, const void *x2)
138{
139 const struct res_range *r1 = x1;
140 const struct res_range *r2 = x2;
141 long start1, start2;
142
143 start1 = r1->start;
144 start2 = r2->start;
145
146 return start1 - start2;
147}
148
149struct var_mtrr_range_state {
150 unsigned long base_pfn;
151 unsigned long size_pfn;
152 mtrr_type type;
153};
154
155static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
156static int __initdata debug_print;
157
158static int __init
159x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
160 unsigned long extra_remove_base,
161 unsigned long extra_remove_size)
162{
163 unsigned long base, size;
164 mtrr_type type;
165 int i;
166
167 for (i = 0; i < num_var_ranges; i++) {
168 type = range_state[i].type;
169 if (type != MTRR_TYPE_WRBACK)
170 continue;
171 base = range_state[i].base_pfn;
172 size = range_state[i].size_pfn;
173 nr_range = add_range_with_merge(range, nr_range, base,
174 base + size - 1);
175 }
176 if (debug_print) {
177 printk(KERN_DEBUG "After WB checking\n");
178 for (i = 0; i < nr_range; i++)
179 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
180 range[i].start, range[i].end + 1);
181 }
182
183 /* take out UC ranges */
184 for (i = 0; i < num_var_ranges; i++) {
185 type = range_state[i].type;
186 if (type != MTRR_TYPE_UNCACHABLE &&
187 type != MTRR_TYPE_WRPROT)
188 continue;
189 size = range_state[i].size_pfn;
190 if (!size)
191 continue;
192 base = range_state[i].base_pfn;
193 if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
194 (mtrr_state.enabled & 1)) {
195 /* Var MTRR contains UC entry below 1M? Skip it: */
196 printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d "
197 "contains strange UC entry under 1M, check "
198 "with your system vendor!\n", i);
199 if (base + size <= (1<<(20-PAGE_SHIFT)))
200 continue;
201 size -= (1<<(20-PAGE_SHIFT)) - base;
202 base = 1<<(20-PAGE_SHIFT);
203 }
204 subtract_range(range, base, base + size - 1);
205 }
206 if (extra_remove_size)
207 subtract_range(range, extra_remove_base,
208 extra_remove_base + extra_remove_size - 1);
209
210 /* get new range num */
211 nr_range = 0;
212 for (i = 0; i < RANGE_NUM; i++) {
213 if (!range[i].end)
214 continue;
215 nr_range++;
216 }
217 if (debug_print) {
218 printk(KERN_DEBUG "After UC checking\n");
219 for (i = 0; i < nr_range; i++)
220 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
221 range[i].start, range[i].end + 1);
222 }
223
224 /* sort the ranges */
225 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
226 if (debug_print) {
227 printk(KERN_DEBUG "After sorting\n");
228 for (i = 0; i < nr_range; i++)
229 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
230 range[i].start, range[i].end + 1);
231 }
232
233 /* clear those is not used */
234 for (i = nr_range; i < RANGE_NUM; i++)
235 memset(&range[i], 0, sizeof(range[i]));
236
237 return nr_range;
238}
239
240static struct res_range __initdata range[RANGE_NUM];
241static int __initdata nr_range;
242
243#ifdef CONFIG_MTRR_SANITIZER
244
245static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
246{
247 unsigned long sum;
248 int i;
249
250 sum = 0;
251 for (i = 0; i < nr_range; i++)
252 sum += range[i].end + 1 - range[i].start;
253
254 return sum;
255}
256
257static int enable_mtrr_cleanup __initdata =
258 CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
259
260static int __init disable_mtrr_cleanup_setup(char *str)
261{
262 enable_mtrr_cleanup = 0;
263 return 0;
264}
265early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
266
267static int __init enable_mtrr_cleanup_setup(char *str)
268{
269 enable_mtrr_cleanup = 1;
270 return 0;
271}
272early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
273
274static int __init mtrr_cleanup_debug_setup(char *str)
275{
276 debug_print = 1;
277 return 0;
278}
279early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
280
281struct var_mtrr_state {
282 unsigned long range_startk;
283 unsigned long range_sizek;
284 unsigned long chunk_sizek;
285 unsigned long gran_sizek;
286 unsigned int reg;
287};
288
289static void __init
290set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
291 unsigned char type, unsigned int address_bits)
292{
293 u32 base_lo, base_hi, mask_lo, mask_hi;
294 u64 base, mask;
295
296 if (!sizek) {
297 fill_mtrr_var_range(reg, 0, 0, 0, 0);
298 return;
299 }
300
301 mask = (1ULL << address_bits) - 1;
302 mask &= ~((((u64)sizek) << 10) - 1);
303
304 base = ((u64)basek) << 10;
305
306 base |= type;
307 mask |= 0x800;
308
309 base_lo = base & ((1ULL<<32) - 1);
310 base_hi = base >> 32;
311
312 mask_lo = mask & ((1ULL<<32) - 1);
313 mask_hi = mask >> 32;
314
315 fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
316}
317
318static void __init
319save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
320 unsigned char type)
321{
322 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
323 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
324 range_state[reg].type = type;
325}
326
327static void __init
328set_var_mtrr_all(unsigned int address_bits)
329{
330 unsigned long basek, sizek;
331 unsigned char type;
332 unsigned int reg;
333
334 for (reg = 0; reg < num_var_ranges; reg++) {
335 basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
336 sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
337 type = range_state[reg].type;
338
339 set_var_mtrr(reg, basek, sizek, type, address_bits);
340 }
341}
342
343static unsigned long to_size_factor(unsigned long sizek, char *factorp)
344{
345 char factor;
346 unsigned long base = sizek;
347
348 if (base & ((1<<10) - 1)) {
349 /* not MB alignment */
350 factor = 'K';
351 } else if (base & ((1<<20) - 1)) {
352 factor = 'M';
353 base >>= 10;
354 } else {
355 factor = 'G';
356 base >>= 20;
357 }
358
359 *factorp = factor;
360
361 return base;
362}
363
364static unsigned int __init
365range_to_mtrr(unsigned int reg, unsigned long range_startk,
366 unsigned long range_sizek, unsigned char type)
367{
368 if (!range_sizek || (reg >= num_var_ranges))
369 return reg;
370
371 while (range_sizek) {
372 unsigned long max_align, align;
373 unsigned long sizek;
374
375 /* Compute the maximum size I can make a range */
376 if (range_startk)
377 max_align = ffs(range_startk) - 1;
378 else
379 max_align = 32;
380 align = fls(range_sizek) - 1;
381 if (align > max_align)
382 align = max_align;
383
384 sizek = 1 << align;
385 if (debug_print) {
386 char start_factor = 'K', size_factor = 'K';
387 unsigned long start_base, size_base;
388
389 start_base = to_size_factor(range_startk,
390 &start_factor),
391 size_base = to_size_factor(sizek, &size_factor),
392
393 printk(KERN_DEBUG "Setting variable MTRR %d, "
394 "base: %ld%cB, range: %ld%cB, type %s\n",
395 reg, start_base, start_factor,
396 size_base, size_factor,
397 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
398 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
399 );
400 }
401 save_var_mtrr(reg++, range_startk, sizek, type);
402 range_startk += sizek;
403 range_sizek -= sizek;
404 if (reg >= num_var_ranges)
405 break;
406 }
407 return reg;
408}
409
410static unsigned __init
411range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
412 unsigned long sizek)
413{
414 unsigned long hole_basek, hole_sizek;
415 unsigned long second_basek, second_sizek;
416 unsigned long range0_basek, range0_sizek;
417 unsigned long range_basek, range_sizek;
418 unsigned long chunk_sizek;
419 unsigned long gran_sizek;
420
421 hole_basek = 0;
422 hole_sizek = 0;
423 second_basek = 0;
424 second_sizek = 0;
425 chunk_sizek = state->chunk_sizek;
426 gran_sizek = state->gran_sizek;
427
428 /* align with gran size, prevent small block used up MTRRs */
429 range_basek = ALIGN(state->range_startk, gran_sizek);
430 if ((range_basek > basek) && basek)
431 return second_sizek;
432 state->range_sizek -= (range_basek - state->range_startk);
433 range_sizek = ALIGN(state->range_sizek, gran_sizek);
434
435 while (range_sizek > state->range_sizek) {
436 range_sizek -= gran_sizek;
437 if (!range_sizek)
438 return 0;
439 }
440 state->range_sizek = range_sizek;
441
442 /* try to append some small hole */
443 range0_basek = state->range_startk;
444 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
445
446 /* no increase */
447 if (range0_sizek == state->range_sizek) {
448 if (debug_print)
449 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
450 range0_basek<<10,
451 (range0_basek + state->range_sizek)<<10);
452 state->reg = range_to_mtrr(state->reg, range0_basek,
453 state->range_sizek, MTRR_TYPE_WRBACK);
454 return 0;
455 }
456
457 /* only cut back, when it is not the last */
458 if (sizek) {
459 while (range0_basek + range0_sizek > (basek + sizek)) {
460 if (range0_sizek >= chunk_sizek)
461 range0_sizek -= chunk_sizek;
462 else
463 range0_sizek = 0;
464
465 if (!range0_sizek)
466 break;
467 }
468 }
469
470second_try:
471 range_basek = range0_basek + range0_sizek;
472
473 /* one hole in the middle */
474 if (range_basek > basek && range_basek <= (basek + sizek))
475 second_sizek = range_basek - basek;
476
477 if (range0_sizek > state->range_sizek) {
478
479 /* one hole in middle or at end */
480 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
481
482 /* hole size should be less than half of range0 size */
483 if (hole_sizek >= (range0_sizek >> 1) &&
484 range0_sizek >= chunk_sizek) {
485 range0_sizek -= chunk_sizek;
486 second_sizek = 0;
487 hole_sizek = 0;
488
489 goto second_try;
490 }
491 }
492
493 if (range0_sizek) {
494 if (debug_print)
495 printk(KERN_DEBUG "range0: %016lx - %016lx\n",
496 range0_basek<<10,
497 (range0_basek + range0_sizek)<<10);
498 state->reg = range_to_mtrr(state->reg, range0_basek,
499 range0_sizek, MTRR_TYPE_WRBACK);
500 }
501
502 if (range0_sizek < state->range_sizek) {
503 /* need to handle left over */
504 range_sizek = state->range_sizek - range0_sizek;
505
506 if (debug_print)
507 printk(KERN_DEBUG "range: %016lx - %016lx\n",
508 range_basek<<10,
509 (range_basek + range_sizek)<<10);
510 state->reg = range_to_mtrr(state->reg, range_basek,
511 range_sizek, MTRR_TYPE_WRBACK);
512 }
513
514 if (hole_sizek) {
515 hole_basek = range_basek - hole_sizek - second_sizek;
516 if (debug_print)
517 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
518 hole_basek<<10,
519 (hole_basek + hole_sizek)<<10);
520 state->reg = range_to_mtrr(state->reg, hole_basek,
521 hole_sizek, MTRR_TYPE_UNCACHABLE);
522 }
523
524 return second_sizek;
525}
526
527static void __init
528set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
529 unsigned long size_pfn)
530{
531 unsigned long basek, sizek;
532 unsigned long second_sizek = 0;
533
534 if (state->reg >= num_var_ranges)
535 return;
536
537 basek = base_pfn << (PAGE_SHIFT - 10);
538 sizek = size_pfn << (PAGE_SHIFT - 10);
539
540 /* See if I can merge with the last range */
541 if ((basek <= 1024) ||
542 (state->range_startk + state->range_sizek == basek)) {
543 unsigned long endk = basek + sizek;
544 state->range_sizek = endk - state->range_startk;
545 return;
546 }
547 /* Write the range mtrrs */
548 if (state->range_sizek != 0)
549 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
550
551 /* Allocate an msr */
552 state->range_startk = basek + second_sizek;
553 state->range_sizek = sizek - second_sizek;
554}
555
556/* mininum size of mtrr block that can take hole */
557static u64 mtrr_chunk_size __initdata = (256ULL<<20);
558
559static int __init parse_mtrr_chunk_size_opt(char *p)
560{
561 if (!p)
562 return -EINVAL;
563 mtrr_chunk_size = memparse(p, &p);
564 return 0;
565}
566early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
567
568/* granity of mtrr of block */
569static u64 mtrr_gran_size __initdata;
570
571static int __init parse_mtrr_gran_size_opt(char *p)
572{
573 if (!p)
574 return -EINVAL;
575 mtrr_gran_size = memparse(p, &p);
576 return 0;
577}
578early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
579
580static int nr_mtrr_spare_reg __initdata =
581 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
582
583static int __init parse_mtrr_spare_reg(char *arg)
584{
585 if (arg)
586 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
587 return 0;
588}
589
590early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
591
592static int __init
593x86_setup_var_mtrrs(struct res_range *range, int nr_range,
594 u64 chunk_size, u64 gran_size)
595{
596 struct var_mtrr_state var_state;
597 int i;
598 int num_reg;
599
600 var_state.range_startk = 0;
601 var_state.range_sizek = 0;
602 var_state.reg = 0;
603 var_state.chunk_sizek = chunk_size >> 10;
604 var_state.gran_sizek = gran_size >> 10;
605
606 memset(range_state, 0, sizeof(range_state));
607
608 /* Write the range etc */
609 for (i = 0; i < nr_range; i++)
610 set_var_mtrr_range(&var_state, range[i].start,
611 range[i].end - range[i].start + 1);
612
613 /* Write the last range */
614 if (var_state.range_sizek != 0)
615 range_to_mtrr_with_hole(&var_state, 0, 0);
616
617 num_reg = var_state.reg;
618 /* Clear out the extra MTRR's */
619 while (var_state.reg < num_var_ranges) {
620 save_var_mtrr(var_state.reg, 0, 0, 0);
621 var_state.reg++;
622 }
623
624 return num_reg;
625}
626
627struct mtrr_cleanup_result {
628 unsigned long gran_sizek;
629 unsigned long chunk_sizek;
630 unsigned long lose_cover_sizek;
631 unsigned int num_reg;
632 int bad;
633};
634
635/*
636 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
637 * chunk size: gran_size, ..., 2G
638 * so we need (1+16)*8
639 */
640#define NUM_RESULT 136
641#define PSHIFT (PAGE_SHIFT - 10)
642
643static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
644static unsigned long __initdata min_loss_pfn[RANGE_NUM];
645
646static void __init print_out_mtrr_range_state(void)
647{
648 int i;
649 char start_factor = 'K', size_factor = 'K';
650 unsigned long start_base, size_base;
651 mtrr_type type;
652
653 for (i = 0; i < num_var_ranges; i++) {
654
655 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
656 if (!size_base)
657 continue;
658
659 size_base = to_size_factor(size_base, &size_factor),
660 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
661 start_base = to_size_factor(start_base, &start_factor),
662 type = range_state[i].type;
663
664 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
665 i, start_base, start_factor,
666 size_base, size_factor,
667 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
668 ((type == MTRR_TYPE_WRPROT) ? "WP" :
669 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
670 );
671 }
672}
673
674static int __init mtrr_need_cleanup(void)
675{
676 int i;
677 mtrr_type type;
678 unsigned long size;
679 /* extra one for all 0 */
680 int num[MTRR_NUM_TYPES + 1];
681
682 /* check entries number */
683 memset(num, 0, sizeof(num));
684 for (i = 0; i < num_var_ranges; i++) {
685 type = range_state[i].type;
686 size = range_state[i].size_pfn;
687 if (type >= MTRR_NUM_TYPES)
688 continue;
689 if (!size)
690 type = MTRR_NUM_TYPES;
691 if (type == MTRR_TYPE_WRPROT)
692 type = MTRR_TYPE_UNCACHABLE;
693 num[type]++;
694 }
695
696 /* check if we got UC entries */
697 if (!num[MTRR_TYPE_UNCACHABLE])
698 return 0;
699
700 /* check if we only had WB and UC */
701 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
702 num_var_ranges - num[MTRR_NUM_TYPES])
703 return 0;
704
705 return 1;
706}
707
708static unsigned long __initdata range_sums;
709static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
710 unsigned long extra_remove_base,
711 unsigned long extra_remove_size,
712 int i)
713{
714 int num_reg;
715 static struct res_range range_new[RANGE_NUM];
716 static int nr_range_new;
717 unsigned long range_sums_new;
718
719 /* convert ranges to var ranges state */
720 num_reg = x86_setup_var_mtrrs(range, nr_range,
721 chunk_size, gran_size);
722
723 /* we got new setting in range_state, check it */
724 memset(range_new, 0, sizeof(range_new));
725 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
726 extra_remove_base, extra_remove_size);
727 range_sums_new = sum_ranges(range_new, nr_range_new);
728
729 result[i].chunk_sizek = chunk_size >> 10;
730 result[i].gran_sizek = gran_size >> 10;
731 result[i].num_reg = num_reg;
732 if (range_sums < range_sums_new) {
733 result[i].lose_cover_sizek =
734 (range_sums_new - range_sums) << PSHIFT;
735 result[i].bad = 1;
736 } else
737 result[i].lose_cover_sizek =
738 (range_sums - range_sums_new) << PSHIFT;
739
740 /* double check it */
741 if (!result[i].bad && !result[i].lose_cover_sizek) {
742 if (nr_range_new != nr_range ||
743 memcmp(range, range_new, sizeof(range)))
744 result[i].bad = 1;
745 }
746
747 if (!result[i].bad && (range_sums - range_sums_new <
748 min_loss_pfn[num_reg])) {
749 min_loss_pfn[num_reg] =
750 range_sums - range_sums_new;
751 }
752}
753
754static void __init mtrr_print_out_one_result(int i)
755{
756 char gran_factor, chunk_factor, lose_factor;
757 unsigned long gran_base, chunk_base, lose_base;
758
759 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
760 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
761 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
762 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
763 result[i].bad ? "*BAD*" : " ",
764 gran_base, gran_factor, chunk_base, chunk_factor);
765 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
766 result[i].num_reg, result[i].bad ? "-" : "",
767 lose_base, lose_factor);
768}
769
770static int __init mtrr_search_optimal_index(void)
771{
772 int i;
773 int num_reg_good;
774 int index_good;
775
776 if (nr_mtrr_spare_reg >= num_var_ranges)
777 nr_mtrr_spare_reg = num_var_ranges - 1;
778 num_reg_good = -1;
779 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
780 if (!min_loss_pfn[i])
781 num_reg_good = i;
782 }
783
784 index_good = -1;
785 if (num_reg_good != -1) {
786 for (i = 0; i < NUM_RESULT; i++) {
787 if (!result[i].bad &&
788 result[i].num_reg == num_reg_good &&
789 !result[i].lose_cover_sizek) {
790 index_good = i;
791 break;
792 }
793 }
794 }
795
796 return index_good;
797}
798
799
800int __init mtrr_cleanup(unsigned address_bits)
801{
802 unsigned long extra_remove_base, extra_remove_size;
803 unsigned long base, size, def, dummy;
804 mtrr_type type;
805 u64 chunk_size, gran_size;
806 int index_good;
807 int i;
808
809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
810 return 0;
811 rdmsr(MTRRdefType_MSR, def, dummy);
812 def &= 0xff;
813 if (def != MTRR_TYPE_UNCACHABLE)
814 return 0;
815
816 /* get it and store it aside */
817 memset(range_state, 0, sizeof(range_state));
818 for (i = 0; i < num_var_ranges; i++) {
819 mtrr_if->get(i, &base, &size, &type);
820 range_state[i].base_pfn = base;
821 range_state[i].size_pfn = size;
822 range_state[i].type = type;
823 }
824
825 /* check if we need handle it and can handle it */
826 if (!mtrr_need_cleanup())
827 return 0;
828
829 /* print original var MTRRs at first, for debugging: */
830 printk(KERN_DEBUG "original variable MTRRs\n");
831 print_out_mtrr_range_state();
832
833 memset(range, 0, sizeof(range));
834 extra_remove_size = 0;
835 extra_remove_base = 1 << (32 - PAGE_SHIFT);
836 if (mtrr_tom2)
837 extra_remove_size =
838 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
839 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
840 extra_remove_size);
841 /*
842 * [0, 1M) should always be coverred by var mtrr with WB
843 * and fixed mtrrs should take effective before var mtrr for it
844 */
845 nr_range = add_range_with_merge(range, nr_range, 0,
846 (1ULL<<(20 - PAGE_SHIFT)) - 1);
847 /* sort the ranges */
848 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
849
850 range_sums = sum_ranges(range, nr_range);
851 printk(KERN_INFO "total RAM coverred: %ldM\n",
852 range_sums >> (20 - PAGE_SHIFT));
853
854 if (mtrr_chunk_size && mtrr_gran_size) {
855 i = 0;
856 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
857 extra_remove_base, extra_remove_size, i);
858
859 mtrr_print_out_one_result(i);
860
861 if (!result[i].bad) {
862 set_var_mtrr_all(address_bits);
863 printk(KERN_DEBUG "New variable MTRRs\n");
864 print_out_mtrr_range_state();
865 return 1;
866 }
867 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
868 "will find optimal one\n");
869 }
870
871 i = 0;
872 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
873 memset(result, 0, sizeof(result));
874 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
875
876 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
877 chunk_size <<= 1) {
878
879 if (i >= NUM_RESULT)
880 continue;
881
882 mtrr_calc_range_state(chunk_size, gran_size,
883 extra_remove_base, extra_remove_size, i);
884 if (debug_print) {
885 mtrr_print_out_one_result(i);
886 printk(KERN_INFO "\n");
887 }
888
889 i++;
890 }
891 }
892
893 /* try to find the optimal index */
894 index_good = mtrr_search_optimal_index();
895
896 if (index_good != -1) {
897 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
898 i = index_good;
899 mtrr_print_out_one_result(i);
900
901 /* convert ranges to var ranges state */
902 chunk_size = result[i].chunk_sizek;
903 chunk_size <<= 10;
904 gran_size = result[i].gran_sizek;
905 gran_size <<= 10;
906 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
907 set_var_mtrr_all(address_bits);
908 printk(KERN_DEBUG "New variable MTRRs\n");
909 print_out_mtrr_range_state();
910 return 1;
911 } else {
912 /* print out all */
913 for (i = 0; i < NUM_RESULT; i++)
914 mtrr_print_out_one_result(i);
915 }
916
917 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
918 printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
919
920 return 0;
921}
922#else
923int __init mtrr_cleanup(unsigned address_bits)
924{
925 return 0;
926}
927#endif
928
929static int disable_mtrr_trim;
930
931static int __init disable_mtrr_trim_setup(char *str)
932{
933 disable_mtrr_trim = 1;
934 return 0;
935}
936early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
937
938/*
939 * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
940 * for memory >4GB. Check for that here.
941 * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
942 * apply to are wrong, but so far we don't know of any such case in the wild.
943 */
944#define Tom2Enabled (1U << 21)
945#define Tom2ForceMemTypeWB (1U << 22)
946
947int __init amd_special_default_mtrr(void)
948{
949 u32 l, h;
950
951 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
952 return 0;
953 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
954 return 0;
955 /* In case some hypervisor doesn't pass SYSCFG through */
956 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
957 return 0;
958 /*
959 * Memory between 4GB and top of mem is forced WB by this magic bit.
960 * Reserved before K8RevF, but should be zero there.
961 */
962 if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
963 (Tom2Enabled | Tom2ForceMemTypeWB))
964 return 1;
965 return 0;
966}
967
968static u64 __init real_trim_memory(unsigned long start_pfn,
969 unsigned long limit_pfn)
970{
971 u64 trim_start, trim_size;
972 trim_start = start_pfn;
973 trim_start <<= PAGE_SHIFT;
974 trim_size = limit_pfn;
975 trim_size <<= PAGE_SHIFT;
976 trim_size -= trim_start;
977
978 return e820_update_range(trim_start, trim_size, E820_RAM,
979 E820_RESERVED);
980}
981/**
982 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
983 * @end_pfn: ending page frame number
984 *
985 * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
986 * memory configurations. This routine checks that the highest MTRR matches
987 * the end of memory, to make sure the MTRRs having a write back type cover
988 * all of the memory the kernel is intending to use. If not, it'll trim any
989 * memory off the end by adjusting end_pfn, removing it from the kernel's
990 * allocation pools, warning the user with an obnoxious message.
991 */
992int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
993{
994 unsigned long i, base, size, highest_pfn = 0, def, dummy;
995 mtrr_type type;
996 u64 total_trim_size;
997
998 /* extra one for all 0 */
999 int num[MTRR_NUM_TYPES + 1];
1000 /*
1001 * Make sure we only trim uncachable memory on machines that
1002 * support the Intel MTRR architecture:
1003 */
1004 if (!is_cpu(INTEL) || disable_mtrr_trim)
1005 return 0;
1006 rdmsr(MTRRdefType_MSR, def, dummy);
1007 def &= 0xff;
1008 if (def != MTRR_TYPE_UNCACHABLE)
1009 return 0;
1010
1011 /* get it and store it aside */
1012 memset(range_state, 0, sizeof(range_state));
1013 for (i = 0; i < num_var_ranges; i++) {
1014 mtrr_if->get(i, &base, &size, &type);
1015 range_state[i].base_pfn = base;
1016 range_state[i].size_pfn = size;
1017 range_state[i].type = type;
1018 }
1019
1020 /* Find highest cached pfn */
1021 for (i = 0; i < num_var_ranges; i++) {
1022 type = range_state[i].type;
1023 if (type != MTRR_TYPE_WRBACK)
1024 continue;
1025 base = range_state[i].base_pfn;
1026 size = range_state[i].size_pfn;
1027 if (highest_pfn < base + size)
1028 highest_pfn = base + size;
1029 }
1030
1031 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
1032 if (!highest_pfn) {
1033 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
1034 return 0;
1035 }
1036
1037 /* check entries number */
1038 memset(num, 0, sizeof(num));
1039 for (i = 0; i < num_var_ranges; i++) {
1040 type = range_state[i].type;
1041 if (type >= MTRR_NUM_TYPES)
1042 continue;
1043 size = range_state[i].size_pfn;
1044 if (!size)
1045 type = MTRR_NUM_TYPES;
1046 num[type]++;
1047 }
1048
1049 /* no entry for WB? */
1050 if (!num[MTRR_TYPE_WRBACK])
1051 return 0;
1052
1053 /* check if we only had WB and UC */
1054 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1055 num_var_ranges - num[MTRR_NUM_TYPES])
1056 return 0;
1057
1058 memset(range, 0, sizeof(range));
1059 nr_range = 0;
1060 if (mtrr_tom2) {
1061 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1062 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
1063 if (highest_pfn < range[nr_range].end + 1)
1064 highest_pfn = range[nr_range].end + 1;
1065 nr_range++;
1066 }
1067 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1068
1069 total_trim_size = 0;
1070 /* check the head */
1071 if (range[0].start)
1072 total_trim_size += real_trim_memory(0, range[0].start);
1073 /* check the holes */
1074 for (i = 0; i < nr_range - 1; i++) {
1075 if (range[i].end + 1 < range[i+1].start)
1076 total_trim_size += real_trim_memory(range[i].end + 1,
1077 range[i+1].start);
1078 }
1079 /* check the top */
1080 i = nr_range - 1;
1081 if (range[i].end + 1 < end_pfn)
1082 total_trim_size += real_trim_memory(range[i].end + 1,
1083 end_pfn);
1084
1085 if (total_trim_size) {
1086 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
1087 " all of memory, losing %lluMB of RAM.\n",
1088 total_trim_size >> 20);
1089
1090 if (!changed_by_mtrr_cleanup)
1091 WARN_ON(1);
1092
1093 printk(KERN_INFO "update e820 for mtrr\n");
1094 update_e820();
1095
1096 return 1;
1097 }
1098
1099 return 0;
1100}
1101
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0c0a455fe95c..0b776c09aff3 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -33,13 +33,31 @@ u64 mtrr_tom2;
33struct mtrr_state_type mtrr_state = {}; 33struct mtrr_state_type mtrr_state = {};
34EXPORT_SYMBOL_GPL(mtrr_state); 34EXPORT_SYMBOL_GPL(mtrr_state);
35 35
36static int __initdata mtrr_show; 36/**
37static int __init mtrr_debug(char *opt) 37 * BIOS is expected to clear MtrrFixDramModEn bit, see for example
38 * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
39 * Opteron Processors" (26094 Rev. 3.30 February 2006), section
40 * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
41 * to 1 during BIOS initalization of the fixed MTRRs, then cleared to
42 * 0 for operation."
43 */
44static inline void k8_check_syscfg_dram_mod_en(void)
38{ 45{
39 mtrr_show = 1; 46 u32 lo, hi;
40 return 0; 47
48 if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
49 (boot_cpu_data.x86 >= 0x0f)))
50 return;
51
52 rdmsr(MSR_K8_SYSCFG, lo, hi);
53 if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
54 printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
55 " not cleared by BIOS, clearing this bit\n",
56 smp_processor_id());
57 lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
58 mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
59 }
41} 60}
42early_param("mtrr.show", mtrr_debug);
43 61
44/* 62/*
45 * Returns the effective MTRR type for the region 63 * Returns the effective MTRR type for the region
@@ -174,6 +192,8 @@ get_fixed_ranges(mtrr_type * frs)
174 unsigned int *p = (unsigned int *) frs; 192 unsigned int *p = (unsigned int *) frs;
175 int i; 193 int i;
176 194
195 k8_check_syscfg_dram_mod_en();
196
177 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); 197 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
178 198
179 for (i = 0; i < 2; i++) 199 for (i = 0; i < 2; i++)
@@ -188,18 +208,94 @@ void mtrr_save_fixed_ranges(void *info)
188 get_fixed_ranges(mtrr_state.fixed_ranges); 208 get_fixed_ranges(mtrr_state.fixed_ranges);
189} 209}
190 210
191static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) 211static unsigned __initdata last_fixed_start;
212static unsigned __initdata last_fixed_end;
213static mtrr_type __initdata last_fixed_type;
214
215static void __init print_fixed_last(void)
216{
217 if (!last_fixed_end)
218 return;
219
220 printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start,
221 last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
222
223 last_fixed_end = 0;
224}
225
226static void __init update_fixed_last(unsigned base, unsigned end,
227 mtrr_type type)
228{
229 last_fixed_start = base;
230 last_fixed_end = end;
231 last_fixed_type = type;
232}
233
234static void __init print_fixed(unsigned base, unsigned step,
235 const mtrr_type *types)
192{ 236{
193 unsigned i; 237 unsigned i;
194 238
195 for (i = 0; i < 8; ++i, ++types, base += step) 239 for (i = 0; i < 8; ++i, ++types, base += step) {
196 printk(KERN_INFO "MTRR %05X-%05X %s\n", 240 if (last_fixed_end == 0) {
197 base, base + step - 1, mtrr_attrib_to_str(*types)); 241 update_fixed_last(base, base + step, *types);
242 continue;
243 }
244 if (last_fixed_end == base && last_fixed_type == *types) {
245 last_fixed_end = base + step;
246 continue;
247 }
248 /* new segments: gap or different type */
249 print_fixed_last();
250 update_fixed_last(base, base + step, *types);
251 }
198} 252}
199 253
200static void prepare_set(void); 254static void prepare_set(void);
201static void post_set(void); 255static void post_set(void);
202 256
257static void __init print_mtrr_state(void)
258{
259 unsigned int i;
260 int high_width;
261
262 printk(KERN_DEBUG "MTRR default type: %s\n",
263 mtrr_attrib_to_str(mtrr_state.def_type));
264 if (mtrr_state.have_fixed) {
265 printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n",
266 mtrr_state.enabled & 1 ? "en" : "dis");
267 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
268 for (i = 0; i < 2; ++i)
269 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
270 for (i = 0; i < 8; ++i)
271 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
272
273 /* tail */
274 print_fixed_last();
275 }
276 printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
277 mtrr_state.enabled & 2 ? "en" : "dis");
278 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
279 for (i = 0; i < num_var_ranges; ++i) {
280 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
281 printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n",
282 i,
283 high_width,
284 mtrr_state.var_ranges[i].base_hi,
285 mtrr_state.var_ranges[i].base_lo >> 12,
286 high_width,
287 mtrr_state.var_ranges[i].mask_hi,
288 mtrr_state.var_ranges[i].mask_lo >> 12,
289 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
290 else
291 printk(KERN_DEBUG " %u disabled\n", i);
292 }
293 if (mtrr_tom2) {
294 printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n",
295 mtrr_tom2, mtrr_tom2>>20);
296 }
297}
298
203/* Grab all of the MTRR state for this CPU into *state */ 299/* Grab all of the MTRR state for this CPU into *state */
204void __init get_mtrr_state(void) 300void __init get_mtrr_state(void)
205{ 301{
@@ -231,41 +327,9 @@ void __init get_mtrr_state(void)
231 mtrr_tom2 |= low; 327 mtrr_tom2 |= low;
232 mtrr_tom2 &= 0xffffff800000ULL; 328 mtrr_tom2 &= 0xffffff800000ULL;
233 } 329 }
234 if (mtrr_show) { 330
235 int high_width; 331 print_mtrr_state();
236 332
237 printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
238 if (mtrr_state.have_fixed) {
239 printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
240 mtrr_state.enabled & 1 ? "en" : "dis");
241 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
242 for (i = 0; i < 2; ++i)
243 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
244 for (i = 0; i < 8; ++i)
245 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
246 }
247 printk(KERN_INFO "MTRR variable ranges %sabled:\n",
248 mtrr_state.enabled & 2 ? "en" : "dis");
249 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
250 for (i = 0; i < num_var_ranges; ++i) {
251 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
252 printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
253 i,
254 high_width,
255 mtrr_state.var_ranges[i].base_hi,
256 mtrr_state.var_ranges[i].base_lo >> 12,
257 high_width,
258 mtrr_state.var_ranges[i].mask_hi,
259 mtrr_state.var_ranges[i].mask_lo >> 12,
260 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
261 else
262 printk(KERN_INFO "MTRR %u disabled\n", i);
263 }
264 if (mtrr_tom2) {
265 printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
266 mtrr_tom2, mtrr_tom2>>20);
267 }
268 }
269 mtrr_state_set = 1; 333 mtrr_state_set = 1;
270 334
271 /* PAT setup for BP. We need to go through sync steps here */ 335 /* PAT setup for BP. We need to go through sync steps here */
@@ -308,27 +372,10 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
308} 372}
309 373
310/** 374/**
311 * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
312 * see AMD publication no. 24593, chapter 3.2.1 for more information
313 */
314static inline void k8_enable_fixed_iorrs(void)
315{
316 unsigned lo, hi;
317
318 rdmsr(MSR_K8_SYSCFG, lo, hi);
319 mtrr_wrmsr(MSR_K8_SYSCFG, lo
320 | K8_MTRRFIXRANGE_DRAM_ENABLE
321 | K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
322}
323
324/**
325 * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have 375 * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
326 * @msr: MSR address of the MTTR which should be checked and updated 376 * @msr: MSR address of the MTTR which should be checked and updated
327 * @changed: pointer which indicates whether the MTRR needed to be changed 377 * @changed: pointer which indicates whether the MTRR needed to be changed
328 * @msrwords: pointer to the MSR values which the MSR should have 378 * @msrwords: pointer to the MSR values which the MSR should have
329 *
330 * If K8 extentions are wanted, update the K8 SYSCFG MSR also.
331 * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
332 */ 379 */
333static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) 380static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
334{ 381{
@@ -337,10 +384,6 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
337 rdmsr(msr, lo, hi); 384 rdmsr(msr, lo, hi);
338 385
339 if (lo != msrwords[0] || hi != msrwords[1]) { 386 if (lo != msrwords[0] || hi != msrwords[1]) {
340 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
341 (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
342 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
343 k8_enable_fixed_iorrs();
344 mtrr_wrmsr(msr, msrwords[0], msrwords[1]); 387 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
345 *changed = true; 388 *changed = true;
346 } 389 }
@@ -376,22 +419,31 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
376{ 419{
377 unsigned int mask_lo, mask_hi, base_lo, base_hi; 420 unsigned int mask_lo, mask_hi, base_lo, base_hi;
378 unsigned int tmp, hi; 421 unsigned int tmp, hi;
422 int cpu;
423
424 /*
425 * get_mtrr doesn't need to update mtrr_state, also it could be called
426 * from any cpu, so try to print it out directly.
427 */
428 cpu = get_cpu();
379 429
380 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 430 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
431
381 if ((mask_lo & 0x800) == 0) { 432 if ((mask_lo & 0x800) == 0) {
382 /* Invalid (i.e. free) range */ 433 /* Invalid (i.e. free) range */
383 *base = 0; 434 *base = 0;
384 *size = 0; 435 *size = 0;
385 *type = 0; 436 *type = 0;
386 return; 437 goto out_put_cpu;
387 } 438 }
388 439
389 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); 440 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
390 441
391 /* Work out the shifted address mask. */ 442 /* Work out the shifted address mask: */
392 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; 443 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
393 mask_lo = size_or_mask | tmp; 444 mask_lo = size_or_mask | tmp;
394 /* Expand tmp with high bits to all 1s*/ 445
446 /* Expand tmp with high bits to all 1s: */
395 hi = fls(tmp); 447 hi = fls(tmp);
396 if (hi > 0) { 448 if (hi > 0) {
397 tmp |= ~((1<<(hi - 1)) - 1); 449 tmp |= ~((1<<(hi - 1)) - 1);
@@ -402,11 +454,16 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
402 } 454 }
403 } 455 }
404 456
405 /* This works correctly if size is a power of two, i.e. a 457 /*
406 contiguous range. */ 458 * This works correctly if size is a power of two, i.e. a
459 * contiguous range:
460 */
407 *size = -mask_lo; 461 *size = -mask_lo;
408 *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; 462 *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
409 *type = base_lo & 0xff; 463 *type = base_lo & 0xff;
464
465out_put_cpu:
466 put_cpu();
410} 467}
411 468
412/** 469/**
@@ -419,6 +476,8 @@ static int set_fixed_ranges(mtrr_type * frs)
419 bool changed = false; 476 bool changed = false;
420 int block=-1, range; 477 int block=-1, range;
421 478
479 k8_check_syscfg_dram_mod_en();
480
422 while (fixed_range_blocks[++block].ranges) 481 while (fixed_range_blocks[++block].ranges)
423 for (range=0; range < fixed_range_blocks[block].ranges; range++) 482 for (range=0; range < fixed_range_blocks[block].ranges; range++)
424 set_fixed_range(fixed_range_blocks[block].base_msr + range, 483 set_fixed_range(fixed_range_blocks[block].base_msr + range,
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 4c4214690dd1..fb73a52913a4 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -377,10 +377,6 @@ static const struct file_operations mtrr_fops = {
377 .release = mtrr_close, 377 .release = mtrr_close,
378}; 378};
379 379
380
381static struct proc_dir_entry *proc_root_mtrr;
382
383
384static int mtrr_seq_show(struct seq_file *seq, void *offset) 380static int mtrr_seq_show(struct seq_file *seq, void *offset)
385{ 381{
386 char factor; 382 char factor;
@@ -423,11 +419,7 @@ static int __init mtrr_if_init(void)
423 (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) 419 (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
424 return -ENODEV; 420 return -ENODEV;
425 421
426 proc_root_mtrr = 422 proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
427 proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
428
429 if (proc_root_mtrr)
430 proc_root_mtrr->owner = THIS_MODULE;
431 return 0; 423 return 0;
432} 424}
433 425
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 236a401b8259..03cda01f57c7 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -574,7 +574,7 @@ struct mtrr_value {
574 unsigned long lsize; 574 unsigned long lsize;
575}; 575};
576 576
577static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES]; 577static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
578 578
579static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 579static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
580{ 580{
@@ -582,9 +582,9 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
582 582
583 for (i = 0; i < num_var_ranges; i++) { 583 for (i = 0; i < num_var_ranges; i++) {
584 mtrr_if->get(i, 584 mtrr_if->get(i,
585 &mtrr_state[i].lbase, 585 &mtrr_value[i].lbase,
586 &mtrr_state[i].lsize, 586 &mtrr_value[i].lsize,
587 &mtrr_state[i].ltype); 587 &mtrr_value[i].ltype);
588 } 588 }
589 return 0; 589 return 0;
590} 590}
@@ -594,11 +594,11 @@ static int mtrr_restore(struct sys_device * sysdev)
594 int i; 594 int i;
595 595
596 for (i = 0; i < num_var_ranges; i++) { 596 for (i = 0; i < num_var_ranges; i++) {
597 if (mtrr_state[i].lsize) 597 if (mtrr_value[i].lsize)
598 set_mtrr(i, 598 set_mtrr(i,
599 mtrr_state[i].lbase, 599 mtrr_value[i].lbase,
600 mtrr_state[i].lsize, 600 mtrr_value[i].lsize,
601 mtrr_state[i].ltype); 601 mtrr_value[i].ltype);
602 } 602 }
603 return 0; 603 return 0;
604} 604}
@@ -610,1058 +610,7 @@ static struct sysdev_driver mtrr_sysdev_driver = {
610 .resume = mtrr_restore, 610 .resume = mtrr_restore,
611}; 611};
612 612
613/* should be related to MTRR_VAR_RANGES nums */ 613int __initdata changed_by_mtrr_cleanup;
614#define RANGE_NUM 256
615
616struct res_range {
617 unsigned long start;
618 unsigned long end;
619};
620
621static int __init
622add_range(struct res_range *range, int nr_range, unsigned long start,
623 unsigned long end)
624{
625 /* out of slots */
626 if (nr_range >= RANGE_NUM)
627 return nr_range;
628
629 range[nr_range].start = start;
630 range[nr_range].end = end;
631
632 nr_range++;
633
634 return nr_range;
635}
636
637static int __init
638add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
639 unsigned long end)
640{
641 int i;
642
643 /* try to merge it with old one */
644 for (i = 0; i < nr_range; i++) {
645 unsigned long final_start, final_end;
646 unsigned long common_start, common_end;
647
648 if (!range[i].end)
649 continue;
650
651 common_start = max(range[i].start, start);
652 common_end = min(range[i].end, end);
653 if (common_start > common_end + 1)
654 continue;
655
656 final_start = min(range[i].start, start);
657 final_end = max(range[i].end, end);
658
659 range[i].start = final_start;
660 range[i].end = final_end;
661 return nr_range;
662 }
663
664 /* need to add that */
665 return add_range(range, nr_range, start, end);
666}
667
668static void __init
669subtract_range(struct res_range *range, unsigned long start, unsigned long end)
670{
671 int i, j;
672
673 for (j = 0; j < RANGE_NUM; j++) {
674 if (!range[j].end)
675 continue;
676
677 if (start <= range[j].start && end >= range[j].end) {
678 range[j].start = 0;
679 range[j].end = 0;
680 continue;
681 }
682
683 if (start <= range[j].start && end < range[j].end &&
684 range[j].start < end + 1) {
685 range[j].start = end + 1;
686 continue;
687 }
688
689
690 if (start > range[j].start && end >= range[j].end &&
691 range[j].end > start - 1) {
692 range[j].end = start - 1;
693 continue;
694 }
695
696 if (start > range[j].start && end < range[j].end) {
697 /* find the new spare */
698 for (i = 0; i < RANGE_NUM; i++) {
699 if (range[i].end == 0)
700 break;
701 }
702 if (i < RANGE_NUM) {
703 range[i].end = range[j].end;
704 range[i].start = end + 1;
705 } else {
706 printk(KERN_ERR "run of slot in ranges\n");
707 }
708 range[j].end = start - 1;
709 continue;
710 }
711 }
712}
713
714static int __init cmp_range(const void *x1, const void *x2)
715{
716 const struct res_range *r1 = x1;
717 const struct res_range *r2 = x2;
718 long start1, start2;
719
720 start1 = r1->start;
721 start2 = r2->start;
722
723 return start1 - start2;
724}
725
726struct var_mtrr_range_state {
727 unsigned long base_pfn;
728 unsigned long size_pfn;
729 mtrr_type type;
730};
731
732static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
733static int __initdata debug_print;
734
735static int __init
736x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
737 unsigned long extra_remove_base,
738 unsigned long extra_remove_size)
739{
740 unsigned long i, base, size;
741 mtrr_type type;
742
743 for (i = 0; i < num_var_ranges; i++) {
744 type = range_state[i].type;
745 if (type != MTRR_TYPE_WRBACK)
746 continue;
747 base = range_state[i].base_pfn;
748 size = range_state[i].size_pfn;
749 nr_range = add_range_with_merge(range, nr_range, base,
750 base + size - 1);
751 }
752 if (debug_print) {
753 printk(KERN_DEBUG "After WB checking\n");
754 for (i = 0; i < nr_range; i++)
755 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
756 range[i].start, range[i].end + 1);
757 }
758
759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
764 continue;
765 size = range_state[i].size_pfn;
766 if (!size)
767 continue;
768 base = range_state[i].base_pfn;
769 subtract_range(range, base, base + size - 1);
770 }
771 if (extra_remove_size)
772 subtract_range(range, extra_remove_base,
773 extra_remove_base + extra_remove_size - 1);
774
775 /* get new range num */
776 nr_range = 0;
777 for (i = 0; i < RANGE_NUM; i++) {
778 if (!range[i].end)
779 continue;
780 nr_range++;
781 }
782 if (debug_print) {
783 printk(KERN_DEBUG "After UC checking\n");
784 for (i = 0; i < nr_range; i++)
785 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
786 range[i].start, range[i].end + 1);
787 }
788
789 /* sort the ranges */
790 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
791 if (debug_print) {
792 printk(KERN_DEBUG "After sorting\n");
793 for (i = 0; i < nr_range; i++)
794 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
795 range[i].start, range[i].end + 1);
796 }
797
798 /* clear those is not used */
799 for (i = nr_range; i < RANGE_NUM; i++)
800 memset(&range[i], 0, sizeof(range[i]));
801
802 return nr_range;
803}
804
805static struct res_range __initdata range[RANGE_NUM];
806static int __initdata nr_range;
807
808#ifdef CONFIG_MTRR_SANITIZER
809
810static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
811{
812 unsigned long sum;
813 int i;
814
815 sum = 0;
816 for (i = 0; i < nr_range; i++)
817 sum += range[i].end + 1 - range[i].start;
818
819 return sum;
820}
821
822static int enable_mtrr_cleanup __initdata =
823 CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
824
825static int __init disable_mtrr_cleanup_setup(char *str)
826{
827 enable_mtrr_cleanup = 0;
828 return 0;
829}
830early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
831
832static int __init enable_mtrr_cleanup_setup(char *str)
833{
834 enable_mtrr_cleanup = 1;
835 return 0;
836}
837early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
838
839static int __init mtrr_cleanup_debug_setup(char *str)
840{
841 debug_print = 1;
842 return 0;
843}
844early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
845
846struct var_mtrr_state {
847 unsigned long range_startk;
848 unsigned long range_sizek;
849 unsigned long chunk_sizek;
850 unsigned long gran_sizek;
851 unsigned int reg;
852};
853
854static void __init
855set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
856 unsigned char type, unsigned int address_bits)
857{
858 u32 base_lo, base_hi, mask_lo, mask_hi;
859 u64 base, mask;
860
861 if (!sizek) {
862 fill_mtrr_var_range(reg, 0, 0, 0, 0);
863 return;
864 }
865
866 mask = (1ULL << address_bits) - 1;
867 mask &= ~((((u64)sizek) << 10) - 1);
868
869 base = ((u64)basek) << 10;
870
871 base |= type;
872 mask |= 0x800;
873
874 base_lo = base & ((1ULL<<32) - 1);
875 base_hi = base >> 32;
876
877 mask_lo = mask & ((1ULL<<32) - 1);
878 mask_hi = mask >> 32;
879
880 fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
881}
882
883static void __init
884save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
885 unsigned char type)
886{
887 range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
888 range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
889 range_state[reg].type = type;
890}
891
892static void __init
893set_var_mtrr_all(unsigned int address_bits)
894{
895 unsigned long basek, sizek;
896 unsigned char type;
897 unsigned int reg;
898
899 for (reg = 0; reg < num_var_ranges; reg++) {
900 basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
901 sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
902 type = range_state[reg].type;
903
904 set_var_mtrr(reg, basek, sizek, type, address_bits);
905 }
906}
907
908static unsigned long to_size_factor(unsigned long sizek, char *factorp)
909{
910 char factor;
911 unsigned long base = sizek;
912
913 if (base & ((1<<10) - 1)) {
914 /* not MB alignment */
915 factor = 'K';
916 } else if (base & ((1<<20) - 1)){
917 factor = 'M';
918 base >>= 10;
919 } else {
920 factor = 'G';
921 base >>= 20;
922 }
923
924 *factorp = factor;
925
926 return base;
927}
928
929static unsigned int __init
930range_to_mtrr(unsigned int reg, unsigned long range_startk,
931 unsigned long range_sizek, unsigned char type)
932{
933 if (!range_sizek || (reg >= num_var_ranges))
934 return reg;
935
936 while (range_sizek) {
937 unsigned long max_align, align;
938 unsigned long sizek;
939
940 /* Compute the maximum size I can make a range */
941 if (range_startk)
942 max_align = ffs(range_startk) - 1;
943 else
944 max_align = 32;
945 align = fls(range_sizek) - 1;
946 if (align > max_align)
947 align = max_align;
948
949 sizek = 1 << align;
950 if (debug_print) {
951 char start_factor = 'K', size_factor = 'K';
952 unsigned long start_base, size_base;
953
954 start_base = to_size_factor(range_startk, &start_factor),
955 size_base = to_size_factor(sizek, &size_factor),
956
957 printk(KERN_DEBUG "Setting variable MTRR %d, "
958 "base: %ld%cB, range: %ld%cB, type %s\n",
959 reg, start_base, start_factor,
960 size_base, size_factor,
961 (type == MTRR_TYPE_UNCACHABLE)?"UC":
962 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
963 );
964 }
965 save_var_mtrr(reg++, range_startk, sizek, type);
966 range_startk += sizek;
967 range_sizek -= sizek;
968 if (reg >= num_var_ranges)
969 break;
970 }
971 return reg;
972}
973
974static unsigned __init
975range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
976 unsigned long sizek)
977{
978 unsigned long hole_basek, hole_sizek;
979 unsigned long second_basek, second_sizek;
980 unsigned long range0_basek, range0_sizek;
981 unsigned long range_basek, range_sizek;
982 unsigned long chunk_sizek;
983 unsigned long gran_sizek;
984
985 hole_basek = 0;
986 hole_sizek = 0;
987 second_basek = 0;
988 second_sizek = 0;
989 chunk_sizek = state->chunk_sizek;
990 gran_sizek = state->gran_sizek;
991
992 /* align with gran size, prevent small block used up MTRRs */
993 range_basek = ALIGN(state->range_startk, gran_sizek);
994 if ((range_basek > basek) && basek)
995 return second_sizek;
996 state->range_sizek -= (range_basek - state->range_startk);
997 range_sizek = ALIGN(state->range_sizek, gran_sizek);
998
999 while (range_sizek > state->range_sizek) {
1000 range_sizek -= gran_sizek;
1001 if (!range_sizek)
1002 return 0;
1003 }
1004 state->range_sizek = range_sizek;
1005
1006 /* try to append some small hole */
1007 range0_basek = state->range_startk;
1008 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1009
1010 /* no increase */
1011 if (range0_sizek == state->range_sizek) {
1012 if (debug_print)
1013 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
1014 range0_basek<<10,
1015 (range0_basek + state->range_sizek)<<10);
1016 state->reg = range_to_mtrr(state->reg, range0_basek,
1017 state->range_sizek, MTRR_TYPE_WRBACK);
1018 return 0;
1019 }
1020
1021 /* only cut back, when it is not the last */
1022 if (sizek) {
1023 while (range0_basek + range0_sizek > (basek + sizek)) {
1024 if (range0_sizek >= chunk_sizek)
1025 range0_sizek -= chunk_sizek;
1026 else
1027 range0_sizek = 0;
1028
1029 if (!range0_sizek)
1030 break;
1031 }
1032 }
1033
1034second_try:
1035 range_basek = range0_basek + range0_sizek;
1036
1037 /* one hole in the middle */
1038 if (range_basek > basek && range_basek <= (basek + sizek))
1039 second_sizek = range_basek - basek;
1040
1041 if (range0_sizek > state->range_sizek) {
1042
1043 /* one hole in middle or at end */
1044 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1045
1046 /* hole size should be less than half of range0 size */
1047 if (hole_sizek >= (range0_sizek >> 1) &&
1048 range0_sizek >= chunk_sizek) {
1049 range0_sizek -= chunk_sizek;
1050 second_sizek = 0;
1051 hole_sizek = 0;
1052
1053 goto second_try;
1054 }
1055 }
1056
1057 if (range0_sizek) {
1058 if (debug_print)
1059 printk(KERN_DEBUG "range0: %016lx - %016lx\n",
1060 range0_basek<<10,
1061 (range0_basek + range0_sizek)<<10);
1062 state->reg = range_to_mtrr(state->reg, range0_basek,
1063 range0_sizek, MTRR_TYPE_WRBACK);
1064 }
1065
1066 if (range0_sizek < state->range_sizek) {
1067 /* need to handle left over */
1068 range_sizek = state->range_sizek - range0_sizek;
1069
1070 if (debug_print)
1071 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1072 range_basek<<10,
1073 (range_basek + range_sizek)<<10);
1074 state->reg = range_to_mtrr(state->reg, range_basek,
1075 range_sizek, MTRR_TYPE_WRBACK);
1076 }
1077
1078 if (hole_sizek) {
1079 hole_basek = range_basek - hole_sizek - second_sizek;
1080 if (debug_print)
1081 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1082 hole_basek<<10,
1083 (hole_basek + hole_sizek)<<10);
1084 state->reg = range_to_mtrr(state->reg, hole_basek,
1085 hole_sizek, MTRR_TYPE_UNCACHABLE);
1086 }
1087
1088 return second_sizek;
1089}
1090
1091static void __init
1092set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
1093 unsigned long size_pfn)
1094{
1095 unsigned long basek, sizek;
1096 unsigned long second_sizek = 0;
1097
1098 if (state->reg >= num_var_ranges)
1099 return;
1100
1101 basek = base_pfn << (PAGE_SHIFT - 10);
1102 sizek = size_pfn << (PAGE_SHIFT - 10);
1103
1104 /* See if I can merge with the last range */
1105 if ((basek <= 1024) ||
1106 (state->range_startk + state->range_sizek == basek)) {
1107 unsigned long endk = basek + sizek;
1108 state->range_sizek = endk - state->range_startk;
1109 return;
1110 }
1111 /* Write the range mtrrs */
1112 if (state->range_sizek != 0)
1113 second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
1114
1115 /* Allocate an msr */
1116 state->range_startk = basek + second_sizek;
1117 state->range_sizek = sizek - second_sizek;
1118}
1119
1120/* mininum size of mtrr block that can take hole */
1121static u64 mtrr_chunk_size __initdata = (256ULL<<20);
1122
1123static int __init parse_mtrr_chunk_size_opt(char *p)
1124{
1125 if (!p)
1126 return -EINVAL;
1127 mtrr_chunk_size = memparse(p, &p);
1128 return 0;
1129}
1130early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
1131
1132/* granity of mtrr of block */
1133static u64 mtrr_gran_size __initdata;
1134
1135static int __init parse_mtrr_gran_size_opt(char *p)
1136{
1137 if (!p)
1138 return -EINVAL;
1139 mtrr_gran_size = memparse(p, &p);
1140 return 0;
1141}
1142early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
1143
1144static int nr_mtrr_spare_reg __initdata =
1145 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
1146
1147static int __init parse_mtrr_spare_reg(char *arg)
1148{
1149 if (arg)
1150 nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
1151 return 0;
1152}
1153
1154early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
1155
1156static int __init
1157x86_setup_var_mtrrs(struct res_range *range, int nr_range,
1158 u64 chunk_size, u64 gran_size)
1159{
1160 struct var_mtrr_state var_state;
1161 int i;
1162 int num_reg;
1163
1164 var_state.range_startk = 0;
1165 var_state.range_sizek = 0;
1166 var_state.reg = 0;
1167 var_state.chunk_sizek = chunk_size >> 10;
1168 var_state.gran_sizek = gran_size >> 10;
1169
1170 memset(range_state, 0, sizeof(range_state));
1171
1172 /* Write the range etc */
1173 for (i = 0; i < nr_range; i++)
1174 set_var_mtrr_range(&var_state, range[i].start,
1175 range[i].end - range[i].start + 1);
1176
1177 /* Write the last range */
1178 if (var_state.range_sizek != 0)
1179 range_to_mtrr_with_hole(&var_state, 0, 0);
1180
1181 num_reg = var_state.reg;
1182 /* Clear out the extra MTRR's */
1183 while (var_state.reg < num_var_ranges) {
1184 save_var_mtrr(var_state.reg, 0, 0, 0);
1185 var_state.reg++;
1186 }
1187
1188 return num_reg;
1189}
1190
1191struct mtrr_cleanup_result {
1192 unsigned long gran_sizek;
1193 unsigned long chunk_sizek;
1194 unsigned long lose_cover_sizek;
1195 unsigned int num_reg;
1196 int bad;
1197};
1198
1199/*
1200 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1201 * chunk size: gran_size, ..., 2G
1202 * so we need (1+16)*8
1203 */
1204#define NUM_RESULT 136
1205#define PSHIFT (PAGE_SHIFT - 10)
1206
1207static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
1208static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1209
1210static void __init print_out_mtrr_range_state(void)
1211{
1212 int i;
1213 char start_factor = 'K', size_factor = 'K';
1214 unsigned long start_base, size_base;
1215 mtrr_type type;
1216
1217 for (i = 0; i < num_var_ranges; i++) {
1218
1219 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1220 if (!size_base)
1221 continue;
1222
1223 size_base = to_size_factor(size_base, &size_factor),
1224 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1225 start_base = to_size_factor(start_base, &start_factor),
1226 type = range_state[i].type;
1227
1228 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1229 i, start_base, start_factor,
1230 size_base, size_factor,
1231 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1232 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1233 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1234 );
1235 }
1236}
1237
1238static int __init mtrr_need_cleanup(void)
1239{
1240 int i;
1241 mtrr_type type;
1242 unsigned long size;
1243 /* extra one for all 0 */
1244 int num[MTRR_NUM_TYPES + 1];
1245
1246 /* check entries number */
1247 memset(num, 0, sizeof(num));
1248 for (i = 0; i < num_var_ranges; i++) {
1249 type = range_state[i].type;
1250 size = range_state[i].size_pfn;
1251 if (type >= MTRR_NUM_TYPES)
1252 continue;
1253 if (!size)
1254 type = MTRR_NUM_TYPES;
1255 if (type == MTRR_TYPE_WRPROT)
1256 type = MTRR_TYPE_UNCACHABLE;
1257 num[type]++;
1258 }
1259
1260 /* check if we got UC entries */
1261 if (!num[MTRR_TYPE_UNCACHABLE])
1262 return 0;
1263
1264 /* check if we only had WB and UC */
1265 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1266 num_var_ranges - num[MTRR_NUM_TYPES])
1267 return 0;
1268
1269 return 1;
1270}
1271
1272static unsigned long __initdata range_sums;
1273static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
1274 unsigned long extra_remove_base,
1275 unsigned long extra_remove_size,
1276 int i)
1277{
1278 int num_reg;
1279 static struct res_range range_new[RANGE_NUM];
1280 static int nr_range_new;
1281 unsigned long range_sums_new;
1282
1283 /* convert ranges to var ranges state */
1284 num_reg = x86_setup_var_mtrrs(range, nr_range,
1285 chunk_size, gran_size);
1286
1287 /* we got new setting in range_state, check it */
1288 memset(range_new, 0, sizeof(range_new));
1289 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1290 extra_remove_base, extra_remove_size);
1291 range_sums_new = sum_ranges(range_new, nr_range_new);
1292
1293 result[i].chunk_sizek = chunk_size >> 10;
1294 result[i].gran_sizek = gran_size >> 10;
1295 result[i].num_reg = num_reg;
1296 if (range_sums < range_sums_new) {
1297 result[i].lose_cover_sizek =
1298 (range_sums_new - range_sums) << PSHIFT;
1299 result[i].bad = 1;
1300 } else
1301 result[i].lose_cover_sizek =
1302 (range_sums - range_sums_new) << PSHIFT;
1303
1304 /* double check it */
1305 if (!result[i].bad && !result[i].lose_cover_sizek) {
1306 if (nr_range_new != nr_range ||
1307 memcmp(range, range_new, sizeof(range)))
1308 result[i].bad = 1;
1309 }
1310
1311 if (!result[i].bad && (range_sums - range_sums_new <
1312 min_loss_pfn[num_reg])) {
1313 min_loss_pfn[num_reg] =
1314 range_sums - range_sums_new;
1315 }
1316}
1317
1318static void __init mtrr_print_out_one_result(int i)
1319{
1320 char gran_factor, chunk_factor, lose_factor;
1321 unsigned long gran_base, chunk_base, lose_base;
1322
1323 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1324 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1325 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1326 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1327 result[i].bad ? "*BAD*" : " ",
1328 gran_base, gran_factor, chunk_base, chunk_factor);
1329 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1330 result[i].num_reg, result[i].bad ? "-" : "",
1331 lose_base, lose_factor);
1332}
1333
1334static int __init mtrr_search_optimal_index(void)
1335{
1336 int i;
1337 int num_reg_good;
1338 int index_good;
1339
1340 if (nr_mtrr_spare_reg >= num_var_ranges)
1341 nr_mtrr_spare_reg = num_var_ranges - 1;
1342 num_reg_good = -1;
1343 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1344 if (!min_loss_pfn[i])
1345 num_reg_good = i;
1346 }
1347
1348 index_good = -1;
1349 if (num_reg_good != -1) {
1350 for (i = 0; i < NUM_RESULT; i++) {
1351 if (!result[i].bad &&
1352 result[i].num_reg == num_reg_good &&
1353 !result[i].lose_cover_sizek) {
1354 index_good = i;
1355 break;
1356 }
1357 }
1358 }
1359
1360 return index_good;
1361}
1362
1363
1364static int __init mtrr_cleanup(unsigned address_bits)
1365{
1366 unsigned long extra_remove_base, extra_remove_size;
1367 unsigned long base, size, def, dummy;
1368 mtrr_type type;
1369 u64 chunk_size, gran_size;
1370 int index_good;
1371 int i;
1372
1373 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
1374 return 0;
1375 rdmsr(MTRRdefType_MSR, def, dummy);
1376 def &= 0xff;
1377 if (def != MTRR_TYPE_UNCACHABLE)
1378 return 0;
1379
1380 /* get it and store it aside */
1381 memset(range_state, 0, sizeof(range_state));
1382 for (i = 0; i < num_var_ranges; i++) {
1383 mtrr_if->get(i, &base, &size, &type);
1384 range_state[i].base_pfn = base;
1385 range_state[i].size_pfn = size;
1386 range_state[i].type = type;
1387 }
1388
1389 /* check if we need handle it and can handle it */
1390 if (!mtrr_need_cleanup())
1391 return 0;
1392
1393 /* print original var MTRRs at first, for debugging: */
1394 printk(KERN_DEBUG "original variable MTRRs\n");
1395 print_out_mtrr_range_state();
1396
1397 memset(range, 0, sizeof(range));
1398 extra_remove_size = 0;
1399 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1400 if (mtrr_tom2)
1401 extra_remove_size =
1402 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1403 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1404 extra_remove_size);
1405 /*
1406 * [0, 1M) should always be coverred by var mtrr with WB
1407 * and fixed mtrrs should take effective before var mtrr for it
1408 */
1409 nr_range = add_range_with_merge(range, nr_range, 0,
1410 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1411 /* sort the ranges */
1412 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1413
1414 range_sums = sum_ranges(range, nr_range);
1415 printk(KERN_INFO "total RAM coverred: %ldM\n",
1416 range_sums >> (20 - PAGE_SHIFT));
1417
1418 if (mtrr_chunk_size && mtrr_gran_size) {
1419 i = 0;
1420 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
1421 extra_remove_base, extra_remove_size, i);
1422
1423 mtrr_print_out_one_result(i);
1424
1425 if (!result[i].bad) {
1426 set_var_mtrr_all(address_bits);
1427 return 1;
1428 }
1429 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1430 "will find optimal one\n");
1431 }
1432
1433 i = 0;
1434 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1435 memset(result, 0, sizeof(result));
1436 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1437
1438 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1439 chunk_size <<= 1) {
1440
1441 if (i >= NUM_RESULT)
1442 continue;
1443
1444 mtrr_calc_range_state(chunk_size, gran_size,
1445 extra_remove_base, extra_remove_size, i);
1446 if (debug_print) {
1447 mtrr_print_out_one_result(i);
1448 printk(KERN_INFO "\n");
1449 }
1450
1451 i++;
1452 }
1453 }
1454
1455 /* try to find the optimal index */
1456 index_good = mtrr_search_optimal_index();
1457
1458 if (index_good != -1) {
1459 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1460 i = index_good;
1461 mtrr_print_out_one_result(i);
1462
1463 /* convert ranges to var ranges state */
1464 chunk_size = result[i].chunk_sizek;
1465 chunk_size <<= 10;
1466 gran_size = result[i].gran_sizek;
1467 gran_size <<= 10;
1468 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1469 set_var_mtrr_all(address_bits);
1470 printk(KERN_DEBUG "New variable MTRRs\n");
1471 print_out_mtrr_range_state();
1472 return 1;
1473 } else {
1474 /* print out all */
1475 for (i = 0; i < NUM_RESULT; i++)
1476 mtrr_print_out_one_result(i);
1477 }
1478
1479 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
1480 printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
1481
1482 return 0;
1483}
1484#else
1485static int __init mtrr_cleanup(unsigned address_bits)
1486{
1487 return 0;
1488}
1489#endif
1490
1491static int __initdata changed_by_mtrr_cleanup;
1492
1493static int disable_mtrr_trim;
1494
1495static int __init disable_mtrr_trim_setup(char *str)
1496{
1497 disable_mtrr_trim = 1;
1498 return 0;
1499}
1500early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
1501
1502/*
1503 * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
1504 * for memory >4GB. Check for that here.
1505 * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
1506 * apply to are wrong, but so far we don't know of any such case in the wild.
1507 */
1508#define Tom2Enabled (1U << 21)
1509#define Tom2ForceMemTypeWB (1U << 22)
1510
1511int __init amd_special_default_mtrr(void)
1512{
1513 u32 l, h;
1514
1515 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
1516 return 0;
1517 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
1518 return 0;
1519 /* In case some hypervisor doesn't pass SYSCFG through */
1520 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
1521 return 0;
1522 /*
1523 * Memory between 4GB and top of mem is forced WB by this magic bit.
1524 * Reserved before K8RevF, but should be zero there.
1525 */
1526 if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
1527 (Tom2Enabled | Tom2ForceMemTypeWB))
1528 return 1;
1529 return 0;
1530}
1531
1532static u64 __init real_trim_memory(unsigned long start_pfn,
1533 unsigned long limit_pfn)
1534{
1535 u64 trim_start, trim_size;
1536 trim_start = start_pfn;
1537 trim_start <<= PAGE_SHIFT;
1538 trim_size = limit_pfn;
1539 trim_size <<= PAGE_SHIFT;
1540 trim_size -= trim_start;
1541
1542 return e820_update_range(trim_start, trim_size, E820_RAM,
1543 E820_RESERVED);
1544}
1545/**
1546 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
1547 * @end_pfn: ending page frame number
1548 *
1549 * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
1550 * memory configurations. This routine checks that the highest MTRR matches
1551 * the end of memory, to make sure the MTRRs having a write back type cover
1552 * all of the memory the kernel is intending to use. If not, it'll trim any
1553 * memory off the end by adjusting end_pfn, removing it from the kernel's
1554 * allocation pools, warning the user with an obnoxious message.
1555 */
1556int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1557{
1558 unsigned long i, base, size, highest_pfn = 0, def, dummy;
1559 mtrr_type type;
1560 u64 total_trim_size;
1561
1562 /* extra one for all 0 */
1563 int num[MTRR_NUM_TYPES + 1];
1564 /*
1565 * Make sure we only trim uncachable memory on machines that
1566 * support the Intel MTRR architecture:
1567 */
1568 if (!is_cpu(INTEL) || disable_mtrr_trim)
1569 return 0;
1570 rdmsr(MTRRdefType_MSR, def, dummy);
1571 def &= 0xff;
1572 if (def != MTRR_TYPE_UNCACHABLE)
1573 return 0;
1574
1575 /* get it and store it aside */
1576 memset(range_state, 0, sizeof(range_state));
1577 for (i = 0; i < num_var_ranges; i++) {
1578 mtrr_if->get(i, &base, &size, &type);
1579 range_state[i].base_pfn = base;
1580 range_state[i].size_pfn = size;
1581 range_state[i].type = type;
1582 }
1583
1584 /* Find highest cached pfn */
1585 for (i = 0; i < num_var_ranges; i++) {
1586 type = range_state[i].type;
1587 if (type != MTRR_TYPE_WRBACK)
1588 continue;
1589 base = range_state[i].base_pfn;
1590 size = range_state[i].size_pfn;
1591 if (highest_pfn < base + size)
1592 highest_pfn = base + size;
1593 }
1594
1595 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
1596 if (!highest_pfn) {
1597 printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
1598 return 0;
1599 }
1600
1601 /* check entries number */
1602 memset(num, 0, sizeof(num));
1603 for (i = 0; i < num_var_ranges; i++) {
1604 type = range_state[i].type;
1605 if (type >= MTRR_NUM_TYPES)
1606 continue;
1607 size = range_state[i].size_pfn;
1608 if (!size)
1609 type = MTRR_NUM_TYPES;
1610 num[type]++;
1611 }
1612
1613 /* no entry for WB? */
1614 if (!num[MTRR_TYPE_WRBACK])
1615 return 0;
1616
1617 /* check if we only had WB and UC */
1618 if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
1619 num_var_ranges - num[MTRR_NUM_TYPES])
1620 return 0;
1621
1622 memset(range, 0, sizeof(range));
1623 nr_range = 0;
1624 if (mtrr_tom2) {
1625 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1626 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
1627 if (highest_pfn < range[nr_range].end + 1)
1628 highest_pfn = range[nr_range].end + 1;
1629 nr_range++;
1630 }
1631 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
1632
1633 total_trim_size = 0;
1634 /* check the head */
1635 if (range[0].start)
1636 total_trim_size += real_trim_memory(0, range[0].start);
1637 /* check the holes */
1638 for (i = 0; i < nr_range - 1; i++) {
1639 if (range[i].end + 1 < range[i+1].start)
1640 total_trim_size += real_trim_memory(range[i].end + 1,
1641 range[i+1].start);
1642 }
1643 /* check the top */
1644 i = nr_range - 1;
1645 if (range[i].end + 1 < end_pfn)
1646 total_trim_size += real_trim_memory(range[i].end + 1,
1647 end_pfn);
1648
1649 if (total_trim_size) {
1650 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
1651 " all of memory, losing %lluMB of RAM.\n",
1652 total_trim_size >> 20);
1653
1654 if (!changed_by_mtrr_cleanup)
1655 WARN_ON(1);
1656
1657 printk(KERN_INFO "update e820 for mtrr\n");
1658 update_e820();
1659
1660 return 1;
1661 }
1662
1663 return 0;
1664}
1665 614
1666/** 615/**
1667 * mtrr_bp_init - initialize mtrrs on the boot CPU 616 * mtrr_bp_init - initialize mtrrs on the boot CPU
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index ffd60409cc6d..77f67f7b347a 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -79,6 +79,7 @@ extern struct mtrr_ops * mtrr_if;
79 79
80extern unsigned int num_var_ranges; 80extern unsigned int num_var_ranges;
81extern u64 mtrr_tom2; 81extern u64 mtrr_tom2;
82extern struct mtrr_state_type mtrr_state;
82 83
83void mtrr_state_warn(void); 84void mtrr_state_warn(void);
84const char *mtrr_attrib_to_str(int x); 85const char *mtrr_attrib_to_str(int x);
@@ -88,3 +89,6 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned);
88int amd_init_mtrr(void); 89int amd_init_mtrr(void);
89int cyrix_init_mtrr(void); 90int cyrix_init_mtrr(void);
90int centaur_init_mtrr(void); 91int centaur_init_mtrr(void);
92
93extern int changed_by_mtrr_cleanup;
94extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 01b1244ef1c0..f93047fed791 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -7,15 +7,14 @@
7/* 7/*
8 * Get CPU information for use by the procfs. 8 * Get CPU information for use by the procfs.
9 */ 9 */
10#ifdef CONFIG_X86_32
11static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, 10static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
12 unsigned int cpu) 11 unsigned int cpu)
13{ 12{
14#ifdef CONFIG_X86_HT 13#ifdef CONFIG_SMP
15 if (c->x86_max_cores * smp_num_siblings > 1) { 14 if (c->x86_max_cores * smp_num_siblings > 1) {
16 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); 15 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
17 seq_printf(m, "siblings\t: %d\n", 16 seq_printf(m, "siblings\t: %d\n",
18 cpus_weight(per_cpu(cpu_core_map, cpu))); 17 cpumask_weight(cpu_sibling_mask(cpu)));
19 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); 18 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
20 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); 19 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
21 seq_printf(m, "apicid\t\t: %d\n", c->apicid); 20 seq_printf(m, "apicid\t\t: %d\n", c->apicid);
@@ -24,6 +23,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
24#endif 23#endif
25} 24}
26 25
26#ifdef CONFIG_X86_32
27static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) 27static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
28{ 28{
29 /* 29 /*
@@ -50,22 +50,6 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
50 c->wp_works_ok ? "yes" : "no"); 50 c->wp_works_ok ? "yes" : "no");
51} 51}
52#else 52#else
53static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
54 unsigned int cpu)
55{
56#ifdef CONFIG_SMP
57 if (c->x86_max_cores * smp_num_siblings > 1) {
58 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
59 seq_printf(m, "siblings\t: %d\n",
60 cpus_weight(per_cpu(cpu_core_map, cpu)));
61 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
62 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
63 seq_printf(m, "apicid\t\t: %d\n", c->apicid);
64 seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
65 }
66#endif
67}
68
69static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) 53static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
70{ 54{
71 seq_printf(m, 55 seq_printf(m,
@@ -159,9 +143,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
159static void *c_start(struct seq_file *m, loff_t *pos) 143static void *c_start(struct seq_file *m, loff_t *pos)
160{ 144{
161 if (*pos == 0) /* just in case, cpu 0 is not the first */ 145 if (*pos == 0) /* just in case, cpu 0 is not the first */
162 *pos = first_cpu(cpu_online_map); 146 *pos = cpumask_first(cpu_online_mask);
163 else 147 else
164 *pos = next_cpu_nr(*pos - 1, cpu_online_map); 148 *pos = cpumask_next(*pos - 1, cpu_online_mask);
165 if ((*pos) < nr_cpu_ids) 149 if ((*pos) < nr_cpu_ids)
166 return &cpu_data(*pos); 150 return &cpu_data(*pos);
167 return NULL; 151 return NULL;
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 52b3fefbd5af..bb62b3e5caad 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
98#endif 98#endif
99} 99}
100 100
101static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { 101static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
102 .c_vendor = "Transmeta", 102 .c_vendor = "Transmeta",
103 .c_ident = { "GenuineTMx86", "TransmetaCPU" }, 103 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
104 .c_early_init = early_init_transmeta, 104 .c_early_init = early_init_transmeta,
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index e777f79e0960..fd2c37bf7acb 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -8,7 +8,7 @@
8 * so no special init takes place. 8 * so no special init takes place.
9 */ 9 */
10 10
11static struct cpu_dev umc_cpu_dev __cpuinitdata = { 11static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
12 .c_vendor = "UMC", 12 .c_vendor = "UMC",
13 .c_ident = { "UMC UMC UMC" }, 13 .c_ident = { "UMC UMC UMC" },
14 .c_models = { 14 .c_models = {
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 169a120587be..87b67e3a765a 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -729,7 +729,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
729 729
730 spin_unlock_irqrestore(&ds_lock, irq); 730 spin_unlock_irqrestore(&ds_lock, irq);
731 731
732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); 732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
733 ds_resume_pebs(tracer); 733 ds_resume_pebs(tracer);
734 734
735 return tracer; 735 return tracer;
@@ -1029,5 +1029,4 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
1029 1029
1030void ds_exit_thread(struct task_struct *tsk) 1030void ds_exit_thread(struct task_struct *tsk)
1031{ 1031{
1032 WARN_ON(tsk->thread.ds_ctx);
1033} 1032}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 87d103ded1c3..95ea5fa7d444 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -10,10 +10,12 @@
10#include <linux/kdebug.h> 10#include <linux/kdebug.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/ptrace.h> 12#include <linux/ptrace.h>
13#include <linux/ftrace.h>
13#include <linux/kexec.h> 14#include <linux/kexec.h>
14#include <linux/bug.h> 15#include <linux/bug.h>
15#include <linux/nmi.h> 16#include <linux/nmi.h>
16#include <linux/sysfs.h> 17#include <linux/sysfs.h>
18#include <linux/ftrace.h>
17 19
18#include <asm/stacktrace.h> 20#include <asm/stacktrace.h>
19 21
@@ -195,6 +197,11 @@ unsigned __kprobes long oops_begin(void)
195 int cpu; 197 int cpu;
196 unsigned long flags; 198 unsigned long flags;
197 199
200 /* notify the hw-branch tracer so it may disable tracing and
201 add the last trace to the trace buffer -
202 the earlier this happens, the more useful the trace. */
203 trace_hw_branch_oops();
204
198 oops_enter(); 205 oops_enter();
199 206
200 /* racy, but better than risking deadlock. */ 207 /* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 508bec1cee27..ef2c3563357d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -110,19 +110,50 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
110/* 110/*
111 * Add a memory region to the kernel e820 map. 111 * Add a memory region to the kernel e820 map.
112 */ 112 */
113void __init e820_add_region(u64 start, u64 size, int type) 113static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
114 int type)
114{ 115{
115 int x = e820.nr_map; 116 int x = e820x->nr_map;
116 117
117 if (x == ARRAY_SIZE(e820.map)) { 118 if (x == ARRAY_SIZE(e820x->map)) {
118 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); 119 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
119 return; 120 return;
120 } 121 }
121 122
122 e820.map[x].addr = start; 123 e820x->map[x].addr = start;
123 e820.map[x].size = size; 124 e820x->map[x].size = size;
124 e820.map[x].type = type; 125 e820x->map[x].type = type;
125 e820.nr_map++; 126 e820x->nr_map++;
127}
128
129void __init e820_add_region(u64 start, u64 size, int type)
130{
131 __e820_add_region(&e820, start, size, type);
132}
133
134static void __init e820_print_type(u32 type)
135{
136 switch (type) {
137 case E820_RAM:
138 case E820_RESERVED_KERN:
139 printk(KERN_CONT "(usable)");
140 break;
141 case E820_RESERVED:
142 printk(KERN_CONT "(reserved)");
143 break;
144 case E820_ACPI:
145 printk(KERN_CONT "(ACPI data)");
146 break;
147 case E820_NVS:
148 printk(KERN_CONT "(ACPI NVS)");
149 break;
150 case E820_UNUSABLE:
151 printk(KERN_CONT "(unusable)");
152 break;
153 default:
154 printk(KERN_CONT "type %u", type);
155 break;
156 }
126} 157}
127 158
128void __init e820_print_map(char *who) 159void __init e820_print_map(char *who)
@@ -134,27 +165,8 @@ void __init e820_print_map(char *who)
134 (unsigned long long) e820.map[i].addr, 165 (unsigned long long) e820.map[i].addr,
135 (unsigned long long) 166 (unsigned long long)
136 (e820.map[i].addr + e820.map[i].size)); 167 (e820.map[i].addr + e820.map[i].size));
137 switch (e820.map[i].type) { 168 e820_print_type(e820.map[i].type);
138 case E820_RAM: 169 printk(KERN_CONT "\n");
139 case E820_RESERVED_KERN:
140 printk(KERN_CONT "(usable)\n");
141 break;
142 case E820_RESERVED:
143 printk(KERN_CONT "(reserved)\n");
144 break;
145 case E820_ACPI:
146 printk(KERN_CONT "(ACPI data)\n");
147 break;
148 case E820_NVS:
149 printk(KERN_CONT "(ACPI NVS)\n");
150 break;
151 case E820_UNUSABLE:
152 printk("(unusable)\n");
153 break;
154 default:
155 printk(KERN_CONT "type %u\n", e820.map[i].type);
156 break;
157 }
158 } 170 }
159} 171}
160 172
@@ -221,7 +233,7 @@ void __init e820_print_map(char *who)
221 */ 233 */
222 234
223int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, 235int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
224 int *pnr_map) 236 u32 *pnr_map)
225{ 237{
226 struct change_member { 238 struct change_member {
227 struct e820entry *pbios; /* pointer to original bios entry */ 239 struct e820entry *pbios; /* pointer to original bios entry */
@@ -417,11 +429,12 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
417 return __append_e820_map(biosmap, nr_map); 429 return __append_e820_map(biosmap, nr_map);
418} 430}
419 431
420static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, 432static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
421 u64 size, unsigned old_type, 433 u64 size, unsigned old_type,
422 unsigned new_type) 434 unsigned new_type)
423{ 435{
424 int i; 436 u64 end;
437 unsigned int i;
425 u64 real_updated_size = 0; 438 u64 real_updated_size = 0;
426 439
427 BUG_ON(old_type == new_type); 440 BUG_ON(old_type == new_type);
@@ -429,27 +442,55 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
429 if (size > (ULLONG_MAX - start)) 442 if (size > (ULLONG_MAX - start))
430 size = ULLONG_MAX - start; 443 size = ULLONG_MAX - start;
431 444
432 for (i = 0; i < e820.nr_map; i++) { 445 end = start + size;
446 printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
447 (unsigned long long) start,
448 (unsigned long long) end);
449 e820_print_type(old_type);
450 printk(KERN_CONT " ==> ");
451 e820_print_type(new_type);
452 printk(KERN_CONT "\n");
453
454 for (i = 0; i < e820x->nr_map; i++) {
433 struct e820entry *ei = &e820x->map[i]; 455 struct e820entry *ei = &e820x->map[i];
434 u64 final_start, final_end; 456 u64 final_start, final_end;
457 u64 ei_end;
458
435 if (ei->type != old_type) 459 if (ei->type != old_type)
436 continue; 460 continue;
437 /* totally covered? */ 461
438 if (ei->addr >= start && 462 ei_end = ei->addr + ei->size;
439 (ei->addr + ei->size) <= (start + size)) { 463 /* totally covered by new range? */
464 if (ei->addr >= start && ei_end <= end) {
440 ei->type = new_type; 465 ei->type = new_type;
441 real_updated_size += ei->size; 466 real_updated_size += ei->size;
442 continue; 467 continue;
443 } 468 }
469
470 /* new range is totally covered? */
471 if (ei->addr < start && ei_end > end) {
472 __e820_add_region(e820x, start, size, new_type);
473 __e820_add_region(e820x, end, ei_end - end, ei->type);
474 ei->size = start - ei->addr;
475 real_updated_size += size;
476 continue;
477 }
478
444 /* partially covered */ 479 /* partially covered */
445 final_start = max(start, ei->addr); 480 final_start = max(start, ei->addr);
446 final_end = min(start + size, ei->addr + ei->size); 481 final_end = min(end, ei_end);
447 if (final_start >= final_end) 482 if (final_start >= final_end)
448 continue; 483 continue;
449 e820_add_region(final_start, final_end - final_start, 484
450 new_type); 485 __e820_add_region(e820x, final_start, final_end - final_start,
486 new_type);
487
451 real_updated_size += final_end - final_start; 488 real_updated_size += final_end - final_start;
452 489
490 /*
491 * left range could be head or tail, so need to update
492 * size at first.
493 */
453 ei->size -= final_end - final_start; 494 ei->size -= final_end - final_start;
454 if (ei->addr < final_start) 495 if (ei->addr < final_start)
455 continue; 496 continue;
@@ -461,13 +502,13 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
461u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, 502u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
462 unsigned new_type) 503 unsigned new_type)
463{ 504{
464 return e820_update_range_map(&e820, start, size, old_type, new_type); 505 return __e820_update_range(&e820, start, size, old_type, new_type);
465} 506}
466 507
467static u64 __init e820_update_range_saved(u64 start, u64 size, 508static u64 __init e820_update_range_saved(u64 start, u64 size,
468 unsigned old_type, unsigned new_type) 509 unsigned old_type, unsigned new_type)
469{ 510{
470 return e820_update_range_map(&e820_saved, start, size, old_type, 511 return __e820_update_range(&e820_saved, start, size, old_type,
471 new_type); 512 new_type);
472} 513}
473 514
@@ -511,7 +552,7 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
511 552
512void __init update_e820(void) 553void __init update_e820(void)
513{ 554{
514 int nr_map; 555 u32 nr_map;
515 556
516 nr_map = e820.nr_map; 557 nr_map = e820.nr_map;
517 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) 558 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
@@ -522,7 +563,7 @@ void __init update_e820(void)
522} 563}
523static void __init update_e820_saved(void) 564static void __init update_e820_saved(void)
524{ 565{
525 int nr_map; 566 u32 nr_map;
526 567
527 nr_map = e820_saved.nr_map; 568 nr_map = e820_saved.nr_map;
528 if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) 569 if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
@@ -1020,8 +1061,8 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
1020 continue; 1061 continue;
1021 return addr; 1062 return addr;
1022 } 1063 }
1023 return -1UL;
1024 1064
1065 return -1ULL;
1025} 1066}
1026 1067
1027/* 1068/*
@@ -1034,13 +1075,22 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
1034 u64 start; 1075 u64 start;
1035 1076
1036 start = startt; 1077 start = startt;
1037 while (size < sizet) 1078 while (size < sizet && (start + 1))
1038 start = find_e820_area_size(start, &size, align); 1079 start = find_e820_area_size(start, &size, align);
1039 1080
1040 if (size < sizet) 1081 if (size < sizet)
1041 return 0; 1082 return 0;
1042 1083
1084#ifdef CONFIG_X86_32
1085 if (start >= MAXMEM)
1086 return 0;
1087 if (start + size > MAXMEM)
1088 size = MAXMEM - start;
1089#endif
1090
1043 addr = round_down(start + size - sizet, align); 1091 addr = round_down(start + size - sizet, align);
1092 if (addr < start)
1093 return 0;
1044 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); 1094 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
1045 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); 1095 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
1046 printk(KERN_INFO "update e820 for early_reserve_e820\n"); 1096 printk(KERN_INFO "update e820 for early_reserve_e820\n");
@@ -1253,7 +1303,7 @@ early_param("memmap", parse_memmap_opt);
1253void __init finish_e820_parsing(void) 1303void __init finish_e820_parsing(void)
1254{ 1304{
1255 if (userdef) { 1305 if (userdef) {
1256 int nr = e820.nr_map; 1306 u32 nr = e820.nr_map;
1257 1307
1258 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) 1308 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
1259 early_panic("Invalid user supplied memory map"); 1309 early_panic("Invalid user supplied memory map");
@@ -1336,7 +1386,7 @@ void __init e820_reserve_resources_late(void)
1336char *__init default_machine_specific_memory_setup(void) 1386char *__init default_machine_specific_memory_setup(void)
1337{ 1387{
1338 char *who = "BIOS-e820"; 1388 char *who = "BIOS-e820";
1339 int new_nr; 1389 u32 new_nr;
1340 /* 1390 /*
1341 * Try to copy the BIOS-supplied E820-map. 1391 * Try to copy the BIOS-supplied E820-map.
1342 * 1392 *
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 639ad98238a2..335f049d110f 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -250,7 +250,7 @@ static int dbgp_wait_until_complete(void)
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); 250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251} 251}
252 252
253static void dbgp_mdelay(int ms) 253static void __init dbgp_mdelay(int ms)
254{ 254{
255 int i; 255 int i;
256 256
@@ -311,7 +311,7 @@ static void dbgp_set_data(const void *buf, int size)
311 writel(hi, &ehci_debug->data47); 311 writel(hi, &ehci_debug->data47);
312} 312}
313 313
314static void dbgp_get_data(void *buf, int size) 314static void __init dbgp_get_data(void *buf, int size)
315{ 315{
316 unsigned char *bytes = buf; 316 unsigned char *bytes = buf;
317 u32 lo, hi; 317 u32 lo, hi;
@@ -355,7 +355,7 @@ static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
355 return ret; 355 return ret;
356} 356}
357 357
358static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, 358static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size) 359 int size)
360{ 360{
361 u32 pids, addr, ctrl; 361 u32 pids, addr, ctrl;
@@ -386,8 +386,8 @@ static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
386 return ret; 386 return ret;
387} 387}
388 388
389static int dbgp_control_msg(unsigned devnum, int requesttype, int request, 389static int __init dbgp_control_msg(unsigned devnum, int requesttype,
390 int value, int index, void *data, int size) 390 int request, int value, int index, void *data, int size)
391{ 391{
392 u32 pids, addr, ctrl; 392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req; 393 struct usb_ctrlrequest req;
@@ -489,7 +489,7 @@ static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
489 return 0; 489 return 0;
490} 490}
491 491
492static int ehci_reset_port(int port) 492static int __init ehci_reset_port(int port)
493{ 493{
494 u32 portsc; 494 u32 portsc;
495 u32 delay_time, delay; 495 u32 delay_time, delay;
@@ -532,7 +532,7 @@ static int ehci_reset_port(int port)
532 return -EBUSY; 532 return -EBUSY;
533} 533}
534 534
535static int ehci_wait_for_port(int port) 535static int __init ehci_wait_for_port(int port)
536{ 536{
537 u32 status; 537 u32 status;
538 int ret, reps; 538 int ret, reps;
@@ -557,13 +557,13 @@ static inline void dbgp_printk(const char *fmt, ...) { }
557 557
558typedef void (*set_debug_port_t)(int port); 558typedef void (*set_debug_port_t)(int port);
559 559
560static void default_set_debug_port(int port) 560static void __init default_set_debug_port(int port)
561{ 561{
562} 562}
563 563
564static set_debug_port_t set_debug_port = default_set_debug_port; 564static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
565 565
566static void nvidia_set_debug_port(int port) 566static void __init nvidia_set_debug_port(int port)
567{ 567{
568 u32 dword; 568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index b205272ad394..1736acc4d7aa 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -469,7 +469,7 @@ void __init efi_enter_virtual_mode(void)
469 efi_memory_desc_t *md; 469 efi_memory_desc_t *md;
470 efi_status_t status; 470 efi_status_t status;
471 unsigned long size; 471 unsigned long size;
472 u64 end, systab, addr, npages; 472 u64 end, systab, addr, npages, end_pfn;
473 void *p, *va; 473 void *p, *va;
474 474
475 efi.systab = NULL; 475 efi.systab = NULL;
@@ -481,7 +481,10 @@ void __init efi_enter_virtual_mode(void)
481 size = md->num_pages << EFI_PAGE_SHIFT; 481 size = md->num_pages << EFI_PAGE_SHIFT;
482 end = md->phys_addr + size; 482 end = md->phys_addr + size;
483 483
484 if (PFN_UP(end) <= max_low_pfn_mapped) 484 end_pfn = PFN_UP(end);
485 if (end_pfn <= max_low_pfn_mapped
486 || (end_pfn > (1UL << (32 - PAGE_SHIFT))
487 && end_pfn <= max_pfn_mapped))
485 va = __va(md->phys_addr); 488 va = __va(md->phys_addr);
486 else 489 else
487 va = efi_ioremap(md->phys_addr, size); 490 va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index a4ee29127fdf..22c3b7828c50 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -100,24 +100,11 @@ void __init efi_call_phys_epilog(void)
100 100
101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) 101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
102{ 102{
103 static unsigned pages_mapped __initdata; 103 unsigned long last_map_pfn;
104 unsigned i, pages;
105 unsigned long offset;
106 104
107 pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr); 105 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
108 offset = phys_addr & ~PAGE_MASK; 106 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
109 phys_addr &= PAGE_MASK;
110
111 if (pages_mapped + pages > MAX_EFI_IO_PAGES)
112 return NULL; 107 return NULL;
113 108
114 for (i = 0; i < pages; i++) { 109 return (void __iomem *)__va(phys_addr);
115 __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
116 phys_addr, PAGE_KERNEL);
117 phys_addr += PAGE_SIZE;
118 pages_mapped++;
119 }
120
121 return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
122 (pages_mapped - pages)) + offset;
123} 110}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 899e8938e79f..c929add475c9 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -442,8 +442,7 @@ sysenter_past_esp:
442 442
443 GET_THREAD_INFO(%ebp) 443 GET_THREAD_INFO(%ebp)
444 444
445 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 445 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
446 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
447 jnz sysenter_audit 446 jnz sysenter_audit
448sysenter_do_call: 447sysenter_do_call:
449 cmpl $(nr_syscalls), %eax 448 cmpl $(nr_syscalls), %eax
@@ -454,7 +453,7 @@ sysenter_do_call:
454 DISABLE_INTERRUPTS(CLBR_ANY) 453 DISABLE_INTERRUPTS(CLBR_ANY)
455 TRACE_IRQS_OFF 454 TRACE_IRQS_OFF
456 movl TI_flags(%ebp), %ecx 455 movl TI_flags(%ebp), %ecx
457 testw $_TIF_ALLWORK_MASK, %cx 456 testl $_TIF_ALLWORK_MASK, %ecx
458 jne sysexit_audit 457 jne sysexit_audit
459sysenter_exit: 458sysenter_exit:
460/* if something modifies registers it must also disable sysexit */ 459/* if something modifies registers it must also disable sysexit */
@@ -468,7 +467,7 @@ sysenter_exit:
468 467
469#ifdef CONFIG_AUDITSYSCALL 468#ifdef CONFIG_AUDITSYSCALL
470sysenter_audit: 469sysenter_audit:
471 testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 470 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
472 jnz syscall_trace_entry 471 jnz syscall_trace_entry
473 addl $4,%esp 472 addl $4,%esp
474 CFI_ADJUST_CFA_OFFSET -4 473 CFI_ADJUST_CFA_OFFSET -4
@@ -485,7 +484,7 @@ sysenter_audit:
485 jmp sysenter_do_call 484 jmp sysenter_do_call
486 485
487sysexit_audit: 486sysexit_audit:
488 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx 487 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
489 jne syscall_exit_work 488 jne syscall_exit_work
490 TRACE_IRQS_ON 489 TRACE_IRQS_ON
491 ENABLE_INTERRUPTS(CLBR_ANY) 490 ENABLE_INTERRUPTS(CLBR_ANY)
@@ -498,7 +497,7 @@ sysexit_audit:
498 DISABLE_INTERRUPTS(CLBR_ANY) 497 DISABLE_INTERRUPTS(CLBR_ANY)
499 TRACE_IRQS_OFF 498 TRACE_IRQS_OFF
500 movl TI_flags(%ebp), %ecx 499 movl TI_flags(%ebp), %ecx
501 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx 500 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
502 jne syscall_exit_work 501 jne syscall_exit_work
503 movl PT_EAX(%esp),%eax /* reload syscall return value */ 502 movl PT_EAX(%esp),%eax /* reload syscall return value */
504 jmp sysenter_exit 503 jmp sysenter_exit
@@ -523,8 +522,7 @@ ENTRY(system_call)
523 SAVE_ALL 522 SAVE_ALL
524 GET_THREAD_INFO(%ebp) 523 GET_THREAD_INFO(%ebp)
525 # system call tracing in operation / emulation 524 # system call tracing in operation / emulation
526 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 525 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
527 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
528 jnz syscall_trace_entry 526 jnz syscall_trace_entry
529 cmpl $(nr_syscalls), %eax 527 cmpl $(nr_syscalls), %eax
530 jae syscall_badsys 528 jae syscall_badsys
@@ -538,7 +536,7 @@ syscall_exit:
538 # between sampling and the iret 536 # between sampling and the iret
539 TRACE_IRQS_OFF 537 TRACE_IRQS_OFF
540 movl TI_flags(%ebp), %ecx 538 movl TI_flags(%ebp), %ecx
541 testw $_TIF_ALLWORK_MASK, %cx # current->work 539 testl $_TIF_ALLWORK_MASK, %ecx # current->work
542 jne syscall_exit_work 540 jne syscall_exit_work
543 541
544restore_all: 542restore_all:
@@ -673,7 +671,7 @@ END(syscall_trace_entry)
673 # perform syscall exit tracing 671 # perform syscall exit tracing
674 ALIGN 672 ALIGN
675syscall_exit_work: 673syscall_exit_work:
676 testb $_TIF_WORK_SYSCALL_EXIT, %cl 674 testl $_TIF_WORK_SYSCALL_EXIT, %ecx
677 jz work_pending 675 jz work_pending
678 TRACE_IRQS_ON 676 TRACE_IRQS_ON
679 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call 677 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 24c7031e23ca..3f129d963a05 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -368,6 +368,7 @@ ENTRY(save_rest)
368END(save_rest) 368END(save_rest)
369 369
370/* save complete stack frame */ 370/* save complete stack frame */
371 .pushsection .kprobes.text, "ax"
371ENTRY(save_paranoid) 372ENTRY(save_paranoid)
372 XCPT_FRAME 1 RDI+8 373 XCPT_FRAME 1 RDI+8
373 cld 374 cld
@@ -396,6 +397,7 @@ ENTRY(save_paranoid)
3961: ret 3971: ret
397 CFI_ENDPROC 398 CFI_ENDPROC
398END(save_paranoid) 399END(save_paranoid)
400 .popsection
399 401
400/* 402/*
401 * A newly forked process directly context switches into this address. 403 * A newly forked process directly context switches into this address.
@@ -416,7 +418,6 @@ ENTRY(ret_from_fork)
416 418
417 GET_THREAD_INFO(%rcx) 419 GET_THREAD_INFO(%rcx)
418 420
419 CFI_REMEMBER_STATE
420 RESTORE_REST 421 RESTORE_REST
421 422
422 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 423 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
@@ -428,7 +429,6 @@ ENTRY(ret_from_fork)
428 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET 429 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
429 jmp ret_from_sys_call # go to the SYSRET fastpath 430 jmp ret_from_sys_call # go to the SYSRET fastpath
430 431
431 CFI_RESTORE_STATE
432 CFI_ENDPROC 432 CFI_ENDPROC
433END(ret_from_fork) 433END(ret_from_fork)
434 434
@@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \
984#endif 984#endif
985apicinterrupt LOCAL_TIMER_VECTOR \ 985apicinterrupt LOCAL_TIMER_VECTOR \
986 apic_timer_interrupt smp_apic_timer_interrupt 986 apic_timer_interrupt smp_apic_timer_interrupt
987apicinterrupt GENERIC_INTERRUPT_VECTOR \
988 generic_interrupt smp_generic_interrupt
987 989
988#ifdef CONFIG_SMP 990#ifdef CONFIG_SMP
989apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 991apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 231bdd3c5b1c..61df77532120 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -18,6 +18,7 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/list.h> 19#include <linux/list.h>
20 20
21#include <asm/cacheflush.h>
21#include <asm/ftrace.h> 22#include <asm/ftrace.h>
22#include <linux/ftrace.h> 23#include <linux/ftrace.h>
23#include <asm/nops.h> 24#include <asm/nops.h>
@@ -26,6 +27,18 @@
26 27
27#ifdef CONFIG_DYNAMIC_FTRACE 28#ifdef CONFIG_DYNAMIC_FTRACE
28 29
30int ftrace_arch_code_modify_prepare(void)
31{
32 set_kernel_text_rw();
33 return 0;
34}
35
36int ftrace_arch_code_modify_post_process(void)
37{
38 set_kernel_text_ro();
39 return 0;
40}
41
29union ftrace_code_union { 42union ftrace_code_union {
30 char code[MCOUNT_INSN_SIZE]; 43 char code[MCOUNT_INSN_SIZE];
31 struct { 44 struct {
@@ -66,11 +79,11 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
66 * 79 *
67 * 1) Put the instruction pointer into the IP buffer 80 * 1) Put the instruction pointer into the IP buffer
68 * and the new code into the "code" buffer. 81 * and the new code into the "code" buffer.
69 * 2) Set a flag that says we are modifying code 82 * 2) Wait for any running NMIs to finish and set a flag that says
70 * 3) Wait for any running NMIs to finish. 83 * we are modifying code, it is done in an atomic operation.
71 * 4) Write the code 84 * 3) Write the code
72 * 5) clear the flag. 85 * 4) clear the flag.
73 * 6) Wait for any running NMIs to finish. 86 * 5) Wait for any running NMIs to finish.
74 * 87 *
75 * If an NMI is executed, the first thing it does is to call 88 * If an NMI is executed, the first thing it does is to call
76 * "ftrace_nmi_enter". This will check if the flag is set to write 89 * "ftrace_nmi_enter". This will check if the flag is set to write
@@ -82,9 +95,9 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
82 * are the same as what exists. 95 * are the same as what exists.
83 */ 96 */
84 97
85static atomic_t in_nmi = ATOMIC_INIT(0); 98#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */
99static atomic_t nmi_running = ATOMIC_INIT(0);
86static int mod_code_status; /* holds return value of text write */ 100static int mod_code_status; /* holds return value of text write */
87static int mod_code_write; /* set when NMI should do the write */
88static void *mod_code_ip; /* holds the IP to write to */ 101static void *mod_code_ip; /* holds the IP to write to */
89static void *mod_code_newcode; /* holds the text to write to the IP */ 102static void *mod_code_newcode; /* holds the text to write to the IP */
90 103
@@ -101,6 +114,20 @@ int ftrace_arch_read_dyn_info(char *buf, int size)
101 return r; 114 return r;
102} 115}
103 116
117static void clear_mod_flag(void)
118{
119 int old = atomic_read(&nmi_running);
120
121 for (;;) {
122 int new = old & ~MOD_CODE_WRITE_FLAG;
123
124 if (old == new)
125 break;
126
127 old = atomic_cmpxchg(&nmi_running, old, new);
128 }
129}
130
104static void ftrace_mod_code(void) 131static void ftrace_mod_code(void)
105{ 132{
106 /* 133 /*
@@ -111,37 +138,52 @@ static void ftrace_mod_code(void)
111 */ 138 */
112 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, 139 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
113 MCOUNT_INSN_SIZE); 140 MCOUNT_INSN_SIZE);
141
142 /* if we fail, then kill any new writers */
143 if (mod_code_status)
144 clear_mod_flag();
114} 145}
115 146
116void ftrace_nmi_enter(void) 147void ftrace_nmi_enter(void)
117{ 148{
118 atomic_inc(&in_nmi); 149 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
119 /* Must have in_nmi seen before reading write flag */ 150 smp_rmb();
120 smp_mb();
121 if (mod_code_write) {
122 ftrace_mod_code(); 151 ftrace_mod_code();
123 atomic_inc(&nmi_update_count); 152 atomic_inc(&nmi_update_count);
124 } 153 }
154 /* Must have previous changes seen before executions */
155 smp_mb();
125} 156}
126 157
127void ftrace_nmi_exit(void) 158void ftrace_nmi_exit(void)
128{ 159{
129 /* Finish all executions before clearing in_nmi */ 160 /* Finish all executions before clearing nmi_running */
130 smp_wmb(); 161 smp_mb();
131 atomic_dec(&in_nmi); 162 atomic_dec(&nmi_running);
163}
164
165static void wait_for_nmi_and_set_mod_flag(void)
166{
167 if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
168 return;
169
170 do {
171 cpu_relax();
172 } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
173
174 nmi_wait_count++;
132} 175}
133 176
134static void wait_for_nmi(void) 177static void wait_for_nmi(void)
135{ 178{
136 int waited = 0; 179 if (!atomic_read(&nmi_running))
180 return;
137 181
138 while (atomic_read(&in_nmi)) { 182 do {
139 waited = 1;
140 cpu_relax(); 183 cpu_relax();
141 } 184 } while (atomic_read(&nmi_running));
142 185
143 if (waited) 186 nmi_wait_count++;
144 nmi_wait_count++;
145} 187}
146 188
147static int 189static int
@@ -151,14 +193,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
151 mod_code_newcode = new_code; 193 mod_code_newcode = new_code;
152 194
153 /* The buffers need to be visible before we let NMIs write them */ 195 /* The buffers need to be visible before we let NMIs write them */
154 smp_wmb();
155
156 mod_code_write = 1;
157
158 /* Make sure write bit is visible before we wait on NMIs */
159 smp_mb(); 196 smp_mb();
160 197
161 wait_for_nmi(); 198 wait_for_nmi_and_set_mod_flag();
162 199
163 /* Make sure all running NMIs have finished before we write the code */ 200 /* Make sure all running NMIs have finished before we write the code */
164 smp_mb(); 201 smp_mb();
@@ -166,13 +203,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
166 ftrace_mod_code(); 203 ftrace_mod_code();
167 204
168 /* Make sure the write happens before clearing the bit */ 205 /* Make sure the write happens before clearing the bit */
169 smp_wmb();
170
171 mod_code_write = 0;
172
173 /* make sure NMIs see the cleared bit */
174 smp_mb(); 206 smp_mb();
175 207
208 clear_mod_flag();
176 wait_for_nmi(); 209 wait_for_nmi();
177 210
178 return mod_code_status; 211 return mod_code_status;
@@ -368,100 +401,8 @@ int ftrace_disable_ftrace_graph_caller(void)
368 return ftrace_mod_jmp(ip, old_offset, new_offset); 401 return ftrace_mod_jmp(ip, old_offset, new_offset);
369} 402}
370 403
371#else /* CONFIG_DYNAMIC_FTRACE */
372
373/*
374 * These functions are picked from those used on
375 * this page for dynamic ftrace. They have been
376 * simplified to ignore all traces in NMI context.
377 */
378static atomic_t in_nmi;
379
380void ftrace_nmi_enter(void)
381{
382 atomic_inc(&in_nmi);
383}
384
385void ftrace_nmi_exit(void)
386{
387 atomic_dec(&in_nmi);
388}
389
390#endif /* !CONFIG_DYNAMIC_FTRACE */ 404#endif /* !CONFIG_DYNAMIC_FTRACE */
391 405
392/* Add a function return address to the trace stack on thread info.*/
393static int push_return_trace(unsigned long ret, unsigned long long time,
394 unsigned long func, int *depth)
395{
396 int index;
397
398 if (!current->ret_stack)
399 return -EBUSY;
400
401 /* The return trace stack is full */
402 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
403 atomic_inc(&current->trace_overrun);
404 return -EBUSY;
405 }
406
407 index = ++current->curr_ret_stack;
408 barrier();
409 current->ret_stack[index].ret = ret;
410 current->ret_stack[index].func = func;
411 current->ret_stack[index].calltime = time;
412 *depth = index;
413
414 return 0;
415}
416
417/* Retrieve a function return address to the trace stack on thread info.*/
418static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
419{
420 int index;
421
422 index = current->curr_ret_stack;
423
424 if (unlikely(index < 0)) {
425 ftrace_graph_stop();
426 WARN_ON(1);
427 /* Might as well panic, otherwise we have no where to go */
428 *ret = (unsigned long)panic;
429 return;
430 }
431
432 *ret = current->ret_stack[index].ret;
433 trace->func = current->ret_stack[index].func;
434 trace->calltime = current->ret_stack[index].calltime;
435 trace->overrun = atomic_read(&current->trace_overrun);
436 trace->depth = index;
437 barrier();
438 current->curr_ret_stack--;
439
440}
441
442/*
443 * Send the trace to the ring-buffer.
444 * @return the original return address.
445 */
446unsigned long ftrace_return_to_handler(void)
447{
448 struct ftrace_graph_ret trace;
449 unsigned long ret;
450
451 pop_return_trace(&trace, &ret);
452 trace.rettime = cpu_clock(raw_smp_processor_id());
453 ftrace_graph_return(&trace);
454
455 if (unlikely(!ret)) {
456 ftrace_graph_stop();
457 WARN_ON(1);
458 /* Might as well panic. What else to do? */
459 ret = (unsigned long)panic;
460 }
461
462 return ret;
463}
464
465/* 406/*
466 * Hook the return address and push it in the stack of return addrs 407 * Hook the return address and push it in the stack of return addrs
467 * in current thread info. 408 * in current thread info.
@@ -469,14 +410,13 @@ unsigned long ftrace_return_to_handler(void)
469void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) 410void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
470{ 411{
471 unsigned long old; 412 unsigned long old;
472 unsigned long long calltime;
473 int faulted; 413 int faulted;
474 struct ftrace_graph_ent trace; 414 struct ftrace_graph_ent trace;
475 unsigned long return_hooker = (unsigned long) 415 unsigned long return_hooker = (unsigned long)
476 &return_to_handler; 416 &return_to_handler;
477 417
478 /* Nmi's are currently unsupported */ 418 /* Nmi's are currently unsupported */
479 if (unlikely(atomic_read(&in_nmi))) 419 if (unlikely(in_nmi()))
480 return; 420 return;
481 421
482 if (unlikely(atomic_read(&current->tracing_graph_pause))) 422 if (unlikely(atomic_read(&current->tracing_graph_pause)))
@@ -512,17 +452,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
512 return; 452 return;
513 } 453 }
514 454
515 if (unlikely(!__kernel_text_address(old))) { 455 if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) {
516 ftrace_graph_stop();
517 *parent = old;
518 WARN_ON(1);
519 return;
520 }
521
522 calltime = cpu_clock(raw_smp_processor_id());
523
524 if (push_return_trace(old, calltime,
525 self_addr, &trace.depth) == -EBUSY) {
526 *parent = old; 456 *parent = old;
527 return; 457 return;
528 } 458 }
@@ -536,3 +466,66 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
536 } 466 }
537} 467}
538#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 468#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
469
470#ifdef CONFIG_FTRACE_SYSCALLS
471
472extern unsigned long __start_syscalls_metadata[];
473extern unsigned long __stop_syscalls_metadata[];
474extern unsigned long *sys_call_table;
475
476static struct syscall_metadata **syscalls_metadata;
477
478static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
479{
480 struct syscall_metadata *start;
481 struct syscall_metadata *stop;
482 char str[KSYM_SYMBOL_LEN];
483
484
485 start = (struct syscall_metadata *)__start_syscalls_metadata;
486 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
487 kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
488
489 for ( ; start < stop; start++) {
490 if (start->name && !strcmp(start->name, str))
491 return start;
492 }
493 return NULL;
494}
495
496struct syscall_metadata *syscall_nr_to_meta(int nr)
497{
498 if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0)
499 return NULL;
500
501 return syscalls_metadata[nr];
502}
503
504void arch_init_ftrace_syscalls(void)
505{
506 int i;
507 struct syscall_metadata *meta;
508 unsigned long **psys_syscall_table = &sys_call_table;
509 static atomic_t refs;
510
511 if (atomic_inc_return(&refs) != 1)
512 goto end;
513
514 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
515 FTRACE_SYSCALL_MAX, GFP_KERNEL);
516 if (!syscalls_metadata) {
517 WARN_ON(1);
518 return;
519 }
520
521 for (i = 0; i < FTRACE_SYSCALL_MAX; i++) {
522 meta = find_syscall_meta(psys_syscall_table[i]);
523 syscalls_metadata[i] = meta;
524 }
525 return;
526
527 /* Paranoid: avoid overflow */
528end:
529 atomic_dec(&refs);
530}
531#endif
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index ac108d1fe182..3f8579f8d42c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -18,7 +18,7 @@ void __init i386_start_kernel(void)
18{ 18{
19 reserve_trampoline_memory(); 19 reserve_trampoline_memory();
20 20
21 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 21 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
22 22
23#ifdef CONFIG_BLK_DEV_INITRD 23#ifdef CONFIG_BLK_DEV_INITRD
24 /* Reserve INITRD */ 24 /* Reserve INITRD */
@@ -29,9 +29,6 @@ void __init i386_start_kernel(void)
29 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 29 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
30 } 30 }
31#endif 31#endif
32 reserve_early(init_pg_tables_start, init_pg_tables_end,
33 "INIT_PG_TABLE");
34
35 reserve_ebda_region(); 32 reserve_ebda_region();
36 33
37 /* 34 /*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f5b272247690..70eaa852c732 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -100,7 +100,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
100 100
101 reserve_trampoline_memory(); 101 reserve_trampoline_memory();
102 102
103 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 103 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
104 104
105#ifdef CONFIG_BLK_DEV_INITRD 105#ifdef CONFIG_BLK_DEV_INITRD
106 /* Reserve INITRD */ 106 /* Reserve INITRD */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c32ca19d591a..30683883e0cd 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -38,42 +38,40 @@
38#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id 38#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
39 39
40/* 40/*
41 * This is how much memory *in addition to the memory covered up to 41 * This is how much memory in addition to the memory covered up to
42 * and including _end* we need mapped initially. 42 * and including _end we need mapped initially.
43 * We need: 43 * We need:
44 * - one bit for each possible page, but only in low memory, which means 44 * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
45 * 2^32/4096/8 = 128K worst case (4G/4G split.) 45 * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
46 * - enough space to map all low memory, which means
47 * (2^32/4096) / 1024 pages (worst case, non PAE)
48 * (2^32/4096) / 512 + 4 pages (worst case for PAE)
49 * - a few pages for allocator use before the kernel pagetable has
50 * been set up
51 * 46 *
52 * Modulo rounding, each megabyte assigned here requires a kilobyte of 47 * Modulo rounding, each megabyte assigned here requires a kilobyte of
53 * memory, which is currently unreclaimed. 48 * memory, which is currently unreclaimed.
54 * 49 *
55 * This should be a multiple of a page. 50 * This should be a multiple of a page.
51 *
52 * KERNEL_IMAGE_SIZE should be greater than pa(_end)
53 * and small than max_low_pfn, otherwise will waste some page table entries
56 */ 54 */
57LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
58
59/*
60 * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
61 * pagetables from above the 16MB DMA limit, so we'll have to set
62 * up pagetables 16MB more (worst-case):
63 */
64#ifdef CONFIG_DEBUG_PAGEALLOC
65LOW_PAGES = LOW_PAGES + 0x1000000
66#endif
67 55
68#if PTRS_PER_PMD > 1 56#if PTRS_PER_PMD > 1
69PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD 57#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
70#else 58#else
71PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) 59#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
72#endif 60#endif
73BOOTBITMAP_SIZE = LOW_PAGES / 8
74ALLOCATOR_SLOP = 4
75 61
76INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm 62/* Enough space to fit pagetables for the low memory linear map */
63MAPPING_BEYOND_END = \
64 PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
65
66/*
67 * Worst-case size of the kernel mapping we need to make:
68 * the worst-case size of the kernel itself, plus the extra we need
69 * to map for the linear map.
70 */
71KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
72
73INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
74RESERVE_BRK(pagetables, INIT_MAP_SIZE)
77 75
78/* 76/*
79 * 32-bit kernel entrypoint; only used by the boot CPU. On entry, 77 * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
@@ -166,10 +164,10 @@ num_subarch_entries = (. - subarch_entries) / 4
166 164
167/* 165/*
168 * Initialize page tables. This creates a PDE and a set of page 166 * Initialize page tables. This creates a PDE and a set of page
169 * tables, which are located immediately beyond _end. The variable 167 * tables, which are located immediately beyond __brk_base. The variable
170 * init_pg_tables_end is set up to point to the first "safe" location. 168 * _brk_end is set up to point to the first "safe" location.
171 * Mappings are created both at virtual address 0 (identity mapping) 169 * Mappings are created both at virtual address 0 (identity mapping)
172 * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. 170 * and PAGE_OFFSET for up to _end.
173 * 171 *
174 * Note that the stack is not yet set up! 172 * Note that the stack is not yet set up!
175 */ 173 */
@@ -190,8 +188,7 @@ default_entry:
190 188
191 xorl %ebx,%ebx /* %ebx is kept at zero */ 189 xorl %ebx,%ebx /* %ebx is kept at zero */
192 190
193 movl $pa(pg0), %edi 191 movl $pa(__brk_base), %edi
194 movl %edi, pa(init_pg_tables_start)
195 movl $pa(swapper_pg_pmd), %edx 192 movl $pa(swapper_pg_pmd), %edx
196 movl $PTE_IDENT_ATTR, %eax 193 movl $PTE_IDENT_ATTR, %eax
19710: 19410:
@@ -209,14 +206,14 @@ default_entry:
209 loop 11b 206 loop 11b
210 207
211 /* 208 /*
212 * End condition: we must map up to and including INIT_MAP_BEYOND_END 209 * End condition: we must map up to the end + MAPPING_BEYOND_END.
213 * bytes beyond the end of our own page tables.
214 */ 210 */
215 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp 211 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
216 cmpl %ebp,%eax 212 cmpl %ebp,%eax
217 jb 10b 213 jb 10b
2181: 2141:
219 movl %edi,pa(init_pg_tables_end) 215 addl $__PAGE_OFFSET, %edi
216 movl %edi, pa(_brk_end)
220 shrl $12, %eax 217 shrl $12, %eax
221 movl %eax, pa(max_pfn_mapped) 218 movl %eax, pa(max_pfn_mapped)
222 219
@@ -227,8 +224,7 @@ default_entry:
227 224
228page_pde_offset = (__PAGE_OFFSET >> 20); 225page_pde_offset = (__PAGE_OFFSET >> 20);
229 226
230 movl $pa(pg0), %edi 227 movl $pa(__brk_base), %edi
231 movl %edi, pa(init_pg_tables_start)
232 movl $pa(swapper_pg_dir), %edx 228 movl $pa(swapper_pg_dir), %edx
233 movl $PTE_IDENT_ATTR, %eax 229 movl $PTE_IDENT_ATTR, %eax
23410: 23010:
@@ -242,14 +238,13 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
242 addl $0x1000,%eax 238 addl $0x1000,%eax
243 loop 11b 239 loop 11b
244 /* 240 /*
245 * End condition: we must map up to and including INIT_MAP_BEYOND_END 241 * End condition: we must map up to the end + MAPPING_BEYOND_END.
246 * bytes beyond the end of our own page tables; the +0x007 is
247 * the attribute bits
248 */ 242 */
249 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp 243 movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
250 cmpl %ebp,%eax 244 cmpl %ebp,%eax
251 jb 10b 245 jb 10b
252 movl %edi,pa(init_pg_tables_end) 246 addl $__PAGE_OFFSET, %edi
247 movl %edi, pa(_brk_end)
253 shrl $12, %eax 248 shrl $12, %eax
254 movl %eax, pa(max_pfn_mapped) 249 movl %eax, pa(max_pfn_mapped)
255 250
@@ -636,6 +631,7 @@ swapper_pg_fixmap:
636 .fill 1024,4,0 631 .fill 1024,4,0
637ENTRY(empty_zero_page) 632ENTRY(empty_zero_page)
638 .fill 4096,1,0 633 .fill 4096,1,0
634
639/* 635/*
640 * This starts the data section. 636 * This starts the data section.
641 */ 637 */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index a00545fe5cdd..648b3a2a3a44 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -80,6 +80,7 @@ static inline void hpet_clear_mapping(void)
80 */ 80 */
81static int boot_hpet_disable; 81static int boot_hpet_disable;
82int hpet_force_user; 82int hpet_force_user;
83static int hpet_verbose;
83 84
84static int __init hpet_setup(char *str) 85static int __init hpet_setup(char *str)
85{ 86{
@@ -88,6 +89,8 @@ static int __init hpet_setup(char *str)
88 boot_hpet_disable = 1; 89 boot_hpet_disable = 1;
89 if (!strncmp("force", str, 5)) 90 if (!strncmp("force", str, 5))
90 hpet_force_user = 1; 91 hpet_force_user = 1;
92 if (!strncmp("verbose", str, 7))
93 hpet_verbose = 1;
91 } 94 }
92 return 1; 95 return 1;
93} 96}
@@ -119,6 +122,43 @@ int is_hpet_enabled(void)
119} 122}
120EXPORT_SYMBOL_GPL(is_hpet_enabled); 123EXPORT_SYMBOL_GPL(is_hpet_enabled);
121 124
125static void _hpet_print_config(const char *function, int line)
126{
127 u32 i, timers, l, h;
128 printk(KERN_INFO "hpet: %s(%d):\n", function, line);
129 l = hpet_readl(HPET_ID);
130 h = hpet_readl(HPET_PERIOD);
131 timers = ((l & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
132 printk(KERN_INFO "hpet: ID: 0x%x, PERIOD: 0x%x\n", l, h);
133 l = hpet_readl(HPET_CFG);
134 h = hpet_readl(HPET_STATUS);
135 printk(KERN_INFO "hpet: CFG: 0x%x, STATUS: 0x%x\n", l, h);
136 l = hpet_readl(HPET_COUNTER);
137 h = hpet_readl(HPET_COUNTER+4);
138 printk(KERN_INFO "hpet: COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h);
139
140 for (i = 0; i < timers; i++) {
141 l = hpet_readl(HPET_Tn_CFG(i));
142 h = hpet_readl(HPET_Tn_CFG(i)+4);
143 printk(KERN_INFO "hpet: T%d: CFG_l: 0x%x, CFG_h: 0x%x\n",
144 i, l, h);
145 l = hpet_readl(HPET_Tn_CMP(i));
146 h = hpet_readl(HPET_Tn_CMP(i)+4);
147 printk(KERN_INFO "hpet: T%d: CMP_l: 0x%x, CMP_h: 0x%x\n",
148 i, l, h);
149 l = hpet_readl(HPET_Tn_ROUTE(i));
150 h = hpet_readl(HPET_Tn_ROUTE(i)+4);
151 printk(KERN_INFO "hpet: T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n",
152 i, l, h);
153 }
154}
155
156#define hpet_print_config() \
157do { \
158 if (hpet_verbose) \
159 _hpet_print_config(__FUNCTION__, __LINE__); \
160} while (0)
161
122/* 162/*
123 * When the hpet driver (/dev/hpet) is enabled, we need to reserve 163 * When the hpet driver (/dev/hpet) is enabled, we need to reserve
124 * timer 0 and timer 1 in case of RTC emulation. 164 * timer 0 and timer 1 in case of RTC emulation.
@@ -191,27 +231,37 @@ static struct clock_event_device hpet_clockevent = {
191 .rating = 50, 231 .rating = 50,
192}; 232};
193 233
194static void hpet_start_counter(void) 234static void hpet_stop_counter(void)
195{ 235{
196 unsigned long cfg = hpet_readl(HPET_CFG); 236 unsigned long cfg = hpet_readl(HPET_CFG);
197
198 cfg &= ~HPET_CFG_ENABLE; 237 cfg &= ~HPET_CFG_ENABLE;
199 hpet_writel(cfg, HPET_CFG); 238 hpet_writel(cfg, HPET_CFG);
200 hpet_writel(0, HPET_COUNTER); 239 hpet_writel(0, HPET_COUNTER);
201 hpet_writel(0, HPET_COUNTER + 4); 240 hpet_writel(0, HPET_COUNTER + 4);
241}
242
243static void hpet_start_counter(void)
244{
245 unsigned long cfg = hpet_readl(HPET_CFG);
202 cfg |= HPET_CFG_ENABLE; 246 cfg |= HPET_CFG_ENABLE;
203 hpet_writel(cfg, HPET_CFG); 247 hpet_writel(cfg, HPET_CFG);
204} 248}
205 249
250static void hpet_restart_counter(void)
251{
252 hpet_stop_counter();
253 hpet_start_counter();
254}
255
206static void hpet_resume_device(void) 256static void hpet_resume_device(void)
207{ 257{
208 force_hpet_resume(); 258 force_hpet_resume();
209} 259}
210 260
211static void hpet_restart_counter(void) 261static void hpet_resume_counter(void)
212{ 262{
213 hpet_resume_device(); 263 hpet_resume_device();
214 hpet_start_counter(); 264 hpet_restart_counter();
215} 265}
216 266
217static void hpet_enable_legacy_int(void) 267static void hpet_enable_legacy_int(void)
@@ -259,29 +309,23 @@ static int hpet_setup_msi_irq(unsigned int irq);
259static void hpet_set_mode(enum clock_event_mode mode, 309static void hpet_set_mode(enum clock_event_mode mode,
260 struct clock_event_device *evt, int timer) 310 struct clock_event_device *evt, int timer)
261{ 311{
262 unsigned long cfg, cmp, now; 312 unsigned long cfg;
263 uint64_t delta; 313 uint64_t delta;
264 314
265 switch (mode) { 315 switch (mode) {
266 case CLOCK_EVT_MODE_PERIODIC: 316 case CLOCK_EVT_MODE_PERIODIC:
317 hpet_stop_counter();
267 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; 318 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
268 delta >>= evt->shift; 319 delta >>= evt->shift;
269 now = hpet_readl(HPET_COUNTER);
270 cmp = now + (unsigned long) delta;
271 cfg = hpet_readl(HPET_Tn_CFG(timer)); 320 cfg = hpet_readl(HPET_Tn_CFG(timer));
272 /* Make sure we use edge triggered interrupts */ 321 /* Make sure we use edge triggered interrupts */
273 cfg &= ~HPET_TN_LEVEL; 322 cfg &= ~HPET_TN_LEVEL;
274 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | 323 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
275 HPET_TN_SETVAL | HPET_TN_32BIT; 324 HPET_TN_SETVAL | HPET_TN_32BIT;
276 hpet_writel(cfg, HPET_Tn_CFG(timer)); 325 hpet_writel(cfg, HPET_Tn_CFG(timer));
277 /*
278 * The first write after writing TN_SETVAL to the
279 * config register sets the counter value, the second
280 * write sets the period.
281 */
282 hpet_writel(cmp, HPET_Tn_CMP(timer));
283 udelay(1);
284 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); 326 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
327 hpet_start_counter();
328 hpet_print_config();
285 break; 329 break;
286 330
287 case CLOCK_EVT_MODE_ONESHOT: 331 case CLOCK_EVT_MODE_ONESHOT:
@@ -308,6 +352,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
308 irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); 352 irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
309 enable_irq(hdev->irq); 353 enable_irq(hdev->irq);
310 } 354 }
355 hpet_print_config();
311 break; 356 break;
312 } 357 }
313} 358}
@@ -526,6 +571,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
526 571
527 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); 572 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
528 num_timers++; /* Value read out starts from 0 */ 573 num_timers++; /* Value read out starts from 0 */
574 hpet_print_config();
529 575
530 hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); 576 hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
531 if (!hpet_devs) 577 if (!hpet_devs)
@@ -695,7 +741,7 @@ static struct clocksource clocksource_hpet = {
695 .mask = HPET_MASK, 741 .mask = HPET_MASK,
696 .shift = HPET_SHIFT, 742 .shift = HPET_SHIFT,
697 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 743 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
698 .resume = hpet_restart_counter, 744 .resume = hpet_resume_counter,
699#ifdef CONFIG_X86_64 745#ifdef CONFIG_X86_64
700 .vread = vread_hpet, 746 .vread = vread_hpet,
701#endif 747#endif
@@ -707,7 +753,7 @@ static int hpet_clocksource_register(void)
707 cycle_t t1; 753 cycle_t t1;
708 754
709 /* Start the counter */ 755 /* Start the counter */
710 hpet_start_counter(); 756 hpet_restart_counter();
711 757
712 /* Verify whether hpet counter works */ 758 /* Verify whether hpet counter works */
713 t1 = read_hpet(); 759 t1 = read_hpet();
@@ -793,6 +839,7 @@ int __init hpet_enable(void)
793 * information and the number of channels 839 * information and the number of channels
794 */ 840 */
795 id = hpet_readl(HPET_ID); 841 id = hpet_readl(HPET_ID);
842 hpet_print_config();
796 843
797#ifdef CONFIG_HPET_EMULATE_RTC 844#ifdef CONFIG_HPET_EMULATE_RTC
798 /* 845 /*
@@ -845,6 +892,7 @@ static __init int hpet_late_init(void)
845 return -ENODEV; 892 return -ENODEV;
846 893
847 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 894 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
895 hpet_print_config();
848 896
849 for_each_online_cpu(cpu) { 897 for_each_online_cpu(cpu) {
850 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); 898 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index b0f61f0dcd0a..f2f8540a7f3d 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk)
136#ifdef CONFIG_X86_32 136#ifdef CONFIG_X86_32
137 if (!HAVE_HWFP) { 137 if (!HAVE_HWFP) {
138 memset(tsk->thread.xstate, 0, xstate_size); 138 memset(tsk->thread.xstate, 0, xstate_size);
139 finit(); 139 finit_task(tsk);
140 set_stopped_child_used_math(tsk); 140 set_stopped_child_used_math(tsk);
141 return 0; 141 return 0;
142 } 142 }
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 10f92fb532f3..3475440baa54 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,17 +3,17 @@
3 * 3 *
4 */ 4 */
5#include <linux/clockchips.h> 5#include <linux/clockchips.h>
6#include <linux/init.h>
7#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/spinlock.h>
8#include <linux/jiffies.h> 8#include <linux/jiffies.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/spinlock.h> 10#include <linux/delay.h>
11#include <linux/init.h>
12#include <linux/io.h>
11 13
12#include <asm/smp.h>
13#include <asm/delay.h>
14#include <asm/i8253.h> 14#include <asm/i8253.h>
15#include <asm/io.h>
16#include <asm/hpet.h> 15#include <asm/hpet.h>
16#include <asm/smp.h>
17 17
18DEFINE_SPINLOCK(i8253_lock); 18DEFINE_SPINLOCK(i8253_lock);
19EXPORT_SYMBOL(i8253_lock); 19EXPORT_SYMBOL(i8253_lock);
@@ -40,7 +40,7 @@ static void init_pit_timer(enum clock_event_mode mode,
40{ 40{
41 spin_lock(&i8253_lock); 41 spin_lock(&i8253_lock);
42 42
43 switch(mode) { 43 switch (mode) {
44 case CLOCK_EVT_MODE_PERIODIC: 44 case CLOCK_EVT_MODE_PERIODIC:
45 /* binary, mode 2, LSB/MSB, ch 0 */ 45 /* binary, mode 2, LSB/MSB, ch 0 */
46 outb_pit(0x34, PIT_MODE); 46 outb_pit(0x34, PIT_MODE);
@@ -95,7 +95,7 @@ static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
95 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - 95 * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
96 * !using_apic_timer decisions in do_timer_interrupt_hook() 96 * !using_apic_timer decisions in do_timer_interrupt_hook()
97 */ 97 */
98static struct clock_event_device pit_clockevent = { 98static struct clock_event_device pit_ce = {
99 .name = "pit", 99 .name = "pit",
100 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, 100 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
101 .set_mode = init_pit_timer, 101 .set_mode = init_pit_timer,
@@ -114,15 +114,13 @@ void __init setup_pit_timer(void)
114 * Start pit with the boot cpu mask and make it global after the 114 * Start pit with the boot cpu mask and make it global after the
115 * IO_APIC has been initialized. 115 * IO_APIC has been initialized.
116 */ 116 */
117 pit_clockevent.cpumask = cpumask_of(smp_processor_id()); 117 pit_ce.cpumask = cpumask_of(smp_processor_id());
118 pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 118 pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
119 pit_clockevent.shift); 119 pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
120 pit_clockevent.max_delta_ns = 120 pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
121 clockevent_delta2ns(0x7FFF, &pit_clockevent); 121
122 pit_clockevent.min_delta_ns = 122 clockevents_register_device(&pit_ce);
123 clockevent_delta2ns(0xF, &pit_clockevent); 123 global_clock_event = &pit_ce;
124 clockevents_register_device(&pit_clockevent);
125 global_clock_event = &pit_clockevent;
126} 124}
127 125
128#ifndef CONFIG_X86_64 126#ifndef CONFIG_X86_64
@@ -133,11 +131,11 @@ void __init setup_pit_timer(void)
133 */ 131 */
134static cycle_t pit_read(void) 132static cycle_t pit_read(void)
135{ 133{
134 static int old_count;
135 static u32 old_jifs;
136 unsigned long flags; 136 unsigned long flags;
137 int count; 137 int count;
138 u32 jifs; 138 u32 jifs;
139 static int old_count;
140 static u32 old_jifs;
141 139
142 spin_lock_irqsave(&i8253_lock, flags); 140 spin_lock_irqsave(&i8253_lock, flags);
143 /* 141 /*
@@ -179,9 +177,9 @@ static cycle_t pit_read(void)
179 * Previous attempts to handle these cases intelligently were 177 * Previous attempts to handle these cases intelligently were
180 * buggy, so we just do the simple thing now. 178 * buggy, so we just do the simple thing now.
181 */ 179 */
182 if (count > old_count && jifs == old_jifs) { 180 if (count > old_count && jifs == old_jifs)
183 count = old_count; 181 count = old_count;
184 } 182
185 old_count = count; 183 old_count = count;
186 old_jifs = jifs; 184 old_jifs = jifs;
187 185
@@ -192,13 +190,13 @@ static cycle_t pit_read(void)
192 return (cycle_t)(jifs * LATCH) + count; 190 return (cycle_t)(jifs * LATCH) + count;
193} 191}
194 192
195static struct clocksource clocksource_pit = { 193static struct clocksource pit_cs = {
196 .name = "pit", 194 .name = "pit",
197 .rating = 110, 195 .rating = 110,
198 .read = pit_read, 196 .read = pit_read,
199 .mask = CLOCKSOURCE_MASK(32), 197 .mask = CLOCKSOURCE_MASK(32),
200 .mult = 0, 198 .mult = 0,
201 .shift = 20, 199 .shift = 20,
202}; 200};
203 201
204static void pit_disable_clocksource(void) 202static void pit_disable_clocksource(void)
@@ -206,9 +204,9 @@ static void pit_disable_clocksource(void)
206 /* 204 /*
207 * Use mult to check whether it is registered or not 205 * Use mult to check whether it is registered or not
208 */ 206 */
209 if (clocksource_pit.mult) { 207 if (pit_cs.mult) {
210 clocksource_unregister(&clocksource_pit); 208 clocksource_unregister(&pit_cs);
211 clocksource_pit.mult = 0; 209 pit_cs.mult = 0;
212 } 210 }
213} 211}
214 212
@@ -222,13 +220,13 @@ static int __init init_pit_clocksource(void)
222 * - when local APIC timer is active (PIT is switched off) 220 * - when local APIC timer is active (PIT is switched off)
223 */ 221 */
224 if (num_possible_cpus() > 1 || is_hpet_enabled() || 222 if (num_possible_cpus() > 1 || is_hpet_enabled() ||
225 pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) 223 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
226 return 0; 224 return 0;
227 225
228 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 226 pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
229 clocksource_pit.shift); 227
230 return clocksource_register(&clocksource_pit); 228 return clocksource_register(&pit_cs);
231} 229}
232arch_initcall(init_pit_clocksource); 230arch_initcall(init_pit_clocksource);
233 231
234#endif 232#endif /* !CONFIG_X86_64 */
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 720d2607aacb..a979b5bd2fc0 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -7,10 +7,10 @@
7 */ 7 */
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/delay.h> 10#include <linux/delay.h>
11#include <linux/init.h>
12#include <linux/dmi.h> 12#include <linux/dmi.h>
13#include <asm/io.h> 13#include <linux/io.h>
14 14
15int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; 15int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
16 16
@@ -47,8 +47,7 @@ EXPORT_SYMBOL(native_io_delay);
47static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) 47static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
48{ 48{
49 if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { 49 if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
50 printk(KERN_NOTICE "%s: using 0xed I/O delay port\n", 50 pr_notice("%s: using 0xed I/O delay port\n", id->ident);
51 id->ident);
52 io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; 51 io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
53 } 52 }
54 53
@@ -64,40 +63,40 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
64 .callback = dmi_io_delay_0xed_port, 63 .callback = dmi_io_delay_0xed_port,
65 .ident = "Compaq Presario V6000", 64 .ident = "Compaq Presario V6000",
66 .matches = { 65 .matches = {
67 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 66 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
68 DMI_MATCH(DMI_BOARD_NAME, "30B7") 67 DMI_MATCH(DMI_BOARD_NAME, "30B7")
69 } 68 }
70 }, 69 },
71 { 70 {
72 .callback = dmi_io_delay_0xed_port, 71 .callback = dmi_io_delay_0xed_port,
73 .ident = "HP Pavilion dv9000z", 72 .ident = "HP Pavilion dv9000z",
74 .matches = { 73 .matches = {
75 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 74 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
76 DMI_MATCH(DMI_BOARD_NAME, "30B9") 75 DMI_MATCH(DMI_BOARD_NAME, "30B9")
77 } 76 }
78 }, 77 },
79 { 78 {
80 .callback = dmi_io_delay_0xed_port, 79 .callback = dmi_io_delay_0xed_port,
81 .ident = "HP Pavilion dv6000", 80 .ident = "HP Pavilion dv6000",
82 .matches = { 81 .matches = {
83 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 82 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
84 DMI_MATCH(DMI_BOARD_NAME, "30B8") 83 DMI_MATCH(DMI_BOARD_NAME, "30B8")
85 } 84 }
86 }, 85 },
87 { 86 {
88 .callback = dmi_io_delay_0xed_port, 87 .callback = dmi_io_delay_0xed_port,
89 .ident = "HP Pavilion tx1000", 88 .ident = "HP Pavilion tx1000",
90 .matches = { 89 .matches = {
91 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 90 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
92 DMI_MATCH(DMI_BOARD_NAME, "30BF") 91 DMI_MATCH(DMI_BOARD_NAME, "30BF")
93 } 92 }
94 }, 93 },
95 { 94 {
96 .callback = dmi_io_delay_0xed_port, 95 .callback = dmi_io_delay_0xed_port,
97 .ident = "Presario F700", 96 .ident = "Presario F700",
98 .matches = { 97 .matches = {
99 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), 98 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
100 DMI_MATCH(DMI_BOARD_NAME, "30D3") 99 DMI_MATCH(DMI_BOARD_NAME, "30D3")
101 } 100 }
102 }, 101 },
103 { } 102 { }
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index e41980a373ab..99c4d308f16b 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -85,19 +85,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
85 85
86 t->io_bitmap_max = bytes; 86 t->io_bitmap_max = bytes;
87 87
88#ifdef CONFIG_X86_32
89 /*
90 * Sets the lazy trigger so that the next I/O operation will
91 * reload the correct bitmap.
92 * Reset the owner so that a process switch will not set
93 * tss->io_bitmap_base to IO_BITMAP_OFFSET.
94 */
95 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
96 tss->io_bitmap_owner = NULL;
97#else
98 /* Update the TSS: */ 88 /* Update the TSS: */
99 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); 89 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
100#endif
101 90
102 put_cpu(); 91 put_cpu();
103 92
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 7c95c8918a8f..9c2754302ecc 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -15,6 +15,9 @@
15 15
16atomic_t irq_err_count; 16atomic_t irq_err_count;
17 17
18/* Function pointer for generic interrupt vector handling */
19void (*generic_interrupt_extension)(void) = NULL;
20
18/* 21/*
19 * 'what should we do if we get a hw irq event on an illegal vector'. 22 * 'what should we do if we get a hw irq event on an illegal vector'.
20 * each architecture has to answer this themselves. 23 * each architecture has to answer this themselves.
@@ -42,59 +45,64 @@ void ack_bad_irq(unsigned int irq)
42/* 45/*
43 * /proc/interrupts printing: 46 * /proc/interrupts printing:
44 */ 47 */
45static int show_other_interrupts(struct seq_file *p) 48static int show_other_interrupts(struct seq_file *p, int prec)
46{ 49{
47 int j; 50 int j;
48 51
49 seq_printf(p, "NMI: "); 52 seq_printf(p, "%*s: ", prec, "NMI");
50 for_each_online_cpu(j) 53 for_each_online_cpu(j)
51 seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); 54 seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
52 seq_printf(p, " Non-maskable interrupts\n"); 55 seq_printf(p, " Non-maskable interrupts\n");
53#ifdef CONFIG_X86_LOCAL_APIC 56#ifdef CONFIG_X86_LOCAL_APIC
54 seq_printf(p, "LOC: "); 57 seq_printf(p, "%*s: ", prec, "LOC");
55 for_each_online_cpu(j) 58 for_each_online_cpu(j)
56 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); 59 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
57 seq_printf(p, " Local timer interrupts\n"); 60 seq_printf(p, " Local timer interrupts\n");
61
62 seq_printf(p, "%*s: ", prec, "SPU");
63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n");
58 seq_printf(p, "CNT: "); 66 seq_printf(p, "CNT: ");
59 for_each_online_cpu(j) 67 for_each_online_cpu(j)
60 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); 68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
61 seq_printf(p, " Performance counter interrupts\n"); 69 seq_printf(p, " Performance counter interrupts\n");
62#endif 70#endif
71 if (generic_interrupt_extension) {
72 seq_printf(p, "PLT: ");
73 for_each_online_cpu(j)
74 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
75 seq_printf(p, " Platform interrupts\n");
76 }
63#ifdef CONFIG_SMP 77#ifdef CONFIG_SMP
64 seq_printf(p, "RES: "); 78 seq_printf(p, "%*s: ", prec, "RES");
65 for_each_online_cpu(j) 79 for_each_online_cpu(j)
66 seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); 80 seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
67 seq_printf(p, " Rescheduling interrupts\n"); 81 seq_printf(p, " Rescheduling interrupts\n");
68 seq_printf(p, "CAL: "); 82 seq_printf(p, "%*s: ", prec, "CAL");
69 for_each_online_cpu(j) 83 for_each_online_cpu(j)
70 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); 84 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
71 seq_printf(p, " Function call interrupts\n"); 85 seq_printf(p, " Function call interrupts\n");
72 seq_printf(p, "TLB: "); 86 seq_printf(p, "%*s: ", prec, "TLB");
73 for_each_online_cpu(j) 87 for_each_online_cpu(j)
74 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); 88 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
75 seq_printf(p, " TLB shootdowns\n"); 89 seq_printf(p, " TLB shootdowns\n");
76#endif 90#endif
77#ifdef CONFIG_X86_MCE 91#ifdef CONFIG_X86_MCE
78 seq_printf(p, "TRM: "); 92 seq_printf(p, "%*s: ", prec, "TRM");
79 for_each_online_cpu(j) 93 for_each_online_cpu(j)
80 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 94 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
81 seq_printf(p, " Thermal event interrupts\n"); 95 seq_printf(p, " Thermal event interrupts\n");
82# ifdef CONFIG_X86_64 96# ifdef CONFIG_X86_64
83 seq_printf(p, "THR: "); 97 seq_printf(p, "%*s: ", prec, "THR");
84 for_each_online_cpu(j) 98 for_each_online_cpu(j)
85 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 99 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
86 seq_printf(p, " Threshold APIC interrupts\n"); 100 seq_printf(p, " Threshold APIC interrupts\n");
87# endif 101# endif
88#endif 102#endif
89#ifdef CONFIG_X86_LOCAL_APIC 103 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
90 seq_printf(p, "SPU: ");
91 for_each_online_cpu(j)
92 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
93 seq_printf(p, " Spurious interrupts\n");
94#endif
95 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
96#if defined(CONFIG_X86_IO_APIC) 104#if defined(CONFIG_X86_IO_APIC)
97 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); 105 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
98#endif 106#endif
99 return 0; 107 return 0;
100} 108}
@@ -102,19 +110,22 @@ static int show_other_interrupts(struct seq_file *p)
102int show_interrupts(struct seq_file *p, void *v) 110int show_interrupts(struct seq_file *p, void *v)
103{ 111{
104 unsigned long flags, any_count = 0; 112 unsigned long flags, any_count = 0;
105 int i = *(loff_t *) v, j; 113 int i = *(loff_t *) v, j, prec;
106 struct irqaction *action; 114 struct irqaction *action;
107 struct irq_desc *desc; 115 struct irq_desc *desc;
108 116
109 if (i > nr_irqs) 117 if (i > nr_irqs)
110 return 0; 118 return 0;
111 119
120 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
121 j *= 10;
122
112 if (i == nr_irqs) 123 if (i == nr_irqs)
113 return show_other_interrupts(p); 124 return show_other_interrupts(p, prec);
114 125
115 /* print header */ 126 /* print header */
116 if (i == 0) { 127 if (i == 0) {
117 seq_printf(p, " "); 128 seq_printf(p, "%*s", prec + 8, "");
118 for_each_online_cpu(j) 129 for_each_online_cpu(j)
119 seq_printf(p, "CPU%-8d", j); 130 seq_printf(p, "CPU%-8d", j);
120 seq_putc(p, '\n'); 131 seq_putc(p, '\n');
@@ -125,23 +136,15 @@ int show_interrupts(struct seq_file *p, void *v)
125 return 0; 136 return 0;
126 137
127 spin_lock_irqsave(&desc->lock, flags); 138 spin_lock_irqsave(&desc->lock, flags);
128#ifndef CONFIG_SMP
129 any_count = kstat_irqs(i);
130#else
131 for_each_online_cpu(j) 139 for_each_online_cpu(j)
132 any_count |= kstat_irqs_cpu(i, j); 140 any_count |= kstat_irqs_cpu(i, j);
133#endif
134 action = desc->action; 141 action = desc->action;
135 if (!action && !any_count) 142 if (!action && !any_count)
136 goto out; 143 goto out;
137 144
138 seq_printf(p, "%3d: ", i); 145 seq_printf(p, "%*d: ", prec, i);
139#ifndef CONFIG_SMP
140 seq_printf(p, "%10u ", kstat_irqs(i));
141#else
142 for_each_online_cpu(j) 146 for_each_online_cpu(j)
143 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); 147 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
144#endif
145 seq_printf(p, " %8s", desc->chip->name); 148 seq_printf(p, " %8s", desc->chip->name);
146 seq_printf(p, "-%-8s", desc->name); 149 seq_printf(p, "-%-8s", desc->name);
147 150
@@ -166,8 +169,11 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
166 169
167#ifdef CONFIG_X86_LOCAL_APIC 170#ifdef CONFIG_X86_LOCAL_APIC
168 sum += irq_stats(cpu)->apic_timer_irqs; 171 sum += irq_stats(cpu)->apic_timer_irqs;
172 sum += irq_stats(cpu)->irq_spurious_count;
169 sum += irq_stats(cpu)->apic_perf_irqs; 173 sum += irq_stats(cpu)->apic_perf_irqs;
170#endif 174#endif
175 if (generic_interrupt_extension)
176 sum += irq_stats(cpu)->generic_irqs;
171#ifdef CONFIG_SMP 177#ifdef CONFIG_SMP
172 sum += irq_stats(cpu)->irq_resched_count; 178 sum += irq_stats(cpu)->irq_resched_count;
173 sum += irq_stats(cpu)->irq_call_count; 179 sum += irq_stats(cpu)->irq_call_count;
@@ -179,9 +185,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
179 sum += irq_stats(cpu)->irq_threshold_count; 185 sum += irq_stats(cpu)->irq_threshold_count;
180#endif 186#endif
181#endif 187#endif
182#ifdef CONFIG_X86_LOCAL_APIC
183 sum += irq_stats(cpu)->irq_spurious_count;
184#endif
185 return sum; 188 return sum;
186} 189}
187 190
@@ -231,4 +234,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
231 return 1; 234 return 1;
232} 235}
233 236
237/*
238 * Handler for GENERIC_INTERRUPT_VECTOR.
239 */
240void smp_generic_interrupt(struct pt_regs *regs)
241{
242 struct pt_regs *old_regs = set_irq_regs(regs);
243
244 ack_APIC_irq();
245
246 exit_idle();
247
248 irq_enter();
249
250 inc_irq_stat(generic_irqs);
251
252 if (generic_interrupt_extension)
253 generic_interrupt_extension();
254
255 irq_exit();
256
257 set_irq_regs(old_regs);
258}
259
234EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 260EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9dc6b2b24275..3b09634a5153 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <linux/percpu.h>
19 20
20#include <asm/apic.h> 21#include <asm/apic.h>
21 22
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
55union irq_ctx { 56union irq_ctx {
56 struct thread_info tinfo; 57 struct thread_info tinfo;
57 u32 stack[THREAD_SIZE/sizeof(u32)]; 58 u32 stack[THREAD_SIZE/sizeof(u32)];
58}; 59} __attribute__((aligned(PAGE_SIZE)));
59 60
60static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
61static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
62 63
63static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 64static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
64static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; 65static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
65 66
66static void call_on_stack(void *func, void *stack) 67static void call_on_stack(void *func, void *stack)
67{ 68{
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
81 u32 *isp, arg1, arg2; 82 u32 *isp, arg1, arg2;
82 83
83 curctx = (union irq_ctx *) current_thread_info(); 84 curctx = (union irq_ctx *) current_thread_info();
84 irqctx = hardirq_ctx[smp_processor_id()]; 85 irqctx = __get_cpu_var(hardirq_ctx);
85 86
86 /* 87 /*
87 * this is where we switch to the IRQ stack. However, if we are 88 * this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
125{ 126{
126 union irq_ctx *irqctx; 127 union irq_ctx *irqctx;
127 128
128 if (hardirq_ctx[cpu]) 129 if (per_cpu(hardirq_ctx, cpu))
129 return; 130 return;
130 131
131 irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; 132 irqctx = &per_cpu(hardirq_stack, cpu);
132 irqctx->tinfo.task = NULL; 133 irqctx->tinfo.task = NULL;
133 irqctx->tinfo.exec_domain = NULL; 134 irqctx->tinfo.exec_domain = NULL;
134 irqctx->tinfo.cpu = cpu; 135 irqctx->tinfo.cpu = cpu;
135 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 136 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
136 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 137 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
137 138
138 hardirq_ctx[cpu] = irqctx; 139 per_cpu(hardirq_ctx, cpu) = irqctx;
139 140
140 irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; 141 irqctx = &per_cpu(softirq_stack, cpu);
141 irqctx->tinfo.task = NULL; 142 irqctx->tinfo.task = NULL;
142 irqctx->tinfo.exec_domain = NULL; 143 irqctx->tinfo.exec_domain = NULL;
143 irqctx->tinfo.cpu = cpu; 144 irqctx->tinfo.cpu = cpu;
144 irqctx->tinfo.preempt_count = 0; 145 irqctx->tinfo.preempt_count = 0;
145 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 146 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
146 147
147 softirq_ctx[cpu] = irqctx; 148 per_cpu(softirq_ctx, cpu) = irqctx;
148 149
149 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", 150 printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
150 cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); 151 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
151} 152}
152 153
153void irq_ctx_exit(int cpu) 154void irq_ctx_exit(int cpu)
154{ 155{
155 hardirq_ctx[cpu] = NULL; 156 per_cpu(hardirq_ctx, cpu) = NULL;
156} 157}
157 158
158asmlinkage void do_softirq(void) 159asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
169 170
170 if (local_softirq_pending()) { 171 if (local_softirq_pending()) {
171 curctx = current_thread_info(); 172 curctx = current_thread_info();
172 irqctx = softirq_ctx[smp_processor_id()]; 173 irqctx = __get_cpu_var(softirq_ctx);
173 irqctx->tinfo.task = curctx->task; 174 irqctx->tinfo.task = curctx->task;
174 irqctx->tinfo.previous_esp = current_stack_pointer; 175 irqctx->tinfo.previous_esp = current_stack_pointer;
175 176
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index f3e11cb295c4..925d87cfc551 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -50,7 +50,6 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
50 */ 50 */
51static struct irqaction fpu_irq = { 51static struct irqaction fpu_irq = {
52 .handler = math_error_irq, 52 .handler = math_error_irq,
53 .mask = CPU_MASK_NONE,
54 .name = "fpu", 53 .name = "fpu",
55}; 54};
56 55
@@ -83,7 +82,6 @@ void __init init_ISA_irqs(void)
83 */ 82 */
84static struct irqaction irq2 = { 83static struct irqaction irq2 = {
85 .handler = no_action, 84 .handler = no_action,
86 .mask = CPU_MASK_NONE,
87 .name = "cascade", 85 .name = "cascade",
88}; 86};
89 87
@@ -160,6 +158,9 @@ static void __init apic_intr_init(void)
160 /* self generated IPI for local APIC timer */ 158 /* self generated IPI for local APIC timer */
161 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 159 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
162 160
161 /* generic IPI for platform specific use */
162 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
163
163 /* IPI vectors for APIC spurious and error interrupts */ 164 /* IPI vectors for APIC spurious and error interrupts */
164 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 165 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
165 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 166 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 16e1fc687504..665e2ab48abd 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -45,7 +45,6 @@
45 45
46static struct irqaction irq2 = { 46static struct irqaction irq2 = {
47 .handler = no_action, 47 .handler = no_action,
48 .mask = CPU_MASK_NONE,
49 .name = "cascade", 48 .name = "cascade",
50}; 49};
51DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 50DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
@@ -147,6 +146,9 @@ static void __init apic_intr_init(void)
147 /* self generated IPI for local APIC timer */ 146 /* self generated IPI for local APIC timer */
148 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 147 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
149 148
149 /* generic IPI for platform specific use */
150 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
151
150 /* IPI vectors for APIC spurious and error interrupts */ 152 /* IPI vectors for APIC spurious and error interrupts */
151 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
152 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index ff7d3b0124f1..e444357375ce 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -8,11 +8,11 @@
8 */ 8 */
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/stat.h> 11#include <linux/module.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/stat.h>
13#include <linux/io.h> 14#include <linux/io.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/module.h>
16 16
17#include <asm/setup.h> 17#include <asm/setup.h>
18 18
@@ -26,9 +26,8 @@ struct setup_data_node {
26 u32 len; 26 u32 len;
27}; 27};
28 28
29static ssize_t 29static ssize_t setup_data_read(struct file *file, char __user *user_buf,
30setup_data_read(struct file *file, char __user *user_buf, size_t count, 30 size_t count, loff_t *ppos)
31 loff_t *ppos)
32{ 31{
33 struct setup_data_node *node = file->private_data; 32 struct setup_data_node *node = file->private_data;
34 unsigned long remain; 33 unsigned long remain;
@@ -39,20 +38,21 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
39 38
40 if (pos < 0) 39 if (pos < 0)
41 return -EINVAL; 40 return -EINVAL;
41
42 if (pos >= node->len) 42 if (pos >= node->len)
43 return 0; 43 return 0;
44 44
45 if (count > node->len - pos) 45 if (count > node->len - pos)
46 count = node->len - pos; 46 count = node->len - pos;
47
47 pa = node->paddr + sizeof(struct setup_data) + pos; 48 pa = node->paddr + sizeof(struct setup_data) + pos;
48 pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); 49 pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
49 if (PageHighMem(pg)) { 50 if (PageHighMem(pg)) {
50 p = ioremap_cache(pa, count); 51 p = ioremap_cache(pa, count);
51 if (!p) 52 if (!p)
52 return -ENXIO; 53 return -ENXIO;
53 } else { 54 } else
54 p = __va(pa); 55 p = __va(pa);
55 }
56 56
57 remain = copy_to_user(user_buf, p, count); 57 remain = copy_to_user(user_buf, p, count);
58 58
@@ -70,12 +70,13 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
70static int setup_data_open(struct inode *inode, struct file *file) 70static int setup_data_open(struct inode *inode, struct file *file)
71{ 71{
72 file->private_data = inode->i_private; 72 file->private_data = inode->i_private;
73
73 return 0; 74 return 0;
74} 75}
75 76
76static const struct file_operations fops_setup_data = { 77static const struct file_operations fops_setup_data = {
77 .read = setup_data_read, 78 .read = setup_data_read,
78 .open = setup_data_open, 79 .open = setup_data_open,
79}; 80};
80 81
81static int __init 82static int __init
@@ -84,57 +85,50 @@ create_setup_data_node(struct dentry *parent, int no,
84{ 85{
85 struct dentry *d, *type, *data; 86 struct dentry *d, *type, *data;
86 char buf[16]; 87 char buf[16];
87 int error;
88 88
89 sprintf(buf, "%d", no); 89 sprintf(buf, "%d", no);
90 d = debugfs_create_dir(buf, parent); 90 d = debugfs_create_dir(buf, parent);
91 if (!d) { 91 if (!d)
92 error = -ENOMEM; 92 return -ENOMEM;
93 goto err_return; 93
94 }
95 type = debugfs_create_x32("type", S_IRUGO, d, &node->type); 94 type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
96 if (!type) { 95 if (!type)
97 error = -ENOMEM;
98 goto err_dir; 96 goto err_dir;
99 } 97
100 data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data); 98 data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
101 if (!data) { 99 if (!data)
102 error = -ENOMEM;
103 goto err_type; 100 goto err_type;
104 } 101
105 return 0; 102 return 0;
106 103
107err_type: 104err_type:
108 debugfs_remove(type); 105 debugfs_remove(type);
109err_dir: 106err_dir:
110 debugfs_remove(d); 107 debugfs_remove(d);
111err_return: 108 return -ENOMEM;
112 return error;
113} 109}
114 110
115static int __init create_setup_data_nodes(struct dentry *parent) 111static int __init create_setup_data_nodes(struct dentry *parent)
116{ 112{
117 struct setup_data_node *node; 113 struct setup_data_node *node;
118 struct setup_data *data; 114 struct setup_data *data;
119 int error, no = 0; 115 int error = -ENOMEM;
120 struct dentry *d; 116 struct dentry *d;
121 struct page *pg; 117 struct page *pg;
122 u64 pa_data; 118 u64 pa_data;
119 int no = 0;
123 120
124 d = debugfs_create_dir("setup_data", parent); 121 d = debugfs_create_dir("setup_data", parent);
125 if (!d) { 122 if (!d)
126 error = -ENOMEM; 123 return -ENOMEM;
127 goto err_return;
128 }
129 124
130 pa_data = boot_params.hdr.setup_data; 125 pa_data = boot_params.hdr.setup_data;
131 126
132 while (pa_data) { 127 while (pa_data) {
133 node = kmalloc(sizeof(*node), GFP_KERNEL); 128 node = kmalloc(sizeof(*node), GFP_KERNEL);
134 if (!node) { 129 if (!node)
135 error = -ENOMEM;
136 goto err_dir; 130 goto err_dir;
137 } 131
138 pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); 132 pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
139 if (PageHighMem(pg)) { 133 if (PageHighMem(pg)) {
140 data = ioremap_cache(pa_data, sizeof(*data)); 134 data = ioremap_cache(pa_data, sizeof(*data));
@@ -143,9 +137,8 @@ static int __init create_setup_data_nodes(struct dentry *parent)
143 error = -ENXIO; 137 error = -ENXIO;
144 goto err_dir; 138 goto err_dir;
145 } 139 }
146 } else { 140 } else
147 data = __va(pa_data); 141 data = __va(pa_data);
148 }
149 142
150 node->paddr = pa_data; 143 node->paddr = pa_data;
151 node->type = data->type; 144 node->type = data->type;
@@ -159,11 +152,11 @@ static int __init create_setup_data_nodes(struct dentry *parent)
159 goto err_dir; 152 goto err_dir;
160 no++; 153 no++;
161 } 154 }
155
162 return 0; 156 return 0;
163 157
164err_dir: 158err_dir:
165 debugfs_remove(d); 159 debugfs_remove(d);
166err_return:
167 return error; 160 return error;
168} 161}
169 162
@@ -175,28 +168,26 @@ static struct debugfs_blob_wrapper boot_params_blob = {
175static int __init boot_params_kdebugfs_init(void) 168static int __init boot_params_kdebugfs_init(void)
176{ 169{
177 struct dentry *dbp, *version, *data; 170 struct dentry *dbp, *version, *data;
178 int error; 171 int error = -ENOMEM;
179 172
180 dbp = debugfs_create_dir("boot_params", NULL); 173 dbp = debugfs_create_dir("boot_params", NULL);
181 if (!dbp) { 174 if (!dbp)
182 error = -ENOMEM; 175 return -ENOMEM;
183 goto err_return; 176
184 }
185 version = debugfs_create_x16("version", S_IRUGO, dbp, 177 version = debugfs_create_x16("version", S_IRUGO, dbp,
186 &boot_params.hdr.version); 178 &boot_params.hdr.version);
187 if (!version) { 179 if (!version)
188 error = -ENOMEM;
189 goto err_dir; 180 goto err_dir;
190 } 181
191 data = debugfs_create_blob("data", S_IRUGO, dbp, 182 data = debugfs_create_blob("data", S_IRUGO, dbp,
192 &boot_params_blob); 183 &boot_params_blob);
193 if (!data) { 184 if (!data)
194 error = -ENOMEM;
195 goto err_version; 185 goto err_version;
196 } 186
197 error = create_setup_data_nodes(dbp); 187 error = create_setup_data_nodes(dbp);
198 if (error) 188 if (error)
199 goto err_data; 189 goto err_data;
190
200 return 0; 191 return 0;
201 192
202err_data: 193err_data:
@@ -205,10 +196,9 @@ err_version:
205 debugfs_remove(version); 196 debugfs_remove(version);
206err_dir: 197err_dir:
207 debugfs_remove(dbp); 198 debugfs_remove(dbp);
208err_return:
209 return error; 199 return error;
210} 200}
211#endif 201#endif /* CONFIG_DEBUG_BOOT_PARAMS */
212 202
213static int __init arch_kdebugfs_init(void) 203static int __init arch_kdebugfs_init(void)
214{ 204{
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index e948b28a5a9a..7b5169d2b000 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -193,6 +193,9 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes)
193 kprobe_opcode_t opcode; 193 kprobe_opcode_t opcode;
194 kprobe_opcode_t *orig_opcodes = opcodes; 194 kprobe_opcode_t *orig_opcodes = opcodes;
195 195
196 if (search_exception_tables((unsigned long)opcodes))
197 return 0; /* Page fault may occur on this address. */
198
196retry: 199retry:
197 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) 200 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
198 return 0; 201 return 0;
@@ -635,13 +638,13 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
635#else 638#else
636 " pushf\n" 639 " pushf\n"
637 /* 640 /*
638 * Skip cs, ip, orig_ax. 641 * Skip cs, ip, orig_ax and gs.
639 * trampoline_handler() will plug in these values 642 * trampoline_handler() will plug in these values
640 */ 643 */
641 " subl $12, %esp\n" 644 " subl $16, %esp\n"
642 " pushl %fs\n" 645 " pushl %fs\n"
643 " pushl %ds\n"
644 " pushl %es\n" 646 " pushl %es\n"
647 " pushl %ds\n"
645 " pushl %eax\n" 648 " pushl %eax\n"
646 " pushl %ebp\n" 649 " pushl %ebp\n"
647 " pushl %edi\n" 650 " pushl %edi\n"
@@ -652,10 +655,10 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
652 " movl %esp, %eax\n" 655 " movl %esp, %eax\n"
653 " call trampoline_handler\n" 656 " call trampoline_handler\n"
654 /* Move flags to cs */ 657 /* Move flags to cs */
655 " movl 52(%esp), %edx\n" 658 " movl 56(%esp), %edx\n"
656 " movl %edx, 48(%esp)\n" 659 " movl %edx, 52(%esp)\n"
657 /* Replace saved flags with true return address. */ 660 /* Replace saved flags with true return address. */
658 " movl %eax, 52(%esp)\n" 661 " movl %eax, 56(%esp)\n"
659 " popl %ebx\n" 662 " popl %ebx\n"
660 " popl %ecx\n" 663 " popl %ecx\n"
661 " popl %edx\n" 664 " popl %edx\n"
@@ -663,8 +666,8 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
663 " popl %edi\n" 666 " popl %edi\n"
664 " popl %ebp\n" 667 " popl %ebp\n"
665 " popl %eax\n" 668 " popl %eax\n"
666 /* Skip ip, orig_ax, es, ds, fs */ 669 /* Skip ds, es, fs, gs, orig_ax and ip */
667 " addl $20, %esp\n" 670 " addl $24, %esp\n"
668 " popf\n" 671 " popf\n"
669#endif 672#endif
670 " ret\n"); 673 " ret\n");
@@ -688,6 +691,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
688 regs->cs = __KERNEL_CS; 691 regs->cs = __KERNEL_CS;
689#else 692#else
690 regs->cs = __KERNEL_CS | get_kernel_rpl(); 693 regs->cs = __KERNEL_CS | get_kernel_rpl();
694 regs->gs = 0;
691#endif 695#endif
692 regs->ip = trampoline_address; 696 regs->ip = trampoline_address;
693 regs->orig_ax = ~0UL; 697 regs->orig_ax = ~0UL;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 478bca986eca..33019ddb56b4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -138,12 +138,6 @@ static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
138 kvm_mmu_write(ptep, pte_val(pte)); 138 kvm_mmu_write(ptep, pte_val(pte));
139} 139}
140 140
141static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
142 pte_t *ptep, pte_t pte)
143{
144 kvm_mmu_write(ptep, pte_val(pte));
145}
146
147static void kvm_pte_clear(struct mm_struct *mm, 141static void kvm_pte_clear(struct mm_struct *mm,
148 unsigned long addr, pte_t *ptep) 142 unsigned long addr, pte_t *ptep)
149{ 143{
@@ -220,7 +214,6 @@ static void paravirt_ops_setup(void)
220#if PAGETABLE_LEVELS >= 3 214#if PAGETABLE_LEVELS >= 3
221#ifdef CONFIG_X86_PAE 215#ifdef CONFIG_X86_PAE
222 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; 216 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
223 pv_mmu_ops.set_pte_present = kvm_set_pte_present;
224 pv_mmu_ops.pte_clear = kvm_pte_clear; 217 pv_mmu_ops.pte_clear = kvm_pte_clear;
225 pv_mmu_ops.pmd_clear = kvm_pmd_clear; 218 pv_mmu_ops.pmd_clear = kvm_pmd_clear;
226#endif 219#endif
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index f5fc8c781a62..e7368c1da01d 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -14,12 +14,12 @@
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/suspend.h> 15#include <linux/suspend.h>
16#include <linux/gfp.h> 16#include <linux/gfp.h>
17#include <linux/io.h>
17 18
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19#include <asm/pgalloc.h> 20#include <asm/pgalloc.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/mmu_context.h> 22#include <asm/mmu_context.h>
22#include <asm/io.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <asm/cpufeature.h> 24#include <asm/cpufeature.h>
25#include <asm/desc.h> 25#include <asm/desc.h>
@@ -63,7 +63,7 @@ static void load_segments(void)
63 "\tmovl %%eax,%%fs\n" 63 "\tmovl %%eax,%%fs\n"
64 "\tmovl %%eax,%%gs\n" 64 "\tmovl %%eax,%%gs\n"
65 "\tmovl %%eax,%%ss\n" 65 "\tmovl %%eax,%%ss\n"
66 ::: "eax", "memory"); 66 : : : "eax", "memory");
67#undef STR 67#undef STR
68#undef __STR 68#undef __STR
69} 69}
@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image)
205 205
206 if (image->preserve_context) { 206 if (image->preserve_context) {
207#ifdef CONFIG_X86_IO_APIC 207#ifdef CONFIG_X86_IO_APIC
208 /* We need to put APICs in legacy mode so that we can 208 /*
209 * We need to put APICs in legacy mode so that we can
209 * get timer interrupts in second kernel. kexec/kdump 210 * get timer interrupts in second kernel. kexec/kdump
210 * paths already have calls to disable_IO_APIC() in 211 * paths already have calls to disable_IO_APIC() in
211 * one form or other. kexec jump path also need 212 * one form or other. kexec jump path also need
@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image)
227 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) 228 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
228 << PAGE_SHIFT); 229 << PAGE_SHIFT);
229 230
230 /* The segment registers are funny things, they have both a 231 /*
232 * The segment registers are funny things, they have both a
231 * visible and an invisible part. Whenever the visible part is 233 * visible and an invisible part. Whenever the visible part is
232 * set to a specific selector, the invisible part is loaded 234 * set to a specific selector, the invisible part is loaded
233 * with from a table in memory. At no other time is the 235 * with from a table in memory. At no other time is the
@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image)
237 * segments, before I zap the gdt with an invalid value. 239 * segments, before I zap the gdt with an invalid value.
238 */ 240 */
239 load_segments(); 241 load_segments();
240 /* The gdt & idt are now invalid. 242 /*
243 * The gdt & idt are now invalid.
241 * If you want to load them you must set up your own idt & gdt. 244 * If you want to load them you must set up your own idt & gdt.
242 */ 245 */
243 set_gdt(phys_to_virt(0),0); 246 set_gdt(phys_to_virt(0), 0);
244 set_idt(phys_to_virt(0),0); 247 set_idt(phys_to_virt(0), 0);
245 248
246 /* now call it */ 249 /* now call it */
247 image->start = relocate_kernel_ptr((unsigned long)image->head, 250 image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6993d51b7fd8..89cea4d44679 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -12,11 +12,47 @@
12#include <linux/reboot.h> 12#include <linux/reboot.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/io.h>
16#include <linux/suspend.h>
15 17
16#include <asm/pgtable.h> 18#include <asm/pgtable.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
18#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
19#include <asm/io.h> 21
22static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
23 unsigned long addr)
24{
25 pud_t *pud;
26 pmd_t *pmd;
27 struct page *page;
28 int result = -ENOMEM;
29
30 addr &= PMD_MASK;
31 pgd += pgd_index(addr);
32 if (!pgd_present(*pgd)) {
33 page = kimage_alloc_control_pages(image, 0);
34 if (!page)
35 goto out;
36 pud = (pud_t *)page_address(page);
37 memset(pud, 0, PAGE_SIZE);
38 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
39 }
40 pud = pud_offset(pgd, addr);
41 if (!pud_present(*pud)) {
42 page = kimage_alloc_control_pages(image, 0);
43 if (!page)
44 goto out;
45 pmd = (pmd_t *)page_address(page);
46 memset(pmd, 0, PAGE_SIZE);
47 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
48 }
49 pmd = pmd_offset(pud, addr);
50 if (!pmd_present(*pmd))
51 set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
52 result = 0;
53out:
54 return result;
55}
20 56
21static void init_level2_page(pmd_t *level2p, unsigned long addr) 57static void init_level2_page(pmd_t *level2p, unsigned long addr)
22{ 58{
@@ -83,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
83 } 119 }
84 level3p = (pud_t *)page_address(page); 120 level3p = (pud_t *)page_address(page);
85 result = init_level3_page(image, level3p, addr, last_addr); 121 result = init_level3_page(image, level3p, addr, last_addr);
86 if (result) { 122 if (result)
87 goto out; 123 goto out;
88 }
89 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); 124 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
90 addr += PGDIR_SIZE; 125 addr += PGDIR_SIZE;
91 } 126 }
@@ -156,6 +191,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
156 result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); 191 result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
157 if (result) 192 if (result)
158 return result; 193 return result;
194 /*
195 * image->start may be outside 0 ~ max_pfn, for example when
196 * jump back to original kernel from kexeced kernel
197 */
198 result = init_one_level2_page(image, level4p, image->start);
199 if (result)
200 return result;
159 return init_transition_pgtable(image, level4p); 201 return init_transition_pgtable(image, level4p);
160} 202}
161 203
@@ -229,20 +271,45 @@ void machine_kexec(struct kimage *image)
229{ 271{
230 unsigned long page_list[PAGES_NR]; 272 unsigned long page_list[PAGES_NR];
231 void *control_page; 273 void *control_page;
274 int save_ftrace_enabled;
232 275
233 tracer_disable(); 276#ifdef CONFIG_KEXEC_JUMP
277 if (kexec_image->preserve_context)
278 save_processor_state();
279#endif
280
281 save_ftrace_enabled = __ftrace_enabled_save();
234 282
235 /* Interrupts aren't acceptable while we reboot */ 283 /* Interrupts aren't acceptable while we reboot */
236 local_irq_disable(); 284 local_irq_disable();
237 285
286 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC
288 /*
289 * We need to put APICs in legacy mode so that we can
290 * get timer interrupts in second kernel. kexec/kdump
291 * paths already have calls to disable_IO_APIC() in
292 * one form or other. kexec jump path also need
293 * one.
294 */
295 disable_IO_APIC();
296#endif
297 }
298
238 control_page = page_address(image->control_code_page) + PAGE_SIZE; 299 control_page = page_address(image->control_code_page) + PAGE_SIZE;
239 memcpy(control_page, relocate_kernel, PAGE_SIZE); 300 memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
240 301
241 page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); 302 page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
303 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
242 page_list[PA_TABLE_PAGE] = 304 page_list[PA_TABLE_PAGE] =
243 (unsigned long)__pa(page_address(image->control_code_page)); 305 (unsigned long)__pa(page_address(image->control_code_page));
244 306
245 /* The segment registers are funny things, they have both a 307 if (image->type == KEXEC_TYPE_DEFAULT)
308 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
309 << PAGE_SHIFT);
310
311 /*
312 * The segment registers are funny things, they have both a
246 * visible and an invisible part. Whenever the visible part is 313 * visible and an invisible part. Whenever the visible part is
247 * set to a specific selector, the invisible part is loaded 314 * set to a specific selector, the invisible part is loaded
248 * with from a table in memory. At no other time is the 315 * with from a table in memory. At no other time is the
@@ -252,15 +319,25 @@ void machine_kexec(struct kimage *image)
252 * segments, before I zap the gdt with an invalid value. 319 * segments, before I zap the gdt with an invalid value.
253 */ 320 */
254 load_segments(); 321 load_segments();
255 /* The gdt & idt are now invalid. 322 /*
323 * The gdt & idt are now invalid.
256 * If you want to load them you must set up your own idt & gdt. 324 * If you want to load them you must set up your own idt & gdt.
257 */ 325 */
258 set_gdt(phys_to_virt(0),0); 326 set_gdt(phys_to_virt(0), 0);
259 set_idt(phys_to_virt(0),0); 327 set_idt(phys_to_virt(0), 0);
260 328
261 /* now call it */ 329 /* now call it */
262 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 330 image->start = relocate_kernel((unsigned long)image->head,
263 image->start); 331 (unsigned long)page_list,
332 image->start,
333 image->preserve_context);
334
335#ifdef CONFIG_KEXEC_JUMP
336 if (kexec_image->preserve_context)
337 restore_processor_state();
338#endif
339
340 __ftrace_enabled_restore(save_ftrace_enabled);
264} 341}
265 342
266void arch_crash_save_vmcoreinfo(void) 343void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 8815f3c7fec7..846510b78a09 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -348,7 +348,6 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id)
348static struct irqaction mfgptirq = { 348static struct irqaction mfgptirq = {
349 .handler = mfgpt_tick, 349 .handler = mfgpt_tick,
350 .flags = IRQF_DISABLED | IRQF_NOBALANCING, 350 .flags = IRQF_DISABLED | IRQF_NOBALANCING,
351 .mask = CPU_MASK_NONE,
352 .name = "mfgpt-timer" 351 .name = "mfgpt-timer"
353}; 352};
354 353
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c25fdb382292..453b5795a5c6 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -12,31 +12,30 @@
12 * 12 *
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15*/ 15 */
16 16#include <linux/platform_device.h>
17#include <linux/capability.h> 17#include <linux/capability.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/sched.h>
21#include <linux/cpumask.h>
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/vmalloc.h>
25#include <linux/miscdevice.h> 18#include <linux/miscdevice.h>
19#include <linux/firmware.h>
26#include <linux/spinlock.h> 20#include <linux/spinlock.h>
27#include <linux/mm.h> 21#include <linux/cpumask.h>
28#include <linux/fs.h> 22#include <linux/pci_ids.h>
23#include <linux/uaccess.h>
24#include <linux/vmalloc.h>
25#include <linux/kernel.h>
26#include <linux/module.h>
29#include <linux/mutex.h> 27#include <linux/mutex.h>
28#include <linux/sched.h>
29#include <linux/init.h>
30#include <linux/slab.h>
30#include <linux/cpu.h> 31#include <linux/cpu.h>
31#include <linux/firmware.h>
32#include <linux/platform_device.h>
33#include <linux/pci.h> 32#include <linux/pci.h>
34#include <linux/pci_ids.h> 33#include <linux/fs.h>
35#include <linux/uaccess.h> 34#include <linux/mm.h>
36 35
37#include <asm/msr.h>
38#include <asm/processor.h>
39#include <asm/microcode.h> 36#include <asm/microcode.h>
37#include <asm/processor.h>
38#include <asm/msr.h>
40 39
41MODULE_DESCRIPTION("AMD Microcode Update Driver"); 40MODULE_DESCRIPTION("AMD Microcode Update Driver");
42MODULE_AUTHOR("Peter Oruba"); 41MODULE_AUTHOR("Peter Oruba");
@@ -72,8 +71,8 @@ struct microcode_header_amd {
72} __attribute__((packed)); 71} __attribute__((packed));
73 72
74struct microcode_amd { 73struct microcode_amd {
75 struct microcode_header_amd hdr; 74 struct microcode_header_amd hdr;
76 unsigned int mpb[0]; 75 unsigned int mpb[0];
77}; 76};
78 77
79#define UCODE_MAX_SIZE 2048 78#define UCODE_MAX_SIZE 2048
@@ -184,8 +183,8 @@ static int get_ucode_data(void *to, const u8 *from, size_t n)
184 return 0; 183 return 0;
185} 184}
186 185
187static void *get_next_ucode(const u8 *buf, unsigned int size, 186static void *
188 unsigned int *mc_size) 187get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
189{ 188{
190 unsigned int total_size; 189 unsigned int total_size;
191 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 190 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
@@ -223,7 +222,6 @@ static void *get_next_ucode(const u8 *buf, unsigned int size,
223 return mc; 222 return mc;
224} 223}
225 224
226
227static int install_equiv_cpu_table(const u8 *buf) 225static int install_equiv_cpu_table(const u8 *buf)
228{ 226{
229 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; 227 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
@@ -372,4 +370,3 @@ struct microcode_ops * __init init_amd_microcode(void)
372{ 370{
373 return &microcode_amd_ops; 371 return &microcode_amd_ops;
374} 372}
375
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index c9b721ba968c..a0f3851ef310 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -70,67 +70,78 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h>
73#include <linux/capability.h> 74#include <linux/capability.h>
74#include <linux/kernel.h> 75#include <linux/miscdevice.h>
75#include <linux/init.h> 76#include <linux/firmware.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h> 77#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
78#include <linux/cpumask.h> 79#include <linux/cpumask.h>
79#include <linux/module.h> 80#include <linux/uaccess.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h> 81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h> 82#include <linux/kernel.h>
83#include <linux/spinlock.h> 83#include <linux/module.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h> 84#include <linux/mutex.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
87#include <linux/cpu.h> 88#include <linux/cpu.h>
88#include <linux/firmware.h> 89#include <linux/fs.h>
89#include <linux/platform_device.h> 90#include <linux/mm.h>
90 91
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h> 92#include <asm/microcode.h>
93#include <asm/processor.h>
94#include <asm/msr.h>
95 95
96MODULE_DESCRIPTION("Microcode Update Driver"); 96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL"); 98MODULE_LICENSE("GPL");
99 99
100#define MICROCODE_VERSION "2.00" 100#define MICROCODE_VERSION "2.00"
101 101
102static struct microcode_ops *microcode_ops; 102static struct microcode_ops *microcode_ops;
103 103
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ 104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
105static DEFINE_MUTEX(microcode_mutex); 105static DEFINE_MUTEX(microcode_mutex);
106 106
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; 107struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info); 108EXPORT_SYMBOL_GPL(ucode_cpu_info);
109 109
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE 110#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111struct update_for_cpu {
112 const void __user *buf;
113 size_t size;
114};
115
116static long update_for_cpu(void *_ufc)
117{
118 struct update_for_cpu *ufc = _ufc;
119 int error;
120
121 error = microcode_ops->request_microcode_user(smp_processor_id(),
122 ufc->buf, ufc->size);
123 if (error < 0)
124 return error;
125 if (!error)
126 microcode_ops->apply_microcode(smp_processor_id());
127 return error;
128}
129
111static int do_microcode_update(const void __user *buf, size_t size) 130static int do_microcode_update(const void __user *buf, size_t size)
112{ 131{
113 cpumask_t old;
114 int error = 0; 132 int error = 0;
115 int cpu; 133 int cpu;
116 134 struct update_for_cpu ufc = { .buf = buf, .size = size };
117 old = current->cpus_allowed;
118 135
119 for_each_online_cpu(cpu) { 136 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 137 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
121 138
122 if (!uci->valid) 139 if (!uci->valid)
123 continue; 140 continue;
124 141 error = work_on_cpu(cpu, update_for_cpu, &ufc);
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
126 error = microcode_ops->request_microcode_user(cpu, buf, size);
127 if (error < 0) 142 if (error < 0)
128 goto out; 143 break;
129 if (!error)
130 microcode_ops->apply_microcode(cpu);
131 } 144 }
132out:
133 set_cpus_allowed_ptr(current, &old);
134 return error; 145 return error;
135} 146}
136 147
@@ -198,18 +209,33 @@ static void microcode_dev_exit(void)
198 209
199MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); 210MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
200#else 211#else
201#define microcode_dev_init() 0 212#define microcode_dev_init() 0
202#define microcode_dev_exit() do { } while (0) 213#define microcode_dev_exit() do { } while (0)
203#endif 214#endif
204 215
205/* fake device for request_firmware */ 216/* fake device for request_firmware */
206static struct platform_device *microcode_pdev; 217static struct platform_device *microcode_pdev;
218
219static long reload_for_cpu(void *unused)
220{
221 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
222 int err = 0;
223
224 mutex_lock(&microcode_mutex);
225 if (uci->valid) {
226 err = microcode_ops->request_microcode_fw(smp_processor_id(),
227 &microcode_pdev->dev);
228 if (!err)
229 microcode_ops->apply_microcode(smp_processor_id());
230 }
231 mutex_unlock(&microcode_mutex);
232 return err;
233}
207 234
208static ssize_t reload_store(struct sys_device *dev, 235static ssize_t reload_store(struct sys_device *dev,
209 struct sysdev_attribute *attr, 236 struct sysdev_attribute *attr,
210 const char *buf, size_t sz) 237 const char *buf, size_t sz)
211{ 238{
212 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
213 char *end; 239 char *end;
214 unsigned long val = simple_strtoul(buf, &end, 0); 240 unsigned long val = simple_strtoul(buf, &end, 0);
215 int err = 0; 241 int err = 0;
@@ -218,21 +244,9 @@ static ssize_t reload_store(struct sys_device *dev,
218 if (end == buf) 244 if (end == buf)
219 return -EINVAL; 245 return -EINVAL;
220 if (val == 1) { 246 if (val == 1) {
221 cpumask_t old = current->cpus_allowed;
222
223 get_online_cpus(); 247 get_online_cpus();
224 if (cpu_online(cpu)) { 248 if (cpu_online(cpu))
225 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 249 err = work_on_cpu(cpu, reload_for_cpu, NULL);
226 mutex_lock(&microcode_mutex);
227 if (uci->valid) {
228 err = microcode_ops->request_microcode_fw(cpu,
229 &microcode_pdev->dev);
230 if (!err)
231 microcode_ops->apply_microcode(cpu);
232 }
233 mutex_unlock(&microcode_mutex);
234 set_cpus_allowed_ptr(current, &old);
235 }
236 put_online_cpus(); 250 put_online_cpus();
237 } 251 }
238 if (err) 252 if (err)
@@ -268,8 +282,8 @@ static struct attribute *mc_default_attrs[] = {
268}; 282};
269 283
270static struct attribute_group mc_attr_group = { 284static struct attribute_group mc_attr_group = {
271 .attrs = mc_default_attrs, 285 .attrs = mc_default_attrs,
272 .name = "microcode", 286 .name = "microcode",
273}; 287};
274 288
275static void __microcode_fini_cpu(int cpu) 289static void __microcode_fini_cpu(int cpu)
@@ -328,9 +342,9 @@ static int microcode_resume_cpu(int cpu)
328 return 0; 342 return 0;
329} 343}
330 344
331static void microcode_update_cpu(int cpu) 345static long microcode_update_cpu(void *unused)
332{ 346{
333 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 347 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
334 int err = 0; 348 int err = 0;
335 349
336 /* 350 /*
@@ -338,30 +352,27 @@ static void microcode_update_cpu(int cpu)
338 * otherwise just request a firmware: 352 * otherwise just request a firmware:
339 */ 353 */
340 if (uci->valid) { 354 if (uci->valid) {
341 err = microcode_resume_cpu(cpu); 355 err = microcode_resume_cpu(smp_processor_id());
342 } else { 356 } else {
343 collect_cpu_info(cpu); 357 collect_cpu_info(smp_processor_id());
344 if (uci->valid && system_state == SYSTEM_RUNNING) 358 if (uci->valid && system_state == SYSTEM_RUNNING)
345 err = microcode_ops->request_microcode_fw(cpu, 359 err = microcode_ops->request_microcode_fw(
360 smp_processor_id(),
346 &microcode_pdev->dev); 361 &microcode_pdev->dev);
347 } 362 }
348 if (!err) 363 if (!err)
349 microcode_ops->apply_microcode(cpu); 364 microcode_ops->apply_microcode(smp_processor_id());
365 return err;
350} 366}
351 367
352static void microcode_init_cpu(int cpu) 368static int microcode_init_cpu(int cpu)
353{ 369{
354 cpumask_t old = current->cpus_allowed; 370 int err;
355
356 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
357 /* We should bind the task to the CPU */
358 BUG_ON(raw_smp_processor_id() != cpu);
359
360 mutex_lock(&microcode_mutex); 371 mutex_lock(&microcode_mutex);
361 microcode_update_cpu(cpu); 372 err = work_on_cpu(cpu, microcode_update_cpu, NULL);
362 mutex_unlock(&microcode_mutex); 373 mutex_unlock(&microcode_mutex);
363 374
364 set_cpus_allowed_ptr(current, &old); 375 return err;
365} 376}
366 377
367static int mc_sysdev_add(struct sys_device *sys_dev) 378static int mc_sysdev_add(struct sys_device *sys_dev)
@@ -379,8 +390,11 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
379 if (err) 390 if (err)
380 return err; 391 return err;
381 392
382 microcode_init_cpu(cpu); 393 err = microcode_init_cpu(cpu);
383 return 0; 394 if (err)
395 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
396
397 return err;
384} 398}
385 399
386static int mc_sysdev_remove(struct sys_device *sys_dev) 400static int mc_sysdev_remove(struct sys_device *sys_dev)
@@ -404,14 +418,14 @@ static int mc_sysdev_resume(struct sys_device *dev)
404 return 0; 418 return 0;
405 419
406 /* only CPU 0 will apply ucode here */ 420 /* only CPU 0 will apply ucode here */
407 microcode_update_cpu(0); 421 microcode_update_cpu(NULL);
408 return 0; 422 return 0;
409} 423}
410 424
411static struct sysdev_driver mc_sysdev_driver = { 425static struct sysdev_driver mc_sysdev_driver = {
412 .add = mc_sysdev_add, 426 .add = mc_sysdev_add,
413 .remove = mc_sysdev_remove, 427 .remove = mc_sysdev_remove,
414 .resume = mc_sysdev_resume, 428 .resume = mc_sysdev_resume,
415}; 429};
416 430
417static __cpuinit int 431static __cpuinit int
@@ -424,7 +438,9 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
424 switch (action) { 438 switch (action) {
425 case CPU_ONLINE: 439 case CPU_ONLINE:
426 case CPU_ONLINE_FROZEN: 440 case CPU_ONLINE_FROZEN:
427 microcode_init_cpu(cpu); 441 if (microcode_init_cpu(cpu))
442 printk(KERN_ERR "microcode: failed to init CPU%d\n",
443 cpu);
428 case CPU_DOWN_FAILED: 444 case CPU_DOWN_FAILED:
429 case CPU_DOWN_FAILED_FROZEN: 445 case CPU_DOWN_FAILED_FROZEN:
430 pr_debug("microcode: CPU%d added\n", cpu); 446 pr_debug("microcode: CPU%d added\n", cpu);
@@ -448,7 +464,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
448} 464}
449 465
450static struct notifier_block __refdata mc_cpu_notifier = { 466static struct notifier_block __refdata mc_cpu_notifier = {
451 .notifier_call = mc_cpu_callback, 467 .notifier_call = mc_cpu_callback,
452}; 468};
453 469
454static int __init microcode_init(void) 470static int __init microcode_init(void)
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 5e9f4fc51385..149b9ec7c1ab 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,28 +70,28 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h>
73#include <linux/capability.h> 74#include <linux/capability.h>
74#include <linux/kernel.h> 75#include <linux/miscdevice.h>
75#include <linux/init.h> 76#include <linux/firmware.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h> 77#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
78#include <linux/cpumask.h> 79#include <linux/cpumask.h>
79#include <linux/module.h> 80#include <linux/uaccess.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h> 81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h> 82#include <linux/kernel.h>
83#include <linux/spinlock.h> 83#include <linux/module.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h> 84#include <linux/mutex.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
87#include <linux/cpu.h> 88#include <linux/cpu.h>
88#include <linux/firmware.h> 89#include <linux/fs.h>
89#include <linux/platform_device.h> 90#include <linux/mm.h>
90#include <linux/uaccess.h>
91 91
92#include <asm/msr.h>
93#include <asm/processor.h>
94#include <asm/microcode.h> 92#include <asm/microcode.h>
93#include <asm/processor.h>
94#include <asm/msr.h>
95 95
96MODULE_DESCRIPTION("Microcode Update Driver"); 96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -129,12 +129,13 @@ struct extended_sigtable {
129 struct extended_signature sigs[0]; 129 struct extended_signature sigs[0];
130}; 130};
131 131
132#define DEFAULT_UCODE_DATASIZE (2000) 132#define DEFAULT_UCODE_DATASIZE (2000)
133#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) 133#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
134#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) 134#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
135#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) 135#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
136#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) 136#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
137#define DWSIZE (sizeof(u32)) 137#define DWSIZE (sizeof(u32))
138
138#define get_totalsize(mc) \ 139#define get_totalsize(mc) \
139 (((struct microcode_intel *)mc)->hdr.totalsize ? \ 140 (((struct microcode_intel *)mc)->hdr.totalsize ? \
140 ((struct microcode_intel *)mc)->hdr.totalsize : \ 141 ((struct microcode_intel *)mc)->hdr.totalsize : \
@@ -197,30 +198,31 @@ static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
197} 198}
198 199
199static inline int 200static inline int
200update_match_revision(struct microcode_header_intel *mc_header, int rev) 201update_match_revision(struct microcode_header_intel *mc_header, int rev)
201{ 202{
202 return (mc_header->rev <= rev) ? 0 : 1; 203 return (mc_header->rev <= rev) ? 0 : 1;
203} 204}
204 205
205static int microcode_sanity_check(void *mc) 206static int microcode_sanity_check(void *mc)
206{ 207{
208 unsigned long total_size, data_size, ext_table_size;
207 struct microcode_header_intel *mc_header = mc; 209 struct microcode_header_intel *mc_header = mc;
208 struct extended_sigtable *ext_header = NULL; 210 struct extended_sigtable *ext_header = NULL;
209 struct extended_signature *ext_sig;
210 unsigned long total_size, data_size, ext_table_size;
211 int sum, orig_sum, ext_sigcount = 0, i; 211 int sum, orig_sum, ext_sigcount = 0, i;
212 struct extended_signature *ext_sig;
212 213
213 total_size = get_totalsize(mc_header); 214 total_size = get_totalsize(mc_header);
214 data_size = get_datasize(mc_header); 215 data_size = get_datasize(mc_header);
216
215 if (data_size + MC_HEADER_SIZE > total_size) { 217 if (data_size + MC_HEADER_SIZE > total_size) {
216 printk(KERN_ERR "microcode: error! " 218 printk(KERN_ERR "microcode: error! "
217 "Bad data size in microcode data file\n"); 219 "Bad data size in microcode data file\n");
218 return -EINVAL; 220 return -EINVAL;
219 } 221 }
220 222
221 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { 223 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
222 printk(KERN_ERR "microcode: error! " 224 printk(KERN_ERR "microcode: error! "
223 "Unknown microcode update format\n"); 225 "Unknown microcode update format\n");
224 return -EINVAL; 226 return -EINVAL;
225 } 227 }
226 ext_table_size = total_size - (MC_HEADER_SIZE + data_size); 228 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
@@ -318,11 +320,15 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
318 320
319static void apply_microcode(int cpu) 321static void apply_microcode(int cpu)
320{ 322{
323 struct microcode_intel *mc_intel;
324 struct ucode_cpu_info *uci;
321 unsigned long flags; 325 unsigned long flags;
322 unsigned int val[2]; 326 unsigned int val[2];
323 int cpu_num = raw_smp_processor_id(); 327 int cpu_num;
324 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 328
325 struct microcode_intel *mc_intel = uci->mc; 329 cpu_num = raw_smp_processor_id();
330 uci = ucode_cpu_info + cpu;
331 mc_intel = uci->mc;
326 332
327 /* We should bind the task to the CPU */ 333 /* We should bind the task to the CPU */
328 BUG_ON(cpu_num != cpu); 334 BUG_ON(cpu_num != cpu);
@@ -348,15 +354,17 @@ static void apply_microcode(int cpu)
348 spin_unlock_irqrestore(&microcode_update_lock, flags); 354 spin_unlock_irqrestore(&microcode_update_lock, flags);
349 if (val[1] != mc_intel->hdr.rev) { 355 if (val[1] != mc_intel->hdr.rev) {
350 printk(KERN_ERR "microcode: CPU%d update from revision " 356 printk(KERN_ERR "microcode: CPU%d update from revision "
351 "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]); 357 "0x%x to 0x%x failed\n",
358 cpu_num, uci->cpu_sig.rev, val[1]);
352 return; 359 return;
353 } 360 }
354 printk(KERN_INFO "microcode: CPU%d updated from revision " 361 printk(KERN_INFO "microcode: CPU%d updated from revision "
355 "0x%x to 0x%x, date = %04x-%02x-%02x \n", 362 "0x%x to 0x%x, date = %04x-%02x-%02x \n",
356 cpu_num, uci->cpu_sig.rev, val[1], 363 cpu_num, uci->cpu_sig.rev, val[1],
357 mc_intel->hdr.date & 0xffff, 364 mc_intel->hdr.date & 0xffff,
358 mc_intel->hdr.date >> 24, 365 mc_intel->hdr.date >> 24,
359 (mc_intel->hdr.date >> 16) & 0xff); 366 (mc_intel->hdr.date >> 16) & 0xff);
367
360 uci->cpu_sig.rev = val[1]; 368 uci->cpu_sig.rev = val[1];
361} 369}
362 370
@@ -404,18 +412,23 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
404 leftover -= mc_size; 412 leftover -= mc_size;
405 } 413 }
406 414
407 if (new_mc) { 415 if (!new_mc)
408 if (!leftover) { 416 goto out;
409 if (uci->mc) 417
410 vfree(uci->mc); 418 if (leftover) {
411 uci->mc = (struct microcode_intel *)new_mc; 419 vfree(new_mc);
412 pr_debug("microcode: CPU%d found a matching microcode update with" 420 goto out;
413 " version 0x%x (current=0x%x)\n",
414 cpu, new_rev, uci->cpu_sig.rev);
415 } else
416 vfree(new_mc);
417 } 421 }
418 422
423 if (uci->mc)
424 vfree(uci->mc);
425 uci->mc = (struct microcode_intel *)new_mc;
426
427 pr_debug("microcode: CPU%d found a matching microcode update with"
428 " version 0x%x (current=0x%x)\n",
429 cpu, new_rev, uci->cpu_sig.rev);
430
431 out:
419 return (int)leftover; 432 return (int)leftover;
420} 433}
421 434
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 666e43df51f9..712d15fdc416 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -226,7 +226,7 @@ static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
226 return 0; 226 return 0;
227} 227}
228 228
229static struct dmi_system_id __devinitdata mmconf_dmi_table[] = { 229static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
230 { 230 {
231 .callback = set_check_enable_amd_mmconf, 231 .callback = set_check_enable_amd_mmconf,
232 .ident = "Sun Microsystems Machine", 232 .ident = "Sun Microsystems Machine",
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 37cb1bda1baf..dce99dca6cf8 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -109,9 +109,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
109 } else 109 } else
110 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 110 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
111} 111}
112#endif
113
114#ifdef CONFIG_X86_IO_APIC
115 112
116static int bad_ioapic(unsigned long address) 113static int bad_ioapic(unsigned long address)
117{ 114{
@@ -224,8 +221,12 @@ static void __init MP_intsrc_info(struct mpc_intsrc *m)
224 if (++mp_irq_entries == MAX_IRQ_SOURCES) 221 if (++mp_irq_entries == MAX_IRQ_SOURCES)
225 panic("Max # of irq sources exceeded!!\n"); 222 panic("Max # of irq sources exceeded!!\n");
226} 223}
224#else /* CONFIG_X86_IO_APIC */
225static inline void __init MP_bus_info(struct mpc_bus *m) {}
226static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
227static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
228#endif /* CONFIG_X86_IO_APIC */
227 229
228#endif
229 230
230static void __init MP_lintsrc_info(struct mpc_lintsrc *m) 231static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
231{ 232{
@@ -275,6 +276,20 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
275 return 1; 276 return 1;
276} 277}
277 278
279static void skip_entry(unsigned char **ptr, int *count, int size)
280{
281 *ptr += size;
282 *count += size;
283}
284
285static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
286{
287 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
288 "type %x\n", *mpt);
289 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
290 1, mpc, mpc->length, 1);
291}
292
278static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 293static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
279{ 294{
280 char str[16]; 295 char str[16];
@@ -310,61 +325,30 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
310 while (count < mpc->length) { 325 while (count < mpc->length) {
311 switch (*mpt) { 326 switch (*mpt) {
312 case MP_PROCESSOR: 327 case MP_PROCESSOR:
313 { 328 /* ACPI may have already provided this data */
314 struct mpc_cpu *m = (struct mpc_cpu *)mpt; 329 if (!acpi_lapic)
315 /* ACPI may have already provided this data */ 330 MP_processor_info((struct mpc_cpu *)mpt);
316 if (!acpi_lapic) 331 skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
317 MP_processor_info(m); 332 break;
318 mpt += sizeof(*m);
319 count += sizeof(*m);
320 break;
321 }
322 case MP_BUS: 333 case MP_BUS:
323 { 334 MP_bus_info((struct mpc_bus *)mpt);
324 struct mpc_bus *m = (struct mpc_bus *)mpt; 335 skip_entry(&mpt, &count, sizeof(struct mpc_bus));
325#ifdef CONFIG_X86_IO_APIC 336 break;
326 MP_bus_info(m);
327#endif
328 mpt += sizeof(*m);
329 count += sizeof(*m);
330 break;
331 }
332 case MP_IOAPIC: 337 case MP_IOAPIC:
333 { 338 MP_ioapic_info((struct mpc_ioapic *)mpt);
334#ifdef CONFIG_X86_IO_APIC 339 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
335 struct mpc_ioapic *m = (struct mpc_ioapic *)mpt; 340 break;
336 MP_ioapic_info(m);
337#endif
338 mpt += sizeof(struct mpc_ioapic);
339 count += sizeof(struct mpc_ioapic);
340 break;
341 }
342 case MP_INTSRC: 341 case MP_INTSRC:
343 { 342 MP_intsrc_info((struct mpc_intsrc *)mpt);
344#ifdef CONFIG_X86_IO_APIC 343 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
345 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 344 break;
346
347 MP_intsrc_info(m);
348#endif
349 mpt += sizeof(struct mpc_intsrc);
350 count += sizeof(struct mpc_intsrc);
351 break;
352 }
353 case MP_LINTSRC: 345 case MP_LINTSRC:
354 { 346 MP_lintsrc_info((struct mpc_lintsrc *)mpt);
355 struct mpc_lintsrc *m = 347 skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
356 (struct mpc_lintsrc *)mpt; 348 break;
357 MP_lintsrc_info(m);
358 mpt += sizeof(*m);
359 count += sizeof(*m);
360 break;
361 }
362 default: 349 default:
363 /* wrong mptable */ 350 /* wrong mptable */
364 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); 351 smp_dump_mptable(mpc, mpt);
365 printk(KERN_ERR "type %x\n", *mpt);
366 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
367 1, mpc, mpc->length, 1);
368 count = mpc->length; 352 count = mpc->length;
369 break; 353 break;
370 } 354 }
@@ -558,6 +542,68 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
558 542
559static struct mpf_intel *mpf_found; 543static struct mpf_intel *mpf_found;
560 544
545static unsigned long __init get_mpc_size(unsigned long physptr)
546{
547 struct mpc_table *mpc;
548 unsigned long size;
549
550 mpc = early_ioremap(physptr, PAGE_SIZE);
551 size = mpc->length;
552 early_iounmap(mpc, PAGE_SIZE);
553 apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
554
555 return size;
556}
557
558static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
559{
560 struct mpc_table *mpc;
561 unsigned long size;
562
563 size = get_mpc_size(mpf->physptr);
564 mpc = early_ioremap(mpf->physptr, size);
565 /*
566 * Read the physical hardware table. Anything here will
567 * override the defaults.
568 */
569 if (!smp_read_mpc(mpc, early)) {
570#ifdef CONFIG_X86_LOCAL_APIC
571 smp_found_config = 0;
572#endif
573 printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
574 "... disabling SMP support. (tell your hw vendor)\n");
575 early_iounmap(mpc, size);
576 return -1;
577 }
578 early_iounmap(mpc, size);
579
580 if (early)
581 return -1;
582
583#ifdef CONFIG_X86_IO_APIC
584 /*
585 * If there are no explicit MP IRQ entries, then we are
586 * broken. We set up most of the low 16 IO-APIC pins to
587 * ISA defaults and hope it will work.
588 */
589 if (!mp_irq_entries) {
590 struct mpc_bus bus;
591
592 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
593 "using default mptable. (tell your hw vendor)\n");
594
595 bus.type = MP_BUS;
596 bus.busid = 0;
597 memcpy(bus.bustype, "ISA ", 6);
598 MP_bus_info(&bus);
599
600 construct_default_ioirq_mptable(0);
601 }
602#endif
603
604 return 0;
605}
606
561/* 607/*
562 * Scan the memory blocks for an SMP configuration block. 608 * Scan the memory blocks for an SMP configuration block.
563 */ 609 */
@@ -611,45 +657,8 @@ static void __init __get_smp_config(unsigned int early)
611 construct_default_ISA_mptable(mpf->feature1); 657 construct_default_ISA_mptable(mpf->feature1);
612 658
613 } else if (mpf->physptr) { 659 } else if (mpf->physptr) {
614 660 if (check_physptr(mpf, early))
615 /*
616 * Read the physical hardware table. Anything here will
617 * override the defaults.
618 */
619 if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
620#ifdef CONFIG_X86_LOCAL_APIC
621 smp_found_config = 0;
622#endif
623 printk(KERN_ERR
624 "BIOS bug, MP table errors detected!...\n");
625 printk(KERN_ERR "... disabling SMP support. "
626 "(tell your hw vendor)\n");
627 return;
628 }
629
630 if (early)
631 return; 661 return;
632#ifdef CONFIG_X86_IO_APIC
633 /*
634 * If there are no explicit MP IRQ entries, then we are
635 * broken. We set up most of the low 16 IO-APIC pins to
636 * ISA defaults and hope it will work.
637 */
638 if (!mp_irq_entries) {
639 struct mpc_bus bus;
640
641 printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
642 "using default mptable. "
643 "(tell your hw vendor)\n");
644
645 bus.type = MP_BUS;
646 bus.busid = 0;
647 memcpy(bus.bustype, "ISA ", 6);
648 MP_bus_info(&bus);
649
650 construct_default_ioirq_mptable(0);
651 }
652#endif
653 } else 662 } else
654 BUG(); 663 BUG();
655 664
@@ -670,6 +679,31 @@ void __init get_smp_config(void)
670 __get_smp_config(0); 679 __get_smp_config(0);
671} 680}
672 681
682static void smp_reserve_bootmem(struct mpf_intel *mpf)
683{
684 unsigned long size = get_mpc_size(mpf->physptr);
685#ifdef CONFIG_X86_32
686 /*
687 * We cannot access to MPC table to compute table size yet,
688 * as only few megabytes from the bottom is mapped now.
689 * PC-9800's MPC table places on the very last of physical
690 * memory; so that simply reserving PAGE_SIZE from mpf->physptr
691 * yields BUG() in reserve_bootmem.
692 * also need to make sure physptr is below than max_low_pfn
693 * we don't need reserve the area above max_low_pfn
694 */
695 unsigned long end = max_low_pfn * PAGE_SIZE;
696
697 if (mpf->physptr < end) {
698 if (mpf->physptr + size > end)
699 size = end - mpf->physptr;
700 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
701 }
702#else
703 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
704#endif
705}
706
673static int __init smp_scan_config(unsigned long base, unsigned long length, 707static int __init smp_scan_config(unsigned long base, unsigned long length,
674 unsigned reserve) 708 unsigned reserve)
675{ 709{
@@ -697,36 +731,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
697 731
698 if (!reserve) 732 if (!reserve)
699 return 1; 733 return 1;
700 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, 734 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
701 BOOTMEM_DEFAULT);
702 if (mpf->physptr) {
703 unsigned long size = PAGE_SIZE;
704#ifdef CONFIG_X86_32
705 /*
706 * We cannot access to MPC table to compute
707 * table size yet, as only few megabytes from
708 * the bottom is mapped now.
709 * PC-9800's MPC table places on the very last
710 * of physical memory; so that simply reserving
711 * PAGE_SIZE from mpf->physptr yields BUG()
712 * in reserve_bootmem.
713 * also need to make sure physptr is below than
714 * max_low_pfn
715 * we don't need reserve the area above max_low_pfn
716 */
717 unsigned long end = max_low_pfn * PAGE_SIZE;
718
719 if (mpf->physptr < end) {
720 if (mpf->physptr + size > end)
721 size = end - mpf->physptr;
722 reserve_bootmem_generic(mpf->physptr, size,
723 BOOTMEM_DEFAULT);
724 }
725#else
726 reserve_bootmem_generic(mpf->physptr, size,
727 BOOTMEM_DEFAULT); 735 BOOTMEM_DEFAULT);
728#endif 736 if (mpf->physptr)
729 } 737 smp_reserve_bootmem(mpf);
730 738
731 return 1; 739 return 1;
732 } 740 }
@@ -829,7 +837,57 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
829#define SPARE_SLOT_NUM 20 837#define SPARE_SLOT_NUM 20
830 838
831static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; 839static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
832#endif 840
841static void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
842{
843 int i;
844
845 apic_printk(APIC_VERBOSE, "OLD ");
846 print_MP_intsrc_info(m);
847
848 i = get_MP_intsrc_index(m);
849 if (i > 0) {
850 assign_to_mpc_intsrc(&mp_irqs[i], m);
851 apic_printk(APIC_VERBOSE, "NEW ");
852 print_mp_irq_info(&mp_irqs[i]);
853 return;
854 }
855 if (!i) {
856 /* legacy, do nothing */
857 return;
858 }
859 if (*nr_m_spare < SPARE_SLOT_NUM) {
860 /*
861 * not found (-1), or duplicated (-2) are invalid entries,
862 * we need to use the slot later
863 */
864 m_spare[*nr_m_spare] = m;
865 *nr_m_spare += 1;
866 }
867}
868#else /* CONFIG_X86_IO_APIC */
869static inline void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
870#endif /* CONFIG_X86_IO_APIC */
871
872static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
873 int count)
874{
875 if (!mpc_new_phys) {
876 pr_info("No spare slots, try to append...take your risk, "
877 "new mpc_length %x\n", count);
878 } else {
879 if (count <= mpc_new_length)
880 pr_info("No spare slots, try to append..., "
881 "new mpc_length %x\n", count);
882 else {
883 pr_err("mpc_new_length %lx is too small\n",
884 mpc_new_length);
885 return -1;
886 }
887 }
888
889 return 0;
890}
833 891
834static int __init replace_intsrc_all(struct mpc_table *mpc, 892static int __init replace_intsrc_all(struct mpc_table *mpc,
835 unsigned long mpc_new_phys, 893 unsigned long mpc_new_phys,
@@ -837,77 +895,33 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
837{ 895{
838#ifdef CONFIG_X86_IO_APIC 896#ifdef CONFIG_X86_IO_APIC
839 int i; 897 int i;
840 int nr_m_spare = 0;
841#endif 898#endif
842
843 int count = sizeof(*mpc); 899 int count = sizeof(*mpc);
900 int nr_m_spare = 0;
844 unsigned char *mpt = ((unsigned char *)mpc) + count; 901 unsigned char *mpt = ((unsigned char *)mpc) + count;
845 902
846 printk(KERN_INFO "mpc_length %x\n", mpc->length); 903 printk(KERN_INFO "mpc_length %x\n", mpc->length);
847 while (count < mpc->length) { 904 while (count < mpc->length) {
848 switch (*mpt) { 905 switch (*mpt) {
849 case MP_PROCESSOR: 906 case MP_PROCESSOR:
850 { 907 skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
851 struct mpc_cpu *m = (struct mpc_cpu *)mpt; 908 break;
852 mpt += sizeof(*m);
853 count += sizeof(*m);
854 break;
855 }
856 case MP_BUS: 909 case MP_BUS:
857 { 910 skip_entry(&mpt, &count, sizeof(struct mpc_bus));
858 struct mpc_bus *m = (struct mpc_bus *)mpt; 911 break;
859 mpt += sizeof(*m);
860 count += sizeof(*m);
861 break;
862 }
863 case MP_IOAPIC: 912 case MP_IOAPIC:
864 { 913 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
865 mpt += sizeof(struct mpc_ioapic); 914 break;
866 count += sizeof(struct mpc_ioapic);
867 break;
868 }
869 case MP_INTSRC: 915 case MP_INTSRC:
870 { 916 check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
871#ifdef CONFIG_X86_IO_APIC 917 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
872 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 918 break;
873
874 printk(KERN_INFO "OLD ");
875 print_MP_intsrc_info(m);
876 i = get_MP_intsrc_index(m);
877 if (i > 0) {
878 assign_to_mpc_intsrc(&mp_irqs[i], m);
879 printk(KERN_INFO "NEW ");
880 print_mp_irq_info(&mp_irqs[i]);
881 } else if (!i) {
882 /* legacy, do nothing */
883 } else if (nr_m_spare < SPARE_SLOT_NUM) {
884 /*
885 * not found (-1), or duplicated (-2)
886 * are invalid entries,
887 * we need to use the slot later
888 */
889 m_spare[nr_m_spare] = m;
890 nr_m_spare++;
891 }
892#endif
893 mpt += sizeof(struct mpc_intsrc);
894 count += sizeof(struct mpc_intsrc);
895 break;
896 }
897 case MP_LINTSRC: 919 case MP_LINTSRC:
898 { 920 skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
899 struct mpc_lintsrc *m = 921 break;
900 (struct mpc_lintsrc *)mpt;
901 mpt += sizeof(*m);
902 count += sizeof(*m);
903 break;
904 }
905 default: 922 default:
906 /* wrong mptable */ 923 /* wrong mptable */
907 printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); 924 smp_dump_mptable(mpc, mpt);
908 printk(KERN_ERR "type %x\n", *mpt);
909 print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
910 1, mpc, mpc->length, 1);
911 goto out; 925 goto out;
912 } 926 }
913 } 927 }
@@ -924,23 +938,15 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
924 continue; 938 continue;
925 939
926 if (nr_m_spare > 0) { 940 if (nr_m_spare > 0) {
927 printk(KERN_INFO "*NEW* found "); 941 apic_printk(APIC_VERBOSE, "*NEW* found\n");
928 nr_m_spare--; 942 nr_m_spare--;
929 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); 943 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
930 m_spare[nr_m_spare] = NULL; 944 m_spare[nr_m_spare] = NULL;
931 } else { 945 } else {
932 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 946 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
933 count += sizeof(struct mpc_intsrc); 947 count += sizeof(struct mpc_intsrc);
934 if (!mpc_new_phys) { 948 if (!check_slot(mpc_new_phys, mpc_new_length, count))
935 printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count); 949 goto out;
936 } else {
937 if (count <= mpc_new_length)
938 printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
939 else {
940 printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
941 goto out;
942 }
943 }
944 assign_to_mpc_intsrc(&mp_irqs[i], m); 950 assign_to_mpc_intsrc(&mp_irqs[i], m);
945 mpc->length = count; 951 mpc->length = count;
946 mpt += sizeof(struct mpc_intsrc); 952 mpt += sizeof(struct mpc_intsrc);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 63dd358d8ee1..8e45f4464880 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -470,7 +470,6 @@ struct pv_mmu_ops pv_mmu_ops = {
470#if PAGETABLE_LEVELS >= 3 470#if PAGETABLE_LEVELS >= 3
471#ifdef CONFIG_X86_PAE 471#ifdef CONFIG_X86_PAE
472 .set_pte_atomic = native_set_pte_atomic, 472 .set_pte_atomic = native_set_pte_atomic,
473 .set_pte_present = native_set_pte_present,
474 .pte_clear = native_pte_clear, 473 .pte_clear = native_pte_clear,
475 .pmd_clear = native_pmd_clear, 474 .pmd_clear = native_pmd_clear,
476#endif 475#endif
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index d28bbdc35e4e..755c21e906f3 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -380,8 +380,9 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
380 return tbl; 380 return tbl;
381} 381}
382 382
383static void calgary_unmap_sg(struct device *dev, 383static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
384 struct scatterlist *sglist, int nelems, int direction) 384 int nelems,enum dma_data_direction dir,
385 struct dma_attrs *attrs)
385{ 386{
386 struct iommu_table *tbl = find_iommu_table(dev); 387 struct iommu_table *tbl = find_iommu_table(dev);
387 struct scatterlist *s; 388 struct scatterlist *s;
@@ -404,7 +405,8 @@ static void calgary_unmap_sg(struct device *dev,
404} 405}
405 406
406static int calgary_map_sg(struct device *dev, struct scatterlist *sg, 407static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
407 int nelems, int direction) 408 int nelems, enum dma_data_direction dir,
409 struct dma_attrs *attrs)
408{ 410{
409 struct iommu_table *tbl = find_iommu_table(dev); 411 struct iommu_table *tbl = find_iommu_table(dev);
410 struct scatterlist *s; 412 struct scatterlist *s;
@@ -429,15 +431,14 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
429 s->dma_address = (entry << PAGE_SHIFT) | s->offset; 431 s->dma_address = (entry << PAGE_SHIFT) | s->offset;
430 432
431 /* insert into HW table */ 433 /* insert into HW table */
432 tce_build(tbl, entry, npages, vaddr & PAGE_MASK, 434 tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir);
433 direction);
434 435
435 s->dma_length = s->length; 436 s->dma_length = s->length;
436 } 437 }
437 438
438 return nelems; 439 return nelems;
439error: 440error:
440 calgary_unmap_sg(dev, sg, nelems, direction); 441 calgary_unmap_sg(dev, sg, nelems, dir, NULL);
441 for_each_sg(sg, s, nelems, i) { 442 for_each_sg(sg, s, nelems, i) {
442 sg->dma_address = bad_dma_address; 443 sg->dma_address = bad_dma_address;
443 sg->dma_length = 0; 444 sg->dma_length = 0;
@@ -445,10 +446,12 @@ error:
445 return 0; 446 return 0;
446} 447}
447 448
448static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, 449static dma_addr_t calgary_map_page(struct device *dev, struct page *page,
449 size_t size, int direction) 450 unsigned long offset, size_t size,
451 enum dma_data_direction dir,
452 struct dma_attrs *attrs)
450{ 453{
451 void *vaddr = phys_to_virt(paddr); 454 void *vaddr = page_address(page) + offset;
452 unsigned long uaddr; 455 unsigned long uaddr;
453 unsigned int npages; 456 unsigned int npages;
454 struct iommu_table *tbl = find_iommu_table(dev); 457 struct iommu_table *tbl = find_iommu_table(dev);
@@ -456,17 +459,18 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
456 uaddr = (unsigned long)vaddr; 459 uaddr = (unsigned long)vaddr;
457 npages = iommu_num_pages(uaddr, size, PAGE_SIZE); 460 npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
458 461
459 return iommu_alloc(dev, tbl, vaddr, npages, direction); 462 return iommu_alloc(dev, tbl, vaddr, npages, dir);
460} 463}
461 464
462static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, 465static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
463 size_t size, int direction) 466 size_t size, enum dma_data_direction dir,
467 struct dma_attrs *attrs)
464{ 468{
465 struct iommu_table *tbl = find_iommu_table(dev); 469 struct iommu_table *tbl = find_iommu_table(dev);
466 unsigned int npages; 470 unsigned int npages;
467 471
468 npages = iommu_num_pages(dma_handle, size, PAGE_SIZE); 472 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
469 iommu_free(tbl, dma_handle, npages); 473 iommu_free(tbl, dma_addr, npages);
470} 474}
471 475
472static void* calgary_alloc_coherent(struct device *dev, size_t size, 476static void* calgary_alloc_coherent(struct device *dev, size_t size,
@@ -515,13 +519,13 @@ static void calgary_free_coherent(struct device *dev, size_t size,
515 free_pages((unsigned long)vaddr, get_order(size)); 519 free_pages((unsigned long)vaddr, get_order(size));
516} 520}
517 521
518static struct dma_mapping_ops calgary_dma_ops = { 522static struct dma_map_ops calgary_dma_ops = {
519 .alloc_coherent = calgary_alloc_coherent, 523 .alloc_coherent = calgary_alloc_coherent,
520 .free_coherent = calgary_free_coherent, 524 .free_coherent = calgary_free_coherent,
521 .map_single = calgary_map_single,
522 .unmap_single = calgary_unmap_single,
523 .map_sg = calgary_map_sg, 525 .map_sg = calgary_map_sg,
524 .unmap_sg = calgary_unmap_sg, 526 .unmap_sg = calgary_unmap_sg,
527 .map_page = calgary_map_page,
528 .unmap_page = calgary_unmap_page,
525}; 529};
526 530
527static inline void __iomem * busno_to_bbar(unsigned char num) 531static inline void __iomem * busno_to_bbar(unsigned char num)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index b25428533141..90f5b9ef5def 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -1,4 +1,5 @@
1#include <linux/dma-mapping.h> 1#include <linux/dma-mapping.h>
2#include <linux/dma-debug.h>
2#include <linux/dmar.h> 3#include <linux/dmar.h>
3#include <linux/bootmem.h> 4#include <linux/bootmem.h>
4#include <linux/pci.h> 5#include <linux/pci.h>
@@ -12,7 +13,7 @@
12 13
13static int forbid_dac __read_mostly; 14static int forbid_dac __read_mostly;
14 15
15struct dma_mapping_ops *dma_ops; 16struct dma_map_ops *dma_ops;
16EXPORT_SYMBOL(dma_ops); 17EXPORT_SYMBOL(dma_ops);
17 18
18static int iommu_sac_force __read_mostly; 19static int iommu_sac_force __read_mostly;
@@ -44,6 +45,9 @@ struct device x86_dma_fallback_dev = {
44}; 45};
45EXPORT_SYMBOL(x86_dma_fallback_dev); 46EXPORT_SYMBOL(x86_dma_fallback_dev);
46 47
48/* Number of entries preallocated for DMA-API debugging */
49#define PREALLOC_DMA_DEBUG_ENTRIES 32768
50
47int dma_set_mask(struct device *dev, u64 mask) 51int dma_set_mask(struct device *dev, u64 mask)
48{ 52{
49 if (!dev->dma_mask || !dma_supported(dev, mask)) 53 if (!dev->dma_mask || !dma_supported(dev, mask))
@@ -224,7 +228,7 @@ early_param("iommu", iommu_setup);
224 228
225int dma_supported(struct device *dev, u64 mask) 229int dma_supported(struct device *dev, u64 mask)
226{ 230{
227 struct dma_mapping_ops *ops = get_dma_ops(dev); 231 struct dma_map_ops *ops = get_dma_ops(dev);
228 232
229#ifdef CONFIG_PCI 233#ifdef CONFIG_PCI
230 if (mask > 0xffffffff && forbid_dac > 0) { 234 if (mask > 0xffffffff && forbid_dac > 0) {
@@ -265,6 +269,12 @@ EXPORT_SYMBOL(dma_supported);
265 269
266static int __init pci_iommu_init(void) 270static int __init pci_iommu_init(void)
267{ 271{
272 dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
273
274#ifdef CONFIG_PCI
275 dma_debug_add_bus(&pci_bus_type);
276#endif
277
268 calgary_iommu_init(); 278 calgary_iommu_init();
269 279
270 intel_iommu_init(); 280 intel_iommu_init();
@@ -290,8 +300,7 @@ fs_initcall(pci_iommu_init);
290static __devinit void via_no_dac(struct pci_dev *dev) 300static __devinit void via_no_dac(struct pci_dev *dev)
291{ 301{
292 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { 302 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
293 printk(KERN_INFO 303 dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
294 "PCI: VIA PCI bridge detected. Disabling DAC.\n");
295 forbid_dac = 1; 304 forbid_dac = 1;
296 } 305 }
297} 306}
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index d5768b1af080..b284b58c035c 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -255,10 +255,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
255} 255}
256 256
257/* Map a single area into the IOMMU */ 257/* Map a single area into the IOMMU */
258static dma_addr_t 258static dma_addr_t gart_map_page(struct device *dev, struct page *page,
259gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) 259 unsigned long offset, size_t size,
260 enum dma_data_direction dir,
261 struct dma_attrs *attrs)
260{ 262{
261 unsigned long bus; 263 unsigned long bus;
264 phys_addr_t paddr = page_to_phys(page) + offset;
262 265
263 if (!dev) 266 if (!dev)
264 dev = &x86_dma_fallback_dev; 267 dev = &x86_dma_fallback_dev;
@@ -275,8 +278,9 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
275/* 278/*
276 * Free a DMA mapping. 279 * Free a DMA mapping.
277 */ 280 */
278static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, 281static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
279 size_t size, int direction) 282 size_t size, enum dma_data_direction dir,
283 struct dma_attrs *attrs)
280{ 284{
281 unsigned long iommu_page; 285 unsigned long iommu_page;
282 int npages; 286 int npages;
@@ -298,8 +302,8 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
298/* 302/*
299 * Wrapper for pci_unmap_single working with scatterlists. 303 * Wrapper for pci_unmap_single working with scatterlists.
300 */ 304 */
301static void 305static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
302gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) 306 enum dma_data_direction dir, struct dma_attrs *attrs)
303{ 307{
304 struct scatterlist *s; 308 struct scatterlist *s;
305 int i; 309 int i;
@@ -307,7 +311,7 @@ gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
307 for_each_sg(sg, s, nents, i) { 311 for_each_sg(sg, s, nents, i) {
308 if (!s->dma_length || !s->length) 312 if (!s->dma_length || !s->length)
309 break; 313 break;
310 gart_unmap_single(dev, s->dma_address, s->dma_length, dir); 314 gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL);
311 } 315 }
312} 316}
313 317
@@ -329,7 +333,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
329 addr = dma_map_area(dev, addr, s->length, dir, 0); 333 addr = dma_map_area(dev, addr, s->length, dir, 0);
330 if (addr == bad_dma_address) { 334 if (addr == bad_dma_address) {
331 if (i > 0) 335 if (i > 0)
332 gart_unmap_sg(dev, sg, i, dir); 336 gart_unmap_sg(dev, sg, i, dir, NULL);
333 nents = 0; 337 nents = 0;
334 sg[0].dma_length = 0; 338 sg[0].dma_length = 0;
335 break; 339 break;
@@ -400,8 +404,8 @@ dma_map_cont(struct device *dev, struct scatterlist *start, int nelems,
400 * DMA map all entries in a scatterlist. 404 * DMA map all entries in a scatterlist.
401 * Merge chunks that have page aligned sizes into a continuous mapping. 405 * Merge chunks that have page aligned sizes into a continuous mapping.
402 */ 406 */
403static int 407static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
404gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) 408 enum dma_data_direction dir, struct dma_attrs *attrs)
405{ 409{
406 struct scatterlist *s, *ps, *start_sg, *sgmap; 410 struct scatterlist *s, *ps, *start_sg, *sgmap;
407 int need = 0, nextneed, i, out, start; 411 int need = 0, nextneed, i, out, start;
@@ -468,7 +472,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
468 472
469error: 473error:
470 flush_gart(); 474 flush_gart();
471 gart_unmap_sg(dev, sg, out, dir); 475 gart_unmap_sg(dev, sg, out, dir, NULL);
472 476
473 /* When it was forced or merged try again in a dumb way */ 477 /* When it was forced or merged try again in a dumb way */
474 if (force_iommu || iommu_merge) { 478 if (force_iommu || iommu_merge) {
@@ -521,7 +525,7 @@ static void
521gart_free_coherent(struct device *dev, size_t size, void *vaddr, 525gart_free_coherent(struct device *dev, size_t size, void *vaddr,
522 dma_addr_t dma_addr) 526 dma_addr_t dma_addr)
523{ 527{
524 gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL); 528 gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
525 free_pages((unsigned long)vaddr, get_order(size)); 529 free_pages((unsigned long)vaddr, get_order(size));
526} 530}
527 531
@@ -707,11 +711,11 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
707 return -1; 711 return -1;
708} 712}
709 713
710static struct dma_mapping_ops gart_dma_ops = { 714static struct dma_map_ops gart_dma_ops = {
711 .map_single = gart_map_single,
712 .unmap_single = gart_unmap_single,
713 .map_sg = gart_map_sg, 715 .map_sg = gart_map_sg,
714 .unmap_sg = gart_unmap_sg, 716 .unmap_sg = gart_unmap_sg,
717 .map_page = gart_map_page,
718 .unmap_page = gart_unmap_page,
715 .alloc_coherent = gart_alloc_coherent, 719 .alloc_coherent = gart_alloc_coherent,
716 .free_coherent = gart_free_coherent, 720 .free_coherent = gart_free_coherent,
717}; 721};
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index c70ab5a5d4c8..c6d703b39326 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -1,14 +1,14 @@
1/* Fallback functions when the main IOMMU code is not compiled in. This 1/* Fallback functions when the main IOMMU code is not compiled in. This
2 code is roughly equivalent to i386. */ 2 code is roughly equivalent to i386. */
3#include <linux/mm.h>
4#include <linux/init.h>
5#include <linux/pci.h>
6#include <linux/string.h>
7#include <linux/dma-mapping.h> 3#include <linux/dma-mapping.h>
8#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <linux/string.h>
6#include <linux/init.h>
7#include <linux/pci.h>
8#include <linux/mm.h>
9 9
10#include <asm/iommu.h>
11#include <asm/processor.h> 10#include <asm/processor.h>
11#include <asm/iommu.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13
14static int 14static int
@@ -25,19 +25,19 @@ check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
25 return 1; 25 return 1;
26} 26}
27 27
28static dma_addr_t 28static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
29nommu_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, 29 unsigned long offset, size_t size,
30 int direction) 30 enum dma_data_direction dir,
31 struct dma_attrs *attrs)
31{ 32{
32 dma_addr_t bus = paddr; 33 dma_addr_t bus = page_to_phys(page) + offset;
33 WARN_ON(size == 0); 34 WARN_ON(size == 0);
34 if (!check_addr("map_single", hwdev, bus, size)) 35 if (!check_addr("map_single", dev, bus, size))
35 return bad_dma_address; 36 return bad_dma_address;
36 flush_write_buffers(); 37 flush_write_buffers();
37 return bus; 38 return bus;
38} 39}
39 40
40
41/* Map a set of buffers described by scatterlist in streaming 41/* Map a set of buffers described by scatterlist in streaming
42 * mode for DMA. This is the scatter-gather version of the 42 * mode for DMA. This is the scatter-gather version of the
43 * above pci_map_single interface. Here the scatter gather list 43 * above pci_map_single interface. Here the scatter gather list
@@ -54,7 +54,8 @@ nommu_map_single(struct device *hwdev, phys_addr_t paddr, size_t size,
54 * the same here. 54 * the same here.
55 */ 55 */
56static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, 56static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
57 int nents, int direction) 57 int nents, enum dma_data_direction dir,
58 struct dma_attrs *attrs)
58{ 59{
59 struct scatterlist *s; 60 struct scatterlist *s;
60 int i; 61 int i;
@@ -78,12 +79,12 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
78 free_pages((unsigned long)vaddr, get_order(size)); 79 free_pages((unsigned long)vaddr, get_order(size));
79} 80}
80 81
81struct dma_mapping_ops nommu_dma_ops = { 82struct dma_map_ops nommu_dma_ops = {
82 .alloc_coherent = dma_generic_alloc_coherent, 83 .alloc_coherent = dma_generic_alloc_coherent,
83 .free_coherent = nommu_free_coherent, 84 .free_coherent = nommu_free_coherent,
84 .map_single = nommu_map_single, 85 .map_sg = nommu_map_sg,
85 .map_sg = nommu_map_sg, 86 .map_page = nommu_map_page,
86 .is_phys = 1, 87 .is_phys = 1,
87}; 88};
88 89
89void __init no_iommu_init(void) 90void __init no_iommu_init(void)
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb.c
index d59c91747665..34f12e9996ed 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -33,18 +33,11 @@ phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
33 return baddr; 33 return baddr;
34} 34}
35 35
36int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size) 36int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
37{ 37{
38 return 0; 38 return 0;
39} 39}
40 40
41static dma_addr_t
42swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
43 int direction)
44{
45 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
46}
47
48static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, 41static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
49 dma_addr_t *dma_handle, gfp_t flags) 42 dma_addr_t *dma_handle, gfp_t flags)
50{ 43{
@@ -57,20 +50,20 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
57 return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); 50 return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
58} 51}
59 52
60struct dma_mapping_ops swiotlb_dma_ops = { 53struct dma_map_ops swiotlb_dma_ops = {
61 .mapping_error = swiotlb_dma_mapping_error, 54 .mapping_error = swiotlb_dma_mapping_error,
62 .alloc_coherent = x86_swiotlb_alloc_coherent, 55 .alloc_coherent = x86_swiotlb_alloc_coherent,
63 .free_coherent = swiotlb_free_coherent, 56 .free_coherent = swiotlb_free_coherent,
64 .map_single = swiotlb_map_single_phys,
65 .unmap_single = swiotlb_unmap_single,
66 .sync_single_for_cpu = swiotlb_sync_single_for_cpu, 57 .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
67 .sync_single_for_device = swiotlb_sync_single_for_device, 58 .sync_single_for_device = swiotlb_sync_single_for_device,
68 .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, 59 .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
69 .sync_single_range_for_device = swiotlb_sync_single_range_for_device, 60 .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
70 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, 61 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
71 .sync_sg_for_device = swiotlb_sync_sg_for_device, 62 .sync_sg_for_device = swiotlb_sync_sg_for_device,
72 .map_sg = swiotlb_map_sg, 63 .map_sg = swiotlb_map_sg_attrs,
73 .unmap_sg = swiotlb_unmap_sg, 64 .unmap_sg = swiotlb_unmap_sg_attrs,
65 .map_page = swiotlb_map_page,
66 .unmap_page = swiotlb_unmap_page,
74 .dma_supported = NULL, 67 .dma_supported = NULL,
75}; 68};
76 69
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 87b69d4fac16..ca989158e847 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,16 +1,19 @@
1#include <linux/errno.h> 1#include <linux/errno.h>
2#include <linux/kernel.h> 2#include <linux/kernel.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <asm/idle.h>
5#include <linux/smp.h> 4#include <linux/smp.h>
5#include <linux/prctl.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/ftrace.h> 11#include <trace/power.h>
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/apic.h> 13#include <asm/apic.h>
14#include <asm/idle.h>
15#include <asm/uaccess.h>
16#include <asm/i387.h>
14 17
15unsigned long idle_halt; 18unsigned long idle_halt;
16EXPORT_SYMBOL(idle_halt); 19EXPORT_SYMBOL(idle_halt);
@@ -19,6 +22,9 @@ EXPORT_SYMBOL(idle_nomwait);
19 22
20struct kmem_cache *task_xstate_cachep; 23struct kmem_cache *task_xstate_cachep;
21 24
25DEFINE_TRACE(power_start);
26DEFINE_TRACE(power_end);
27
22int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 28int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
23{ 29{
24 *dst = *src; 30 *dst = *src;
@@ -56,6 +62,193 @@ void arch_task_cache_init(void)
56} 62}
57 63
58/* 64/*
65 * Free current thread data structures etc..
66 */
67void exit_thread(void)
68{
69 struct task_struct *me = current;
70 struct thread_struct *t = &me->thread;
71 unsigned long *bp = t->io_bitmap_ptr;
72
73 if (bp) {
74 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
75
76 t->io_bitmap_ptr = NULL;
77 clear_thread_flag(TIF_IO_BITMAP);
78 /*
79 * Careful, clear this in the TSS too:
80 */
81 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
82 t->io_bitmap_max = 0;
83 put_cpu();
84 kfree(bp);
85 }
86
87 ds_exit_thread(current);
88}
89
90void flush_thread(void)
91{
92 struct task_struct *tsk = current;
93
94#ifdef CONFIG_X86_64
95 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
96 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
97 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
98 clear_tsk_thread_flag(tsk, TIF_IA32);
99 } else {
100 set_tsk_thread_flag(tsk, TIF_IA32);
101 current_thread_info()->status |= TS_COMPAT;
102 }
103 }
104#endif
105
106 clear_tsk_thread_flag(tsk, TIF_DEBUG);
107
108 tsk->thread.debugreg0 = 0;
109 tsk->thread.debugreg1 = 0;
110 tsk->thread.debugreg2 = 0;
111 tsk->thread.debugreg3 = 0;
112 tsk->thread.debugreg6 = 0;
113 tsk->thread.debugreg7 = 0;
114 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
115 /*
116 * Forget coprocessor state..
117 */
118 tsk->fpu_counter = 0;
119 clear_fpu(tsk);
120 clear_used_math();
121}
122
123static void hard_disable_TSC(void)
124{
125 write_cr4(read_cr4() | X86_CR4_TSD);
126}
127
128void disable_TSC(void)
129{
130 preempt_disable();
131 if (!test_and_set_thread_flag(TIF_NOTSC))
132 /*
133 * Must flip the CPU state synchronously with
134 * TIF_NOTSC in the current running context.
135 */
136 hard_disable_TSC();
137 preempt_enable();
138}
139
140static void hard_enable_TSC(void)
141{
142 write_cr4(read_cr4() & ~X86_CR4_TSD);
143}
144
145static void enable_TSC(void)
146{
147 preempt_disable();
148 if (test_and_clear_thread_flag(TIF_NOTSC))
149 /*
150 * Must flip the CPU state synchronously with
151 * TIF_NOTSC in the current running context.
152 */
153 hard_enable_TSC();
154 preempt_enable();
155}
156
157int get_tsc_mode(unsigned long adr)
158{
159 unsigned int val;
160
161 if (test_thread_flag(TIF_NOTSC))
162 val = PR_TSC_SIGSEGV;
163 else
164 val = PR_TSC_ENABLE;
165
166 return put_user(val, (unsigned int __user *)adr);
167}
168
169int set_tsc_mode(unsigned int val)
170{
171 if (val == PR_TSC_SIGSEGV)
172 disable_TSC();
173 else if (val == PR_TSC_ENABLE)
174 enable_TSC();
175 else
176 return -EINVAL;
177
178 return 0;
179}
180
181void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
182 struct tss_struct *tss)
183{
184 struct thread_struct *prev, *next;
185
186 prev = &prev_p->thread;
187 next = &next_p->thread;
188
189 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
190 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
191 ds_switch_to(prev_p, next_p);
192 else if (next->debugctlmsr != prev->debugctlmsr)
193 update_debugctlmsr(next->debugctlmsr);
194
195 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
196 set_debugreg(next->debugreg0, 0);
197 set_debugreg(next->debugreg1, 1);
198 set_debugreg(next->debugreg2, 2);
199 set_debugreg(next->debugreg3, 3);
200 /* no 4 and 5 */
201 set_debugreg(next->debugreg6, 6);
202 set_debugreg(next->debugreg7, 7);
203 }
204
205 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
206 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
207 /* prev and next are different */
208 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
209 hard_disable_TSC();
210 else
211 hard_enable_TSC();
212 }
213
214 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
215 /*
216 * Copy the relevant range of the IO bitmap.
217 * Normally this is 128 bytes or less:
218 */
219 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
220 max(prev->io_bitmap_max, next->io_bitmap_max));
221 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
222 /*
223 * Clear any possible leftover bits:
224 */
225 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
226 }
227}
228
229int sys_fork(struct pt_regs *regs)
230{
231 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
232}
233
234/*
235 * This is trivial, and on the face of it looks like it
236 * could equally well be done in user mode.
237 *
238 * Not so, for quite unobvious reasons - register pressure.
239 * In user mode vfork() cannot have a stack frame, and if
240 * done by calling the "clone()" system call directly, you
241 * do not have enough call-clobbered registers to hold all
242 * the information you need.
243 */
244int sys_vfork(struct pt_regs *regs)
245{
246 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
247 NULL, NULL);
248}
249
250
251/*
59 * Idle related variables and functions 252 * Idle related variables and functions
60 */ 253 */
61unsigned long boot_option_idle_override = 0; 254unsigned long boot_option_idle_override = 0;
@@ -135,7 +328,7 @@ void stop_this_cpu(void *dummy)
135 /* 328 /*
136 * Remove this CPU: 329 * Remove this CPU:
137 */ 330 */
138 cpu_clear(smp_processor_id(), cpu_online_map); 331 set_cpu_online(smp_processor_id(), false);
139 disable_local_APIC(); 332 disable_local_APIC();
140 333
141 for (;;) { 334 for (;;) {
@@ -285,12 +478,13 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
285 return 1; 478 return 1;
286} 479}
287 480
288static cpumask_t c1e_mask = CPU_MASK_NONE; 481static cpumask_var_t c1e_mask;
289static int c1e_detected; 482static int c1e_detected;
290 483
291void c1e_remove_cpu(int cpu) 484void c1e_remove_cpu(int cpu)
292{ 485{
293 cpu_clear(cpu, c1e_mask); 486 if (c1e_mask != NULL)
487 cpumask_clear_cpu(cpu, c1e_mask);
294} 488}
295 489
296/* 490/*
@@ -319,8 +513,8 @@ static void c1e_idle(void)
319 if (c1e_detected) { 513 if (c1e_detected) {
320 int cpu = smp_processor_id(); 514 int cpu = smp_processor_id();
321 515
322 if (!cpu_isset(cpu, c1e_mask)) { 516 if (!cpumask_test_cpu(cpu, c1e_mask)) {
323 cpu_set(cpu, c1e_mask); 517 cpumask_set_cpu(cpu, c1e_mask);
324 /* 518 /*
325 * Force broadcast so ACPI can not interfere. Needs 519 * Force broadcast so ACPI can not interfere. Needs
326 * to run with interrupts enabled as it uses 520 * to run with interrupts enabled as it uses
@@ -372,6 +566,15 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
372 pm_idle = default_idle; 566 pm_idle = default_idle;
373} 567}
374 568
569void __init init_c1e_mask(void)
570{
571 /* If we're using c1e_idle, we need to allocate c1e_mask. */
572 if (pm_idle == c1e_idle) {
573 alloc_cpumask_var(&c1e_mask, GFP_KERNEL);
574 cpumask_clear(c1e_mask);
575 }
576}
577
375static int __init idle_setup(char *str) 578static int __init idle_setup(char *str)
376{ 579{
377 if (!str) 580 if (!str)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 646da41a620a..76f8f84043a2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -230,55 +230,6 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
230} 230}
231EXPORT_SYMBOL(kernel_thread); 231EXPORT_SYMBOL(kernel_thread);
232 232
233/*
234 * Free current thread data structures etc..
235 */
236void exit_thread(void)
237{
238 /* The process may have allocated an io port bitmap... nuke it. */
239 if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
240 struct task_struct *tsk = current;
241 struct thread_struct *t = &tsk->thread;
242 int cpu = get_cpu();
243 struct tss_struct *tss = &per_cpu(init_tss, cpu);
244
245 kfree(t->io_bitmap_ptr);
246 t->io_bitmap_ptr = NULL;
247 clear_thread_flag(TIF_IO_BITMAP);
248 /*
249 * Careful, clear this in the TSS too:
250 */
251 memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
252 t->io_bitmap_max = 0;
253 tss->io_bitmap_owner = NULL;
254 tss->io_bitmap_max = 0;
255 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
256 put_cpu();
257 }
258
259 ds_exit_thread(current);
260}
261
262void flush_thread(void)
263{
264 struct task_struct *tsk = current;
265
266 tsk->thread.debugreg0 = 0;
267 tsk->thread.debugreg1 = 0;
268 tsk->thread.debugreg2 = 0;
269 tsk->thread.debugreg3 = 0;
270 tsk->thread.debugreg6 = 0;
271 tsk->thread.debugreg7 = 0;
272 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
273 clear_tsk_thread_flag(tsk, TIF_DEBUG);
274 /*
275 * Forget coprocessor state..
276 */
277 tsk->fpu_counter = 0;
278 clear_fpu(tsk);
279 clear_used_math();
280}
281
282void release_thread(struct task_struct *dead_task) 233void release_thread(struct task_struct *dead_task)
283{ 234{
284 BUG_ON(dead_task->mm); 235 BUG_ON(dead_task->mm);
@@ -294,7 +245,7 @@ void prepare_to_copy(struct task_struct *tsk)
294 unlazy_fpu(tsk); 245 unlazy_fpu(tsk);
295} 246}
296 247
297int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, 248int copy_thread(unsigned long clone_flags, unsigned long sp,
298 unsigned long unused, 249 unsigned long unused,
299 struct task_struct *p, struct pt_regs *regs) 250 struct task_struct *p, struct pt_regs *regs)
300{ 251{
@@ -366,127 +317,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
366} 317}
367EXPORT_SYMBOL_GPL(start_thread); 318EXPORT_SYMBOL_GPL(start_thread);
368 319
369static void hard_disable_TSC(void)
370{
371 write_cr4(read_cr4() | X86_CR4_TSD);
372}
373
374void disable_TSC(void)
375{
376 preempt_disable();
377 if (!test_and_set_thread_flag(TIF_NOTSC))
378 /*
379 * Must flip the CPU state synchronously with
380 * TIF_NOTSC in the current running context.
381 */
382 hard_disable_TSC();
383 preempt_enable();
384}
385
386static void hard_enable_TSC(void)
387{
388 write_cr4(read_cr4() & ~X86_CR4_TSD);
389}
390
391static void enable_TSC(void)
392{
393 preempt_disable();
394 if (test_and_clear_thread_flag(TIF_NOTSC))
395 /*
396 * Must flip the CPU state synchronously with
397 * TIF_NOTSC in the current running context.
398 */
399 hard_enable_TSC();
400 preempt_enable();
401}
402
403int get_tsc_mode(unsigned long adr)
404{
405 unsigned int val;
406
407 if (test_thread_flag(TIF_NOTSC))
408 val = PR_TSC_SIGSEGV;
409 else
410 val = PR_TSC_ENABLE;
411
412 return put_user(val, (unsigned int __user *)adr);
413}
414
415int set_tsc_mode(unsigned int val)
416{
417 if (val == PR_TSC_SIGSEGV)
418 disable_TSC();
419 else if (val == PR_TSC_ENABLE)
420 enable_TSC();
421 else
422 return -EINVAL;
423
424 return 0;
425}
426
427static noinline void
428__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
429 struct tss_struct *tss)
430{
431 struct thread_struct *prev, *next;
432
433 prev = &prev_p->thread;
434 next = &next_p->thread;
435
436 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
437 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
438 ds_switch_to(prev_p, next_p);
439 else if (next->debugctlmsr != prev->debugctlmsr)
440 update_debugctlmsr(next->debugctlmsr);
441
442 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
443 set_debugreg(next->debugreg0, 0);
444 set_debugreg(next->debugreg1, 1);
445 set_debugreg(next->debugreg2, 2);
446 set_debugreg(next->debugreg3, 3);
447 /* no 4 and 5 */
448 set_debugreg(next->debugreg6, 6);
449 set_debugreg(next->debugreg7, 7);
450 }
451
452 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
453 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
454 /* prev and next are different */
455 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
456 hard_disable_TSC();
457 else
458 hard_enable_TSC();
459 }
460
461 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
462 /*
463 * Disable the bitmap via an invalid offset. We still cache
464 * the previous bitmap owner and the IO bitmap contents:
465 */
466 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
467 return;
468 }
469
470 if (likely(next == tss->io_bitmap_owner)) {
471 /*
472 * Previous owner of the bitmap (hence the bitmap content)
473 * matches the next task, we dont have to do anything but
474 * to set a valid offset in the TSS:
475 */
476 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
477 return;
478 }
479 /*
480 * Lazy TSS's I/O bitmap copy. We set an invalid offset here
481 * and we let the task to get a GPF in case an I/O instruction
482 * is performed. The handler of the GPF will verify that the
483 * faulting task has a valid I/O bitmap and, it true, does the
484 * real copy and restart the instruction. This will save us
485 * redundant copies when the currently switched task does not
486 * perform any I/O during its timeslice.
487 */
488 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
489}
490 320
491/* 321/*
492 * switch_to(x,yn) should switch tasks from x to y. 322 * switch_to(x,yn) should switch tasks from x to y.
@@ -600,11 +430,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
600 return prev_p; 430 return prev_p;
601} 431}
602 432
603int sys_fork(struct pt_regs *regs)
604{
605 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
606}
607
608int sys_clone(struct pt_regs *regs) 433int sys_clone(struct pt_regs *regs)
609{ 434{
610 unsigned long clone_flags; 435 unsigned long clone_flags;
@@ -621,21 +446,6 @@ int sys_clone(struct pt_regs *regs)
621} 446}
622 447
623/* 448/*
624 * This is trivial, and on the face of it looks like it
625 * could equally well be done in user mode.
626 *
627 * Not so, for quite unobvious reasons - register pressure.
628 * In user mode vfork() cannot have a stack frame, and if
629 * done by calling the "clone()" system call directly, you
630 * do not have enough call-clobbered registers to hold all
631 * the information you need.
632 */
633int sys_vfork(struct pt_regs *regs)
634{
635 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL);
636}
637
638/*
639 * sys_execve() executes a new program. 449 * sys_execve() executes a new program.
640 */ 450 */
641int sys_execve(struct pt_regs *regs) 451int sys_execve(struct pt_regs *regs)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 836ef6575f01..b751a41392b1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -237,61 +237,6 @@ void show_regs(struct pt_regs *regs)
237 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 237 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
238} 238}
239 239
240/*
241 * Free current thread data structures etc..
242 */
243void exit_thread(void)
244{
245 struct task_struct *me = current;
246 struct thread_struct *t = &me->thread;
247
248 if (me->thread.io_bitmap_ptr) {
249 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
250
251 kfree(t->io_bitmap_ptr);
252 t->io_bitmap_ptr = NULL;
253 clear_thread_flag(TIF_IO_BITMAP);
254 /*
255 * Careful, clear this in the TSS too:
256 */
257 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
258 t->io_bitmap_max = 0;
259 put_cpu();
260 }
261
262 ds_exit_thread(current);
263}
264
265void flush_thread(void)
266{
267 struct task_struct *tsk = current;
268
269 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
270 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
271 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
272 clear_tsk_thread_flag(tsk, TIF_IA32);
273 } else {
274 set_tsk_thread_flag(tsk, TIF_IA32);
275 current_thread_info()->status |= TS_COMPAT;
276 }
277 }
278 clear_tsk_thread_flag(tsk, TIF_DEBUG);
279
280 tsk->thread.debugreg0 = 0;
281 tsk->thread.debugreg1 = 0;
282 tsk->thread.debugreg2 = 0;
283 tsk->thread.debugreg3 = 0;
284 tsk->thread.debugreg6 = 0;
285 tsk->thread.debugreg7 = 0;
286 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
287 /*
288 * Forget coprocessor state..
289 */
290 tsk->fpu_counter = 0;
291 clear_fpu(tsk);
292 clear_used_math();
293}
294
295void release_thread(struct task_struct *dead_task) 240void release_thread(struct task_struct *dead_task)
296{ 241{
297 if (dead_task->mm) { 242 if (dead_task->mm) {
@@ -333,7 +278,7 @@ void prepare_to_copy(struct task_struct *tsk)
333 unlazy_fpu(tsk); 278 unlazy_fpu(tsk);
334} 279}
335 280
336int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, 281int copy_thread(unsigned long clone_flags, unsigned long sp,
337 unsigned long unused, 282 unsigned long unused,
338 struct task_struct *p, struct pt_regs *regs) 283 struct task_struct *p, struct pt_regs *regs)
339{ 284{
@@ -425,118 +370,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
425} 370}
426EXPORT_SYMBOL_GPL(start_thread); 371EXPORT_SYMBOL_GPL(start_thread);
427 372
428static void hard_disable_TSC(void)
429{
430 write_cr4(read_cr4() | X86_CR4_TSD);
431}
432
433void disable_TSC(void)
434{
435 preempt_disable();
436 if (!test_and_set_thread_flag(TIF_NOTSC))
437 /*
438 * Must flip the CPU state synchronously with
439 * TIF_NOTSC in the current running context.
440 */
441 hard_disable_TSC();
442 preempt_enable();
443}
444
445static void hard_enable_TSC(void)
446{
447 write_cr4(read_cr4() & ~X86_CR4_TSD);
448}
449
450static void enable_TSC(void)
451{
452 preempt_disable();
453 if (test_and_clear_thread_flag(TIF_NOTSC))
454 /*
455 * Must flip the CPU state synchronously with
456 * TIF_NOTSC in the current running context.
457 */
458 hard_enable_TSC();
459 preempt_enable();
460}
461
462int get_tsc_mode(unsigned long adr)
463{
464 unsigned int val;
465
466 if (test_thread_flag(TIF_NOTSC))
467 val = PR_TSC_SIGSEGV;
468 else
469 val = PR_TSC_ENABLE;
470
471 return put_user(val, (unsigned int __user *)adr);
472}
473
474int set_tsc_mode(unsigned int val)
475{
476 if (val == PR_TSC_SIGSEGV)
477 disable_TSC();
478 else if (val == PR_TSC_ENABLE)
479 enable_TSC();
480 else
481 return -EINVAL;
482
483 return 0;
484}
485
486/*
487 * This special macro can be used to load a debugging register
488 */
489#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
490
491static inline void __switch_to_xtra(struct task_struct *prev_p,
492 struct task_struct *next_p,
493 struct tss_struct *tss)
494{
495 struct thread_struct *prev, *next;
496
497 prev = &prev_p->thread,
498 next = &next_p->thread;
499
500 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
501 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
502 ds_switch_to(prev_p, next_p);
503 else if (next->debugctlmsr != prev->debugctlmsr)
504 update_debugctlmsr(next->debugctlmsr);
505
506 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
507 loaddebug(next, 0);
508 loaddebug(next, 1);
509 loaddebug(next, 2);
510 loaddebug(next, 3);
511 /* no 4 and 5 */
512 loaddebug(next, 6);
513 loaddebug(next, 7);
514 }
515
516 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
517 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
518 /* prev and next are different */
519 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
520 hard_disable_TSC();
521 else
522 hard_enable_TSC();
523 }
524
525 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
526 /*
527 * Copy the relevant range of the IO bitmap.
528 * Normally this is 128 bytes or less:
529 */
530 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
531 max(prev->io_bitmap_max, next->io_bitmap_max));
532 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
533 /*
534 * Clear any possible leftover bits:
535 */
536 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
537 }
538}
539
540/* 373/*
541 * switch_to(x,y) should switch tasks from x to y. 374 * switch_to(x,y) should switch tasks from x to y.
542 * 375 *
@@ -694,11 +527,6 @@ void set_personality_64bit(void)
694 current->personality &= ~READ_IMPLIES_EXEC; 527 current->personality &= ~READ_IMPLIES_EXEC;
695} 528}
696 529
697asmlinkage long sys_fork(struct pt_regs *regs)
698{
699 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
700}
701
702asmlinkage long 530asmlinkage long
703sys_clone(unsigned long clone_flags, unsigned long newsp, 531sys_clone(unsigned long clone_flags, unsigned long newsp,
704 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) 532 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
@@ -708,22 +536,6 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
708 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); 536 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
709} 537}
710 538
711/*
712 * This is trivial, and on the face of it looks like it
713 * could equally well be done in user mode.
714 *
715 * Not so, for quite unobvious reasons - register pressure.
716 * In user mode vfork() cannot have a stack frame, and if
717 * done by calling the "clone()" system call directly, you
718 * do not have enough call-clobbered registers to hold all
719 * the information you need.
720 */
721asmlinkage long sys_vfork(struct pt_regs *regs)
722{
723 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
724 NULL, NULL);
725}
726
727unsigned long get_wchan(struct task_struct *p) 539unsigned long get_wchan(struct task_struct *p)
728{ 540{
729 unsigned long stack; 541 unsigned long stack;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 3d9672e59c16..fe9345c967de 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/seccomp.h> 22#include <linux/seccomp.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/ftrace.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
@@ -685,9 +686,8 @@ static int ptrace_bts_config(struct task_struct *child,
685 if (!cfg.signal) 686 if (!cfg.signal)
686 return -EINVAL; 687 return -EINVAL;
687 688
688 return -EOPNOTSUPP;
689
690 child->thread.bts_ovfl_signal = cfg.signal; 689 child->thread.bts_ovfl_signal = cfg.signal;
690 return -EOPNOTSUPP;
691 } 691 }
692 692
693 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && 693 if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
@@ -1416,6 +1416,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1416 tracehook_report_syscall_entry(regs)) 1416 tracehook_report_syscall_entry(regs))
1417 ret = -1L; 1417 ret = -1L;
1418 1418
1419 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
1420 ftrace_syscall_enter(regs);
1421
1419 if (unlikely(current->audit_context)) { 1422 if (unlikely(current->audit_context)) {
1420 if (IS_IA32) 1423 if (IS_IA32)
1421 audit_syscall_entry(AUDIT_ARCH_I386, 1424 audit_syscall_entry(AUDIT_ARCH_I386,
@@ -1439,6 +1442,9 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1439 if (unlikely(current->audit_context)) 1442 if (unlikely(current->audit_context))
1440 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1443 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1441 1444
1445 if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
1446 ftrace_syscall_exit(regs);
1447
1442 if (test_thread_flag(TIF_SYSCALL_TRACE)) 1448 if (test_thread_flag(TIF_SYSCALL_TRACE))
1443 tracehook_report_syscall_exit(regs, 0); 1449 tracehook_report_syscall_exit(regs, 0);
1444 1450
@@ -1456,6 +1462,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1456 * system call instruction. 1462 * system call instruction.
1457 */ 1463 */
1458 if (test_thread_flag(TIF_SINGLESTEP) && 1464 if (test_thread_flag(TIF_SINGLESTEP) &&
1459 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) 1465 tracehook_consider_fatal_signal(current, SIGTRAP))
1460 send_sigtrap(current, regs, 0, TRAP_BRKPT); 1466 send_sigtrap(current, regs, 0, TRAP_BRKPT);
1461} 1467}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 309949e9e1c1..e95022e4f5d5 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void)
74 if (!force_hpet_address) 74 if (!force_hpet_address)
75 return; 75 return;
76 76
77 if (rcba_base == NULL) 77 BUG_ON(rcba_base == NULL);
78 BUG();
79 78
80 /* read the Function Disable register, dword mode only */ 79 /* read the Function Disable register, dword mode only */
81 val = readl(rcba_base + 0x3404); 80 val = readl(rcba_base + 0x3404);
@@ -172,7 +171,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
172 ich_force_enable_hpet); 171 ich_force_enable_hpet);
173DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, 172DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
174 ich_force_enable_hpet); 173 ich_force_enable_hpet);
175 174DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16, /* ICH10 */
175 ich_force_enable_hpet);
176 176
177static struct pci_dev *cached_dev; 177static struct pci_dev *cached_dev;
178 178
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1cc18d439bbb..2aef36d8aca2 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
216 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), 216 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
217 }, 217 },
218 }, 218 },
219 { /* Handle problems with rebooting on Dell XPS710 */
220 .callback = set_bios_reboot,
221 .ident = "Dell XPS710",
222 .matches = {
223 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
224 DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
225 },
226 },
219 { } 227 { }
220}; 228};
221 229
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 2064d0aa8d28..41235531b11c 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -17,7 +17,8 @@
17 17
18#define PTR(x) (x << 2) 18#define PTR(x) (x << 2)
19 19
20/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE 20/*
21 * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
21 * ~ control_page + PAGE_SIZE are used as data storage and stack for 22 * ~ control_page + PAGE_SIZE are used as data storage and stack for
22 * jumping back 23 * jumping back
23 */ 24 */
@@ -76,8 +77,10 @@ relocate_kernel:
76 movl %eax, CP_PA_SWAP_PAGE(%edi) 77 movl %eax, CP_PA_SWAP_PAGE(%edi)
77 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) 78 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
78 79
79 /* get physical address of control page now */ 80 /*
80 /* this is impossible after page table switch */ 81 * get physical address of control page now
82 * this is impossible after page table switch
83 */
81 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 84 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
82 85
83 /* switch to new set of page tables */ 86 /* switch to new set of page tables */
@@ -97,7 +100,8 @@ identity_mapped:
97 /* store the start address on the stack */ 100 /* store the start address on the stack */
98 pushl %edx 101 pushl %edx
99 102
100 /* Set cr0 to a known state: 103 /*
104 * Set cr0 to a known state:
101 * - Paging disabled 105 * - Paging disabled
102 * - Alignment check disabled 106 * - Alignment check disabled
103 * - Write protect disabled 107 * - Write protect disabled
@@ -113,7 +117,8 @@ identity_mapped:
113 /* clear cr4 if applicable */ 117 /* clear cr4 if applicable */
114 testl %ecx, %ecx 118 testl %ecx, %ecx
115 jz 1f 119 jz 1f
116 /* Set cr4 to a known state: 120 /*
121 * Set cr4 to a known state:
117 * Setting everything to zero seems safe. 122 * Setting everything to zero seems safe.
118 */ 123 */
119 xorl %eax, %eax 124 xorl %eax, %eax
@@ -132,15 +137,18 @@ identity_mapped:
132 call swap_pages 137 call swap_pages
133 addl $8, %esp 138 addl $8, %esp
134 139
135 /* To be certain of avoiding problems with self-modifying code 140 /*
141 * To be certain of avoiding problems with self-modifying code
136 * I need to execute a serializing instruction here. 142 * I need to execute a serializing instruction here.
137 * So I flush the TLB, it's handy, and not processor dependent. 143 * So I flush the TLB, it's handy, and not processor dependent.
138 */ 144 */
139 xorl %eax, %eax 145 xorl %eax, %eax
140 movl %eax, %cr3 146 movl %eax, %cr3
141 147
142 /* set all of the registers to known values */ 148 /*
143 /* leave %esp alone */ 149 * set all of the registers to known values
150 * leave %esp alone
151 */
144 152
145 testl %esi, %esi 153 testl %esi, %esi
146 jnz 1f 154 jnz 1f
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index d32cfb27a479..4de8f5b3d476 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -19,29 +19,77 @@
19#define PTR(x) (x << 3) 19#define PTR(x) (x << 3)
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21 21
22/*
23 * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
24 * ~ control_page + PAGE_SIZE are used as data storage and stack for
25 * jumping back
26 */
27#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
28
29/* Minimal CPU state */
30#define RSP DATA(0x0)
31#define CR0 DATA(0x8)
32#define CR3 DATA(0x10)
33#define CR4 DATA(0x18)
34
35/* other data */
36#define CP_PA_TABLE_PAGE DATA(0x20)
37#define CP_PA_SWAP_PAGE DATA(0x28)
38#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
39
22 .text 40 .text
23 .align PAGE_SIZE 41 .align PAGE_SIZE
24 .code64 42 .code64
25 .globl relocate_kernel 43 .globl relocate_kernel
26relocate_kernel: 44relocate_kernel:
27 /* %rdi indirection_page 45 /*
46 * %rdi indirection_page
28 * %rsi page_list 47 * %rsi page_list
29 * %rdx start address 48 * %rdx start address
49 * %rcx preserve_context
30 */ 50 */
31 51
52 /* Save the CPU context, used for jumping back */
53 pushq %rbx
54 pushq %rbp
55 pushq %r12
56 pushq %r13
57 pushq %r14
58 pushq %r15
59 pushf
60
61 movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
62 movq %rsp, RSP(%r11)
63 movq %cr0, %rax
64 movq %rax, CR0(%r11)
65 movq %cr3, %rax
66 movq %rax, CR3(%r11)
67 movq %cr4, %rax
68 movq %rax, CR4(%r11)
69
32 /* zero out flags, and disable interrupts */ 70 /* zero out flags, and disable interrupts */
33 pushq $0 71 pushq $0
34 popfq 72 popfq
35 73
36 /* get physical address of control page now */ 74 /*
37 /* this is impossible after page table switch */ 75 * get physical address of control page now
76 * this is impossible after page table switch
77 */
38 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 78 movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
39 79
40 /* get physical address of page table now too */ 80 /* get physical address of page table now too */
41 movq PTR(PA_TABLE_PAGE)(%rsi), %rcx 81 movq PTR(PA_TABLE_PAGE)(%rsi), %r9
82
83 /* get physical address of swap page now */
84 movq PTR(PA_SWAP_PAGE)(%rsi), %r10
85
86 /* save some information for jumping back */
87 movq %r9, CP_PA_TABLE_PAGE(%r11)
88 movq %r10, CP_PA_SWAP_PAGE(%r11)
89 movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
42 90
43 /* Switch to the identity mapped page tables */ 91 /* Switch to the identity mapped page tables */
44 movq %rcx, %cr3 92 movq %r9, %cr3
45 93
46 /* setup a new stack at the end of the physical control page */ 94 /* setup a new stack at the end of the physical control page */
47 lea PAGE_SIZE(%r8), %rsp 95 lea PAGE_SIZE(%r8), %rsp
@@ -55,7 +103,8 @@ identity_mapped:
55 /* store the start address on the stack */ 103 /* store the start address on the stack */
56 pushq %rdx 104 pushq %rdx
57 105
58 /* Set cr0 to a known state: 106 /*
107 * Set cr0 to a known state:
59 * - Paging enabled 108 * - Paging enabled
60 * - Alignment check disabled 109 * - Alignment check disabled
61 * - Write protect disabled 110 * - Write protect disabled
@@ -68,7 +117,8 @@ identity_mapped:
68 orl $(X86_CR0_PG | X86_CR0_PE), %eax 117 orl $(X86_CR0_PG | X86_CR0_PE), %eax
69 movq %rax, %cr0 118 movq %rax, %cr0
70 119
71 /* Set cr4 to a known state: 120 /*
121 * Set cr4 to a known state:
72 * - physical address extension enabled 122 * - physical address extension enabled
73 */ 123 */
74 movq $X86_CR4_PAE, %rax 124 movq $X86_CR4_PAE, %rax
@@ -78,9 +128,87 @@ identity_mapped:
781: 1281:
79 129
80 /* Flush the TLB (needed?) */ 130 /* Flush the TLB (needed?) */
81 movq %rcx, %cr3 131 movq %r9, %cr3
132
133 movq %rcx, %r11
134 call swap_pages
135
136 /*
137 * To be certain of avoiding problems with self-modifying code
138 * I need to execute a serializing instruction here.
139 * So I flush the TLB by reloading %cr3 here, it's handy,
140 * and not processor dependent.
141 */
142 movq %cr3, %rax
143 movq %rax, %cr3
144
145 /*
146 * set all of the registers to known values
147 * leave %rsp alone
148 */
149
150 testq %r11, %r11
151 jnz 1f
152 xorq %rax, %rax
153 xorq %rbx, %rbx
154 xorq %rcx, %rcx
155 xorq %rdx, %rdx
156 xorq %rsi, %rsi
157 xorq %rdi, %rdi
158 xorq %rbp, %rbp
159 xorq %r8, %r8
160 xorq %r9, %r9
161 xorq %r10, %r9
162 xorq %r11, %r11
163 xorq %r12, %r12
164 xorq %r13, %r13
165 xorq %r14, %r14
166 xorq %r15, %r15
167
168 ret
169
1701:
171 popq %rdx
172 leaq PAGE_SIZE(%r10), %rsp
173 call *%rdx
174
175 /* get the re-entry point of the peer system */
176 movq 0(%rsp), %rbp
177 call 1f
1781:
179 popq %r8
180 subq $(1b - relocate_kernel), %r8
181 movq CP_PA_SWAP_PAGE(%r8), %r10
182 movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
183 movq CP_PA_TABLE_PAGE(%r8), %rax
184 movq %rax, %cr3
185 lea PAGE_SIZE(%r8), %rsp
186 call swap_pages
187 movq $virtual_mapped, %rax
188 pushq %rax
189 ret
190
191virtual_mapped:
192 movq RSP(%r8), %rsp
193 movq CR4(%r8), %rax
194 movq %rax, %cr4
195 movq CR3(%r8), %rax
196 movq CR0(%r8), %r8
197 movq %rax, %cr3
198 movq %r8, %cr0
199 movq %rbp, %rax
200
201 popf
202 popq %r15
203 popq %r14
204 popq %r13
205 popq %r12
206 popq %rbp
207 popq %rbx
208 ret
82 209
83 /* Do the copies */ 210 /* Do the copies */
211swap_pages:
84 movq %rdi, %rcx /* Put the page_list in %rcx */ 212 movq %rdi, %rcx /* Put the page_list in %rcx */
85 xorq %rdi, %rdi 213 xorq %rdi, %rdi
86 xorq %rsi, %rsi 214 xorq %rsi, %rsi
@@ -112,36 +240,27 @@ identity_mapped:
112 movq %rcx, %rsi /* For ever source page do a copy */ 240 movq %rcx, %rsi /* For ever source page do a copy */
113 andq $0xfffffffffffff000, %rsi 241 andq $0xfffffffffffff000, %rsi
114 242
243 movq %rdi, %rdx
244 movq %rsi, %rax
245
246 movq %r10, %rdi
115 movq $512, %rcx 247 movq $512, %rcx
116 rep ; movsq 248 rep ; movsq
117 jmp 0b
1183:
119
120 /* To be certain of avoiding problems with self-modifying code
121 * I need to execute a serializing instruction here.
122 * So I flush the TLB by reloading %cr3 here, it's handy,
123 * and not processor dependent.
124 */
125 movq %cr3, %rax
126 movq %rax, %cr3
127 249
128 /* set all of the registers to known values */ 250 movq %rax, %rdi
129 /* leave %rsp alone */ 251 movq %rdx, %rsi
252 movq $512, %rcx
253 rep ; movsq
130 254
131 xorq %rax, %rax 255 movq %rdx, %rdi
132 xorq %rbx, %rbx 256 movq %r10, %rsi
133 xorq %rcx, %rcx 257 movq $512, %rcx
134 xorq %rdx, %rdx 258 rep ; movsq
135 xorq %rsi, %rsi
136 xorq %rdi, %rdi
137 xorq %rbp, %rbp
138 xorq %r8, %r8
139 xorq %r9, %r9
140 xorq %r10, %r9
141 xorq %r11, %r11
142 xorq %r12, %r12
143 xorq %r13, %r13
144 xorq %r14, %r14
145 xorq %r15, %r15
146 259
260 lea PAGE_SIZE(%rax), %rsi
261 jmp 0b
2623:
147 ret 263 ret
264
265 .globl kexec_control_code_size
266.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index dd6f2b71561b..5d465b207e72 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -1,14 +1,14 @@
1/* 1/*
2 * RTC related functions 2 * RTC related functions
3 */ 3 */
4#include <linux/platform_device.h>
5#include <linux/mc146818rtc.h>
4#include <linux/acpi.h> 6#include <linux/acpi.h>
5#include <linux/bcd.h> 7#include <linux/bcd.h>
6#include <linux/mc146818rtc.h>
7#include <linux/platform_device.h>
8#include <linux/pnp.h> 8#include <linux/pnp.h>
9 9
10#include <asm/time.h>
11#include <asm/vsyscall.h> 10#include <asm/vsyscall.h>
11#include <asm/time.h>
12 12
13#ifdef CONFIG_X86_32 13#ifdef CONFIG_X86_32
14/* 14/*
@@ -16,9 +16,9 @@
16 * register we are working with. It is required for NMI access to the 16 * register we are working with. It is required for NMI access to the
17 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. 17 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
18 */ 18 */
19volatile unsigned long cmos_lock = 0; 19volatile unsigned long cmos_lock;
20EXPORT_SYMBOL(cmos_lock); 20EXPORT_SYMBOL(cmos_lock);
21#endif 21#endif /* CONFIG_X86_32 */
22 22
23/* For two digit years assume time is always after that */ 23/* For two digit years assume time is always after that */
24#define CMOS_YEARS_OFFS 2000 24#define CMOS_YEARS_OFFS 2000
@@ -38,9 +38,9 @@ EXPORT_SYMBOL(rtc_lock);
38 */ 38 */
39int mach_set_rtc_mmss(unsigned long nowtime) 39int mach_set_rtc_mmss(unsigned long nowtime)
40{ 40{
41 int retval = 0;
42 int real_seconds, real_minutes, cmos_minutes; 41 int real_seconds, real_minutes, cmos_minutes;
43 unsigned char save_control, save_freq_select; 42 unsigned char save_control, save_freq_select;
43 int retval = 0;
44 44
45 /* tell the clock it's being set */ 45 /* tell the clock it's being set */
46 save_control = CMOS_READ(RTC_CONTROL); 46 save_control = CMOS_READ(RTC_CONTROL);
@@ -72,8 +72,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
72 real_seconds = bin2bcd(real_seconds); 72 real_seconds = bin2bcd(real_seconds);
73 real_minutes = bin2bcd(real_minutes); 73 real_minutes = bin2bcd(real_minutes);
74 } 74 }
75 CMOS_WRITE(real_seconds,RTC_SECONDS); 75 CMOS_WRITE(real_seconds, RTC_SECONDS);
76 CMOS_WRITE(real_minutes,RTC_MINUTES); 76 CMOS_WRITE(real_minutes, RTC_MINUTES);
77 } else { 77 } else {
78 printk(KERN_WARNING 78 printk(KERN_WARNING
79 "set_rtc_mmss: can't update from %d to %d\n", 79 "set_rtc_mmss: can't update from %d to %d\n",
@@ -151,6 +151,7 @@ unsigned char rtc_cmos_read(unsigned char addr)
151 outb(addr, RTC_PORT(0)); 151 outb(addr, RTC_PORT(0));
152 val = inb(RTC_PORT(1)); 152 val = inb(RTC_PORT(1));
153 lock_cmos_suffix(addr); 153 lock_cmos_suffix(addr);
154
154 return val; 155 return val;
155} 156}
156EXPORT_SYMBOL(rtc_cmos_read); 157EXPORT_SYMBOL(rtc_cmos_read);
@@ -166,8 +167,8 @@ EXPORT_SYMBOL(rtc_cmos_write);
166 167
167static int set_rtc_mmss(unsigned long nowtime) 168static int set_rtc_mmss(unsigned long nowtime)
168{ 169{
169 int retval;
170 unsigned long flags; 170 unsigned long flags;
171 int retval;
171 172
172 spin_lock_irqsave(&rtc_lock, flags); 173 spin_lock_irqsave(&rtc_lock, flags);
173 retval = set_wallclock(nowtime); 174 retval = set_wallclock(nowtime);
@@ -242,6 +243,7 @@ static __init int add_rtc_cmos(void)
242 platform_device_register(&rtc_device); 243 platform_device_register(&rtc_device);
243 dev_info(&rtc_device.dev, 244 dev_info(&rtc_device.dev,
244 "registered platform RTC device (no PNP device found)\n"); 245 "registered platform RTC device (no PNP device found)\n");
246
245 return 0; 247 return 0;
246} 248}
247device_initcall(add_rtc_cmos); 249device_initcall(add_rtc_cmos);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5b85759e7972..b4158439bf63 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,8 +112,13 @@
112#define ARCH_SETUP 112#define ARCH_SETUP
113#endif 113#endif
114 114
115RESERVE_BRK(dmi_alloc, 65536);
116
115unsigned int boot_cpu_id __read_mostly; 117unsigned int boot_cpu_id __read_mostly;
116 118
119static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
120unsigned long _brk_end = (unsigned long)__brk_base;
121
117#ifdef CONFIG_X86_64 122#ifdef CONFIG_X86_64
118int default_cpu_present_to_apicid(int mps_cpu) 123int default_cpu_present_to_apicid(int mps_cpu)
119{ 124{
@@ -158,12 +163,6 @@ static struct resource bss_resource = {
158 163
159 164
160#ifdef CONFIG_X86_32 165#ifdef CONFIG_X86_32
161/* This value is set up by the early boot code to point to the value
162 immediately after the boot time page tables. It contains a *physical*
163 address, and must not be in the .bss segment! */
164unsigned long init_pg_tables_start __initdata = ~0UL;
165unsigned long init_pg_tables_end __initdata = ~0UL;
166
167static struct resource video_ram_resource = { 166static struct resource video_ram_resource = {
168 .name = "Video RAM area", 167 .name = "Video RAM area",
169 .start = 0xa0000, 168 .start = 0xa0000,
@@ -202,7 +201,9 @@ struct ist_info ist_info;
202#endif 201#endif
203 202
204#else 203#else
205struct cpuinfo_x86 boot_cpu_data __read_mostly; 204struct cpuinfo_x86 boot_cpu_data __read_mostly = {
205 .x86_phys_bits = MAX_PHYSMEM_BITS,
206};
206EXPORT_SYMBOL(boot_cpu_data); 207EXPORT_SYMBOL(boot_cpu_data);
207#endif 208#endif
208 209
@@ -217,12 +218,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE;
217int bootloader_type; 218int bootloader_type;
218 219
219/* 220/*
220 * Early DMI memory
221 */
222int dmi_alloc_index;
223char dmi_alloc_data[DMI_MAX_DATA];
224
225/*
226 * Setup options 221 * Setup options
227 */ 222 */
228struct screen_info screen_info; 223struct screen_info screen_info;
@@ -267,6 +262,35 @@ static inline void copy_edd(void)
267} 262}
268#endif 263#endif
269 264
265void * __init extend_brk(size_t size, size_t align)
266{
267 size_t mask = align - 1;
268 void *ret;
269
270 BUG_ON(_brk_start == 0);
271 BUG_ON(align & mask);
272
273 _brk_end = (_brk_end + mask) & ~mask;
274 BUG_ON((char *)(_brk_end + size) > __brk_limit);
275
276 ret = (void *)_brk_end;
277 _brk_end += size;
278
279 memset(ret, 0, size);
280
281 return ret;
282}
283
284static void __init reserve_brk(void)
285{
286 if (_brk_end > _brk_start)
287 reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
288
289 /* Mark brk area as locked down and no longer taking any
290 new allocations */
291 _brk_start = 0;
292}
293
270#ifdef CONFIG_BLK_DEV_INITRD 294#ifdef CONFIG_BLK_DEV_INITRD
271 295
272#ifdef CONFIG_X86_32 296#ifdef CONFIG_X86_32
@@ -600,19 +624,7 @@ static int __init setup_elfcorehdr(char *arg)
600early_param("elfcorehdr", setup_elfcorehdr); 624early_param("elfcorehdr", setup_elfcorehdr);
601#endif 625#endif
602 626
603static int __init default_update_apic(void) 627static struct x86_quirks default_x86_quirks __initdata;
604{
605#ifdef CONFIG_SMP
606 if (!apic->wakeup_cpu)
607 apic->wakeup_cpu = wakeup_secondary_cpu_via_init;
608#endif
609
610 return 0;
611}
612
613static struct x86_quirks default_x86_quirks __initdata = {
614 .update_apic = default_update_apic,
615};
616 628
617struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; 629struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
618 630
@@ -727,11 +739,7 @@ void __init setup_arch(char **cmdline_p)
727 init_mm.start_code = (unsigned long) _text; 739 init_mm.start_code = (unsigned long) _text;
728 init_mm.end_code = (unsigned long) _etext; 740 init_mm.end_code = (unsigned long) _etext;
729 init_mm.end_data = (unsigned long) _edata; 741 init_mm.end_data = (unsigned long) _edata;
730#ifdef CONFIG_X86_32 742 init_mm.brk = _brk_end;
731 init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
732#else
733 init_mm.brk = (unsigned long) &_end;
734#endif
735 743
736 code_resource.start = virt_to_phys(_text); 744 code_resource.start = virt_to_phys(_text);
737 code_resource.end = virt_to_phys(_etext)-1; 745 code_resource.end = virt_to_phys(_etext)-1;
@@ -782,6 +790,9 @@ void __init setup_arch(char **cmdline_p)
782 790
783 finish_e820_parsing(); 791 finish_e820_parsing();
784 792
793 if (efi_enabled)
794 efi_init();
795
785 dmi_scan_machine(); 796 dmi_scan_machine();
786 797
787 dmi_check_system(bad_bios_dmi_table); 798 dmi_check_system(bad_bios_dmi_table);
@@ -801,8 +812,6 @@ void __init setup_arch(char **cmdline_p)
801 insert_resource(&iomem_resource, &data_resource); 812 insert_resource(&iomem_resource, &data_resource);
802 insert_resource(&iomem_resource, &bss_resource); 813 insert_resource(&iomem_resource, &bss_resource);
803 814
804 if (efi_enabled)
805 efi_init();
806 815
807#ifdef CONFIG_X86_32 816#ifdef CONFIG_X86_32
808 if (ppro_with_ram_bug()) { 817 if (ppro_with_ram_bug()) {
@@ -851,6 +860,8 @@ void __init setup_arch(char **cmdline_p)
851 setup_bios_corruption_check(); 860 setup_bios_corruption_check();
852#endif 861#endif
853 862
863 reserve_brk();
864
854 /* max_pfn_mapped is updated here */ 865 /* max_pfn_mapped is updated here */
855 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 866 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
856 max_pfn_mapped = max_low_pfn_mapped; 867 max_pfn_mapped = max_low_pfn_mapped;
@@ -875,9 +886,7 @@ void __init setup_arch(char **cmdline_p)
875 886
876 reserve_initrd(); 887 reserve_initrd();
877 888
878#ifdef CONFIG_X86_64
879 vsmp_init(); 889 vsmp_init();
880#endif
881 890
882 io_delay_init(); 891 io_delay_init();
883 892
@@ -1040,7 +1049,6 @@ void __init x86_quirk_trap_init(void)
1040static struct irqaction irq0 = { 1049static struct irqaction irq0 = {
1041 .handler = timer_interrupt, 1050 .handler = timer_interrupt,
1042 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, 1051 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
1043 .mask = CPU_MASK_NONE,
1044 .name = "timer" 1052 .name = "timer"
1045}; 1053};
1046 1054
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d992e6cff730..3a97a4cf1872 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,6 +7,7 @@
7#include <linux/crash_dump.h> 7#include <linux/crash_dump.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9#include <linux/topology.h> 9#include <linux/topology.h>
10#include <linux/pfn.h>
10#include <asm/sections.h> 11#include <asm/sections.h>
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/setup.h> 13#include <asm/setup.h>
@@ -41,6 +42,295 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
41}; 42};
42EXPORT_SYMBOL(__per_cpu_offset); 43EXPORT_SYMBOL(__per_cpu_offset);
43 44
45/*
46 * On x86_64 symbols referenced from code should be reachable using
47 * 32bit relocations. Reserve space for static percpu variables in
48 * modules so that they are always served from the first chunk which
49 * is located at the percpu segment base. On x86_32, anything can
50 * address anywhere. No need to reserve space in the first chunk.
51 */
52#ifdef CONFIG_X86_64
53#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE
54#else
55#define PERCPU_FIRST_CHUNK_RESERVE 0
56#endif
57
58/**
59 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
60 *
61 * If NUMA is not configured or there is only one NUMA node available,
62 * there is no reason to consider NUMA. This function determines
63 * whether percpu allocation should consider NUMA or not.
64 *
65 * RETURNS:
66 * true if NUMA should be considered; otherwise, false.
67 */
68static bool __init pcpu_need_numa(void)
69{
70#ifdef CONFIG_NEED_MULTIPLE_NODES
71 pg_data_t *last = NULL;
72 unsigned int cpu;
73
74 for_each_possible_cpu(cpu) {
75 int node = early_cpu_to_node(cpu);
76
77 if (node_online(node) && NODE_DATA(node) &&
78 last && last != NODE_DATA(node))
79 return true;
80
81 last = NODE_DATA(node);
82 }
83#endif
84 return false;
85}
86
87/**
88 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
89 * @cpu: cpu to allocate for
90 * @size: size allocation in bytes
91 * @align: alignment
92 *
93 * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
94 * does the right thing for NUMA regardless of the current
95 * configuration.
96 *
97 * RETURNS:
98 * Pointer to the allocated area on success, NULL on failure.
99 */
100static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
101 unsigned long align)
102{
103 const unsigned long goal = __pa(MAX_DMA_ADDRESS);
104#ifdef CONFIG_NEED_MULTIPLE_NODES
105 int node = early_cpu_to_node(cpu);
106 void *ptr;
107
108 if (!node_online(node) || !NODE_DATA(node)) {
109 ptr = __alloc_bootmem_nopanic(size, align, goal);
110 pr_info("cpu %d has no node %d or node-local memory\n",
111 cpu, node);
112 pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
113 cpu, size, __pa(ptr));
114 } else {
115 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
116 size, align, goal);
117 pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
118 "%016lx\n", cpu, size, node, __pa(ptr));
119 }
120 return ptr;
121#else
122 return __alloc_bootmem_nopanic(size, align, goal);
123#endif
124}
125
126/*
127 * Remap allocator
128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping.
131 * As PMD page is quite large, only part of it is used for the first
132 * chunk. Unused part is returned to the bootmem allocator.
133 *
134 * So, the PMD pages are mapped twice - once to the physical mapping
135 * and to the vmalloc area for the first percpu chunk. The double
136 * mapping does add one more PMD TLB entry pressure but still is much
137 * better than only using 4k mappings while still being NUMA friendly.
138 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES
140static size_t pcpur_size __initdata;
141static void **pcpur_ptrs __initdata;
142
143static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
144{
145 size_t off = (size_t)pageno << PAGE_SHIFT;
146
147 if (off >= pcpur_size)
148 return NULL;
149
150 return virt_to_page(pcpur_ptrs[cpu] + off);
151}
152
153static ssize_t __init setup_pcpu_remap(size_t static_size)
154{
155 static struct vm_struct vm;
156 size_t ptrs_size, dyn_size;
157 unsigned int cpu;
158 ssize_t ret;
159
160 /*
161 * If large page isn't supported, there's no benefit in doing
162 * this. Also, on non-NUMA, embedding is better.
163 */
164 if (!cpu_has_pse || !pcpu_need_numa())
165 return -EINVAL;
166
167 /*
168 * Currently supports only single page. Supporting multiple
169 * pages won't be too difficult if it ever becomes necessary.
170 */
171 pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
172 PERCPU_DYNAMIC_RESERVE);
173 if (pcpur_size > PMD_SIZE) {
174 pr_warning("PERCPU: static data is larger than large page, "
175 "can't use large page\n");
176 return -EINVAL;
177 }
178 dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
179
180 /* allocate pointer array and alloc large pages */
181 ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
182 pcpur_ptrs = alloc_bootmem(ptrs_size);
183
184 for_each_possible_cpu(cpu) {
185 pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
186 if (!pcpur_ptrs[cpu])
187 goto enomem;
188
189 /*
190 * Only use pcpur_size bytes and give back the rest.
191 *
192 * Ingo: The 2MB up-rounding bootmem is needed to make
193 * sure the partial 2MB page is still fully RAM - it's
194 * not well-specified to have a PAT-incompatible area
195 * (unmapped RAM, device memory, etc.) in that hole.
196 */
197 free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
198 PMD_SIZE - pcpur_size);
199
200 memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
201 }
202
203 /* allocate address and map */
204 vm.flags = VM_ALLOC;
205 vm.size = num_possible_cpus() * PMD_SIZE;
206 vm_area_register_early(&vm, PMD_SIZE);
207
208 for_each_possible_cpu(cpu) {
209 pmd_t *pmd;
210
211 pmd = populate_extra_pmd((unsigned long)vm.addr
212 + cpu * PMD_SIZE);
213 set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
214 PAGE_KERNEL_LARGE));
215 }
216
217 /* we're ready, commit */
218 pr_info("PERCPU: Remapped at %p with large pages, static data "
219 "%zu bytes\n", vm.addr, static_size);
220
221 ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
222 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
223 PMD_SIZE, vm.addr, NULL);
224 goto out_free_ar;
225
226enomem:
227 for_each_possible_cpu(cpu)
228 if (pcpur_ptrs[cpu])
229 free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
230 ret = -ENOMEM;
231out_free_ar:
232 free_bootmem(__pa(pcpur_ptrs), ptrs_size);
233 return ret;
234}
235#else
236static ssize_t __init setup_pcpu_remap(size_t static_size)
237{
238 return -EINVAL;
239}
240#endif
241
242/*
243 * Embedding allocator
244 *
245 * The first chunk is sized to just contain the static area plus
246 * module and dynamic reserves and embedded into linear physical
247 * mapping so that it can use PMD mapping without additional TLB
248 * pressure.
249 */
250static ssize_t __init setup_pcpu_embed(size_t static_size)
251{
252 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
253
254 /*
255 * If large page isn't supported, there's no benefit in doing
256 * this. Also, embedding allocation doesn't play well with
257 * NUMA.
258 */
259 if (!cpu_has_pse || pcpu_need_numa())
260 return -EINVAL;
261
262 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
263 reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
264}
265
266/*
267 * 4k page allocator
268 *
269 * This is the basic allocator. Static percpu area is allocated
270 * page-by-page and most of initialization is done by the generic
271 * setup function.
272 */
273static struct page **pcpu4k_pages __initdata;
274static int pcpu4k_nr_static_pages __initdata;
275
276static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
277{
278 if (pageno < pcpu4k_nr_static_pages)
279 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
280 return NULL;
281}
282
283static void __init pcpu4k_populate_pte(unsigned long addr)
284{
285 populate_extra_pte(addr);
286}
287
288static ssize_t __init setup_pcpu_4k(size_t static_size)
289{
290 size_t pages_size;
291 unsigned int cpu;
292 int i, j;
293 ssize_t ret;
294
295 pcpu4k_nr_static_pages = PFN_UP(static_size);
296
297 /* unaligned allocations can't be freed, round up to page size */
298 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
299 * sizeof(pcpu4k_pages[0]));
300 pcpu4k_pages = alloc_bootmem(pages_size);
301
302 /* allocate and copy */
303 j = 0;
304 for_each_possible_cpu(cpu)
305 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
306 void *ptr;
307
308 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
309 if (!ptr)
310 goto enomem;
311
312 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
313 pcpu4k_pages[j++] = virt_to_page(ptr);
314 }
315
316 /* we're ready, commit */
317 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
318 pcpu4k_nr_static_pages, static_size);
319
320 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
321 PERCPU_FIRST_CHUNK_RESERVE, -1,
322 -1, NULL, pcpu4k_populate_pte);
323 goto out_free_ar;
324
325enomem:
326 while (--j >= 0)
327 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
328 ret = -ENOMEM;
329out_free_ar:
330 free_bootmem(__pa(pcpu4k_pages), pages_size);
331 return ret;
332}
333
44static inline void setup_percpu_segment(int cpu) 334static inline void setup_percpu_segment(int cpu)
45{ 335{
46#ifdef CONFIG_X86_32 336#ifdef CONFIG_X86_32
@@ -61,38 +351,35 @@ static inline void setup_percpu_segment(int cpu)
61 */ 351 */
62void __init setup_per_cpu_areas(void) 352void __init setup_per_cpu_areas(void)
63{ 353{
64 ssize_t size; 354 size_t static_size = __per_cpu_end - __per_cpu_start;
65 char *ptr; 355 unsigned int cpu;
66 int cpu; 356 unsigned long delta;
67 357 size_t pcpu_unit_size;
68 /* Copy section for each CPU (we discard the original) */ 358 ssize_t ret;
69 size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
70 359
71 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 360 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
72 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 361 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
73 362
74 pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); 363 /*
364 * Allocate percpu area. If PSE is supported, try to make use
365 * of large page mappings. Please read comments on top of
366 * each allocator for details.
367 */
368 ret = setup_pcpu_remap(static_size);
369 if (ret < 0)
370 ret = setup_pcpu_embed(static_size);
371 if (ret < 0)
372 ret = setup_pcpu_4k(static_size);
373 if (ret < 0)
374 panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
375 static_size, ret);
75 376
76 for_each_possible_cpu(cpu) { 377 pcpu_unit_size = ret;
77#ifndef CONFIG_NEED_MULTIPLE_NODES
78 ptr = alloc_bootmem_pages(size);
79#else
80 int node = early_cpu_to_node(cpu);
81 if (!node_online(node) || !NODE_DATA(node)) {
82 ptr = alloc_bootmem_pages(size);
83 pr_info("cpu %d has no node %d or node-local memory\n",
84 cpu, node);
85 pr_debug("per cpu data for cpu%d at %016lx\n",
86 cpu, __pa(ptr));
87 } else {
88 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
89 pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
90 cpu, node, __pa(ptr));
91 }
92#endif
93 378
94 memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); 379 /* alrighty, percpu areas up and running */
95 per_cpu_offset(cpu) = ptr - __per_cpu_start; 380 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
381 for_each_possible_cpu(cpu) {
382 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
96 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); 383 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
97 per_cpu(cpu_number, cpu) = cpu; 384 per_cpu(cpu_number, cpu) = cpu;
98 setup_percpu_segment(cpu); 385 setup_percpu_segment(cpu);
@@ -125,8 +412,6 @@ void __init setup_per_cpu_areas(void)
125 */ 412 */
126 if (cpu == boot_cpu_id) 413 if (cpu == boot_cpu_id)
127 switch_to_new_gdt(cpu); 414 switch_to_new_gdt(cpu);
128
129 DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
130 } 415 }
131 416
132 /* indicate the early static arrays will soon be gone */ 417 /* indicate the early static arrays will soon be gone */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4d3441018065..611615a92c90 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -187,6 +187,77 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
187/* 187/*
188 * Set up a signal frame. 188 * Set up a signal frame.
189 */ 189 */
190
191/*
192 * Determine which stack to use..
193 */
194static unsigned long align_sigframe(unsigned long sp)
195{
196#ifdef CONFIG_X86_32
197 /*
198 * Align the stack pointer according to the i386 ABI,
199 * i.e. so that on function entry ((sp + 4) & 15) == 0.
200 */
201 sp = ((sp + 4) & -16ul) - 4;
202#else /* !CONFIG_X86_32 */
203 sp = round_down(sp, 16) - 8;
204#endif
205 return sp;
206}
207
208static inline void __user *
209get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
210 void __user **fpstate)
211{
212 /* Default to using normal stack */
213 unsigned long sp = regs->sp;
214 int onsigstack = on_sig_stack(sp);
215
216#ifdef CONFIG_X86_64
217 /* redzone */
218 sp -= 128;
219#endif /* CONFIG_X86_64 */
220
221 if (!onsigstack) {
222 /* This is the X/Open sanctioned signal stack switching. */
223 if (ka->sa.sa_flags & SA_ONSTACK) {
224 if (current->sas_ss_size)
225 sp = current->sas_ss_sp + current->sas_ss_size;
226 } else {
227#ifdef CONFIG_X86_32
228 /* This is the legacy signal stack switching. */
229 if ((regs->ss & 0xffff) != __USER_DS &&
230 !(ka->sa.sa_flags & SA_RESTORER) &&
231 ka->sa.sa_restorer)
232 sp = (unsigned long) ka->sa.sa_restorer;
233#endif /* CONFIG_X86_32 */
234 }
235 }
236
237 if (used_math()) {
238 sp -= sig_xstate_size;
239#ifdef CONFIG_X86_64
240 sp = round_down(sp, 64);
241#endif /* CONFIG_X86_64 */
242 *fpstate = (void __user *)sp;
243 }
244
245 sp = align_sigframe(sp - frame_size);
246
247 /*
248 * If we are on the alternate signal stack and would overflow it, don't.
249 * Return an always-bogus address instead so we will die with SIGSEGV.
250 */
251 if (onsigstack && !likely(on_sig_stack(sp)))
252 return (void __user *)-1L;
253
254 /* save i387 state */
255 if (used_math() && save_i387_xstate(*fpstate) < 0)
256 return (void __user *)-1L;
257
258 return (void __user *)sp;
259}
260
190#ifdef CONFIG_X86_32 261#ifdef CONFIG_X86_32
191static const struct { 262static const struct {
192 u16 poplmovl; 263 u16 poplmovl;
@@ -210,54 +281,6 @@ static const struct {
210 0 281 0
211}; 282};
212 283
213/*
214 * Determine which stack to use..
215 */
216static inline void __user *
217get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
218 void **fpstate)
219{
220 unsigned long sp;
221
222 /* Default to using normal stack */
223 sp = regs->sp;
224
225 /*
226 * If we are on the alternate signal stack and would overflow it, don't.
227 * Return an always-bogus address instead so we will die with SIGSEGV.
228 */
229 if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
230 return (void __user *) -1L;
231
232 /* This is the X/Open sanctioned signal stack switching. */
233 if (ka->sa.sa_flags & SA_ONSTACK) {
234 if (sas_ss_flags(sp) == 0)
235 sp = current->sas_ss_sp + current->sas_ss_size;
236 } else {
237 /* This is the legacy signal stack switching. */
238 if ((regs->ss & 0xffff) != __USER_DS &&
239 !(ka->sa.sa_flags & SA_RESTORER) &&
240 ka->sa.sa_restorer)
241 sp = (unsigned long) ka->sa.sa_restorer;
242 }
243
244 if (used_math()) {
245 sp = sp - sig_xstate_size;
246 *fpstate = (struct _fpstate *) sp;
247 if (save_i387_xstate(*fpstate) < 0)
248 return (void __user *)-1L;
249 }
250
251 sp -= frame_size;
252 /*
253 * Align the stack pointer according to the i386 ABI,
254 * i.e. so that on function entry ((sp + 4) & 15) == 0.
255 */
256 sp = ((sp + 4) & -16ul) - 4;
257
258 return (void __user *) sp;
259}
260
261static int 284static int
262__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, 285__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
263 struct pt_regs *regs) 286 struct pt_regs *regs)
@@ -388,24 +411,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
388 return 0; 411 return 0;
389} 412}
390#else /* !CONFIG_X86_32 */ 413#else /* !CONFIG_X86_32 */
391/*
392 * Determine which stack to use..
393 */
394static void __user *
395get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
396{
397 /* Default to using normal stack - redzone*/
398 sp -= 128;
399
400 /* This is the X/Open sanctioned signal stack switching. */
401 if (ka->sa.sa_flags & SA_ONSTACK) {
402 if (sas_ss_flags(sp) == 0)
403 sp = current->sas_ss_sp + current->sas_ss_size;
404 }
405
406 return (void __user *)round_down(sp - size, 64);
407}
408
409static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 414static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
410 sigset_t *set, struct pt_regs *regs) 415 sigset_t *set, struct pt_regs *regs)
411{ 416{
@@ -414,15 +419,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
414 int err = 0; 419 int err = 0;
415 struct task_struct *me = current; 420 struct task_struct *me = current;
416 421
417 if (used_math()) { 422 frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
418 fp = get_stack(ka, regs->sp, sig_xstate_size);
419 frame = (void __user *)round_down(
420 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
421
422 if (save_i387_xstate(fp) < 0)
423 return -EFAULT;
424 } else
425 frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
426 423
427 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 424 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
428 return -EFAULT; 425 return -EFAULT;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9ce666387f37..58d24ef917d8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -101,29 +101,20 @@ EXPORT_SYMBOL(smp_num_siblings);
101DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; 101DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
102 102
103/* representing HT siblings of each logical CPU */ 103/* representing HT siblings of each logical CPU */
104DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); 104DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
105EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); 105EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
106 106
107/* representing HT and core siblings of each logical CPU */ 107/* representing HT and core siblings of each logical CPU */
108DEFINE_PER_CPU(cpumask_t, cpu_core_map); 108DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
109EXPORT_PER_CPU_SYMBOL(cpu_core_map); 109EXPORT_PER_CPU_SYMBOL(cpu_core_map);
110 110
111/* Per CPU bogomips and other parameters */ 111/* Per CPU bogomips and other parameters */
112DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 112DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
113EXPORT_PER_CPU_SYMBOL(cpu_info); 113EXPORT_PER_CPU_SYMBOL(cpu_info);
114 114
115static atomic_t init_deasserted; 115atomic_t init_deasserted;
116
117
118/* Set if we find a B stepping CPU */
119static int __cpuinitdata smp_b_stepping;
120 116
121#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) 117#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
122
123/* which logical CPUs are on which nodes */
124cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly =
125 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
126EXPORT_SYMBOL(node_to_cpumask_map);
127/* which node each logical CPU is on */ 118/* which node each logical CPU is on */
128int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; 119int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
129EXPORT_SYMBOL(cpu_to_node_map); 120EXPORT_SYMBOL(cpu_to_node_map);
@@ -132,7 +123,7 @@ EXPORT_SYMBOL(cpu_to_node_map);
132static void map_cpu_to_node(int cpu, int node) 123static void map_cpu_to_node(int cpu, int node)
133{ 124{
134 printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node); 125 printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
135 cpumask_set_cpu(cpu, &node_to_cpumask_map[node]); 126 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
136 cpu_to_node_map[cpu] = node; 127 cpu_to_node_map[cpu] = node;
137} 128}
138 129
@@ -143,7 +134,7 @@ static void unmap_cpu_to_node(int cpu)
143 134
144 printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu); 135 printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
145 for (node = 0; node < MAX_NUMNODES; node++) 136 for (node = 0; node < MAX_NUMNODES; node++)
146 cpumask_clear_cpu(cpu, &node_to_cpumask_map[node]); 137 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
147 cpu_to_node_map[cpu] = 0; 138 cpu_to_node_map[cpu] = 0;
148} 139}
149#else /* !(CONFIG_NUMA && CONFIG_X86_32) */ 140#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
@@ -271,8 +262,6 @@ static void __cpuinit smp_callin(void)
271 cpumask_set_cpu(cpuid, cpu_callin_mask); 262 cpumask_set_cpu(cpuid, cpu_callin_mask);
272} 263}
273 264
274static int __cpuinitdata unsafe_smp;
275
276/* 265/*
277 * Activate a secondary processor. 266 * Activate a secondary processor.
278 */ 267 */
@@ -307,7 +296,7 @@ notrace static void __cpuinit start_secondary(void *unused)
307 __flush_tlb_all(); 296 __flush_tlb_all();
308#endif 297#endif
309 298
310 /* This must be done before setting cpu_online_map */ 299 /* This must be done before setting cpu_online_mask */
311 set_cpu_sibling_map(raw_smp_processor_id()); 300 set_cpu_sibling_map(raw_smp_processor_id());
312 wmb(); 301 wmb();
313 302
@@ -340,75 +329,22 @@ notrace static void __cpuinit start_secondary(void *unused)
340 cpu_idle(); 329 cpu_idle();
341} 330}
342 331
343static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) 332#ifdef CONFIG_CPUMASK_OFFSTACK
333/* In this case, llc_shared_map is a pointer to a cpumask. */
334static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
335 const struct cpuinfo_x86 *src)
344{ 336{
345 /* 337 struct cpumask *llc = dst->llc_shared_map;
346 * Mask B, Pentium, but not Pentium MMX 338 *dst = *src;
347 */ 339 dst->llc_shared_map = llc;
348 if (c->x86_vendor == X86_VENDOR_INTEL &&
349 c->x86 == 5 &&
350 c->x86_mask >= 1 && c->x86_mask <= 4 &&
351 c->x86_model <= 3)
352 /*
353 * Remember we have B step Pentia with bugs
354 */
355 smp_b_stepping = 1;
356
357 /*
358 * Certain Athlons might work (for various values of 'work') in SMP
359 * but they are not certified as MP capable.
360 */
361 if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
362
363 if (num_possible_cpus() == 1)
364 goto valid_k7;
365
366 /* Athlon 660/661 is valid. */
367 if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
368 (c->x86_mask == 1)))
369 goto valid_k7;
370
371 /* Duron 670 is valid */
372 if ((c->x86_model == 7) && (c->x86_mask == 0))
373 goto valid_k7;
374
375 /*
376 * Athlon 662, Duron 671, and Athlon >model 7 have capability
377 * bit. It's worth noting that the A5 stepping (662) of some
378 * Athlon XP's have the MP bit set.
379 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
380 * more.
381 */
382 if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
383 ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
384 (c->x86_model > 7))
385 if (cpu_has_mp)
386 goto valid_k7;
387
388 /* If we get here, not a certified SMP capable AMD system. */
389 unsafe_smp = 1;
390 }
391
392valid_k7:
393 ;
394} 340}
395 341#else
396static void __cpuinit smp_checks(void) 342static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
343 const struct cpuinfo_x86 *src)
397{ 344{
398 if (smp_b_stepping) 345 *dst = *src;
399 printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
400 "with B stepping processors.\n");
401
402 /*
403 * Don't taint if we are running SMP kernel on a single non-MP
404 * approved Athlon
405 */
406 if (unsafe_smp && num_online_cpus() > 1) {
407 printk(KERN_INFO "WARNING: This combination of AMD"
408 "processors is not suitable for SMP.\n");
409 add_taint(TAINT_UNSAFE_SMP);
410 }
411} 346}
347#endif /* CONFIG_CPUMASK_OFFSTACK */
412 348
413/* 349/*
414 * The bootstrap kernel entry code has set these up. Save them for 350 * The bootstrap kernel entry code has set these up. Save them for
@@ -419,11 +355,10 @@ void __cpuinit smp_store_cpu_info(int id)
419{ 355{
420 struct cpuinfo_x86 *c = &cpu_data(id); 356 struct cpuinfo_x86 *c = &cpu_data(id);
421 357
422 *c = boot_cpu_data; 358 copy_cpuinfo_x86(c, &boot_cpu_data);
423 c->cpu_index = id; 359 c->cpu_index = id;
424 if (id != 0) 360 if (id != 0)
425 identify_secondary_cpu(c); 361 identify_secondary_cpu(c);
426 smp_apply_quirks(c);
427} 362}
428 363
429 364
@@ -444,15 +379,15 @@ void __cpuinit set_cpu_sibling_map(int cpu)
444 cpumask_set_cpu(cpu, cpu_sibling_mask(i)); 379 cpumask_set_cpu(cpu, cpu_sibling_mask(i));
445 cpumask_set_cpu(i, cpu_core_mask(cpu)); 380 cpumask_set_cpu(i, cpu_core_mask(cpu));
446 cpumask_set_cpu(cpu, cpu_core_mask(i)); 381 cpumask_set_cpu(cpu, cpu_core_mask(i));
447 cpumask_set_cpu(i, &c->llc_shared_map); 382 cpumask_set_cpu(i, c->llc_shared_map);
448 cpumask_set_cpu(cpu, &o->llc_shared_map); 383 cpumask_set_cpu(cpu, o->llc_shared_map);
449 } 384 }
450 } 385 }
451 } else { 386 } else {
452 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 387 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
453 } 388 }
454 389
455 cpumask_set_cpu(cpu, &c->llc_shared_map); 390 cpumask_set_cpu(cpu, c->llc_shared_map);
456 391
457 if (current_cpu_data.x86_max_cores == 1) { 392 if (current_cpu_data.x86_max_cores == 1) {
458 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); 393 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
@@ -463,8 +398,8 @@ void __cpuinit set_cpu_sibling_map(int cpu)
463 for_each_cpu(i, cpu_sibling_setup_mask) { 398 for_each_cpu(i, cpu_sibling_setup_mask) {
464 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 399 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
465 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 400 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
466 cpumask_set_cpu(i, &c->llc_shared_map); 401 cpumask_set_cpu(i, c->llc_shared_map);
467 cpumask_set_cpu(cpu, &cpu_data(i).llc_shared_map); 402 cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map);
468 } 403 }
469 if (c->phys_proc_id == cpu_data(i).phys_proc_id) { 404 if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
470 cpumask_set_cpu(i, cpu_core_mask(cpu)); 405 cpumask_set_cpu(i, cpu_core_mask(cpu));
@@ -502,12 +437,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
502 if (sched_mc_power_savings || sched_smt_power_savings) 437 if (sched_mc_power_savings || sched_smt_power_savings)
503 return cpu_core_mask(cpu); 438 return cpu_core_mask(cpu);
504 else 439 else
505 return &c->llc_shared_map; 440 return c->llc_shared_map;
506}
507
508cpumask_t cpu_coregroup_map(int cpu)
509{
510 return *cpu_coregroup_mask(cpu);
511} 441}
512 442
513static void impress_friends(void) 443static void impress_friends(void)
@@ -614,12 +544,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
614 unsigned long send_status, accept_status = 0; 544 unsigned long send_status, accept_status = 0;
615 int maxlvt, num_starts, j; 545 int maxlvt, num_starts, j;
616 546
617 if (get_uv_system_type() == UV_NON_UNIQUE_APIC) {
618 send_status = uv_wakeup_secondary(phys_apicid, start_eip);
619 atomic_set(&init_deasserted, 1);
620 return send_status;
621 }
622
623 maxlvt = lapic_get_maxlvt(); 547 maxlvt = lapic_get_maxlvt();
624 548
625 /* 549 /*
@@ -748,7 +672,8 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
748/* 672/*
749 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 673 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
750 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 674 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
751 * Returns zero if CPU booted OK, else error code from ->wakeup_cpu. 675 * Returns zero if CPU booted OK, else error code from
676 * ->wakeup_secondary_cpu.
752 */ 677 */
753static int __cpuinit do_boot_cpu(int apicid, int cpu) 678static int __cpuinit do_boot_cpu(int apicid, int cpu)
754{ 679{
@@ -835,9 +760,13 @@ do_rest:
835 } 760 }
836 761
837 /* 762 /*
838 * Starting actual IPI sequence... 763 * Kick the secondary CPU. Use the method in the APIC driver
764 * if it's defined - or use an INIT boot APIC message otherwise:
839 */ 765 */
840 boot_error = apic->wakeup_cpu(apicid, start_ip); 766 if (apic->wakeup_secondary_cpu)
767 boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
768 else
769 boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
841 770
842 if (!boot_error) { 771 if (!boot_error) {
843 /* 772 /*
@@ -975,9 +904,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
975 */ 904 */
976static __init void disable_smp(void) 905static __init void disable_smp(void)
977{ 906{
978 /* use the read/write pointers to the present and possible maps */ 907 init_cpu_present(cpumask_of(0));
979 cpumask_copy(&cpu_present_map, cpumask_of(0)); 908 init_cpu_possible(cpumask_of(0));
980 cpumask_copy(&cpu_possible_map, cpumask_of(0));
981 smpboot_clear_io_apic_irqs(); 909 smpboot_clear_io_apic_irqs();
982 910
983 if (smp_found_config) 911 if (smp_found_config)
@@ -1109,6 +1037,8 @@ static void __init smp_cpu_index_default(void)
1109 */ 1037 */
1110void __init native_smp_prepare_cpus(unsigned int max_cpus) 1038void __init native_smp_prepare_cpus(unsigned int max_cpus)
1111{ 1039{
1040 unsigned int i;
1041
1112 preempt_disable(); 1042 preempt_disable();
1113 smp_cpu_index_default(); 1043 smp_cpu_index_default();
1114 current_cpu_data = boot_cpu_data; 1044 current_cpu_data = boot_cpu_data;
@@ -1122,6 +1052,14 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1122 boot_cpu_logical_apicid = logical_smp_processor_id(); 1052 boot_cpu_logical_apicid = logical_smp_processor_id();
1123#endif 1053#endif
1124 current_thread_info()->cpu = 0; /* needed? */ 1054 current_thread_info()->cpu = 0; /* needed? */
1055 for_each_possible_cpu(i) {
1056 alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1057 alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1058 alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
1059 cpumask_clear(per_cpu(cpu_core_map, i));
1060 cpumask_clear(per_cpu(cpu_sibling_map, i));
1061 cpumask_clear(cpu_data(i).llc_shared_map);
1062 }
1125 set_cpu_sibling_map(0); 1063 set_cpu_sibling_map(0);
1126 1064
1127 enable_IR_x2apic(); 1065 enable_IR_x2apic();
@@ -1194,7 +1132,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1194 pr_debug("Boot done.\n"); 1132 pr_debug("Boot done.\n");
1195 1133
1196 impress_friends(); 1134 impress_friends();
1197 smp_checks();
1198#ifdef CONFIG_X86_IO_APIC 1135#ifdef CONFIG_X86_IO_APIC
1199 setup_ioapic_dest(); 1136 setup_ioapic_dest();
1200#endif 1137#endif
@@ -1211,11 +1148,11 @@ early_param("possible_cpus", _setup_possible_cpus);
1211 1148
1212 1149
1213/* 1150/*
1214 * cpu_possible_map should be static, it cannot change as cpu's 1151 * cpu_possible_mask should be static, it cannot change as cpu's
1215 * are onlined, or offlined. The reason is per-cpu data-structures 1152 * are onlined, or offlined. The reason is per-cpu data-structures
1216 * are allocated by some modules at init time, and dont expect to 1153 * are allocated by some modules at init time, and dont expect to
1217 * do this dynamically on cpu arrival/departure. 1154 * do this dynamically on cpu arrival/departure.
1218 * cpu_present_map on the other hand can change dynamically. 1155 * cpu_present_mask on the other hand can change dynamically.
1219 * In case when cpu_hotplug is not compiled, then we resort to current 1156 * In case when cpu_hotplug is not compiled, then we resort to current
1220 * behaviour, which is cpu_possible == cpu_present. 1157 * behaviour, which is cpu_possible == cpu_present.
1221 * - Ashok Raj 1158 * - Ashok Raj
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index b7607c4f2042..c3ebbb901379 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -333,3 +333,5 @@ ENTRY(sys_call_table)
333 .long sys_pipe2 333 .long sys_pipe2
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_perf_counter_open 335 .long sys_perf_counter_open
336 .long sys_preadv
337 .long sys_pwritev
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index 241ec3923f61..5ba343e61844 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -116,7 +116,6 @@ unsigned long __init calibrate_cpu(void)
116static struct irqaction irq0 = { 116static struct irqaction irq0 = {
117 .handler = timer_interrupt, 117 .handler = timer_interrupt,
118 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER, 118 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER,
119 .mask = CPU_MASK_NONE,
120 .name = "timer" 119 .name = "timer"
121}; 120};
122 121
@@ -125,7 +124,6 @@ void __init hpet_time_init(void)
125 if (!hpet_enable()) 124 if (!hpet_enable())
126 setup_pit_timer(); 125 setup_pit_timer();
127 126
128 irq0.mask = cpumask_of_cpu(0);
129 setup_irq(0, &irq0); 127 setup_irq(0, &irq0);
130} 128}
131 129
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index f04549afcfe9..deb5ebb32c3b 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -275,6 +275,8 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
275 return NULL; 275 return NULL;
276} 276}
277 277
278static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
279
278/** 280/**
279 * uv_flush_tlb_others - globally purge translation cache of a virtual 281 * uv_flush_tlb_others - globally purge translation cache of a virtual
280 * address or all TLB's 282 * address or all TLB's
@@ -304,8 +306,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
304 struct mm_struct *mm, 306 struct mm_struct *mm,
305 unsigned long va, unsigned int cpu) 307 unsigned long va, unsigned int cpu)
306{ 308{
307 static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask); 309 struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
308 struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);
309 int i; 310 int i;
310 int bit; 311 int bit;
311 int blade; 312 int blade;
@@ -314,8 +315,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
314 int locals = 0; 315 int locals = 0;
315 struct bau_desc *bau_desc; 316 struct bau_desc *bau_desc;
316 317
317 WARN_ON(!in_atomic());
318
319 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 318 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
320 319
321 uv_cpu = uv_blade_processor_id(); 320 uv_cpu = uv_blade_processor_id();
@@ -752,16 +751,21 @@ static int __init uv_bau_init(void)
752 int node; 751 int node;
753 int nblades; 752 int nblades;
754 int last_blade; 753 int last_blade;
755 int cur_cpu = 0; 754 int cur_cpu;
756 755
757 if (!is_uv_system()) 756 if (!is_uv_system())
758 return 0; 757 return 0;
759 758
759 for_each_possible_cpu(cur_cpu)
760 alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
761 GFP_KERNEL, cpu_to_node(cur_cpu));
762
760 uv_bau_retry_limit = 1; 763 uv_bau_retry_limit = 1;
761 uv_nshift = uv_hub_info->n_val; 764 uv_nshift = uv_hub_info->n_val;
762 uv_mmask = (1UL << uv_hub_info->n_val) - 1; 765 uv_mmask = (1UL << uv_hub_info->n_val) - 1;
763 nblades = 0; 766 nblades = 0;
764 last_blade = -1; 767 last_blade = -1;
768 cur_cpu = 0;
765 for_each_online_node(node) { 769 for_each_online_node(node) {
766 blade = uv_node_to_blade_id(node); 770 blade = uv_node_to_blade_id(node);
767 if (blade == last_blade) 771 if (blade == last_blade)
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 0fcc95a354f7..7e4515957a1c 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -25,10 +25,10 @@
25 * 25 *
26 * Send feedback to <colpatch@us.ibm.com> 26 * Send feedback to <colpatch@us.ibm.com>
27 */ 27 */
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/nodemask.h> 28#include <linux/nodemask.h>
31#include <linux/mmzone.h> 29#include <linux/mmzone.h>
30#include <linux/init.h>
31#include <linux/smp.h>
32#include <asm/cpu.h> 32#include <asm/cpu.h>
33 33
34static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); 34static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
@@ -47,6 +47,7 @@ int __ref arch_register_cpu(int num)
47 */ 47 */
48 if (num) 48 if (num)
49 per_cpu(cpu_devices, num).cpu.hotpluggable = 1; 49 per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
50
50 return register_cpu(&per_cpu(cpu_devices, num).cpu, num); 51 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
51} 52}
52EXPORT_SYMBOL(arch_register_cpu); 53EXPORT_SYMBOL(arch_register_cpu);
@@ -56,12 +57,13 @@ void arch_unregister_cpu(int num)
56 unregister_cpu(&per_cpu(cpu_devices, num).cpu); 57 unregister_cpu(&per_cpu(cpu_devices, num).cpu);
57} 58}
58EXPORT_SYMBOL(arch_unregister_cpu); 59EXPORT_SYMBOL(arch_unregister_cpu);
59#else 60#else /* CONFIG_HOTPLUG_CPU */
61
60static int __init arch_register_cpu(int num) 62static int __init arch_register_cpu(int num)
61{ 63{
62 return register_cpu(&per_cpu(cpu_devices, num).cpu, num); 64 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
63} 65}
64#endif /*CONFIG_HOTPLUG_CPU*/ 66#endif /* CONFIG_HOTPLUG_CPU */
65 67
66static int __init topology_init(void) 68static int __init topology_init(void)
67{ 69{
@@ -70,11 +72,11 @@ static int __init topology_init(void)
70#ifdef CONFIG_NUMA 72#ifdef CONFIG_NUMA
71 for_each_online_node(i) 73 for_each_online_node(i)
72 register_one_node(i); 74 register_one_node(i);
73#endif /* CONFIG_NUMA */ 75#endif
74 76
75 for_each_present_cpu(i) 77 for_each_present_cpu(i)
76 arch_register_cpu(i); 78 arch_register_cpu(i);
79
77 return 0; 80 return 0;
78} 81}
79
80subsys_initcall(topology_init); 82subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1dba866967e2..2cc162e09c4b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -118,47 +118,6 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err)
118 if (!user_mode_vm(regs)) 118 if (!user_mode_vm(regs))
119 die(str, regs, err); 119 die(str, regs, err);
120} 120}
121
122/*
123 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
124 * invalid offset set (the LAZY one) and the faulting thread has
125 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
126 * we set the offset field correctly and return 1.
127 */
128static int lazy_iobitmap_copy(void)
129{
130 struct thread_struct *thread;
131 struct tss_struct *tss;
132 int cpu;
133
134 cpu = get_cpu();
135 tss = &per_cpu(init_tss, cpu);
136 thread = &current->thread;
137
138 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
139 thread->io_bitmap_ptr) {
140 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
141 thread->io_bitmap_max);
142 /*
143 * If the previously set map was extending to higher ports
144 * than the current one, pad extra space with 0xff (no access).
145 */
146 if (thread->io_bitmap_max < tss->io_bitmap_max) {
147 memset((char *) tss->io_bitmap +
148 thread->io_bitmap_max, 0xff,
149 tss->io_bitmap_max - thread->io_bitmap_max);
150 }
151 tss->io_bitmap_max = thread->io_bitmap_max;
152 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
153 tss->io_bitmap_owner = thread;
154 put_cpu();
155
156 return 1;
157 }
158 put_cpu();
159
160 return 0;
161}
162#endif 121#endif
163 122
164static void __kprobes 123static void __kprobes
@@ -309,11 +268,6 @@ do_general_protection(struct pt_regs *regs, long error_code)
309 conditional_sti(regs); 268 conditional_sti(regs);
310 269
311#ifdef CONFIG_X86_32 270#ifdef CONFIG_X86_32
312 if (lazy_iobitmap_copy()) {
313 /* restart the faulting instruction */
314 return;
315 }
316
317 if (regs->flags & X86_VM_MASK) 271 if (regs->flags & X86_VM_MASK)
318 goto gp_in_vm86; 272 goto gp_in_vm86;
319#endif 273#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 83d53ce5d4c4..7a567ebe6361 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -17,20 +17,21 @@
17#include <asm/delay.h> 17#include <asm/delay.h>
18#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
19 19
20unsigned int cpu_khz; /* TSC clocks / usec, not used here */ 20unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
21EXPORT_SYMBOL(cpu_khz); 21EXPORT_SYMBOL(cpu_khz);
22unsigned int tsc_khz; 22
23unsigned int __read_mostly tsc_khz;
23EXPORT_SYMBOL(tsc_khz); 24EXPORT_SYMBOL(tsc_khz);
24 25
25/* 26/*
26 * TSC can be unstable due to cpufreq or due to unsynced TSCs 27 * TSC can be unstable due to cpufreq or due to unsynced TSCs
27 */ 28 */
28static int tsc_unstable; 29static int __read_mostly tsc_unstable;
29 30
30/* native_sched_clock() is called before tsc_init(), so 31/* native_sched_clock() is called before tsc_init(), so
31 we must start with the TSC soft disabled to prevent 32 we must start with the TSC soft disabled to prevent
32 erroneous rdtsc usage on !cpu_has_tsc processors */ 33 erroneous rdtsc usage on !cpu_has_tsc processors */
33static int tsc_disabled = -1; 34static int __read_mostly tsc_disabled = -1;
34 35
35static int tsc_clocksource_reliable; 36static int tsc_clocksource_reliable;
36/* 37/*
@@ -273,30 +274,43 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
273 * use the TSC value at the transitions to calculate a pretty 274 * use the TSC value at the transitions to calculate a pretty
274 * good value for the TSC frequencty. 275 * good value for the TSC frequencty.
275 */ 276 */
276static inline int pit_expect_msb(unsigned char val) 277static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
277{ 278{
278 int count = 0; 279 int count;
280 u64 tsc = 0;
279 281
280 for (count = 0; count < 50000; count++) { 282 for (count = 0; count < 50000; count++) {
281 /* Ignore LSB */ 283 /* Ignore LSB */
282 inb(0x42); 284 inb(0x42);
283 if (inb(0x42) != val) 285 if (inb(0x42) != val)
284 break; 286 break;
287 tsc = get_cycles();
285 } 288 }
286 return count > 50; 289 *deltap = get_cycles() - tsc;
290 *tscp = tsc;
291
292 /*
293 * We require _some_ success, but the quality control
294 * will be based on the error terms on the TSC values.
295 */
296 return count > 5;
287} 297}
288 298
289/* 299/*
290 * How many MSB values do we want to see? We aim for a 300 * How many MSB values do we want to see? We aim for
291 * 15ms calibration, which assuming a 2us counter read 301 * a maximum error rate of 500ppm (in practice the
292 * error should give us roughly 150 ppm precision for 302 * real error is much smaller), but refuse to spend
293 * the calibration. 303 * more than 25ms on it.
294 */ 304 */
295#define QUICK_PIT_MS 15 305#define MAX_QUICK_PIT_MS 25
296#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) 306#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
297 307
298static unsigned long quick_pit_calibrate(void) 308static unsigned long quick_pit_calibrate(void)
299{ 309{
310 int i;
311 u64 tsc, delta;
312 unsigned long d1, d2;
313
300 /* Set the Gate high, disable speaker */ 314 /* Set the Gate high, disable speaker */
301 outb((inb(0x61) & ~0x02) | 0x01, 0x61); 315 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
302 316
@@ -315,45 +329,52 @@ static unsigned long quick_pit_calibrate(void)
315 outb(0xff, 0x42); 329 outb(0xff, 0x42);
316 outb(0xff, 0x42); 330 outb(0xff, 0x42);
317 331
318 if (pit_expect_msb(0xff)) { 332 /*
319 int i; 333 * The PIT starts counting at the next edge, so we
320 u64 t1, t2, delta; 334 * need to delay for a microsecond. The easiest way
321 unsigned char expect = 0xfe; 335 * to do that is to just read back the 16-bit counter
322 336 * once from the PIT.
323 t1 = get_cycles(); 337 */
324 for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) { 338 inb(0x42);
325 if (!pit_expect_msb(expect)) 339 inb(0x42);
326 goto failed; 340
341 if (pit_expect_msb(0xff, &tsc, &d1)) {
342 for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
343 if (!pit_expect_msb(0xff-i, &delta, &d2))
344 break;
345
346 /*
347 * Iterate until the error is less than 500 ppm
348 */
349 delta -= tsc;
350 if (d1+d2 < delta >> 11)
351 goto success;
327 } 352 }
328 t2 = get_cycles();
329
330 /*
331 * Make sure we can rely on the second TSC timestamp:
332 */
333 if (!pit_expect_msb(expect))
334 goto failed;
335
336 /*
337 * Ok, if we get here, then we've seen the
338 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
339 * times, and each MSB had many hits, so we never
340 * had any sudden jumps.
341 *
342 * As a result, we can depend on there not being
343 * any odd delays anywhere, and the TSC reads are
344 * reliable.
345 *
346 * kHz = ticks / time-in-seconds / 1000;
347 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
348 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
349 */
350 delta = (t2 - t1)*PIT_TICK_RATE;
351 do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
352 printk("Fast TSC calibration using PIT\n");
353 return delta;
354 } 353 }
355failed: 354 printk("Fast TSC calibration failed\n");
356 return 0; 355 return 0;
356
357success:
358 /*
359 * Ok, if we get here, then we've seen the
360 * MSB of the PIT decrement 'i' times, and the
361 * error has shrunk to less than 500 ppm.
362 *
363 * As a result, we can depend on there not being
364 * any odd delays anywhere, and the TSC reads are
365 * reliable (within the error). We also adjust the
366 * delta to the middle of the error bars, just
367 * because it looks nicer.
368 *
369 * kHz = ticks / time-in-seconds / 1000;
370 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
371 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
372 */
373 delta += (long)(d2 - d1)/2;
374 delta *= PIT_TICK_RATE;
375 do_div(delta, i*256*1000);
376 printk("Fast TSC calibration using PIT\n");
377 return delta;
357} 378}
358 379
359/** 380/**
@@ -523,8 +544,6 @@ unsigned long native_calibrate_tsc(void)
523 return tsc_pit_min; 544 return tsc_pit_min;
524} 545}
525 546
526#ifdef CONFIG_X86_32
527/* Only called from the Powernow K7 cpu freq driver */
528int recalibrate_cpu_khz(void) 547int recalibrate_cpu_khz(void)
529{ 548{
530#ifndef CONFIG_SMP 549#ifndef CONFIG_SMP
@@ -546,7 +565,6 @@ int recalibrate_cpu_khz(void)
546 565
547EXPORT_SYMBOL(recalibrate_cpu_khz); 566EXPORT_SYMBOL(recalibrate_cpu_khz);
548 567
549#endif /* CONFIG_X86_32 */
550 568
551/* Accelerators for sched_clock() 569/* Accelerators for sched_clock()
552 * convert from cycles(64bits) => nanoseconds (64bits) 570 * convert from cycles(64bits) => nanoseconds (64bits)
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
new file mode 100644
index 000000000000..2ffb6c53326e
--- /dev/null
+++ b/arch/x86/kernel/uv_time.c
@@ -0,0 +1,393 @@
1/*
2 * SGI RTC clock/timer routines.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Dimitri Sivanich
20 */
21#include <linux/clockchips.h>
22
23#include <asm/uv/uv_mmrs.h>
24#include <asm/uv/uv_hub.h>
25#include <asm/uv/bios.h>
26#include <asm/uv/uv.h>
27#include <asm/apic.h>
28#include <asm/cpu.h>
29
30#define RTC_NAME "sgi_rtc"
31
32static cycle_t uv_read_rtc(void);
33static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
34static void uv_rtc_timer_setup(enum clock_event_mode,
35 struct clock_event_device *);
36
37static struct clocksource clocksource_uv = {
38 .name = RTC_NAME,
39 .rating = 400,
40 .read = uv_read_rtc,
41 .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
42 .shift = 10,
43 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
44};
45
46static struct clock_event_device clock_event_device_uv = {
47 .name = RTC_NAME,
48 .features = CLOCK_EVT_FEAT_ONESHOT,
49 .shift = 20,
50 .rating = 400,
51 .irq = -1,
52 .set_next_event = uv_rtc_next_event,
53 .set_mode = uv_rtc_timer_setup,
54 .event_handler = NULL,
55};
56
57static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
58
59/* There is one of these allocated per node */
60struct uv_rtc_timer_head {
61 spinlock_t lock;
62 /* next cpu waiting for timer, local node relative: */
63 int next_cpu;
64 /* number of cpus on this node: */
65 int ncpus;
66 struct {
67 int lcpu; /* systemwide logical cpu number */
68 u64 expires; /* next timer expiration for this cpu */
69 } cpu[1];
70};
71
72/*
73 * Access to uv_rtc_timer_head via blade id.
74 */
75static struct uv_rtc_timer_head **blade_info __read_mostly;
76
77static int uv_rtc_enable;
78
79/*
80 * Hardware interface routines
81 */
82
83/* Send IPIs to another node */
84static void uv_rtc_send_IPI(int cpu)
85{
86 unsigned long apicid, val;
87 int pnode;
88
89 apicid = cpu_physical_id(cpu);
90 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96}
97
98/* Check for an RTC interrupt pending */
99static int uv_intr_pending(int pnode)
100{
101 return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
102 UVH_EVENT_OCCURRED0_RTC1_MASK;
103}
104
105/* Setup interrupt and return non-zero if early expiration occurred. */
106static int uv_setup_intr(int cpu, u64 expires)
107{
108 u64 val;
109 int pnode = uv_cpu_to_pnode(cpu);
110
111 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
112 UVH_RTC1_INT_CONFIG_M_MASK);
113 uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
114
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK);
117
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120
121 /* Set configuration */
122 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
123 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125
126 return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
127}
128
129/*
130 * Per-cpu timer tracking routines
131 */
132
133static __init void uv_rtc_deallocate_timers(void)
134{
135 int bid;
136
137 for_each_possible_blade(bid) {
138 kfree(blade_info[bid]);
139 }
140 kfree(blade_info);
141}
142
143/* Allocate per-node list of cpu timer expiration times. */
144static __init int uv_rtc_allocate_timers(void)
145{
146 int cpu;
147
148 blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
149 if (!blade_info)
150 return -ENOMEM;
151 memset(blade_info, 0, uv_possible_blades * sizeof(void *));
152
153 for_each_present_cpu(cpu) {
154 int nid = cpu_to_node(cpu);
155 int bid = uv_cpu_to_blade_id(cpu);
156 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
157 struct uv_rtc_timer_head *head = blade_info[bid];
158
159 if (!head) {
160 head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
161 (uv_blade_nr_possible_cpus(bid) *
162 2 * sizeof(u64)),
163 GFP_KERNEL, nid);
164 if (!head) {
165 uv_rtc_deallocate_timers();
166 return -ENOMEM;
167 }
168 spin_lock_init(&head->lock);
169 head->ncpus = uv_blade_nr_possible_cpus(bid);
170 head->next_cpu = -1;
171 blade_info[bid] = head;
172 }
173
174 head->cpu[bcpu].lcpu = cpu;
175 head->cpu[bcpu].expires = ULLONG_MAX;
176 }
177
178 return 0;
179}
180
181/* Find and set the next expiring timer. */
182static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
183{
184 u64 lowest = ULLONG_MAX;
185 int c, bcpu = -1;
186
187 head->next_cpu = -1;
188 for (c = 0; c < head->ncpus; c++) {
189 u64 exp = head->cpu[c].expires;
190 if (exp < lowest) {
191 bcpu = c;
192 lowest = exp;
193 }
194 }
195 if (bcpu >= 0) {
196 head->next_cpu = bcpu;
197 c = head->cpu[bcpu].lcpu;
198 if (uv_setup_intr(c, lowest))
199 /* If we didn't set it up in time, trigger */
200 uv_rtc_send_IPI(c);
201 } else {
202 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
203 UVH_RTC1_INT_CONFIG_M_MASK);
204 }
205}
206
207/*
208 * Set expiration time for current cpu.
209 *
210 * Returns 1 if we missed the expiration time.
211 */
212static int uv_rtc_set_timer(int cpu, u64 expires)
213{
214 int pnode = uv_cpu_to_pnode(cpu);
215 int bid = uv_cpu_to_blade_id(cpu);
216 struct uv_rtc_timer_head *head = blade_info[bid];
217 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
218 u64 *t = &head->cpu[bcpu].expires;
219 unsigned long flags;
220 int next_cpu;
221
222 spin_lock_irqsave(&head->lock, flags);
223
224 next_cpu = head->next_cpu;
225 *t = expires;
226 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) {
229 head->next_cpu = bcpu;
230 if (uv_setup_intr(cpu, expires)) {
231 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags);
234 return 1;
235 }
236 }
237
238 spin_unlock_irqrestore(&head->lock, flags);
239 return 0;
240}
241
242/*
243 * Unset expiration time for current cpu.
244 *
245 * Returns 1 if this timer was pending.
246 */
247static int uv_rtc_unset_timer(int cpu)
248{
249 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu);
251 struct uv_rtc_timer_head *head = blade_info[bid];
252 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
253 u64 *t = &head->cpu[bcpu].expires;
254 unsigned long flags;
255 int rc = 0;
256
257 spin_lock_irqsave(&head->lock, flags);
258
259 if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
260 rc = 1;
261
262 *t = ULLONG_MAX;
263
264 /* Was the hardware setup for this timer? */
265 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode);
267
268 spin_unlock_irqrestore(&head->lock, flags);
269
270 return rc;
271}
272
273
274/*
275 * Kernel interface routines.
276 */
277
278/*
279 * Read the RTC.
280 */
281static cycle_t uv_read_rtc(void)
282{
283 return (cycle_t)uv_read_local_mmr(UVH_RTC);
284}
285
286/*
287 * Program the next event, relative to now
288 */
289static int uv_rtc_next_event(unsigned long delta,
290 struct clock_event_device *ced)
291{
292 int ced_cpu = cpumask_first(ced->cpumask);
293
294 return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
295}
296
297/*
298 * Setup the RTC timer in oneshot mode
299 */
300static void uv_rtc_timer_setup(enum clock_event_mode mode,
301 struct clock_event_device *evt)
302{
303 int ced_cpu = cpumask_first(evt->cpumask);
304
305 switch (mode) {
306 case CLOCK_EVT_MODE_PERIODIC:
307 case CLOCK_EVT_MODE_ONESHOT:
308 case CLOCK_EVT_MODE_RESUME:
309 /* Nothing to do here yet */
310 break;
311 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu);
314 break;
315 }
316}
317
318static void uv_rtc_interrupt(void)
319{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id();
322
323 if (!ced || !ced->event_handler)
324 return;
325
326 if (uv_rtc_unset_timer(cpu) != 1)
327 return;
328
329 ced->event_handler(ced);
330}
331
332static int __init uv_enable_rtc(char *str)
333{
334 uv_rtc_enable = 1;
335
336 return 1;
337}
338__setup("uvrtc", uv_enable_rtc);
339
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{
342 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
343
344 *ced = clock_event_device_uv;
345 ced->cpumask = cpumask_of(smp_processor_id());
346 clockevents_register_device(ced);
347}
348
349static __init int uv_rtc_setup_clock(void)
350{
351 int rc;
352
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
354 return -ENODEV;
355
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift);
360
361 rc = clocksource_register(&clocksource_uv);
362 if (rc) {
363 generic_interrupt_extension = NULL;
364 return rc;
365 }
366
367 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers();
369 if (rc) {
370 clocksource_unregister(&clocksource_uv);
371 generic_interrupt_extension = NULL;
372 return rc;
373 }
374
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift);
377
378 clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
379 sn_rtc_cycles_per_second;
380
381 clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
382 (NSEC_PER_SEC / sn_rtc_cycles_per_second);
383
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) {
386 clocksource_unregister(&clocksource_uv);
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers();
389 }
390
391 return rc;
392}
393arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 191a876e9e87..31ffc24eec4d 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -578,7 +578,7 @@ static struct irq_chip piix4_virtual_irq_type = {
578static irqreturn_t piix4_master_intr(int irq, void *dev_id) 578static irqreturn_t piix4_master_intr(int irq, void *dev_id)
579{ 579{
580 int realirq; 580 int realirq;
581 irq_desc_t *desc; 581 struct irq_desc *desc;
582 unsigned long flags; 582 unsigned long flags;
583 583
584 spin_lock_irqsave(&i8259A_lock, flags); 584 spin_lock_irqsave(&i8259A_lock, flags);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 2cc4a90e2cb3..95deb9f2211e 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -395,11 +395,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
395 vmi_ops.update_pte(ptep, VMI_PAGE_PT); 395 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
396} 396}
397 397
398static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
399{
400 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
401}
402
403static void vmi_set_pud(pud_t *pudp, pud_t pudval) 398static void vmi_set_pud(pud_t *pudp, pud_t pudval)
404{ 399{
405 /* Um, eww */ 400 /* Um, eww */
@@ -750,7 +745,6 @@ static inline int __init activate_vmi(void)
750 pv_mmu_ops.set_pmd = vmi_set_pmd; 745 pv_mmu_ops.set_pmd = vmi_set_pmd;
751#ifdef CONFIG_X86_PAE 746#ifdef CONFIG_X86_PAE
752 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic; 747 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
753 pv_mmu_ops.set_pte_present = vmi_set_pte_present;
754 pv_mmu_ops.set_pud = vmi_set_pud; 748 pv_mmu_ops.set_pud = vmi_set_pud;
755 pv_mmu_ops.pte_clear = vmi_pte_clear; 749 pv_mmu_ops.pte_clear = vmi_pte_clear;
756 pv_mmu_ops.pmd_clear = vmi_pmd_clear; 750 pv_mmu_ops.pmd_clear = vmi_pmd_clear;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 33a788d5879c..d303369a7bad 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -202,7 +202,6 @@ static struct irqaction vmi_clock_action = {
202 .name = "vmi-timer", 202 .name = "vmi-timer",
203 .handler = vmi_timer_interrupt, 203 .handler = vmi_timer_interrupt,
204 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, 204 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
205 .mask = CPU_MASK_ALL,
206}; 205};
207 206
208static void __devinit vmi_time_init_clockevent(void) 207static void __devinit vmi_time_init_clockevent(void)
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 0d860963f268..62ad500d55f3 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -189,15 +189,24 @@ SECTIONS
189 *(.bss) 189 *(.bss)
190 . = ALIGN(4); 190 . = ALIGN(4);
191 __bss_stop = .; 191 __bss_stop = .;
192 _end = . ; 192 }
193 /* This is where the kernel creates the early boot page tables */ 193
194 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
194 . = ALIGN(PAGE_SIZE); 195 . = ALIGN(PAGE_SIZE);
195 pg0 = . ; 196 __brk_base = . ;
197 . += 64 * 1024 ; /* 64k alignment slop space */
198 *(.brk_reservation) /* areas brk users have reserved */
199 __brk_limit = . ;
200 }
201
202 .end : AT(ADDR(.end) - LOAD_OFFSET) {
203 _end = . ;
196 } 204 }
197 205
198 /* Sections to be discarded */ 206 /* Sections to be discarded */
199 /DISCARD/ : { 207 /DISCARD/ : {
200 *(.exitcall.exit) 208 *(.exitcall.exit)
209 *(.discard)
201 } 210 }
202 211
203 STABS_DEBUG 212 STABS_DEBUG
@@ -205,6 +214,12 @@ SECTIONS
205 DWARF_DEBUG 214 DWARF_DEBUG
206} 215}
207 216
217/*
218 * Build-time check on the image size:
219 */
220ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
221 "kernel image bigger than KERNEL_IMAGE_SIZE")
222
208#ifdef CONFIG_KEXEC 223#ifdef CONFIG_KEXEC
209/* Link time checks */ 224/* Link time checks */
210#include <asm/kexec.h> 225#include <asm/kexec.h>
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fbfced6f6800..c8742507b030 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -29,8 +29,8 @@ SECTIONS
29{ 29{
30 . = __START_KERNEL; 30 . = __START_KERNEL;
31 phys_startup_64 = startup_64 - LOAD_OFFSET; 31 phys_startup_64 = startup_64 - LOAD_OFFSET;
32 _text = .; /* Text and read-only data */
33 .text : AT(ADDR(.text) - LOAD_OFFSET) { 32 .text : AT(ADDR(.text) - LOAD_OFFSET) {
33 _text = .; /* Text and read-only data */
34 /* First the code that has to be first for bootstrapping */ 34 /* First the code that has to be first for bootstrapping */
35 *(.text.head) 35 *(.text.head)
36 _stext = .; 36 _stext = .;
@@ -61,13 +61,13 @@ SECTIONS
61 .data : AT(ADDR(.data) - LOAD_OFFSET) { 61 .data : AT(ADDR(.data) - LOAD_OFFSET) {
62 DATA_DATA 62 DATA_DATA
63 CONSTRUCTORS 63 CONSTRUCTORS
64 _edata = .; /* End of data section */
64 } :data 65 } :data
65 66
66 _edata = .; /* End of data section */
67 67
68 . = ALIGN(PAGE_SIZE);
69 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
70 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { 68 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
69 . = ALIGN(PAGE_SIZE);
70 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
71 *(.data.cacheline_aligned) 71 *(.data.cacheline_aligned)
72 } 72 }
73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); 73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
@@ -125,29 +125,29 @@ SECTIONS
125#undef VVIRT_OFFSET 125#undef VVIRT_OFFSET
126#undef VVIRT 126#undef VVIRT
127 127
128 . = ALIGN(THREAD_SIZE); /* init_task */
129 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { 128 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
129 . = ALIGN(THREAD_SIZE); /* init_task */
130 *(.data.init_task) 130 *(.data.init_task)
131 }:data.init 131 }:data.init
132 132
133 . = ALIGN(PAGE_SIZE);
134 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { 133 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
134 . = ALIGN(PAGE_SIZE);
135 *(.data.page_aligned) 135 *(.data.page_aligned)
136 } 136 }
137 137
138 /* might get freed after init */
139 . = ALIGN(PAGE_SIZE);
140 __smp_alt_begin = .;
141 __smp_locks = .;
142 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 138 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
139 /* might get freed after init */
140 . = ALIGN(PAGE_SIZE);
141 __smp_alt_begin = .;
142 __smp_locks = .;
143 *(.smp_locks) 143 *(.smp_locks)
144 __smp_locks_end = .;
145 . = ALIGN(PAGE_SIZE);
146 __smp_alt_end = .;
144 } 147 }
145 __smp_locks_end = .;
146 . = ALIGN(PAGE_SIZE);
147 __smp_alt_end = .;
148 148
149 . = ALIGN(PAGE_SIZE); /* Init code and data */ 149 . = ALIGN(PAGE_SIZE); /* Init code and data */
150 __init_begin = .; 150 __init_begin = .; /* paired with __init_end */
151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
152 _sinittext = .; 152 _sinittext = .;
153 INIT_TEXT 153 INIT_TEXT
@@ -159,40 +159,42 @@ SECTIONS
159 __initdata_end = .; 159 __initdata_end = .;
160 } 160 }
161 161
162 . = ALIGN(16); 162 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
163 __setup_start = .; 163 . = ALIGN(16);
164 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } 164 __setup_start = .;
165 __setup_end = .; 165 *(.init.setup)
166 __initcall_start = .; 166 __setup_end = .;
167 }
167 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { 168 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
169 __initcall_start = .;
168 INITCALLS 170 INITCALLS
171 __initcall_end = .;
169 } 172 }
170 __initcall_end = .;
171 __con_initcall_start = .;
172 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { 173 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
174 __con_initcall_start = .;
173 *(.con_initcall.init) 175 *(.con_initcall.init)
176 __con_initcall_end = .;
174 } 177 }
175 __con_initcall_end = .;
176 __x86_cpu_dev_start = .;
177 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 178 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
179 __x86_cpu_dev_start = .;
178 *(.x86_cpu_dev.init) 180 *(.x86_cpu_dev.init)
181 __x86_cpu_dev_end = .;
179 } 182 }
180 __x86_cpu_dev_end = .;
181 SECURITY_INIT 183 SECURITY_INIT
182 184
183 . = ALIGN(8); 185 . = ALIGN(8);
184 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { 186 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
185 __parainstructions = .; 187 __parainstructions = .;
186 *(.parainstructions) 188 *(.parainstructions)
187 __parainstructions_end = .; 189 __parainstructions_end = .;
188 } 190 }
189 191
190 . = ALIGN(8);
191 __alt_instructions = .;
192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
193 . = ALIGN(8);
194 __alt_instructions = .;
193 *(.altinstructions) 195 *(.altinstructions)
196 __alt_instructions_end = .;
194 } 197 }
195 __alt_instructions_end = .;
196 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { 198 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
197 *(.altinstr_replacement) 199 *(.altinstr_replacement)
198 } 200 }
@@ -207,9 +209,11 @@ SECTIONS
207 209
208#ifdef CONFIG_BLK_DEV_INITRD 210#ifdef CONFIG_BLK_DEV_INITRD
209 . = ALIGN(PAGE_SIZE); 211 . = ALIGN(PAGE_SIZE);
210 __initramfs_start = .; 212 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
211 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } 213 __initramfs_start = .;
212 __initramfs_end = .; 214 *(.init.ramfs)
215 __initramfs_end = .;
216 }
213#endif 217#endif
214 218
215#ifdef CONFIG_SMP 219#ifdef CONFIG_SMP
@@ -229,20 +233,29 @@ SECTIONS
229 . = ALIGN(PAGE_SIZE); 233 . = ALIGN(PAGE_SIZE);
230 __init_end = .; 234 __init_end = .;
231 235
232 . = ALIGN(PAGE_SIZE);
233 __nosave_begin = .;
234 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { 236 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
235 *(.data.nosave) 237 . = ALIGN(PAGE_SIZE);
238 __nosave_begin = .;
239 *(.data.nosave)
240 . = ALIGN(PAGE_SIZE);
241 __nosave_end = .;
236 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ 242 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
237 . = ALIGN(PAGE_SIZE);
238 __nosave_end = .;
239 243
240 __bss_start = .; /* BSS */
241 .bss : AT(ADDR(.bss) - LOAD_OFFSET) { 244 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
245 . = ALIGN(PAGE_SIZE);
246 __bss_start = .; /* BSS */
242 *(.bss.page_aligned) 247 *(.bss.page_aligned)
243 *(.bss) 248 *(.bss)
244 } 249 __bss_stop = .;
245 __bss_stop = .; 250 }
251
252 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
253 . = ALIGN(PAGE_SIZE);
254 __brk_base = . ;
255 . += 64 * 1024 ; /* 64k alignment slop space */
256 *(.brk_reservation) /* areas brk users have reserved */
257 __brk_limit = . ;
258 }
246 259
247 _end = . ; 260 _end = . ;
248 261
@@ -250,6 +263,7 @@ SECTIONS
250 /DISCARD/ : { 263 /DISCARD/ : {
251 *(.exitcall.exit) 264 *(.exitcall.exit)
252 *(.eh_frame) 265 *(.eh_frame)
266 *(.discard)
253 } 267 }
254 268
255 STABS_DEBUG 269 STABS_DEBUG
@@ -275,3 +289,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
275ASSERT((per_cpu__irq_stack_union == 0), 289ASSERT((per_cpu__irq_stack_union == 0),
276 "irq_stack_union is not at start of per-cpu area"); 290 "irq_stack_union is not at start of per-cpu area");
277#endif 291#endif
292
293#ifdef CONFIG_KEXEC
294#include <asm/kexec.h>
295
296ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
297 "kexec control code size is too big")
298#endif
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index c609205df594..a1d804bcd483 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -139,6 +139,7 @@ int is_vsmp_box(void)
139 return 0; 139 return 0;
140 } 140 }
141} 141}
142
142#else 143#else
143static void __init detect_vsmp_box(void) 144static void __init detect_vsmp_box(void)
144{ 145{
@@ -148,7 +149,6 @@ int is_vsmp_box(void)
148 return 0; 149 return 0;
149} 150}
150#endif 151#endif
151
152void __init vsmp_init(void) 152void __init vsmp_init(void)
153{ 153{
154 detect_vsmp_box(); 154 detect_vsmp_box();
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b81125f0bdee..a58504ea78cc 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -4,6 +4,10 @@
4config HAVE_KVM 4config HAVE_KVM
5 bool 5 bool
6 6
7config HAVE_KVM_IRQCHIP
8 bool
9 default y
10
7menuconfig VIRTUALIZATION 11menuconfig VIRTUALIZATION
8 bool "Virtualization" 12 bool "Virtualization"
9 depends on HAVE_KVM || X86 13 depends on HAVE_KVM || X86
@@ -55,7 +59,8 @@ config KVM_AMD
55 59
56config KVM_TRACE 60config KVM_TRACE
57 bool "KVM trace support" 61 bool "KVM trace support"
58 depends on KVM && MARKERS && SYSFS 62 depends on KVM && SYSFS
63 select MARKERS
59 select RELAY 64 select RELAY
60 select DEBUG_FS 65 select DEBUG_FS
61 default n 66 default n
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 72bd275a9b5c..c13bb92d3157 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -201,6 +201,9 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
201 if (!atomic_inc_and_test(&pt->pending)) 201 if (!atomic_inc_and_test(&pt->pending))
202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); 202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
203 203
204 if (!pt->reinject)
205 atomic_set(&pt->pending, 1);
206
204 if (vcpu0 && waitqueue_active(&vcpu0->wq)) 207 if (vcpu0 && waitqueue_active(&vcpu0->wq))
205 wake_up_interruptible(&vcpu0->wq); 208 wake_up_interruptible(&vcpu0->wq);
206 209
@@ -536,6 +539,16 @@ void kvm_pit_reset(struct kvm_pit *pit)
536 pit->pit_state.irq_ack = 1; 539 pit->pit_state.irq_ack = 1;
537} 540}
538 541
542static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
543{
544 struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
545
546 if (!mask) {
547 atomic_set(&pit->pit_state.pit_timer.pending, 0);
548 pit->pit_state.irq_ack = 1;
549 }
550}
551
539struct kvm_pit *kvm_create_pit(struct kvm *kvm) 552struct kvm_pit *kvm_create_pit(struct kvm *kvm)
540{ 553{
541 struct kvm_pit *pit; 554 struct kvm_pit *pit;
@@ -545,9 +558,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
545 if (!pit) 558 if (!pit)
546 return NULL; 559 return NULL;
547 560
548 mutex_lock(&kvm->lock);
549 pit->irq_source_id = kvm_request_irq_source_id(kvm); 561 pit->irq_source_id = kvm_request_irq_source_id(kvm);
550 mutex_unlock(&kvm->lock);
551 if (pit->irq_source_id < 0) { 562 if (pit->irq_source_id < 0) {
552 kfree(pit); 563 kfree(pit);
553 return NULL; 564 return NULL;
@@ -580,10 +591,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
580 pit_state->irq_ack_notifier.gsi = 0; 591 pit_state->irq_ack_notifier.gsi = 0;
581 pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; 592 pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
582 kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); 593 kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
594 pit_state->pit_timer.reinject = true;
583 mutex_unlock(&pit->pit_state.lock); 595 mutex_unlock(&pit->pit_state.lock);
584 596
585 kvm_pit_reset(pit); 597 kvm_pit_reset(pit);
586 598
599 pit->mask_notifier.func = pit_mask_notifer;
600 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
601
587 return pit; 602 return pit;
588} 603}
589 604
@@ -592,6 +607,8 @@ void kvm_free_pit(struct kvm *kvm)
592 struct hrtimer *timer; 607 struct hrtimer *timer;
593 608
594 if (kvm->arch.vpit) { 609 if (kvm->arch.vpit) {
610 kvm_unregister_irq_mask_notifier(kvm, 0,
611 &kvm->arch.vpit->mask_notifier);
595 mutex_lock(&kvm->arch.vpit->pit_state.lock); 612 mutex_lock(&kvm->arch.vpit->pit_state.lock);
596 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 613 timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
597 hrtimer_cancel(timer); 614 hrtimer_cancel(timer);
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 4178022b97aa..6acbe4b505d5 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -9,6 +9,7 @@ struct kvm_kpit_timer {
9 s64 period; /* unit: ns */ 9 s64 period; /* unit: ns */
10 s64 scheduled; 10 s64 scheduled;
11 atomic_t pending; 11 atomic_t pending;
12 bool reinject;
12}; 13};
13 14
14struct kvm_kpit_channel_state { 15struct kvm_kpit_channel_state {
@@ -45,6 +46,7 @@ struct kvm_pit {
45 struct kvm *kvm; 46 struct kvm *kvm;
46 struct kvm_kpit_state pit_state; 47 struct kvm_kpit_state pit_state;
47 int irq_source_id; 48 int irq_source_id;
49 struct kvm_irq_mask_notifier mask_notifier;
48}; 50};
49 51
50#define KVM_PIT_BASE_ADDRESS 0x40 52#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 179dcb0103fd..1ccb50c74f18 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -32,11 +32,13 @@
32#include <linux/kvm_host.h> 32#include <linux/kvm_host.h>
33 33
34static void pic_lock(struct kvm_pic *s) 34static void pic_lock(struct kvm_pic *s)
35 __acquires(&s->lock)
35{ 36{
36 spin_lock(&s->lock); 37 spin_lock(&s->lock);
37} 38}
38 39
39static void pic_unlock(struct kvm_pic *s) 40static void pic_unlock(struct kvm_pic *s)
41 __releases(&s->lock)
40{ 42{
41 struct kvm *kvm = s->kvm; 43 struct kvm *kvm = s->kvm;
42 unsigned acks = s->pending_acks; 44 unsigned acks = s->pending_acks;
@@ -49,7 +51,8 @@ static void pic_unlock(struct kvm_pic *s)
49 spin_unlock(&s->lock); 51 spin_unlock(&s->lock);
50 52
51 while (acks) { 53 while (acks) {
52 kvm_notify_acked_irq(kvm, __ffs(acks)); 54 kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)),
55 __ffs(acks));
53 acks &= acks - 1; 56 acks &= acks - 1;
54 } 57 }
55 58
@@ -76,12 +79,13 @@ void kvm_pic_clear_isr_ack(struct kvm *kvm)
76/* 79/*
77 * set irq level. If an edge is detected, then the IRR is set to 1 80 * set irq level. If an edge is detected, then the IRR is set to 1
78 */ 81 */
79static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) 82static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
80{ 83{
81 int mask; 84 int mask, ret = 1;
82 mask = 1 << irq; 85 mask = 1 << irq;
83 if (s->elcr & mask) /* level triggered */ 86 if (s->elcr & mask) /* level triggered */
84 if (level) { 87 if (level) {
88 ret = !(s->irr & mask);
85 s->irr |= mask; 89 s->irr |= mask;
86 s->last_irr |= mask; 90 s->last_irr |= mask;
87 } else { 91 } else {
@@ -90,11 +94,15 @@ static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
90 } 94 }
91 else /* edge triggered */ 95 else /* edge triggered */
92 if (level) { 96 if (level) {
93 if ((s->last_irr & mask) == 0) 97 if ((s->last_irr & mask) == 0) {
98 ret = !(s->irr & mask);
94 s->irr |= mask; 99 s->irr |= mask;
100 }
95 s->last_irr |= mask; 101 s->last_irr |= mask;
96 } else 102 } else
97 s->last_irr &= ~mask; 103 s->last_irr &= ~mask;
104
105 return (s->imr & mask) ? -1 : ret;
98} 106}
99 107
100/* 108/*
@@ -171,16 +179,19 @@ void kvm_pic_update_irq(struct kvm_pic *s)
171 pic_unlock(s); 179 pic_unlock(s);
172} 180}
173 181
174void kvm_pic_set_irq(void *opaque, int irq, int level) 182int kvm_pic_set_irq(void *opaque, int irq, int level)
175{ 183{
176 struct kvm_pic *s = opaque; 184 struct kvm_pic *s = opaque;
185 int ret = -1;
177 186
178 pic_lock(s); 187 pic_lock(s);
179 if (irq >= 0 && irq < PIC_NUM_PINS) { 188 if (irq >= 0 && irq < PIC_NUM_PINS) {
180 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 189 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
181 pic_update_irq(s); 190 pic_update_irq(s);
182 } 191 }
183 pic_unlock(s); 192 pic_unlock(s);
193
194 return ret;
184} 195}
185 196
186/* 197/*
@@ -232,7 +243,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
232 } 243 }
233 pic_update_irq(s); 244 pic_update_irq(s);
234 pic_unlock(s); 245 pic_unlock(s);
235 kvm_notify_acked_irq(kvm, irq); 246 kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq);
236 247
237 return intno; 248 return intno;
238} 249}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 82579ee538d0..9f593188129e 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -32,6 +32,8 @@
32#include "lapic.h" 32#include "lapic.h"
33 33
34#define PIC_NUM_PINS 16 34#define PIC_NUM_PINS 16
35#define SELECT_PIC(irq) \
36 ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
35 37
36struct kvm; 38struct kvm;
37struct kvm_vcpu; 39struct kvm_vcpu;
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index 8e5ee99551f6..ed66e4c078dc 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -18,7 +18,6 @@ static const u32 host_save_user_msrs[] = {
18}; 18};
19 19
20#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) 20#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
21#define NUM_DB_REGS 4
22 21
23struct kvm_vcpu; 22struct kvm_vcpu;
24 23
@@ -29,18 +28,23 @@ struct vcpu_svm {
29 struct svm_cpu_data *svm_data; 28 struct svm_cpu_data *svm_data;
30 uint64_t asid_generation; 29 uint64_t asid_generation;
31 30
32 unsigned long db_regs[NUM_DB_REGS];
33
34 u64 next_rip; 31 u64 next_rip;
35 32
36 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; 33 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
37 u64 host_gs_base; 34 u64 host_gs_base;
38 unsigned long host_cr2; 35 unsigned long host_cr2;
39 unsigned long host_db_regs[NUM_DB_REGS];
40 unsigned long host_dr6;
41 unsigned long host_dr7;
42 36
43 u32 *msrpm; 37 u32 *msrpm;
38 struct vmcb *hsave;
39 u64 hsave_msr;
40
41 u64 nested_vmcb;
42
43 /* These are the merged vectors */
44 u32 *nested_msrpm;
45
46 /* gpa pointers to the real vectors */
47 u64 nested_vmcb_msrpm;
44}; 48};
45 49
46#endif 50#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2d4477c71473..2a36f7f7c4c7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -145,11 +145,20 @@ struct kvm_rmap_desc {
145 struct kvm_rmap_desc *more; 145 struct kvm_rmap_desc *more;
146}; 146};
147 147
148struct kvm_shadow_walk { 148struct kvm_shadow_walk_iterator {
149 int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, 149 u64 addr;
150 u64 addr, u64 *spte, int level); 150 hpa_t shadow_addr;
151 int level;
152 u64 *sptep;
153 unsigned index;
151}; 154};
152 155
156#define for_each_shadow_entry(_vcpu, _addr, _walker) \
157 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
158 shadow_walk_okay(&(_walker)); \
159 shadow_walk_next(&(_walker)))
160
161
153struct kvm_unsync_walk { 162struct kvm_unsync_walk {
154 int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); 163 int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
155}; 164};
@@ -343,7 +352,6 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
343 352
344 BUG_ON(!mc->nobjs); 353 BUG_ON(!mc->nobjs);
345 p = mc->objects[--mc->nobjs]; 354 p = mc->objects[--mc->nobjs];
346 memset(p, 0, size);
347 return p; 355 return p;
348} 356}
349 357
@@ -794,10 +802,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
794 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 802 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
795 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 803 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
796 INIT_LIST_HEAD(&sp->oos_link); 804 INIT_LIST_HEAD(&sp->oos_link);
797 ASSERT(is_empty_shadow_page(sp->spt));
798 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 805 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
799 sp->multimapped = 0; 806 sp->multimapped = 0;
800 sp->global = 1;
801 sp->parent_pte = parent_pte; 807 sp->parent_pte = parent_pte;
802 --vcpu->kvm->arch.n_free_mmu_pages; 808 --vcpu->kvm->arch.n_free_mmu_pages;
803 return sp; 809 return sp;
@@ -983,8 +989,8 @@ struct kvm_mmu_pages {
983 idx < 512; \ 989 idx < 512; \
984 idx = find_next_bit(bitmap, 512, idx+1)) 990 idx = find_next_bit(bitmap, 512, idx+1))
985 991
986int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, 992static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
987 int idx) 993 int idx)
988{ 994{
989 int i; 995 int i;
990 996
@@ -1059,7 +1065,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1059 index = kvm_page_table_hashfn(gfn); 1065 index = kvm_page_table_hashfn(gfn);
1060 bucket = &kvm->arch.mmu_page_hash[index]; 1066 bucket = &kvm->arch.mmu_page_hash[index];
1061 hlist_for_each_entry(sp, node, bucket, hash_link) 1067 hlist_for_each_entry(sp, node, bucket, hash_link)
1062 if (sp->gfn == gfn && !sp->role.metaphysical 1068 if (sp->gfn == gfn && !sp->role.direct
1063 && !sp->role.invalid) { 1069 && !sp->role.invalid) {
1064 pgprintk("%s: found role %x\n", 1070 pgprintk("%s: found role %x\n",
1065 __func__, sp->role.word); 1071 __func__, sp->role.word);
@@ -1115,8 +1121,9 @@ struct mmu_page_path {
1115 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ 1121 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
1116 i = mmu_pages_next(&pvec, &parents, i)) 1122 i = mmu_pages_next(&pvec, &parents, i))
1117 1123
1118int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, 1124static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1119 int i) 1125 struct mmu_page_path *parents,
1126 int i)
1120{ 1127{
1121 int n; 1128 int n;
1122 1129
@@ -1135,7 +1142,7 @@ int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
1135 return n; 1142 return n;
1136} 1143}
1137 1144
1138void mmu_pages_clear_parents(struct mmu_page_path *parents) 1145static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1139{ 1146{
1140 struct kvm_mmu_page *sp; 1147 struct kvm_mmu_page *sp;
1141 unsigned int level = 0; 1148 unsigned int level = 0;
@@ -1193,7 +1200,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1193 gfn_t gfn, 1200 gfn_t gfn,
1194 gva_t gaddr, 1201 gva_t gaddr,
1195 unsigned level, 1202 unsigned level,
1196 int metaphysical, 1203 int direct,
1197 unsigned access, 1204 unsigned access,
1198 u64 *parent_pte) 1205 u64 *parent_pte)
1199{ 1206{
@@ -1204,10 +1211,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1204 struct kvm_mmu_page *sp; 1211 struct kvm_mmu_page *sp;
1205 struct hlist_node *node, *tmp; 1212 struct hlist_node *node, *tmp;
1206 1213
1207 role.word = 0; 1214 role = vcpu->arch.mmu.base_role;
1208 role.glevels = vcpu->arch.mmu.root_level;
1209 role.level = level; 1215 role.level = level;
1210 role.metaphysical = metaphysical; 1216 role.direct = direct;
1211 role.access = access; 1217 role.access = access;
1212 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1218 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1213 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1219 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -1242,8 +1248,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1242 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); 1248 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
1243 sp->gfn = gfn; 1249 sp->gfn = gfn;
1244 sp->role = role; 1250 sp->role = role;
1251 sp->global = role.cr4_pge;
1245 hlist_add_head(&sp->hash_link, bucket); 1252 hlist_add_head(&sp->hash_link, bucket);
1246 if (!metaphysical) { 1253 if (!direct) {
1247 if (rmap_write_protect(vcpu->kvm, gfn)) 1254 if (rmap_write_protect(vcpu->kvm, gfn))
1248 kvm_flush_remote_tlbs(vcpu->kvm); 1255 kvm_flush_remote_tlbs(vcpu->kvm);
1249 account_shadowed(vcpu->kvm, gfn); 1256 account_shadowed(vcpu->kvm, gfn);
@@ -1255,35 +1262,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1255 return sp; 1262 return sp;
1256} 1263}
1257 1264
1258static int walk_shadow(struct kvm_shadow_walk *walker, 1265static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1259 struct kvm_vcpu *vcpu, u64 addr) 1266 struct kvm_vcpu *vcpu, u64 addr)
1260{ 1267{
1261 hpa_t shadow_addr; 1268 iterator->addr = addr;
1262 int level; 1269 iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1263 int r; 1270 iterator->level = vcpu->arch.mmu.shadow_root_level;
1264 u64 *sptep; 1271 if (iterator->level == PT32E_ROOT_LEVEL) {
1265 unsigned index; 1272 iterator->shadow_addr
1266 1273 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1267 shadow_addr = vcpu->arch.mmu.root_hpa; 1274 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
1268 level = vcpu->arch.mmu.shadow_root_level; 1275 --iterator->level;
1269 if (level == PT32E_ROOT_LEVEL) { 1276 if (!iterator->shadow_addr)
1270 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 1277 iterator->level = 0;
1271 shadow_addr &= PT64_BASE_ADDR_MASK;
1272 if (!shadow_addr)
1273 return 1;
1274 --level;
1275 } 1278 }
1279}
1276 1280
1277 while (level >= PT_PAGE_TABLE_LEVEL) { 1281static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1278 index = SHADOW_PT_INDEX(addr, level); 1282{
1279 sptep = ((u64 *)__va(shadow_addr)) + index; 1283 if (iterator->level < PT_PAGE_TABLE_LEVEL)
1280 r = walker->entry(walker, vcpu, addr, sptep, level); 1284 return false;
1281 if (r) 1285 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1282 return r; 1286 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1283 shadow_addr = *sptep & PT64_BASE_ADDR_MASK; 1287 return true;
1284 --level; 1288}
1285 } 1289
1286 return 0; 1290static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1291{
1292 iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
1293 --iterator->level;
1287} 1294}
1288 1295
1289static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1296static void kvm_mmu_page_unlink_children(struct kvm *kvm,
@@ -1388,7 +1395,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1388 kvm_mmu_page_unlink_children(kvm, sp); 1395 kvm_mmu_page_unlink_children(kvm, sp);
1389 kvm_mmu_unlink_parents(kvm, sp); 1396 kvm_mmu_unlink_parents(kvm, sp);
1390 kvm_flush_remote_tlbs(kvm); 1397 kvm_flush_remote_tlbs(kvm);
1391 if (!sp->role.invalid && !sp->role.metaphysical) 1398 if (!sp->role.invalid && !sp->role.direct)
1392 unaccount_shadowed(kvm, sp->gfn); 1399 unaccount_shadowed(kvm, sp->gfn);
1393 if (sp->unsync) 1400 if (sp->unsync)
1394 kvm_unlink_unsync_page(kvm, sp); 1401 kvm_unlink_unsync_page(kvm, sp);
@@ -1451,7 +1458,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1451 index = kvm_page_table_hashfn(gfn); 1458 index = kvm_page_table_hashfn(gfn);
1452 bucket = &kvm->arch.mmu_page_hash[index]; 1459 bucket = &kvm->arch.mmu_page_hash[index];
1453 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) 1460 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
1454 if (sp->gfn == gfn && !sp->role.metaphysical) { 1461 if (sp->gfn == gfn && !sp->role.direct) {
1455 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1462 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1456 sp->role.word); 1463 sp->role.word);
1457 r = 1; 1464 r = 1;
@@ -1463,11 +1470,20 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1463 1470
1464static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) 1471static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1465{ 1472{
1473 unsigned index;
1474 struct hlist_head *bucket;
1466 struct kvm_mmu_page *sp; 1475 struct kvm_mmu_page *sp;
1476 struct hlist_node *node, *nn;
1467 1477
1468 while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { 1478 index = kvm_page_table_hashfn(gfn);
1469 pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); 1479 bucket = &kvm->arch.mmu_page_hash[index];
1470 kvm_mmu_zap_page(kvm, sp); 1480 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
1481 if (sp->gfn == gfn && !sp->role.direct
1482 && !sp->role.invalid) {
1483 pgprintk("%s: zap %lx %x\n",
1484 __func__, gfn, sp->role.word);
1485 kvm_mmu_zap_page(kvm, sp);
1486 }
1471 } 1487 }
1472} 1488}
1473 1489
@@ -1622,7 +1638,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1622 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1638 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1623 /* don't unsync if pagetable is shadowed with multiple roles */ 1639 /* don't unsync if pagetable is shadowed with multiple roles */
1624 hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { 1640 hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
1625 if (s->gfn != sp->gfn || s->role.metaphysical) 1641 if (s->gfn != sp->gfn || s->role.direct)
1626 continue; 1642 continue;
1627 if (s->role.word != sp->role.word) 1643 if (s->role.word != sp->role.word)
1628 return 1; 1644 return 1;
@@ -1669,8 +1685,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1669 u64 mt_mask = shadow_mt_mask; 1685 u64 mt_mask = shadow_mt_mask;
1670 struct kvm_mmu_page *sp = page_header(__pa(shadow_pte)); 1686 struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
1671 1687
1672 if (!(vcpu->arch.cr4 & X86_CR4_PGE))
1673 global = 0;
1674 if (!global && sp->global) { 1688 if (!global && sp->global) {
1675 sp->global = 0; 1689 sp->global = 0;
1676 if (sp->unsync) { 1690 if (sp->unsync) {
@@ -1777,12 +1791,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1777 pgprintk("hfn old %lx new %lx\n", 1791 pgprintk("hfn old %lx new %lx\n",
1778 spte_to_pfn(*shadow_pte), pfn); 1792 spte_to_pfn(*shadow_pte), pfn);
1779 rmap_remove(vcpu->kvm, shadow_pte); 1793 rmap_remove(vcpu->kvm, shadow_pte);
1780 } else { 1794 } else
1781 if (largepage) 1795 was_rmapped = 1;
1782 was_rmapped = is_large_pte(*shadow_pte);
1783 else
1784 was_rmapped = 1;
1785 }
1786 } 1796 }
1787 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, 1797 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1788 dirty, largepage, global, gfn, pfn, speculative, true)) { 1798 dirty, largepage, global, gfn, pfn, speculative, true)) {
@@ -1820,67 +1830,42 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1820{ 1830{
1821} 1831}
1822 1832
1823struct direct_shadow_walk { 1833static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1824 struct kvm_shadow_walk walker; 1834 int largepage, gfn_t gfn, pfn_t pfn)
1825 pfn_t pfn;
1826 int write;
1827 int largepage;
1828 int pt_write;
1829};
1830
1831static int direct_map_entry(struct kvm_shadow_walk *_walk,
1832 struct kvm_vcpu *vcpu,
1833 u64 addr, u64 *sptep, int level)
1834{ 1835{
1835 struct direct_shadow_walk *walk = 1836 struct kvm_shadow_walk_iterator iterator;
1836 container_of(_walk, struct direct_shadow_walk, walker);
1837 struct kvm_mmu_page *sp; 1837 struct kvm_mmu_page *sp;
1838 int pt_write = 0;
1838 gfn_t pseudo_gfn; 1839 gfn_t pseudo_gfn;
1839 gfn_t gfn = addr >> PAGE_SHIFT;
1840
1841 if (level == PT_PAGE_TABLE_LEVEL
1842 || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
1843 mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
1844 0, walk->write, 1, &walk->pt_write,
1845 walk->largepage, 0, gfn, walk->pfn, false);
1846 ++vcpu->stat.pf_fixed;
1847 return 1;
1848 }
1849 1840
1850 if (*sptep == shadow_trap_nonpresent_pte) { 1841 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
1851 pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; 1842 if (iterator.level == PT_PAGE_TABLE_LEVEL
1852 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1, 1843 || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
1853 1, ACC_ALL, sptep); 1844 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
1854 if (!sp) { 1845 0, write, 1, &pt_write,
1855 pgprintk("nonpaging_map: ENOMEM\n"); 1846 largepage, 0, gfn, pfn, false);
1856 kvm_release_pfn_clean(walk->pfn); 1847 ++vcpu->stat.pf_fixed;
1857 return -ENOMEM; 1848 break;
1858 } 1849 }
1859 1850
1860 set_shadow_pte(sptep, 1851 if (*iterator.sptep == shadow_trap_nonpresent_pte) {
1861 __pa(sp->spt) 1852 pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
1862 | PT_PRESENT_MASK | PT_WRITABLE_MASK 1853 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
1863 | shadow_user_mask | shadow_x_mask); 1854 iterator.level - 1,
1864 } 1855 1, ACC_ALL, iterator.sptep);
1865 return 0; 1856 if (!sp) {
1866} 1857 pgprintk("nonpaging_map: ENOMEM\n");
1858 kvm_release_pfn_clean(pfn);
1859 return -ENOMEM;
1860 }
1867 1861
1868static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 1862 set_shadow_pte(iterator.sptep,
1869 int largepage, gfn_t gfn, pfn_t pfn) 1863 __pa(sp->spt)
1870{ 1864 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1871 int r; 1865 | shadow_user_mask | shadow_x_mask);
1872 struct direct_shadow_walk walker = { 1866 }
1873 .walker = { .entry = direct_map_entry, }, 1867 }
1874 .pfn = pfn, 1868 return pt_write;
1875 .largepage = largepage,
1876 .write = write,
1877 .pt_write = 0,
1878 };
1879
1880 r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
1881 if (r < 0)
1882 return r;
1883 return walker.pt_write;
1884} 1869}
1885 1870
1886static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 1871static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
@@ -1962,7 +1947,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1962 int i; 1947 int i;
1963 gfn_t root_gfn; 1948 gfn_t root_gfn;
1964 struct kvm_mmu_page *sp; 1949 struct kvm_mmu_page *sp;
1965 int metaphysical = 0; 1950 int direct = 0;
1966 1951
1967 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; 1952 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1968 1953
@@ -1971,18 +1956,18 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1971 1956
1972 ASSERT(!VALID_PAGE(root)); 1957 ASSERT(!VALID_PAGE(root));
1973 if (tdp_enabled) 1958 if (tdp_enabled)
1974 metaphysical = 1; 1959 direct = 1;
1975 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 1960 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1976 PT64_ROOT_LEVEL, metaphysical, 1961 PT64_ROOT_LEVEL, direct,
1977 ACC_ALL, NULL); 1962 ACC_ALL, NULL);
1978 root = __pa(sp->spt); 1963 root = __pa(sp->spt);
1979 ++sp->root_count; 1964 ++sp->root_count;
1980 vcpu->arch.mmu.root_hpa = root; 1965 vcpu->arch.mmu.root_hpa = root;
1981 return; 1966 return;
1982 } 1967 }
1983 metaphysical = !is_paging(vcpu); 1968 direct = !is_paging(vcpu);
1984 if (tdp_enabled) 1969 if (tdp_enabled)
1985 metaphysical = 1; 1970 direct = 1;
1986 for (i = 0; i < 4; ++i) { 1971 for (i = 0; i < 4; ++i) {
1987 hpa_t root = vcpu->arch.mmu.pae_root[i]; 1972 hpa_t root = vcpu->arch.mmu.pae_root[i];
1988 1973
@@ -1996,7 +1981,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1996 } else if (vcpu->arch.mmu.root_level == 0) 1981 } else if (vcpu->arch.mmu.root_level == 0)
1997 root_gfn = 0; 1982 root_gfn = 0;
1998 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 1983 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1999 PT32_ROOT_LEVEL, metaphysical, 1984 PT32_ROOT_LEVEL, direct,
2000 ACC_ALL, NULL); 1985 ACC_ALL, NULL);
2001 root = __pa(sp->spt); 1986 root = __pa(sp->spt);
2002 ++sp->root_count; 1987 ++sp->root_count;
@@ -2251,17 +2236,23 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2251 2236
2252static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 2237static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2253{ 2238{
2239 int r;
2240
2254 ASSERT(vcpu); 2241 ASSERT(vcpu);
2255 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2242 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2256 2243
2257 if (!is_paging(vcpu)) 2244 if (!is_paging(vcpu))
2258 return nonpaging_init_context(vcpu); 2245 r = nonpaging_init_context(vcpu);
2259 else if (is_long_mode(vcpu)) 2246 else if (is_long_mode(vcpu))
2260 return paging64_init_context(vcpu); 2247 r = paging64_init_context(vcpu);
2261 else if (is_pae(vcpu)) 2248 else if (is_pae(vcpu))
2262 return paging32E_init_context(vcpu); 2249 r = paging32E_init_context(vcpu);
2263 else 2250 else
2264 return paging32_init_context(vcpu); 2251 r = paging32_init_context(vcpu);
2252
2253 vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level;
2254
2255 return r;
2265} 2256}
2266 2257
2267static int init_kvm_mmu(struct kvm_vcpu *vcpu) 2258static int init_kvm_mmu(struct kvm_vcpu *vcpu)
@@ -2492,7 +2483,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2492 index = kvm_page_table_hashfn(gfn); 2483 index = kvm_page_table_hashfn(gfn);
2493 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2484 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2494 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2485 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
2495 if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid) 2486 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
2496 continue; 2487 continue;
2497 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; 2488 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
2498 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2489 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -3130,7 +3121,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
3130 gfn_t gfn; 3121 gfn_t gfn;
3131 3122
3132 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { 3123 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3133 if (sp->role.metaphysical) 3124 if (sp->role.direct)
3134 continue; 3125 continue;
3135 3126
3136 gfn = unalias_gfn(vcpu->kvm, sp->gfn); 3127 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 258e5d56298e..eaab2145f62b 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -54,7 +54,7 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
54static inline int is_long_mode(struct kvm_vcpu *vcpu) 54static inline int is_long_mode(struct kvm_vcpu *vcpu)
55{ 55{
56#ifdef CONFIG_X86_64 56#ifdef CONFIG_X86_64
57 return vcpu->arch.shadow_efer & EFER_LME; 57 return vcpu->arch.shadow_efer & EFER_LMA;
58#else 58#else
59 return 0; 59 return 0;
60#endif 60#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9fd78b6e17ad..6bd70206c561 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -25,7 +25,6 @@
25#if PTTYPE == 64 25#if PTTYPE == 64
26 #define pt_element_t u64 26 #define pt_element_t u64
27 #define guest_walker guest_walker64 27 #define guest_walker guest_walker64
28 #define shadow_walker shadow_walker64
29 #define FNAME(name) paging##64_##name 28 #define FNAME(name) paging##64_##name
30 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK 29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
31 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK 30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
@@ -42,7 +41,6 @@
42#elif PTTYPE == 32 41#elif PTTYPE == 32
43 #define pt_element_t u32 42 #define pt_element_t u32
44 #define guest_walker guest_walker32 43 #define guest_walker guest_walker32
45 #define shadow_walker shadow_walker32
46 #define FNAME(name) paging##32_##name 44 #define FNAME(name) paging##32_##name
47 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK 45 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
48 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK 46 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
@@ -73,18 +71,6 @@ struct guest_walker {
73 u32 error_code; 71 u32 error_code;
74}; 72};
75 73
76struct shadow_walker {
77 struct kvm_shadow_walk walker;
78 struct guest_walker *guest_walker;
79 int user_fault;
80 int write_fault;
81 int largepage;
82 int *ptwrite;
83 pfn_t pfn;
84 u64 *sptep;
85 gpa_t pte_gpa;
86};
87
88static gfn_t gpte_to_gfn(pt_element_t gpte) 74static gfn_t gpte_to_gfn(pt_element_t gpte)
89{ 75{
90 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; 76 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -283,91 +269,79 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
283/* 269/*
284 * Fetch a shadow pte for a specific level in the paging hierarchy. 270 * Fetch a shadow pte for a specific level in the paging hierarchy.
285 */ 271 */
286static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, 272static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
287 struct kvm_vcpu *vcpu, u64 addr, 273 struct guest_walker *gw,
288 u64 *sptep, int level) 274 int user_fault, int write_fault, int largepage,
275 int *ptwrite, pfn_t pfn)
289{ 276{
290 struct shadow_walker *sw =
291 container_of(_sw, struct shadow_walker, walker);
292 struct guest_walker *gw = sw->guest_walker;
293 unsigned access = gw->pt_access; 277 unsigned access = gw->pt_access;
294 struct kvm_mmu_page *shadow_page; 278 struct kvm_mmu_page *shadow_page;
295 u64 spte; 279 u64 spte, *sptep;
296 int metaphysical; 280 int direct;
297 gfn_t table_gfn; 281 gfn_t table_gfn;
298 int r; 282 int r;
283 int level;
299 pt_element_t curr_pte; 284 pt_element_t curr_pte;
285 struct kvm_shadow_walk_iterator iterator;
300 286
301 if (level == PT_PAGE_TABLE_LEVEL 287 if (!is_present_pte(gw->ptes[gw->level - 1]))
302 || (sw->largepage && level == PT_DIRECTORY_LEVEL)) { 288 return NULL;
303 mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
304 sw->user_fault, sw->write_fault,
305 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
306 sw->ptwrite, sw->largepage,
307 gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
308 gw->gfn, sw->pfn, false);
309 sw->sptep = sptep;
310 return 1;
311 }
312 289
313 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) 290 for_each_shadow_entry(vcpu, addr, iterator) {
314 return 0; 291 level = iterator.level;
292 sptep = iterator.sptep;
293 if (level == PT_PAGE_TABLE_LEVEL
294 || (largepage && level == PT_DIRECTORY_LEVEL)) {
295 mmu_set_spte(vcpu, sptep, access,
296 gw->pte_access & access,
297 user_fault, write_fault,
298 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
299 ptwrite, largepage,
300 gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
301 gw->gfn, pfn, false);
302 break;
303 }
315 304
316 if (is_large_pte(*sptep)) { 305 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
317 set_shadow_pte(sptep, shadow_trap_nonpresent_pte); 306 continue;
318 kvm_flush_remote_tlbs(vcpu->kvm);
319 rmap_remove(vcpu->kvm, sptep);
320 }
321 307
322 if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { 308 if (is_large_pte(*sptep)) {
323 metaphysical = 1; 309 rmap_remove(vcpu->kvm, sptep);
324 if (!is_dirty_pte(gw->ptes[level - 1])) 310 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
325 access &= ~ACC_WRITE_MASK; 311 kvm_flush_remote_tlbs(vcpu->kvm);
326 table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
327 } else {
328 metaphysical = 0;
329 table_gfn = gw->table_gfn[level - 2];
330 }
331 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
332 metaphysical, access, sptep);
333 if (!metaphysical) {
334 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
335 &curr_pte, sizeof(curr_pte));
336 if (r || curr_pte != gw->ptes[level - 2]) {
337 kvm_mmu_put_page(shadow_page, sptep);
338 kvm_release_pfn_clean(sw->pfn);
339 sw->sptep = NULL;
340 return 1;
341 } 312 }
342 }
343 313
344 spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK 314 if (level == PT_DIRECTORY_LEVEL
345 | PT_WRITABLE_MASK | PT_USER_MASK; 315 && gw->level == PT_DIRECTORY_LEVEL) {
346 *sptep = spte; 316 direct = 1;
347 return 0; 317 if (!is_dirty_pte(gw->ptes[level - 1]))
348} 318 access &= ~ACC_WRITE_MASK;
349 319 table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
350static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 320 } else {
351 struct guest_walker *guest_walker, 321 direct = 0;
352 int user_fault, int write_fault, int largepage, 322 table_gfn = gw->table_gfn[level - 2];
353 int *ptwrite, pfn_t pfn) 323 }
354{ 324 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
355 struct shadow_walker walker = { 325 direct, access, sptep);
356 .walker = { .entry = FNAME(shadow_walk_entry), }, 326 if (!direct) {
357 .guest_walker = guest_walker, 327 r = kvm_read_guest_atomic(vcpu->kvm,
358 .user_fault = user_fault, 328 gw->pte_gpa[level - 2],
359 .write_fault = write_fault, 329 &curr_pte, sizeof(curr_pte));
360 .largepage = largepage, 330 if (r || curr_pte != gw->ptes[level - 2]) {
361 .ptwrite = ptwrite, 331 kvm_mmu_put_page(shadow_page, sptep);
362 .pfn = pfn, 332 kvm_release_pfn_clean(pfn);
363 }; 333 sptep = NULL;
364 334 break;
365 if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1])) 335 }
366 return NULL; 336 }
367 337
368 walk_shadow(&walker.walker, vcpu, addr); 338 spte = __pa(shadow_page->spt)
339 | PT_PRESENT_MASK | PT_ACCESSED_MASK
340 | PT_WRITABLE_MASK | PT_USER_MASK;
341 *sptep = spte;
342 }
369 343
370 return walker.sptep; 344 return sptep;
371} 345}
372 346
373/* 347/*
@@ -465,54 +439,56 @@ out_unlock:
465 return 0; 439 return 0;
466} 440}
467 441
468static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, 442static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
469 struct kvm_vcpu *vcpu, u64 addr,
470 u64 *sptep, int level)
471{ 443{
472 struct shadow_walker *sw = 444 struct kvm_shadow_walk_iterator iterator;
473 container_of(_sw, struct shadow_walker, walker); 445 pt_element_t gpte;
474 446 gpa_t pte_gpa = -1;
475 /* FIXME: properly handle invlpg on large guest pages */ 447 int level;
476 if (level == PT_PAGE_TABLE_LEVEL || 448 u64 *sptep;
477 ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { 449 int need_flush = 0;
478 struct kvm_mmu_page *sp = page_header(__pa(sptep));
479 450
480 sw->pte_gpa = (sp->gfn << PAGE_SHIFT); 451 spin_lock(&vcpu->kvm->mmu_lock);
481 sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
482 452
483 if (is_shadow_present_pte(*sptep)) { 453 for_each_shadow_entry(vcpu, gva, iterator) {
484 rmap_remove(vcpu->kvm, sptep); 454 level = iterator.level;
485 if (is_large_pte(*sptep)) 455 sptep = iterator.sptep;
486 --vcpu->kvm->stat.lpages; 456
457 /* FIXME: properly handle invlpg on large guest pages */
458 if (level == PT_PAGE_TABLE_LEVEL ||
459 ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
460 struct kvm_mmu_page *sp = page_header(__pa(sptep));
461
462 pte_gpa = (sp->gfn << PAGE_SHIFT);
463 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
464
465 if (is_shadow_present_pte(*sptep)) {
466 rmap_remove(vcpu->kvm, sptep);
467 if (is_large_pte(*sptep))
468 --vcpu->kvm->stat.lpages;
469 need_flush = 1;
470 }
471 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
472 break;
487 } 473 }
488 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
489 return 1;
490 }
491 if (!is_shadow_present_pte(*sptep))
492 return 1;
493 return 0;
494}
495 474
496static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 475 if (!is_shadow_present_pte(*sptep))
497{ 476 break;
498 pt_element_t gpte; 477 }
499 struct shadow_walker walker = {
500 .walker = { .entry = FNAME(shadow_invlpg_entry), },
501 .pte_gpa = -1,
502 };
503 478
504 spin_lock(&vcpu->kvm->mmu_lock); 479 if (need_flush)
505 walk_shadow(&walker.walker, vcpu, gva); 480 kvm_flush_remote_tlbs(vcpu->kvm);
506 spin_unlock(&vcpu->kvm->mmu_lock); 481 spin_unlock(&vcpu->kvm->mmu_lock);
507 if (walker.pte_gpa == -1) 482
483 if (pte_gpa == -1)
508 return; 484 return;
509 if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte, 485 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
510 sizeof(pt_element_t))) 486 sizeof(pt_element_t)))
511 return; 487 return;
512 if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { 488 if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
513 if (mmu_topup_memory_caches(vcpu)) 489 if (mmu_topup_memory_caches(vcpu))
514 return; 490 return;
515 kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte, 491 kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
516 sizeof(pt_element_t), 0); 492 sizeof(pt_element_t), 0);
517 } 493 }
518} 494}
@@ -540,7 +516,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
540 pt_element_t pt[256 / sizeof(pt_element_t)]; 516 pt_element_t pt[256 / sizeof(pt_element_t)];
541 gpa_t pte_gpa; 517 gpa_t pte_gpa;
542 518
543 if (sp->role.metaphysical 519 if (sp->role.direct
544 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { 520 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
545 nonpaging_prefetch_page(vcpu, sp); 521 nonpaging_prefetch_page(vcpu, sp);
546 return; 522 return;
@@ -619,7 +595,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
619 595
620#undef pt_element_t 596#undef pt_element_t
621#undef guest_walker 597#undef guest_walker
622#undef shadow_walker
623#undef FNAME 598#undef FNAME
624#undef PT_BASE_ADDR_MASK 599#undef PT_BASE_ADDR_MASK
625#undef PT_INDEX 600#undef PT_INDEX
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a9e769e4e251..1821c2078199 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -38,9 +38,6 @@ MODULE_LICENSE("GPL");
38#define IOPM_ALLOC_ORDER 2 38#define IOPM_ALLOC_ORDER 2
39#define MSRPM_ALLOC_ORDER 1 39#define MSRPM_ALLOC_ORDER 1
40 40
41#define DR7_GD_MASK (1 << 13)
42#define DR6_BD_MASK (1 << 13)
43
44#define SEG_TYPE_LDT 2 41#define SEG_TYPE_LDT 2
45#define SEG_TYPE_BUSY_TSS16 3 42#define SEG_TYPE_BUSY_TSS16 3
46 43
@@ -50,6 +47,15 @@ MODULE_LICENSE("GPL");
50 47
51#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 48#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
52 49
50/* Turn on to get debugging output*/
51/* #define NESTED_DEBUG */
52
53#ifdef NESTED_DEBUG
54#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
55#else
56#define nsvm_printk(fmt, args...) do {} while(0)
57#endif
58
53/* enable NPT for AMD64 and X86 with PAE */ 59/* enable NPT for AMD64 and X86 with PAE */
54#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 60#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
55static bool npt_enabled = true; 61static bool npt_enabled = true;
@@ -60,14 +66,29 @@ static int npt = 1;
60 66
61module_param(npt, int, S_IRUGO); 67module_param(npt, int, S_IRUGO);
62 68
69static int nested = 0;
70module_param(nested, int, S_IRUGO);
71
63static void kvm_reput_irq(struct vcpu_svm *svm); 72static void kvm_reput_irq(struct vcpu_svm *svm);
64static void svm_flush_tlb(struct kvm_vcpu *vcpu); 73static void svm_flush_tlb(struct kvm_vcpu *vcpu);
65 74
75static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
76static int nested_svm_vmexit(struct vcpu_svm *svm);
77static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
78 void *arg2, void *opaque);
79static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
80 bool has_error_code, u32 error_code);
81
66static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 82static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
67{ 83{
68 return container_of(vcpu, struct vcpu_svm, vcpu); 84 return container_of(vcpu, struct vcpu_svm, vcpu);
69} 85}
70 86
87static inline bool is_nested(struct vcpu_svm *svm)
88{
89 return svm->nested_vmcb;
90}
91
71static unsigned long iopm_base; 92static unsigned long iopm_base;
72 93
73struct kvm_ldttss_desc { 94struct kvm_ldttss_desc {
@@ -157,32 +178,6 @@ static inline void kvm_write_cr2(unsigned long val)
157 asm volatile ("mov %0, %%cr2" :: "r" (val)); 178 asm volatile ("mov %0, %%cr2" :: "r" (val));
158} 179}
159 180
160static inline unsigned long read_dr6(void)
161{
162 unsigned long dr6;
163
164 asm volatile ("mov %%dr6, %0" : "=r" (dr6));
165 return dr6;
166}
167
168static inline void write_dr6(unsigned long val)
169{
170 asm volatile ("mov %0, %%dr6" :: "r" (val));
171}
172
173static inline unsigned long read_dr7(void)
174{
175 unsigned long dr7;
176
177 asm volatile ("mov %%dr7, %0" : "=r" (dr7));
178 return dr7;
179}
180
181static inline void write_dr7(unsigned long val)
182{
183 asm volatile ("mov %0, %%dr7" :: "r" (val));
184}
185
186static inline void force_new_asid(struct kvm_vcpu *vcpu) 181static inline void force_new_asid(struct kvm_vcpu *vcpu)
187{ 182{
188 to_svm(vcpu)->asid_generation--; 183 to_svm(vcpu)->asid_generation--;
@@ -198,7 +193,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
198 if (!npt_enabled && !(efer & EFER_LMA)) 193 if (!npt_enabled && !(efer & EFER_LMA))
199 efer &= ~EFER_LME; 194 efer &= ~EFER_LME;
200 195
201 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; 196 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
202 vcpu->arch.shadow_efer = efer; 197 vcpu->arch.shadow_efer = efer;
203} 198}
204 199
@@ -207,6 +202,11 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
207{ 202{
208 struct vcpu_svm *svm = to_svm(vcpu); 203 struct vcpu_svm *svm = to_svm(vcpu);
209 204
205 /* If we are within a nested VM we'd better #VMEXIT and let the
206 guest handle the exception */
207 if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
208 return;
209
210 svm->vmcb->control.event_inj = nr 210 svm->vmcb->control.event_inj = nr
211 | SVM_EVTINJ_VALID 211 | SVM_EVTINJ_VALID
212 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) 212 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
@@ -242,7 +242,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
242 kvm_rip_write(vcpu, svm->next_rip); 242 kvm_rip_write(vcpu, svm->next_rip);
243 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 243 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
244 244
245 vcpu->arch.interrupt_window_open = 1; 245 vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK);
246} 246}
247 247
248static int has_svm(void) 248static int has_svm(void)
@@ -250,7 +250,7 @@ static int has_svm(void)
250 const char *msg; 250 const char *msg;
251 251
252 if (!cpu_has_svm(&msg)) { 252 if (!cpu_has_svm(&msg)) {
253 printk(KERN_INFO "has_svn: %s\n", msg); 253 printk(KERN_INFO "has_svm: %s\n", msg);
254 return 0; 254 return 0;
255 } 255 }
256 256
@@ -292,7 +292,7 @@ static void svm_hardware_enable(void *garbage)
292 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 292 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
293 293
294 rdmsrl(MSR_EFER, efer); 294 rdmsrl(MSR_EFER, efer);
295 wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK); 295 wrmsrl(MSR_EFER, efer | EFER_SVME);
296 296
297 wrmsrl(MSR_VM_HSAVE_PA, 297 wrmsrl(MSR_VM_HSAVE_PA,
298 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 298 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
@@ -417,6 +417,14 @@ static __init int svm_hardware_setup(void)
417 if (boot_cpu_has(X86_FEATURE_NX)) 417 if (boot_cpu_has(X86_FEATURE_NX))
418 kvm_enable_efer_bits(EFER_NX); 418 kvm_enable_efer_bits(EFER_NX);
419 419
420 if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
421 kvm_enable_efer_bits(EFER_FFXSR);
422
423 if (nested) {
424 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
425 kvm_enable_efer_bits(EFER_SVME);
426 }
427
420 for_each_online_cpu(cpu) { 428 for_each_online_cpu(cpu) {
421 r = svm_cpu_init(cpu); 429 r = svm_cpu_init(cpu);
422 if (r) 430 if (r)
@@ -559,7 +567,7 @@ static void init_vmcb(struct vcpu_svm *svm)
559 init_sys_seg(&save->ldtr, SEG_TYPE_LDT); 567 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
560 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 568 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
561 569
562 save->efer = MSR_EFER_SVME_MASK; 570 save->efer = EFER_SVME;
563 save->dr6 = 0xffff0ff0; 571 save->dr6 = 0xffff0ff0;
564 save->dr7 = 0x400; 572 save->dr7 = 0x400;
565 save->rflags = 2; 573 save->rflags = 2;
@@ -591,6 +599,9 @@ static void init_vmcb(struct vcpu_svm *svm)
591 save->cr4 = 0; 599 save->cr4 = 0;
592 } 600 }
593 force_new_asid(&svm->vcpu); 601 force_new_asid(&svm->vcpu);
602
603 svm->nested_vmcb = 0;
604 svm->vcpu.arch.hflags = HF_GIF_MASK;
594} 605}
595 606
596static int svm_vcpu_reset(struct kvm_vcpu *vcpu) 607static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -615,6 +626,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
615 struct vcpu_svm *svm; 626 struct vcpu_svm *svm;
616 struct page *page; 627 struct page *page;
617 struct page *msrpm_pages; 628 struct page *msrpm_pages;
629 struct page *hsave_page;
630 struct page *nested_msrpm_pages;
618 int err; 631 int err;
619 632
620 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 633 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
@@ -637,14 +650,25 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
637 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 650 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
638 if (!msrpm_pages) 651 if (!msrpm_pages)
639 goto uninit; 652 goto uninit;
653
654 nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
655 if (!nested_msrpm_pages)
656 goto uninit;
657
640 svm->msrpm = page_address(msrpm_pages); 658 svm->msrpm = page_address(msrpm_pages);
641 svm_vcpu_init_msrpm(svm->msrpm); 659 svm_vcpu_init_msrpm(svm->msrpm);
642 660
661 hsave_page = alloc_page(GFP_KERNEL);
662 if (!hsave_page)
663 goto uninit;
664 svm->hsave = page_address(hsave_page);
665
666 svm->nested_msrpm = page_address(nested_msrpm_pages);
667
643 svm->vmcb = page_address(page); 668 svm->vmcb = page_address(page);
644 clear_page(svm->vmcb); 669 clear_page(svm->vmcb);
645 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 670 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
646 svm->asid_generation = 0; 671 svm->asid_generation = 0;
647 memset(svm->db_regs, 0, sizeof(svm->db_regs));
648 init_vmcb(svm); 672 init_vmcb(svm);
649 673
650 fx_init(&svm->vcpu); 674 fx_init(&svm->vcpu);
@@ -669,6 +693,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
669 693
670 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); 694 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
671 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); 695 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
696 __free_page(virt_to_page(svm->hsave));
697 __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER);
672 kvm_vcpu_uninit(vcpu); 698 kvm_vcpu_uninit(vcpu);
673 kmem_cache_free(kvm_vcpu_cache, svm); 699 kmem_cache_free(kvm_vcpu_cache, svm);
674} 700}
@@ -718,6 +744,16 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
718 to_svm(vcpu)->vmcb->save.rflags = rflags; 744 to_svm(vcpu)->vmcb->save.rflags = rflags;
719} 745}
720 746
747static void svm_set_vintr(struct vcpu_svm *svm)
748{
749 svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
750}
751
752static void svm_clear_vintr(struct vcpu_svm *svm)
753{
754 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
755}
756
721static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 757static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
722{ 758{
723 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; 759 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
@@ -760,20 +796,37 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
760 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 796 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
761 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; 797 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
762 798
763 /* 799 switch (seg) {
764 * SVM always stores 0 for the 'G' bit in the CS selector in 800 case VCPU_SREG_CS:
765 * the VMCB on a VMEXIT. This hurts cross-vendor migration: 801 /*
766 * Intel's VMENTRY has a check on the 'G' bit. 802 * SVM always stores 0 for the 'G' bit in the CS selector in
767 */ 803 * the VMCB on a VMEXIT. This hurts cross-vendor migration:
768 if (seg == VCPU_SREG_CS) 804 * Intel's VMENTRY has a check on the 'G' bit.
805 */
769 var->g = s->limit > 0xfffff; 806 var->g = s->limit > 0xfffff;
770 807 break;
771 /* 808 case VCPU_SREG_TR:
772 * Work around a bug where the busy flag in the tr selector 809 /*
773 * isn't exposed 810 * Work around a bug where the busy flag in the tr selector
774 */ 811 * isn't exposed
775 if (seg == VCPU_SREG_TR) 812 */
776 var->type |= 0x2; 813 var->type |= 0x2;
814 break;
815 case VCPU_SREG_DS:
816 case VCPU_SREG_ES:
817 case VCPU_SREG_FS:
818 case VCPU_SREG_GS:
819 /*
820 * The accessed bit must always be set in the segment
821 * descriptor cache, although it can be cleared in the
822 * descriptor, the cached bit always remains at 1. Since
823 * Intel has a check on this, set it here to support
824 * cross-vendor migration.
825 */
826 if (!var->unusable)
827 var->type |= 0x1;
828 break;
829 }
777 830
778 var->unusable = !var->present; 831 var->unusable = !var->present;
779} 832}
@@ -905,9 +958,37 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
905 958
906} 959}
907 960
908static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) 961static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
909{ 962{
910 return -EOPNOTSUPP; 963 int old_debug = vcpu->guest_debug;
964 struct vcpu_svm *svm = to_svm(vcpu);
965
966 vcpu->guest_debug = dbg->control;
967
968 svm->vmcb->control.intercept_exceptions &=
969 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
970 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
971 if (vcpu->guest_debug &
972 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
973 svm->vmcb->control.intercept_exceptions |=
974 1 << DB_VECTOR;
975 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
976 svm->vmcb->control.intercept_exceptions |=
977 1 << BP_VECTOR;
978 } else
979 vcpu->guest_debug = 0;
980
981 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
982 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
983 else
984 svm->vmcb->save.dr7 = vcpu->arch.dr7;
985
986 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
987 svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
988 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
989 svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
990
991 return 0;
911} 992}
912 993
913static int svm_get_irq(struct kvm_vcpu *vcpu) 994static int svm_get_irq(struct kvm_vcpu *vcpu)
@@ -949,7 +1030,29 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
949 1030
950static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 1031static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
951{ 1032{
952 unsigned long val = to_svm(vcpu)->db_regs[dr]; 1033 struct vcpu_svm *svm = to_svm(vcpu);
1034 unsigned long val;
1035
1036 switch (dr) {
1037 case 0 ... 3:
1038 val = vcpu->arch.db[dr];
1039 break;
1040 case 6:
1041 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1042 val = vcpu->arch.dr6;
1043 else
1044 val = svm->vmcb->save.dr6;
1045 break;
1046 case 7:
1047 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1048 val = vcpu->arch.dr7;
1049 else
1050 val = svm->vmcb->save.dr7;
1051 break;
1052 default:
1053 val = 0;
1054 }
1055
953 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); 1056 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
954 return val; 1057 return val;
955} 1058}
@@ -959,33 +1062,40 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
959{ 1062{
960 struct vcpu_svm *svm = to_svm(vcpu); 1063 struct vcpu_svm *svm = to_svm(vcpu);
961 1064
962 *exception = 0; 1065 KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler);
963 1066
964 if (svm->vmcb->save.dr7 & DR7_GD_MASK) { 1067 *exception = 0;
965 svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
966 svm->vmcb->save.dr6 |= DR6_BD_MASK;
967 *exception = DB_VECTOR;
968 return;
969 }
970 1068
971 switch (dr) { 1069 switch (dr) {
972 case 0 ... 3: 1070 case 0 ... 3:
973 svm->db_regs[dr] = value; 1071 vcpu->arch.db[dr] = value;
1072 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1073 vcpu->arch.eff_db[dr] = value;
974 return; 1074 return;
975 case 4 ... 5: 1075 case 4 ... 5:
976 if (vcpu->arch.cr4 & X86_CR4_DE) { 1076 if (vcpu->arch.cr4 & X86_CR4_DE)
977 *exception = UD_VECTOR; 1077 *exception = UD_VECTOR;
1078 return;
1079 case 6:
1080 if (value & 0xffffffff00000000ULL) {
1081 *exception = GP_VECTOR;
978 return; 1082 return;
979 } 1083 }
980 case 7: { 1084 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
981 if (value & ~((1ULL << 32) - 1)) { 1085 return;
1086 case 7:
1087 if (value & 0xffffffff00000000ULL) {
982 *exception = GP_VECTOR; 1088 *exception = GP_VECTOR;
983 return; 1089 return;
984 } 1090 }
985 svm->vmcb->save.dr7 = value; 1091 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
1092 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1093 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1094 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
1095 }
986 return; 1096 return;
987 }
988 default: 1097 default:
1098 /* FIXME: Possible case? */
989 printk(KERN_DEBUG "%s: unexpected dr %u\n", 1099 printk(KERN_DEBUG "%s: unexpected dr %u\n",
990 __func__, dr); 1100 __func__, dr);
991 *exception = UD_VECTOR; 1101 *exception = UD_VECTOR;
@@ -1031,6 +1141,27 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1031 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1141 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1032} 1142}
1033 1143
1144static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1145{
1146 if (!(svm->vcpu.guest_debug &
1147 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
1148 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1149 return 1;
1150 }
1151 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1152 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1153 kvm_run->debug.arch.exception = DB_VECTOR;
1154 return 0;
1155}
1156
1157static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1158{
1159 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1160 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1161 kvm_run->debug.arch.exception = BP_VECTOR;
1162 return 0;
1163}
1164
1034static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1165static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1035{ 1166{
1036 int er; 1167 int er;
@@ -1080,7 +1211,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1080static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1211static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1081{ 1212{
1082 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1213 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1083 int size, down, in, string, rep; 1214 int size, in, string;
1084 unsigned port; 1215 unsigned port;
1085 1216
1086 ++svm->vcpu.stat.io_exits; 1217 ++svm->vcpu.stat.io_exits;
@@ -1099,8 +1230,6 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1099 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1230 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1100 port = io_info >> 16; 1231 port = io_info >> 16;
1101 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1232 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1102 rep = (io_info & SVM_IOIO_REP_MASK) != 0;
1103 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
1104 1233
1105 skip_emulated_instruction(&svm->vcpu); 1234 skip_emulated_instruction(&svm->vcpu);
1106 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1235 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
@@ -1139,6 +1268,567 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1139 return 1; 1268 return 1;
1140} 1269}
1141 1270
1271static int nested_svm_check_permissions(struct vcpu_svm *svm)
1272{
1273 if (!(svm->vcpu.arch.shadow_efer & EFER_SVME)
1274 || !is_paging(&svm->vcpu)) {
1275 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1276 return 1;
1277 }
1278
1279 if (svm->vmcb->save.cpl) {
1280 kvm_inject_gp(&svm->vcpu, 0);
1281 return 1;
1282 }
1283
1284 return 0;
1285}
1286
1287static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1288 bool has_error_code, u32 error_code)
1289{
1290 if (is_nested(svm)) {
1291 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
1292 svm->vmcb->control.exit_code_hi = 0;
1293 svm->vmcb->control.exit_info_1 = error_code;
1294 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1295 if (nested_svm_exit_handled(svm, false)) {
1296 nsvm_printk("VMexit -> EXCP 0x%x\n", nr);
1297
1298 nested_svm_vmexit(svm);
1299 return 1;
1300 }
1301 }
1302
1303 return 0;
1304}
1305
1306static inline int nested_svm_intr(struct vcpu_svm *svm)
1307{
1308 if (is_nested(svm)) {
1309 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1310 return 0;
1311
1312 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1313 return 0;
1314
1315 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1316
1317 if (nested_svm_exit_handled(svm, false)) {
1318 nsvm_printk("VMexit -> INTR\n");
1319 nested_svm_vmexit(svm);
1320 return 1;
1321 }
1322 }
1323
1324 return 0;
1325}
1326
1327static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
1328{
1329 struct page *page;
1330
1331 down_read(&current->mm->mmap_sem);
1332 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1333 up_read(&current->mm->mmap_sem);
1334
1335 if (is_error_page(page)) {
1336 printk(KERN_INFO "%s: could not find page at 0x%llx\n",
1337 __func__, gpa);
1338 kvm_release_page_clean(page);
1339 kvm_inject_gp(&svm->vcpu, 0);
1340 return NULL;
1341 }
1342 return page;
1343}
1344
1345static int nested_svm_do(struct vcpu_svm *svm,
1346 u64 arg1_gpa, u64 arg2_gpa, void *opaque,
1347 int (*handler)(struct vcpu_svm *svm,
1348 void *arg1,
1349 void *arg2,
1350 void *opaque))
1351{
1352 struct page *arg1_page;
1353 struct page *arg2_page = NULL;
1354 void *arg1;
1355 void *arg2 = NULL;
1356 int retval;
1357
1358 arg1_page = nested_svm_get_page(svm, arg1_gpa);
1359 if(arg1_page == NULL)
1360 return 1;
1361
1362 if (arg2_gpa) {
1363 arg2_page = nested_svm_get_page(svm, arg2_gpa);
1364 if(arg2_page == NULL) {
1365 kvm_release_page_clean(arg1_page);
1366 return 1;
1367 }
1368 }
1369
1370 arg1 = kmap_atomic(arg1_page, KM_USER0);
1371 if (arg2_gpa)
1372 arg2 = kmap_atomic(arg2_page, KM_USER1);
1373
1374 retval = handler(svm, arg1, arg2, opaque);
1375
1376 kunmap_atomic(arg1, KM_USER0);
1377 if (arg2_gpa)
1378 kunmap_atomic(arg2, KM_USER1);
1379
1380 kvm_release_page_dirty(arg1_page);
1381 if (arg2_gpa)
1382 kvm_release_page_dirty(arg2_page);
1383
1384 return retval;
1385}
1386
1387static int nested_svm_exit_handled_real(struct vcpu_svm *svm,
1388 void *arg1,
1389 void *arg2,
1390 void *opaque)
1391{
1392 struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1393 bool kvm_overrides = *(bool *)opaque;
1394 u32 exit_code = svm->vmcb->control.exit_code;
1395
1396 if (kvm_overrides) {
1397 switch (exit_code) {
1398 case SVM_EXIT_INTR:
1399 case SVM_EXIT_NMI:
1400 return 0;
1401 /* For now we are always handling NPFs when using them */
1402 case SVM_EXIT_NPF:
1403 if (npt_enabled)
1404 return 0;
1405 break;
1406 /* When we're shadowing, trap PFs */
1407 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1408 if (!npt_enabled)
1409 return 0;
1410 break;
1411 default:
1412 break;
1413 }
1414 }
1415
1416 switch (exit_code) {
1417 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
1418 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
1419 if (nested_vmcb->control.intercept_cr_read & cr_bits)
1420 return 1;
1421 break;
1422 }
1423 case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
1424 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
1425 if (nested_vmcb->control.intercept_cr_write & cr_bits)
1426 return 1;
1427 break;
1428 }
1429 case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
1430 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
1431 if (nested_vmcb->control.intercept_dr_read & dr_bits)
1432 return 1;
1433 break;
1434 }
1435 case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
1436 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
1437 if (nested_vmcb->control.intercept_dr_write & dr_bits)
1438 return 1;
1439 break;
1440 }
1441 case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
1442 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1443 if (nested_vmcb->control.intercept_exceptions & excp_bits)
1444 return 1;
1445 break;
1446 }
1447 default: {
1448 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1449 nsvm_printk("exit code: 0x%x\n", exit_code);
1450 if (nested_vmcb->control.intercept & exit_bits)
1451 return 1;
1452 }
1453 }
1454
1455 return 0;
1456}
1457
1458static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
1459 void *arg1, void *arg2,
1460 void *opaque)
1461{
1462 struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1463 u8 *msrpm = (u8 *)arg2;
1464 u32 t0, t1;
1465 u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1466 u32 param = svm->vmcb->control.exit_info_1 & 1;
1467
1468 if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1469 return 0;
1470
1471 switch(msr) {
1472 case 0 ... 0x1fff:
1473 t0 = (msr * 2) % 8;
1474 t1 = msr / 8;
1475 break;
1476 case 0xc0000000 ... 0xc0001fff:
1477 t0 = (8192 + msr - 0xc0000000) * 2;
1478 t1 = (t0 / 8);
1479 t0 %= 8;
1480 break;
1481 case 0xc0010000 ... 0xc0011fff:
1482 t0 = (16384 + msr - 0xc0010000) * 2;
1483 t1 = (t0 / 8);
1484 t0 %= 8;
1485 break;
1486 default:
1487 return 1;
1488 break;
1489 }
1490 if (msrpm[t1] & ((1 << param) << t0))
1491 return 1;
1492
1493 return 0;
1494}
1495
1496static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
1497{
1498 bool k = kvm_override;
1499
1500 switch (svm->vmcb->control.exit_code) {
1501 case SVM_EXIT_MSR:
1502 return nested_svm_do(svm, svm->nested_vmcb,
1503 svm->nested_vmcb_msrpm, NULL,
1504 nested_svm_exit_handled_msr);
1505 default: break;
1506 }
1507
1508 return nested_svm_do(svm, svm->nested_vmcb, 0, &k,
1509 nested_svm_exit_handled_real);
1510}
1511
1512static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
1513 void *arg2, void *opaque)
1514{
1515 struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1516 struct vmcb *hsave = svm->hsave;
1517 u64 nested_save[] = { nested_vmcb->save.cr0,
1518 nested_vmcb->save.cr3,
1519 nested_vmcb->save.cr4,
1520 nested_vmcb->save.efer,
1521 nested_vmcb->control.intercept_cr_read,
1522 nested_vmcb->control.intercept_cr_write,
1523 nested_vmcb->control.intercept_dr_read,
1524 nested_vmcb->control.intercept_dr_write,
1525 nested_vmcb->control.intercept_exceptions,
1526 nested_vmcb->control.intercept,
1527 nested_vmcb->control.msrpm_base_pa,
1528 nested_vmcb->control.iopm_base_pa,
1529 nested_vmcb->control.tsc_offset };
1530
1531 /* Give the current vmcb to the guest */
1532 memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb));
1533 nested_vmcb->save.cr0 = nested_save[0];
1534 if (!npt_enabled)
1535 nested_vmcb->save.cr3 = nested_save[1];
1536 nested_vmcb->save.cr4 = nested_save[2];
1537 nested_vmcb->save.efer = nested_save[3];
1538 nested_vmcb->control.intercept_cr_read = nested_save[4];
1539 nested_vmcb->control.intercept_cr_write = nested_save[5];
1540 nested_vmcb->control.intercept_dr_read = nested_save[6];
1541 nested_vmcb->control.intercept_dr_write = nested_save[7];
1542 nested_vmcb->control.intercept_exceptions = nested_save[8];
1543 nested_vmcb->control.intercept = nested_save[9];
1544 nested_vmcb->control.msrpm_base_pa = nested_save[10];
1545 nested_vmcb->control.iopm_base_pa = nested_save[11];
1546 nested_vmcb->control.tsc_offset = nested_save[12];
1547
1548 /* We always set V_INTR_MASKING and remember the old value in hflags */
1549 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1550 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
1551
1552 if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) &&
1553 (nested_vmcb->control.int_vector)) {
1554 nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n",
1555 nested_vmcb->control.int_vector);
1556 }
1557
1558 /* Restore the original control entries */
1559 svm->vmcb->control = hsave->control;
1560
1561 /* Kill any pending exceptions */
1562 if (svm->vcpu.arch.exception.pending == true)
1563 nsvm_printk("WARNING: Pending Exception\n");
1564 svm->vcpu.arch.exception.pending = false;
1565
1566 /* Restore selected save entries */
1567 svm->vmcb->save.es = hsave->save.es;
1568 svm->vmcb->save.cs = hsave->save.cs;
1569 svm->vmcb->save.ss = hsave->save.ss;
1570 svm->vmcb->save.ds = hsave->save.ds;
1571 svm->vmcb->save.gdtr = hsave->save.gdtr;
1572 svm->vmcb->save.idtr = hsave->save.idtr;
1573 svm->vmcb->save.rflags = hsave->save.rflags;
1574 svm_set_efer(&svm->vcpu, hsave->save.efer);
1575 svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
1576 svm_set_cr4(&svm->vcpu, hsave->save.cr4);
1577 if (npt_enabled) {
1578 svm->vmcb->save.cr3 = hsave->save.cr3;
1579 svm->vcpu.arch.cr3 = hsave->save.cr3;
1580 } else {
1581 kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
1582 }
1583 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
1584 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
1585 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
1586 svm->vmcb->save.dr7 = 0;
1587 svm->vmcb->save.cpl = 0;
1588 svm->vmcb->control.exit_int_info = 0;
1589
1590 svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
1591 /* Exit nested SVM mode */
1592 svm->nested_vmcb = 0;
1593
1594 return 0;
1595}
1596
1597static int nested_svm_vmexit(struct vcpu_svm *svm)
1598{
1599 nsvm_printk("VMexit\n");
1600 if (nested_svm_do(svm, svm->nested_vmcb, 0,
1601 NULL, nested_svm_vmexit_real))
1602 return 1;
1603
1604 kvm_mmu_reset_context(&svm->vcpu);
1605 kvm_mmu_load(&svm->vcpu);
1606
1607 return 0;
1608}
1609
1610static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1,
1611 void *arg2, void *opaque)
1612{
1613 int i;
1614 u32 *nested_msrpm = (u32*)arg1;
1615 for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
1616 svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
1617 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm);
1618
1619 return 0;
1620}
1621
1622static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
1623 void *arg2, void *opaque)
1624{
1625 struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1626 struct vmcb *hsave = svm->hsave;
1627
1628 /* nested_vmcb is our indicator if nested SVM is activated */
1629 svm->nested_vmcb = svm->vmcb->save.rax;
1630
1631 /* Clear internal status */
1632 svm->vcpu.arch.exception.pending = false;
1633
1634 /* Save the old vmcb, so we don't need to pick what we save, but
1635 can restore everything when a VMEXIT occurs */
1636 memcpy(hsave, svm->vmcb, sizeof(struct vmcb));
1637 /* We need to remember the original CR3 in the SPT case */
1638 if (!npt_enabled)
1639 hsave->save.cr3 = svm->vcpu.arch.cr3;
1640 hsave->save.cr4 = svm->vcpu.arch.cr4;
1641 hsave->save.rip = svm->next_rip;
1642
1643 if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
1644 svm->vcpu.arch.hflags |= HF_HIF_MASK;
1645 else
1646 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
1647
1648 /* Load the nested guest state */
1649 svm->vmcb->save.es = nested_vmcb->save.es;
1650 svm->vmcb->save.cs = nested_vmcb->save.cs;
1651 svm->vmcb->save.ss = nested_vmcb->save.ss;
1652 svm->vmcb->save.ds = nested_vmcb->save.ds;
1653 svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
1654 svm->vmcb->save.idtr = nested_vmcb->save.idtr;
1655 svm->vmcb->save.rflags = nested_vmcb->save.rflags;
1656 svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
1657 svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
1658 svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
1659 if (npt_enabled) {
1660 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
1661 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
1662 } else {
1663 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
1664 kvm_mmu_reset_context(&svm->vcpu);
1665 }
1666 svm->vmcb->save.cr2 = nested_vmcb->save.cr2;
1667 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
1668 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
1669 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
1670 /* In case we don't even reach vcpu_run, the fields are not updated */
1671 svm->vmcb->save.rax = nested_vmcb->save.rax;
1672 svm->vmcb->save.rsp = nested_vmcb->save.rsp;
1673 svm->vmcb->save.rip = nested_vmcb->save.rip;
1674 svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
1675 svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
1676 svm->vmcb->save.cpl = nested_vmcb->save.cpl;
1677
1678 /* We don't want a nested guest to be more powerful than the guest,
1679 so all intercepts are ORed */
1680 svm->vmcb->control.intercept_cr_read |=
1681 nested_vmcb->control.intercept_cr_read;
1682 svm->vmcb->control.intercept_cr_write |=
1683 nested_vmcb->control.intercept_cr_write;
1684 svm->vmcb->control.intercept_dr_read |=
1685 nested_vmcb->control.intercept_dr_read;
1686 svm->vmcb->control.intercept_dr_write |=
1687 nested_vmcb->control.intercept_dr_write;
1688 svm->vmcb->control.intercept_exceptions |=
1689 nested_vmcb->control.intercept_exceptions;
1690
1691 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
1692
1693 svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
1694
1695 force_new_asid(&svm->vcpu);
1696 svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
1697 svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
1698 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
1699 if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
1700 nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
1701 nested_vmcb->control.int_ctl);
1702 }
1703 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
1704 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
1705 else
1706 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1707
1708 nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
1709 nested_vmcb->control.exit_int_info,
1710 nested_vmcb->control.int_state);
1711
1712 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1713 svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1714 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1715 if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
1716 nsvm_printk("Injecting Event: 0x%x\n",
1717 nested_vmcb->control.event_inj);
1718 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1719 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1720
1721 svm->vcpu.arch.hflags |= HF_GIF_MASK;
1722
1723 return 0;
1724}
1725
1726static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1727{
1728 to_vmcb->save.fs = from_vmcb->save.fs;
1729 to_vmcb->save.gs = from_vmcb->save.gs;
1730 to_vmcb->save.tr = from_vmcb->save.tr;
1731 to_vmcb->save.ldtr = from_vmcb->save.ldtr;
1732 to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
1733 to_vmcb->save.star = from_vmcb->save.star;
1734 to_vmcb->save.lstar = from_vmcb->save.lstar;
1735 to_vmcb->save.cstar = from_vmcb->save.cstar;
1736 to_vmcb->save.sfmask = from_vmcb->save.sfmask;
1737 to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
1738 to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
1739 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1740
1741 return 1;
1742}
1743
1744static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb,
1745 void *arg2, void *opaque)
1746{
1747 return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb);
1748}
1749
1750static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
1751 void *arg2, void *opaque)
1752{
1753 return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb);
1754}
1755
1756static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1757{
1758 if (nested_svm_check_permissions(svm))
1759 return 1;
1760
1761 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1762 skip_emulated_instruction(&svm->vcpu);
1763
1764 nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload);
1765
1766 return 1;
1767}
1768
1769static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1770{
1771 if (nested_svm_check_permissions(svm))
1772 return 1;
1773
1774 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1775 skip_emulated_instruction(&svm->vcpu);
1776
1777 nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave);
1778
1779 return 1;
1780}
1781
1782static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1783{
1784 nsvm_printk("VMrun\n");
1785 if (nested_svm_check_permissions(svm))
1786 return 1;
1787
1788 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1789 skip_emulated_instruction(&svm->vcpu);
1790
1791 if (nested_svm_do(svm, svm->vmcb->save.rax, 0,
1792 NULL, nested_svm_vmrun))
1793 return 1;
1794
1795 if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0,
1796 NULL, nested_svm_vmrun_msrpm))
1797 return 1;
1798
1799 return 1;
1800}
1801
1802static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1803{
1804 if (nested_svm_check_permissions(svm))
1805 return 1;
1806
1807 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1808 skip_emulated_instruction(&svm->vcpu);
1809
1810 svm->vcpu.arch.hflags |= HF_GIF_MASK;
1811
1812 return 1;
1813}
1814
1815static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1816{
1817 if (nested_svm_check_permissions(svm))
1818 return 1;
1819
1820 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1821 skip_emulated_instruction(&svm->vcpu);
1822
1823 svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
1824
1825 /* After a CLGI no interrupts should come */
1826 svm_clear_vintr(svm);
1827 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1828
1829 return 1;
1830}
1831
1142static int invalid_op_interception(struct vcpu_svm *svm, 1832static int invalid_op_interception(struct vcpu_svm *svm,
1143 struct kvm_run *kvm_run) 1833 struct kvm_run *kvm_run)
1144{ 1834{
@@ -1250,6 +1940,15 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1250 case MSR_IA32_LASTINTTOIP: 1940 case MSR_IA32_LASTINTTOIP:
1251 *data = svm->vmcb->save.last_excp_to; 1941 *data = svm->vmcb->save.last_excp_to;
1252 break; 1942 break;
1943 case MSR_VM_HSAVE_PA:
1944 *data = svm->hsave_msr;
1945 break;
1946 case MSR_VM_CR:
1947 *data = 0;
1948 break;
1949 case MSR_IA32_UCODE_REV:
1950 *data = 0x01000065;
1951 break;
1253 default: 1952 default:
1254 return kvm_get_msr_common(vcpu, ecx, data); 1953 return kvm_get_msr_common(vcpu, ecx, data);
1255 } 1954 }
@@ -1344,6 +2043,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1344 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data); 2043 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
1345 2044
1346 break; 2045 break;
2046 case MSR_VM_HSAVE_PA:
2047 svm->hsave_msr = data;
2048 break;
1347 default: 2049 default:
1348 return kvm_set_msr_common(vcpu, ecx, data); 2050 return kvm_set_msr_common(vcpu, ecx, data);
1349 } 2051 }
@@ -1380,7 +2082,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
1380{ 2082{
1381 KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler); 2083 KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
1382 2084
1383 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 2085 svm_clear_vintr(svm);
1384 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2086 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1385 /* 2087 /*
1386 * If the user space waits to inject interrupts, exit as soon as 2088 * If the user space waits to inject interrupts, exit as soon as
@@ -1417,6 +2119,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1417 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 2119 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
1418 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 2120 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
1419 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 2121 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
2122 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2123 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
1420 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 2124 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
1421 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 2125 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1422 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 2126 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
@@ -1436,12 +2140,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1436 [SVM_EXIT_MSR] = msr_interception, 2140 [SVM_EXIT_MSR] = msr_interception,
1437 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 2141 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
1438 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 2142 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
1439 [SVM_EXIT_VMRUN] = invalid_op_interception, 2143 [SVM_EXIT_VMRUN] = vmrun_interception,
1440 [SVM_EXIT_VMMCALL] = vmmcall_interception, 2144 [SVM_EXIT_VMMCALL] = vmmcall_interception,
1441 [SVM_EXIT_VMLOAD] = invalid_op_interception, 2145 [SVM_EXIT_VMLOAD] = vmload_interception,
1442 [SVM_EXIT_VMSAVE] = invalid_op_interception, 2146 [SVM_EXIT_VMSAVE] = vmsave_interception,
1443 [SVM_EXIT_STGI] = invalid_op_interception, 2147 [SVM_EXIT_STGI] = stgi_interception,
1444 [SVM_EXIT_CLGI] = invalid_op_interception, 2148 [SVM_EXIT_CLGI] = clgi_interception,
1445 [SVM_EXIT_SKINIT] = invalid_op_interception, 2149 [SVM_EXIT_SKINIT] = invalid_op_interception,
1446 [SVM_EXIT_WBINVD] = emulate_on_interception, 2150 [SVM_EXIT_WBINVD] = emulate_on_interception,
1447 [SVM_EXIT_MONITOR] = invalid_op_interception, 2151 [SVM_EXIT_MONITOR] = invalid_op_interception,
@@ -1457,6 +2161,17 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1457 KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, 2161 KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
1458 (u32)((u64)svm->vmcb->save.rip >> 32), entryexit); 2162 (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
1459 2163
2164 if (is_nested(svm)) {
2165 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
2166 exit_code, svm->vmcb->control.exit_info_1,
2167 svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
2168 if (nested_svm_exit_handled(svm, true)) {
2169 nested_svm_vmexit(svm);
2170 nsvm_printk("-> #VMEXIT\n");
2171 return 1;
2172 }
2173 }
2174
1460 if (npt_enabled) { 2175 if (npt_enabled) {
1461 int mmu_reload = 0; 2176 int mmu_reload = 0;
1462 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { 2177 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
@@ -1544,6 +2259,8 @@ static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
1544{ 2259{
1545 struct vcpu_svm *svm = to_svm(vcpu); 2260 struct vcpu_svm *svm = to_svm(vcpu);
1546 2261
2262 nested_svm_intr(svm);
2263
1547 svm_inject_irq(svm, irq); 2264 svm_inject_irq(svm, irq);
1548} 2265}
1549 2266
@@ -1589,11 +2306,17 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
1589 if (!kvm_cpu_has_interrupt(vcpu)) 2306 if (!kvm_cpu_has_interrupt(vcpu))
1590 goto out; 2307 goto out;
1591 2308
2309 if (nested_svm_intr(svm))
2310 goto out;
2311
2312 if (!(svm->vcpu.arch.hflags & HF_GIF_MASK))
2313 goto out;
2314
1592 if (!(vmcb->save.rflags & X86_EFLAGS_IF) || 2315 if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
1593 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || 2316 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
1594 (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { 2317 (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
1595 /* unable to deliver irq, set pending irq */ 2318 /* unable to deliver irq, set pending irq */
1596 vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR); 2319 svm_set_vintr(svm);
1597 svm_inject_irq(svm, 0x0); 2320 svm_inject_irq(svm, 0x0);
1598 goto out; 2321 goto out;
1599 } 2322 }
@@ -1615,7 +2338,8 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
1615 } 2338 }
1616 2339
1617 svm->vcpu.arch.interrupt_window_open = 2340 svm->vcpu.arch.interrupt_window_open =
1618 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); 2341 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2342 (svm->vcpu.arch.hflags & HF_GIF_MASK);
1619} 2343}
1620 2344
1621static void svm_do_inject_vector(struct vcpu_svm *svm) 2345static void svm_do_inject_vector(struct vcpu_svm *svm)
@@ -1637,9 +2361,13 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1637 struct vcpu_svm *svm = to_svm(vcpu); 2361 struct vcpu_svm *svm = to_svm(vcpu);
1638 struct vmcb_control_area *control = &svm->vmcb->control; 2362 struct vmcb_control_area *control = &svm->vmcb->control;
1639 2363
2364 if (nested_svm_intr(svm))
2365 return;
2366
1640 svm->vcpu.arch.interrupt_window_open = 2367 svm->vcpu.arch.interrupt_window_open =
1641 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && 2368 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
1642 (svm->vmcb->save.rflags & X86_EFLAGS_IF)); 2369 (svm->vmcb->save.rflags & X86_EFLAGS_IF) &&
2370 (svm->vcpu.arch.hflags & HF_GIF_MASK));
1643 2371
1644 if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) 2372 if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
1645 /* 2373 /*
@@ -1652,9 +2380,9 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1652 */ 2380 */
1653 if (!svm->vcpu.arch.interrupt_window_open && 2381 if (!svm->vcpu.arch.interrupt_window_open &&
1654 (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) 2382 (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
1655 control->intercept |= 1ULL << INTERCEPT_VINTR; 2383 svm_set_vintr(svm);
1656 else 2384 else
1657 control->intercept &= ~(1ULL << INTERCEPT_VINTR); 2385 svm_clear_vintr(svm);
1658} 2386}
1659 2387
1660static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 2388static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -1662,22 +2390,6 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
1662 return 0; 2390 return 0;
1663} 2391}
1664 2392
1665static void save_db_regs(unsigned long *db_regs)
1666{
1667 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
1668 asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
1669 asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
1670 asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
1671}
1672
1673static void load_db_regs(unsigned long *db_regs)
1674{
1675 asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
1676 asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
1677 asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
1678 asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
1679}
1680
1681static void svm_flush_tlb(struct kvm_vcpu *vcpu) 2393static void svm_flush_tlb(struct kvm_vcpu *vcpu)
1682{ 2394{
1683 force_new_asid(vcpu); 2395 force_new_asid(vcpu);
@@ -1736,19 +2448,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1736 gs_selector = kvm_read_gs(); 2448 gs_selector = kvm_read_gs();
1737 ldt_selector = kvm_read_ldt(); 2449 ldt_selector = kvm_read_ldt();
1738 svm->host_cr2 = kvm_read_cr2(); 2450 svm->host_cr2 = kvm_read_cr2();
1739 svm->host_dr6 = read_dr6(); 2451 if (!is_nested(svm))
1740 svm->host_dr7 = read_dr7(); 2452 svm->vmcb->save.cr2 = vcpu->arch.cr2;
1741 svm->vmcb->save.cr2 = vcpu->arch.cr2;
1742 /* required for live migration with NPT */ 2453 /* required for live migration with NPT */
1743 if (npt_enabled) 2454 if (npt_enabled)
1744 svm->vmcb->save.cr3 = vcpu->arch.cr3; 2455 svm->vmcb->save.cr3 = vcpu->arch.cr3;
1745 2456
1746 if (svm->vmcb->save.dr7 & 0xff) {
1747 write_dr7(0);
1748 save_db_regs(svm->host_db_regs);
1749 load_db_regs(svm->db_regs);
1750 }
1751
1752 clgi(); 2457 clgi();
1753 2458
1754 local_irq_enable(); 2459 local_irq_enable();
@@ -1824,16 +2529,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1824#endif 2529#endif
1825 ); 2530 );
1826 2531
1827 if ((svm->vmcb->save.dr7 & 0xff))
1828 load_db_regs(svm->host_db_regs);
1829
1830 vcpu->arch.cr2 = svm->vmcb->save.cr2; 2532 vcpu->arch.cr2 = svm->vmcb->save.cr2;
1831 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 2533 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
1832 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 2534 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
1833 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 2535 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
1834 2536
1835 write_dr6(svm->host_dr6);
1836 write_dr7(svm->host_dr7);
1837 kvm_write_cr2(svm->host_cr2); 2537 kvm_write_cr2(svm->host_cr2);
1838 2538
1839 kvm_load_fs(fs_selector); 2539 kvm_load_fs(fs_selector);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7611af576829..bb481330716f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -91,6 +91,7 @@ struct vcpu_vmx {
91 } rmode; 91 } rmode;
92 int vpid; 92 int vpid;
93 bool emulation_required; 93 bool emulation_required;
94 enum emulation_result invalid_state_emulation_result;
94 95
95 /* Support for vnmi-less CPUs */ 96 /* Support for vnmi-less CPUs */
96 int soft_vnmi_blocked; 97 int soft_vnmi_blocked;
@@ -189,21 +190,21 @@ static inline int is_page_fault(u32 intr_info)
189{ 190{
190 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 191 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
191 INTR_INFO_VALID_MASK)) == 192 INTR_INFO_VALID_MASK)) ==
192 (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); 193 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
193} 194}
194 195
195static inline int is_no_device(u32 intr_info) 196static inline int is_no_device(u32 intr_info)
196{ 197{
197 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 198 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
198 INTR_INFO_VALID_MASK)) == 199 INTR_INFO_VALID_MASK)) ==
199 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); 200 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
200} 201}
201 202
202static inline int is_invalid_opcode(u32 intr_info) 203static inline int is_invalid_opcode(u32 intr_info)
203{ 204{
204 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 205 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
205 INTR_INFO_VALID_MASK)) == 206 INTR_INFO_VALID_MASK)) ==
206 (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); 207 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
207} 208}
208 209
209static inline int is_external_interrupt(u32 intr_info) 210static inline int is_external_interrupt(u32 intr_info)
@@ -480,8 +481,13 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
480 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); 481 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
481 if (!vcpu->fpu_active) 482 if (!vcpu->fpu_active)
482 eb |= 1u << NM_VECTOR; 483 eb |= 1u << NM_VECTOR;
483 if (vcpu->guest_debug.enabled) 484 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
484 eb |= 1u << DB_VECTOR; 485 if (vcpu->guest_debug &
486 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
487 eb |= 1u << DB_VECTOR;
488 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
489 eb |= 1u << BP_VECTOR;
490 }
485 if (vcpu->arch.rmode.active) 491 if (vcpu->arch.rmode.active)
486 eb = ~0; 492 eb = ~0;
487 if (vm_need_ept()) 493 if (vm_need_ept())
@@ -747,29 +753,33 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
747 bool has_error_code, u32 error_code) 753 bool has_error_code, u32 error_code)
748{ 754{
749 struct vcpu_vmx *vmx = to_vmx(vcpu); 755 struct vcpu_vmx *vmx = to_vmx(vcpu);
756 u32 intr_info = nr | INTR_INFO_VALID_MASK;
750 757
751 if (has_error_code) 758 if (has_error_code) {
752 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); 759 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
760 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
761 }
753 762
754 if (vcpu->arch.rmode.active) { 763 if (vcpu->arch.rmode.active) {
755 vmx->rmode.irq.pending = true; 764 vmx->rmode.irq.pending = true;
756 vmx->rmode.irq.vector = nr; 765 vmx->rmode.irq.vector = nr;
757 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 766 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
758 if (nr == BP_VECTOR) 767 if (nr == BP_VECTOR || nr == OF_VECTOR)
759 vmx->rmode.irq.rip++; 768 vmx->rmode.irq.rip++;
760 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 769 intr_info |= INTR_TYPE_SOFT_INTR;
761 nr | INTR_TYPE_SOFT_INTR 770 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
762 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
763 | INTR_INFO_VALID_MASK);
764 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 771 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
765 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); 772 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
766 return; 773 return;
767 } 774 }
768 775
769 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 776 if (nr == BP_VECTOR || nr == OF_VECTOR) {
770 nr | INTR_TYPE_EXCEPTION 777 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
771 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) 778 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
772 | INTR_INFO_VALID_MASK); 779 } else
780 intr_info |= INTR_TYPE_HARD_EXCEPTION;
781
782 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
773} 783}
774 784
775static bool vmx_exception_injected(struct kvm_vcpu *vcpu) 785static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
@@ -856,11 +866,8 @@ static u64 guest_read_tsc(void)
856 * writes 'guest_tsc' into guest's timestamp counter "register" 866 * writes 'guest_tsc' into guest's timestamp counter "register"
857 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc 867 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
858 */ 868 */
859static void guest_write_tsc(u64 guest_tsc) 869static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
860{ 870{
861 u64 host_tsc;
862
863 rdtscll(host_tsc);
864 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); 871 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
865} 872}
866 873
@@ -925,14 +932,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
925{ 932{
926 struct vcpu_vmx *vmx = to_vmx(vcpu); 933 struct vcpu_vmx *vmx = to_vmx(vcpu);
927 struct kvm_msr_entry *msr; 934 struct kvm_msr_entry *msr;
935 u64 host_tsc;
928 int ret = 0; 936 int ret = 0;
929 937
930 switch (msr_index) { 938 switch (msr_index) {
931#ifdef CONFIG_X86_64
932 case MSR_EFER: 939 case MSR_EFER:
933 vmx_load_host_state(vmx); 940 vmx_load_host_state(vmx);
934 ret = kvm_set_msr_common(vcpu, msr_index, data); 941 ret = kvm_set_msr_common(vcpu, msr_index, data);
935 break; 942 break;
943#ifdef CONFIG_X86_64
936 case MSR_FS_BASE: 944 case MSR_FS_BASE:
937 vmcs_writel(GUEST_FS_BASE, data); 945 vmcs_writel(GUEST_FS_BASE, data);
938 break; 946 break;
@@ -950,7 +958,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
950 vmcs_writel(GUEST_SYSENTER_ESP, data); 958 vmcs_writel(GUEST_SYSENTER_ESP, data);
951 break; 959 break;
952 case MSR_IA32_TIME_STAMP_COUNTER: 960 case MSR_IA32_TIME_STAMP_COUNTER:
953 guest_write_tsc(data); 961 rdtscll(host_tsc);
962 guest_write_tsc(data, host_tsc);
954 break; 963 break;
955 case MSR_P6_PERFCTR0: 964 case MSR_P6_PERFCTR0:
956 case MSR_P6_PERFCTR1: 965 case MSR_P6_PERFCTR1:
@@ -999,40 +1008,28 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
999 } 1008 }
1000} 1009}
1001 1010
1002static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) 1011static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1003{ 1012{
1004 unsigned long dr7 = 0x400; 1013 int old_debug = vcpu->guest_debug;
1005 int old_singlestep; 1014 unsigned long flags;
1006
1007 old_singlestep = vcpu->guest_debug.singlestep;
1008
1009 vcpu->guest_debug.enabled = dbg->enabled;
1010 if (vcpu->guest_debug.enabled) {
1011 int i;
1012 1015
1013 dr7 |= 0x200; /* exact */ 1016 vcpu->guest_debug = dbg->control;
1014 for (i = 0; i < 4; ++i) { 1017 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
1015 if (!dbg->breakpoints[i].enabled) 1018 vcpu->guest_debug = 0;
1016 continue;
1017 vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
1018 dr7 |= 2 << (i*2); /* global enable */
1019 dr7 |= 0 << (i*4+16); /* execution breakpoint */
1020 }
1021 1019
1022 vcpu->guest_debug.singlestep = dbg->singlestep; 1020 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1023 } else 1021 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
1024 vcpu->guest_debug.singlestep = 0; 1022 else
1025 1023 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
1026 if (old_singlestep && !vcpu->guest_debug.singlestep) {
1027 unsigned long flags;
1028 1024
1029 flags = vmcs_readl(GUEST_RFLAGS); 1025 flags = vmcs_readl(GUEST_RFLAGS);
1026 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
1027 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1028 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1030 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 1029 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1031 vmcs_writel(GUEST_RFLAGS, flags); 1030 vmcs_writel(GUEST_RFLAGS, flags);
1032 }
1033 1031
1034 update_exception_bitmap(vcpu); 1032 update_exception_bitmap(vcpu);
1035 vmcs_writel(GUEST_DR7, dr7);
1036 1033
1037 return 0; 1034 return 0;
1038} 1035}
@@ -1433,6 +1430,29 @@ continue_rmode:
1433 init_rmode(vcpu->kvm); 1430 init_rmode(vcpu->kvm);
1434} 1431}
1435 1432
1433static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1434{
1435 struct vcpu_vmx *vmx = to_vmx(vcpu);
1436 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1437
1438 vcpu->arch.shadow_efer = efer;
1439 if (!msr)
1440 return;
1441 if (efer & EFER_LMA) {
1442 vmcs_write32(VM_ENTRY_CONTROLS,
1443 vmcs_read32(VM_ENTRY_CONTROLS) |
1444 VM_ENTRY_IA32E_MODE);
1445 msr->data = efer;
1446 } else {
1447 vmcs_write32(VM_ENTRY_CONTROLS,
1448 vmcs_read32(VM_ENTRY_CONTROLS) &
1449 ~VM_ENTRY_IA32E_MODE);
1450
1451 msr->data = efer & ~EFER_LME;
1452 }
1453 setup_msrs(vmx);
1454}
1455
1436#ifdef CONFIG_X86_64 1456#ifdef CONFIG_X86_64
1437 1457
1438static void enter_lmode(struct kvm_vcpu *vcpu) 1458static void enter_lmode(struct kvm_vcpu *vcpu)
@@ -1447,13 +1467,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1447 (guest_tr_ar & ~AR_TYPE_MASK) 1467 (guest_tr_ar & ~AR_TYPE_MASK)
1448 | AR_TYPE_BUSY_64_TSS); 1468 | AR_TYPE_BUSY_64_TSS);
1449 } 1469 }
1450
1451 vcpu->arch.shadow_efer |= EFER_LMA; 1470 vcpu->arch.shadow_efer |= EFER_LMA;
1452 1471 vmx_set_efer(vcpu, vcpu->arch.shadow_efer);
1453 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
1454 vmcs_write32(VM_ENTRY_CONTROLS,
1455 vmcs_read32(VM_ENTRY_CONTROLS)
1456 | VM_ENTRY_IA32E_MODE);
1457} 1472}
1458 1473
1459static void exit_lmode(struct kvm_vcpu *vcpu) 1474static void exit_lmode(struct kvm_vcpu *vcpu)
@@ -1612,30 +1627,6 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1612 vmcs_writel(GUEST_CR4, hw_cr4); 1627 vmcs_writel(GUEST_CR4, hw_cr4);
1613} 1628}
1614 1629
1615static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1616{
1617 struct vcpu_vmx *vmx = to_vmx(vcpu);
1618 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1619
1620 vcpu->arch.shadow_efer = efer;
1621 if (!msr)
1622 return;
1623 if (efer & EFER_LMA) {
1624 vmcs_write32(VM_ENTRY_CONTROLS,
1625 vmcs_read32(VM_ENTRY_CONTROLS) |
1626 VM_ENTRY_IA32E_MODE);
1627 msr->data = efer;
1628
1629 } else {
1630 vmcs_write32(VM_ENTRY_CONTROLS,
1631 vmcs_read32(VM_ENTRY_CONTROLS) &
1632 ~VM_ENTRY_IA32E_MODE);
1633
1634 msr->data = efer & ~EFER_LME;
1635 }
1636 setup_msrs(vmx);
1637}
1638
1639static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 1630static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1640{ 1631{
1641 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 1632 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -1653,7 +1644,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
1653 var->limit = vmcs_read32(sf->limit); 1644 var->limit = vmcs_read32(sf->limit);
1654 var->selector = vmcs_read16(sf->selector); 1645 var->selector = vmcs_read16(sf->selector);
1655 ar = vmcs_read32(sf->ar_bytes); 1646 ar = vmcs_read32(sf->ar_bytes);
1656 if (ar & AR_UNUSABLE_MASK) 1647 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
1657 ar = 0; 1648 ar = 0;
1658 var->type = ar & 15; 1649 var->type = ar & 15;
1659 var->s = (ar >> 4) & 1; 1650 var->s = (ar >> 4) & 1;
@@ -1788,14 +1779,16 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu)
1788 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 1779 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1789 cs_rpl = cs.selector & SELECTOR_RPL_MASK; 1780 cs_rpl = cs.selector & SELECTOR_RPL_MASK;
1790 1781
1782 if (cs.unusable)
1783 return false;
1791 if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) 1784 if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
1792 return false; 1785 return false;
1793 if (!cs.s) 1786 if (!cs.s)
1794 return false; 1787 return false;
1795 if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) { 1788 if (cs.type & AR_TYPE_WRITEABLE_MASK) {
1796 if (cs.dpl > cs_rpl) 1789 if (cs.dpl > cs_rpl)
1797 return false; 1790 return false;
1798 } else if (cs.type & AR_TYPE_CODE_MASK) { 1791 } else {
1799 if (cs.dpl != cs_rpl) 1792 if (cs.dpl != cs_rpl)
1800 return false; 1793 return false;
1801 } 1794 }
@@ -1814,7 +1807,9 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu)
1814 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 1807 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1815 ss_rpl = ss.selector & SELECTOR_RPL_MASK; 1808 ss_rpl = ss.selector & SELECTOR_RPL_MASK;
1816 1809
1817 if ((ss.type != 3) || (ss.type != 7)) 1810 if (ss.unusable)
1811 return true;
1812 if (ss.type != 3 && ss.type != 7)
1818 return false; 1813 return false;
1819 if (!ss.s) 1814 if (!ss.s)
1820 return false; 1815 return false;
@@ -1834,6 +1829,8 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
1834 vmx_get_segment(vcpu, &var, seg); 1829 vmx_get_segment(vcpu, &var, seg);
1835 rpl = var.selector & SELECTOR_RPL_MASK; 1830 rpl = var.selector & SELECTOR_RPL_MASK;
1836 1831
1832 if (var.unusable)
1833 return true;
1837 if (!var.s) 1834 if (!var.s)
1838 return false; 1835 return false;
1839 if (!var.present) 1836 if (!var.present)
@@ -1855,9 +1852,11 @@ static bool tr_valid(struct kvm_vcpu *vcpu)
1855 1852
1856 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 1853 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
1857 1854
1855 if (tr.unusable)
1856 return false;
1858 if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ 1857 if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
1859 return false; 1858 return false;
1860 if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */ 1859 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
1861 return false; 1860 return false;
1862 if (!tr.present) 1861 if (!tr.present)
1863 return false; 1862 return false;
@@ -1871,6 +1870,8 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu)
1871 1870
1872 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 1871 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
1873 1872
1873 if (ldtr.unusable)
1874 return true;
1874 if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ 1875 if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
1875 return false; 1876 return false;
1876 if (ldtr.type != 2) 1877 if (ldtr.type != 2)
@@ -2112,7 +2113,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2112{ 2113{
2113 u32 host_sysenter_cs, msr_low, msr_high; 2114 u32 host_sysenter_cs, msr_low, msr_high;
2114 u32 junk; 2115 u32 junk;
2115 u64 host_pat; 2116 u64 host_pat, tsc_this, tsc_base;
2116 unsigned long a; 2117 unsigned long a;
2117 struct descriptor_table dt; 2118 struct descriptor_table dt;
2118 int i; 2119 int i;
@@ -2240,6 +2241,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2240 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 2241 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2241 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); 2242 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
2242 2243
2244 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
2245 rdtscll(tsc_this);
2246 if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
2247 tsc_base = tsc_this;
2248
2249 guest_write_tsc(0, tsc_base);
2243 2250
2244 return 0; 2251 return 0;
2245} 2252}
@@ -2319,7 +2326,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2319 kvm_rip_write(vcpu, 0); 2326 kvm_rip_write(vcpu, 0);
2320 kvm_register_write(vcpu, VCPU_REGS_RSP, 0); 2327 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
2321 2328
2322 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
2323 vmcs_writel(GUEST_DR7, 0x400); 2329 vmcs_writel(GUEST_DR7, 0x400);
2324 2330
2325 vmcs_writel(GUEST_GDTR_BASE, 0); 2331 vmcs_writel(GUEST_GDTR_BASE, 0);
@@ -2332,8 +2338,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2332 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 2338 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
2333 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); 2339 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
2334 2340
2335 guest_write_tsc(0);
2336
2337 /* Special registers */ 2341 /* Special registers */
2338 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 2342 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
2339 2343
@@ -2486,6 +2490,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
2486{ 2490{
2487 vmx_update_window_states(vcpu); 2491 vmx_update_window_states(vcpu);
2488 2492
2493 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
2494 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2495 GUEST_INTR_STATE_STI |
2496 GUEST_INTR_STATE_MOV_SS);
2497
2489 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { 2498 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
2490 if (vcpu->arch.interrupt.pending) { 2499 if (vcpu->arch.interrupt.pending) {
2491 enable_nmi_window(vcpu); 2500 enable_nmi_window(vcpu);
@@ -2536,24 +2545,6 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2536 return 0; 2545 return 0;
2537} 2546}
2538 2547
2539static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
2540{
2541 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
2542
2543 set_debugreg(dbg->bp[0], 0);
2544 set_debugreg(dbg->bp[1], 1);
2545 set_debugreg(dbg->bp[2], 2);
2546 set_debugreg(dbg->bp[3], 3);
2547
2548 if (dbg->singlestep) {
2549 unsigned long flags;
2550
2551 flags = vmcs_readl(GUEST_RFLAGS);
2552 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
2553 vmcs_writel(GUEST_RFLAGS, flags);
2554 }
2555}
2556
2557static int handle_rmode_exception(struct kvm_vcpu *vcpu, 2548static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2558 int vec, u32 err_code) 2549 int vec, u32 err_code)
2559{ 2550{
@@ -2570,9 +2561,17 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2570 * the required debugging infrastructure rework. 2561 * the required debugging infrastructure rework.
2571 */ 2562 */
2572 switch (vec) { 2563 switch (vec) {
2573 case DE_VECTOR:
2574 case DB_VECTOR: 2564 case DB_VECTOR:
2565 if (vcpu->guest_debug &
2566 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
2567 return 0;
2568 kvm_queue_exception(vcpu, vec);
2569 return 1;
2575 case BP_VECTOR: 2570 case BP_VECTOR:
2571 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2572 return 0;
2573 /* fall through */
2574 case DE_VECTOR:
2576 case OF_VECTOR: 2575 case OF_VECTOR:
2577 case BR_VECTOR: 2576 case BR_VECTOR:
2578 case UD_VECTOR: 2577 case UD_VECTOR:
@@ -2589,8 +2588,8 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2589static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2588static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2590{ 2589{
2591 struct vcpu_vmx *vmx = to_vmx(vcpu); 2590 struct vcpu_vmx *vmx = to_vmx(vcpu);
2592 u32 intr_info, error_code; 2591 u32 intr_info, ex_no, error_code;
2593 unsigned long cr2, rip; 2592 unsigned long cr2, rip, dr6;
2594 u32 vect_info; 2593 u32 vect_info;
2595 enum emulation_result er; 2594 enum emulation_result er;
2596 2595
@@ -2649,14 +2648,30 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2649 return 1; 2648 return 1;
2650 } 2649 }
2651 2650
2652 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == 2651 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
2653 (INTR_TYPE_EXCEPTION | 1)) { 2652 switch (ex_no) {
2653 case DB_VECTOR:
2654 dr6 = vmcs_readl(EXIT_QUALIFICATION);
2655 if (!(vcpu->guest_debug &
2656 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
2657 vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
2658 kvm_queue_exception(vcpu, DB_VECTOR);
2659 return 1;
2660 }
2661 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
2662 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
2663 /* fall through */
2664 case BP_VECTOR:
2654 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2665 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2655 return 0; 2666 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
2667 kvm_run->debug.arch.exception = ex_no;
2668 break;
2669 default:
2670 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
2671 kvm_run->ex.exception = ex_no;
2672 kvm_run->ex.error_code = error_code;
2673 break;
2656 } 2674 }
2657 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
2658 kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
2659 kvm_run->ex.error_code = error_code;
2660 return 0; 2675 return 0;
2661} 2676}
2662 2677
@@ -2677,7 +2692,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2677static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2692static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2678{ 2693{
2679 unsigned long exit_qualification; 2694 unsigned long exit_qualification;
2680 int size, down, in, string, rep; 2695 int size, in, string;
2681 unsigned port; 2696 unsigned port;
2682 2697
2683 ++vcpu->stat.io_exits; 2698 ++vcpu->stat.io_exits;
@@ -2693,8 +2708,6 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2693 2708
2694 size = (exit_qualification & 7) + 1; 2709 size = (exit_qualification & 7) + 1;
2695 in = (exit_qualification & 8) != 0; 2710 in = (exit_qualification & 8) != 0;
2696 down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
2697 rep = (exit_qualification & 32) != 0;
2698 port = exit_qualification >> 16; 2711 port = exit_qualification >> 16;
2699 2712
2700 skip_emulated_instruction(vcpu); 2713 skip_emulated_instruction(vcpu);
@@ -2795,21 +2808,44 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2795 unsigned long val; 2808 unsigned long val;
2796 int dr, reg; 2809 int dr, reg;
2797 2810
2798 /* 2811 dr = vmcs_readl(GUEST_DR7);
2799 * FIXME: this code assumes the host is debugging the guest. 2812 if (dr & DR7_GD) {
2800 * need to deal with guest debugging itself too. 2813 /*
2801 */ 2814 * As the vm-exit takes precedence over the debug trap, we
2815 * need to emulate the latter, either for the host or the
2816 * guest debugging itself.
2817 */
2818 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
2819 kvm_run->debug.arch.dr6 = vcpu->arch.dr6;
2820 kvm_run->debug.arch.dr7 = dr;
2821 kvm_run->debug.arch.pc =
2822 vmcs_readl(GUEST_CS_BASE) +
2823 vmcs_readl(GUEST_RIP);
2824 kvm_run->debug.arch.exception = DB_VECTOR;
2825 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2826 return 0;
2827 } else {
2828 vcpu->arch.dr7 &= ~DR7_GD;
2829 vcpu->arch.dr6 |= DR6_BD;
2830 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
2831 kvm_queue_exception(vcpu, DB_VECTOR);
2832 return 1;
2833 }
2834 }
2835
2802 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 2836 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2803 dr = exit_qualification & 7; 2837 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
2804 reg = (exit_qualification >> 8) & 15; 2838 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
2805 if (exit_qualification & 16) { 2839 if (exit_qualification & TYPE_MOV_FROM_DR) {
2806 /* mov from dr */
2807 switch (dr) { 2840 switch (dr) {
2841 case 0 ... 3:
2842 val = vcpu->arch.db[dr];
2843 break;
2808 case 6: 2844 case 6:
2809 val = 0xffff0ff0; 2845 val = vcpu->arch.dr6;
2810 break; 2846 break;
2811 case 7: 2847 case 7:
2812 val = 0x400; 2848 val = vcpu->arch.dr7;
2813 break; 2849 break;
2814 default: 2850 default:
2815 val = 0; 2851 val = 0;
@@ -2817,7 +2853,38 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2817 kvm_register_write(vcpu, reg, val); 2853 kvm_register_write(vcpu, reg, val);
2818 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); 2854 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
2819 } else { 2855 } else {
2820 /* mov to dr */ 2856 val = vcpu->arch.regs[reg];
2857 switch (dr) {
2858 case 0 ... 3:
2859 vcpu->arch.db[dr] = val;
2860 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
2861 vcpu->arch.eff_db[dr] = val;
2862 break;
2863 case 4 ... 5:
2864 if (vcpu->arch.cr4 & X86_CR4_DE)
2865 kvm_queue_exception(vcpu, UD_VECTOR);
2866 break;
2867 case 6:
2868 if (val & 0xffffffff00000000ULL) {
2869 kvm_queue_exception(vcpu, GP_VECTOR);
2870 break;
2871 }
2872 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
2873 break;
2874 case 7:
2875 if (val & 0xffffffff00000000ULL) {
2876 kvm_queue_exception(vcpu, GP_VECTOR);
2877 break;
2878 }
2879 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
2880 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
2881 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
2882 vcpu->arch.switch_db_regs =
2883 (val & DR7_BP_EN_MASK);
2884 }
2885 break;
2886 }
2887 KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler);
2821 } 2888 }
2822 skip_emulated_instruction(vcpu); 2889 skip_emulated_instruction(vcpu);
2823 return 1; 2890 return 1;
@@ -2968,17 +3035,25 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2968 } 3035 }
2969 tss_selector = exit_qualification; 3036 tss_selector = exit_qualification;
2970 3037
2971 return kvm_task_switch(vcpu, tss_selector, reason); 3038 if (!kvm_task_switch(vcpu, tss_selector, reason))
3039 return 0;
3040
3041 /* clear all local breakpoint enable flags */
3042 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
3043
3044 /*
3045 * TODO: What about debug traps on tss switch?
3046 * Are we supposed to inject them and update dr6?
3047 */
3048
3049 return 1;
2972} 3050}
2973 3051
2974static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3052static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2975{ 3053{
2976 u64 exit_qualification; 3054 u64 exit_qualification;
2977 enum emulation_result er;
2978 gpa_t gpa; 3055 gpa_t gpa;
2979 unsigned long hva;
2980 int gla_validity; 3056 int gla_validity;
2981 int r;
2982 3057
2983 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 3058 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2984 3059
@@ -3001,32 +3076,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3001 } 3076 }
3002 3077
3003 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 3078 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3004 hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT); 3079 return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
3005 if (!kvm_is_error_hva(hva)) {
3006 r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
3007 if (r < 0) {
3008 printk(KERN_ERR "EPT: Not enough memory!\n");
3009 return -ENOMEM;
3010 }
3011 return 1;
3012 } else {
3013 /* must be MMIO */
3014 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
3015
3016 if (er == EMULATE_FAIL) {
3017 printk(KERN_ERR
3018 "EPT: Fail to handle EPT violation vmexit!er is %d\n",
3019 er);
3020 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
3021 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
3022 (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
3023 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3024 (long unsigned int)exit_qualification);
3025 return -ENOTSUPP;
3026 } else if (er == EMULATE_DO_MMIO)
3027 return 0;
3028 }
3029 return 1;
3030} 3080}
3031 3081
3032static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3082static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -3046,7 +3096,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3046 struct kvm_run *kvm_run) 3096 struct kvm_run *kvm_run)
3047{ 3097{
3048 struct vcpu_vmx *vmx = to_vmx(vcpu); 3098 struct vcpu_vmx *vmx = to_vmx(vcpu);
3049 int err; 3099 enum emulation_result err = EMULATE_DONE;
3050 3100
3051 preempt_enable(); 3101 preempt_enable();
3052 local_irq_enable(); 3102 local_irq_enable();
@@ -3071,10 +3121,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3071 local_irq_disable(); 3121 local_irq_disable();
3072 preempt_disable(); 3122 preempt_disable();
3073 3123
3074 /* Guest state should be valid now except if we need to 3124 vmx->invalid_state_emulation_result = err;
3075 * emulate an MMIO */
3076 if (guest_state_valid(vcpu))
3077 vmx->emulation_required = 0;
3078} 3125}
3079 3126
3080/* 3127/*
@@ -3123,8 +3170,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3123 3170
3124 /* If we need to emulate an MMIO from handle_invalid_guest_state 3171 /* If we need to emulate an MMIO from handle_invalid_guest_state
3125 * we just return 0 */ 3172 * we just return 0 */
3126 if (vmx->emulation_required && emulate_invalid_guest_state) 3173 if (vmx->emulation_required && emulate_invalid_guest_state) {
3127 return 0; 3174 if (guest_state_valid(vcpu))
3175 vmx->emulation_required = 0;
3176 return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
3177 }
3128 3178
3129 /* Access CR3 don't cause VMExit in paging mode, so we need 3179 /* Access CR3 don't cause VMExit in paging mode, so we need
3130 * to sync with guest real CR3. */ 3180 * to sync with guest real CR3. */
@@ -3238,7 +3288,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3238 vmx->vcpu.arch.nmi_injected = false; 3288 vmx->vcpu.arch.nmi_injected = false;
3239 } 3289 }
3240 kvm_clear_exception_queue(&vmx->vcpu); 3290 kvm_clear_exception_queue(&vmx->vcpu);
3241 if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) { 3291 if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION ||
3292 type == INTR_TYPE_SOFT_EXCEPTION)) {
3242 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 3293 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3243 error = vmcs_read32(IDT_VECTORING_ERROR_CODE); 3294 error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
3244 kvm_queue_exception_e(&vmx->vcpu, vector, error); 3295 kvm_queue_exception_e(&vmx->vcpu, vector, error);
@@ -3259,6 +3310,11 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
3259 3310
3260 vmx_update_window_states(vcpu); 3311 vmx_update_window_states(vcpu);
3261 3312
3313 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3314 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3315 GUEST_INTR_STATE_STI |
3316 GUEST_INTR_STATE_MOV_SS);
3317
3262 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { 3318 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
3263 if (vcpu->arch.interrupt.pending) { 3319 if (vcpu->arch.interrupt.pending) {
3264 enable_nmi_window(vcpu); 3320 enable_nmi_window(vcpu);
@@ -3347,6 +3403,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3347 */ 3403 */
3348 vmcs_writel(HOST_CR0, read_cr0()); 3404 vmcs_writel(HOST_CR0, read_cr0());
3349 3405
3406 set_debugreg(vcpu->arch.dr6, 6);
3407
3350 asm( 3408 asm(
3351 /* Store host registers */ 3409 /* Store host registers */
3352 "push %%"R"dx; push %%"R"bp;" 3410 "push %%"R"dx; push %%"R"bp;"
@@ -3441,6 +3499,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3441 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 3499 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
3442 vcpu->arch.regs_dirty = 0; 3500 vcpu->arch.regs_dirty = 0;
3443 3501
3502 get_debugreg(vcpu->arch.dr6, 6);
3503
3444 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 3504 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3445 if (vmx->rmode.irq.pending) 3505 if (vmx->rmode.irq.pending)
3446 fixup_rmode_irq(vmx); 3506 fixup_rmode_irq(vmx);
@@ -3595,7 +3655,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
3595 .vcpu_put = vmx_vcpu_put, 3655 .vcpu_put = vmx_vcpu_put,
3596 3656
3597 .set_guest_debug = set_guest_debug, 3657 .set_guest_debug = set_guest_debug,
3598 .guest_debug_pre = kvm_guest_debug_pre,
3599 .get_msr = vmx_get_msr, 3658 .get_msr = vmx_get_msr,
3600 .set_msr = vmx_set_msr, 3659 .set_msr = vmx_set_msr,
3601 .get_segment_base = vmx_get_segment_base, 3660 .get_segment_base = vmx_get_segment_base,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 758b7a155ae9..8ca100a9ecac 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -36,6 +36,7 @@
36#include <linux/highmem.h> 36#include <linux/highmem.h>
37#include <linux/iommu.h> 37#include <linux/iommu.h>
38#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
39#include <linux/cpufreq.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41#include <asm/msr.h> 42#include <asm/msr.h>
@@ -69,6 +70,8 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
69 70
70static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 71static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
71 struct kvm_cpuid_entry2 __user *entries); 72 struct kvm_cpuid_entry2 __user *entries);
73struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
74 u32 function, u32 index);
72 75
73struct kvm_x86_ops *kvm_x86_ops; 76struct kvm_x86_ops *kvm_x86_ops;
74EXPORT_SYMBOL_GPL(kvm_x86_ops); 77EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -173,6 +176,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
173 u32 error_code) 176 u32 error_code)
174{ 177{
175 ++vcpu->stat.pf_guest; 178 ++vcpu->stat.pf_guest;
179
176 if (vcpu->arch.exception.pending) { 180 if (vcpu->arch.exception.pending) {
177 if (vcpu->arch.exception.nr == PF_VECTOR) { 181 if (vcpu->arch.exception.nr == PF_VECTOR) {
178 printk(KERN_DEBUG "kvm: inject_page_fault:" 182 printk(KERN_DEBUG "kvm: inject_page_fault:"
@@ -361,6 +365,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
361 } 365 }
362 kvm_x86_ops->set_cr4(vcpu, cr4); 366 kvm_x86_ops->set_cr4(vcpu, cr4);
363 vcpu->arch.cr4 = cr4; 367 vcpu->arch.cr4 = cr4;
368 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
364 kvm_mmu_sync_global(vcpu); 369 kvm_mmu_sync_global(vcpu);
365 kvm_mmu_reset_context(vcpu); 370 kvm_mmu_reset_context(vcpu);
366} 371}
@@ -442,6 +447,11 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
442} 447}
443EXPORT_SYMBOL_GPL(kvm_get_cr8); 448EXPORT_SYMBOL_GPL(kvm_get_cr8);
444 449
450static inline u32 bit(int bitno)
451{
452 return 1 << (bitno & 31);
453}
454
445/* 455/*
446 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 456 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
447 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 457 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
@@ -456,7 +466,7 @@ static u32 msrs_to_save[] = {
456 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 466 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
457#endif 467#endif
458 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 468 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
459 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT 469 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
460}; 470};
461 471
462static unsigned num_msrs_to_save; 472static unsigned num_msrs_to_save;
@@ -481,6 +491,28 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
481 return; 491 return;
482 } 492 }
483 493
494 if (efer & EFER_FFXSR) {
495 struct kvm_cpuid_entry2 *feat;
496
497 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
498 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
499 printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
500 kvm_inject_gp(vcpu, 0);
501 return;
502 }
503 }
504
505 if (efer & EFER_SVME) {
506 struct kvm_cpuid_entry2 *feat;
507
508 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
509 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
510 printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
511 kvm_inject_gp(vcpu, 0);
512 return;
513 }
514 }
515
484 kvm_x86_ops->set_efer(vcpu, efer); 516 kvm_x86_ops->set_efer(vcpu, efer);
485 517
486 efer &= ~EFER_LMA; 518 efer &= ~EFER_LMA;
@@ -586,6 +618,8 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
586 hv_clock->tsc_to_system_mul); 618 hv_clock->tsc_to_system_mul);
587} 619}
588 620
621static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
622
589static void kvm_write_guest_time(struct kvm_vcpu *v) 623static void kvm_write_guest_time(struct kvm_vcpu *v)
590{ 624{
591 struct timespec ts; 625 struct timespec ts;
@@ -596,9 +630,9 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
596 if ((!vcpu->time_page)) 630 if ((!vcpu->time_page))
597 return; 631 return;
598 632
599 if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { 633 if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
600 kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); 634 kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
601 vcpu->hv_clock_tsc_khz = tsc_khz; 635 vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
602 } 636 }
603 637
604 /* Keep irq disabled to prevent changes to the clock */ 638 /* Keep irq disabled to prevent changes to the clock */
@@ -629,6 +663,16 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
629 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 663 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
630} 664}
631 665
666static int kvm_request_guest_time_update(struct kvm_vcpu *v)
667{
668 struct kvm_vcpu_arch *vcpu = &v->arch;
669
670 if (!vcpu->time_page)
671 return 0;
672 set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
673 return 1;
674}
675
632static bool msr_mtrr_valid(unsigned msr) 676static bool msr_mtrr_valid(unsigned msr)
633{ 677{
634 switch (msr) { 678 switch (msr) {
@@ -722,6 +766,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
722 break; 766 break;
723 case MSR_IA32_UCODE_REV: 767 case MSR_IA32_UCODE_REV:
724 case MSR_IA32_UCODE_WRITE: 768 case MSR_IA32_UCODE_WRITE:
769 case MSR_VM_HSAVE_PA:
725 break; 770 break;
726 case 0x200 ... 0x2ff: 771 case 0x200 ... 0x2ff:
727 return set_msr_mtrr(vcpu, msr, data); 772 return set_msr_mtrr(vcpu, msr, data);
@@ -758,7 +803,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
758 vcpu->arch.time_page = NULL; 803 vcpu->arch.time_page = NULL;
759 } 804 }
760 805
761 kvm_write_guest_time(vcpu); 806 kvm_request_guest_time_update(vcpu);
762 break; 807 break;
763 } 808 }
764 default: 809 default:
@@ -843,6 +888,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
843 case MSR_IA32_LASTBRANCHTOIP: 888 case MSR_IA32_LASTBRANCHTOIP:
844 case MSR_IA32_LASTINTFROMIP: 889 case MSR_IA32_LASTINTFROMIP:
845 case MSR_IA32_LASTINTTOIP: 890 case MSR_IA32_LASTINTTOIP:
891 case MSR_VM_HSAVE_PA:
846 data = 0; 892 data = 0;
847 break; 893 break;
848 case MSR_MTRRcap: 894 case MSR_MTRRcap:
@@ -967,10 +1013,13 @@ int kvm_dev_ioctl_check_extension(long ext)
967 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 1013 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
968 case KVM_CAP_SET_TSS_ADDR: 1014 case KVM_CAP_SET_TSS_ADDR:
969 case KVM_CAP_EXT_CPUID: 1015 case KVM_CAP_EXT_CPUID:
1016 case KVM_CAP_CLOCKSOURCE:
970 case KVM_CAP_PIT: 1017 case KVM_CAP_PIT:
971 case KVM_CAP_NOP_IO_DELAY: 1018 case KVM_CAP_NOP_IO_DELAY:
972 case KVM_CAP_MP_STATE: 1019 case KVM_CAP_MP_STATE:
973 case KVM_CAP_SYNC_MMU: 1020 case KVM_CAP_SYNC_MMU:
1021 case KVM_CAP_REINJECT_CONTROL:
1022 case KVM_CAP_IRQ_INJECT_STATUS:
974 r = 1; 1023 r = 1;
975 break; 1024 break;
976 case KVM_CAP_COALESCED_MMIO: 1025 case KVM_CAP_COALESCED_MMIO:
@@ -991,9 +1040,6 @@ int kvm_dev_ioctl_check_extension(long ext)
991 case KVM_CAP_IOMMU: 1040 case KVM_CAP_IOMMU:
992 r = iommu_found(); 1041 r = iommu_found();
993 break; 1042 break;
994 case KVM_CAP_CLOCKSOURCE:
995 r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC);
996 break;
997 default: 1043 default:
998 r = 0; 1044 r = 0;
999 break; 1045 break;
@@ -1044,7 +1090,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
1044 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1090 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1045 goto out; 1091 goto out;
1046 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 1092 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
1047 cpuid_arg->entries); 1093 cpuid_arg->entries);
1048 if (r) 1094 if (r)
1049 goto out; 1095 goto out;
1050 1096
@@ -1064,7 +1110,7 @@ out:
1064void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1110void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1065{ 1111{
1066 kvm_x86_ops->vcpu_load(vcpu, cpu); 1112 kvm_x86_ops->vcpu_load(vcpu, cpu);
1067 kvm_write_guest_time(vcpu); 1113 kvm_request_guest_time_update(vcpu);
1068} 1114}
1069 1115
1070void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1116void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1142,8 +1188,8 @@ out:
1142} 1188}
1143 1189
1144static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, 1190static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1145 struct kvm_cpuid2 *cpuid, 1191 struct kvm_cpuid2 *cpuid,
1146 struct kvm_cpuid_entry2 __user *entries) 1192 struct kvm_cpuid_entry2 __user *entries)
1147{ 1193{
1148 int r; 1194 int r;
1149 1195
@@ -1162,8 +1208,8 @@ out:
1162} 1208}
1163 1209
1164static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, 1210static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1165 struct kvm_cpuid2 *cpuid, 1211 struct kvm_cpuid2 *cpuid,
1166 struct kvm_cpuid_entry2 __user *entries) 1212 struct kvm_cpuid_entry2 __user *entries)
1167{ 1213{
1168 int r; 1214 int r;
1169 1215
@@ -1172,7 +1218,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1172 goto out; 1218 goto out;
1173 r = -EFAULT; 1219 r = -EFAULT;
1174 if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 1220 if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1175 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 1221 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1176 goto out; 1222 goto out;
1177 return 0; 1223 return 0;
1178 1224
@@ -1181,18 +1227,13 @@ out:
1181 return r; 1227 return r;
1182} 1228}
1183 1229
1184static inline u32 bit(int bitno)
1185{
1186 return 1 << (bitno & 31);
1187}
1188
1189static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1230static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1190 u32 index) 1231 u32 index)
1191{ 1232{
1192 entry->function = function; 1233 entry->function = function;
1193 entry->index = index; 1234 entry->index = index;
1194 cpuid_count(entry->function, entry->index, 1235 cpuid_count(entry->function, entry->index,
1195 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); 1236 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1196 entry->flags = 0; 1237 entry->flags = 0;
1197} 1238}
1198 1239
@@ -1222,15 +1263,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1222#ifdef CONFIG_X86_64 1263#ifdef CONFIG_X86_64
1223 bit(X86_FEATURE_LM) | 1264 bit(X86_FEATURE_LM) |
1224#endif 1265#endif
1266 bit(X86_FEATURE_FXSR_OPT) |
1225 bit(X86_FEATURE_MMXEXT) | 1267 bit(X86_FEATURE_MMXEXT) |
1226 bit(X86_FEATURE_3DNOWEXT) | 1268 bit(X86_FEATURE_3DNOWEXT) |
1227 bit(X86_FEATURE_3DNOW); 1269 bit(X86_FEATURE_3DNOW);
1228 const u32 kvm_supported_word3_x86_features = 1270 const u32 kvm_supported_word3_x86_features =
1229 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); 1271 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
1230 const u32 kvm_supported_word6_x86_features = 1272 const u32 kvm_supported_word6_x86_features =
1231 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); 1273 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) |
1274 bit(X86_FEATURE_SVM);
1232 1275
1233 /* all func 2 cpuid_count() should be called on the same cpu */ 1276 /* all calls to cpuid_count() should be made on the same cpu */
1234 get_cpu(); 1277 get_cpu();
1235 do_cpuid_1_ent(entry, function, index); 1278 do_cpuid_1_ent(entry, function, index);
1236 ++*nent; 1279 ++*nent;
@@ -1304,7 +1347,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1304} 1347}
1305 1348
1306static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1349static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1307 struct kvm_cpuid_entry2 __user *entries) 1350 struct kvm_cpuid_entry2 __user *entries)
1308{ 1351{
1309 struct kvm_cpuid_entry2 *cpuid_entries; 1352 struct kvm_cpuid_entry2 *cpuid_entries;
1310 int limit, nent = 0, r = -E2BIG; 1353 int limit, nent = 0, r = -E2BIG;
@@ -1321,7 +1364,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1321 limit = cpuid_entries[0].eax; 1364 limit = cpuid_entries[0].eax;
1322 for (func = 1; func <= limit && nent < cpuid->nent; ++func) 1365 for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1323 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1366 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1324 &nent, cpuid->nent); 1367 &nent, cpuid->nent);
1325 r = -E2BIG; 1368 r = -E2BIG;
1326 if (nent >= cpuid->nent) 1369 if (nent >= cpuid->nent)
1327 goto out_free; 1370 goto out_free;
@@ -1330,10 +1373,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1330 limit = cpuid_entries[nent - 1].eax; 1373 limit = cpuid_entries[nent - 1].eax;
1331 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1374 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1332 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1375 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1333 &nent, cpuid->nent); 1376 &nent, cpuid->nent);
1334 r = -EFAULT; 1377 r = -EFAULT;
1335 if (copy_to_user(entries, cpuid_entries, 1378 if (copy_to_user(entries, cpuid_entries,
1336 nent * sizeof(struct kvm_cpuid_entry2))) 1379 nent * sizeof(struct kvm_cpuid_entry2)))
1337 goto out_free; 1380 goto out_free;
1338 cpuid->nent = nent; 1381 cpuid->nent = nent;
1339 r = 0; 1382 r = 0;
@@ -1477,7 +1520,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1477 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1520 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1478 goto out; 1521 goto out;
1479 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 1522 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1480 cpuid_arg->entries); 1523 cpuid_arg->entries);
1481 if (r) 1524 if (r)
1482 goto out; 1525 goto out;
1483 break; 1526 break;
@@ -1490,7 +1533,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1490 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 1533 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1491 goto out; 1534 goto out;
1492 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, 1535 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1493 cpuid_arg->entries); 1536 cpuid_arg->entries);
1494 if (r) 1537 if (r)
1495 goto out; 1538 goto out;
1496 r = -EFAULT; 1539 r = -EFAULT;
@@ -1710,6 +1753,15 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1710 return r; 1753 return r;
1711} 1754}
1712 1755
1756static int kvm_vm_ioctl_reinject(struct kvm *kvm,
1757 struct kvm_reinject_control *control)
1758{
1759 if (!kvm->arch.vpit)
1760 return -ENXIO;
1761 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
1762 return 0;
1763}
1764
1713/* 1765/*
1714 * Get (and clear) the dirty memory log for a memory slot. 1766 * Get (and clear) the dirty memory log for a memory slot.
1715 */ 1767 */
@@ -1807,13 +1859,26 @@ long kvm_arch_vm_ioctl(struct file *filp,
1807 } 1859 }
1808 } else 1860 } else
1809 goto out; 1861 goto out;
1862 r = kvm_setup_default_irq_routing(kvm);
1863 if (r) {
1864 kfree(kvm->arch.vpic);
1865 kfree(kvm->arch.vioapic);
1866 goto out;
1867 }
1810 break; 1868 break;
1811 case KVM_CREATE_PIT: 1869 case KVM_CREATE_PIT:
1870 mutex_lock(&kvm->lock);
1871 r = -EEXIST;
1872 if (kvm->arch.vpit)
1873 goto create_pit_unlock;
1812 r = -ENOMEM; 1874 r = -ENOMEM;
1813 kvm->arch.vpit = kvm_create_pit(kvm); 1875 kvm->arch.vpit = kvm_create_pit(kvm);
1814 if (kvm->arch.vpit) 1876 if (kvm->arch.vpit)
1815 r = 0; 1877 r = 0;
1878 create_pit_unlock:
1879 mutex_unlock(&kvm->lock);
1816 break; 1880 break;
1881 case KVM_IRQ_LINE_STATUS:
1817 case KVM_IRQ_LINE: { 1882 case KVM_IRQ_LINE: {
1818 struct kvm_irq_level irq_event; 1883 struct kvm_irq_level irq_event;
1819 1884
@@ -1821,10 +1886,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
1821 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 1886 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1822 goto out; 1887 goto out;
1823 if (irqchip_in_kernel(kvm)) { 1888 if (irqchip_in_kernel(kvm)) {
1889 __s32 status;
1824 mutex_lock(&kvm->lock); 1890 mutex_lock(&kvm->lock);
1825 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1891 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
1826 irq_event.irq, irq_event.level); 1892 irq_event.irq, irq_event.level);
1827 mutex_unlock(&kvm->lock); 1893 mutex_unlock(&kvm->lock);
1894 if (ioctl == KVM_IRQ_LINE_STATUS) {
1895 irq_event.status = status;
1896 if (copy_to_user(argp, &irq_event,
1897 sizeof irq_event))
1898 goto out;
1899 }
1828 r = 0; 1900 r = 0;
1829 } 1901 }
1830 break; 1902 break;
@@ -1907,6 +1979,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
1907 r = 0; 1979 r = 0;
1908 break; 1980 break;
1909 } 1981 }
1982 case KVM_REINJECT_CONTROL: {
1983 struct kvm_reinject_control control;
1984 r = -EFAULT;
1985 if (copy_from_user(&control, argp, sizeof(control)))
1986 goto out;
1987 r = kvm_vm_ioctl_reinject(kvm, &control);
1988 if (r)
1989 goto out;
1990 r = 0;
1991 break;
1992 }
1910 default: 1993 default:
1911 ; 1994 ;
1912 } 1995 }
@@ -1960,10 +2043,38 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1960 return dev; 2043 return dev;
1961} 2044}
1962 2045
1963int emulator_read_std(unsigned long addr, 2046static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
1964 void *val, 2047 struct kvm_vcpu *vcpu)
1965 unsigned int bytes, 2048{
1966 struct kvm_vcpu *vcpu) 2049 void *data = val;
2050 int r = X86EMUL_CONTINUE;
2051
2052 while (bytes) {
2053 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2054 unsigned offset = addr & (PAGE_SIZE-1);
2055 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2056 int ret;
2057
2058 if (gpa == UNMAPPED_GVA) {
2059 r = X86EMUL_PROPAGATE_FAULT;
2060 goto out;
2061 }
2062 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
2063 if (ret < 0) {
2064 r = X86EMUL_UNHANDLEABLE;
2065 goto out;
2066 }
2067
2068 bytes -= toread;
2069 data += toread;
2070 addr += toread;
2071 }
2072out:
2073 return r;
2074}
2075
2076static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2077 struct kvm_vcpu *vcpu)
1967{ 2078{
1968 void *data = val; 2079 void *data = val;
1969 int r = X86EMUL_CONTINUE; 2080 int r = X86EMUL_CONTINUE;
@@ -1971,27 +2082,27 @@ int emulator_read_std(unsigned long addr,
1971 while (bytes) { 2082 while (bytes) {
1972 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2083 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1973 unsigned offset = addr & (PAGE_SIZE-1); 2084 unsigned offset = addr & (PAGE_SIZE-1);
1974 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); 2085 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
1975 int ret; 2086 int ret;
1976 2087
1977 if (gpa == UNMAPPED_GVA) { 2088 if (gpa == UNMAPPED_GVA) {
1978 r = X86EMUL_PROPAGATE_FAULT; 2089 r = X86EMUL_PROPAGATE_FAULT;
1979 goto out; 2090 goto out;
1980 } 2091 }
1981 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); 2092 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
1982 if (ret < 0) { 2093 if (ret < 0) {
1983 r = X86EMUL_UNHANDLEABLE; 2094 r = X86EMUL_UNHANDLEABLE;
1984 goto out; 2095 goto out;
1985 } 2096 }
1986 2097
1987 bytes -= tocopy; 2098 bytes -= towrite;
1988 data += tocopy; 2099 data += towrite;
1989 addr += tocopy; 2100 addr += towrite;
1990 } 2101 }
1991out: 2102out:
1992 return r; 2103 return r;
1993} 2104}
1994EXPORT_SYMBOL_GPL(emulator_read_std); 2105
1995 2106
1996static int emulator_read_emulated(unsigned long addr, 2107static int emulator_read_emulated(unsigned long addr,
1997 void *val, 2108 void *val,
@@ -2013,8 +2124,8 @@ static int emulator_read_emulated(unsigned long addr,
2013 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 2124 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2014 goto mmio; 2125 goto mmio;
2015 2126
2016 if (emulator_read_std(addr, val, bytes, vcpu) 2127 if (kvm_read_guest_virt(addr, val, bytes, vcpu)
2017 == X86EMUL_CONTINUE) 2128 == X86EMUL_CONTINUE)
2018 return X86EMUL_CONTINUE; 2129 return X86EMUL_CONTINUE;
2019 if (gpa == UNMAPPED_GVA) 2130 if (gpa == UNMAPPED_GVA)
2020 return X86EMUL_PROPAGATE_FAULT; 2131 return X86EMUL_PROPAGATE_FAULT;
@@ -2217,7 +2328,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2217 2328
2218 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 2329 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2219 2330
2220 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); 2331 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
2221 2332
2222 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 2333 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2223 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 2334 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
@@ -2225,7 +2336,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2225EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 2336EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2226 2337
2227static struct x86_emulate_ops emulate_ops = { 2338static struct x86_emulate_ops emulate_ops = {
2228 .read_std = emulator_read_std, 2339 .read_std = kvm_read_guest_virt,
2229 .read_emulated = emulator_read_emulated, 2340 .read_emulated = emulator_read_emulated,
2230 .write_emulated = emulator_write_emulated, 2341 .write_emulated = emulator_write_emulated,
2231 .cmpxchg_emulated = emulator_cmpxchg_emulated, 2342 .cmpxchg_emulated = emulator_cmpxchg_emulated,
@@ -2327,40 +2438,19 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2327} 2438}
2328EXPORT_SYMBOL_GPL(emulate_instruction); 2439EXPORT_SYMBOL_GPL(emulate_instruction);
2329 2440
2330static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
2331{
2332 int i;
2333
2334 for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
2335 if (vcpu->arch.pio.guest_pages[i]) {
2336 kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
2337 vcpu->arch.pio.guest_pages[i] = NULL;
2338 }
2339}
2340
2341static int pio_copy_data(struct kvm_vcpu *vcpu) 2441static int pio_copy_data(struct kvm_vcpu *vcpu)
2342{ 2442{
2343 void *p = vcpu->arch.pio_data; 2443 void *p = vcpu->arch.pio_data;
2344 void *q; 2444 gva_t q = vcpu->arch.pio.guest_gva;
2345 unsigned bytes; 2445 unsigned bytes;
2346 int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; 2446 int ret;
2347 2447
2348 q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
2349 PAGE_KERNEL);
2350 if (!q) {
2351 free_pio_guest_pages(vcpu);
2352 return -ENOMEM;
2353 }
2354 q += vcpu->arch.pio.guest_page_offset;
2355 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 2448 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2356 if (vcpu->arch.pio.in) 2449 if (vcpu->arch.pio.in)
2357 memcpy(q, p, bytes); 2450 ret = kvm_write_guest_virt(q, p, bytes, vcpu);
2358 else 2451 else
2359 memcpy(p, q, bytes); 2452 ret = kvm_read_guest_virt(q, p, bytes, vcpu);
2360 q -= vcpu->arch.pio.guest_page_offset; 2453 return ret;
2361 vunmap(q);
2362 free_pio_guest_pages(vcpu);
2363 return 0;
2364} 2454}
2365 2455
2366int complete_pio(struct kvm_vcpu *vcpu) 2456int complete_pio(struct kvm_vcpu *vcpu)
@@ -2471,7 +2561,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2471 vcpu->arch.pio.in = in; 2561 vcpu->arch.pio.in = in;
2472 vcpu->arch.pio.string = 0; 2562 vcpu->arch.pio.string = 0;
2473 vcpu->arch.pio.down = 0; 2563 vcpu->arch.pio.down = 0;
2474 vcpu->arch.pio.guest_page_offset = 0;
2475 vcpu->arch.pio.rep = 0; 2564 vcpu->arch.pio.rep = 0;
2476 2565
2477 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2566 if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
@@ -2499,9 +2588,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2499 gva_t address, int rep, unsigned port) 2588 gva_t address, int rep, unsigned port)
2500{ 2589{
2501 unsigned now, in_page; 2590 unsigned now, in_page;
2502 int i, ret = 0; 2591 int ret = 0;
2503 int nr_pages = 1;
2504 struct page *page;
2505 struct kvm_io_device *pio_dev; 2592 struct kvm_io_device *pio_dev;
2506 2593
2507 vcpu->run->exit_reason = KVM_EXIT_IO; 2594 vcpu->run->exit_reason = KVM_EXIT_IO;
@@ -2513,7 +2600,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2513 vcpu->arch.pio.in = in; 2600 vcpu->arch.pio.in = in;
2514 vcpu->arch.pio.string = 1; 2601 vcpu->arch.pio.string = 1;
2515 vcpu->arch.pio.down = down; 2602 vcpu->arch.pio.down = down;
2516 vcpu->arch.pio.guest_page_offset = offset_in_page(address);
2517 vcpu->arch.pio.rep = rep; 2603 vcpu->arch.pio.rep = rep;
2518 2604
2519 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2605 if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
@@ -2533,15 +2619,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2533 else 2619 else
2534 in_page = offset_in_page(address) + size; 2620 in_page = offset_in_page(address) + size;
2535 now = min(count, (unsigned long)in_page / size); 2621 now = min(count, (unsigned long)in_page / size);
2536 if (!now) { 2622 if (!now)
2537 /*
2538 * String I/O straddles page boundary. Pin two guest pages
2539 * so that we satisfy atomicity constraints. Do just one
2540 * transaction to avoid complexity.
2541 */
2542 nr_pages = 2;
2543 now = 1; 2623 now = 1;
2544 }
2545 if (down) { 2624 if (down) {
2546 /* 2625 /*
2547 * String I/O in reverse. Yuck. Kill the guest, fix later. 2626 * String I/O in reverse. Yuck. Kill the guest, fix later.
@@ -2556,15 +2635,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2556 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) 2635 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2557 kvm_x86_ops->skip_emulated_instruction(vcpu); 2636 kvm_x86_ops->skip_emulated_instruction(vcpu);
2558 2637
2559 for (i = 0; i < nr_pages; ++i) { 2638 vcpu->arch.pio.guest_gva = address;
2560 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2561 vcpu->arch.pio.guest_pages[i] = page;
2562 if (!page) {
2563 kvm_inject_gp(vcpu, 0);
2564 free_pio_guest_pages(vcpu);
2565 return 1;
2566 }
2567 }
2568 2639
2569 pio_dev = vcpu_find_pio_dev(vcpu, port, 2640 pio_dev = vcpu_find_pio_dev(vcpu, port,
2570 vcpu->arch.pio.cur_count, 2641 vcpu->arch.pio.cur_count,
@@ -2572,7 +2643,11 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2572 if (!vcpu->arch.pio.in) { 2643 if (!vcpu->arch.pio.in) {
2573 /* string PIO write */ 2644 /* string PIO write */
2574 ret = pio_copy_data(vcpu); 2645 ret = pio_copy_data(vcpu);
2575 if (ret >= 0 && pio_dev) { 2646 if (ret == X86EMUL_PROPAGATE_FAULT) {
2647 kvm_inject_gp(vcpu, 0);
2648 return 1;
2649 }
2650 if (ret == 0 && pio_dev) {
2576 pio_string_write(pio_dev, vcpu); 2651 pio_string_write(pio_dev, vcpu);
2577 complete_pio(vcpu); 2652 complete_pio(vcpu);
2578 if (vcpu->arch.pio.count == 0) 2653 if (vcpu->arch.pio.count == 0)
@@ -2587,9 +2662,72 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2587} 2662}
2588EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 2663EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2589 2664
2665static void bounce_off(void *info)
2666{
2667 /* nothing */
2668}
2669
2670static unsigned int ref_freq;
2671static unsigned long tsc_khz_ref;
2672
2673static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
2674 void *data)
2675{
2676 struct cpufreq_freqs *freq = data;
2677 struct kvm *kvm;
2678 struct kvm_vcpu *vcpu;
2679 int i, send_ipi = 0;
2680
2681 if (!ref_freq)
2682 ref_freq = freq->old;
2683
2684 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
2685 return 0;
2686 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
2687 return 0;
2688 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
2689
2690 spin_lock(&kvm_lock);
2691 list_for_each_entry(kvm, &vm_list, vm_list) {
2692 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2693 vcpu = kvm->vcpus[i];
2694 if (!vcpu)
2695 continue;
2696 if (vcpu->cpu != freq->cpu)
2697 continue;
2698 if (!kvm_request_guest_time_update(vcpu))
2699 continue;
2700 if (vcpu->cpu != smp_processor_id())
2701 send_ipi++;
2702 }
2703 }
2704 spin_unlock(&kvm_lock);
2705
2706 if (freq->old < freq->new && send_ipi) {
2707 /*
2708 * We upscale the frequency. Must make the guest
2709 * doesn't see old kvmclock values while running with
2710 * the new frequency, otherwise we risk the guest sees
2711 * time go backwards.
2712 *
2713 * In case we update the frequency for another cpu
2714 * (which might be in guest context) send an interrupt
2715 * to kick the cpu out of guest context. Next time
2716 * guest context is entered kvmclock will be updated,
2717 * so the guest will not see stale values.
2718 */
2719 smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
2720 }
2721 return 0;
2722}
2723
2724static struct notifier_block kvmclock_cpufreq_notifier_block = {
2725 .notifier_call = kvmclock_cpufreq_notifier
2726};
2727
2590int kvm_arch_init(void *opaque) 2728int kvm_arch_init(void *opaque)
2591{ 2729{
2592 int r; 2730 int r, cpu;
2593 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 2731 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2594 2732
2595 if (kvm_x86_ops) { 2733 if (kvm_x86_ops) {
@@ -2620,6 +2758,15 @@ int kvm_arch_init(void *opaque)
2620 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2758 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2621 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2759 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2622 PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); 2760 PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
2761
2762 for_each_possible_cpu(cpu)
2763 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
2764 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2765 tsc_khz_ref = tsc_khz;
2766 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
2767 CPUFREQ_TRANSITION_NOTIFIER);
2768 }
2769
2623 return 0; 2770 return 0;
2624 2771
2625out: 2772out:
@@ -2827,25 +2974,20 @@ static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2827 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) 2974 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2828 return 0; 2975 return 0;
2829 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && 2976 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2830 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) 2977 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2831 return 0; 2978 return 0;
2832 return 1; 2979 return 1;
2833} 2980}
2834 2981
2835void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 2982struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
2983 u32 function, u32 index)
2836{ 2984{
2837 int i; 2985 int i;
2838 u32 function, index; 2986 struct kvm_cpuid_entry2 *best = NULL;
2839 struct kvm_cpuid_entry2 *e, *best;
2840 2987
2841 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
2842 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
2843 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
2844 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
2845 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
2846 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
2847 best = NULL;
2848 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 2988 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2989 struct kvm_cpuid_entry2 *e;
2990
2849 e = &vcpu->arch.cpuid_entries[i]; 2991 e = &vcpu->arch.cpuid_entries[i];
2850 if (is_matching_cpuid_entry(e, function, index)) { 2992 if (is_matching_cpuid_entry(e, function, index)) {
2851 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) 2993 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
@@ -2860,6 +3002,21 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2860 if (!best || e->function > best->function) 3002 if (!best || e->function > best->function)
2861 best = e; 3003 best = e;
2862 } 3004 }
3005 return best;
3006}
3007
3008void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3009{
3010 u32 function, index;
3011 struct kvm_cpuid_entry2 *best;
3012
3013 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
3014 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3015 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
3016 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
3017 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
3018 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
3019 best = kvm_find_cpuid_entry(vcpu, function, index);
2863 if (best) { 3020 if (best) {
2864 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 3021 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
2865 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 3022 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
@@ -2945,6 +3102,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2945 if (vcpu->requests) { 3102 if (vcpu->requests) {
2946 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 3103 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2947 __kvm_migrate_timers(vcpu); 3104 __kvm_migrate_timers(vcpu);
3105 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
3106 kvm_write_guest_time(vcpu);
2948 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 3107 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
2949 kvm_mmu_sync_roots(vcpu); 3108 kvm_mmu_sync_roots(vcpu);
2950 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 3109 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
@@ -2979,9 +3138,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2979 goto out; 3138 goto out;
2980 } 3139 }
2981 3140
2982 if (vcpu->guest_debug.enabled)
2983 kvm_x86_ops->guest_debug_pre(vcpu);
2984
2985 vcpu->guest_mode = 1; 3141 vcpu->guest_mode = 1;
2986 /* 3142 /*
2987 * Make sure that guest_mode assignment won't happen after 3143 * Make sure that guest_mode assignment won't happen after
@@ -3002,10 +3158,34 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3002 3158
3003 kvm_guest_enter(); 3159 kvm_guest_enter();
3004 3160
3161 get_debugreg(vcpu->arch.host_dr6, 6);
3162 get_debugreg(vcpu->arch.host_dr7, 7);
3163 if (unlikely(vcpu->arch.switch_db_regs)) {
3164 get_debugreg(vcpu->arch.host_db[0], 0);
3165 get_debugreg(vcpu->arch.host_db[1], 1);
3166 get_debugreg(vcpu->arch.host_db[2], 2);
3167 get_debugreg(vcpu->arch.host_db[3], 3);
3168
3169 set_debugreg(0, 7);
3170 set_debugreg(vcpu->arch.eff_db[0], 0);
3171 set_debugreg(vcpu->arch.eff_db[1], 1);
3172 set_debugreg(vcpu->arch.eff_db[2], 2);
3173 set_debugreg(vcpu->arch.eff_db[3], 3);
3174 }
3005 3175
3006 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 3176 KVMTRACE_0D(VMENTRY, vcpu, entryexit);
3007 kvm_x86_ops->run(vcpu, kvm_run); 3177 kvm_x86_ops->run(vcpu, kvm_run);
3008 3178
3179 if (unlikely(vcpu->arch.switch_db_regs)) {
3180 set_debugreg(0, 7);
3181 set_debugreg(vcpu->arch.host_db[0], 0);
3182 set_debugreg(vcpu->arch.host_db[1], 1);
3183 set_debugreg(vcpu->arch.host_db[2], 2);
3184 set_debugreg(vcpu->arch.host_db[3], 3);
3185 }
3186 set_debugreg(vcpu->arch.host_dr6, 6);
3187 set_debugreg(vcpu->arch.host_dr7, 7);
3188
3009 vcpu->guest_mode = 0; 3189 vcpu->guest_mode = 0;
3010 local_irq_enable(); 3190 local_irq_enable();
3011 3191
@@ -3192,7 +3372,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3192 /* 3372 /*
3193 * Don't leak debug flags in case they were set for guest debugging 3373 * Don't leak debug flags in case they were set for guest debugging
3194 */ 3374 */
3195 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) 3375 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3196 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 3376 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3197 3377
3198 vcpu_put(vcpu); 3378 vcpu_put(vcpu);
@@ -3811,15 +3991,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3811 return 0; 3991 return 0;
3812} 3992}
3813 3993
3814int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, 3994int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
3815 struct kvm_debug_guest *dbg) 3995 struct kvm_guest_debug *dbg)
3816{ 3996{
3817 int r; 3997 int i, r;
3818 3998
3819 vcpu_load(vcpu); 3999 vcpu_load(vcpu);
3820 4000
4001 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) ==
4002 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) {
4003 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4004 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4005 vcpu->arch.switch_db_regs =
4006 (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
4007 } else {
4008 for (i = 0; i < KVM_NR_DB_REGS; i++)
4009 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
4010 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4011 }
4012
3821 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 4013 r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
3822 4014
4015 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4016 kvm_queue_exception(vcpu, DB_VECTOR);
4017 else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4018 kvm_queue_exception(vcpu, BP_VECTOR);
4019
3823 vcpu_put(vcpu); 4020 vcpu_put(vcpu);
3824 4021
3825 return r; 4022 return r;
@@ -4007,6 +4204,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4007 vcpu->arch.nmi_pending = false; 4204 vcpu->arch.nmi_pending = false;
4008 vcpu->arch.nmi_injected = false; 4205 vcpu->arch.nmi_injected = false;
4009 4206
4207 vcpu->arch.switch_db_regs = 0;
4208 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
4209 vcpu->arch.dr6 = DR6_FIXED_1;
4210 vcpu->arch.dr7 = DR7_FIXED_1;
4211
4010 return kvm_x86_ops->vcpu_reset(vcpu); 4212 return kvm_x86_ops->vcpu_reset(vcpu);
4011} 4213}
4012 4214
@@ -4100,6 +4302,8 @@ struct kvm *kvm_arch_create_vm(void)
4100 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 4302 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
4101 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 4303 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
4102 4304
4305 rdtscll(kvm->arch.vm_init_tsc);
4306
4103 return kvm; 4307 return kvm;
4104} 4308}
4105 4309
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index d174db7a3370..ca91749d2083 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -178,7 +178,7 @@ static u32 opcode_table[256] = {
178 0, ImplicitOps | Stack, 0, 0, 178 0, ImplicitOps | Stack, 0, 0,
179 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, 179 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
180 /* 0xC8 - 0xCF */ 180 /* 0xC8 - 0xCF */
181 0, 0, 0, 0, 0, 0, 0, 0, 181 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0,
182 /* 0xD0 - 0xD7 */ 182 /* 0xD0 - 0xD7 */
183 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 183 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
184 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 184 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -1136,18 +1136,19 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1136} 1136}
1137 1137
1138static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1138static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1139 struct x86_emulate_ops *ops) 1139 struct x86_emulate_ops *ops,
1140 void *dest, int len)
1140{ 1141{
1141 struct decode_cache *c = &ctxt->decode; 1142 struct decode_cache *c = &ctxt->decode;
1142 int rc; 1143 int rc;
1143 1144
1144 rc = ops->read_emulated(register_address(c, ss_base(ctxt), 1145 rc = ops->read_emulated(register_address(c, ss_base(ctxt),
1145 c->regs[VCPU_REGS_RSP]), 1146 c->regs[VCPU_REGS_RSP]),
1146 &c->src.val, c->src.bytes, ctxt->vcpu); 1147 dest, len, ctxt->vcpu);
1147 if (rc != 0) 1148 if (rc != 0)
1148 return rc; 1149 return rc;
1149 1150
1150 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes); 1151 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
1151 return rc; 1152 return rc;
1152} 1153}
1153 1154
@@ -1157,11 +1158,9 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1157 struct decode_cache *c = &ctxt->decode; 1158 struct decode_cache *c = &ctxt->decode;
1158 int rc; 1159 int rc;
1159 1160
1160 c->src.bytes = c->dst.bytes; 1161 rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
1161 rc = emulate_pop(ctxt, ops);
1162 if (rc != 0) 1162 if (rc != 0)
1163 return rc; 1163 return rc;
1164 c->dst.val = c->src.val;
1165 return 0; 1164 return 0;
1166} 1165}
1167 1166
@@ -1279,6 +1278,25 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1279 return 0; 1278 return 0;
1280} 1279}
1281 1280
1281static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1282 struct x86_emulate_ops *ops)
1283{
1284 struct decode_cache *c = &ctxt->decode;
1285 int rc;
1286 unsigned long cs;
1287
1288 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
1289 if (rc)
1290 return rc;
1291 if (c->op_bytes == 4)
1292 c->eip = (u32)c->eip;
1293 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1294 if (rc)
1295 return rc;
1296 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS);
1297 return rc;
1298}
1299
1282static inline int writeback(struct x86_emulate_ctxt *ctxt, 1300static inline int writeback(struct x86_emulate_ctxt *ctxt,
1283 struct x86_emulate_ops *ops) 1301 struct x86_emulate_ops *ops)
1284{ 1302{
@@ -1467,11 +1485,9 @@ special_insn:
1467 break; 1485 break;
1468 case 0x58 ... 0x5f: /* pop reg */ 1486 case 0x58 ... 0x5f: /* pop reg */
1469 pop_instruction: 1487 pop_instruction:
1470 c->src.bytes = c->op_bytes; 1488 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
1471 rc = emulate_pop(ctxt, ops);
1472 if (rc != 0) 1489 if (rc != 0)
1473 goto done; 1490 goto done;
1474 c->dst.val = c->src.val;
1475 break; 1491 break;
1476 case 0x63: /* movsxd */ 1492 case 0x63: /* movsxd */
1477 if (ctxt->mode != X86EMUL_MODE_PROT64) 1493 if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -1738,6 +1754,11 @@ special_insn:
1738 mov: 1754 mov:
1739 c->dst.val = c->src.val; 1755 c->dst.val = c->src.val;
1740 break; 1756 break;
1757 case 0xcb: /* ret far */
1758 rc = emulate_ret_far(ctxt, ops);
1759 if (rc)
1760 goto done;
1761 break;
1741 case 0xd0 ... 0xd1: /* Grp2 */ 1762 case 0xd0 ... 0xd1: /* Grp2 */
1742 c->src.val = 1; 1763 c->src.val = 1;
1743 emulate_grp2(ctxt); 1764 emulate_grp2(ctxt);
@@ -1908,11 +1929,16 @@ twobyte_insn:
1908 c->dst.type = OP_NONE; 1929 c->dst.type = OP_NONE;
1909 break; 1930 break;
1910 case 3: /* lidt/vmmcall */ 1931 case 3: /* lidt/vmmcall */
1911 if (c->modrm_mod == 3 && c->modrm_rm == 1) { 1932 if (c->modrm_mod == 3) {
1912 rc = kvm_fix_hypercall(ctxt->vcpu); 1933 switch (c->modrm_rm) {
1913 if (rc) 1934 case 1:
1914 goto done; 1935 rc = kvm_fix_hypercall(ctxt->vcpu);
1915 kvm_emulate_hypercall(ctxt->vcpu); 1936 if (rc)
1937 goto done;
1938 break;
1939 default:
1940 goto cannot_emulate;
1941 }
1916 } else { 1942 } else {
1917 rc = read_descriptor(ctxt, ops, c->src.ptr, 1943 rc = read_descriptor(ctxt, ops, c->src.ptr,
1918 &size, &address, 1944 &size, &address,
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index f3a5305b8adf..e94a11e42f98 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -107,7 +107,7 @@ static void async_hcall(unsigned long call, unsigned long arg1,
107 local_irq_save(flags); 107 local_irq_save(flags);
108 if (lguest_data.hcall_status[next_call] != 0xFF) { 108 if (lguest_data.hcall_status[next_call] != 0xFF) {
109 /* Table full, so do normal hcall which will flush table. */ 109 /* Table full, so do normal hcall which will flush table. */
110 hcall(call, arg1, arg2, arg3); 110 kvm_hypercall3(call, arg1, arg2, arg3);
111 } else { 111 } else {
112 lguest_data.hcalls[next_call].arg0 = call; 112 lguest_data.hcalls[next_call].arg0 = call;
113 lguest_data.hcalls[next_call].arg1 = arg1; 113 lguest_data.hcalls[next_call].arg1 = arg1;
@@ -134,13 +134,32 @@ static void async_hcall(unsigned long call, unsigned long arg1,
134 * 134 *
135 * So, when we're in lazy mode, we call async_hcall() to store the call for 135 * So, when we're in lazy mode, we call async_hcall() to store the call for
136 * future processing: */ 136 * future processing: */
137static void lazy_hcall(unsigned long call, 137static void lazy_hcall1(unsigned long call,
138 unsigned long arg1)
139{
140 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
141 kvm_hypercall1(call, arg1);
142 else
143 async_hcall(call, arg1, 0, 0);
144}
145
146static void lazy_hcall2(unsigned long call,
147 unsigned long arg1,
148 unsigned long arg2)
149{
150 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
151 kvm_hypercall2(call, arg1, arg2);
152 else
153 async_hcall(call, arg1, arg2, 0);
154}
155
156static void lazy_hcall3(unsigned long call,
138 unsigned long arg1, 157 unsigned long arg1,
139 unsigned long arg2, 158 unsigned long arg2,
140 unsigned long arg3) 159 unsigned long arg3)
141{ 160{
142 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 161 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
143 hcall(call, arg1, arg2, arg3); 162 kvm_hypercall3(call, arg1, arg2, arg3);
144 else 163 else
145 async_hcall(call, arg1, arg2, arg3); 164 async_hcall(call, arg1, arg2, arg3);
146} 165}
@@ -150,7 +169,7 @@ static void lazy_hcall(unsigned long call,
150static void lguest_leave_lazy_mode(void) 169static void lguest_leave_lazy_mode(void)
151{ 170{
152 paravirt_leave_lazy(paravirt_get_lazy_mode()); 171 paravirt_leave_lazy(paravirt_get_lazy_mode());
153 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); 172 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
154} 173}
155 174
156/*G:033 175/*G:033
@@ -229,7 +248,7 @@ static void lguest_write_idt_entry(gate_desc *dt,
229 /* Keep the local copy up to date. */ 248 /* Keep the local copy up to date. */
230 native_write_idt_entry(dt, entrynum, g); 249 native_write_idt_entry(dt, entrynum, g);
231 /* Tell Host about this new entry. */ 250 /* Tell Host about this new entry. */
232 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); 251 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
233} 252}
234 253
235/* Changing to a different IDT is very rare: we keep the IDT up-to-date every 254/* Changing to a different IDT is very rare: we keep the IDT up-to-date every
@@ -241,7 +260,7 @@ static void lguest_load_idt(const struct desc_ptr *desc)
241 struct desc_struct *idt = (void *)desc->address; 260 struct desc_struct *idt = (void *)desc->address;
242 261
243 for (i = 0; i < (desc->size+1)/8; i++) 262 for (i = 0; i < (desc->size+1)/8; i++)
244 hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); 263 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
245} 264}
246 265
247/* 266/*
@@ -261,8 +280,8 @@ static void lguest_load_idt(const struct desc_ptr *desc)
261 */ 280 */
262static void lguest_load_gdt(const struct desc_ptr *desc) 281static void lguest_load_gdt(const struct desc_ptr *desc)
263{ 282{
264 BUG_ON((desc->size+1)/8 != GDT_ENTRIES); 283 BUG_ON((desc->size + 1) / 8 != GDT_ENTRIES);
265 hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0); 284 kvm_hypercall2(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES);
266} 285}
267 286
268/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, 287/* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
@@ -272,7 +291,7 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
272 const void *desc, int type) 291 const void *desc, int type)
273{ 292{
274 native_write_gdt_entry(dt, entrynum, desc, type); 293 native_write_gdt_entry(dt, entrynum, desc, type);
275 hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); 294 kvm_hypercall2(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES);
276} 295}
277 296
278/* OK, I lied. There are three "thread local storage" GDT entries which change 297/* OK, I lied. There are three "thread local storage" GDT entries which change
@@ -284,7 +303,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
284 * can't handle us removing entries we're currently using. So we clear 303 * can't handle us removing entries we're currently using. So we clear
285 * the GS register here: if it's needed it'll be reloaded anyway. */ 304 * the GS register here: if it's needed it'll be reloaded anyway. */
286 lazy_load_gs(0); 305 lazy_load_gs(0);
287 lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0); 306 lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
288} 307}
289 308
290/*G:038 That's enough excitement for now, back to ploughing through each of 309/*G:038 That's enough excitement for now, back to ploughing through each of
@@ -348,6 +367,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
348 * flush_tlb_user() for both user and kernel mappings unless 367 * flush_tlb_user() for both user and kernel mappings unless
349 * the Page Global Enable (PGE) feature bit is set. */ 368 * the Page Global Enable (PGE) feature bit is set. */
350 *dx |= 0x00002000; 369 *dx |= 0x00002000;
370 /* We also lie, and say we're family id 5. 6 or greater
371 * leads to a rdmsr in early_init_intel which we can't handle.
372 * Family ID is returned as bits 8-12 in ax. */
373 *ax &= 0xFFFFF0FF;
374 *ax |= 0x00000500;
351 break; 375 break;
352 case 0x80000000: 376 case 0x80000000:
353 /* Futureproof this a little: if they ask how much extended 377 /* Futureproof this a little: if they ask how much extended
@@ -377,7 +401,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
377static unsigned long current_cr0; 401static unsigned long current_cr0;
378static void lguest_write_cr0(unsigned long val) 402static void lguest_write_cr0(unsigned long val)
379{ 403{
380 lazy_hcall(LHCALL_TS, val & X86_CR0_TS, 0, 0); 404 lazy_hcall1(LHCALL_TS, val & X86_CR0_TS);
381 current_cr0 = val; 405 current_cr0 = val;
382} 406}
383 407
@@ -391,7 +415,7 @@ static unsigned long lguest_read_cr0(void)
391 * the vowels have been optimized out. */ 415 * the vowels have been optimized out. */
392static void lguest_clts(void) 416static void lguest_clts(void)
393{ 417{
394 lazy_hcall(LHCALL_TS, 0, 0, 0); 418 lazy_hcall1(LHCALL_TS, 0);
395 current_cr0 &= ~X86_CR0_TS; 419 current_cr0 &= ~X86_CR0_TS;
396} 420}
397 421
@@ -413,7 +437,7 @@ static bool cr3_changed = false;
413static void lguest_write_cr3(unsigned long cr3) 437static void lguest_write_cr3(unsigned long cr3)
414{ 438{
415 lguest_data.pgdir = cr3; 439 lguest_data.pgdir = cr3;
416 lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0); 440 lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
417 cr3_changed = true; 441 cr3_changed = true;
418} 442}
419 443
@@ -485,11 +509,17 @@ static void lguest_write_cr4(unsigned long val)
485 * into a process' address space. We set the entry then tell the Host the 509 * into a process' address space. We set the entry then tell the Host the
486 * toplevel and address this corresponds to. The Guest uses one pagetable per 510 * toplevel and address this corresponds to. The Guest uses one pagetable per
487 * process, so we need to tell the Host which one we're changing (mm->pgd). */ 511 * process, so we need to tell the Host which one we're changing (mm->pgd). */
512static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
513 pte_t *ptep)
514{
515 lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
516}
517
488static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 518static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
489 pte_t *ptep, pte_t pteval) 519 pte_t *ptep, pte_t pteval)
490{ 520{
491 *ptep = pteval; 521 *ptep = pteval;
492 lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low); 522 lguest_pte_update(mm, addr, ptep);
493} 523}
494 524
495/* The Guest calls this to set a top-level entry. Again, we set the entry then 525/* The Guest calls this to set a top-level entry. Again, we set the entry then
@@ -498,8 +528,8 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
498static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 528static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
499{ 529{
500 *pmdp = pmdval; 530 *pmdp = pmdval;
501 lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, 531 lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
502 (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); 532 (__pa(pmdp) & (PAGE_SIZE - 1)) / 4);
503} 533}
504 534
505/* There are a couple of legacy places where the kernel sets a PTE, but we 535/* There are a couple of legacy places where the kernel sets a PTE, but we
@@ -515,7 +545,7 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
515{ 545{
516 *ptep = pteval; 546 *ptep = pteval;
517 if (cr3_changed) 547 if (cr3_changed)
518 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); 548 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
519} 549}
520 550
521/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 551/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
@@ -531,7 +561,7 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
531static void lguest_flush_tlb_single(unsigned long addr) 561static void lguest_flush_tlb_single(unsigned long addr)
532{ 562{
533 /* Simply set it to zero: if it was not, it will fault back in. */ 563 /* Simply set it to zero: if it was not, it will fault back in. */
534 lazy_hcall(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); 564 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
535} 565}
536 566
537/* This is what happens after the Guest has removed a large number of entries. 567/* This is what happens after the Guest has removed a large number of entries.
@@ -539,7 +569,7 @@ static void lguest_flush_tlb_single(unsigned long addr)
539 * have changed, ie. virtual addresses below PAGE_OFFSET. */ 569 * have changed, ie. virtual addresses below PAGE_OFFSET. */
540static void lguest_flush_tlb_user(void) 570static void lguest_flush_tlb_user(void)
541{ 571{
542 lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0); 572 lazy_hcall1(LHCALL_FLUSH_TLB, 0);
543} 573}
544 574
545/* This is called when the kernel page tables have changed. That's not very 575/* This is called when the kernel page tables have changed. That's not very
@@ -547,7 +577,7 @@ static void lguest_flush_tlb_user(void)
547 * slow), so it's worth separating this from the user flushing above. */ 577 * slow), so it's worth separating this from the user flushing above. */
548static void lguest_flush_tlb_kernel(void) 578static void lguest_flush_tlb_kernel(void)
549{ 579{
550 lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); 580 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
551} 581}
552 582
553/* 583/*
@@ -594,19 +624,21 @@ static void __init lguest_init_IRQ(void)
594 /* Some systems map "vectors" to interrupts weirdly. Lguest has 624 /* Some systems map "vectors" to interrupts weirdly. Lguest has
595 * a straightforward 1 to 1 mapping, so force that here. */ 625 * a straightforward 1 to 1 mapping, so force that here. */
596 __get_cpu_var(vector_irq)[vector] = i; 626 __get_cpu_var(vector_irq)[vector] = i;
597 if (vector != SYSCALL_VECTOR) { 627 if (vector != SYSCALL_VECTOR)
598 set_intr_gate(vector, 628 set_intr_gate(vector, interrupt[i]);
599 interrupt[vector-FIRST_EXTERNAL_VECTOR]);
600 set_irq_chip_and_handler_name(i, &lguest_irq_controller,
601 handle_level_irq,
602 "level");
603 }
604 } 629 }
605 /* This call is required to set up for 4k stacks, where we have 630 /* This call is required to set up for 4k stacks, where we have
606 * separate stacks for hard and soft interrupts. */ 631 * separate stacks for hard and soft interrupts. */
607 irq_ctx_init(smp_processor_id()); 632 irq_ctx_init(smp_processor_id());
608} 633}
609 634
635void lguest_setup_irq(unsigned int irq)
636{
637 irq_to_desc_alloc_cpu(irq, 0);
638 set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
639 handle_level_irq, "level");
640}
641
610/* 642/*
611 * Time. 643 * Time.
612 * 644 *
@@ -682,7 +714,7 @@ static int lguest_clockevent_set_next_event(unsigned long delta,
682 } 714 }
683 715
684 /* Please wake us this far in the future. */ 716 /* Please wake us this far in the future. */
685 hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0); 717 kvm_hypercall1(LHCALL_SET_CLOCKEVENT, delta);
686 return 0; 718 return 0;
687} 719}
688 720
@@ -693,7 +725,7 @@ static void lguest_clockevent_set_mode(enum clock_event_mode mode,
693 case CLOCK_EVT_MODE_UNUSED: 725 case CLOCK_EVT_MODE_UNUSED:
694 case CLOCK_EVT_MODE_SHUTDOWN: 726 case CLOCK_EVT_MODE_SHUTDOWN:
695 /* A 0 argument shuts the clock down. */ 727 /* A 0 argument shuts the clock down. */
696 hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0); 728 kvm_hypercall0(LHCALL_SET_CLOCKEVENT);
697 break; 729 break;
698 case CLOCK_EVT_MODE_ONESHOT: 730 case CLOCK_EVT_MODE_ONESHOT:
699 /* This is what we expect. */ 731 /* This is what we expect. */
@@ -768,8 +800,8 @@ static void lguest_time_init(void)
768static void lguest_load_sp0(struct tss_struct *tss, 800static void lguest_load_sp0(struct tss_struct *tss,
769 struct thread_struct *thread) 801 struct thread_struct *thread)
770{ 802{
771 lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0, 803 lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
772 THREAD_SIZE/PAGE_SIZE); 804 THREAD_SIZE / PAGE_SIZE);
773} 805}
774 806
775/* Let's just say, I wouldn't do debugging under a Guest. */ 807/* Let's just say, I wouldn't do debugging under a Guest. */
@@ -842,7 +874,7 @@ static void set_lguest_basic_apic_ops(void)
842/* STOP! Until an interrupt comes in. */ 874/* STOP! Until an interrupt comes in. */
843static void lguest_safe_halt(void) 875static void lguest_safe_halt(void)
844{ 876{
845 hcall(LHCALL_HALT, 0, 0, 0); 877 kvm_hypercall0(LHCALL_HALT);
846} 878}
847 879
848/* The SHUTDOWN hypercall takes a string to describe what's happening, and 880/* The SHUTDOWN hypercall takes a string to describe what's happening, and
@@ -852,7 +884,8 @@ static void lguest_safe_halt(void)
852 * rather than virtual addresses, so we use __pa() here. */ 884 * rather than virtual addresses, so we use __pa() here. */
853static void lguest_power_off(void) 885static void lguest_power_off(void)
854{ 886{
855 hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0); 887 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
888 LGUEST_SHUTDOWN_POWEROFF);
856} 889}
857 890
858/* 891/*
@@ -862,7 +895,7 @@ static void lguest_power_off(void)
862 */ 895 */
863static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) 896static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
864{ 897{
865 hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0); 898 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF);
866 /* The hcall won't return, but to keep gcc happy, we're "done". */ 899 /* The hcall won't return, but to keep gcc happy, we're "done". */
867 return NOTIFY_DONE; 900 return NOTIFY_DONE;
868} 901}
@@ -903,7 +936,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
903 len = sizeof(scratch) - 1; 936 len = sizeof(scratch) - 1;
904 scratch[len] = '\0'; 937 scratch[len] = '\0';
905 memcpy(scratch, buf, len); 938 memcpy(scratch, buf, len);
906 hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0); 939 kvm_hypercall1(LHCALL_NOTIFY, __pa(scratch));
907 940
908 /* This routine returns the number of bytes actually written. */ 941 /* This routine returns the number of bytes actually written. */
909 return len; 942 return len;
@@ -913,7 +946,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
913 * Launcher to reboot us. */ 946 * Launcher to reboot us. */
914static void lguest_restart(char *reason) 947static void lguest_restart(char *reason)
915{ 948{
916 hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); 949 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
917} 950}
918 951
919/*G:050 952/*G:050
@@ -1033,6 +1066,8 @@ __init void lguest_init(void)
1033 pv_mmu_ops.read_cr3 = lguest_read_cr3; 1066 pv_mmu_ops.read_cr3 = lguest_read_cr3;
1034 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; 1067 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
1035 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode; 1068 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
1069 pv_mmu_ops.pte_update = lguest_pte_update;
1070 pv_mmu_ops.pte_update_defer = lguest_pte_update;
1036 1071
1037#ifdef CONFIG_X86_LOCAL_APIC 1072#ifdef CONFIG_X86_LOCAL_APIC
1038 /* apic read/write intercepts */ 1073 /* apic read/write intercepts */
@@ -1051,14 +1086,6 @@ __init void lguest_init(void)
1051 * lguest_init() where the rest of the fairly chaotic boot setup 1086 * lguest_init() where the rest of the fairly chaotic boot setup
1052 * occurs. */ 1087 * occurs. */
1053 1088
1054 /* The native boot code sets up initial page tables immediately after
1055 * the kernel itself, and sets init_pg_tables_end so they're not
1056 * clobbered. The Launcher places our initial pagetables somewhere at
1057 * the top of our physical memory, so we don't need extra space: set
1058 * init_pg_tables_end to the end of the kernel. */
1059 init_pg_tables_start = __pa(pg0);
1060 init_pg_tables_end = __pa(pg0);
1061
1062 /* As described in head_32.S, we map the first 128M of memory. */ 1089 /* As described in head_32.S, we map the first 128M of memory. */
1063 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1090 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1064 1091
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 10b9bd35a8ff..f79541989471 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -27,8 +27,8 @@ ENTRY(lguest_entry)
27 /* We make the "initialization" hypercall now to tell the Host about 27 /* We make the "initialization" hypercall now to tell the Host about
28 * us, and also find out where it put our page tables. */ 28 * us, and also find out where it put our page tables. */
29 movl $LHCALL_LGUEST_INIT, %eax 29 movl $LHCALL_LGUEST_INIT, %eax
30 movl $lguest_data - __PAGE_OFFSET, %edx 30 movl $lguest_data - __PAGE_OFFSET, %ebx
31 int $LGUEST_TRAP_ENTRY 31 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
32 32
33 /* Set up the initial stack so we can run C code. */ 33 /* Set up the initial stack so we can run C code. */
34 movl $(init_thread_union+THREAD_SIZE),%esp 34 movl $(init_thread_union+THREAD_SIZE),%esp
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c22981fa2f3a..ad5441ed1b57 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,30 +1,38 @@
1/* Copyright 2002 Andi Kleen */ 1/* Copyright 2002 Andi Kleen */
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/dwarf2.h> 4
5#include <asm/cpufeature.h> 5#include <asm/cpufeature.h>
6#include <asm/dwarf2.h>
6 7
7/* 8/*
8 * memcpy - Copy a memory block. 9 * memcpy - Copy a memory block.
9 * 10 *
10 * Input: 11 * Input:
11 * rdi destination 12 * rdi destination
12 * rsi source 13 * rsi source
13 * rdx count 14 * rdx count
14 * 15 *
15 * Output: 16 * Output:
16 * rax original destination 17 * rax original destination
17 */ 18 */
18 19
20/*
21 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
22 *
23 * Calls to this get patched into the kernel image via the
24 * alternative instructions framework:
25 */
19 ALIGN 26 ALIGN
20memcpy_c: 27memcpy_c:
21 CFI_STARTPROC 28 CFI_STARTPROC
22 movq %rdi,%rax 29 movq %rdi, %rax
23 movl %edx,%ecx 30
24 shrl $3,%ecx 31 movl %edx, %ecx
25 andl $7,%edx 32 shrl $3, %ecx
33 andl $7, %edx
26 rep movsq 34 rep movsq
27 movl %edx,%ecx 35 movl %edx, %ecx
28 rep movsb 36 rep movsb
29 ret 37 ret
30 CFI_ENDPROC 38 CFI_ENDPROC
@@ -33,99 +41,110 @@ ENDPROC(memcpy_c)
33ENTRY(__memcpy) 41ENTRY(__memcpy)
34ENTRY(memcpy) 42ENTRY(memcpy)
35 CFI_STARTPROC 43 CFI_STARTPROC
36 pushq %rbx
37 CFI_ADJUST_CFA_OFFSET 8
38 CFI_REL_OFFSET rbx, 0
39 movq %rdi,%rax
40 44
41 movl %edx,%ecx 45 /*
42 shrl $6,%ecx 46 * Put the number of full 64-byte blocks into %ecx.
47 * Tail portion is handled at the end:
48 */
49 movq %rdi, %rax
50 movl %edx, %ecx
51 shrl $6, %ecx
43 jz .Lhandle_tail 52 jz .Lhandle_tail
44 53
45 .p2align 4 54 .p2align 4
46.Lloop_64: 55.Lloop_64:
56 /*
57 * We decrement the loop index here - and the zero-flag is
58 * checked at the end of the loop (instructions inbetween do
59 * not change the zero flag):
60 */
47 decl %ecx 61 decl %ecx
48 62
49 movq (%rsi),%r11 63 /*
50 movq 8(%rsi),%r8 64 * Move in blocks of 4x16 bytes:
65 */
66 movq 0*8(%rsi), %r11
67 movq 1*8(%rsi), %r8
68 movq %r11, 0*8(%rdi)
69 movq %r8, 1*8(%rdi)
51 70
52 movq %r11,(%rdi) 71 movq 2*8(%rsi), %r9
53 movq %r8,1*8(%rdi) 72 movq 3*8(%rsi), %r10
73 movq %r9, 2*8(%rdi)
74 movq %r10, 3*8(%rdi)
54 75
55 movq 2*8(%rsi),%r9 76 movq 4*8(%rsi), %r11
56 movq 3*8(%rsi),%r10 77 movq 5*8(%rsi), %r8
78 movq %r11, 4*8(%rdi)
79 movq %r8, 5*8(%rdi)
57 80
58 movq %r9,2*8(%rdi) 81 movq 6*8(%rsi), %r9
59 movq %r10,3*8(%rdi) 82 movq 7*8(%rsi), %r10
83 movq %r9, 6*8(%rdi)
84 movq %r10, 7*8(%rdi)
60 85
61 movq 4*8(%rsi),%r11 86 leaq 64(%rsi), %rsi
62 movq 5*8(%rsi),%r8 87 leaq 64(%rdi), %rdi
63 88
64 movq %r11,4*8(%rdi)
65 movq %r8,5*8(%rdi)
66
67 movq 6*8(%rsi),%r9
68 movq 7*8(%rsi),%r10
69
70 movq %r9,6*8(%rdi)
71 movq %r10,7*8(%rdi)
72
73 leaq 64(%rsi),%rsi
74 leaq 64(%rdi),%rdi
75 jnz .Lloop_64 89 jnz .Lloop_64
76 90
77.Lhandle_tail: 91.Lhandle_tail:
78 movl %edx,%ecx 92 movl %edx, %ecx
79 andl $63,%ecx 93 andl $63, %ecx
80 shrl $3,%ecx 94 shrl $3, %ecx
81 jz .Lhandle_7 95 jz .Lhandle_7
96
82 .p2align 4 97 .p2align 4
83.Lloop_8: 98.Lloop_8:
84 decl %ecx 99 decl %ecx
85 movq (%rsi),%r8 100 movq (%rsi), %r8
86 movq %r8,(%rdi) 101 movq %r8, (%rdi)
87 leaq 8(%rdi),%rdi 102 leaq 8(%rdi), %rdi
88 leaq 8(%rsi),%rsi 103 leaq 8(%rsi), %rsi
89 jnz .Lloop_8 104 jnz .Lloop_8
90 105
91.Lhandle_7: 106.Lhandle_7:
92 movl %edx,%ecx 107 movl %edx, %ecx
93 andl $7,%ecx 108 andl $7, %ecx
94 jz .Lende 109 jz .Lend
110
95 .p2align 4 111 .p2align 4
96.Lloop_1: 112.Lloop_1:
97 movb (%rsi),%r8b 113 movb (%rsi), %r8b
98 movb %r8b,(%rdi) 114 movb %r8b, (%rdi)
99 incq %rdi 115 incq %rdi
100 incq %rsi 116 incq %rsi
101 decl %ecx 117 decl %ecx
102 jnz .Lloop_1 118 jnz .Lloop_1
103 119
104.Lende: 120.Lend:
105 popq %rbx
106 CFI_ADJUST_CFA_OFFSET -8
107 CFI_RESTORE rbx
108 ret 121 ret
109.Lfinal:
110 CFI_ENDPROC 122 CFI_ENDPROC
111ENDPROC(memcpy) 123ENDPROC(memcpy)
112ENDPROC(__memcpy) 124ENDPROC(__memcpy)
113 125
114 /* Some CPUs run faster using the string copy instructions. 126 /*
115 It is also a lot simpler. Use this when possible */ 127 * Some CPUs run faster using the string copy instructions.
128 * It is also a lot simpler. Use this when possible:
129 */
116 130
117 .section .altinstr_replacement,"ax" 131 .section .altinstr_replacement, "ax"
1181: .byte 0xeb /* jmp <disp8> */ 1321: .byte 0xeb /* jmp <disp8> */
119 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ 133 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
1202: 1342:
121 .previous 135 .previous
122 .section .altinstructions,"a" 136
137 .section .altinstructions, "a"
123 .align 8 138 .align 8
124 .quad memcpy 139 .quad memcpy
125 .quad 1b 140 .quad 1b
126 .byte X86_FEATURE_REP_GOOD 141 .byte X86_FEATURE_REP_GOOD
127 /* Replace only beginning, memcpy is used to apply alternatives, so it 142
128 * is silly to overwrite itself with nops - reboot is only outcome... */ 143 /*
144 * Replace only beginning, memcpy is used to apply alternatives,
145 * so it is silly to overwrite itself with nops - reboot is the
146 * only outcome...
147 */
129 .byte 2b - 1b 148 .byte 2b - 1b
130 .byte 2b - 1b 149 .byte 2b - 1b
131 .previous 150 .previous
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 491e737ce547..aa0987088774 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,20 +30,29 @@ static void fclex(void)
30} 30}
31 31
32/* Needs to be externally visible */ 32/* Needs to be externally visible */
33void finit(void) 33void finit_task(struct task_struct *tsk)
34{ 34{
35 control_word = 0x037f; 35 struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
36 partial_status = 0; 36 struct address *oaddr, *iaddr;
37 top = 0; /* We don't keep top in the status word internally. */ 37 soft->cwd = 0x037f;
38 fpu_tag_word = 0xffff; 38 soft->swd = 0;
39 soft->ftop = 0; /* We don't keep top in the status word internally. */
40 soft->twd = 0xffff;
39 /* The behaviour is different from that detailed in 41 /* The behaviour is different from that detailed in
40 Section 15.1.6 of the Intel manual */ 42 Section 15.1.6 of the Intel manual */
41 operand_address.offset = 0; 43 oaddr = (struct address *)&soft->foo;
42 operand_address.selector = 0; 44 oaddr->offset = 0;
43 instruction_address.offset = 0; 45 oaddr->selector = 0;
44 instruction_address.selector = 0; 46 iaddr = (struct address *)&soft->fip;
45 instruction_address.opcode = 0; 47 iaddr->offset = 0;
46 no_ip_update = 1; 48 iaddr->selector = 0;
49 iaddr->opcode = 0;
50 soft->no_update = 1;
51}
52
53void finit(void)
54{
55 finit_task(current);
47} 56}
48 57
49/* 58/*
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 2b938a384910..fdd30d08ab52 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,4 +1,4 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o gup.o 2 pat.o pgtable.o gup.o
3 3
4obj-$(CONFIG_SMP) += tlb.o 4obj-$(CONFIG_SMP) += tlb.o
@@ -14,7 +14,7 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
14mmiotrace-y := kmmio.o pf_in.o mmio-mod.o 14mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
15obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 15obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
16 16
17obj-$(CONFIG_NUMA) += numa_$(BITS).o 17obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
18obj-$(CONFIG_K8_NUMA) += k8topology_64.o 18obj-$(CONFIG_K8_NUMA) += k8topology_64.o
19obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 19obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
20 20
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index bcc079c282dd..8126e8d1a2a4 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -1,5 +1,6 @@
1#include <linux/highmem.h> 1#include <linux/highmem.h>
2#include <linux/module.h> 2#include <linux/module.h>
3#include <linux/swap.h> /* for totalram_pages */
3 4
4void *kmap(struct page *page) 5void *kmap(struct page *page)
5{ 6{
@@ -18,49 +19,6 @@ void kunmap(struct page *page)
18 kunmap_high(page); 19 kunmap_high(page);
19} 20}
20 21
21static void debug_kmap_atomic_prot(enum km_type type)
22{
23#ifdef CONFIG_DEBUG_HIGHMEM
24 static unsigned warn_count = 10;
25
26 if (unlikely(warn_count == 0))
27 return;
28
29 if (unlikely(in_interrupt())) {
30 if (in_irq()) {
31 if (type != KM_IRQ0 && type != KM_IRQ1 &&
32 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
33 type != KM_BOUNCE_READ) {
34 WARN_ON(1);
35 warn_count--;
36 }
37 } else if (!irqs_disabled()) { /* softirq */
38 if (type != KM_IRQ0 && type != KM_IRQ1 &&
39 type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
40 type != KM_SKB_SUNRPC_DATA &&
41 type != KM_SKB_DATA_SOFTIRQ &&
42 type != KM_BOUNCE_READ) {
43 WARN_ON(1);
44 warn_count--;
45 }
46 }
47 }
48
49 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
50 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
51 if (!irqs_disabled()) {
52 WARN_ON(1);
53 warn_count--;
54 }
55 } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
56 if (irq_count() == 0 && !irqs_disabled()) {
57 WARN_ON(1);
58 warn_count--;
59 }
60 }
61#endif
62}
63
64/* 22/*
65 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because 23 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
66 * no global lock is needed and because the kmap code must perform a global TLB 24 * no global lock is needed and because the kmap code must perform a global TLB
@@ -80,7 +38,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
80 if (!PageHighMem(page)) 38 if (!PageHighMem(page))
81 return page_address(page); 39 return page_address(page);
82 40
83 debug_kmap_atomic_prot(type); 41 debug_kmap_atomic(type);
84 42
85 idx = type + KM_TYPE_NR*smp_processor_id(); 43 idx = type + KM_TYPE_NR*smp_processor_id();
86 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
@@ -120,22 +78,13 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
120 pagefault_enable(); 78 pagefault_enable();
121} 79}
122 80
123/* This is the same as kmap_atomic() but can map memory that doesn't 81/*
82 * This is the same as kmap_atomic() but can map memory that doesn't
124 * have a struct page associated with it. 83 * have a struct page associated with it.
125 */ 84 */
126void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) 85void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
127{ 86{
128 enum fixed_addresses idx; 87 return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
129 unsigned long vaddr;
130
131 pagefault_disable();
132
133 idx = type + KM_TYPE_NR*smp_processor_id();
134 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
135 set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
136 arch_flush_lazy_mmu_mode();
137
138 return (void*) vaddr;
139} 88}
140EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ 89EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
141 90
@@ -156,3 +105,27 @@ EXPORT_SYMBOL(kmap);
156EXPORT_SYMBOL(kunmap); 105EXPORT_SYMBOL(kunmap);
157EXPORT_SYMBOL(kmap_atomic); 106EXPORT_SYMBOL(kmap_atomic);
158EXPORT_SYMBOL(kunmap_atomic); 107EXPORT_SYMBOL(kunmap_atomic);
108
109void __init set_highmem_pages_init(void)
110{
111 struct zone *zone;
112 int nid;
113
114 for_each_zone(zone) {
115 unsigned long zone_start_pfn, zone_end_pfn;
116
117 if (!is_highmem(zone))
118 continue;
119
120 zone_start_pfn = zone->zone_start_pfn;
121 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
122
123 nid = zone_to_nid(zone);
124 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
125 zone->name, nid, zone_start_pfn, zone_end_pfn);
126
127 add_highpages_with_active_regions(nid, zone_start_pfn,
128 zone_end_pfn);
129 }
130 totalram_pages += totalhigh_pages;
131}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
new file mode 100644
index 000000000000..fd3da1dda1c9
--- /dev/null
+++ b/arch/x86/mm/init.c
@@ -0,0 +1,393 @@
1#include <linux/ioport.h>
2#include <linux/swap.h>
3
4#include <asm/cacheflush.h>
5#include <asm/e820.h>
6#include <asm/init.h>
7#include <asm/page.h>
8#include <asm/page_types.h>
9#include <asm/sections.h>
10#include <asm/system.h>
11#include <asm/tlbflush.h>
12
13unsigned long __initdata e820_table_start;
14unsigned long __meminitdata e820_table_end;
15unsigned long __meminitdata e820_table_top;
16
17int after_bootmem;
18
19int direct_gbpages
20#ifdef CONFIG_DIRECT_GBPAGES
21 = 1
22#endif
23;
24
25static void __init find_early_table_space(unsigned long end, int use_pse,
26 int use_gbpages)
27{
28 unsigned long puds, pmds, ptes, tables, start;
29
30 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
31 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
32
33 if (use_gbpages) {
34 unsigned long extra;
35
36 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
37 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
38 } else
39 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
40
41 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
42
43 if (use_pse) {
44 unsigned long extra;
45
46 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
47#ifdef CONFIG_X86_32
48 extra += PMD_SIZE;
49#endif
50 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
51 } else
52 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
53
54 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
55
56#ifdef CONFIG_X86_32
57 /* for fixmap */
58 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
59#endif
60
61 /*
62 * RED-PEN putting page tables only on node 0 could
63 * cause a hotspot and fill up ZONE_DMA. The page tables
64 * need roughly 0.5KB per GB.
65 */
66#ifdef CONFIG_X86_32
67 start = 0x7000;
68 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
69 tables, PAGE_SIZE);
70#else /* CONFIG_X86_64 */
71 start = 0x8000;
72 e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
73#endif
74 if (e820_table_start == -1UL)
75 panic("Cannot find space for the kernel page tables");
76
77 e820_table_start >>= PAGE_SHIFT;
78 e820_table_end = e820_table_start;
79 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
80
81 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
82 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
83}
84
85struct map_range {
86 unsigned long start;
87 unsigned long end;
88 unsigned page_size_mask;
89};
90
91#ifdef CONFIG_X86_32
92#define NR_RANGE_MR 3
93#else /* CONFIG_X86_64 */
94#define NR_RANGE_MR 5
95#endif
96
97static int __meminit save_mr(struct map_range *mr, int nr_range,
98 unsigned long start_pfn, unsigned long end_pfn,
99 unsigned long page_size_mask)
100{
101 if (start_pfn < end_pfn) {
102 if (nr_range >= NR_RANGE_MR)
103 panic("run out of range for init_memory_mapping\n");
104 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
105 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
106 mr[nr_range].page_size_mask = page_size_mask;
107 nr_range++;
108 }
109
110 return nr_range;
111}
112
113#ifdef CONFIG_X86_64
114static void __init init_gbpages(void)
115{
116 if (direct_gbpages && cpu_has_gbpages)
117 printk(KERN_INFO "Using GB pages for direct mapping\n");
118 else
119 direct_gbpages = 0;
120}
121#else
122static inline void init_gbpages(void)
123{
124}
125#endif
126
127/*
128 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
129 * This runs before bootmem is initialized and gets pages directly from
130 * the physical memory. To access them they are temporarily mapped.
131 */
132unsigned long __init_refok init_memory_mapping(unsigned long start,
133 unsigned long end)
134{
135 unsigned long page_size_mask = 0;
136 unsigned long start_pfn, end_pfn;
137 unsigned long ret = 0;
138 unsigned long pos;
139
140 struct map_range mr[NR_RANGE_MR];
141 int nr_range, i;
142 int use_pse, use_gbpages;
143
144 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
145
146 if (!after_bootmem)
147 init_gbpages();
148
149#ifdef CONFIG_DEBUG_PAGEALLOC
150 /*
151 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
152 * This will simplify cpa(), which otherwise needs to support splitting
153 * large pages into small in interrupt context, etc.
154 */
155 use_pse = use_gbpages = 0;
156#else
157 use_pse = cpu_has_pse;
158 use_gbpages = direct_gbpages;
159#endif
160
161#ifdef CONFIG_X86_32
162#ifdef CONFIG_X86_PAE
163 set_nx();
164 if (nx_enabled)
165 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
166#endif
167
168 /* Enable PSE if available */
169 if (cpu_has_pse)
170 set_in_cr4(X86_CR4_PSE);
171
172 /* Enable PGE if available */
173 if (cpu_has_pge) {
174 set_in_cr4(X86_CR4_PGE);
175 __supported_pte_mask |= _PAGE_GLOBAL;
176 }
177#endif
178
179 if (use_gbpages)
180 page_size_mask |= 1 << PG_LEVEL_1G;
181 if (use_pse)
182 page_size_mask |= 1 << PG_LEVEL_2M;
183
184 memset(mr, 0, sizeof(mr));
185 nr_range = 0;
186
187 /* head if not big page alignment ? */
188 start_pfn = start >> PAGE_SHIFT;
189 pos = start_pfn << PAGE_SHIFT;
190#ifdef CONFIG_X86_32
191 /*
192 * Don't use a large page for the first 2/4MB of memory
193 * because there are often fixed size MTRRs in there
194 * and overlapping MTRRs into large pages can cause
195 * slowdowns.
196 */
197 if (pos == 0)
198 end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
199 else
200 end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
201 << (PMD_SHIFT - PAGE_SHIFT);
202#else /* CONFIG_X86_64 */
203 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
204 << (PMD_SHIFT - PAGE_SHIFT);
205#endif
206 if (end_pfn > (end >> PAGE_SHIFT))
207 end_pfn = end >> PAGE_SHIFT;
208 if (start_pfn < end_pfn) {
209 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
210 pos = end_pfn << PAGE_SHIFT;
211 }
212
213 /* big page (2M) range */
214 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
215 << (PMD_SHIFT - PAGE_SHIFT);
216#ifdef CONFIG_X86_32
217 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
218#else /* CONFIG_X86_64 */
219 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
220 << (PUD_SHIFT - PAGE_SHIFT);
221 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
222 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
223#endif
224
225 if (start_pfn < end_pfn) {
226 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
227 page_size_mask & (1<<PG_LEVEL_2M));
228 pos = end_pfn << PAGE_SHIFT;
229 }
230
231#ifdef CONFIG_X86_64
232 /* big page (1G) range */
233 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
234 << (PUD_SHIFT - PAGE_SHIFT);
235 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
236 if (start_pfn < end_pfn) {
237 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
238 page_size_mask &
239 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
240 pos = end_pfn << PAGE_SHIFT;
241 }
242
243 /* tail is not big page (1G) alignment */
244 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
245 << (PMD_SHIFT - PAGE_SHIFT);
246 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
247 if (start_pfn < end_pfn) {
248 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
249 page_size_mask & (1<<PG_LEVEL_2M));
250 pos = end_pfn << PAGE_SHIFT;
251 }
252#endif
253
254 /* tail is not big page (2M) alignment */
255 start_pfn = pos>>PAGE_SHIFT;
256 end_pfn = end>>PAGE_SHIFT;
257 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
258
259 /* try to merge same page size and continuous */
260 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
261 unsigned long old_start;
262 if (mr[i].end != mr[i+1].start ||
263 mr[i].page_size_mask != mr[i+1].page_size_mask)
264 continue;
265 /* move it */
266 old_start = mr[i].start;
267 memmove(&mr[i], &mr[i+1],
268 (nr_range - 1 - i) * sizeof(struct map_range));
269 mr[i--].start = old_start;
270 nr_range--;
271 }
272
273 for (i = 0; i < nr_range; i++)
274 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
275 mr[i].start, mr[i].end,
276 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
277 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
278
279 /*
280 * Find space for the kernel direct mapping tables.
281 *
282 * Later we should allocate these tables in the local node of the
283 * memory mapped. Unfortunately this is done currently before the
284 * nodes are discovered.
285 */
286 if (!after_bootmem)
287 find_early_table_space(end, use_pse, use_gbpages);
288
289#ifdef CONFIG_X86_32
290 for (i = 0; i < nr_range; i++)
291 kernel_physical_mapping_init(mr[i].start, mr[i].end,
292 mr[i].page_size_mask);
293 ret = end;
294#else /* CONFIG_X86_64 */
295 for (i = 0; i < nr_range; i++)
296 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
297 mr[i].page_size_mask);
298#endif
299
300#ifdef CONFIG_X86_32
301 early_ioremap_page_table_range_init();
302
303 load_cr3(swapper_pg_dir);
304#endif
305
306#ifdef CONFIG_X86_64
307 if (!after_bootmem)
308 mmu_cr4_features = read_cr4();
309#endif
310 __flush_tlb_all();
311
312 if (!after_bootmem && e820_table_end > e820_table_start)
313 reserve_early(e820_table_start << PAGE_SHIFT,
314 e820_table_end << PAGE_SHIFT, "PGTABLE");
315
316 if (!after_bootmem)
317 early_memtest(start, end);
318
319 return ret >> PAGE_SHIFT;
320}
321
322
323/*
324 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
325 * is valid. The argument is a physical page number.
326 *
327 *
328 * On x86, access has to be given to the first megabyte of ram because that area
329 * contains bios code and data regions used by X and dosemu and similar apps.
330 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
331 * mmio resources as well as potential bios/acpi data regions.
332 */
333int devmem_is_allowed(unsigned long pagenr)
334{
335 if (pagenr <= 256)
336 return 1;
337 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
338 return 0;
339 if (!page_is_ram(pagenr))
340 return 1;
341 return 0;
342}
343
344void free_init_pages(char *what, unsigned long begin, unsigned long end)
345{
346 unsigned long addr = begin;
347
348 if (addr >= end)
349 return;
350
351 /*
352 * If debugging page accesses then do not free this memory but
353 * mark them not present - any buggy init-section access will
354 * create a kernel page fault:
355 */
356#ifdef CONFIG_DEBUG_PAGEALLOC
357 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
358 begin, PAGE_ALIGN(end));
359 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
360#else
361 /*
362 * We just marked the kernel text read only above, now that
363 * we are going to free part of that, we need to make that
364 * writeable first.
365 */
366 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
367
368 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
369
370 for (; addr < end; addr += PAGE_SIZE) {
371 ClearPageReserved(virt_to_page(addr));
372 init_page_count(virt_to_page(addr));
373 memset((void *)(addr & ~(PAGE_SIZE-1)),
374 POISON_FREE_INITMEM, PAGE_SIZE);
375 free_page(addr);
376 totalram_pages++;
377 }
378#endif
379}
380
381void free_initmem(void)
382{
383 free_init_pages("unused kernel memory",
384 (unsigned long)(&__init_begin),
385 (unsigned long)(&__init_end));
386}
387
388#ifdef CONFIG_BLK_DEV_INITRD
389void free_initrd_mem(unsigned long start, unsigned long end)
390{
391 free_init_pages("initrd memory", start, end);
392}
393#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 06708ee94aa4..749559ed80f5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,8 +49,7 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52 52#include <asm/init.h>
53unsigned int __VMALLOC_RESERVE = 128 << 20;
54 53
55unsigned long max_low_pfn_mapped; 54unsigned long max_low_pfn_mapped;
56unsigned long max_pfn_mapped; 55unsigned long max_pfn_mapped;
@@ -60,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn;
60 59
61static noinline int do_test_wp_bit(void); 60static noinline int do_test_wp_bit(void);
62 61
63 62bool __read_mostly __vmalloc_start_set = false;
64static unsigned long __initdata table_start;
65static unsigned long __meminitdata table_end;
66static unsigned long __meminitdata table_top;
67
68static int __initdata after_init_bootmem;
69 63
70static __init void *alloc_low_page(void) 64static __init void *alloc_low_page(void)
71{ 65{
72 unsigned long pfn = table_end++; 66 unsigned long pfn = e820_table_end++;
73 void *adr; 67 void *adr;
74 68
75 if (pfn >= table_top) 69 if (pfn >= e820_table_top)
76 panic("alloc_low_page: ran out of memory"); 70 panic("alloc_low_page: ran out of memory");
77 71
78 adr = __va(pfn * PAGE_SIZE); 72 adr = __va(pfn * PAGE_SIZE);
@@ -92,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
92 86
93#ifdef CONFIG_X86_PAE 87#ifdef CONFIG_X86_PAE
94 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 88 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
95 if (after_init_bootmem) 89 if (after_bootmem)
96 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); 90 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
97 else 91 else
98 pmd_table = (pmd_t *)alloc_low_page(); 92 pmd_table = (pmd_t *)alloc_low_page();
@@ -119,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
119 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 113 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
120 pte_t *page_table = NULL; 114 pte_t *page_table = NULL;
121 115
122 if (after_init_bootmem) { 116 if (after_bootmem) {
123#ifdef CONFIG_DEBUG_PAGEALLOC 117#ifdef CONFIG_DEBUG_PAGEALLOC
124 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 118 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
125#endif 119#endif
@@ -137,6 +131,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
137 return pte_offset_kernel(pmd, 0); 131 return pte_offset_kernel(pmd, 0);
138} 132}
139 133
134pmd_t * __init populate_extra_pmd(unsigned long vaddr)
135{
136 int pgd_idx = pgd_index(vaddr);
137 int pmd_idx = pmd_index(vaddr);
138
139 return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
140}
141
142pte_t * __init populate_extra_pte(unsigned long vaddr)
143{
144 int pte_idx = pte_index(vaddr);
145 pmd_t *pmd;
146
147 pmd = populate_extra_pmd(vaddr);
148 return one_page_table_init(pmd) + pte_idx;
149}
150
140static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, 151static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
141 unsigned long vaddr, pte_t *lastpte) 152 unsigned long vaddr, pte_t *lastpte)
142{ 153{
@@ -153,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
153 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 164 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
154 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 165 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
155 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 166 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
156 && ((__pa(pte) >> PAGE_SHIFT) < table_start 167 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
157 || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { 168 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
158 pte_t *newpte; 169 pte_t *newpte;
159 int i; 170 int i;
160 171
161 BUG_ON(after_init_bootmem); 172 BUG_ON(after_bootmem);
162 newpte = alloc_low_page(); 173 newpte = alloc_low_page();
163 for (i = 0; i < PTRS_PER_PTE; i++) 174 for (i = 0; i < PTRS_PER_PTE; i++)
164 set_pte(newpte + i, pte[i]); 175 set_pte(newpte + i, pte[i]);
@@ -227,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr)
227 * of max_low_pfn pages, by creating page tables starting from address 238 * of max_low_pfn pages, by creating page tables starting from address
228 * PAGE_OFFSET: 239 * PAGE_OFFSET:
229 */ 240 */
230static void __init kernel_physical_mapping_init(pgd_t *pgd_base, 241unsigned long __init
231 unsigned long start_pfn, 242kernel_physical_mapping_init(unsigned long start,
232 unsigned long end_pfn, 243 unsigned long end,
233 int use_pse) 244 unsigned long page_size_mask)
234{ 245{
246 int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
247 unsigned long start_pfn, end_pfn;
248 pgd_t *pgd_base = swapper_pg_dir;
235 int pgd_idx, pmd_idx, pte_ofs; 249 int pgd_idx, pmd_idx, pte_ofs;
236 unsigned long pfn; 250 unsigned long pfn;
237 pgd_t *pgd; 251 pgd_t *pgd;
@@ -240,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
240 unsigned pages_2m, pages_4k; 254 unsigned pages_2m, pages_4k;
241 int mapping_iter; 255 int mapping_iter;
242 256
257 start_pfn = start >> PAGE_SHIFT;
258 end_pfn = end >> PAGE_SHIFT;
259
243 /* 260 /*
244 * First iteration will setup identity mapping using large/small pages 261 * First iteration will setup identity mapping using large/small pages
245 * based on use_pse, with other attributes same as set by 262 * based on use_pse, with other attributes same as set by
@@ -354,26 +371,6 @@ repeat:
354 mapping_iter = 2; 371 mapping_iter = 2;
355 goto repeat; 372 goto repeat;
356 } 373 }
357}
358
359/*
360 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
361 * is valid. The argument is a physical page number.
362 *
363 *
364 * On x86, access has to be given to the first megabyte of ram because that area
365 * contains bios code and data regions used by X and dosemu and similar apps.
366 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
367 * mmio resources as well as potential bios/acpi data regions.
368 */
369int devmem_is_allowed(unsigned long pagenr)
370{
371 if (pagenr <= 256)
372 return 1;
373 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
374 return 0;
375 if (!page_is_ram(pagenr))
376 return 1;
377 return 0; 374 return 0;
378} 375}
379 376
@@ -469,22 +466,10 @@ void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
469 work_with_active_regions(nid, add_highpages_work_fn, &data); 466 work_with_active_regions(nid, add_highpages_work_fn, &data);
470} 467}
471 468
472#ifndef CONFIG_NUMA
473static void __init set_highmem_pages_init(void)
474{
475 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
476
477 totalram_pages += totalhigh_pages;
478}
479#endif /* !CONFIG_NUMA */
480
481#else 469#else
482static inline void permanent_kmaps_init(pgd_t *pgd_base) 470static inline void permanent_kmaps_init(pgd_t *pgd_base)
483{ 471{
484} 472}
485static inline void set_highmem_pages_init(void)
486{
487}
488#endif /* CONFIG_HIGHMEM */ 473#endif /* CONFIG_HIGHMEM */
489 474
490void __init native_pagetable_setup_start(pgd_t *base) 475void __init native_pagetable_setup_start(pgd_t *base)
@@ -542,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base)
542 * be partially populated, and so it avoids stomping on any existing 527 * be partially populated, and so it avoids stomping on any existing
543 * mappings. 528 * mappings.
544 */ 529 */
545static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) 530void __init early_ioremap_page_table_range_init(void)
546{ 531{
532 pgd_t *pgd_base = swapper_pg_dir;
547 unsigned long vaddr, end; 533 unsigned long vaddr, end;
548 534
549 /* 535 /*
@@ -638,7 +624,7 @@ static int __init noexec_setup(char *str)
638} 624}
639early_param("noexec", noexec_setup); 625early_param("noexec", noexec_setup);
640 626
641static void __init set_nx(void) 627void __init set_nx(void)
642{ 628{
643 unsigned int v[4], l, h; 629 unsigned int v[4], l, h;
644 630
@@ -790,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn,
790#ifdef CONFIG_FLATMEM 776#ifdef CONFIG_FLATMEM
791 max_mapnr = num_physpages; 777 max_mapnr = num_physpages;
792#endif 778#endif
779 __vmalloc_start_set = true;
780
793 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 781 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
794 pages_to_mb(max_low_pfn)); 782 pages_to_mb(max_low_pfn));
795 783
@@ -811,176 +799,66 @@ static void __init zone_sizes_init(void)
811 free_area_init_nodes(max_zone_pfns); 799 free_area_init_nodes(max_zone_pfns);
812} 800}
813 801
802static unsigned long __init setup_node_bootmem(int nodeid,
803 unsigned long start_pfn,
804 unsigned long end_pfn,
805 unsigned long bootmap)
806{
807 unsigned long bootmap_size;
808
809 /* don't touch min_low_pfn */
810 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
811 bootmap >> PAGE_SHIFT,
812 start_pfn, end_pfn);
813 printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
814 nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
815 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
816 nodeid, bootmap, bootmap + bootmap_size);
817 free_bootmem_with_active_regions(nodeid, end_pfn);
818 early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
819
820 return bootmap + bootmap_size;
821}
822
814void __init setup_bootmem_allocator(void) 823void __init setup_bootmem_allocator(void)
815{ 824{
816 int i; 825 int nodeid;
817 unsigned long bootmap_size, bootmap; 826 unsigned long bootmap_size, bootmap;
818 /* 827 /*
819 * Initialize the boot-time allocator (with low memory only): 828 * Initialize the boot-time allocator (with low memory only):
820 */ 829 */
821 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; 830 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
822 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, 831 bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
823 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
824 PAGE_SIZE); 832 PAGE_SIZE);
825 if (bootmap == -1L) 833 if (bootmap == -1L)
826 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 834 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
827 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); 835 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
828 836
829 /* don't touch min_low_pfn */
830 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
831 min_low_pfn, max_low_pfn);
832 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 837 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
833 max_pfn_mapped<<PAGE_SHIFT); 838 max_pfn_mapped<<PAGE_SHIFT);
834 printk(KERN_INFO " low ram: %08lx - %08lx\n", 839 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
835 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
836 printk(KERN_INFO " bootmap %08lx - %08lx\n",
837 bootmap, bootmap + bootmap_size);
838 for_each_online_node(i)
839 free_bootmem_with_active_regions(i, max_low_pfn);
840 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
841
842 after_init_bootmem = 1;
843}
844
845static void __init find_early_table_space(unsigned long end, int use_pse)
846{
847 unsigned long puds, pmds, ptes, tables, start;
848
849 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
850 tables = PAGE_ALIGN(puds * sizeof(pud_t));
851
852 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
853 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
854 840
855 if (use_pse) { 841 for_each_online_node(nodeid) {
856 unsigned long extra; 842 unsigned long start_pfn, end_pfn;
857 843
858 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 844#ifdef CONFIG_NEED_MULTIPLE_NODES
859 extra += PMD_SIZE; 845 start_pfn = node_start_pfn[nodeid];
860 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; 846 end_pfn = node_end_pfn[nodeid];
861 } else 847 if (start_pfn > max_low_pfn)
862 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; 848 continue;
863 849 if (end_pfn > max_low_pfn)
864 tables += PAGE_ALIGN(ptes * sizeof(pte_t)); 850 end_pfn = max_low_pfn;
865
866 /* for fixmap */
867 tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t));
868
869 /*
870 * RED-PEN putting page tables only on node 0 could
871 * cause a hotspot and fill up ZONE_DMA. The page tables
872 * need roughly 0.5KB per GB.
873 */
874 start = 0x7000;
875 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
876 tables, PAGE_SIZE);
877 if (table_start == -1UL)
878 panic("Cannot find space for the kernel page tables");
879
880 table_start >>= PAGE_SHIFT;
881 table_end = table_start;
882 table_top = table_start + (tables>>PAGE_SHIFT);
883
884 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
885 end, table_start << PAGE_SHIFT,
886 (table_start << PAGE_SHIFT) + tables);
887}
888
889unsigned long __init_refok init_memory_mapping(unsigned long start,
890 unsigned long end)
891{
892 pgd_t *pgd_base = swapper_pg_dir;
893 unsigned long start_pfn, end_pfn;
894 unsigned long big_page_start;
895#ifdef CONFIG_DEBUG_PAGEALLOC
896 /*
897 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
898 * This will simplify cpa(), which otherwise needs to support splitting
899 * large pages into small in interrupt context, etc.
900 */
901 int use_pse = 0;
902#else 851#else
903 int use_pse = cpu_has_pse; 852 start_pfn = 0;
904#endif 853 end_pfn = max_low_pfn;
905
906 /*
907 * Find space for the kernel direct mapping tables.
908 */
909 if (!after_init_bootmem)
910 find_early_table_space(end, use_pse);
911
912#ifdef CONFIG_X86_PAE
913 set_nx();
914 if (nx_enabled)
915 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
916#endif 854#endif
917 855 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
918 /* Enable PSE if available */ 856 bootmap);
919 if (cpu_has_pse)
920 set_in_cr4(X86_CR4_PSE);
921
922 /* Enable PGE if available */
923 if (cpu_has_pge) {
924 set_in_cr4(X86_CR4_PGE);
925 __supported_pte_mask |= _PAGE_GLOBAL;
926 } 857 }
927 858
928 /* 859 after_bootmem = 1;
929 * Don't use a large page for the first 2/4MB of memory
930 * because there are often fixed size MTRRs in there
931 * and overlapping MTRRs into large pages can cause
932 * slowdowns.
933 */
934 big_page_start = PMD_SIZE;
935
936 if (start < big_page_start) {
937 start_pfn = start >> PAGE_SHIFT;
938 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
939 } else {
940 /* head is not big page alignment ? */
941 start_pfn = start >> PAGE_SHIFT;
942 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
943 << (PMD_SHIFT - PAGE_SHIFT);
944 }
945 if (start_pfn < end_pfn)
946 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
947
948 /* big page range */
949 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
950 << (PMD_SHIFT - PAGE_SHIFT);
951 if (start_pfn < (big_page_start >> PAGE_SHIFT))
952 start_pfn = big_page_start >> PAGE_SHIFT;
953 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
954 if (start_pfn < end_pfn)
955 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
956 use_pse);
957
958 /* tail is not big page alignment ? */
959 start_pfn = end_pfn;
960 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
961 end_pfn = end >> PAGE_SHIFT;
962 if (start_pfn < end_pfn)
963 kernel_physical_mapping_init(pgd_base, start_pfn,
964 end_pfn, 0);
965 }
966
967 early_ioremap_page_table_range_init(pgd_base);
968
969 load_cr3(swapper_pg_dir);
970
971 __flush_tlb_all();
972
973 if (!after_init_bootmem)
974 reserve_early(table_start << PAGE_SHIFT,
975 table_end << PAGE_SHIFT, "PGTABLE");
976
977 if (!after_init_bootmem)
978 early_memtest(start, end);
979
980 return end >> PAGE_SHIFT;
981} 860}
982 861
983
984/* 862/*
985 * paging_init() sets up the page tables - note that the first 8MB are 863 * paging_init() sets up the page tables - note that the first 8MB are
986 * already mapped by head.S. 864 * already mapped by head.S.
@@ -1176,17 +1054,47 @@ static noinline int do_test_wp_bit(void)
1176const int rodata_test_data = 0xC3; 1054const int rodata_test_data = 0xC3;
1177EXPORT_SYMBOL_GPL(rodata_test_data); 1055EXPORT_SYMBOL_GPL(rodata_test_data);
1178 1056
1057static int kernel_set_to_readonly;
1058
1059void set_kernel_text_rw(void)
1060{
1061 unsigned long start = PFN_ALIGN(_text);
1062 unsigned long size = PFN_ALIGN(_etext) - start;
1063
1064 if (!kernel_set_to_readonly)
1065 return;
1066
1067 pr_debug("Set kernel text: %lx - %lx for read write\n",
1068 start, start+size);
1069
1070 set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
1071}
1072
1073void set_kernel_text_ro(void)
1074{
1075 unsigned long start = PFN_ALIGN(_text);
1076 unsigned long size = PFN_ALIGN(_etext) - start;
1077
1078 if (!kernel_set_to_readonly)
1079 return;
1080
1081 pr_debug("Set kernel text: %lx - %lx for read only\n",
1082 start, start+size);
1083
1084 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1085}
1086
1179void mark_rodata_ro(void) 1087void mark_rodata_ro(void)
1180{ 1088{
1181 unsigned long start = PFN_ALIGN(_text); 1089 unsigned long start = PFN_ALIGN(_text);
1182 unsigned long size = PFN_ALIGN(_etext) - start; 1090 unsigned long size = PFN_ALIGN(_etext) - start;
1183 1091
1184#ifndef CONFIG_DYNAMIC_FTRACE
1185 /* Dynamic tracing modifies the kernel text section */
1186 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1092 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1187 printk(KERN_INFO "Write protecting the kernel text: %luk\n", 1093 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
1188 size >> 10); 1094 size >> 10);
1189 1095
1096 kernel_set_to_readonly = 1;
1097
1190#ifdef CONFIG_CPA_DEBUG 1098#ifdef CONFIG_CPA_DEBUG
1191 printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", 1099 printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
1192 start, start+size); 1100 start, start+size);
@@ -1195,7 +1103,6 @@ void mark_rodata_ro(void)
1195 printk(KERN_INFO "Testing CPA: write protecting again\n"); 1103 printk(KERN_INFO "Testing CPA: write protecting again\n");
1196 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); 1104 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
1197#endif 1105#endif
1198#endif /* CONFIG_DYNAMIC_FTRACE */
1199 1106
1200 start += size; 1107 start += size;
1201 size = (unsigned long)__end_rodata - start; 1108 size = (unsigned long)__end_rodata - start;
@@ -1214,52 +1121,6 @@ void mark_rodata_ro(void)
1214} 1121}
1215#endif 1122#endif
1216 1123
1217void free_init_pages(char *what, unsigned long begin, unsigned long end)
1218{
1219#ifdef CONFIG_DEBUG_PAGEALLOC
1220 /*
1221 * If debugging page accesses then do not free this memory but
1222 * mark them not present - any buggy init-section access will
1223 * create a kernel page fault:
1224 */
1225 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
1226 begin, PAGE_ALIGN(end));
1227 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
1228#else
1229 unsigned long addr;
1230
1231 /*
1232 * We just marked the kernel text read only above, now that
1233 * we are going to free part of that, we need to make that
1234 * writeable first.
1235 */
1236 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
1237
1238 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1239 ClearPageReserved(virt_to_page(addr));
1240 init_page_count(virt_to_page(addr));
1241 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
1242 free_page(addr);
1243 totalram_pages++;
1244 }
1245 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
1246#endif
1247}
1248
1249void free_initmem(void)
1250{
1251 free_init_pages("unused kernel memory",
1252 (unsigned long)(&__init_begin),
1253 (unsigned long)(&__init_end));
1254}
1255
1256#ifdef CONFIG_BLK_DEV_INITRD
1257void free_initrd_mem(unsigned long start, unsigned long end)
1258{
1259 free_init_pages("initrd memory", start, end);
1260}
1261#endif
1262
1263int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 1124int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1264 int flags) 1125 int flags)
1265{ 1126{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b1352250096e..1753e8020df6 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -48,6 +48,7 @@
48#include <asm/kdebug.h> 48#include <asm/kdebug.h>
49#include <asm/numa.h> 49#include <asm/numa.h>
50#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/init.h>
51 52
52/* 53/*
53 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 54 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata;
61 62
62DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 63DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
63 64
64int direct_gbpages
65#ifdef CONFIG_DIRECT_GBPAGES
66 = 1
67#endif
68;
69
70static int __init parse_direct_gbpages_off(char *arg) 65static int __init parse_direct_gbpages_off(char *arg)
71{ 66{
72 direct_gbpages = 0; 67 direct_gbpages = 0;
@@ -87,12 +82,10 @@ early_param("gbpages", parse_direct_gbpages_on);
87 * around without checking the pgd every time. 82 * around without checking the pgd every time.
88 */ 83 */
89 84
90int after_bootmem;
91
92pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; 85pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
93EXPORT_SYMBOL_GPL(__supported_pte_mask); 86EXPORT_SYMBOL_GPL(__supported_pte_mask);
94 87
95static int do_not_nx __cpuinitdata; 88static int disable_nx __cpuinitdata;
96 89
97/* 90/*
98 * noexec=on|off 91 * noexec=on|off
@@ -107,9 +100,9 @@ static int __init nonx_setup(char *str)
107 return -EINVAL; 100 return -EINVAL;
108 if (!strncmp(str, "on", 2)) { 101 if (!strncmp(str, "on", 2)) {
109 __supported_pte_mask |= _PAGE_NX; 102 __supported_pte_mask |= _PAGE_NX;
110 do_not_nx = 0; 103 disable_nx = 0;
111 } else if (!strncmp(str, "off", 3)) { 104 } else if (!strncmp(str, "off", 3)) {
112 do_not_nx = 1; 105 disable_nx = 1;
113 __supported_pte_mask &= ~_PAGE_NX; 106 __supported_pte_mask &= ~_PAGE_NX;
114 } 107 }
115 return 0; 108 return 0;
@@ -121,7 +114,7 @@ void __cpuinit check_efer(void)
121 unsigned long efer; 114 unsigned long efer;
122 115
123 rdmsrl(MSR_EFER, efer); 116 rdmsrl(MSR_EFER, efer);
124 if (!(efer & EFER_NX) || do_not_nx) 117 if (!(efer & EFER_NX) || disable_nx)
125 __supported_pte_mask &= ~_PAGE_NX; 118 __supported_pte_mask &= ~_PAGE_NX;
126} 119}
127 120
@@ -168,34 +161,51 @@ static __ref void *spp_getpage(void)
168 return ptr; 161 return ptr;
169} 162}
170 163
171void 164static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
172set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
173{ 165{
174 pud_t *pud; 166 if (pgd_none(*pgd)) {
175 pmd_t *pmd; 167 pud_t *pud = (pud_t *)spp_getpage();
176 pte_t *pte; 168 pgd_populate(&init_mm, pgd, pud);
169 if (pud != pud_offset(pgd, 0))
170 printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
171 pud, pud_offset(pgd, 0));
172 }
173 return pud_offset(pgd, vaddr);
174}
177 175
178 pud = pud_page + pud_index(vaddr); 176static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
177{
179 if (pud_none(*pud)) { 178 if (pud_none(*pud)) {
180 pmd = (pmd_t *) spp_getpage(); 179 pmd_t *pmd = (pmd_t *) spp_getpage();
181 pud_populate(&init_mm, pud, pmd); 180 pud_populate(&init_mm, pud, pmd);
182 if (pmd != pmd_offset(pud, 0)) { 181 if (pmd != pmd_offset(pud, 0))
183 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 182 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
184 pmd, pmd_offset(pud, 0)); 183 pmd, pmd_offset(pud, 0));
185 return;
186 }
187 } 184 }
188 pmd = pmd_offset(pud, vaddr); 185 return pmd_offset(pud, vaddr);
186}
187
188static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
189{
189 if (pmd_none(*pmd)) { 190 if (pmd_none(*pmd)) {
190 pte = (pte_t *) spp_getpage(); 191 pte_t *pte = (pte_t *) spp_getpage();
191 pmd_populate_kernel(&init_mm, pmd, pte); 192 pmd_populate_kernel(&init_mm, pmd, pte);
192 if (pte != pte_offset_kernel(pmd, 0)) { 193 if (pte != pte_offset_kernel(pmd, 0))
193 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 194 printk(KERN_ERR "PAGETABLE BUG #02!\n");
194 return;
195 }
196 } 195 }
196 return pte_offset_kernel(pmd, vaddr);
197}
198
199void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
200{
201 pud_t *pud;
202 pmd_t *pmd;
203 pte_t *pte;
204
205 pud = pud_page + pud_index(vaddr);
206 pmd = fill_pmd(pud, vaddr);
207 pte = fill_pte(pmd, vaddr);
197 208
198 pte = pte_offset_kernel(pmd, vaddr);
199 set_pte(pte, new_pte); 209 set_pte(pte, new_pte);
200 210
201 /* 211 /*
@@ -205,8 +215,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
205 __flush_tlb_one(vaddr); 215 __flush_tlb_one(vaddr);
206} 216}
207 217
208void 218void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
209set_pte_vaddr(unsigned long vaddr, pte_t pteval)
210{ 219{
211 pgd_t *pgd; 220 pgd_t *pgd;
212 pud_t *pud_page; 221 pud_t *pud_page;
@@ -223,6 +232,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
223 set_pte_vaddr_pud(pud_page, vaddr, pteval); 232 set_pte_vaddr_pud(pud_page, vaddr, pteval);
224} 233}
225 234
235pmd_t * __init populate_extra_pmd(unsigned long vaddr)
236{
237 pgd_t *pgd;
238 pud_t *pud;
239
240 pgd = pgd_offset_k(vaddr);
241 pud = fill_pud(pgd, vaddr);
242 return fill_pmd(pud, vaddr);
243}
244
245pte_t * __init populate_extra_pte(unsigned long vaddr)
246{
247 pmd_t *pmd;
248
249 pmd = populate_extra_pmd(vaddr);
250 return fill_pte(pmd, vaddr);
251}
252
226/* 253/*
227 * Create large page table mappings for a range of physical addresses. 254 * Create large page table mappings for a range of physical addresses.
228 */ 255 */
@@ -291,13 +318,9 @@ void __init cleanup_highmap(void)
291 } 318 }
292} 319}
293 320
294static unsigned long __initdata table_start;
295static unsigned long __meminitdata table_end;
296static unsigned long __meminitdata table_top;
297
298static __ref void *alloc_low_page(unsigned long *phys) 321static __ref void *alloc_low_page(unsigned long *phys)
299{ 322{
300 unsigned long pfn = table_end++; 323 unsigned long pfn = e820_table_end++;
301 void *adr; 324 void *adr;
302 325
303 if (after_bootmem) { 326 if (after_bootmem) {
@@ -307,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
307 return adr; 330 return adr;
308 } 331 }
309 332
310 if (pfn >= table_top) 333 if (pfn >= e820_table_top)
311 panic("alloc_low_page: ran out of memory"); 334 panic("alloc_low_page: ran out of memory");
312 335
313 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 336 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -547,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
547 return phys_pud_init(pud, addr, end, page_size_mask); 570 return phys_pud_init(pud, addr, end, page_size_mask);
548} 571}
549 572
550static void __init find_early_table_space(unsigned long end, int use_pse, 573unsigned long __init
551 int use_gbpages) 574kernel_physical_mapping_init(unsigned long start,
552{ 575 unsigned long end,
553 unsigned long puds, pmds, ptes, tables, start; 576 unsigned long page_size_mask)
554
555 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
556 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
557 if (use_gbpages) {
558 unsigned long extra;
559 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
560 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
561 } else
562 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
563 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
564
565 if (use_pse) {
566 unsigned long extra;
567 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
568 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
569 } else
570 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
571 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
572
573 /*
574 * RED-PEN putting page tables only on node 0 could
575 * cause a hotspot and fill up ZONE_DMA. The page tables
576 * need roughly 0.5KB per GB.
577 */
578 start = 0x8000;
579 table_start = find_e820_area(start, end, tables, PAGE_SIZE);
580 if (table_start == -1UL)
581 panic("Cannot find space for the kernel page tables");
582
583 table_start >>= PAGE_SHIFT;
584 table_end = table_start;
585 table_top = table_start + (tables >> PAGE_SHIFT);
586
587 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
588 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
589}
590
591static void __init init_gbpages(void)
592{
593 if (direct_gbpages && cpu_has_gbpages)
594 printk(KERN_INFO "Using GB pages for direct mapping\n");
595 else
596 direct_gbpages = 0;
597}
598
599static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
600 unsigned long end,
601 unsigned long page_size_mask)
602{ 577{
603 578
604 unsigned long next, last_map_addr = end; 579 unsigned long next, last_map_addr = end;
@@ -635,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
635 return last_map_addr; 610 return last_map_addr;
636} 611}
637 612
638struct map_range {
639 unsigned long start;
640 unsigned long end;
641 unsigned page_size_mask;
642};
643
644#define NR_RANGE_MR 5
645
646static int save_mr(struct map_range *mr, int nr_range,
647 unsigned long start_pfn, unsigned long end_pfn,
648 unsigned long page_size_mask)
649{
650
651 if (start_pfn < end_pfn) {
652 if (nr_range >= NR_RANGE_MR)
653 panic("run out of range for init_memory_mapping\n");
654 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
655 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
656 mr[nr_range].page_size_mask = page_size_mask;
657 nr_range++;
658 }
659
660 return nr_range;
661}
662
663/*
664 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
665 * This runs before bootmem is initialized and gets pages directly from
666 * the physical memory. To access them they are temporarily mapped.
667 */
668unsigned long __init_refok init_memory_mapping(unsigned long start,
669 unsigned long end)
670{
671 unsigned long last_map_addr = 0;
672 unsigned long page_size_mask = 0;
673 unsigned long start_pfn, end_pfn;
674 unsigned long pos;
675
676 struct map_range mr[NR_RANGE_MR];
677 int nr_range, i;
678 int use_pse, use_gbpages;
679
680 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
681
682 /*
683 * Find space for the kernel direct mapping tables.
684 *
685 * Later we should allocate these tables in the local node of the
686 * memory mapped. Unfortunately this is done currently before the
687 * nodes are discovered.
688 */
689 if (!after_bootmem)
690 init_gbpages();
691
692#ifdef CONFIG_DEBUG_PAGEALLOC
693 /*
694 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
695 * This will simplify cpa(), which otherwise needs to support splitting
696 * large pages into small in interrupt context, etc.
697 */
698 use_pse = use_gbpages = 0;
699#else
700 use_pse = cpu_has_pse;
701 use_gbpages = direct_gbpages;
702#endif
703
704 if (use_gbpages)
705 page_size_mask |= 1 << PG_LEVEL_1G;
706 if (use_pse)
707 page_size_mask |= 1 << PG_LEVEL_2M;
708
709 memset(mr, 0, sizeof(mr));
710 nr_range = 0;
711
712 /* head if not big page alignment ?*/
713 start_pfn = start >> PAGE_SHIFT;
714 pos = start_pfn << PAGE_SHIFT;
715 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
716 << (PMD_SHIFT - PAGE_SHIFT);
717 if (end_pfn > (end >> PAGE_SHIFT))
718 end_pfn = end >> PAGE_SHIFT;
719 if (start_pfn < end_pfn) {
720 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
721 pos = end_pfn << PAGE_SHIFT;
722 }
723
724 /* big page (2M) range*/
725 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
726 << (PMD_SHIFT - PAGE_SHIFT);
727 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
728 << (PUD_SHIFT - PAGE_SHIFT);
729 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
730 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
731 if (start_pfn < end_pfn) {
732 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
733 page_size_mask & (1<<PG_LEVEL_2M));
734 pos = end_pfn << PAGE_SHIFT;
735 }
736
737 /* big page (1G) range */
738 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
739 << (PUD_SHIFT - PAGE_SHIFT);
740 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
741 if (start_pfn < end_pfn) {
742 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
743 page_size_mask &
744 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
745 pos = end_pfn << PAGE_SHIFT;
746 }
747
748 /* tail is not big page (1G) alignment */
749 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
750 << (PMD_SHIFT - PAGE_SHIFT);
751 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
752 if (start_pfn < end_pfn) {
753 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
754 page_size_mask & (1<<PG_LEVEL_2M));
755 pos = end_pfn << PAGE_SHIFT;
756 }
757
758 /* tail is not big page (2M) alignment */
759 start_pfn = pos>>PAGE_SHIFT;
760 end_pfn = end>>PAGE_SHIFT;
761 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
762
763 /* try to merge same page size and continuous */
764 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
765 unsigned long old_start;
766 if (mr[i].end != mr[i+1].start ||
767 mr[i].page_size_mask != mr[i+1].page_size_mask)
768 continue;
769 /* move it */
770 old_start = mr[i].start;
771 memmove(&mr[i], &mr[i+1],
772 (nr_range - 1 - i) * sizeof (struct map_range));
773 mr[i--].start = old_start;
774 nr_range--;
775 }
776
777 for (i = 0; i < nr_range; i++)
778 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
779 mr[i].start, mr[i].end,
780 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
781 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
782
783 if (!after_bootmem)
784 find_early_table_space(end, use_pse, use_gbpages);
785
786 for (i = 0; i < nr_range; i++)
787 last_map_addr = kernel_physical_mapping_init(
788 mr[i].start, mr[i].end,
789 mr[i].page_size_mask);
790
791 if (!after_bootmem)
792 mmu_cr4_features = read_cr4();
793 __flush_tlb_all();
794
795 if (!after_bootmem && table_end > table_start)
796 reserve_early(table_start << PAGE_SHIFT,
797 table_end << PAGE_SHIFT, "PGTABLE");
798
799 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
800 last_map_addr, end);
801
802 if (!after_bootmem)
803 early_memtest(start, end);
804
805 return last_map_addr >> PAGE_SHIFT;
806}
807
808#ifndef CONFIG_NUMA 613#ifndef CONFIG_NUMA
809void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) 614void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
810{ 615{
@@ -876,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
876 681
877#endif /* CONFIG_MEMORY_HOTPLUG */ 682#endif /* CONFIG_MEMORY_HOTPLUG */
878 683
879/*
880 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
881 * is valid. The argument is a physical page number.
882 *
883 *
884 * On x86, access has to be given to the first megabyte of ram because that area
885 * contains bios code and data regions used by X and dosemu and similar apps.
886 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
887 * mmio resources as well as potential bios/acpi data regions.
888 */
889int devmem_is_allowed(unsigned long pagenr)
890{
891 if (pagenr <= 256)
892 return 1;
893 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
894 return 0;
895 if (!page_is_ram(pagenr))
896 return 1;
897 return 0;
898}
899
900
901static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, 684static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
902 kcore_modules, kcore_vsyscall; 685 kcore_modules, kcore_vsyscall;
903 686
@@ -947,46 +730,39 @@ void __init mem_init(void)
947 initsize >> 10); 730 initsize >> 10);
948} 731}
949 732
950void free_init_pages(char *what, unsigned long begin, unsigned long end) 733#ifdef CONFIG_DEBUG_RODATA
734const int rodata_test_data = 0xC3;
735EXPORT_SYMBOL_GPL(rodata_test_data);
736
737static int kernel_set_to_readonly;
738
739void set_kernel_text_rw(void)
951{ 740{
952 unsigned long addr = begin; 741 unsigned long start = PFN_ALIGN(_stext);
742 unsigned long end = PFN_ALIGN(__start_rodata);
953 743
954 if (addr >= end) 744 if (!kernel_set_to_readonly)
955 return; 745 return;
956 746
957 /* 747 pr_debug("Set kernel text: %lx - %lx for read write\n",
958 * If debugging page accesses then do not free this memory but 748 start, end);
959 * mark them not present - any buggy init-section access will 749
960 * create a kernel page fault: 750 set_memory_rw(start, (end - start) >> PAGE_SHIFT);
961 */
962#ifdef CONFIG_DEBUG_PAGEALLOC
963 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
964 begin, PAGE_ALIGN(end));
965 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
966#else
967 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
968
969 for (; addr < end; addr += PAGE_SIZE) {
970 ClearPageReserved(virt_to_page(addr));
971 init_page_count(virt_to_page(addr));
972 memset((void *)(addr & ~(PAGE_SIZE-1)),
973 POISON_FREE_INITMEM, PAGE_SIZE);
974 free_page(addr);
975 totalram_pages++;
976 }
977#endif
978} 751}
979 752
980void free_initmem(void) 753void set_kernel_text_ro(void)
981{ 754{
982 free_init_pages("unused kernel memory", 755 unsigned long start = PFN_ALIGN(_stext);
983 (unsigned long)(&__init_begin), 756 unsigned long end = PFN_ALIGN(__start_rodata);
984 (unsigned long)(&__init_end));
985}
986 757
987#ifdef CONFIG_DEBUG_RODATA 758 if (!kernel_set_to_readonly)
988const int rodata_test_data = 0xC3; 759 return;
989EXPORT_SYMBOL_GPL(rodata_test_data); 760
761 pr_debug("Set kernel text: %lx - %lx for read only\n",
762 start, end);
763
764 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
765}
990 766
991void mark_rodata_ro(void) 767void mark_rodata_ro(void)
992{ 768{
@@ -994,15 +770,12 @@ void mark_rodata_ro(void)
994 unsigned long rodata_start = 770 unsigned long rodata_start =
995 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 771 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
996 772
997#ifdef CONFIG_DYNAMIC_FTRACE
998 /* Dynamic tracing modifies the kernel text section */
999 start = rodata_start;
1000#endif
1001
1002 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 773 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
1003 (end - start) >> 10); 774 (end - start) >> 10);
1004 set_memory_ro(start, (end - start) >> PAGE_SHIFT); 775 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
1005 776
777 kernel_set_to_readonly = 1;
778
1006 /* 779 /*
1007 * The rodata section (but not the kernel text!) should also be 780 * The rodata section (but not the kernel text!) should also be
1008 * not-executable. 781 * not-executable.
@@ -1022,13 +795,6 @@ void mark_rodata_ro(void)
1022 795
1023#endif 796#endif
1024 797
1025#ifdef CONFIG_BLK_DEV_INITRD
1026void free_initrd_mem(unsigned long start, unsigned long end)
1027{
1028 free_init_pages("initrd memory", start, end);
1029}
1030#endif
1031
1032int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 798int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1033 int flags) 799 int flags)
1034{ 800{
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 04102d42ff42..8056545e2d39 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -19,10 +19,11 @@
19#include <asm/iomap.h> 19#include <asm/iomap.h>
20#include <asm/pat.h> 20#include <asm/pat.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/highmem.h>
22 23
23int is_io_mapping_possible(resource_size_t base, unsigned long size) 24int is_io_mapping_possible(resource_size_t base, unsigned long size)
24{ 25{
25#ifndef CONFIG_X86_PAE 26#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
26 /* There is no way to map greater than 1 << 32 address without PAE */ 27 /* There is no way to map greater than 1 << 32 address without PAE */
27 if (base + size > 0x100000000ULL) 28 if (base + size > 0x100000000ULL)
28 return 0; 29 return 0;
@@ -31,16 +32,28 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size)
31} 32}
32EXPORT_SYMBOL_GPL(is_io_mapping_possible); 33EXPORT_SYMBOL_GPL(is_io_mapping_possible);
33 34
34/* Map 'pfn' using fixed map 'type' and protections 'prot' 35void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
35 */
36void *
37iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
38{ 36{
39 enum fixed_addresses idx; 37 enum fixed_addresses idx;
40 unsigned long vaddr; 38 unsigned long vaddr;
41 39
42 pagefault_disable(); 40 pagefault_disable();
43 41
42 debug_kmap_atomic(type);
43 idx = type + KM_TYPE_NR * smp_processor_id();
44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
45 set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
46 arch_flush_lazy_mmu_mode();
47
48 return (void *)vaddr;
49}
50
51/*
52 * Map 'pfn' using fixed map 'type' and protections 'prot'
53 */
54void *
55iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
56{
44 /* 57 /*
45 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. 58 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
46 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the 59 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
@@ -50,12 +63,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
50 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) 63 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
51 prot = PAGE_KERNEL_UC_MINUS; 64 prot = PAGE_KERNEL_UC_MINUS;
52 65
53 idx = type + KM_TYPE_NR*smp_processor_id(); 66 return kmap_atomic_prot_pfn(pfn, type, prot);
54 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
55 set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
56 arch_flush_lazy_mmu_mode();
57
58 return (void*) vaddr;
59} 67}
60EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); 68EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
61 69
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 433f7bd4648a..0dfa09d69e80 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -22,13 +22,17 @@
22#include <asm/pgalloc.h> 22#include <asm/pgalloc.h>
23#include <asm/pat.h> 23#include <asm/pat.h>
24 24
25#ifdef CONFIG_X86_64 25static inline int phys_addr_valid(resource_size_t addr)
26
27static inline int phys_addr_valid(unsigned long addr)
28{ 26{
29 return addr < (1UL << boot_cpu_data.x86_phys_bits); 27#ifdef CONFIG_PHYS_ADDR_T_64BIT
28 return !(addr >> boot_cpu_data.x86_phys_bits);
29#else
30 return 1;
31#endif
30} 32}
31 33
34#ifdef CONFIG_X86_64
35
32unsigned long __phys_addr(unsigned long x) 36unsigned long __phys_addr(unsigned long x)
33{ 37{
34 if (x >= __START_KERNEL_map) { 38 if (x >= __START_KERNEL_map) {
@@ -38,8 +42,7 @@ unsigned long __phys_addr(unsigned long x)
38 } else { 42 } else {
39 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 43 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
40 x -= PAGE_OFFSET; 44 x -= PAGE_OFFSET;
41 VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : 45 VIRTUAL_BUG_ON(!phys_addr_valid(x));
42 !phys_addr_valid(x));
43 } 46 }
44 return x; 47 return x;
45} 48}
@@ -56,10 +59,8 @@ bool __virt_addr_valid(unsigned long x)
56 if (x < PAGE_OFFSET) 59 if (x < PAGE_OFFSET)
57 return false; 60 return false;
58 x -= PAGE_OFFSET; 61 x -= PAGE_OFFSET;
59 if (system_state == SYSTEM_BOOTING ? 62 if (!phys_addr_valid(x))
60 x > MAXMEM : !phys_addr_valid(x)) {
61 return false; 63 return false;
62 }
63 } 64 }
64 65
65 return pfn_valid(x >> PAGE_SHIFT); 66 return pfn_valid(x >> PAGE_SHIFT);
@@ -68,18 +69,12 @@ EXPORT_SYMBOL(__virt_addr_valid);
68 69
69#else 70#else
70 71
71static inline int phys_addr_valid(unsigned long addr)
72{
73 return 1;
74}
75
76#ifdef CONFIG_DEBUG_VIRTUAL 72#ifdef CONFIG_DEBUG_VIRTUAL
77unsigned long __phys_addr(unsigned long x) 73unsigned long __phys_addr(unsigned long x)
78{ 74{
79 /* VMALLOC_* aren't constants; not available at the boot time */ 75 /* VMALLOC_* aren't constants */
80 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 76 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
81 VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && 77 VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
82 is_vmalloc_addr((void *) x));
83 return x - PAGE_OFFSET; 78 return x - PAGE_OFFSET;
84} 79}
85EXPORT_SYMBOL(__phys_addr); 80EXPORT_SYMBOL(__phys_addr);
@@ -89,7 +84,9 @@ bool __virt_addr_valid(unsigned long x)
89{ 84{
90 if (x < PAGE_OFFSET) 85 if (x < PAGE_OFFSET)
91 return false; 86 return false;
92 if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) 87 if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
88 return false;
89 if (x >= FIXADDR_START)
93 return false; 90 return false;
94 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); 91 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
95} 92}
@@ -508,13 +505,19 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
508 return &bm_pte[pte_index(addr)]; 505 return &bm_pte[pte_index(addr)];
509} 506}
510 507
508static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
509
511void __init early_ioremap_init(void) 510void __init early_ioremap_init(void)
512{ 511{
513 pmd_t *pmd; 512 pmd_t *pmd;
513 int i;
514 514
515 if (early_ioremap_debug) 515 if (early_ioremap_debug)
516 printk(KERN_INFO "early_ioremap_init()\n"); 516 printk(KERN_INFO "early_ioremap_init()\n");
517 517
518 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
519 slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
520
518 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); 521 pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
519 memset(bm_pte, 0, sizeof(bm_pte)); 522 memset(bm_pte, 0, sizeof(bm_pte));
520 pmd_populate_kernel(&init_mm, pmd, bm_pte); 523 pmd_populate_kernel(&init_mm, pmd, bm_pte);
@@ -581,6 +584,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
581 584
582static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; 585static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
583static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; 586static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
587
584static int __init check_early_ioremap_leak(void) 588static int __init check_early_ioremap_leak(void)
585{ 589{
586 int count = 0; 590 int count = 0;
@@ -602,7 +606,8 @@ static int __init check_early_ioremap_leak(void)
602} 606}
603late_initcall(check_early_ioremap_leak); 607late_initcall(check_early_ioremap_leak);
604 608
605static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) 609static void __init __iomem *
610__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
606{ 611{
607 unsigned long offset, last_addr; 612 unsigned long offset, last_addr;
608 unsigned int nrpages; 613 unsigned int nrpages;
@@ -668,9 +673,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo
668 --nrpages; 673 --nrpages;
669 } 674 }
670 if (early_ioremap_debug) 675 if (early_ioremap_debug)
671 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); 676 printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
672 677
673 prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0)); 678 prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
674 return prev_map[slot]; 679 return prev_map[slot];
675} 680}
676 681
@@ -738,8 +743,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
738 } 743 }
739 prev_map[slot] = NULL; 744 prev_map[slot] = NULL;
740} 745}
741
742void __this_fixmap_does_not_exist(void)
743{
744 WARN_ON(1);
745}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 9f205030d9aa..4f115e00486b 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -310,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
310 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); 310 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
311 311
312 if (!ctx->active) { 312 if (!ctx->active) {
313 pr_warning("kmmio: spurious debug trap on CPU %d.\n", 313 pr_debug("kmmio: spurious debug trap on CPU %d.\n",
314 smp_processor_id()); 314 smp_processor_id());
315 goto out; 315 goto out;
316 } 316 }
@@ -451,23 +451,24 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
451 451
452static void remove_kmmio_fault_pages(struct rcu_head *head) 452static void remove_kmmio_fault_pages(struct rcu_head *head)
453{ 453{
454 struct kmmio_delayed_release *dr = container_of( 454 struct kmmio_delayed_release *dr =
455 head, 455 container_of(head, struct kmmio_delayed_release, rcu);
456 struct kmmio_delayed_release,
457 rcu);
458 struct kmmio_fault_page *p = dr->release_list; 456 struct kmmio_fault_page *p = dr->release_list;
459 struct kmmio_fault_page **prevp = &dr->release_list; 457 struct kmmio_fault_page **prevp = &dr->release_list;
460 unsigned long flags; 458 unsigned long flags;
459
461 spin_lock_irqsave(&kmmio_lock, flags); 460 spin_lock_irqsave(&kmmio_lock, flags);
462 while (p) { 461 while (p) {
463 if (!p->count) 462 if (!p->count) {
464 list_del_rcu(&p->list); 463 list_del_rcu(&p->list);
465 else 464 prevp = &p->release_next;
465 } else {
466 *prevp = p->release_next; 466 *prevp = p->release_next;
467 prevp = &p->release_next; 467 }
468 p = p->release_next; 468 p = p->release_next;
469 } 469 }
470 spin_unlock_irqrestore(&kmmio_lock, flags); 470 spin_unlock_irqrestore(&kmmio_lock, flags);
471
471 /* This is the real RCU destroy call. */ 472 /* This is the real RCU destroy call. */
472 call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); 473 call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
473} 474}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 0bcd7883d036..605c8be06217 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg)
100{ 100{
101 if (arg) 101 if (arg)
102 memtest_pattern = simple_strtoul(arg, NULL, 0); 102 memtest_pattern = simple_strtoul(arg, NULL, 0);
103 else
104 memtest_pattern = ARRAY_SIZE(patterns);
105
103 return 0; 106 return 0;
104} 107}
105 108
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 2c4baa88f2cb..c9342ed8b402 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -378,27 +378,34 @@ static void clear_trace_list(void)
378} 378}
379 379
380#ifdef CONFIG_HOTPLUG_CPU 380#ifdef CONFIG_HOTPLUG_CPU
381static cpumask_t downed_cpus; 381static cpumask_var_t downed_cpus;
382 382
383static void enter_uniprocessor(void) 383static void enter_uniprocessor(void)
384{ 384{
385 int cpu; 385 int cpu;
386 int err; 386 int err;
387 387
388 if (downed_cpus == NULL &&
389 !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
390 pr_notice(NAME "Failed to allocate mask\n");
391 goto out;
392 }
393
388 get_online_cpus(); 394 get_online_cpus();
389 downed_cpus = cpu_online_map; 395 cpumask_copy(downed_cpus, cpu_online_mask);
390 cpu_clear(first_cpu(cpu_online_map), downed_cpus); 396 cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
391 if (num_online_cpus() > 1) 397 if (num_online_cpus() > 1)
392 pr_notice(NAME "Disabling non-boot CPUs...\n"); 398 pr_notice(NAME "Disabling non-boot CPUs...\n");
393 put_online_cpus(); 399 put_online_cpus();
394 400
395 for_each_cpu_mask(cpu, downed_cpus) { 401 for_each_cpu(cpu, downed_cpus) {
396 err = cpu_down(cpu); 402 err = cpu_down(cpu);
397 if (!err) 403 if (!err)
398 pr_info(NAME "CPU%d is down.\n", cpu); 404 pr_info(NAME "CPU%d is down.\n", cpu);
399 else 405 else
400 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); 406 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err);
401 } 407 }
408out:
402 if (num_online_cpus() > 1) 409 if (num_online_cpus() > 1)
403 pr_warning(NAME "multiple CPUs still online, " 410 pr_warning(NAME "multiple CPUs still online, "
404 "may miss events.\n"); 411 "may miss events.\n");
@@ -411,10 +418,10 @@ static void __ref leave_uniprocessor(void)
411 int cpu; 418 int cpu;
412 int err; 419 int err;
413 420
414 if (cpus_weight(downed_cpus) == 0) 421 if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
415 return; 422 return;
416 pr_notice(NAME "Re-enabling CPUs...\n"); 423 pr_notice(NAME "Re-enabling CPUs...\n");
417 for_each_cpu_mask(cpu, downed_cpus) { 424 for_each_cpu(cpu, downed_cpus) {
418 err = cpu_up(cpu); 425 err = cpu_up(cpu);
419 if (!err) 426 if (!err)
420 pr_info(NAME "enabled CPU%d.\n", cpu); 427 pr_info(NAME "enabled CPU%d.\n", cpu);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
new file mode 100644
index 000000000000..550df481accd
--- /dev/null
+++ b/arch/x86/mm/numa.c
@@ -0,0 +1,67 @@
1/* Common code for 32 and 64-bit NUMA */
2#include <linux/topology.h>
3#include <linux/module.h>
4#include <linux/bootmem.h>
5
6#ifdef CONFIG_DEBUG_PER_CPU_MAPS
7# define DBG(x...) printk(KERN_DEBUG x)
8#else
9# define DBG(x...)
10#endif
11
12/*
13 * Which logical CPUs are on which nodes
14 */
15cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
16EXPORT_SYMBOL(node_to_cpumask_map);
17
18/*
19 * Allocate node_to_cpumask_map based on number of available nodes
20 * Requires node_possible_map to be valid.
21 *
22 * Note: node_to_cpumask() is not valid until after this is done.
23 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
24 */
25void __init setup_node_to_cpumask_map(void)
26{
27 unsigned int node, num = 0;
28
29 /* setup nr_node_ids if not done yet */
30 if (nr_node_ids == MAX_NUMNODES) {
31 for_each_node_mask(node, node_possible_map)
32 num = node;
33 nr_node_ids = num + 1;
34 }
35
36 /* allocate the map */
37 for (node = 0; node < nr_node_ids; node++)
38 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
39
40 /* cpumask_of_node() will now work */
41 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
42}
43
44#ifdef CONFIG_DEBUG_PER_CPU_MAPS
45/*
46 * Returns a pointer to the bitmask of CPUs on Node 'node'.
47 */
48const struct cpumask *cpumask_of_node(int node)
49{
50 if (node >= nr_node_ids) {
51 printk(KERN_WARNING
52 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
53 node, nr_node_ids);
54 dump_stack();
55 return cpu_none_mask;
56 }
57 if (node_to_cpumask_map[node] == NULL) {
58 printk(KERN_WARNING
59 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
60 node);
61 dump_stack();
62 return cpu_online_mask;
63 }
64 return node_to_cpumask_map[node];
65}
66EXPORT_SYMBOL(cpumask_of_node);
67#endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 3957cd6d6454..3daefa04ace5 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -416,39 +416,14 @@ void __init initmem_init(unsigned long start_pfn,
416 for_each_online_node(nid) 416 for_each_online_node(nid)
417 propagate_e820_map_node(nid); 417 propagate_e820_map_node(nid);
418 418
419 for_each_online_node(nid) 419 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
422 }
421 423
422 NODE_DATA(0)->bdata = &bootmem_node_data[0];
423 setup_bootmem_allocator(); 424 setup_bootmem_allocator();
424} 425}
425 426
426void __init set_highmem_pages_init(void)
427{
428#ifdef CONFIG_HIGHMEM
429 struct zone *zone;
430 int nid;
431
432 for_each_zone(zone) {
433 unsigned long zone_start_pfn, zone_end_pfn;
434
435 if (!is_highmem(zone))
436 continue;
437
438 zone_start_pfn = zone->zone_start_pfn;
439 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
440
441 nid = zone_to_nid(zone);
442 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
443 zone->name, nid, zone_start_pfn, zone_end_pfn);
444
445 add_highpages_with_active_regions(nid, zone_start_pfn,
446 zone_end_pfn);
447 }
448 totalram_pages += totalhigh_pages;
449#endif
450}
451
452#ifdef CONFIG_MEMORY_HOTPLUG 427#ifdef CONFIG_MEMORY_HOTPLUG
453static int paddr_to_nid(u64 addr) 428static int paddr_to_nid(u64 addr)
454{ 429{
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 64c9cf043cdd..d73aaa892371 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,12 +20,6 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifdef CONFIG_DEBUG_PER_CPU_MAPS
24# define DBG(x...) printk(KERN_DEBUG x)
25#else
26# define DBG(x...)
27#endif
28
29struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
30EXPORT_SYMBOL(node_data); 24EXPORT_SYMBOL(node_data);
31 25
@@ -49,12 +43,6 @@ DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
49EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); 43EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
50 44
51/* 45/*
52 * Which logical CPUs are on which nodes
53 */
54cpumask_t *node_to_cpumask_map;
55EXPORT_SYMBOL(node_to_cpumask_map);
56
57/*
58 * Given a shift value, try to populate memnodemap[] 46 * Given a shift value, try to populate memnodemap[]
59 * Returns : 47 * Returns :
60 * 1 if OK 48 * 1 if OK
@@ -661,36 +649,6 @@ void __init init_cpu_to_node(void)
661#endif 649#endif
662 650
663 651
664/*
665 * Allocate node_to_cpumask_map based on number of available nodes
666 * Requires node_possible_map to be valid.
667 *
668 * Note: node_to_cpumask() is not valid until after this is done.
669 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
670 */
671void __init setup_node_to_cpumask_map(void)
672{
673 unsigned int node, num = 0;
674 cpumask_t *map;
675
676 /* setup nr_node_ids if not done yet */
677 if (nr_node_ids == MAX_NUMNODES) {
678 for_each_node_mask(node, node_possible_map)
679 num = node;
680 nr_node_ids = num + 1;
681 }
682
683 /* allocate the map */
684 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
685 DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
686
687 pr_debug("Node to cpumask map at %p for %d nodes\n",
688 map, nr_node_ids);
689
690 /* node_to_cpumask() will now work */
691 node_to_cpumask_map = map;
692}
693
694void __cpuinit numa_set_node(int cpu, int node) 652void __cpuinit numa_set_node(int cpu, int node)
695{ 653{
696 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 654 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
@@ -723,12 +681,12 @@ void __cpuinit numa_clear_node(int cpu)
723 681
724void __cpuinit numa_add_cpu(int cpu) 682void __cpuinit numa_add_cpu(int cpu)
725{ 683{
726 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 684 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
727} 685}
728 686
729void __cpuinit numa_remove_cpu(int cpu) 687void __cpuinit numa_remove_cpu(int cpu)
730{ 688{
731 cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 689 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
732} 690}
733 691
734#else /* CONFIG_DEBUG_PER_CPU_MAPS */ 692#else /* CONFIG_DEBUG_PER_CPU_MAPS */
@@ -739,20 +697,20 @@ void __cpuinit numa_remove_cpu(int cpu)
739static void __cpuinit numa_set_cpumask(int cpu, int enable) 697static void __cpuinit numa_set_cpumask(int cpu, int enable)
740{ 698{
741 int node = early_cpu_to_node(cpu); 699 int node = early_cpu_to_node(cpu);
742 cpumask_t *mask; 700 struct cpumask *mask;
743 char buf[64]; 701 char buf[64];
744 702
745 if (node_to_cpumask_map == NULL) { 703 mask = node_to_cpumask_map[node];
746 printk(KERN_ERR "node_to_cpumask_map NULL\n"); 704 if (mask == NULL) {
705 printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
747 dump_stack(); 706 dump_stack();
748 return; 707 return;
749 } 708 }
750 709
751 mask = &node_to_cpumask_map[node];
752 if (enable) 710 if (enable)
753 cpu_set(cpu, *mask); 711 cpumask_set_cpu(cpu, mask);
754 else 712 else
755 cpu_clear(cpu, *mask); 713 cpumask_clear_cpu(cpu, mask);
756 714
757 cpulist_scnprintf(buf, sizeof(buf), mask); 715 cpulist_scnprintf(buf, sizeof(buf), mask);
758 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", 716 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
@@ -799,59 +757,6 @@ int early_cpu_to_node(int cpu)
799 return per_cpu(x86_cpu_to_node_map, cpu); 757 return per_cpu(x86_cpu_to_node_map, cpu);
800} 758}
801 759
802
803/* empty cpumask */
804static const cpumask_t cpu_mask_none;
805
806/*
807 * Returns a pointer to the bitmask of CPUs on Node 'node'.
808 */
809const cpumask_t *cpumask_of_node(int node)
810{
811 if (node_to_cpumask_map == NULL) {
812 printk(KERN_WARNING
813 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
814 node);
815 dump_stack();
816 return (const cpumask_t *)&cpu_online_map;
817 }
818 if (node >= nr_node_ids) {
819 printk(KERN_WARNING
820 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
821 node, nr_node_ids);
822 dump_stack();
823 return &cpu_mask_none;
824 }
825 return &node_to_cpumask_map[node];
826}
827EXPORT_SYMBOL(cpumask_of_node);
828
829/*
830 * Returns a bitmask of CPUs on Node 'node'.
831 *
832 * Side note: this function creates the returned cpumask on the stack
833 * so with a high NR_CPUS count, excessive stack space is used. The
834 * node_to_cpumask_ptr function should be used whenever possible.
835 */
836cpumask_t node_to_cpumask(int node)
837{
838 if (node_to_cpumask_map == NULL) {
839 printk(KERN_WARNING
840 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
841 dump_stack();
842 return cpu_online_map;
843 }
844 if (node >= nr_node_ids) {
845 printk(KERN_WARNING
846 "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
847 node, nr_node_ids);
848 dump_stack();
849 return cpu_mask_none;
850 }
851 return node_to_cpumask_map[node];
852}
853EXPORT_SYMBOL(node_to_cpumask);
854
855/* 760/*
856 * --------- end of debug versions of the numa functions --------- 761 * --------- end of debug versions of the numa functions ---------
857 */ 762 */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8253bc97587e..d71e1b636ce6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -16,6 +16,7 @@
16#include <asm/processor.h> 16#include <asm/processor.h>
17#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
18#include <asm/sections.h> 18#include <asm/sections.h>
19#include <asm/setup.h>
19#include <asm/uaccess.h> 20#include <asm/uaccess.h>
20#include <asm/pgalloc.h> 21#include <asm/pgalloc.h>
21#include <asm/proto.h> 22#include <asm/proto.h>
@@ -33,6 +34,7 @@ struct cpa_data {
33 unsigned long pfn; 34 unsigned long pfn;
34 unsigned force_split : 1; 35 unsigned force_split : 1;
35 int curpage; 36 int curpage;
37 struct page **pages;
36}; 38};
37 39
38/* 40/*
@@ -45,6 +47,7 @@ static DEFINE_SPINLOCK(cpa_lock);
45 47
46#define CPA_FLUSHTLB 1 48#define CPA_FLUSHTLB 1
47#define CPA_ARRAY 2 49#define CPA_ARRAY 2
50#define CPA_PAGES_ARRAY 4
48 51
49#ifdef CONFIG_PROC_FS 52#ifdef CONFIG_PROC_FS
50static unsigned long direct_pages_count[PG_LEVEL_NUM]; 53static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -95,7 +98,7 @@ static inline unsigned long highmap_start_pfn(void)
95 98
96static inline unsigned long highmap_end_pfn(void) 99static inline unsigned long highmap_end_pfn(void)
97{ 100{
98 return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; 101 return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
99} 102}
100 103
101#endif 104#endif
@@ -201,10 +204,10 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
201 } 204 }
202} 205}
203 206
204static void cpa_flush_array(unsigned long *start, int numpages, int cache) 207static void cpa_flush_array(unsigned long *start, int numpages, int cache,
208 int in_flags, struct page **pages)
205{ 209{
206 unsigned int i, level; 210 unsigned int i, level;
207 unsigned long *addr;
208 211
209 BUG_ON(irqs_disabled()); 212 BUG_ON(irqs_disabled());
210 213
@@ -225,14 +228,22 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache)
225 * will cause all other CPUs to flush the same 228 * will cause all other CPUs to flush the same
226 * cachelines: 229 * cachelines:
227 */ 230 */
228 for (i = 0, addr = start; i < numpages; i++, addr++) { 231 for (i = 0; i < numpages; i++) {
229 pte_t *pte = lookup_address(*addr, &level); 232 unsigned long addr;
233 pte_t *pte;
234
235 if (in_flags & CPA_PAGES_ARRAY)
236 addr = (unsigned long)page_address(pages[i]);
237 else
238 addr = start[i];
239
240 pte = lookup_address(addr, &level);
230 241
231 /* 242 /*
232 * Only flush present addresses: 243 * Only flush present addresses:
233 */ 244 */
234 if (pte && (pte_val(*pte) & _PAGE_PRESENT)) 245 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
235 clflush_cache_range((void *) *addr, PAGE_SIZE); 246 clflush_cache_range((void *)addr, PAGE_SIZE);
236 } 247 }
237} 248}
238 249
@@ -522,6 +533,17 @@ static int split_large_page(pte_t *kpte, unsigned long address)
522 * primary protection behavior: 533 * primary protection behavior:
523 */ 534 */
524 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); 535 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
536
537 /*
538 * Intel Atom errata AAH41 workaround.
539 *
540 * The real fix should be in hw or in a microcode update, but
541 * we also probabilistically try to reduce the window of having
542 * a large TLB mixed with 4K TLBs while instruction fetches are
543 * going on.
544 */
545 __flush_tlb_all();
546
525 base = NULL; 547 base = NULL;
526 548
527out_unlock: 549out_unlock:
@@ -573,7 +595,9 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
573 unsigned int level; 595 unsigned int level;
574 pte_t *kpte, old_pte; 596 pte_t *kpte, old_pte;
575 597
576 if (cpa->flags & CPA_ARRAY) 598 if (cpa->flags & CPA_PAGES_ARRAY)
599 address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
600 else if (cpa->flags & CPA_ARRAY)
577 address = cpa->vaddr[cpa->curpage]; 601 address = cpa->vaddr[cpa->curpage];
578 else 602 else
579 address = *cpa->vaddr; 603 address = *cpa->vaddr;
@@ -676,7 +700,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
676 * No need to redo, when the primary call touched the direct 700 * No need to redo, when the primary call touched the direct
677 * mapping already: 701 * mapping already:
678 */ 702 */
679 if (cpa->flags & CPA_ARRAY) 703 if (cpa->flags & CPA_PAGES_ARRAY)
704 vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
705 else if (cpa->flags & CPA_ARRAY)
680 vaddr = cpa->vaddr[cpa->curpage]; 706 vaddr = cpa->vaddr[cpa->curpage];
681 else 707 else
682 vaddr = *cpa->vaddr; 708 vaddr = *cpa->vaddr;
@@ -687,7 +713,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
687 alias_cpa = *cpa; 713 alias_cpa = *cpa;
688 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 714 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
689 alias_cpa.vaddr = &temp_cpa_vaddr; 715 alias_cpa.vaddr = &temp_cpa_vaddr;
690 alias_cpa.flags &= ~CPA_ARRAY; 716 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
691 717
692 718
693 ret = __change_page_attr_set_clr(&alias_cpa, 0); 719 ret = __change_page_attr_set_clr(&alias_cpa, 0);
@@ -700,7 +726,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
700 * No need to redo, when the primary call touched the high 726 * No need to redo, when the primary call touched the high
701 * mapping already: 727 * mapping already:
702 */ 728 */
703 if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) 729 if (within(vaddr, (unsigned long) _text, _brk_end))
704 return 0; 730 return 0;
705 731
706 /* 732 /*
@@ -713,7 +739,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
713 alias_cpa = *cpa; 739 alias_cpa = *cpa;
714 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; 740 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
715 alias_cpa.vaddr = &temp_cpa_vaddr; 741 alias_cpa.vaddr = &temp_cpa_vaddr;
716 alias_cpa.flags &= ~CPA_ARRAY; 742 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
717 743
718 /* 744 /*
719 * The high mapping range is imprecise, so ignore the return value. 745 * The high mapping range is imprecise, so ignore the return value.
@@ -734,7 +760,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
734 */ 760 */
735 cpa->numpages = numpages; 761 cpa->numpages = numpages;
736 /* for array changes, we can't use large page */ 762 /* for array changes, we can't use large page */
737 if (cpa->flags & CPA_ARRAY) 763 if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
738 cpa->numpages = 1; 764 cpa->numpages = 1;
739 765
740 if (!debug_pagealloc) 766 if (!debug_pagealloc)
@@ -758,7 +784,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
758 */ 784 */
759 BUG_ON(cpa->numpages > numpages); 785 BUG_ON(cpa->numpages > numpages);
760 numpages -= cpa->numpages; 786 numpages -= cpa->numpages;
761 if (cpa->flags & CPA_ARRAY) 787 if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
762 cpa->curpage++; 788 cpa->curpage++;
763 else 789 else
764 *cpa->vaddr += cpa->numpages * PAGE_SIZE; 790 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
@@ -775,7 +801,8 @@ static inline int cache_attr(pgprot_t attr)
775 801
776static int change_page_attr_set_clr(unsigned long *addr, int numpages, 802static int change_page_attr_set_clr(unsigned long *addr, int numpages,
777 pgprot_t mask_set, pgprot_t mask_clr, 803 pgprot_t mask_set, pgprot_t mask_clr,
778 int force_split, int array) 804 int force_split, int in_flag,
805 struct page **pages)
779{ 806{
780 struct cpa_data cpa; 807 struct cpa_data cpa;
781 int ret, cache, checkalias; 808 int ret, cache, checkalias;
@@ -790,15 +817,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
790 return 0; 817 return 0;
791 818
792 /* Ensure we are PAGE_SIZE aligned */ 819 /* Ensure we are PAGE_SIZE aligned */
793 if (!array) { 820 if (in_flag & CPA_ARRAY) {
794 if (*addr & ~PAGE_MASK) {
795 *addr &= PAGE_MASK;
796 /*
797 * People should not be passing in unaligned addresses:
798 */
799 WARN_ON_ONCE(1);
800 }
801 } else {
802 int i; 821 int i;
803 for (i = 0; i < numpages; i++) { 822 for (i = 0; i < numpages; i++) {
804 if (addr[i] & ~PAGE_MASK) { 823 if (addr[i] & ~PAGE_MASK) {
@@ -806,6 +825,18 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
806 WARN_ON_ONCE(1); 825 WARN_ON_ONCE(1);
807 } 826 }
808 } 827 }
828 } else if (!(in_flag & CPA_PAGES_ARRAY)) {
829 /*
830 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
831 * No need to cehck in that case
832 */
833 if (*addr & ~PAGE_MASK) {
834 *addr &= PAGE_MASK;
835 /*
836 * People should not be passing in unaligned addresses:
837 */
838 WARN_ON_ONCE(1);
839 }
809 } 840 }
810 841
811 /* Must avoid aliasing mappings in the highmem code */ 842 /* Must avoid aliasing mappings in the highmem code */
@@ -821,6 +852,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
821 arch_flush_lazy_mmu_mode(); 852 arch_flush_lazy_mmu_mode();
822 853
823 cpa.vaddr = addr; 854 cpa.vaddr = addr;
855 cpa.pages = pages;
824 cpa.numpages = numpages; 856 cpa.numpages = numpages;
825 cpa.mask_set = mask_set; 857 cpa.mask_set = mask_set;
826 cpa.mask_clr = mask_clr; 858 cpa.mask_clr = mask_clr;
@@ -828,8 +860,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
828 cpa.curpage = 0; 860 cpa.curpage = 0;
829 cpa.force_split = force_split; 861 cpa.force_split = force_split;
830 862
831 if (array) 863 if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
832 cpa.flags |= CPA_ARRAY; 864 cpa.flags |= in_flag;
833 865
834 /* No alias checking for _NX bit modifications */ 866 /* No alias checking for _NX bit modifications */
835 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 867 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -855,9 +887,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
855 * wbindv): 887 * wbindv):
856 */ 888 */
857 if (!ret && cpu_has_clflush) { 889 if (!ret && cpu_has_clflush) {
858 if (cpa.flags & CPA_ARRAY) 890 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
859 cpa_flush_array(addr, numpages, cache); 891 cpa_flush_array(addr, numpages, cache,
860 else 892 cpa.flags, pages);
893 } else
861 cpa_flush_range(*addr, numpages, cache); 894 cpa_flush_range(*addr, numpages, cache);
862 } else 895 } else
863 cpa_flush_all(cache); 896 cpa_flush_all(cache);
@@ -877,14 +910,28 @@ static inline int change_page_attr_set(unsigned long *addr, int numpages,
877 pgprot_t mask, int array) 910 pgprot_t mask, int array)
878{ 911{
879 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, 912 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
880 array); 913 (array ? CPA_ARRAY : 0), NULL);
881} 914}
882 915
883static inline int change_page_attr_clear(unsigned long *addr, int numpages, 916static inline int change_page_attr_clear(unsigned long *addr, int numpages,
884 pgprot_t mask, int array) 917 pgprot_t mask, int array)
885{ 918{
886 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, 919 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
887 array); 920 (array ? CPA_ARRAY : 0), NULL);
921}
922
923static inline int cpa_set_pages_array(struct page **pages, int numpages,
924 pgprot_t mask)
925{
926 return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
927 CPA_PAGES_ARRAY, pages);
928}
929
930static inline int cpa_clear_pages_array(struct page **pages, int numpages,
931 pgprot_t mask)
932{
933 return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
934 CPA_PAGES_ARRAY, pages);
888} 935}
889 936
890int _set_memory_uc(unsigned long addr, int numpages) 937int _set_memory_uc(unsigned long addr, int numpages)
@@ -1032,7 +1079,7 @@ int set_memory_np(unsigned long addr, int numpages)
1032int set_memory_4k(unsigned long addr, int numpages) 1079int set_memory_4k(unsigned long addr, int numpages)
1033{ 1080{
1034 return change_page_attr_set_clr(&addr, numpages, __pgprot(0), 1081 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
1035 __pgprot(0), 1, 0); 1082 __pgprot(0), 1, 0, NULL);
1036} 1083}
1037 1084
1038int set_pages_uc(struct page *page, int numpages) 1085int set_pages_uc(struct page *page, int numpages)
@@ -1043,6 +1090,35 @@ int set_pages_uc(struct page *page, int numpages)
1043} 1090}
1044EXPORT_SYMBOL(set_pages_uc); 1091EXPORT_SYMBOL(set_pages_uc);
1045 1092
1093int set_pages_array_uc(struct page **pages, int addrinarray)
1094{
1095 unsigned long start;
1096 unsigned long end;
1097 int i;
1098 int free_idx;
1099
1100 for (i = 0; i < addrinarray; i++) {
1101 start = (unsigned long)page_address(pages[i]);
1102 end = start + PAGE_SIZE;
1103 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
1104 goto err_out;
1105 }
1106
1107 if (cpa_set_pages_array(pages, addrinarray,
1108 __pgprot(_PAGE_CACHE_UC_MINUS)) == 0) {
1109 return 0; /* Success */
1110 }
1111err_out:
1112 free_idx = i;
1113 for (i = 0; i < free_idx; i++) {
1114 start = (unsigned long)page_address(pages[i]);
1115 end = start + PAGE_SIZE;
1116 free_memtype(start, end);
1117 }
1118 return -EINVAL;
1119}
1120EXPORT_SYMBOL(set_pages_array_uc);
1121
1046int set_pages_wb(struct page *page, int numpages) 1122int set_pages_wb(struct page *page, int numpages)
1047{ 1123{
1048 unsigned long addr = (unsigned long)page_address(page); 1124 unsigned long addr = (unsigned long)page_address(page);
@@ -1051,6 +1127,26 @@ int set_pages_wb(struct page *page, int numpages)
1051} 1127}
1052EXPORT_SYMBOL(set_pages_wb); 1128EXPORT_SYMBOL(set_pages_wb);
1053 1129
1130int set_pages_array_wb(struct page **pages, int addrinarray)
1131{
1132 int retval;
1133 unsigned long start;
1134 unsigned long end;
1135 int i;
1136
1137 retval = cpa_clear_pages_array(pages, addrinarray,
1138 __pgprot(_PAGE_CACHE_MASK));
1139
1140 for (i = 0; i < addrinarray; i++) {
1141 start = (unsigned long)page_address(pages[i]);
1142 end = start + PAGE_SIZE;
1143 free_memtype(start, end);
1144 }
1145
1146 return retval;
1147}
1148EXPORT_SYMBOL(set_pages_array_wb);
1149
1054int set_pages_x(struct page *page, int numpages) 1150int set_pages_x(struct page *page, int numpages)
1055{ 1151{
1056 unsigned long addr = (unsigned long)page_address(page); 1152 unsigned long addr = (unsigned long)page_address(page);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2ed37158012d..640339ee4fb2 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -677,10 +677,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
677 is_ram = pat_pagerange_is_ram(paddr, paddr + size); 677 is_ram = pat_pagerange_is_ram(paddr, paddr + size);
678 678
679 /* 679 /*
680 * reserve_pfn_range() doesn't support RAM pages. 680 * reserve_pfn_range() doesn't support RAM pages. Maintain the current
681 * behavior with RAM pages by returning success.
681 */ 682 */
682 if (is_ram != 0) 683 if (is_ram != 0)
683 return -EINVAL; 684 return 0;
684 685
685 ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); 686 ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
686 if (ret) 687 if (ret)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 86f2ffc43c3d..5b7c7c8464fe 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -313,6 +313,24 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
313 return young; 313 return young;
314} 314}
315 315
316/**
317 * reserve_top_address - reserves a hole in the top of kernel address space
318 * @reserve - size of hole to reserve
319 *
320 * Can be used to relocate the fixmap area and poke a hole in the top
321 * of kernel address space to make room for a hypervisor.
322 */
323void __init reserve_top_address(unsigned long reserve)
324{
325#ifdef CONFIG_X86_32
326 BUG_ON(fixmaps_set > 0);
327 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
328 (int)-reserve);
329 __FIXADDR_TOP = -reserve - PAGE_SIZE;
330 __VMALLOC_RESERVE += reserve;
331#endif
332}
333
316int fixmaps_set; 334int fixmaps_set;
317 335
318void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) 336void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 0951db9ee519..46c8834aedc0 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,6 +20,8 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23unsigned int __VMALLOC_RESERVE = 128 << 20;
24
23/* 25/*
24 * Associate a virtual page frame with a given physical page frame 26 * Associate a virtual page frame with a given physical page frame
25 * and protection flags for that frame. 27 * and protection flags for that frame.
@@ -48,7 +50,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
48 } 50 }
49 pte = pte_offset_kernel(pmd, vaddr); 51 pte = pte_offset_kernel(pmd, vaddr);
50 if (pte_val(pteval)) 52 if (pte_val(pteval))
51 set_pte_present(&init_mm, vaddr, pte, pteval); 53 set_pte_at(&init_mm, vaddr, pte, pteval);
52 else 54 else
53 pte_clear(&init_mm, vaddr, pte); 55 pte_clear(&init_mm, vaddr, pte);
54 56
@@ -97,22 +99,6 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
97unsigned long __FIXADDR_TOP = 0xfffff000; 99unsigned long __FIXADDR_TOP = 0xfffff000;
98EXPORT_SYMBOL(__FIXADDR_TOP); 100EXPORT_SYMBOL(__FIXADDR_TOP);
99 101
100/**
101 * reserve_top_address - reserves a hole in the top of kernel address space
102 * @reserve - size of hole to reserve
103 *
104 * Can be used to relocate the fixmap area and poke a hole in the top
105 * of kernel address space to make room for a hypervisor.
106 */
107void __init reserve_top_address(unsigned long reserve)
108{
109 BUG_ON(fixmaps_set > 0);
110 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
111 (int)-reserve);
112 __FIXADDR_TOP = -reserve - PAGE_SIZE;
113 __VMALLOC_RESERVE += reserve;
114}
115
116/* 102/*
117 * vmalloc=size forces the vmalloc area to be exactly 'size' 103 * vmalloc=size forces the vmalloc area to be exactly 'size'
118 * bytes. This can be used to increase (or decrease) the 104 * bytes. This can be used to increase (or decrease) the
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 574c8bc95ef0..c7d272b8574c 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -116,6 +116,36 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
116 reserve_early(phys, phys + length, "ACPI SLIT"); 116 reserve_early(phys, phys + length, "ACPI SLIT");
117} 117}
118 118
119/* Callback for Proximity Domain -> x2APIC mapping */
120void __init
121acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
122{
123 int pxm, node;
124 int apic_id;
125
126 if (srat_disabled())
127 return;
128 if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
129 bad_srat();
130 return;
131 }
132 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
133 return;
134 pxm = pa->proximity_domain;
135 node = setup_node(pxm);
136 if (node < 0) {
137 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
138 bad_srat();
139 return;
140 }
141
142 apic_id = pa->apic_id;
143 apicid_to_node[apic_id] = node;
144 acpi_numa = 1;
145 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
146 pxm, apic_id, node);
147}
148
119/* Callback for Proximity Domain -> LAPIC mapping */ 149/* Callback for Proximity Domain -> LAPIC mapping */
120void __init 150void __init
121acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) 151acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index a654d59e4483..821e97017e95 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -187,11 +187,6 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
187 cpumask, cpumask_of(smp_processor_id())); 187 cpumask, cpumask_of(smp_processor_id()));
188 188
189 /* 189 /*
190 * Make the above memory operations globally visible before
191 * sending the IPI.
192 */
193 smp_mb();
194 /*
195 * We have to send the IPI only to 190 * We have to send the IPI only to
196 * CPUs affected. 191 * CPUs affected.
197 */ 192 */
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 4c4a51c90bc2..819b131fd752 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -380,7 +380,7 @@ static unsigned int get_stagger(void)
380{ 380{
381#ifdef CONFIG_SMP 381#ifdef CONFIG_SMP
382 int cpu = smp_processor_id(); 382 int cpu = smp_processor_id();
383 return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu))); 383 return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map));
384#endif 384#endif
385 return 0; 385 return 0;
386} 386}
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 82d22fc601ae..8c362b96b644 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -90,7 +90,7 @@ static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
90 return 0; 90 return 0;
91} 91}
92 92
93static struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitdata = { 93static const struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitconst = {
94/* 94/*
95 * Systems where PCI IO resource ISA alignment can be skipped 95 * Systems where PCI IO resource ISA alignment can be skipped
96 * when the ISA enable bit in the bridge control is not set 96 * when the ISA enable bit in the bridge control is not set
@@ -183,7 +183,7 @@ static int __devinit assign_all_busses(const struct dmi_system_id *d)
183} 183}
184#endif 184#endif
185 185
186static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { 186static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
187#ifdef __i386__ 187#ifdef __i386__
188/* 188/*
189 * Laptops which need pci=assign-busses to see Cardbus cards 189 * Laptops which need pci=assign-busses to see Cardbus cards
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index f6adf2c6d751..aaf26ae58cd5 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -69,11 +69,12 @@ void early_dump_pci_device(u8 bus, u8 slot, u8 func)
69 int j; 69 int j;
70 u32 val; 70 u32 val;
71 71
72 printk(KERN_INFO "PCI: %02x:%02x:%02x", bus, slot, func); 72 printk(KERN_INFO "pci 0000:%02x:%02x.%d config space:",
73 bus, slot, func);
73 74
74 for (i = 0; i < 256; i += 4) { 75 for (i = 0; i < 256; i += 4) {
75 if (!(i & 0x0f)) 76 if (!(i & 0x0f))
76 printk("\n%04x:",i); 77 printk("\n %02x:",i);
77 78
78 val = read_pci_config(bus, slot, func, i); 79 val = read_pci_config(bus, slot, func, i);
79 for (j = 0; j < 4; j++) { 80 for (j = 0; j < 4; j++) {
@@ -96,20 +97,22 @@ void early_dump_pci_devices(void)
96 for (func = 0; func < 8; func++) { 97 for (func = 0; func < 8; func++) {
97 u32 class; 98 u32 class;
98 u8 type; 99 u8 type;
100
99 class = read_pci_config(bus, slot, func, 101 class = read_pci_config(bus, slot, func,
100 PCI_CLASS_REVISION); 102 PCI_CLASS_REVISION);
101 if (class == 0xffffffff) 103 if (class == 0xffffffff)
102 break; 104 continue;
103 105
104 early_dump_pci_device(bus, slot, func); 106 early_dump_pci_device(bus, slot, func);
105 107
106 /* No multi-function device? */ 108 if (func == 0) {
107 type = read_pci_config_byte(bus, slot, func, 109 type = read_pci_config_byte(bus, slot,
110 func,
108 PCI_HEADER_TYPE); 111 PCI_HEADER_TYPE);
109 if (!(type & 0x80)) 112 if (!(type & 0x80))
110 break; 113 break;
114 }
111 } 115 }
112 } 116 }
113 } 117 }
114} 118}
115
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 7d388d5cf548..6dd89555fbfa 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -356,7 +356,7 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
356DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video); 356DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
357 357
358 358
359static struct dmi_system_id __devinitdata msi_k8t_dmi_table[] = { 359static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = {
360 { 360 {
361 .ident = "MSI-K8T-Neo2Fir", 361 .ident = "MSI-K8T-Neo2Fir",
362 .matches = { 362 .matches = {
@@ -413,7 +413,7 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
413 */ 413 */
414static u16 toshiba_line_size; 414static u16 toshiba_line_size;
415 415
416static struct dmi_system_id __devinitdata toshiba_ohci1394_dmi_table[] = { 416static const struct dmi_system_id __devinitconst toshiba_ohci1394_dmi_table[] = {
417 { 417 {
418 .ident = "Toshiba PS5 based laptop", 418 .ident = "Toshiba PS5 based laptop",
419 .matches = { 419 .matches = {
@@ -495,26 +495,6 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015,
495 pci_siemens_interrupt_controller); 495 pci_siemens_interrupt_controller);
496 496
497/* 497/*
498 * Regular PCI devices have 256 bytes, but AMD Family 10h/11h CPUs have
499 * 4096 bytes configuration space for each function of their processor
500 * configuration space.
501 */
502static void amd_cpu_pci_cfg_space_size(struct pci_dev *dev)
503{
504 dev->cfg_size = pci_cfg_space_size_ext(dev);
505}
506DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1200, amd_cpu_pci_cfg_space_size);
507DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, amd_cpu_pci_cfg_space_size);
508DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, amd_cpu_pci_cfg_space_size);
509DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, amd_cpu_pci_cfg_space_size);
510DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, amd_cpu_pci_cfg_space_size);
511DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1300, amd_cpu_pci_cfg_space_size);
512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1301, amd_cpu_pci_cfg_space_size);
513DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1302, amd_cpu_pci_cfg_space_size);
514DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1303, amd_cpu_pci_cfg_space_size);
515DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1304, amd_cpu_pci_cfg_space_size);
516
517/*
518 * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from 498 * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from
519 * confusing the PCI engine: 499 * confusing the PCI engine:
520 */ 500 */
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 5ead808dd70c..f1817f71e009 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -258,24 +258,7 @@ void pcibios_set_master(struct pci_dev *dev)
258 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); 258 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
259} 259}
260 260
261static void pci_unmap_page_range(struct vm_area_struct *vma)
262{
263 u64 addr = (u64)vma->vm_pgoff << PAGE_SHIFT;
264 free_memtype(addr, addr + vma->vm_end - vma->vm_start);
265}
266
267static void pci_track_mmap_page_range(struct vm_area_struct *vma)
268{
269 u64 addr = (u64)vma->vm_pgoff << PAGE_SHIFT;
270 unsigned long flags = pgprot_val(vma->vm_page_prot)
271 & _PAGE_CACHE_MASK;
272
273 reserve_memtype(addr, addr + vma->vm_end - vma->vm_start, flags, NULL);
274}
275
276static struct vm_operations_struct pci_mmap_ops = { 261static struct vm_operations_struct pci_mmap_ops = {
277 .open = pci_track_mmap_page_range,
278 .close = pci_unmap_page_range,
279 .access = generic_access_phys, 262 .access = generic_access_phys,
280}; 263};
281 264
@@ -283,11 +266,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
283 enum pci_mmap_state mmap_state, int write_combine) 266 enum pci_mmap_state mmap_state, int write_combine)
284{ 267{
285 unsigned long prot; 268 unsigned long prot;
286 u64 addr = vma->vm_pgoff << PAGE_SHIFT;
287 unsigned long len = vma->vm_end - vma->vm_start;
288 unsigned long flags;
289 unsigned long new_flags;
290 int retval;
291 269
292 /* I/O space cannot be accessed via normal processor loads and 270 /* I/O space cannot be accessed via normal processor loads and
293 * stores on this platform. 271 * stores on this platform.
@@ -308,27 +286,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
308 286
309 vma->vm_page_prot = __pgprot(prot); 287 vma->vm_page_prot = __pgprot(prot);
310 288
311 flags = pgprot_val(vma->vm_page_prot) & _PAGE_CACHE_MASK;
312 retval = reserve_memtype(addr, addr + len, flags, &new_flags);
313 if (retval)
314 return retval;
315
316 if (flags != new_flags) {
317 if (!is_new_memtype_allowed(flags, new_flags)) {
318 free_memtype(addr, addr+len);
319 return -EINVAL;
320 }
321 flags = new_flags;
322 }
323
324 if (((vma->vm_pgoff < max_low_pfn_mapped) ||
325 (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
326 vma->vm_pgoff < max_pfn_mapped)) &&
327 ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
328 free_memtype(addr, addr + len);
329 return -EINVAL;
330 }
331
332 if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 289 if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
333 vma->vm_end - vma->vm_start, 290 vma->vm_end - vma->vm_start,
334 vma->vm_page_prot)) 291 vma->vm_page_prot))
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index f1065b129e9c..4061bb0f267d 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -50,8 +50,6 @@ static int __init pci_legacy_init(void)
50 if (pci_root_bus) 50 if (pci_root_bus)
51 pci_bus_add_devices(pci_root_bus); 51 pci_bus_add_devices(pci_root_bus);
52 52
53 pcibios_fixup_peer_bridges();
54
55 return 0; 53 return 0;
56} 54}
57 55
@@ -67,6 +65,7 @@ int __init pci_subsys_init(void)
67 pci_visws_init(); 65 pci_visws_init();
68#endif 66#endif
69 pci_legacy_init(); 67 pci_legacy_init();
68 pcibios_fixup_peer_bridges();
70 pcibios_irq_init(); 69 pcibios_irq_init();
71 pcibios_init(); 70 pcibios_init();
72 71
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 89bf9242c80a..905bb526b133 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -14,6 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/acpi.h> 15#include <linux/acpi.h>
16#include <linux/bitmap.h> 16#include <linux/bitmap.h>
17#include <linux/sort.h>
17#include <asm/e820.h> 18#include <asm/e820.h>
18#include <asm/pci_x86.h> 19#include <asm/pci_x86.h>
19 20
@@ -24,24 +25,49 @@
24/* Indicate if the mmcfg resources have been placed into the resource table. */ 25/* Indicate if the mmcfg resources have been placed into the resource table. */
25static int __initdata pci_mmcfg_resources_inserted; 26static int __initdata pci_mmcfg_resources_inserted;
26 27
28static __init int extend_mmcfg(int num)
29{
30 struct acpi_mcfg_allocation *new;
31 int new_num = pci_mmcfg_config_num + num;
32
33 new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL);
34 if (!new)
35 return -1;
36
37 if (pci_mmcfg_config) {
38 memcpy(new, pci_mmcfg_config,
39 sizeof(pci_mmcfg_config[0]) * new_num);
40 kfree(pci_mmcfg_config);
41 }
42 pci_mmcfg_config = new;
43
44 return 0;
45}
46
47static __init void fill_one_mmcfg(u64 addr, int segment, int start, int end)
48{
49 int i = pci_mmcfg_config_num;
50
51 pci_mmcfg_config_num++;
52 pci_mmcfg_config[i].address = addr;
53 pci_mmcfg_config[i].pci_segment = segment;
54 pci_mmcfg_config[i].start_bus_number = start;
55 pci_mmcfg_config[i].end_bus_number = end;
56}
57
27static const char __init *pci_mmcfg_e7520(void) 58static const char __init *pci_mmcfg_e7520(void)
28{ 59{
29 u32 win; 60 u32 win;
30 raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0xce, 2, &win); 61 raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0xce, 2, &win);
31 62
32 win = win & 0xf000; 63 win = win & 0xf000;
33 if(win == 0x0000 || win == 0xf000) 64 if (win == 0x0000 || win == 0xf000)
34 pci_mmcfg_config_num = 0; 65 return NULL;
35 else { 66
36 pci_mmcfg_config_num = 1; 67 if (extend_mmcfg(1) == -1)
37 pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); 68 return NULL;
38 if (!pci_mmcfg_config) 69
39 return NULL; 70 fill_one_mmcfg(win << 16, 0, 0, 255);
40 pci_mmcfg_config[0].address = win << 16;
41 pci_mmcfg_config[0].pci_segment = 0;
42 pci_mmcfg_config[0].start_bus_number = 0;
43 pci_mmcfg_config[0].end_bus_number = 255;
44 }
45 71
46 return "Intel Corporation E7520 Memory Controller Hub"; 72 return "Intel Corporation E7520 Memory Controller Hub";
47} 73}
@@ -50,13 +76,11 @@ static const char __init *pci_mmcfg_intel_945(void)
50{ 76{
51 u32 pciexbar, mask = 0, len = 0; 77 u32 pciexbar, mask = 0, len = 0;
52 78
53 pci_mmcfg_config_num = 1;
54
55 raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0x48, 4, &pciexbar); 79 raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0x48, 4, &pciexbar);
56 80
57 /* Enable bit */ 81 /* Enable bit */
58 if (!(pciexbar & 1)) 82 if (!(pciexbar & 1))
59 pci_mmcfg_config_num = 0; 83 return NULL;
60 84
61 /* Size bits */ 85 /* Size bits */
62 switch ((pciexbar >> 1) & 3) { 86 switch ((pciexbar >> 1) & 3) {
@@ -73,28 +97,23 @@ static const char __init *pci_mmcfg_intel_945(void)
73 len = 0x04000000U; 97 len = 0x04000000U;
74 break; 98 break;
75 default: 99 default:
76 pci_mmcfg_config_num = 0; 100 return NULL;
77 } 101 }
78 102
79 /* Errata #2, things break when not aligned on a 256Mb boundary */ 103 /* Errata #2, things break when not aligned on a 256Mb boundary */
80 /* Can only happen in 64M/128M mode */ 104 /* Can only happen in 64M/128M mode */
81 105
82 if ((pciexbar & mask) & 0x0fffffffU) 106 if ((pciexbar & mask) & 0x0fffffffU)
83 pci_mmcfg_config_num = 0; 107 return NULL;
84 108
85 /* Don't hit the APIC registers and their friends */ 109 /* Don't hit the APIC registers and their friends */
86 if ((pciexbar & mask) >= 0xf0000000U) 110 if ((pciexbar & mask) >= 0xf0000000U)
87 pci_mmcfg_config_num = 0; 111 return NULL;
88 112
89 if (pci_mmcfg_config_num) { 113 if (extend_mmcfg(1) == -1)
90 pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); 114 return NULL;
91 if (!pci_mmcfg_config) 115
92 return NULL; 116 fill_one_mmcfg(pciexbar & mask, 0, 0, (len >> 20) - 1);
93 pci_mmcfg_config[0].address = pciexbar & mask;
94 pci_mmcfg_config[0].pci_segment = 0;
95 pci_mmcfg_config[0].start_bus_number = 0;
96 pci_mmcfg_config[0].end_bus_number = (len >> 20) - 1;
97 }
98 117
99 return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; 118 return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
100} 119}
@@ -138,22 +157,77 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
138 busnbits = 8; 157 busnbits = 8;
139 } 158 }
140 159
141 pci_mmcfg_config_num = (1 << segnbits); 160 if (extend_mmcfg(1 << segnbits) == -1)
142 pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]) *
143 pci_mmcfg_config_num, GFP_KERNEL);
144 if (!pci_mmcfg_config)
145 return NULL; 161 return NULL;
146 162
147 for (i = 0; i < (1 << segnbits); i++) { 163 for (i = 0; i < (1 << segnbits); i++)
148 pci_mmcfg_config[i].address = base + (1<<28) * i; 164 fill_one_mmcfg(base + (1<<28) * i, i, 0, (1 << busnbits) - 1);
149 pci_mmcfg_config[i].pci_segment = i;
150 pci_mmcfg_config[i].start_bus_number = 0;
151 pci_mmcfg_config[i].end_bus_number = (1 << busnbits) - 1;
152 }
153 165
154 return "AMD Family 10h NB"; 166 return "AMD Family 10h NB";
155} 167}
156 168
169static bool __initdata mcp55_checked;
170static const char __init *pci_mmcfg_nvidia_mcp55(void)
171{
172 int bus;
173 int mcp55_mmconf_found = 0;
174
175 static const u32 extcfg_regnum = 0x90;
176 static const u32 extcfg_regsize = 4;
177 static const u32 extcfg_enable_mask = 1<<31;
178 static const u32 extcfg_start_mask = 0xff<<16;
179 static const int extcfg_start_shift = 16;
180 static const u32 extcfg_size_mask = 0x3<<28;
181 static const int extcfg_size_shift = 28;
182 static const int extcfg_sizebus[] = {0x100, 0x80, 0x40, 0x20};
183 static const u32 extcfg_base_mask[] = {0x7ff8, 0x7ffc, 0x7ffe, 0x7fff};
184 static const int extcfg_base_lshift = 25;
185
186 /*
187 * do check if amd fam10h already took over
188 */
189 if (!acpi_disabled || pci_mmcfg_config_num || mcp55_checked)
190 return NULL;
191
192 mcp55_checked = true;
193 for (bus = 0; bus < 256; bus++) {
194 u64 base;
195 u32 l, extcfg;
196 u16 vendor, device;
197 int start, size_index, end;
198
199 raw_pci_ops->read(0, bus, PCI_DEVFN(0, 0), 0, 4, &l);
200 vendor = l & 0xffff;
201 device = (l >> 16) & 0xffff;
202
203 if (PCI_VENDOR_ID_NVIDIA != vendor || 0x0369 != device)
204 continue;
205
206 raw_pci_ops->read(0, bus, PCI_DEVFN(0, 0), extcfg_regnum,
207 extcfg_regsize, &extcfg);
208
209 if (!(extcfg & extcfg_enable_mask))
210 continue;
211
212 if (extend_mmcfg(1) == -1)
213 continue;
214
215 size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift;
216 base = extcfg & extcfg_base_mask[size_index];
217 /* base could > 4G */
218 base <<= extcfg_base_lshift;
219 start = (extcfg & extcfg_start_mask) >> extcfg_start_shift;
220 end = start + extcfg_sizebus[size_index] - 1;
221 fill_one_mmcfg(base, 0, start, end);
222 mcp55_mmconf_found++;
223 }
224
225 if (!mcp55_mmconf_found)
226 return NULL;
227
228 return "nVidia MCP55";
229}
230
157struct pci_mmcfg_hostbridge_probe { 231struct pci_mmcfg_hostbridge_probe {
158 u32 bus; 232 u32 bus;
159 u32 devfn; 233 u32 devfn;
@@ -171,8 +245,52 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
171 0x1200, pci_mmcfg_amd_fam10h }, 245 0x1200, pci_mmcfg_amd_fam10h },
172 { 0xff, PCI_DEVFN(0, 0), PCI_VENDOR_ID_AMD, 246 { 0xff, PCI_DEVFN(0, 0), PCI_VENDOR_ID_AMD,
173 0x1200, pci_mmcfg_amd_fam10h }, 247 0x1200, pci_mmcfg_amd_fam10h },
248 { 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_NVIDIA,
249 0x0369, pci_mmcfg_nvidia_mcp55 },
174}; 250};
175 251
252static int __init cmp_mmcfg(const void *x1, const void *x2)
253{
254 const typeof(pci_mmcfg_config[0]) *m1 = x1;
255 const typeof(pci_mmcfg_config[0]) *m2 = x2;
256 int start1, start2;
257
258 start1 = m1->start_bus_number;
259 start2 = m2->start_bus_number;
260
261 return start1 - start2;
262}
263
264static void __init pci_mmcfg_check_end_bus_number(void)
265{
266 int i;
267 typeof(pci_mmcfg_config[0]) *cfg, *cfgx;
268
269 /* sort them at first */
270 sort(pci_mmcfg_config, pci_mmcfg_config_num,
271 sizeof(pci_mmcfg_config[0]), cmp_mmcfg, NULL);
272
273 /* last one*/
274 if (pci_mmcfg_config_num > 0) {
275 i = pci_mmcfg_config_num - 1;
276 cfg = &pci_mmcfg_config[i];
277 if (cfg->end_bus_number < cfg->start_bus_number)
278 cfg->end_bus_number = 255;
279 }
280
281 /* don't overlap please */
282 for (i = 0; i < pci_mmcfg_config_num - 1; i++) {
283 cfg = &pci_mmcfg_config[i];
284 cfgx = &pci_mmcfg_config[i+1];
285
286 if (cfg->end_bus_number < cfg->start_bus_number)
287 cfg->end_bus_number = 255;
288
289 if (cfg->end_bus_number >= cfgx->start_bus_number)
290 cfg->end_bus_number = cfgx->start_bus_number - 1;
291 }
292}
293
176static int __init pci_mmcfg_check_hostbridge(void) 294static int __init pci_mmcfg_check_hostbridge(void)
177{ 295{
178 u32 l; 296 u32 l;
@@ -186,31 +304,33 @@ static int __init pci_mmcfg_check_hostbridge(void)
186 304
187 pci_mmcfg_config_num = 0; 305 pci_mmcfg_config_num = 0;
188 pci_mmcfg_config = NULL; 306 pci_mmcfg_config = NULL;
189 name = NULL;
190 307
191 for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) { 308 for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
192 bus = pci_mmcfg_probes[i].bus; 309 bus = pci_mmcfg_probes[i].bus;
193 devfn = pci_mmcfg_probes[i].devfn; 310 devfn = pci_mmcfg_probes[i].devfn;
194 raw_pci_ops->read(0, bus, devfn, 0, 4, &l); 311 raw_pci_ops->read(0, bus, devfn, 0, 4, &l);
195 vendor = l & 0xffff; 312 vendor = l & 0xffff;
196 device = (l >> 16) & 0xffff; 313 device = (l >> 16) & 0xffff;
197 314
315 name = NULL;
198 if (pci_mmcfg_probes[i].vendor == vendor && 316 if (pci_mmcfg_probes[i].vendor == vendor &&
199 pci_mmcfg_probes[i].device == device) 317 pci_mmcfg_probes[i].device == device)
200 name = pci_mmcfg_probes[i].probe(); 318 name = pci_mmcfg_probes[i].probe();
201 }
202 319
203 if (name) { 320 if (name)
204 printk(KERN_INFO "PCI: Found %s %s MMCONFIG support.\n", 321 printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n",
205 name, pci_mmcfg_config_num ? "with" : "without"); 322 name);
206 } 323 }
207 324
208 return name != NULL; 325 /* some end_bus_number is crazy, fix it */
326 pci_mmcfg_check_end_bus_number();
327
328 return pci_mmcfg_config_num != 0;
209} 329}
210 330
211static void __init pci_mmcfg_insert_resources(void) 331static void __init pci_mmcfg_insert_resources(void)
212{ 332{
213#define PCI_MMCFG_RESOURCE_NAME_LEN 19 333#define PCI_MMCFG_RESOURCE_NAME_LEN 24
214 int i; 334 int i;
215 struct resource *res; 335 struct resource *res;
216 char *names; 336 char *names;
@@ -228,9 +348,10 @@ static void __init pci_mmcfg_insert_resources(void)
228 struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i]; 348 struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i];
229 num_buses = cfg->end_bus_number - cfg->start_bus_number + 1; 349 num_buses = cfg->end_bus_number - cfg->start_bus_number + 1;
230 res->name = names; 350 res->name = names;
231 snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u", 351 snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN,
232 cfg->pci_segment); 352 "PCI MMCONFIG %u [%02x-%02x]", cfg->pci_segment,
233 res->start = cfg->address; 353 cfg->start_bus_number, cfg->end_bus_number);
354 res->start = cfg->address + (cfg->start_bus_number << 20);
234 res->end = res->start + (num_buses << 20) - 1; 355 res->end = res->start + (num_buses << 20) - 1;
235 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 356 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
236 insert_resource(&iomem_resource, res); 357 insert_resource(&iomem_resource, res);
@@ -354,8 +475,6 @@ static void __init pci_mmcfg_reject_broken(int early)
354 (pci_mmcfg_config[0].address == 0)) 475 (pci_mmcfg_config[0].address == 0))
355 return; 476 return;
356 477
357 cfg = &pci_mmcfg_config[0];
358
359 for (i = 0; i < pci_mmcfg_config_num; i++) { 478 for (i = 0; i < pci_mmcfg_config_num; i++) {
360 int valid = 0; 479 int valid = 0;
361 u64 addr, size; 480 u64 addr, size;
@@ -423,10 +542,10 @@ static void __init __pci_mmcfg_init(int early)
423 known_bridge = 1; 542 known_bridge = 1;
424 } 543 }
425 544
426 if (!known_bridge) { 545 if (!known_bridge)
427 acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); 546 acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg);
428 pci_mmcfg_reject_broken(early); 547
429 } 548 pci_mmcfg_reject_broken(early);
430 549
431 if ((pci_mmcfg_config_num == 0) || 550 if ((pci_mmcfg_config_num == 0) ||
432 (pci_mmcfg_config == NULL) || 551 (pci_mmcfg_config == NULL) ||
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 30007ffc8e11..94349f8b2f96 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -112,13 +112,18 @@ static struct pci_raw_ops pci_mmcfg = {
112static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) 112static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg)
113{ 113{
114 void __iomem *addr; 114 void __iomem *addr;
115 u32 size; 115 u64 start, size;
116 116
117 size = (cfg->end_bus_number + 1) << 20; 117 start = cfg->start_bus_number;
118 addr = ioremap_nocache(cfg->address, size); 118 start <<= 20;
119 start += cfg->address;
120 size = cfg->end_bus_number + 1 - cfg->start_bus_number;
121 size <<= 20;
122 addr = ioremap_nocache(start, size);
119 if (addr) { 123 if (addr) {
120 printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", 124 printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n",
121 cfg->address, cfg->address + size - 1); 125 start, start + size - 1);
126 addr -= cfg->start_bus_number << 20;
122 } 127 }
123 return addr; 128 return addr;
124} 129}
@@ -157,7 +162,7 @@ void __init pci_mmcfg_arch_free(void)
157 162
158 for (i = 0; i < pci_mmcfg_config_num; ++i) { 163 for (i = 0; i < pci_mmcfg_config_num; ++i) {
159 if (pci_mmcfg_virt[i].virt) { 164 if (pci_mmcfg_virt[i].virt) {
160 iounmap(pci_mmcfg_virt[i].virt); 165 iounmap(pci_mmcfg_virt[i].virt + (pci_mmcfg_virt[i].cfg->start_bus_number << 20));
161 pci_mmcfg_virt[i].virt = NULL; 166 pci_mmcfg_virt[i].virt = NULL;
162 pci_mmcfg_virt[i].cfg = NULL; 167 pci_mmcfg_virt[i].cfg = NULL;
163 } 168 }
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index 9ff4d5b55ad1..58b32db33125 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,2 +1,7 @@
1# __restore_processor_state() restores %gs after S3 resume and so should not
2# itself be stack-protected
3nostackp := $(call cc-option, -fno-stack-protector)
4CFLAGS_cpu_$(BITS).o := $(nostackp)
5
1obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o 6obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o
2obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o 7obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index 274d06082f48..ce702c5b3a2c 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -12,6 +12,7 @@
12#include <asm/mtrr.h> 12#include <asm/mtrr.h>
13#include <asm/mce.h> 13#include <asm/mce.h>
14#include <asm/xcr.h> 14#include <asm/xcr.h>
15#include <asm/suspend.h>
15 16
16static struct saved_context saved_context; 17static struct saved_context saved_context;
17 18
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c
index e3b6cf70d62c..5343540f2607 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu_64.c
@@ -15,6 +15,7 @@
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include <asm/mtrr.h> 16#include <asm/mtrr.h>
17#include <asm/xcr.h> 17#include <asm/xcr.h>
18#include <asm/suspend.h>
18 19
19static void fix_processor_context(void); 20static void fix_processor_context(void);
20 21
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 6dd000dd7933..65fdc86e923f 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -14,6 +14,7 @@
14#include <asm/page.h> 14#include <asm/page.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include <asm/mtrr.h> 16#include <asm/mtrr.h>
17#include <asm/suspend.h>
17 18
18/* References to section boundaries */ 19/* References to section boundaries */
19extern const void __nosave_begin, __nosave_end; 20extern const void __nosave_begin, __nosave_end;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c52f4034c7fd..82cd39a6cbd3 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu)
103 103
104 vcpup = &per_cpu(xen_vcpu_info, cpu); 104 vcpup = &per_cpu(xen_vcpu_info, cpu);
105 105
106 info.mfn = virt_to_mfn(vcpup); 106 info.mfn = arbitrary_virt_to_mfn(vcpup);
107 info.offset = offset_in_page(vcpup); 107 info.offset = offset_in_page(vcpup);
108 108
109 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", 109 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
@@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
301 frames = mcs.args; 301 frames = mcs.args;
302 302
303 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { 303 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
304 frames[f] = virt_to_mfn(va); 304 frames[f] = arbitrary_virt_to_mfn((void *)va);
305
305 make_lowmem_page_readonly((void *)va); 306 make_lowmem_page_readonly((void *)va);
307 make_lowmem_page_readonly(mfn_to_virt(frames[f]));
306 } 308 }
307 309
308 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); 310 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
@@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
314 unsigned int cpu, unsigned int i) 316 unsigned int cpu, unsigned int i)
315{ 317{
316 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 318 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
317 xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); 319 xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
318 struct multicall_space mc = __xen_mc_entry(0); 320 struct multicall_space mc = __xen_mc_entry(0);
319 321
320 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); 322 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
@@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
488 break; 490 break;
489 491
490 default: { 492 default: {
491 xmaddr_t maddr = virt_to_machine(&dt[entry]); 493 xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
492 494
493 xen_mc_flush(); 495 xen_mc_flush();
494 if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) 496 if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 319bd40a57c2..db3802fb7b84 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
276 p2m_top[topidx][idx] = mfn; 276 p2m_top[topidx][idx] = mfn;
277} 277}
278 278
279unsigned long arbitrary_virt_to_mfn(void *vaddr)
280{
281 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
282
283 return PFN_DOWN(maddr.maddr);
284}
285
279xmaddr_t arbitrary_virt_to_machine(void *vaddr) 286xmaddr_t arbitrary_virt_to_machine(void *vaddr)
280{ 287{
281 unsigned long address = (unsigned long)vaddr; 288 unsigned long address = (unsigned long)vaddr;
@@ -1716,9 +1723,9 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1716{ 1723{
1717 pmd_t *kernel_pmd; 1724 pmd_t *kernel_pmd;
1718 1725
1719 init_pg_tables_start = __pa(pgd); 1726 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1720 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1727 xen_start_info->nr_pt_frames * PAGE_SIZE +
1721 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); 1728 512*1024);
1722 1729
1723 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 1730 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1724 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); 1731 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
@@ -1863,7 +1870,6 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
1863 1870
1864#ifdef CONFIG_X86_PAE 1871#ifdef CONFIG_X86_PAE
1865 .set_pte_atomic = xen_set_pte_atomic, 1872 .set_pte_atomic = xen_set_pte_atomic,
1866 .set_pte_present = xen_set_pte_at,
1867 .pte_clear = xen_pte_clear, 1873 .pte_clear = xen_pte_clear,
1868 .pmd_clear = xen_pmd_clear, 1874 .pmd_clear = xen_pmd_clear,
1869#endif /* CONFIG_X86_PAE */ 1875#endif /* CONFIG_X86_PAE */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 035582ae815d..585a6e330837 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -158,7 +158,7 @@ static void __init xen_fill_possible_map(void)
158 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 158 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
159 if (rc >= 0) { 159 if (rc >= 0) {
160 num_processors++; 160 num_processors++;
161 cpu_set(i, cpu_possible_map); 161 set_cpu_possible(i, true);
162 } 162 }
163 } 163 }
164} 164}
@@ -197,7 +197,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
197 while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { 197 while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
198 for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--) 198 for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
199 continue; 199 continue;
200 cpu_clear(cpu, cpu_possible_map); 200 set_cpu_possible(cpu, false);
201 } 201 }
202 202
203 for_each_possible_cpu (cpu) { 203 for_each_possible_cpu (cpu) {
@@ -210,7 +210,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
210 if (IS_ERR(idle)) 210 if (IS_ERR(idle))
211 panic("failed fork for CPU %d", cpu); 211 panic("failed fork for CPU %d", cpu);
212 212
213 cpu_set(cpu, cpu_present_map); 213 set_cpu_present(cpu, true);
214 } 214 }
215} 215}
216 216
@@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
219{ 219{
220 struct vcpu_guest_context *ctxt; 220 struct vcpu_guest_context *ctxt;
221 struct desc_struct *gdt; 221 struct desc_struct *gdt;
222 unsigned long gdt_mfn;
222 223
223 if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) 224 if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
224 return 0; 225 return 0;
@@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
248 ctxt->ldt_ents = 0; 249 ctxt->ldt_ents = 0;
249 250
250 BUG_ON((unsigned long)gdt & ~PAGE_MASK); 251 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
252
253 gdt_mfn = arbitrary_virt_to_mfn(gdt);
251 make_lowmem_page_readonly(gdt); 254 make_lowmem_page_readonly(gdt);
255 make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
252 256
253 ctxt->gdt_frames[0] = virt_to_mfn(gdt); 257 ctxt->gdt_frames[0] = gdt_mfn;
254 ctxt->gdt_ents = GDT_ENTRIES; 258 ctxt->gdt_ents = GDT_ENTRIES;
255 259
256 ctxt->user_regs.cs = __KERNEL_CS; 260 ctxt->user_regs.cs = __KERNEL_CS;