aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2008-10-12 06:39:30 -0400
committerIngo Molnar <mingo@elte.hu>2008-10-12 06:39:50 -0400
commit4c7145a1ec1bb789d5f07e47510e8bda546a7c4a (patch)
treee2767b77e5413473a3bba302237f4669a203f183 /arch/x86
parent74e91604b2452c15bbe72d77b37cf47ed0310d13 (diff)
parentfd048088306656824958e7783ffcee27e241b361 (diff)
Merge branch 'linus' into x86/spinlocks
Done to prevent this failure of an Octopus merge: Added arch/arm/include/asm/byteorder.h in both, but differently. ERROR: Merge conflict in arch/arm/include/asm/byteorder.h Auto-merging include/asm-x86/spinlock.h ERROR: Merge conflict in include/asm-x86/spinlock.h fatal: merge program failed
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig76
-rw-r--r--arch/x86/Kconfig.cpu51
-rw-r--r--arch/x86/boot/compressed/head_32.S5
-rw-r--r--arch/x86/boot/compressed/misc.c10
-rw-r--r--arch/x86/boot/compressed/relocs.c2
-rw-r--r--arch/x86/boot/cpu.c17
-rw-r--r--arch/x86/boot/header.S1
-rw-r--r--arch/x86/boot/mkcpustr.c38
-rw-r--r--arch/x86/configs/i386_defconfig19
-rw-r--r--arch/x86/configs/x86_64_defconfig29
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/crc32c-intel.c197
-rw-r--r--arch/x86/ia32/ia32_aout.c11
-rw-r--r--arch/x86/ia32/ia32_signal.c21
-rw-r--r--arch/x86/ia32/sys_ia32.c9
-rw-r--r--arch/x86/kernel/acpi/boot.c25
-rw-r--r--arch/x86/kernel/amd_iommu.c350
-rw-r--r--arch/x86/kernel/amd_iommu_init.c194
-rw-r--r--arch/x86/kernel/aperture_64.c6
-rw-r--r--arch/x86/kernel/apic_32.c332
-rw-r--r--arch/x86/kernel/apic_64.c389
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/bios_uv.c10
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/amd.c545
-rw-r--r--arch/x86/kernel/cpu/amd_64.c224
-rw-r--r--arch/x86/kernel/cpu/centaur.c1
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c3
-rw-r--r--arch/x86/kernel/cpu/common.c495
-rw-r--r--arch/x86/kernel/cpu/common_64.c816
-rw-r--r--arch/x86/kernel/cpu/cpu.h1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c13
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c42
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c41
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c2
-rw-r--r--arch/x86/kernel/cpu/cyrix.c1
-rw-r--r--arch/x86/kernel/cpu/intel.c292
-rw-r--r--arch/x86/kernel/cpu/intel_64.c99
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c7
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c274
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c86
-rw-r--r--arch/x86/kernel/cpu/transmeta.c29
-rw-r--r--arch/x86/kernel/cpuid.c1
-rw-r--r--arch/x86/kernel/crash_dump_64.c13
-rw-r--r--arch/x86/kernel/ds.c954
-rw-r--r--arch/x86/kernel/e820.c2
-rw-r--r--arch/x86/kernel/early-quirks.c18
-rw-r--r--arch/x86/kernel/efi.c6
-rw-r--r--arch/x86/kernel/head64.c5
-rw-r--r--arch/x86/kernel/head_32.S34
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/hpet.c19
-rw-r--r--arch/x86/kernel/io_delay.c8
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irq_64.c2
-rw-r--r--arch/x86/kernel/k8.c5
-rw-r--r--arch/x86/kernel/kdebugfs.c1
-rw-r--r--arch/x86/kernel/kgdb.c50
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/nmi.c11
-rw-r--r--arch/x86/kernel/olpc.c6
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c2
-rw-r--r--arch/x86/kernel/pci-calgary_64.c18
-rw-r--r--arch/x86/kernel/pci-dma.c179
-rw-r--r--arch/x86/kernel/pci-gart_64.c162
-rw-r--r--arch/x86/kernel/pci-nommu.c10
-rw-r--r--arch/x86/kernel/pcspeaker.c13
-rw-r--r--arch/x86/kernel/process.c21
-rw-r--r--arch/x86/kernel/process_32.c62
-rw-r--r--arch/x86/kernel/process_64.c171
-rw-r--r--arch/x86/kernel/ptrace.c478
-rw-r--r--arch/x86/kernel/reboot.c6
-rw-r--r--arch/x86/kernel/setup.c21
-rw-r--r--arch/x86/kernel/sigframe.h5
-rw-r--r--arch/x86/kernel/signal_32.c11
-rw-r--r--arch/x86/kernel/signal_64.c105
-rw-r--r--arch/x86/kernel/smpboot.c6
-rw-r--r--arch/x86/kernel/sys_x86_64.c43
-rw-r--r--arch/x86/kernel/traps_64.c61
-rw-r--r--arch/x86/kernel/tsc.c293
-rw-r--r--arch/x86/kernel/visws_quirks.c16
-rw-r--r--arch/x86/kernel/vmi_32.c12
-rw-r--r--arch/x86/kernel/vsmp_64.c2
-rw-r--r--arch/x86/kvm/mmu.c4
-rw-r--r--arch/x86/kvm/svm.c12
-rw-r--r--arch/x86/kvm/vmx.c3
-rw-r--r--arch/x86/kvm/vmx.h17
-rw-r--r--arch/x86/lib/msr-on-cpu.c78
-rw-r--r--arch/x86/lib/string_32.c42
-rw-r--r--arch/x86/lib/strstr_32.c6
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c2
-rw-r--r--arch/x86/mm/discontig_32.c2
-rw-r--r--arch/x86/mm/dump_pagetables.c4
-rw-r--r--arch/x86/mm/init_32.c88
-rw-r--r--arch/x86/mm/init_64.c174
-rw-r--r--arch/x86/mm/ioremap.c19
-rw-r--r--arch/x86/mm/numa_64.c10
-rw-r--r--arch/x86/mm/pageattr-test.c9
-rw-r--r--arch/x86/mm/pageattr.c463
-rw-r--r--arch/x86/mm/pat.c132
-rw-r--r--arch/x86/mm/pgtable.c6
-rw-r--r--arch/x86/mm/pgtable_32.c3
-rw-r--r--arch/x86/oprofile/nmi_int.c4
-rw-r--r--arch/x86/oprofile/op_model_p4.c175
-rw-r--r--arch/x86/pci/amd_bus.c2
-rw-r--r--arch/x86/pci/irq.c67
-rw-r--r--arch/x86/power/hibernate_asm_32.S14
-rw-r--r--arch/x86/xen/enlighten.c22
-rw-r--r--arch/x86/xen/setup.c2
112 files changed, 5280 insertions, 3709 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 21ef9dd36187..44d4f2130d01 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -29,6 +29,7 @@ config X86
29 select HAVE_FTRACE 29 select HAVE_FTRACE
30 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 30 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
31 select HAVE_ARCH_KGDB if !X86_VOYAGER 31 select HAVE_ARCH_KGDB if !X86_VOYAGER
32 select HAVE_ARCH_TRACEHOOK
32 select HAVE_GENERIC_DMA_COHERENT if X86_32 33 select HAVE_GENERIC_DMA_COHERENT if X86_32
33 select HAVE_EFFICIENT_UNALIGNED_ACCESS 34 select HAVE_EFFICIENT_UNALIGNED_ACCESS
34 35
@@ -553,6 +554,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
553config AMD_IOMMU 554config AMD_IOMMU
554 bool "AMD IOMMU support" 555 bool "AMD IOMMU support"
555 select SWIOTLB 556 select SWIOTLB
557 select PCI_MSI
556 depends on X86_64 && PCI && ACPI 558 depends on X86_64 && PCI && ACPI
557 help 559 help
558 With this option you can enable support for AMD IOMMU hardware in 560 With this option you can enable support for AMD IOMMU hardware in
@@ -1020,7 +1022,7 @@ config HAVE_ARCH_ALLOC_REMAP
1020 1022
1021config ARCH_FLATMEM_ENABLE 1023config ARCH_FLATMEM_ENABLE
1022 def_bool y 1024 def_bool y
1023 depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA 1025 depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
1024 1026
1025config ARCH_DISCONTIGMEM_ENABLE 1027config ARCH_DISCONTIGMEM_ENABLE
1026 def_bool y 1028 def_bool y
@@ -1036,7 +1038,7 @@ config ARCH_SPARSEMEM_DEFAULT
1036 1038
1037config ARCH_SPARSEMEM_ENABLE 1039config ARCH_SPARSEMEM_ENABLE
1038 def_bool y 1040 def_bool y
1039 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) 1041 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH
1040 select SPARSEMEM_STATIC if X86_32 1042 select SPARSEMEM_STATIC if X86_32
1041 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 1043 select SPARSEMEM_VMEMMAP_ENABLE if X86_64
1042 1044
@@ -1117,10 +1119,10 @@ config MTRR
1117 You can safely say Y even if your machine doesn't have MTRRs, you'll 1119 You can safely say Y even if your machine doesn't have MTRRs, you'll
1118 just add about 9 KB to your kernel. 1120 just add about 9 KB to your kernel.
1119 1121
1120 See <file:Documentation/mtrr.txt> for more information. 1122 See <file:Documentation/x86/mtrr.txt> for more information.
1121 1123
1122config MTRR_SANITIZER 1124config MTRR_SANITIZER
1123 bool 1125 def_bool y
1124 prompt "MTRR cleanup support" 1126 prompt "MTRR cleanup support"
1125 depends on MTRR 1127 depends on MTRR
1126 help 1128 help
@@ -1131,7 +1133,7 @@ config MTRR_SANITIZER
1131 The largest mtrr entry size for a continous block can be set with 1133 The largest mtrr entry size for a continous block can be set with
1132 mtrr_chunk_size. 1134 mtrr_chunk_size.
1133 1135
1134 If unsure, say N. 1136 If unsure, say Y.
1135 1137
1136config MTRR_SANITIZER_ENABLE_DEFAULT 1138config MTRR_SANITIZER_ENABLE_DEFAULT
1137 int "MTRR cleanup enable value (0-1)" 1139 int "MTRR cleanup enable value (0-1)"
@@ -1191,7 +1193,6 @@ config IRQBALANCE
1191config SECCOMP 1193config SECCOMP
1192 def_bool y 1194 def_bool y
1193 prompt "Enable seccomp to safely compute untrusted bytecode" 1195 prompt "Enable seccomp to safely compute untrusted bytecode"
1194 depends on PROC_FS
1195 help 1196 help
1196 This kernel feature is useful for number crunching applications 1197 This kernel feature is useful for number crunching applications
1197 that may need to compute untrusted bytecode during their 1198 that may need to compute untrusted bytecode during their
@@ -1199,7 +1200,7 @@ config SECCOMP
1199 the process as file descriptors supporting the read/write 1200 the process as file descriptors supporting the read/write
1200 syscalls, it's possible to isolate those applications in 1201 syscalls, it's possible to isolate those applications in
1201 their own address space using seccomp. Once seccomp is 1202 their own address space using seccomp. Once seccomp is
1202 enabled via /proc/<pid>/seccomp, it cannot be disabled 1203 enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
1203 and the task is only allowed to execute a few safe syscalls 1204 and the task is only allowed to execute a few safe syscalls
1204 defined by each seccomp mode. 1205 defined by each seccomp mode.
1205 1206
@@ -1356,14 +1357,14 @@ config PHYSICAL_ALIGN
1356 Don't change this unless you know what you are doing. 1357 Don't change this unless you know what you are doing.
1357 1358
1358config HOTPLUG_CPU 1359config HOTPLUG_CPU
1359 bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)" 1360 bool "Support for hot-pluggable CPUs"
1360 depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER 1361 depends on SMP && HOTPLUG && !X86_VOYAGER
1361 ---help--- 1362 ---help---
1362 Say Y here to experiment with turning CPUs off and on, and to 1363 Say Y here to allow turning CPUs off and on. CPUs can be
1363 enable suspend on SMP systems. CPUs can be controlled through 1364 controlled through /sys/devices/system/cpu.
1364 /sys/devices/system/cpu. 1365 ( Note: power management support will enable this option
1365 Say N if you want to disable CPU hotplug and don't need to 1366 automatically on SMP systems. )
1366 suspend. 1367 Say N if you want to disable CPU hotplug.
1367 1368
1368config COMPAT_VDSO 1369config COMPAT_VDSO
1369 def_bool y 1370 def_bool y
@@ -1378,6 +1379,51 @@ config COMPAT_VDSO
1378 1379
1379 If unsure, say Y. 1380 If unsure, say Y.
1380 1381
1382config CMDLINE_BOOL
1383 bool "Built-in kernel command line"
1384 default n
1385 help
1386 Allow for specifying boot arguments to the kernel at
1387 build time. On some systems (e.g. embedded ones), it is
1388 necessary or convenient to provide some or all of the
1389 kernel boot arguments with the kernel itself (that is,
1390 to not rely on the boot loader to provide them.)
1391
1392 To compile command line arguments into the kernel,
1393 set this option to 'Y', then fill in the
1394 the boot arguments in CONFIG_CMDLINE.
1395
1396 Systems with fully functional boot loaders (i.e. non-embedded)
1397 should leave this option set to 'N'.
1398
1399config CMDLINE
1400 string "Built-in kernel command string"
1401 depends on CMDLINE_BOOL
1402 default ""
1403 help
1404 Enter arguments here that should be compiled into the kernel
1405 image and used at boot time. If the boot loader provides a
1406 command line at boot time, it is appended to this string to
1407 form the full kernel command line, when the system boots.
1408
1409 However, you can use the CONFIG_CMDLINE_OVERRIDE option to
1410 change this behavior.
1411
1412 In most cases, the command line (whether built-in or provided
1413 by the boot loader) should specify the device for the root
1414 file system.
1415
1416config CMDLINE_OVERRIDE
1417 bool "Built-in command line overrides boot loader arguments"
1418 default n
1419 depends on CMDLINE_BOOL
1420 help
1421 Set this option to 'Y' to have the kernel ignore the boot loader
1422 command line, and use ONLY the built-in command line.
1423
1424 This is used to work around broken boot loaders. This should
1425 be set to 'N' under normal conditions.
1426
1381endmenu 1427endmenu
1382 1428
1383config ARCH_ENABLE_MEMORY_HOTPLUG 1429config ARCH_ENABLE_MEMORY_HOTPLUG
@@ -1781,7 +1827,7 @@ config COMPAT_FOR_U64_ALIGNMENT
1781 1827
1782config SYSVIPC_COMPAT 1828config SYSVIPC_COMPAT
1783 def_bool y 1829 def_bool y
1784 depends on X86_64 && COMPAT && SYSVIPC 1830 depends on COMPAT && SYSVIPC
1785 1831
1786endmenu 1832endmenu
1787 1833
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 6156ac25ff8c..f8843c3ae77d 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -382,14 +382,17 @@ config X86_OOSTORE
382# P6_NOPs are a relatively minor optimization that require a family >= 382# P6_NOPs are a relatively minor optimization that require a family >=
383# 6 processor, except that it is broken on certain VIA chips. 383# 6 processor, except that it is broken on certain VIA chips.
384# Furthermore, AMD chips prefer a totally different sequence of NOPs 384# Furthermore, AMD chips prefer a totally different sequence of NOPs
385# (which work on all CPUs). As a result, disallow these if we're 385# (which work on all CPUs). In addition, it looks like Virtual PC
386# compiling X86_GENERIC but not X86_64 (these NOPs do work on all 386# does not understand them.
387# x86-64 capable chips); the list of processors in the right-hand clause 387#
388# are the cores that benefit from this optimization. 388# As a result, disallow these if we're not compiling for X86_64 (these
389# NOPs do work on all x86-64 capable chips); the list of processors in
390# the right-hand clause are the cores that benefit from this optimization.
389# 391#
390config X86_P6_NOP 392config X86_P6_NOP
391 def_bool y 393 def_bool y
392 depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC) 394 depends on X86_64
395 depends on (MCORE2 || MPENTIUM4 || MPSC)
393 396
394config X86_TSC 397config X86_TSC
395 def_bool y 398 def_bool y
@@ -423,17 +426,9 @@ menuconfig PROCESSOR_SELECT
423 This lets you choose what x86 vendor support code your kernel 426 This lets you choose what x86 vendor support code your kernel
424 will include. 427 will include.
425 428
426config CPU_SUP_INTEL_32 429config CPU_SUP_INTEL
427 default y 430 default y
428 bool "Support Intel processors" if PROCESSOR_SELECT 431 bool "Support Intel processors" if PROCESSOR_SELECT
429 depends on !64BIT
430 help
431 This enables extended support for Intel processors
432
433config CPU_SUP_INTEL_64
434 default y
435 bool "Support Intel processors" if PROCESSOR_SELECT
436 depends on 64BIT
437 help 432 help
438 This enables extended support for Intel processors 433 This enables extended support for Intel processors
439 434
@@ -444,17 +439,9 @@ config CPU_SUP_CYRIX_32
444 help 439 help
445 This enables extended support for Cyrix processors 440 This enables extended support for Cyrix processors
446 441
447config CPU_SUP_AMD_32 442config CPU_SUP_AMD
448 default y
449 bool "Support AMD processors" if PROCESSOR_SELECT
450 depends on !64BIT
451 help
452 This enables extended support for AMD processors
453
454config CPU_SUP_AMD_64
455 default y 443 default y
456 bool "Support AMD processors" if PROCESSOR_SELECT 444 bool "Support AMD processors" if PROCESSOR_SELECT
457 depends on 64BIT
458 help 445 help
459 This enables extended support for AMD processors 446 This enables extended support for AMD processors
460 447
@@ -485,3 +472,21 @@ config CPU_SUP_UMC_32
485 depends on !64BIT 472 depends on !64BIT
486 help 473 help
487 This enables extended support for UMC processors 474 This enables extended support for UMC processors
475
476config X86_DS
477 bool "Debug Store support"
478 default y
479 help
480 Add support for Debug Store.
481 This allows the kernel to provide a memory buffer to the hardware
482 to store various profiling and tracing events.
483
484config X86_PTRACE_BTS
485 bool "ptrace interface to Branch Trace Store"
486 default y
487 depends on (X86_DS && X86_DEBUGCTLMSR)
488 help
489 Add a ptrace interface to allow collecting an execution trace
490 of the traced task.
491 This collects control flow changes in a (cyclic) buffer and allows
492 debuggers to fill in the gaps and show an execution trace of the debuggee.
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index ba7736cf2ec7..29c5fbf08392 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -137,14 +137,15 @@ relocated:
137 */ 137 */
138 movl output_len(%ebx), %eax 138 movl output_len(%ebx), %eax
139 pushl %eax 139 pushl %eax
140 # push arguments for decompress_kernel:
140 pushl %ebp # output address 141 pushl %ebp # output address
141 movl input_len(%ebx), %eax 142 movl input_len(%ebx), %eax
142 pushl %eax # input_len 143 pushl %eax # input_len
143 leal input_data(%ebx), %eax 144 leal input_data(%ebx), %eax
144 pushl %eax # input_data 145 pushl %eax # input_data
145 leal boot_heap(%ebx), %eax 146 leal boot_heap(%ebx), %eax
146 pushl %eax # heap area as third argument 147 pushl %eax # heap area
147 pushl %esi # real mode pointer as second arg 148 pushl %esi # real mode pointer
148 call decompress_kernel 149 call decompress_kernel
149 addl $20, %esp 150 addl $20, %esp
150 popl %ecx 151 popl %ecx
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index aaf5a2131efc..5780d361105b 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -27,7 +27,7 @@
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <linux/screen_info.h> 28#include <linux/screen_info.h>
29#include <linux/elf.h> 29#include <linux/elf.h>
30#include <asm/io.h> 30#include <linux/io.h>
31#include <asm/page.h> 31#include <asm/page.h>
32#include <asm/boot.h> 32#include <asm/boot.h>
33#include <asm/bootparam.h> 33#include <asm/bootparam.h>
@@ -251,7 +251,7 @@ static void __putstr(int error, const char *s)
251 y--; 251 y--;
252 } 252 }
253 } else { 253 } else {
254 vidmem [(x + cols * y) * 2] = c; 254 vidmem[(x + cols * y) * 2] = c;
255 if (++x >= cols) { 255 if (++x >= cols) {
256 x = 0; 256 x = 0;
257 if (++y >= lines) { 257 if (++y >= lines) {
@@ -277,7 +277,8 @@ static void *memset(void *s, int c, unsigned n)
277 int i; 277 int i;
278 char *ss = s; 278 char *ss = s;
279 279
280 for (i = 0; i < n; i++) ss[i] = c; 280 for (i = 0; i < n; i++)
281 ss[i] = c;
281 return s; 282 return s;
282} 283}
283 284
@@ -287,7 +288,8 @@ static void *memcpy(void *dest, const void *src, unsigned n)
287 const char *s = src; 288 const char *s = src;
288 char *d = dest; 289 char *d = dest;
289 290
290 for (i = 0; i < n; i++) d[i] = s[i]; 291 for (i = 0; i < n; i++)
292 d[i] = s[i];
291 return dest; 293 return dest;
292} 294}
293 295
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index a1310c52fc0c..857e492c571e 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -492,7 +492,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
492 continue; 492 continue;
493 } 493 }
494 sh_symtab = sec_symtab->symtab; 494 sh_symtab = sec_symtab->symtab;
495 sym_strtab = sec->link->strtab; 495 sym_strtab = sec_symtab->link->strtab;
496 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { 496 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
497 Elf32_Rel *rel; 497 Elf32_Rel *rel;
498 Elf32_Sym *sym; 498 Elf32_Sym *sym;
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 75298fe2edca..6ec6bb6e9957 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -59,17 +59,18 @@ int validate_cpu(void)
59 u32 e = err_flags[i]; 59 u32 e = err_flags[i];
60 60
61 for (j = 0; j < 32; j++) { 61 for (j = 0; j < 32; j++) {
62 int n = (i << 5)+j; 62 if (msg_strs[0] < i ||
63 if (*msg_strs < n) { 63 (msg_strs[0] == i && msg_strs[1] < j)) {
64 /* Skip to the next string */ 64 /* Skip to the next string */
65 do { 65 msg_strs += 2;
66 msg_strs++; 66 while (*msg_strs++)
67 } while (*msg_strs); 67 ;
68 msg_strs++;
69 } 68 }
70 if (e & 1) { 69 if (e & 1) {
71 if (*msg_strs == n && msg_strs[1]) 70 if (msg_strs[0] == i &&
72 printf("%s ", msg_strs+1); 71 msg_strs[1] == j &&
72 msg_strs[2])
73 printf("%s ", msg_strs+2);
73 else 74 else
74 printf("%d:%d ", i, j); 75 printf("%d:%d ", i, j);
75 } 76 }
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index af86e431acfa..b993062e9a5f 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -30,7 +30,6 @@ SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ 30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
31 /* to be loaded */ 31 /* to be loaded */
32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ 32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
33SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
34 33
35#ifndef SVGA_MODE 34#ifndef SVGA_MODE
36#define SVGA_MODE ASK_VGA 35#define SVGA_MODE ASK_VGA
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index 4589caa3e9d1..8ef60f20b371 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -17,31 +17,31 @@
17 17
18#include "../kernel/cpu/capflags.c" 18#include "../kernel/cpu/capflags.c"
19 19
20#if NCAPFLAGS > 8
21# error "Need to adjust the boot code handling of CPUID strings"
22#endif
23
24int main(void) 20int main(void)
25{ 21{
26 int i; 22 int i, j;
27 const char *str; 23 const char *str;
28 24
29 printf("static const char x86_cap_strs[] = \n"); 25 printf("static const char x86_cap_strs[] = \n");
30 26
31 for (i = 0; i < NCAPINTS*32; i++) { 27 for (i = 0; i < NCAPINTS; i++) {
32 str = x86_cap_flags[i]; 28 for (j = 0; j < 32; j++) {
33 29 str = x86_cap_flags[i*32+j];
34 if (i == NCAPINTS*32-1) { 30
35 /* The last entry must be unconditional; this 31 if (i == NCAPINTS-1 && j == 31) {
36 also consumes the compiler-added null character */ 32 /* The last entry must be unconditional; this
37 if (!str) 33 also consumes the compiler-added null
38 str = ""; 34 character */
39 printf("\t\"\\x%02x\"\"%s\"\n", i, str); 35 if (!str)
40 } else if (str) { 36 str = "";
41 printf("#if REQUIRED_MASK%d & (1 << %d)\n" 37 printf("\t\"\\x%02x\\x%02x\"\"%s\"\n",
42 "\t\"\\x%02x\"\"%s\\0\"\n" 38 i, j, str);
43 "#endif\n", 39 } else if (str) {
44 i >> 5, i & 31, i, str); 40 printf("#if REQUIRED_MASK%d & (1 << %d)\n"
41 "\t\"\\x%02x\\x%02x\"\"%s\\0\"\n"
42 "#endif\n",
43 i, j, i, j, str);
44 }
45 } 45 }
46 } 46 }
47 printf("\t;\n"); 47 printf("\t;\n");
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 104275e191a8..ef9a52005ec9 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.27-rc4 3# Linux kernel version: 2.6.27-rc5
4# Mon Aug 25 15:04:00 2008 4# Wed Sep 3 17:23:09 2008
5# 5#
6# CONFIG_64BIT is not set 6# CONFIG_64BIT is not set
7CONFIG_X86_32=y 7CONFIG_X86_32=y
@@ -202,7 +202,7 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
202# CONFIG_M586 is not set 202# CONFIG_M586 is not set
203# CONFIG_M586TSC is not set 203# CONFIG_M586TSC is not set
204# CONFIG_M586MMX is not set 204# CONFIG_M586MMX is not set
205# CONFIG_M686 is not set 205CONFIG_M686=y
206# CONFIG_MPENTIUMII is not set 206# CONFIG_MPENTIUMII is not set
207# CONFIG_MPENTIUMIII is not set 207# CONFIG_MPENTIUMIII is not set
208# CONFIG_MPENTIUMM is not set 208# CONFIG_MPENTIUMM is not set
@@ -221,13 +221,14 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
221# CONFIG_MVIAC3_2 is not set 221# CONFIG_MVIAC3_2 is not set
222# CONFIG_MVIAC7 is not set 222# CONFIG_MVIAC7 is not set
223# CONFIG_MPSC is not set 223# CONFIG_MPSC is not set
224CONFIG_MCORE2=y 224# CONFIG_MCORE2 is not set
225# CONFIG_GENERIC_CPU is not set 225# CONFIG_GENERIC_CPU is not set
226CONFIG_X86_GENERIC=y 226CONFIG_X86_GENERIC=y
227CONFIG_X86_CPU=y 227CONFIG_X86_CPU=y
228CONFIG_X86_CMPXCHG=y 228CONFIG_X86_CMPXCHG=y
229CONFIG_X86_L1_CACHE_SHIFT=7 229CONFIG_X86_L1_CACHE_SHIFT=7
230CONFIG_X86_XADD=y 230CONFIG_X86_XADD=y
231# CONFIG_X86_PPRO_FENCE is not set
231CONFIG_X86_WP_WORKS_OK=y 232CONFIG_X86_WP_WORKS_OK=y
232CONFIG_X86_INVLPG=y 233CONFIG_X86_INVLPG=y
233CONFIG_X86_BSWAP=y 234CONFIG_X86_BSWAP=y
@@ -235,14 +236,15 @@ CONFIG_X86_POPAD_OK=y
235CONFIG_X86_INTEL_USERCOPY=y 236CONFIG_X86_INTEL_USERCOPY=y
236CONFIG_X86_USE_PPRO_CHECKSUM=y 237CONFIG_X86_USE_PPRO_CHECKSUM=y
237CONFIG_X86_TSC=y 238CONFIG_X86_TSC=y
239CONFIG_X86_CMOV=y
238CONFIG_X86_MINIMUM_CPU_FAMILY=4 240CONFIG_X86_MINIMUM_CPU_FAMILY=4
239CONFIG_X86_DEBUGCTLMSR=y 241CONFIG_X86_DEBUGCTLMSR=y
240CONFIG_HPET_TIMER=y 242CONFIG_HPET_TIMER=y
241CONFIG_HPET_EMULATE_RTC=y 243CONFIG_HPET_EMULATE_RTC=y
242CONFIG_DMI=y 244CONFIG_DMI=y
243# CONFIG_IOMMU_HELPER is not set 245# CONFIG_IOMMU_HELPER is not set
244CONFIG_NR_CPUS=4 246CONFIG_NR_CPUS=64
245# CONFIG_SCHED_SMT is not set 247CONFIG_SCHED_SMT=y
246CONFIG_SCHED_MC=y 248CONFIG_SCHED_MC=y
247# CONFIG_PREEMPT_NONE is not set 249# CONFIG_PREEMPT_NONE is not set
248CONFIG_PREEMPT_VOLUNTARY=y 250CONFIG_PREEMPT_VOLUNTARY=y
@@ -254,7 +256,8 @@ CONFIG_VM86=y
254# CONFIG_TOSHIBA is not set 256# CONFIG_TOSHIBA is not set
255# CONFIG_I8K is not set 257# CONFIG_I8K is not set
256CONFIG_X86_REBOOTFIXUPS=y 258CONFIG_X86_REBOOTFIXUPS=y
257# CONFIG_MICROCODE is not set 259CONFIG_MICROCODE=y
260CONFIG_MICROCODE_OLD_INTERFACE=y
258CONFIG_X86_MSR=y 261CONFIG_X86_MSR=y
259CONFIG_X86_CPUID=y 262CONFIG_X86_CPUID=y
260# CONFIG_NOHIGHMEM is not set 263# CONFIG_NOHIGHMEM is not set
@@ -2115,7 +2118,7 @@ CONFIG_IO_DELAY_0X80=y
2115CONFIG_DEFAULT_IO_DELAY_TYPE=0 2118CONFIG_DEFAULT_IO_DELAY_TYPE=0
2116CONFIG_DEBUG_BOOT_PARAMS=y 2119CONFIG_DEBUG_BOOT_PARAMS=y
2117# CONFIG_CPA_DEBUG is not set 2120# CONFIG_CPA_DEBUG is not set
2118# CONFIG_OPTIMIZE_INLINING is not set 2121CONFIG_OPTIMIZE_INLINING=y
2119 2122
2120# 2123#
2121# Security options 2124# Security options
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 678c8acefe04..e620ea6e2a7a 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.27-rc4 3# Linux kernel version: 2.6.27-rc5
4# Mon Aug 25 14:40:46 2008 4# Wed Sep 3 17:13:39 2008
5# 5#
6CONFIG_64BIT=y 6CONFIG_64BIT=y
7# CONFIG_X86_32 is not set 7# CONFIG_X86_32 is not set
@@ -218,17 +218,14 @@ CONFIG_X86_PC=y
218# CONFIG_MVIAC3_2 is not set 218# CONFIG_MVIAC3_2 is not set
219# CONFIG_MVIAC7 is not set 219# CONFIG_MVIAC7 is not set
220# CONFIG_MPSC is not set 220# CONFIG_MPSC is not set
221CONFIG_MCORE2=y 221# CONFIG_MCORE2 is not set
222# CONFIG_GENERIC_CPU is not set 222CONFIG_GENERIC_CPU=y
223CONFIG_X86_CPU=y 223CONFIG_X86_CPU=y
224CONFIG_X86_L1_CACHE_BYTES=64 224CONFIG_X86_L1_CACHE_BYTES=128
225CONFIG_X86_INTERNODE_CACHE_BYTES=64 225CONFIG_X86_INTERNODE_CACHE_BYTES=128
226CONFIG_X86_CMPXCHG=y 226CONFIG_X86_CMPXCHG=y
227CONFIG_X86_L1_CACHE_SHIFT=6 227CONFIG_X86_L1_CACHE_SHIFT=7
228CONFIG_X86_WP_WORKS_OK=y 228CONFIG_X86_WP_WORKS_OK=y
229CONFIG_X86_INTEL_USERCOPY=y
230CONFIG_X86_USE_PPRO_CHECKSUM=y
231CONFIG_X86_P6_NOP=y
232CONFIG_X86_TSC=y 229CONFIG_X86_TSC=y
233CONFIG_X86_CMPXCHG64=y 230CONFIG_X86_CMPXCHG64=y
234CONFIG_X86_CMOV=y 231CONFIG_X86_CMOV=y
@@ -243,9 +240,8 @@ CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
243CONFIG_AMD_IOMMU=y 240CONFIG_AMD_IOMMU=y
244CONFIG_SWIOTLB=y 241CONFIG_SWIOTLB=y
245CONFIG_IOMMU_HELPER=y 242CONFIG_IOMMU_HELPER=y
246# CONFIG_MAXSMP is not set 243CONFIG_NR_CPUS=64
247CONFIG_NR_CPUS=4 244CONFIG_SCHED_SMT=y
248# CONFIG_SCHED_SMT is not set
249CONFIG_SCHED_MC=y 245CONFIG_SCHED_MC=y
250# CONFIG_PREEMPT_NONE is not set 246# CONFIG_PREEMPT_NONE is not set
251CONFIG_PREEMPT_VOLUNTARY=y 247CONFIG_PREEMPT_VOLUNTARY=y
@@ -254,7 +250,8 @@ CONFIG_X86_LOCAL_APIC=y
254CONFIG_X86_IO_APIC=y 250CONFIG_X86_IO_APIC=y
255# CONFIG_X86_MCE is not set 251# CONFIG_X86_MCE is not set
256# CONFIG_I8K is not set 252# CONFIG_I8K is not set
257# CONFIG_MICROCODE is not set 253CONFIG_MICROCODE=y
254CONFIG_MICROCODE_OLD_INTERFACE=y
258CONFIG_X86_MSR=y 255CONFIG_X86_MSR=y
259CONFIG_X86_CPUID=y 256CONFIG_X86_CPUID=y
260CONFIG_NUMA=y 257CONFIG_NUMA=y
@@ -290,7 +287,7 @@ CONFIG_BOUNCE=y
290CONFIG_VIRT_TO_BUS=y 287CONFIG_VIRT_TO_BUS=y
291CONFIG_MTRR=y 288CONFIG_MTRR=y
292# CONFIG_MTRR_SANITIZER is not set 289# CONFIG_MTRR_SANITIZER is not set
293# CONFIG_X86_PAT is not set 290CONFIG_X86_PAT=y
294CONFIG_EFI=y 291CONFIG_EFI=y
295CONFIG_SECCOMP=y 292CONFIG_SECCOMP=y
296# CONFIG_HZ_100 is not set 293# CONFIG_HZ_100 is not set
@@ -2089,7 +2086,7 @@ CONFIG_IO_DELAY_0X80=y
2089CONFIG_DEFAULT_IO_DELAY_TYPE=0 2086CONFIG_DEFAULT_IO_DELAY_TYPE=0
2090CONFIG_DEBUG_BOOT_PARAMS=y 2087CONFIG_DEBUG_BOOT_PARAMS=y
2091# CONFIG_CPA_DEBUG is not set 2088# CONFIG_CPA_DEBUG is not set
2092# CONFIG_OPTIMIZE_INLINING is not set 2089CONFIG_OPTIMIZE_INLINING=y
2093 2090
2094# 2091#
2095# Security options 2092# Security options
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3874c2de5403..903de4aa5094 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
10obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 10obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
11obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 11obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
12 12
13obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
14
13aes-i586-y := aes-i586-asm_32.o aes_glue.o 15aes-i586-y := aes-i586-asm_32.o aes_glue.o
14twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o 16twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
15salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o 17salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c
new file mode 100644
index 000000000000..070afc5b6c94
--- /dev/null
+++ b/arch/x86/crypto/crc32c-intel.c
@@ -0,0 +1,197 @@
1/*
2 * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
3 * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
4 * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
5 * http://www.intel.com/products/processor/manuals/
6 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
7 * Volume 2A: Instruction Set Reference, A-M
8 *
9 * Copyright (c) 2008 Austin Zhang <austin_zhang@linux.intel.com>
10 * Copyright (c) 2008 Kent Liu <kent.liu@intel.com>
11 *
12 * This program is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 2 of the License, or (at your option)
15 * any later version.
16 *
17 */
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/string.h>
21#include <linux/kernel.h>
22#include <crypto/internal/hash.h>
23
24#include <asm/cpufeature.h>
25
26#define CHKSUM_BLOCK_SIZE 1
27#define CHKSUM_DIGEST_SIZE 4
28
29#define SCALE_F sizeof(unsigned long)
30
31#ifdef CONFIG_X86_64
32#define REX_PRE "0x48, "
33#else
34#define REX_PRE
35#endif
36
37static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
38{
39 while (length--) {
40 __asm__ __volatile__(
41 ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
42 :"=S"(crc)
43 :"0"(crc), "c"(*data)
44 );
45 data++;
46 }
47
48 return crc;
49}
50
51static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
52{
53 unsigned int iquotient = len / SCALE_F;
54 unsigned int iremainder = len % SCALE_F;
55 unsigned long *ptmp = (unsigned long *)p;
56
57 while (iquotient--) {
58 __asm__ __volatile__(
59 ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
60 :"=S"(crc)
61 :"0"(crc), "c"(*ptmp)
62 );
63 ptmp++;
64 }
65
66 if (iremainder)
67 crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
68 iremainder);
69
70 return crc;
71}
72
73/*
74 * Setting the seed allows arbitrary accumulators and flexible XOR policy
75 * If your algorithm starts with ~0, then XOR with ~0 before you set
76 * the seed.
77 */
78static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key,
79 unsigned int keylen)
80{
81 u32 *mctx = crypto_ahash_ctx(hash);
82
83 if (keylen != sizeof(u32)) {
84 crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
85 return -EINVAL;
86 }
87 *mctx = le32_to_cpup((__le32 *)key);
88 return 0;
89}
90
91static int crc32c_intel_init(struct ahash_request *req)
92{
93 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
94 u32 *crcp = ahash_request_ctx(req);
95
96 *crcp = *mctx;
97
98 return 0;
99}
100
101static int crc32c_intel_update(struct ahash_request *req)
102{
103 struct crypto_hash_walk walk;
104 u32 *crcp = ahash_request_ctx(req);
105 u32 crc = *crcp;
106 int nbytes;
107
108 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
109 nbytes = crypto_hash_walk_done(&walk, 0))
110 crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
111
112 *crcp = crc;
113 return 0;
114}
115
116static int crc32c_intel_final(struct ahash_request *req)
117{
118 u32 *crcp = ahash_request_ctx(req);
119
120 *(__le32 *)req->result = ~cpu_to_le32p(crcp);
121 return 0;
122}
123
124static int crc32c_intel_digest(struct ahash_request *req)
125{
126 struct crypto_hash_walk walk;
127 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
128 u32 crc = *mctx;
129 int nbytes;
130
131 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
132 nbytes = crypto_hash_walk_done(&walk, 0))
133 crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
134
135 *(__le32 *)req->result = ~cpu_to_le32(crc);
136 return 0;
137}
138
139static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
140{
141 u32 *key = crypto_tfm_ctx(tfm);
142
143 *key = ~0;
144
145 tfm->crt_ahash.reqsize = sizeof(u32);
146
147 return 0;
148}
149
150static struct crypto_alg alg = {
151 .cra_name = "crc32c",
152 .cra_driver_name = "crc32c-intel",
153 .cra_priority = 200,
154 .cra_flags = CRYPTO_ALG_TYPE_AHASH,
155 .cra_blocksize = CHKSUM_BLOCK_SIZE,
156 .cra_alignmask = 3,
157 .cra_ctxsize = sizeof(u32),
158 .cra_module = THIS_MODULE,
159 .cra_list = LIST_HEAD_INIT(alg.cra_list),
160 .cra_init = crc32c_intel_cra_init,
161 .cra_type = &crypto_ahash_type,
162 .cra_u = {
163 .ahash = {
164 .digestsize = CHKSUM_DIGEST_SIZE,
165 .setkey = crc32c_intel_setkey,
166 .init = crc32c_intel_init,
167 .update = crc32c_intel_update,
168 .final = crc32c_intel_final,
169 .digest = crc32c_intel_digest,
170 }
171 }
172};
173
174
175static int __init crc32c_intel_mod_init(void)
176{
177 if (cpu_has_xmm4_2)
178 return crypto_register_alg(&alg);
179 else
180 return -ENODEV;
181}
182
183static void __exit crc32c_intel_mod_fini(void)
184{
185 crypto_unregister_alg(&alg);
186}
187
188module_init(crc32c_intel_mod_init);
189module_exit(crc32c_intel_mod_fini);
190
191MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
192MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
193MODULE_LICENSE("GPL");
194
195MODULE_ALIAS("crc32c");
196MODULE_ALIAS("crc32c-intel");
197
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index a0e1dbe67dc1..127ec3f07214 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -85,8 +85,10 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
85 dump->regs.ax = regs->ax; 85 dump->regs.ax = regs->ax;
86 dump->regs.ds = current->thread.ds; 86 dump->regs.ds = current->thread.ds;
87 dump->regs.es = current->thread.es; 87 dump->regs.es = current->thread.es;
88 asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; 88 savesegment(fs, fs);
89 asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; 89 dump->regs.fs = fs;
90 savesegment(gs, gs);
91 dump->regs.gs = gs;
90 dump->regs.orig_ax = regs->orig_ax; 92 dump->regs.orig_ax = regs->orig_ax;
91 dump->regs.ip = regs->ip; 93 dump->regs.ip = regs->ip;
92 dump->regs.cs = regs->cs; 94 dump->regs.cs = regs->cs;
@@ -430,8 +432,9 @@ beyond_if:
430 current->mm->start_stack = 432 current->mm->start_stack =
431 (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); 433 (unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
432 /* start thread */ 434 /* start thread */
433 asm volatile("movl %0,%%fs" :: "r" (0)); \ 435 loadsegment(fs, 0);
434 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); 436 loadsegment(ds, __USER32_DS);
437 loadsegment(es, __USER32_DS);
435 load_gs_index(0); 438 load_gs_index(0);
436 (regs)->ip = ex.a_entry; 439 (regs)->ip = ex.a_entry;
437 (regs)->sp = current->mm->start_stack; 440 (regs)->sp = current->mm->start_stack;
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index f25a10124005..8d64c1bc8474 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -207,7 +207,7 @@ struct rt_sigframe
207 { unsigned int cur; \ 207 { unsigned int cur; \
208 unsigned short pre; \ 208 unsigned short pre; \
209 err |= __get_user(pre, &sc->seg); \ 209 err |= __get_user(pre, &sc->seg); \
210 asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ 210 savesegment(seg, cur); \
211 pre |= mask; \ 211 pre |= mask; \
212 if (pre != cur) loadsegment(seg, pre); } 212 if (pre != cur) loadsegment(seg, pre); }
213 213
@@ -236,7 +236,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
236 */ 236 */
237 err |= __get_user(gs, &sc->gs); 237 err |= __get_user(gs, &sc->gs);
238 gs |= 3; 238 gs |= 3;
239 asm("movl %%gs,%0" : "=r" (oldgs)); 239 savesegment(gs, oldgs);
240 if (gs != oldgs) 240 if (gs != oldgs)
241 load_gs_index(gs); 241 load_gs_index(gs);
242 242
@@ -342,14 +342,13 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
342{ 342{
343 int tmp, err = 0; 343 int tmp, err = 0;
344 344
345 tmp = 0; 345 savesegment(gs, tmp);
346 __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
347 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 346 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
348 __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); 347 savesegment(fs, tmp);
349 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 348 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
350 __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp)); 349 savesegment(ds, tmp);
351 err |= __put_user(tmp, (unsigned int __user *)&sc->ds); 350 err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
352 __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); 351 savesegment(es, tmp);
353 err |= __put_user(tmp, (unsigned int __user *)&sc->es); 352 err |= __put_user(tmp, (unsigned int __user *)&sc->es);
354 353
355 err |= __put_user((u32)regs->di, &sc->di); 354 err |= __put_user((u32)regs->di, &sc->di);
@@ -491,8 +490,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
491 regs->dx = 0; 490 regs->dx = 0;
492 regs->cx = 0; 491 regs->cx = 0;
493 492
494 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 493 loadsegment(ds, __USER32_DS);
495 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 494 loadsegment(es, __USER32_DS);
496 495
497 regs->cs = __USER32_CS; 496 regs->cs = __USER32_CS;
498 regs->ss = __USER32_DS; 497 regs->ss = __USER32_DS;
@@ -588,8 +587,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
588 regs->dx = (unsigned long) &frame->info; 587 regs->dx = (unsigned long) &frame->info;
589 regs->cx = (unsigned long) &frame->uc; 588 regs->cx = (unsigned long) &frame->uc;
590 589
591 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 590 loadsegment(ds, __USER32_DS);
592 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 591 loadsegment(es, __USER32_DS);
593 592
594 regs->cs = __USER32_CS; 593 regs->cs = __USER32_CS;
595 regs->ss = __USER32_DS; 594 regs->ss = __USER32_DS;
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index d3c64088b981..beda4232ce69 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -556,15 +556,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
556 return ret; 556 return ret;
557} 557}
558 558
559/* These are here just in case some old ia32 binary calls it. */
560asmlinkage long sys32_pause(void)
561{
562 current->state = TASK_INTERRUPTIBLE;
563 schedule();
564 return -ERESTARTNOHAND;
565}
566
567
568#ifdef CONFIG_SYSCTL_SYSCALL 559#ifdef CONFIG_SYSCTL_SYSCALL
569struct sysctl_ia32 { 560struct sysctl_ia32 {
570 unsigned int name; 561 unsigned int name;
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 27ef365e757d..c2ac1b4515a0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled);
58#ifdef CONFIG_X86_64 58#ifdef CONFIG_X86_64
59 59
60#include <asm/proto.h> 60#include <asm/proto.h>
61#include <asm/genapic.h>
62 61
63#else /* X86 */ 62#else /* X86 */
64 63
@@ -97,8 +96,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
97#warning ACPI uses CMPXCHG, i486 and later hardware 96#warning ACPI uses CMPXCHG, i486 and later hardware
98#endif 97#endif
99 98
100static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
101
102/* -------------------------------------------------------------------------- 99/* --------------------------------------------------------------------------
103 Boot-time Configuration 100 Boot-time Configuration
104 -------------------------------------------------------------------------- */ 101 -------------------------------------------------------------------------- */
@@ -160,6 +157,8 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
160struct acpi_mcfg_allocation *pci_mmcfg_config; 157struct acpi_mcfg_allocation *pci_mmcfg_config;
161int pci_mmcfg_config_num; 158int pci_mmcfg_config_num;
162 159
160static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
161
163static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) 162static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
164{ 163{
165 if (!strcmp(mcfg->header.oem_id, "SGI")) 164 if (!strcmp(mcfg->header.oem_id, "SGI"))
@@ -253,10 +252,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
253 return; 252 return;
254 } 253 }
255 254
256#ifdef CONFIG_X86_32
257 if (boot_cpu_physical_apicid != -1U) 255 if (boot_cpu_physical_apicid != -1U)
258 ver = apic_version[boot_cpu_physical_apicid]; 256 ver = apic_version[boot_cpu_physical_apicid];
259#endif
260 257
261 generic_processor_info(id, ver); 258 generic_processor_info(id, ver);
262} 259}
@@ -776,10 +773,8 @@ static void __init acpi_register_lapic_address(unsigned long address)
776 set_fixmap_nocache(FIX_APIC_BASE, address); 773 set_fixmap_nocache(FIX_APIC_BASE, address);
777 if (boot_cpu_physical_apicid == -1U) { 774 if (boot_cpu_physical_apicid == -1U) {
778 boot_cpu_physical_apicid = read_apic_id(); 775 boot_cpu_physical_apicid = read_apic_id();
779#ifdef CONFIG_X86_32
780 apic_version[boot_cpu_physical_apicid] = 776 apic_version[boot_cpu_physical_apicid] =
781 GET_APIC_VERSION(apic_read(APIC_LVR)); 777 GET_APIC_VERSION(apic_read(APIC_LVR));
782#endif
783 } 778 }
784} 779}
785 780
@@ -1607,6 +1602,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1607 */ 1602 */
1608 { 1603 {
1609 .callback = dmi_ignore_irq0_timer_override, 1604 .callback = dmi_ignore_irq0_timer_override,
1605 .ident = "HP nx6115 laptop",
1606 .matches = {
1607 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1608 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
1609 },
1610 },
1611 {
1612 .callback = dmi_ignore_irq0_timer_override,
1610 .ident = "HP NX6125 laptop", 1613 .ident = "HP NX6125 laptop",
1611 .matches = { 1614 .matches = {
1612 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), 1615 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
@@ -1621,6 +1624,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1621 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), 1624 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
1622 }, 1625 },
1623 }, 1626 },
1627 {
1628 .callback = dmi_ignore_irq0_timer_override,
1629 .ident = "HP 6715b laptop",
1630 .matches = {
1631 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1632 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
1633 },
1634 },
1624 {} 1635 {}
1625}; 1636};
1626 1637
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 69b4d060b21c..34e4d112b1ef 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -33,6 +33,10 @@
33 33
34static DEFINE_RWLOCK(amd_iommu_devtable_lock); 34static DEFINE_RWLOCK(amd_iommu_devtable_lock);
35 35
36/* A list of preallocated protection domains */
37static LIST_HEAD(iommu_pd_list);
38static DEFINE_SPINLOCK(iommu_pd_list_lock);
39
36/* 40/*
37 * general struct to manage commands send to an IOMMU 41 * general struct to manage commands send to an IOMMU
38 */ 42 */
@@ -51,6 +55,102 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
51 55
52/**************************************************************************** 56/****************************************************************************
53 * 57 *
58 * Interrupt handling functions
59 *
60 ****************************************************************************/
61
62static void iommu_print_event(void *__evt)
63{
64 u32 *event = __evt;
65 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
66 int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
67 int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
68 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
69 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
70
71 printk(KERN_ERR "AMD IOMMU: Event logged [");
72
73 switch (type) {
74 case EVENT_TYPE_ILL_DEV:
75 printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
76 "address=0x%016llx flags=0x%04x]\n",
77 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
78 address, flags);
79 break;
80 case EVENT_TYPE_IO_FAULT:
81 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
82 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
83 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
84 domid, address, flags);
85 break;
86 case EVENT_TYPE_DEV_TAB_ERR:
87 printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
88 "address=0x%016llx flags=0x%04x]\n",
89 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
90 address, flags);
91 break;
92 case EVENT_TYPE_PAGE_TAB_ERR:
93 printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
94 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
95 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
96 domid, address, flags);
97 break;
98 case EVENT_TYPE_ILL_CMD:
99 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
100 break;
101 case EVENT_TYPE_CMD_HARD_ERR:
102 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
103 "flags=0x%04x]\n", address, flags);
104 break;
105 case EVENT_TYPE_IOTLB_INV_TO:
106 printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
107 "address=0x%016llx]\n",
108 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
109 address);
110 break;
111 case EVENT_TYPE_INV_DEV_REQ:
112 printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
113 "address=0x%016llx flags=0x%04x]\n",
114 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
115 address, flags);
116 break;
117 default:
118 printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
119 }
120}
121
122static void iommu_poll_events(struct amd_iommu *iommu)
123{
124 u32 head, tail;
125 unsigned long flags;
126
127 spin_lock_irqsave(&iommu->lock, flags);
128
129 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
130 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
131
132 while (head != tail) {
133 iommu_print_event(iommu->evt_buf + head);
134 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
135 }
136
137 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
138
139 spin_unlock_irqrestore(&iommu->lock, flags);
140}
141
142irqreturn_t amd_iommu_int_handler(int irq, void *data)
143{
144 struct amd_iommu *iommu;
145
146 list_for_each_entry(iommu, &amd_iommu_list, list)
147 iommu_poll_events(iommu);
148
149 return IRQ_HANDLED;
150}
151
152/****************************************************************************
153 *
54 * IOMMU command queuing functions 154 * IOMMU command queuing functions
55 * 155 *
56 ****************************************************************************/ 156 ****************************************************************************/
@@ -101,10 +201,10 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
101 */ 201 */
102static int iommu_completion_wait(struct amd_iommu *iommu) 202static int iommu_completion_wait(struct amd_iommu *iommu)
103{ 203{
104 int ret, ready = 0; 204 int ret = 0, ready = 0;
105 unsigned status = 0; 205 unsigned status = 0;
106 struct iommu_cmd cmd; 206 struct iommu_cmd cmd;
107 unsigned long i = 0; 207 unsigned long flags, i = 0;
108 208
109 memset(&cmd, 0, sizeof(cmd)); 209 memset(&cmd, 0, sizeof(cmd));
110 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; 210 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
@@ -112,10 +212,12 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
112 212
113 iommu->need_sync = 0; 213 iommu->need_sync = 0;
114 214
115 ret = iommu_queue_command(iommu, &cmd); 215 spin_lock_irqsave(&iommu->lock, flags);
216
217 ret = __iommu_queue_command(iommu, &cmd);
116 218
117 if (ret) 219 if (ret)
118 return ret; 220 goto out;
119 221
120 while (!ready && (i < EXIT_LOOP_COUNT)) { 222 while (!ready && (i < EXIT_LOOP_COUNT)) {
121 ++i; 223 ++i;
@@ -130,6 +232,8 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
130 232
131 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) 233 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
132 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); 234 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
235out:
236 spin_unlock_irqrestore(&iommu->lock, flags);
133 237
134 return 0; 238 return 0;
135} 239}
@@ -140,6 +244,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
140static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 244static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
141{ 245{
142 struct iommu_cmd cmd; 246 struct iommu_cmd cmd;
247 int ret;
143 248
144 BUG_ON(iommu == NULL); 249 BUG_ON(iommu == NULL);
145 250
@@ -147,9 +252,11 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
147 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 252 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
148 cmd.data[0] = devid; 253 cmd.data[0] = devid;
149 254
255 ret = iommu_queue_command(iommu, &cmd);
256
150 iommu->need_sync = 1; 257 iommu->need_sync = 1;
151 258
152 return iommu_queue_command(iommu, &cmd); 259 return ret;
153} 260}
154 261
155/* 262/*
@@ -159,6 +266,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
159 u64 address, u16 domid, int pde, int s) 266 u64 address, u16 domid, int pde, int s)
160{ 267{
161 struct iommu_cmd cmd; 268 struct iommu_cmd cmd;
269 int ret;
162 270
163 memset(&cmd, 0, sizeof(cmd)); 271 memset(&cmd, 0, sizeof(cmd));
164 address &= PAGE_MASK; 272 address &= PAGE_MASK;
@@ -171,9 +279,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
171 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ 279 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
172 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 280 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
173 281
282 ret = iommu_queue_command(iommu, &cmd);
283
174 iommu->need_sync = 1; 284 iommu->need_sync = 1;
175 285
176 return iommu_queue_command(iommu, &cmd); 286 return ret;
177} 287}
178 288
179/* 289/*
@@ -203,6 +313,14 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
203 return 0; 313 return 0;
204} 314}
205 315
316/* Flush the whole IO/TLB for a given protection domain */
317static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
318{
319 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
320
321 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
322}
323
206/**************************************************************************** 324/****************************************************************************
207 * 325 *
208 * The functions below are used the create the page table mappings for 326 * The functions below are used the create the page table mappings for
@@ -362,11 +480,6 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
362 * efficient allocator. 480 * efficient allocator.
363 * 481 *
364 ****************************************************************************/ 482 ****************************************************************************/
365static unsigned long dma_mask_to_pages(unsigned long mask)
366{
367 return (mask >> PAGE_SHIFT) +
368 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
369}
370 483
371/* 484/*
372 * The address allocator core function. 485 * The address allocator core function.
@@ -375,25 +488,31 @@ static unsigned long dma_mask_to_pages(unsigned long mask)
375 */ 488 */
376static unsigned long dma_ops_alloc_addresses(struct device *dev, 489static unsigned long dma_ops_alloc_addresses(struct device *dev,
377 struct dma_ops_domain *dom, 490 struct dma_ops_domain *dom,
378 unsigned int pages) 491 unsigned int pages,
492 unsigned long align_mask,
493 u64 dma_mask)
379{ 494{
380 unsigned long limit = dma_mask_to_pages(*dev->dma_mask); 495 unsigned long limit;
381 unsigned long address; 496 unsigned long address;
382 unsigned long size = dom->aperture_size >> PAGE_SHIFT;
383 unsigned long boundary_size; 497 unsigned long boundary_size;
384 498
385 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 499 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
386 PAGE_SIZE) >> PAGE_SHIFT; 500 PAGE_SIZE) >> PAGE_SHIFT;
387 limit = limit < size ? limit : size; 501 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
502 dma_mask >> PAGE_SHIFT);
388 503
389 if (dom->next_bit >= limit) 504 if (dom->next_bit >= limit) {
390 dom->next_bit = 0; 505 dom->next_bit = 0;
506 dom->need_flush = true;
507 }
391 508
392 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, 509 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
393 0 , boundary_size, 0); 510 0 , boundary_size, align_mask);
394 if (address == -1) 511 if (address == -1) {
395 address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 512 address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
396 0, boundary_size, 0); 513 0, boundary_size, align_mask);
514 dom->need_flush = true;
515 }
397 516
398 if (likely(address != -1)) { 517 if (likely(address != -1)) {
399 dom->next_bit = address + pages; 518 dom->next_bit = address + pages;
@@ -459,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
459 if (start_page + pages > last_page) 578 if (start_page + pages > last_page)
460 pages = last_page - start_page; 579 pages = last_page - start_page;
461 580
462 set_bit_string(dom->bitmap, start_page, pages); 581 iommu_area_reserve(dom->bitmap, start_page, pages);
463} 582}
464 583
465static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) 584static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
@@ -553,6 +672,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
553 dma_dom->bitmap[0] = 1; 672 dma_dom->bitmap[0] = 1;
554 dma_dom->next_bit = 0; 673 dma_dom->next_bit = 0;
555 674
675 dma_dom->need_flush = false;
676 dma_dom->target_dev = 0xffff;
677
556 /* Intialize the exclusion range if necessary */ 678 /* Intialize the exclusion range if necessary */
557 if (iommu->exclusion_start && 679 if (iommu->exclusion_start &&
558 iommu->exclusion_start < dma_dom->aperture_size) { 680 iommu->exclusion_start < dma_dom->aperture_size) {
@@ -623,12 +745,13 @@ static void set_device_domain(struct amd_iommu *iommu,
623 745
624 u64 pte_root = virt_to_phys(domain->pt_root); 746 u64 pte_root = virt_to_phys(domain->pt_root);
625 747
626 pte_root |= (domain->mode & 0x07) << 9; 748 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
627 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; 749 << DEV_ENTRY_MODE_SHIFT;
750 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
628 751
629 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 752 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
630 amd_iommu_dev_table[devid].data[0] = pte_root; 753 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
631 amd_iommu_dev_table[devid].data[1] = pte_root >> 32; 754 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
632 amd_iommu_dev_table[devid].data[2] = domain->id; 755 amd_iommu_dev_table[devid].data[2] = domain->id;
633 756
634 amd_iommu_pd_table[devid] = domain; 757 amd_iommu_pd_table[devid] = domain;
@@ -646,6 +769,45 @@ static void set_device_domain(struct amd_iommu *iommu,
646 *****************************************************************************/ 769 *****************************************************************************/
647 770
648/* 771/*
772 * This function checks if the driver got a valid device from the caller to
773 * avoid dereferencing invalid pointers.
774 */
775static bool check_device(struct device *dev)
776{
777 if (!dev || !dev->dma_mask)
778 return false;
779
780 return true;
781}
782
783/*
784 * In this function the list of preallocated protection domains is traversed to
785 * find the domain for a specific device
786 */
787static struct dma_ops_domain *find_protection_domain(u16 devid)
788{
789 struct dma_ops_domain *entry, *ret = NULL;
790 unsigned long flags;
791
792 if (list_empty(&iommu_pd_list))
793 return NULL;
794
795 spin_lock_irqsave(&iommu_pd_list_lock, flags);
796
797 list_for_each_entry(entry, &iommu_pd_list, list) {
798 if (entry->target_dev == devid) {
799 ret = entry;
800 list_del(&ret->list);
801 break;
802 }
803 }
804
805 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
806
807 return ret;
808}
809
810/*
649 * In the dma_ops path we only have the struct device. This function 811 * In the dma_ops path we only have the struct device. This function
650 * finds the corresponding IOMMU, the protection domain and the 812 * finds the corresponding IOMMU, the protection domain and the
651 * requestor id for a given device. 813 * requestor id for a given device.
@@ -661,27 +823,30 @@ static int get_device_resources(struct device *dev,
661 struct pci_dev *pcidev; 823 struct pci_dev *pcidev;
662 u16 _bdf; 824 u16 _bdf;
663 825
664 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); 826 *iommu = NULL;
827 *domain = NULL;
828 *bdf = 0xffff;
829
830 if (dev->bus != &pci_bus_type)
831 return 0;
665 832
666 pcidev = to_pci_dev(dev); 833 pcidev = to_pci_dev(dev);
667 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); 834 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
668 835
669 /* device not translated by any IOMMU in the system? */ 836 /* device not translated by any IOMMU in the system? */
670 if (_bdf > amd_iommu_last_bdf) { 837 if (_bdf > amd_iommu_last_bdf)
671 *iommu = NULL;
672 *domain = NULL;
673 *bdf = 0xffff;
674 return 0; 838 return 0;
675 }
676 839
677 *bdf = amd_iommu_alias_table[_bdf]; 840 *bdf = amd_iommu_alias_table[_bdf];
678 841
679 *iommu = amd_iommu_rlookup_table[*bdf]; 842 *iommu = amd_iommu_rlookup_table[*bdf];
680 if (*iommu == NULL) 843 if (*iommu == NULL)
681 return 0; 844 return 0;
682 dma_dom = (*iommu)->default_dom;
683 *domain = domain_for_device(*bdf); 845 *domain = domain_for_device(*bdf);
684 if (*domain == NULL) { 846 if (*domain == NULL) {
847 dma_dom = find_protection_domain(*bdf);
848 if (!dma_dom)
849 dma_dom = (*iommu)->default_dom;
685 *domain = &dma_dom->domain; 850 *domain = &dma_dom->domain;
686 set_device_domain(*iommu, *domain, *bdf); 851 set_device_domain(*iommu, *domain, *bdf);
687 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 852 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
@@ -760,17 +925,24 @@ static dma_addr_t __map_single(struct device *dev,
760 struct dma_ops_domain *dma_dom, 925 struct dma_ops_domain *dma_dom,
761 phys_addr_t paddr, 926 phys_addr_t paddr,
762 size_t size, 927 size_t size,
763 int dir) 928 int dir,
929 bool align,
930 u64 dma_mask)
764{ 931{
765 dma_addr_t offset = paddr & ~PAGE_MASK; 932 dma_addr_t offset = paddr & ~PAGE_MASK;
766 dma_addr_t address, start; 933 dma_addr_t address, start;
767 unsigned int pages; 934 unsigned int pages;
935 unsigned long align_mask = 0;
768 int i; 936 int i;
769 937
770 pages = iommu_num_pages(paddr, size); 938 pages = iommu_num_pages(paddr, size);
771 paddr &= PAGE_MASK; 939 paddr &= PAGE_MASK;
772 940
773 address = dma_ops_alloc_addresses(dev, dma_dom, pages); 941 if (align)
942 align_mask = (1UL << get_order(size)) - 1;
943
944 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
945 dma_mask);
774 if (unlikely(address == bad_dma_address)) 946 if (unlikely(address == bad_dma_address))
775 goto out; 947 goto out;
776 948
@@ -782,6 +954,12 @@ static dma_addr_t __map_single(struct device *dev,
782 } 954 }
783 address += offset; 955 address += offset;
784 956
957 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
958 iommu_flush_tlb(iommu, dma_dom->domain.id);
959 dma_dom->need_flush = false;
960 } else if (unlikely(iommu_has_npcache(iommu)))
961 iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
962
785out: 963out:
786 return address; 964 return address;
787} 965}
@@ -812,6 +990,9 @@ static void __unmap_single(struct amd_iommu *iommu,
812 } 990 }
813 991
814 dma_ops_free_addresses(dma_dom, dma_addr, pages); 992 dma_ops_free_addresses(dma_dom, dma_addr, pages);
993
994 if (amd_iommu_unmap_flush)
995 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
815} 996}
816 997
817/* 998/*
@@ -825,6 +1006,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
825 struct protection_domain *domain; 1006 struct protection_domain *domain;
826 u16 devid; 1007 u16 devid;
827 dma_addr_t addr; 1008 dma_addr_t addr;
1009 u64 dma_mask;
1010
1011 if (!check_device(dev))
1012 return bad_dma_address;
1013
1014 dma_mask = *dev->dma_mask;
828 1015
829 get_device_resources(dev, &iommu, &domain, &devid); 1016 get_device_resources(dev, &iommu, &domain, &devid);
830 1017
@@ -833,14 +1020,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
833 return (dma_addr_t)paddr; 1020 return (dma_addr_t)paddr;
834 1021
835 spin_lock_irqsave(&domain->lock, flags); 1022 spin_lock_irqsave(&domain->lock, flags);
836 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); 1023 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
1024 dma_mask);
837 if (addr == bad_dma_address) 1025 if (addr == bad_dma_address)
838 goto out; 1026 goto out;
839 1027
840 if (iommu_has_npcache(iommu)) 1028 if (unlikely(iommu->need_sync))
841 iommu_flush_pages(iommu, domain->id, addr, size);
842
843 if (iommu->need_sync)
844 iommu_completion_wait(iommu); 1029 iommu_completion_wait(iommu);
845 1030
846out: 1031out:
@@ -860,7 +1045,8 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
860 struct protection_domain *domain; 1045 struct protection_domain *domain;
861 u16 devid; 1046 u16 devid;
862 1047
863 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1048 if (!check_device(dev) ||
1049 !get_device_resources(dev, &iommu, &domain, &devid))
864 /* device not handled by any AMD IOMMU */ 1050 /* device not handled by any AMD IOMMU */
865 return; 1051 return;
866 1052
@@ -868,9 +1054,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
868 1054
869 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1055 __unmap_single(iommu, domain->priv, dma_addr, size, dir);
870 1056
871 iommu_flush_pages(iommu, domain->id, dma_addr, size); 1057 if (unlikely(iommu->need_sync))
872
873 if (iommu->need_sync)
874 iommu_completion_wait(iommu); 1058 iommu_completion_wait(iommu);
875 1059
876 spin_unlock_irqrestore(&domain->lock, flags); 1060 spin_unlock_irqrestore(&domain->lock, flags);
@@ -909,6 +1093,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
909 struct scatterlist *s; 1093 struct scatterlist *s;
910 phys_addr_t paddr; 1094 phys_addr_t paddr;
911 int mapped_elems = 0; 1095 int mapped_elems = 0;
1096 u64 dma_mask;
1097
1098 if (!check_device(dev))
1099 return 0;
1100
1101 dma_mask = *dev->dma_mask;
912 1102
913 get_device_resources(dev, &iommu, &domain, &devid); 1103 get_device_resources(dev, &iommu, &domain, &devid);
914 1104
@@ -921,19 +1111,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
921 paddr = sg_phys(s); 1111 paddr = sg_phys(s);
922 1112
923 s->dma_address = __map_single(dev, iommu, domain->priv, 1113 s->dma_address = __map_single(dev, iommu, domain->priv,
924 paddr, s->length, dir); 1114 paddr, s->length, dir, false,
1115 dma_mask);
925 1116
926 if (s->dma_address) { 1117 if (s->dma_address) {
927 s->dma_length = s->length; 1118 s->dma_length = s->length;
928 mapped_elems++; 1119 mapped_elems++;
929 } else 1120 } else
930 goto unmap; 1121 goto unmap;
931 if (iommu_has_npcache(iommu))
932 iommu_flush_pages(iommu, domain->id, s->dma_address,
933 s->dma_length);
934 } 1122 }
935 1123
936 if (iommu->need_sync) 1124 if (unlikely(iommu->need_sync))
937 iommu_completion_wait(iommu); 1125 iommu_completion_wait(iommu);
938 1126
939out: 1127out:
@@ -967,7 +1155,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
967 u16 devid; 1155 u16 devid;
968 int i; 1156 int i;
969 1157
970 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1158 if (!check_device(dev) ||
1159 !get_device_resources(dev, &iommu, &domain, &devid))
971 return; 1160 return;
972 1161
973 spin_lock_irqsave(&domain->lock, flags); 1162 spin_lock_irqsave(&domain->lock, flags);
@@ -975,12 +1164,10 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
975 for_each_sg(sglist, s, nelems, i) { 1164 for_each_sg(sglist, s, nelems, i) {
976 __unmap_single(iommu, domain->priv, s->dma_address, 1165 __unmap_single(iommu, domain->priv, s->dma_address,
977 s->dma_length, dir); 1166 s->dma_length, dir);
978 iommu_flush_pages(iommu, domain->id, s->dma_address,
979 s->dma_length);
980 s->dma_address = s->dma_length = 0; 1167 s->dma_address = s->dma_length = 0;
981 } 1168 }
982 1169
983 if (iommu->need_sync) 1170 if (unlikely(iommu->need_sync))
984 iommu_completion_wait(iommu); 1171 iommu_completion_wait(iommu);
985 1172
986 spin_unlock_irqrestore(&domain->lock, flags); 1173 spin_unlock_irqrestore(&domain->lock, flags);
@@ -998,25 +1185,33 @@ static void *alloc_coherent(struct device *dev, size_t size,
998 struct protection_domain *domain; 1185 struct protection_domain *domain;
999 u16 devid; 1186 u16 devid;
1000 phys_addr_t paddr; 1187 phys_addr_t paddr;
1188 u64 dma_mask = dev->coherent_dma_mask;
1189
1190 if (!check_device(dev))
1191 return NULL;
1001 1192
1193 if (!get_device_resources(dev, &iommu, &domain, &devid))
1194 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
1195
1196 flag |= __GFP_ZERO;
1002 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 1197 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1003 if (!virt_addr) 1198 if (!virt_addr)
1004 return 0; 1199 return 0;
1005 1200
1006 memset(virt_addr, 0, size);
1007 paddr = virt_to_phys(virt_addr); 1201 paddr = virt_to_phys(virt_addr);
1008 1202
1009 get_device_resources(dev, &iommu, &domain, &devid);
1010
1011 if (!iommu || !domain) { 1203 if (!iommu || !domain) {
1012 *dma_addr = (dma_addr_t)paddr; 1204 *dma_addr = (dma_addr_t)paddr;
1013 return virt_addr; 1205 return virt_addr;
1014 } 1206 }
1015 1207
1208 if (!dma_mask)
1209 dma_mask = *dev->dma_mask;
1210
1016 spin_lock_irqsave(&domain->lock, flags); 1211 spin_lock_irqsave(&domain->lock, flags);
1017 1212
1018 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1213 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1019 size, DMA_BIDIRECTIONAL); 1214 size, DMA_BIDIRECTIONAL, true, dma_mask);
1020 1215
1021 if (*dma_addr == bad_dma_address) { 1216 if (*dma_addr == bad_dma_address) {
1022 free_pages((unsigned long)virt_addr, get_order(size)); 1217 free_pages((unsigned long)virt_addr, get_order(size));
@@ -1024,10 +1219,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
1024 goto out; 1219 goto out;
1025 } 1220 }
1026 1221
1027 if (iommu_has_npcache(iommu)) 1222 if (unlikely(iommu->need_sync))
1028 iommu_flush_pages(iommu, domain->id, *dma_addr, size);
1029
1030 if (iommu->need_sync)
1031 iommu_completion_wait(iommu); 1223 iommu_completion_wait(iommu);
1032 1224
1033out: 1225out:
@@ -1038,8 +1230,6 @@ out:
1038 1230
1039/* 1231/*
1040 * The exported free_coherent function for dma_ops. 1232 * The exported free_coherent function for dma_ops.
1041 * FIXME: fix the generic x86 DMA layer so that it actually calls that
1042 * function.
1043 */ 1233 */
1044static void free_coherent(struct device *dev, size_t size, 1234static void free_coherent(struct device *dev, size_t size,
1045 void *virt_addr, dma_addr_t dma_addr) 1235 void *virt_addr, dma_addr_t dma_addr)
@@ -1049,6 +1239,9 @@ static void free_coherent(struct device *dev, size_t size,
1049 struct protection_domain *domain; 1239 struct protection_domain *domain;
1050 u16 devid; 1240 u16 devid;
1051 1241
1242 if (!check_device(dev))
1243 return;
1244
1052 get_device_resources(dev, &iommu, &domain, &devid); 1245 get_device_resources(dev, &iommu, &domain, &devid);
1053 1246
1054 if (!iommu || !domain) 1247 if (!iommu || !domain)
@@ -1057,9 +1250,8 @@ static void free_coherent(struct device *dev, size_t size,
1057 spin_lock_irqsave(&domain->lock, flags); 1250 spin_lock_irqsave(&domain->lock, flags);
1058 1251
1059 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 1252 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
1060 iommu_flush_pages(iommu, domain->id, dma_addr, size);
1061 1253
1062 if (iommu->need_sync) 1254 if (unlikely(iommu->need_sync))
1063 iommu_completion_wait(iommu); 1255 iommu_completion_wait(iommu);
1064 1256
1065 spin_unlock_irqrestore(&domain->lock, flags); 1257 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1069,6 +1261,30 @@ free_mem:
1069} 1261}
1070 1262
1071/* 1263/*
1264 * This function is called by the DMA layer to find out if we can handle a
1265 * particular device. It is part of the dma_ops.
1266 */
1267static int amd_iommu_dma_supported(struct device *dev, u64 mask)
1268{
1269 u16 bdf;
1270 struct pci_dev *pcidev;
1271
1272 /* No device or no PCI device */
1273 if (!dev || dev->bus != &pci_bus_type)
1274 return 0;
1275
1276 pcidev = to_pci_dev(dev);
1277
1278 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1279
1280 /* Out of our scope? */
1281 if (bdf > amd_iommu_last_bdf)
1282 return 0;
1283
1284 return 1;
1285}
1286
1287/*
1072 * The function for pre-allocating protection domains. 1288 * The function for pre-allocating protection domains.
1073 * 1289 *
1074 * If the driver core informs the DMA layer if a driver grabs a device 1290 * If the driver core informs the DMA layer if a driver grabs a device
@@ -1097,10 +1313,9 @@ void prealloc_protection_domains(void)
1097 if (!dma_dom) 1313 if (!dma_dom)
1098 continue; 1314 continue;
1099 init_unity_mappings_for_device(dma_dom, devid); 1315 init_unity_mappings_for_device(dma_dom, devid);
1100 set_device_domain(iommu, &dma_dom->domain, devid); 1316 dma_dom->target_dev = devid;
1101 printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", 1317
1102 dma_dom->domain.id); 1318 list_add_tail(&dma_dom->list, &iommu_pd_list);
1103 print_devid(devid, 1);
1104 } 1319 }
1105} 1320}
1106 1321
@@ -1111,6 +1326,7 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
1111 .unmap_single = unmap_single, 1326 .unmap_single = unmap_single,
1112 .map_sg = map_sg, 1327 .map_sg = map_sg,
1113 .unmap_sg = unmap_sg, 1328 .unmap_sg = unmap_sg,
1329 .dma_supported = amd_iommu_dma_supported,
1114}; 1330};
1115 1331
1116/* 1332/*
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index a69cc0f52042..148fcfe22f17 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -22,6 +22,8 @@
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <linux/list.h> 23#include <linux/list.h>
24#include <linux/sysdev.h> 24#include <linux/sysdev.h>
25#include <linux/interrupt.h>
26#include <linux/msi.h>
25#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
26#include <asm/amd_iommu_types.h> 28#include <asm/amd_iommu_types.h>
27#include <asm/amd_iommu.h> 29#include <asm/amd_iommu.h>
@@ -30,7 +32,6 @@
30/* 32/*
31 * definitions for the ACPI scanning code 33 * definitions for the ACPI scanning code
32 */ 34 */
33#define PCI_BUS(x) (((x) >> 8) & 0xff)
34#define IVRS_HEADER_LENGTH 48 35#define IVRS_HEADER_LENGTH 48
35 36
36#define ACPI_IVHD_TYPE 0x10 37#define ACPI_IVHD_TYPE 0x10
@@ -121,6 +122,7 @@ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
121 we find in ACPI */ 122 we find in ACPI */
122unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 123unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
123int amd_iommu_isolate; /* if 1, device isolation is enabled */ 124int amd_iommu_isolate; /* if 1, device isolation is enabled */
125bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
124 126
125LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 127LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
126 system */ 128 system */
@@ -234,7 +236,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
234{ 236{
235 u32 ctrl; 237 u32 ctrl;
236 238
237 ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); 239 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
238 ctrl &= ~(1 << bit); 240 ctrl &= ~(1 << bit);
239 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 241 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
240} 242}
@@ -242,13 +244,23 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
242/* Function to enable the hardware */ 244/* Function to enable the hardware */
243void __init iommu_enable(struct amd_iommu *iommu) 245void __init iommu_enable(struct amd_iommu *iommu)
244{ 246{
245 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); 247 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
246 print_devid(iommu->devid, 0); 248 "at %02x:%02x.%x cap 0x%hx\n",
247 printk(" cap 0x%hx\n", iommu->cap_ptr); 249 iommu->dev->bus->number,
250 PCI_SLOT(iommu->dev->devfn),
251 PCI_FUNC(iommu->dev->devfn),
252 iommu->cap_ptr);
248 253
249 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 254 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
250} 255}
251 256
257/* Function to enable IOMMU event logging and event interrupts */
258void __init iommu_enable_event_logging(struct amd_iommu *iommu)
259{
260 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
261 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
262}
263
252/* 264/*
253 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in 265 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
254 * the system has one. 266 * the system has one.
@@ -286,6 +298,14 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
286 ****************************************************************************/ 298 ****************************************************************************/
287 299
288/* 300/*
301 * This function calculates the length of a given IVHD entry
302 */
303static inline int ivhd_entry_length(u8 *ivhd)
304{
305 return 0x04 << (*ivhd >> 6);
306}
307
308/*
289 * This function reads the last device id the IOMMU has to handle from the PCI 309 * This function reads the last device id the IOMMU has to handle from the PCI
290 * capability header for this IOMMU 310 * capability header for this IOMMU
291 */ 311 */
@@ -329,7 +349,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
329 default: 349 default:
330 break; 350 break;
331 } 351 }
332 p += 0x04 << (*p >> 6); 352 p += ivhd_entry_length(p);
333 } 353 }
334 354
335 WARN_ON(p != end); 355 WARN_ON(p != end);
@@ -414,7 +434,32 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
414 434
415static void __init free_command_buffer(struct amd_iommu *iommu) 435static void __init free_command_buffer(struct amd_iommu *iommu)
416{ 436{
417 free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); 437 free_pages((unsigned long)iommu->cmd_buf,
438 get_order(iommu->cmd_buf_size));
439}
440
441/* allocates the memory where the IOMMU will log its events to */
442static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
443{
444 u64 entry;
445 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
446 get_order(EVT_BUFFER_SIZE));
447
448 if (iommu->evt_buf == NULL)
449 return NULL;
450
451 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
452 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
453 &entry, sizeof(entry));
454
455 iommu->evt_buf_size = EVT_BUFFER_SIZE;
456
457 return iommu->evt_buf;
458}
459
460static void __init free_event_buffer(struct amd_iommu *iommu)
461{
462 free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
418} 463}
419 464
420/* sets a specific bit in the device table entry. */ 465/* sets a specific bit in the device table entry. */
@@ -487,19 +532,21 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
487 */ 532 */
488static void __init init_iommu_from_pci(struct amd_iommu *iommu) 533static void __init init_iommu_from_pci(struct amd_iommu *iommu)
489{ 534{
490 int bus = PCI_BUS(iommu->devid);
491 int dev = PCI_SLOT(iommu->devid);
492 int fn = PCI_FUNC(iommu->devid);
493 int cap_ptr = iommu->cap_ptr; 535 int cap_ptr = iommu->cap_ptr;
494 u32 range; 536 u32 range, misc;
495 537
496 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); 538 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
539 &iommu->cap);
540 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
541 &range);
542 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
543 &misc);
497 544
498 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
499 iommu->first_device = calc_devid(MMIO_GET_BUS(range), 545 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
500 MMIO_GET_FD(range)); 546 MMIO_GET_FD(range));
501 iommu->last_device = calc_devid(MMIO_GET_BUS(range), 547 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
502 MMIO_GET_LD(range)); 548 MMIO_GET_LD(range));
549 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
503} 550}
504 551
505/* 552/*
@@ -604,7 +651,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
604 break; 651 break;
605 } 652 }
606 653
607 p += 0x04 << (e->type >> 6); 654 p += ivhd_entry_length(p);
608 } 655 }
609} 656}
610 657
@@ -622,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu)
622static void __init free_iommu_one(struct amd_iommu *iommu) 669static void __init free_iommu_one(struct amd_iommu *iommu)
623{ 670{
624 free_command_buffer(iommu); 671 free_command_buffer(iommu);
672 free_event_buffer(iommu);
625 iommu_unmap_mmio_space(iommu); 673 iommu_unmap_mmio_space(iommu);
626} 674}
627 675
@@ -649,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
649 /* 697 /*
650 * Copy data from ACPI table entry to the iommu struct 698 * Copy data from ACPI table entry to the iommu struct
651 */ 699 */
652 iommu->devid = h->devid; 700 iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
701 if (!iommu->dev)
702 return 1;
703
653 iommu->cap_ptr = h->cap_ptr; 704 iommu->cap_ptr = h->cap_ptr;
705 iommu->pci_seg = h->pci_seg;
654 iommu->mmio_phys = h->mmio_phys; 706 iommu->mmio_phys = h->mmio_phys;
655 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); 707 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
656 if (!iommu->mmio_base) 708 if (!iommu->mmio_base)
@@ -661,10 +713,18 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
661 if (!iommu->cmd_buf) 713 if (!iommu->cmd_buf)
662 return -ENOMEM; 714 return -ENOMEM;
663 715
716 iommu->evt_buf = alloc_event_buffer(iommu);
717 if (!iommu->evt_buf)
718 return -ENOMEM;
719
720 iommu->int_enabled = false;
721
664 init_iommu_from_pci(iommu); 722 init_iommu_from_pci(iommu);
665 init_iommu_from_acpi(iommu, h); 723 init_iommu_from_acpi(iommu, h);
666 init_iommu_devices(iommu); 724 init_iommu_devices(iommu);
667 725
726 pci_enable_device(iommu->dev);
727
668 return 0; 728 return 0;
669} 729}
670 730
@@ -706,6 +766,95 @@ static int __init init_iommu_all(struct acpi_table_header *table)
706 766
707/**************************************************************************** 767/****************************************************************************
708 * 768 *
769 * The following functions initialize the MSI interrupts for all IOMMUs
770 * in the system. Its a bit challenging because there could be multiple
771 * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
772 * pci_dev.
773 *
774 ****************************************************************************/
775
776static int __init iommu_setup_msix(struct amd_iommu *iommu)
777{
778 struct amd_iommu *curr;
779 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
780 int nvec = 0, i;
781
782 list_for_each_entry(curr, &amd_iommu_list, list) {
783 if (curr->dev == iommu->dev) {
784 entries[nvec].entry = curr->evt_msi_num;
785 entries[nvec].vector = 0;
786 curr->int_enabled = true;
787 nvec++;
788 }
789 }
790
791 if (pci_enable_msix(iommu->dev, entries, nvec)) {
792 pci_disable_msix(iommu->dev);
793 return 1;
794 }
795
796 for (i = 0; i < nvec; ++i) {
797 int r = request_irq(entries->vector, amd_iommu_int_handler,
798 IRQF_SAMPLE_RANDOM,
799 "AMD IOMMU",
800 NULL);
801 if (r)
802 goto out_free;
803 }
804
805 return 0;
806
807out_free:
808 for (i -= 1; i >= 0; --i)
809 free_irq(entries->vector, NULL);
810
811 pci_disable_msix(iommu->dev);
812
813 return 1;
814}
815
816static int __init iommu_setup_msi(struct amd_iommu *iommu)
817{
818 int r;
819 struct amd_iommu *curr;
820
821 list_for_each_entry(curr, &amd_iommu_list, list) {
822 if (curr->dev == iommu->dev)
823 curr->int_enabled = true;
824 }
825
826
827 if (pci_enable_msi(iommu->dev))
828 return 1;
829
830 r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
831 IRQF_SAMPLE_RANDOM,
832 "AMD IOMMU",
833 NULL);
834
835 if (r) {
836 pci_disable_msi(iommu->dev);
837 return 1;
838 }
839
840 return 0;
841}
842
843static int __init iommu_init_msi(struct amd_iommu *iommu)
844{
845 if (iommu->int_enabled)
846 return 0;
847
848 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
849 return iommu_setup_msix(iommu);
850 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
851 return iommu_setup_msi(iommu);
852
853 return 1;
854}
855
856/****************************************************************************
857 *
709 * The next functions belong to the third pass of parsing the ACPI 858 * The next functions belong to the third pass of parsing the ACPI
710 * table. In this last pass the memory mapping requirements are 859 * table. In this last pass the memory mapping requirements are
711 * gathered (like exclusion and unity mapping reanges). 860 * gathered (like exclusion and unity mapping reanges).
@@ -811,7 +960,6 @@ static void init_device_table(void)
811 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { 960 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
812 set_dev_entry_bit(devid, DEV_ENTRY_VALID); 961 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
813 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); 962 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
814 set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT);
815 } 963 }
816} 964}
817 965
@@ -825,6 +973,8 @@ static void __init enable_iommus(void)
825 973
826 list_for_each_entry(iommu, &amd_iommu_list, list) { 974 list_for_each_entry(iommu, &amd_iommu_list, list) {
827 iommu_set_exclusion_range(iommu); 975 iommu_set_exclusion_range(iommu);
976 iommu_init_msi(iommu);
977 iommu_enable_event_logging(iommu);
828 iommu_enable(iommu); 978 iommu_enable(iommu);
829 } 979 }
830} 980}
@@ -995,11 +1145,17 @@ int __init amd_iommu_init(void)
995 else 1145 else
996 printk("disabled\n"); 1146 printk("disabled\n");
997 1147
1148 if (amd_iommu_unmap_flush)
1149 printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
1150 else
1151 printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
1152
998out: 1153out:
999 return ret; 1154 return ret;
1000 1155
1001free: 1156free:
1002 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); 1157 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1158 get_order(MAX_DOMAIN_ID/8));
1003 1159
1004 free_pages((unsigned long)amd_iommu_pd_table, 1160 free_pages((unsigned long)amd_iommu_pd_table,
1005 get_order(rlookup_table_size)); 1161 get_order(rlookup_table_size));
@@ -1057,8 +1213,10 @@ void __init amd_iommu_detect(void)
1057static int __init parse_amd_iommu_options(char *str) 1213static int __init parse_amd_iommu_options(char *str)
1058{ 1214{
1059 for (; *str; ++str) { 1215 for (; *str; ++str) {
1060 if (strcmp(str, "isolate") == 0) 1216 if (strncmp(str, "isolate", 7) == 0)
1061 amd_iommu_isolate = 1; 1217 amd_iommu_isolate = 1;
1218 if (strncmp(str, "fullflush", 11) == 0)
1219 amd_iommu_unmap_flush = true;
1062 } 1220 }
1063 1221
1064 return 1; 1222 return 1;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 44e21826db11..9a32b37ee2ee 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -455,11 +455,11 @@ out:
455 force_iommu || 455 force_iommu ||
456 valid_agp || 456 valid_agp ||
457 fallback_aper_force) { 457 fallback_aper_force) {
458 printk(KERN_ERR 458 printk(KERN_INFO
459 "Your BIOS doesn't leave a aperture memory hole\n"); 459 "Your BIOS doesn't leave a aperture memory hole\n");
460 printk(KERN_ERR 460 printk(KERN_INFO
461 "Please enable the IOMMU option in the BIOS setup\n"); 461 "Please enable the IOMMU option in the BIOS setup\n");
462 printk(KERN_ERR 462 printk(KERN_INFO
463 "This costs you %d MB of RAM\n", 463 "This costs you %d MB of RAM\n",
464 32 << fallback_aper_order); 464 32 << fallback_aper_order);
465 465
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 584272105051..a91c57cb666a 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -60,10 +60,8 @@ unsigned long mp_lapic_addr;
60static int force_enable_local_apic; 60static int force_enable_local_apic;
61int disable_apic; 61int disable_apic;
62 62
63/* Local APIC timer verification ok */
64static int local_apic_timer_verify_ok;
65/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 63/* Disable local APIC timer from the kernel commandline or via dmi quirk */
66static int local_apic_timer_disabled; 64static int disable_apic_timer __cpuinitdata;
67/* Local APIC timer works in C2 */ 65/* Local APIC timer works in C2 */
68int local_apic_timer_c2_ok; 66int local_apic_timer_c2_ok;
69EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 67EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -130,7 +128,11 @@ static inline int lapic_get_version(void)
130 */ 128 */
131static inline int lapic_is_integrated(void) 129static inline int lapic_is_integrated(void)
132{ 130{
131#ifdef CONFIG_X86_64
132 return 1;
133#else
133 return APIC_INTEGRATED(lapic_get_version()); 134 return APIC_INTEGRATED(lapic_get_version());
135#endif
134} 136}
135 137
136/* 138/*
@@ -244,8 +246,12 @@ int lapic_get_maxlvt(void)
244 * Local APIC timer 246 * Local APIC timer
245 */ 247 */
246 248
247/* Clock divisor is set to 16 */ 249/* Clock divisor */
250#ifdef CONFG_X86_64
251#define APIC_DIVISOR 1
252#else
248#define APIC_DIVISOR 16 253#define APIC_DIVISOR 16
254#endif
249 255
250/* 256/*
251 * This function sets up the local APIC timer, with a timeout of 257 * This function sets up the local APIC timer, with a timeout of
@@ -253,6 +259,9 @@ int lapic_get_maxlvt(void)
253 * this function twice on the boot CPU, once with a bogus timeout 259 * this function twice on the boot CPU, once with a bogus timeout
254 * value, second time for real. The other (noncalibrating) CPUs 260 * value, second time for real. The other (noncalibrating) CPUs
255 * call this function only once, with the real, calibrated value. 261 * call this function only once, with the real, calibrated value.
262 *
263 * We do reads before writes even if unnecessary, to get around the
264 * P5 APIC double write bug.
256 */ 265 */
257static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 266static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
258{ 267{
@@ -274,14 +283,44 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
274 */ 283 */
275 tmp_value = apic_read(APIC_TDCR); 284 tmp_value = apic_read(APIC_TDCR);
276 apic_write(APIC_TDCR, 285 apic_write(APIC_TDCR,
277 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | 286 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
278 APIC_TDR_DIV_16); 287 APIC_TDR_DIV_16);
279 288
280 if (!oneshot) 289 if (!oneshot)
281 apic_write(APIC_TMICT, clocks / APIC_DIVISOR); 290 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
282} 291}
283 292
284/* 293/*
294 * Setup extended LVT, AMD specific (K8, family 10h)
295 *
296 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
297 * MCE interrupts are supported. Thus MCE offset must be set to 0.
298 */
299
300#define APIC_EILVT_LVTOFF_MCE 0
301#define APIC_EILVT_LVTOFF_IBS 1
302
303static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
304{
305 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
306 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
307
308 apic_write(reg, v);
309}
310
311u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
312{
313 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
314 return APIC_EILVT_LVTOFF_MCE;
315}
316
317u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
318{
319 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
320 return APIC_EILVT_LVTOFF_IBS;
321}
322
323/*
285 * Program the next event, relative to now 324 * Program the next event, relative to now
286 */ 325 */
287static int lapic_next_event(unsigned long delta, 326static int lapic_next_event(unsigned long delta,
@@ -300,8 +339,8 @@ static void lapic_timer_setup(enum clock_event_mode mode,
300 unsigned long flags; 339 unsigned long flags;
301 unsigned int v; 340 unsigned int v;
302 341
303 /* Lapic used for broadcast ? */ 342 /* Lapic used as dummy for broadcast ? */
304 if (!local_apic_timer_verify_ok) 343 if (evt->features & CLOCK_EVT_FEAT_DUMMY)
305 return; 344 return;
306 345
307 local_irq_save(flags); 346 local_irq_save(flags);
@@ -514,7 +553,7 @@ static int __init calibrate_APIC_clock(void)
514 return -1; 553 return -1;
515 } 554 }
516 555
517 local_apic_timer_verify_ok = 1; 556 levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
518 557
519 /* We trust the pm timer based calibration */ 558 /* We trust the pm timer based calibration */
520 if (!pm_referenced) { 559 if (!pm_referenced) {
@@ -548,11 +587,11 @@ static int __init calibrate_APIC_clock(void)
548 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) 587 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
549 apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); 588 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
550 else 589 else
551 local_apic_timer_verify_ok = 0; 590 levt->features |= CLOCK_EVT_FEAT_DUMMY;
552 } else 591 } else
553 local_irq_enable(); 592 local_irq_enable();
554 593
555 if (!local_apic_timer_verify_ok) { 594 if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
556 printk(KERN_WARNING 595 printk(KERN_WARNING
557 "APIC timer disabled due to verification failure.\n"); 596 "APIC timer disabled due to verification failure.\n");
558 return -1; 597 return -1;
@@ -574,7 +613,8 @@ void __init setup_boot_APIC_clock(void)
574 * timer as a dummy clock event source on SMP systems, so the 613 * timer as a dummy clock event source on SMP systems, so the
575 * broadcast mechanism is used. On UP systems simply ignore it. 614 * broadcast mechanism is used. On UP systems simply ignore it.
576 */ 615 */
577 if (local_apic_timer_disabled) { 616 if (disable_apic_timer) {
617 printk(KERN_INFO "Disabling APIC timer\n");
578 /* No broadcast on UP ! */ 618 /* No broadcast on UP ! */
579 if (num_possible_cpus() > 1) { 619 if (num_possible_cpus() > 1) {
580 lapic_clockevent.mult = 1; 620 lapic_clockevent.mult = 1;
@@ -643,7 +683,11 @@ static void local_apic_timer_interrupt(void)
643 /* 683 /*
644 * the NMI deadlock-detector uses this. 684 * the NMI deadlock-detector uses this.
645 */ 685 */
686#ifdef CONFIG_X86_64
687 add_pda(apic_timer_irqs, 1);
688#else
646 per_cpu(irq_stat, cpu).apic_timer_irqs++; 689 per_cpu(irq_stat, cpu).apic_timer_irqs++;
690#endif
647 691
648 evt->event_handler(evt); 692 evt->event_handler(evt);
649} 693}
@@ -683,35 +727,6 @@ int setup_profiling_timer(unsigned int multiplier)
683} 727}
684 728
685/* 729/*
686 * Setup extended LVT, AMD specific (K8, family 10h)
687 *
688 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
689 * MCE interrupts are supported. Thus MCE offset must be set to 0.
690 */
691
692#define APIC_EILVT_LVTOFF_MCE 0
693#define APIC_EILVT_LVTOFF_IBS 1
694
695static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
696{
697 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
698 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
699 apic_write(reg, v);
700}
701
702u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
703{
704 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
705 return APIC_EILVT_LVTOFF_MCE;
706}
707
708u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
709{
710 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
711 return APIC_EILVT_LVTOFF_IBS;
712}
713
714/*
715 * Local APIC start and shutdown 730 * Local APIC start and shutdown
716 */ 731 */
717 732
@@ -756,7 +771,7 @@ void clear_local_APIC(void)
756 } 771 }
757 772
758 /* lets not touch this if we didn't frob it */ 773 /* lets not touch this if we didn't frob it */
759#ifdef CONFIG_X86_MCE_P4THERMAL 774#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
760 if (maxlvt >= 5) { 775 if (maxlvt >= 5) {
761 v = apic_read(APIC_LVTTHMR); 776 v = apic_read(APIC_LVTTHMR);
762 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 777 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -773,10 +788,6 @@ void clear_local_APIC(void)
773 if (maxlvt >= 4) 788 if (maxlvt >= 4)
774 apic_write(APIC_LVTPC, APIC_LVT_MASKED); 789 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
775 790
776#ifdef CONFIG_X86_MCE_P4THERMAL
777 if (maxlvt >= 5)
778 apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
779#endif
780 /* Integrated APIC (!82489DX) ? */ 791 /* Integrated APIC (!82489DX) ? */
781 if (lapic_is_integrated()) { 792 if (lapic_is_integrated()) {
782 if (maxlvt > 3) 793 if (maxlvt > 3)
@@ -791,7 +802,7 @@ void clear_local_APIC(void)
791 */ 802 */
792void disable_local_APIC(void) 803void disable_local_APIC(void)
793{ 804{
794 unsigned long value; 805 unsigned int value;
795 806
796 clear_local_APIC(); 807 clear_local_APIC();
797 808
@@ -803,6 +814,7 @@ void disable_local_APIC(void)
803 value &= ~APIC_SPIV_APIC_ENABLED; 814 value &= ~APIC_SPIV_APIC_ENABLED;
804 apic_write(APIC_SPIV, value); 815 apic_write(APIC_SPIV, value);
805 816
817#ifdef CONFIG_X86_32
806 /* 818 /*
807 * When LAPIC was disabled by the BIOS and enabled by the kernel, 819 * When LAPIC was disabled by the BIOS and enabled by the kernel,
808 * restore the disabled state. 820 * restore the disabled state.
@@ -814,6 +826,7 @@ void disable_local_APIC(void)
814 l &= ~MSR_IA32_APICBASE_ENABLE; 826 l &= ~MSR_IA32_APICBASE_ENABLE;
815 wrmsr(MSR_IA32_APICBASE, l, h); 827 wrmsr(MSR_IA32_APICBASE, l, h);
816 } 828 }
829#endif
817} 830}
818 831
819/* 832/*
@@ -830,11 +843,15 @@ void lapic_shutdown(void)
830 return; 843 return;
831 844
832 local_irq_save(flags); 845 local_irq_save(flags);
833 clear_local_APIC();
834 846
835 if (enabled_via_apicbase) 847#ifdef CONFIG_X86_32
848 if (!enabled_via_apicbase)
849 clear_local_APIC();
850 else
851#endif
836 disable_local_APIC(); 852 disable_local_APIC();
837 853
854
838 local_irq_restore(flags); 855 local_irq_restore(flags);
839} 856}
840 857
@@ -879,6 +896,12 @@ int __init verify_local_APIC(void)
879 */ 896 */
880 reg0 = apic_read(APIC_ID); 897 reg0 = apic_read(APIC_ID);
881 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 898 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
899 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
900 reg1 = apic_read(APIC_ID);
901 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
902 apic_write(APIC_ID, reg0);
903 if (reg1 != (reg0 ^ APIC_ID_MASK))
904 return 0;
882 905
883 /* 906 /*
884 * The next two are just to see if we have sane values. 907 * The next two are just to see if we have sane values.
@@ -904,14 +927,15 @@ void __init sync_Arb_IDs(void)
904 */ 927 */
905 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 928 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
906 return; 929 return;
930
907 /* 931 /*
908 * Wait for idle. 932 * Wait for idle.
909 */ 933 */
910 apic_wait_icr_idle(); 934 apic_wait_icr_idle();
911 935
912 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 936 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
913 apic_write(APIC_ICR, 937 apic_write(APIC_ICR, APIC_DEST_ALLINC |
914 APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); 938 APIC_INT_LEVELTRIG | APIC_DM_INIT);
915} 939}
916 940
917/* 941/*
@@ -919,7 +943,7 @@ void __init sync_Arb_IDs(void)
919 */ 943 */
920void __init init_bsp_APIC(void) 944void __init init_bsp_APIC(void)
921{ 945{
922 unsigned long value; 946 unsigned int value;
923 947
924 /* 948 /*
925 * Don't do the setup now if we have a SMP BIOS as the 949 * Don't do the setup now if we have a SMP BIOS as the
@@ -940,11 +964,13 @@ void __init init_bsp_APIC(void)
940 value &= ~APIC_VECTOR_MASK; 964 value &= ~APIC_VECTOR_MASK;
941 value |= APIC_SPIV_APIC_ENABLED; 965 value |= APIC_SPIV_APIC_ENABLED;
942 966
967#ifdef CONFIG_X86_32
943 /* This bit is reserved on P4/Xeon and should be cleared */ 968 /* This bit is reserved on P4/Xeon and should be cleared */
944 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && 969 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
945 (boot_cpu_data.x86 == 15)) 970 (boot_cpu_data.x86 == 15))
946 value &= ~APIC_SPIV_FOCUS_DISABLED; 971 value &= ~APIC_SPIV_FOCUS_DISABLED;
947 else 972 else
973#endif
948 value |= APIC_SPIV_FOCUS_DISABLED; 974 value |= APIC_SPIV_FOCUS_DISABLED;
949 value |= SPURIOUS_APIC_VECTOR; 975 value |= SPURIOUS_APIC_VECTOR;
950 apic_write(APIC_SPIV, value); 976 apic_write(APIC_SPIV, value);
@@ -963,6 +989,16 @@ static void __cpuinit lapic_setup_esr(void)
963{ 989{
964 unsigned long oldvalue, value, maxlvt; 990 unsigned long oldvalue, value, maxlvt;
965 if (lapic_is_integrated() && !esr_disable) { 991 if (lapic_is_integrated() && !esr_disable) {
992 if (esr_disable) {
993 /*
994 * Something untraceable is creating bad interrupts on
995 * secondary quads ... for the moment, just leave the
996 * ESR disabled - we can't do anything useful with the
997 * errors anyway - mbligh
998 */
999 printk(KERN_INFO "Leaving ESR disabled.\n");
1000 return;
1001 }
966 /* !82489DX */ 1002 /* !82489DX */
967 maxlvt = lapic_get_maxlvt(); 1003 maxlvt = lapic_get_maxlvt();
968 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1004 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
@@ -983,16 +1019,7 @@ static void __cpuinit lapic_setup_esr(void)
983 "vector: 0x%08lx after: 0x%08lx\n", 1019 "vector: 0x%08lx after: 0x%08lx\n",
984 oldvalue, value); 1020 oldvalue, value);
985 } else { 1021 } else {
986 if (esr_disable) 1022 printk(KERN_INFO "No ESR for 82489DX.\n");
987 /*
988 * Something untraceable is creating bad interrupts on
989 * secondary quads ... for the moment, just leave the
990 * ESR disabled - we can't do anything useful with the
991 * errors anyway - mbligh
992 */
993 printk(KERN_INFO "Leaving ESR disabled.\n");
994 else
995 printk(KERN_INFO "No ESR for 82489DX.\n");
996 } 1023 }
997} 1024}
998 1025
@@ -1130,13 +1157,17 @@ void __cpuinit setup_local_APIC(void)
1130 1157
1131void __cpuinit end_local_APIC_setup(void) 1158void __cpuinit end_local_APIC_setup(void)
1132{ 1159{
1133 unsigned long value;
1134
1135 lapic_setup_esr(); 1160 lapic_setup_esr();
1136 /* Disable the local apic timer */ 1161
1137 value = apic_read(APIC_LVTT); 1162#ifdef CONFIG_X86_32
1138 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1163 {
1139 apic_write(APIC_LVTT, value); 1164 unsigned int value;
1165 /* Disable the local apic timer */
1166 value = apic_read(APIC_LVTT);
1167 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1168 apic_write(APIC_LVTT, value);
1169 }
1170#endif
1140 1171
1141 setup_apic_nmi_watchdog(NULL); 1172 setup_apic_nmi_watchdog(NULL);
1142 apic_pm_activate(); 1173 apic_pm_activate();
@@ -1367,6 +1398,7 @@ void smp_error_interrupt(struct pt_regs *regs)
1367 */ 1398 */
1368void __init connect_bsp_APIC(void) 1399void __init connect_bsp_APIC(void)
1369{ 1400{
1401#ifdef CONFIG_X86_32
1370 if (pic_mode) { 1402 if (pic_mode) {
1371 /* 1403 /*
1372 * Do not trust the local APIC being empty at bootup. 1404 * Do not trust the local APIC being empty at bootup.
@@ -1381,6 +1413,7 @@ void __init connect_bsp_APIC(void)
1381 outb(0x70, 0x22); 1413 outb(0x70, 0x22);
1382 outb(0x01, 0x23); 1414 outb(0x01, 0x23);
1383 } 1415 }
1416#endif
1384 enable_apic_mode(); 1417 enable_apic_mode();
1385} 1418}
1386 1419
@@ -1393,6 +1426,9 @@ void __init connect_bsp_APIC(void)
1393 */ 1426 */
1394void disconnect_bsp_APIC(int virt_wire_setup) 1427void disconnect_bsp_APIC(int virt_wire_setup)
1395{ 1428{
1429 unsigned int value;
1430
1431#ifdef CONFIG_X86_32
1396 if (pic_mode) { 1432 if (pic_mode) {
1397 /* 1433 /*
1398 * Put the board back into PIC mode (has an effect only on 1434 * Put the board back into PIC mode (has an effect only on
@@ -1404,54 +1440,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1404 "entering PIC mode.\n"); 1440 "entering PIC mode.\n");
1405 outb(0x70, 0x22); 1441 outb(0x70, 0x22);
1406 outb(0x00, 0x23); 1442 outb(0x00, 0x23);
1407 } else { 1443 return;
1408 /* Go back to Virtual Wire compatibility mode */ 1444 }
1409 unsigned long value; 1445#endif
1410 1446
1411 /* For the spurious interrupt use vector F, and enable it */ 1447 /* Go back to Virtual Wire compatibility mode */
1412 value = apic_read(APIC_SPIV);
1413 value &= ~APIC_VECTOR_MASK;
1414 value |= APIC_SPIV_APIC_ENABLED;
1415 value |= 0xf;
1416 apic_write(APIC_SPIV, value);
1417 1448
1418 if (!virt_wire_setup) { 1449 /* For the spurious interrupt use vector F, and enable it */
1419 /* 1450 value = apic_read(APIC_SPIV);
1420 * For LVT0 make it edge triggered, active high, 1451 value &= ~APIC_VECTOR_MASK;
1421 * external and enabled 1452 value |= APIC_SPIV_APIC_ENABLED;
1422 */ 1453 value |= 0xf;
1423 value = apic_read(APIC_LVT0); 1454 apic_write(APIC_SPIV, value);
1424 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1425 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1426 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1427 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1428 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1429 apic_write(APIC_LVT0, value);
1430 } else {
1431 /* Disable LVT0 */
1432 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1433 }
1434 1455
1456 if (!virt_wire_setup) {
1435 /* 1457 /*
1436 * For LVT1 make it edge triggered, active high, nmi and 1458 * For LVT0 make it edge triggered, active high,
1437 * enabled 1459 * external and enabled
1438 */ 1460 */
1439 value = apic_read(APIC_LVT1); 1461 value = apic_read(APIC_LVT0);
1440 value &= ~( 1462 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1441 APIC_MODE_MASK | APIC_SEND_PENDING |
1442 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1463 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1443 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1464 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1444 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1465 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1445 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1466 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1446 apic_write(APIC_LVT1, value); 1467 apic_write(APIC_LVT0, value);
1468 } else {
1469 /* Disable LVT0 */
1470 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1447 } 1471 }
1472
1473 /*
1474 * For LVT1 make it edge triggered, active high,
1475 * nmi and enabled
1476 */
1477 value = apic_read(APIC_LVT1);
1478 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1479 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1480 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1481 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1482 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1483 apic_write(APIC_LVT1, value);
1448} 1484}
1449 1485
1450void __cpuinit generic_processor_info(int apicid, int version) 1486void __cpuinit generic_processor_info(int apicid, int version)
1451{ 1487{
1452 int cpu; 1488 int cpu;
1453 cpumask_t tmp_map; 1489 cpumask_t tmp_map;
1454 physid_mask_t phys_cpu;
1455 1490
1456 /* 1491 /*
1457 * Validate version 1492 * Validate version
@@ -1464,9 +1499,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1464 } 1499 }
1465 apic_version[apicid] = version; 1500 apic_version[apicid] = version;
1466 1501
1467 phys_cpu = apicid_to_cpu_present(apicid);
1468 physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
1469
1470 if (num_processors >= NR_CPUS) { 1502 if (num_processors >= NR_CPUS) {
1471 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1503 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1472 " Processor ignored.\n", NR_CPUS); 1504 " Processor ignored.\n", NR_CPUS);
@@ -1477,17 +1509,19 @@ void __cpuinit generic_processor_info(int apicid, int version)
1477 cpus_complement(tmp_map, cpu_present_map); 1509 cpus_complement(tmp_map, cpu_present_map);
1478 cpu = first_cpu(tmp_map); 1510 cpu = first_cpu(tmp_map);
1479 1511
1480 if (apicid == boot_cpu_physical_apicid) 1512 physid_set(apicid, phys_cpu_present_map);
1513 if (apicid == boot_cpu_physical_apicid) {
1481 /* 1514 /*
1482 * x86_bios_cpu_apicid is required to have processors listed 1515 * x86_bios_cpu_apicid is required to have processors listed
1483 * in same order as logical cpu numbers. Hence the first 1516 * in same order as logical cpu numbers. Hence the first
1484 * entry is BSP, and so on. 1517 * entry is BSP, and so on.
1485 */ 1518 */
1486 cpu = 0; 1519 cpu = 0;
1487 1520 }
1488 if (apicid > max_physical_apicid) 1521 if (apicid > max_physical_apicid)
1489 max_physical_apicid = apicid; 1522 max_physical_apicid = apicid;
1490 1523
1524#ifdef CONFIG_X86_32
1491 /* 1525 /*
1492 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y 1526 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1493 * but we need to work other dependencies like SMP_SUSPEND etc 1527 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1507,7 +1541,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
1507 def_to_bigsmp = 1; 1541 def_to_bigsmp = 1;
1508 } 1542 }
1509 } 1543 }
1510#ifdef CONFIG_SMP 1544#endif
1545
1546#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1511 /* are we being called early in kernel startup? */ 1547 /* are we being called early in kernel startup? */
1512 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1548 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1513 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 1549 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1520,6 +1556,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
1520 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1556 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1521 } 1557 }
1522#endif 1558#endif
1559
1523 cpu_set(cpu, cpu_possible_map); 1560 cpu_set(cpu, cpu_possible_map);
1524 cpu_set(cpu, cpu_present_map); 1561 cpu_set(cpu, cpu_present_map);
1525} 1562}
@@ -1530,6 +1567,11 @@ void __cpuinit generic_processor_info(int apicid, int version)
1530#ifdef CONFIG_PM 1567#ifdef CONFIG_PM
1531 1568
1532static struct { 1569static struct {
1570 /*
1571 * 'active' is true if the local APIC was enabled by us and
1572 * not the BIOS; this signifies that we are also responsible
1573 * for disabling it before entering apm/acpi suspend
1574 */
1533 int active; 1575 int active;
1534 /* r/w apic fields */ 1576 /* r/w apic fields */
1535 unsigned int apic_id; 1577 unsigned int apic_id;
@@ -1570,7 +1612,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1570 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1612 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1571 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1613 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1572 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1614 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1573#ifdef CONFIG_X86_MCE_P4THERMAL 1615#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1574 if (maxlvt >= 5) 1616 if (maxlvt >= 5)
1575 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1617 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1576#endif 1618#endif
@@ -1594,16 +1636,23 @@ static int lapic_resume(struct sys_device *dev)
1594 1636
1595 local_irq_save(flags); 1637 local_irq_save(flags);
1596 1638
1597 /* 1639#ifdef CONFIG_X86_64
1598 * Make sure the APICBASE points to the right address 1640 if (x2apic)
1599 * 1641 enable_x2apic();
1600 * FIXME! This will be wrong if we ever support suspend on 1642 else
1601 * SMP! We'll need to do this as part of the CPU restore! 1643#endif
1602 */ 1644 {
1603 rdmsr(MSR_IA32_APICBASE, l, h); 1645 /*
1604 l &= ~MSR_IA32_APICBASE_BASE; 1646 * Make sure the APICBASE points to the right address
1605 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 1647 *
1606 wrmsr(MSR_IA32_APICBASE, l, h); 1648 * FIXME! This will be wrong if we ever support suspend on
1649 * SMP! We'll need to do this as part of the CPU restore!
1650 */
1651 rdmsr(MSR_IA32_APICBASE, l, h);
1652 l &= ~MSR_IA32_APICBASE_BASE;
1653 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1654 wrmsr(MSR_IA32_APICBASE, l, h);
1655 }
1607 1656
1608 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 1657 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1609 apic_write(APIC_ID, apic_pm_state.apic_id); 1658 apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1613,7 +1662,7 @@ static int lapic_resume(struct sys_device *dev)
1613 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 1662 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1614 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 1663 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1615 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 1664 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1616#ifdef CONFIG_X86_MCE_P4THERMAL 1665#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1617 if (maxlvt >= 5) 1666 if (maxlvt >= 5)
1618 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 1667 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1619#endif 1668#endif
@@ -1627,7 +1676,9 @@ static int lapic_resume(struct sys_device *dev)
1627 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 1676 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1628 apic_write(APIC_ESR, 0); 1677 apic_write(APIC_ESR, 0);
1629 apic_read(APIC_ESR); 1678 apic_read(APIC_ESR);
1679
1630 local_irq_restore(flags); 1680 local_irq_restore(flags);
1681
1631 return 0; 1682 return 0;
1632} 1683}
1633 1684
@@ -1683,20 +1734,20 @@ static int __init parse_lapic(char *arg)
1683} 1734}
1684early_param("lapic", parse_lapic); 1735early_param("lapic", parse_lapic);
1685 1736
1686static int __init parse_nolapic(char *arg) 1737static int __init setup_disableapic(char *arg)
1687{ 1738{
1688 disable_apic = 1; 1739 disable_apic = 1;
1689 setup_clear_cpu_cap(X86_FEATURE_APIC); 1740 setup_clear_cpu_cap(X86_FEATURE_APIC);
1690 return 0; 1741 return 0;
1691} 1742}
1692early_param("nolapic", parse_nolapic); 1743early_param("disableapic", setup_disableapic);
1693 1744
1694static int __init parse_disable_lapic_timer(char *arg) 1745/* same as disableapic, for compatibility */
1746static int __init setup_nolapic(char *arg)
1695{ 1747{
1696 local_apic_timer_disabled = 1; 1748 return setup_disableapic(arg);
1697 return 0;
1698} 1749}
1699early_param("nolapic_timer", parse_disable_lapic_timer); 1750early_param("nolapic", setup_nolapic);
1700 1751
1701static int __init parse_lapic_timer_c2_ok(char *arg) 1752static int __init parse_lapic_timer_c2_ok(char *arg)
1702{ 1753{
@@ -1705,15 +1756,40 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1705} 1756}
1706early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1757early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1707 1758
1759static int __init parse_disable_apic_timer(char *arg)
1760{
1761 disable_apic_timer = 1;
1762 return 0;
1763}
1764early_param("noapictimer", parse_disable_apic_timer);
1765
1766static int __init parse_nolapic_timer(char *arg)
1767{
1768 disable_apic_timer = 1;
1769 return 0;
1770}
1771early_param("nolapic_timer", parse_nolapic_timer);
1772
1708static int __init apic_set_verbosity(char *arg) 1773static int __init apic_set_verbosity(char *arg)
1709{ 1774{
1710 if (!arg) 1775 if (!arg) {
1776#ifdef CONFIG_X86_64
1777 skip_ioapic_setup = 0;
1778 ioapic_force = 1;
1779 return 0;
1780#endif
1711 return -EINVAL; 1781 return -EINVAL;
1782 }
1712 1783
1713 if (strcmp(arg, "debug") == 0) 1784 if (strcmp("debug", arg) == 0)
1714 apic_verbosity = APIC_DEBUG; 1785 apic_verbosity = APIC_DEBUG;
1715 else if (strcmp(arg, "verbose") == 0) 1786 else if (strcmp("verbose", arg) == 0)
1716 apic_verbosity = APIC_VERBOSE; 1787 apic_verbosity = APIC_VERBOSE;
1788 else {
1789 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1790 " use apic=verbose or apic=debug\n", arg);
1791 return -EINVAL;
1792 }
1717 1793
1718 return 0; 1794 return 0;
1719} 1795}
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 1a6011855af3..53898b65a6ae 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -45,6 +45,7 @@
45#include <mach_ipi.h> 45#include <mach_ipi.h>
46#include <mach_apic.h> 46#include <mach_apic.h>
47 47
48/* Disable local APIC timer from the kernel commandline or via dmi quirk */
48static int disable_apic_timer __cpuinitdata; 49static int disable_apic_timer __cpuinitdata;
49static int apic_calibrate_pmtmr __initdata; 50static int apic_calibrate_pmtmr __initdata;
50int disable_apic; 51int disable_apic;
@@ -80,6 +81,9 @@ static void lapic_timer_setup(enum clock_event_mode mode,
80static void lapic_timer_broadcast(cpumask_t mask); 81static void lapic_timer_broadcast(cpumask_t mask);
81static void apic_pm_activate(void); 82static void apic_pm_activate(void);
82 83
84/*
85 * The local apic timer can be used for any function which is CPU local.
86 */
83static struct clock_event_device lapic_clockevent = { 87static struct clock_event_device lapic_clockevent = {
84 .name = "lapic", 88 .name = "lapic",
85 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT 89 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
@@ -106,11 +110,15 @@ static inline int lapic_get_version(void)
106} 110}
107 111
108/* 112/*
109 * Check, if the APIC is integrated or a seperate chip 113 * Check, if the APIC is integrated or a separate chip
110 */ 114 */
111static inline int lapic_is_integrated(void) 115static inline int lapic_is_integrated(void)
112{ 116{
117#ifdef CONFIG_X86_64
113 return 1; 118 return 1;
119#else
120 return APIC_INTEGRATED(lapic_get_version());
121#endif
114} 122}
115 123
116/* 124/*
@@ -125,6 +133,11 @@ static int modern_apic(void)
125 return lapic_get_version() >= 0x14; 133 return lapic_get_version() >= 0x14;
126} 134}
127 135
136/*
137 * Paravirt kernels also might be using these below ops. So we still
138 * use generic apic_read()/apic_write(), which might be pointing to different
139 * ops in PARAVIRT case.
140 */
128void xapic_wait_icr_idle(void) 141void xapic_wait_icr_idle(void)
129{ 142{
130 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 143 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -149,7 +162,7 @@ u32 safe_xapic_wait_icr_idle(void)
149 162
150void xapic_icr_write(u32 low, u32 id) 163void xapic_icr_write(u32 low, u32 id)
151{ 164{
152 apic_write(APIC_ICR2, id << 24); 165 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
153 apic_write(APIC_ICR, low); 166 apic_write(APIC_ICR, low);
154} 167}
155 168
@@ -160,7 +173,7 @@ u64 xapic_icr_read(void)
160 icr2 = apic_read(APIC_ICR2); 173 icr2 = apic_read(APIC_ICR2);
161 icr1 = apic_read(APIC_ICR); 174 icr1 = apic_read(APIC_ICR);
162 175
163 return (icr1 | ((u64)icr2 << 32)); 176 return icr1 | ((u64)icr2 << 32);
164} 177}
165 178
166static struct apic_ops xapic_ops = { 179static struct apic_ops xapic_ops = {
@@ -173,7 +186,6 @@ static struct apic_ops xapic_ops = {
173}; 186};
174 187
175struct apic_ops __read_mostly *apic_ops = &xapic_ops; 188struct apic_ops __read_mostly *apic_ops = &xapic_ops;
176
177EXPORT_SYMBOL_GPL(apic_ops); 189EXPORT_SYMBOL_GPL(apic_ops);
178 190
179static void x2apic_wait_icr_idle(void) 191static void x2apic_wait_icr_idle(void)
@@ -243,6 +255,17 @@ int lapic_get_maxlvt(void)
243} 255}
244 256
245/* 257/*
258 * Local APIC timer
259 */
260
261/* Clock divisor */
262#ifdef CONFG_X86_64
263#define APIC_DIVISOR 1
264#else
265#define APIC_DIVISOR 16
266#endif
267
268/*
246 * This function sets up the local APIC timer, with a timeout of 269 * This function sets up the local APIC timer, with a timeout of
247 * 'clocks' APIC bus clock. During calibration we actually call 270 * 'clocks' APIC bus clock. During calibration we actually call
248 * this function twice on the boot CPU, once with a bogus timeout 271 * this function twice on the boot CPU, once with a bogus timeout
@@ -252,7 +275,6 @@ int lapic_get_maxlvt(void)
252 * We do reads before writes even if unnecessary, to get around the 275 * We do reads before writes even if unnecessary, to get around the
253 * P5 APIC double write bug. 276 * P5 APIC double write bug.
254 */ 277 */
255
256static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 278static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
257{ 279{
258 unsigned int lvtt_value, tmp_value; 280 unsigned int lvtt_value, tmp_value;
@@ -260,6 +282,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
260 lvtt_value = LOCAL_TIMER_VECTOR; 282 lvtt_value = LOCAL_TIMER_VECTOR;
261 if (!oneshot) 283 if (!oneshot)
262 lvtt_value |= APIC_LVT_TIMER_PERIODIC; 284 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
285 if (!lapic_is_integrated())
286 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
287
263 if (!irqen) 288 if (!irqen)
264 lvtt_value |= APIC_LVT_MASKED; 289 lvtt_value |= APIC_LVT_MASKED;
265 290
@@ -269,12 +294,12 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
269 * Divide PICLK by 16 294 * Divide PICLK by 16
270 */ 295 */
271 tmp_value = apic_read(APIC_TDCR); 296 tmp_value = apic_read(APIC_TDCR);
272 apic_write(APIC_TDCR, (tmp_value 297 apic_write(APIC_TDCR,
273 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 298 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
274 | APIC_TDR_DIV_16); 299 APIC_TDR_DIV_16);
275 300
276 if (!oneshot) 301 if (!oneshot)
277 apic_write(APIC_TMICT, clocks); 302 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
278} 303}
279 304
280/* 305/*
@@ -444,7 +469,7 @@ static int __init calibrate_APIC_clock(void)
444 lapic_clockevent.min_delta_ns = 469 lapic_clockevent.min_delta_ns =
445 clockevent_delta2ns(0xF, &lapic_clockevent); 470 clockevent_delta2ns(0xF, &lapic_clockevent);
446 471
447 calibration_result = result / HZ; 472 calibration_result = (result * APIC_DIVISOR) / HZ;
448 473
449 /* 474 /*
450 * Do a sanity check on the APIC calibration result 475 * Do a sanity check on the APIC calibration result
@@ -466,10 +491,10 @@ static int __init calibrate_APIC_clock(void)
466void __init setup_boot_APIC_clock(void) 491void __init setup_boot_APIC_clock(void)
467{ 492{
468 /* 493 /*
469 * The local apic timer can be disabled via the kernel commandline. 494 * The local apic timer can be disabled via the kernel
470 * Register the lapic timer as a dummy clock event source on SMP 495 * commandline or from the CPU detection code. Register the lapic
471 * systems, so the broadcast mechanism is used. On UP systems simply 496 * timer as a dummy clock event source on SMP systems, so the
472 * ignore it. 497 * broadcast mechanism is used. On UP systems simply ignore it.
473 */ 498 */
474 if (disable_apic_timer) { 499 if (disable_apic_timer) {
475 printk(KERN_INFO "Disabling APIC timer\n"); 500 printk(KERN_INFO "Disabling APIC timer\n");
@@ -481,7 +506,9 @@ void __init setup_boot_APIC_clock(void)
481 return; 506 return;
482 } 507 }
483 508
484 printk(KERN_INFO "Using local APIC timer interrupts.\n"); 509 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
510 "calibrating APIC timer ...\n");
511
485 if (calibrate_APIC_clock()) { 512 if (calibrate_APIC_clock()) {
486 /* No broadcast on UP ! */ 513 /* No broadcast on UP ! */
487 if (num_possible_cpus() > 1) 514 if (num_possible_cpus() > 1)
@@ -500,6 +527,7 @@ void __init setup_boot_APIC_clock(void)
500 printk(KERN_WARNING "APIC timer registered as dummy," 527 printk(KERN_WARNING "APIC timer registered as dummy,"
501 " due to nmi_watchdog=%d!\n", nmi_watchdog); 528 " due to nmi_watchdog=%d!\n", nmi_watchdog);
502 529
530 /* Setup the lapic or request the broadcast */
503 setup_APIC_timer(); 531 setup_APIC_timer();
504} 532}
505 533
@@ -538,7 +566,11 @@ static void local_apic_timer_interrupt(void)
538 /* 566 /*
539 * the NMI deadlock-detector uses this. 567 * the NMI deadlock-detector uses this.
540 */ 568 */
569#ifdef CONFIG_X86_64
541 add_pda(apic_timer_irqs, 1); 570 add_pda(apic_timer_irqs, 1);
571#else
572 per_cpu(irq_stat, cpu).apic_timer_irqs++;
573#endif
542 574
543 evt->event_handler(evt); 575 evt->event_handler(evt);
544} 576}
@@ -569,6 +601,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
569 irq_enter(); 601 irq_enter();
570 local_apic_timer_interrupt(); 602 local_apic_timer_interrupt();
571 irq_exit(); 603 irq_exit();
604
572 set_irq_regs(old_regs); 605 set_irq_regs(old_regs);
573} 606}
574 607
@@ -622,6 +655,13 @@ void clear_local_APIC(void)
622 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); 655 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
623 } 656 }
624 657
658 /* lets not touch this if we didn't frob it */
659#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
660 if (maxlvt >= 5) {
661 v = apic_read(APIC_LVTTHMR);
662 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
663 }
664#endif
625 /* 665 /*
626 * Clean APIC state for other OSs: 666 * Clean APIC state for other OSs:
627 */ 667 */
@@ -632,8 +672,14 @@ void clear_local_APIC(void)
632 apic_write(APIC_LVTERR, APIC_LVT_MASKED); 672 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
633 if (maxlvt >= 4) 673 if (maxlvt >= 4)
634 apic_write(APIC_LVTPC, APIC_LVT_MASKED); 674 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
635 apic_write(APIC_ESR, 0); 675
636 apic_read(APIC_ESR); 676 /* Integrated APIC (!82489DX) ? */
677 if (lapic_is_integrated()) {
678 if (maxlvt > 3)
679 /* Clear ESR due to Pentium errata 3AP and 11AP */
680 apic_write(APIC_ESR, 0);
681 apic_read(APIC_ESR);
682 }
637} 683}
638 684
639/** 685/**
@@ -652,8 +698,28 @@ void disable_local_APIC(void)
652 value = apic_read(APIC_SPIV); 698 value = apic_read(APIC_SPIV);
653 value &= ~APIC_SPIV_APIC_ENABLED; 699 value &= ~APIC_SPIV_APIC_ENABLED;
654 apic_write(APIC_SPIV, value); 700 apic_write(APIC_SPIV, value);
701
702#ifdef CONFIG_X86_32
703 /*
704 * When LAPIC was disabled by the BIOS and enabled by the kernel,
705 * restore the disabled state.
706 */
707 if (enabled_via_apicbase) {
708 unsigned int l, h;
709
710 rdmsr(MSR_IA32_APICBASE, l, h);
711 l &= ~MSR_IA32_APICBASE_ENABLE;
712 wrmsr(MSR_IA32_APICBASE, l, h);
713 }
714#endif
655} 715}
656 716
717/*
718 * If Linux enabled the LAPIC against the BIOS default disable it down before
719 * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
720 * not power-off. Additionally clear all LVT entries before disable_local_APIC
721 * for the case where Linux didn't enable the LAPIC.
722 */
657void lapic_shutdown(void) 723void lapic_shutdown(void)
658{ 724{
659 unsigned long flags; 725 unsigned long flags;
@@ -663,7 +729,13 @@ void lapic_shutdown(void)
663 729
664 local_irq_save(flags); 730 local_irq_save(flags);
665 731
666 disable_local_APIC(); 732#ifdef CONFIG_X86_32
733 if (!enabled_via_apicbase)
734 clear_local_APIC();
735 else
736#endif
737 disable_local_APIC();
738
667 739
668 local_irq_restore(flags); 740 local_irq_restore(flags);
669} 741}
@@ -734,8 +806,11 @@ int __init verify_local_APIC(void)
734 */ 806 */
735void __init sync_Arb_IDs(void) 807void __init sync_Arb_IDs(void)
736{ 808{
737 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ 809 /*
738 if (modern_apic()) 810 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
811 * needed on AMD.
812 */
813 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
739 return; 814 return;
740 815
741 /* 816 /*
@@ -744,8 +819,8 @@ void __init sync_Arb_IDs(void)
744 apic_wait_icr_idle(); 819 apic_wait_icr_idle();
745 820
746 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 821 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
747 apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 822 apic_write(APIC_ICR, APIC_DEST_ALLINC |
748 | APIC_DM_INIT); 823 APIC_INT_LEVELTRIG | APIC_DM_INIT);
749} 824}
750 825
751/* 826/*
@@ -762,8 +837,6 @@ void __init init_bsp_APIC(void)
762 if (smp_found_config || !cpu_has_apic) 837 if (smp_found_config || !cpu_has_apic)
763 return; 838 return;
764 839
765 value = apic_read(APIC_LVR);
766
767 /* 840 /*
768 * Do not trust the local APIC being empty at bootup. 841 * Do not trust the local APIC being empty at bootup.
769 */ 842 */
@@ -775,7 +848,15 @@ void __init init_bsp_APIC(void)
775 value = apic_read(APIC_SPIV); 848 value = apic_read(APIC_SPIV);
776 value &= ~APIC_VECTOR_MASK; 849 value &= ~APIC_VECTOR_MASK;
777 value |= APIC_SPIV_APIC_ENABLED; 850 value |= APIC_SPIV_APIC_ENABLED;
778 value |= APIC_SPIV_FOCUS_DISABLED; 851
852#ifdef CONFIG_X86_32
853 /* This bit is reserved on P4/Xeon and should be cleared */
854 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
855 (boot_cpu_data.x86 == 15))
856 value &= ~APIC_SPIV_FOCUS_DISABLED;
857 else
858#endif
859 value |= APIC_SPIV_FOCUS_DISABLED;
779 value |= SPURIOUS_APIC_VECTOR; 860 value |= SPURIOUS_APIC_VECTOR;
780 apic_write(APIC_SPIV, value); 861 apic_write(APIC_SPIV, value);
781 862
@@ -784,9 +865,50 @@ void __init init_bsp_APIC(void)
784 */ 865 */
785 apic_write(APIC_LVT0, APIC_DM_EXTINT); 866 apic_write(APIC_LVT0, APIC_DM_EXTINT);
786 value = APIC_DM_NMI; 867 value = APIC_DM_NMI;
868 if (!lapic_is_integrated()) /* 82489DX */
869 value |= APIC_LVT_LEVEL_TRIGGER;
787 apic_write(APIC_LVT1, value); 870 apic_write(APIC_LVT1, value);
788} 871}
789 872
873static void __cpuinit lapic_setup_esr(void)
874{
875 unsigned long oldvalue, value, maxlvt;
876 if (lapic_is_integrated() && !esr_disable) {
877 if (esr_disable) {
878 /*
879 * Something untraceable is creating bad interrupts on
880 * secondary quads ... for the moment, just leave the
881 * ESR disabled - we can't do anything useful with the
882 * errors anyway - mbligh
883 */
884 printk(KERN_INFO "Leaving ESR disabled.\n");
885 return;
886 }
887 /* !82489DX */
888 maxlvt = lapic_get_maxlvt();
889 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
890 apic_write(APIC_ESR, 0);
891 oldvalue = apic_read(APIC_ESR);
892
893 /* enables sending errors */
894 value = ERROR_APIC_VECTOR;
895 apic_write(APIC_LVTERR, value);
896 /*
897 * spec says clear errors after enabling vector.
898 */
899 if (maxlvt > 3)
900 apic_write(APIC_ESR, 0);
901 value = apic_read(APIC_ESR);
902 if (value != oldvalue)
903 apic_printk(APIC_VERBOSE, "ESR value before enabling "
904 "vector: 0x%08lx after: 0x%08lx\n",
905 oldvalue, value);
906 } else {
907 printk(KERN_INFO "No ESR for 82489DX.\n");
908 }
909}
910
911
790/** 912/**
791 * setup_local_APIC - setup the local APIC 913 * setup_local_APIC - setup the local APIC
792 */ 914 */
@@ -892,21 +1014,20 @@ void __cpuinit setup_local_APIC(void)
892 preempt_enable(); 1014 preempt_enable();
893} 1015}
894 1016
895static void __cpuinit lapic_setup_esr(void)
896{
897 unsigned maxlvt = lapic_get_maxlvt();
898
899 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
900 /*
901 * spec says clear errors after enabling vector.
902 */
903 if (maxlvt > 3)
904 apic_write(APIC_ESR, 0);
905}
906
907void __cpuinit end_local_APIC_setup(void) 1017void __cpuinit end_local_APIC_setup(void)
908{ 1018{
909 lapic_setup_esr(); 1019 lapic_setup_esr();
1020
1021#ifdef CONFIG_X86_32
1022 {
1023 unsigned int value;
1024 /* Disable the local apic timer */
1025 value = apic_read(APIC_LVTT);
1026 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1027 apic_write(APIC_LVTT, value);
1028 }
1029#endif
1030
910 setup_apic_nmi_watchdog(NULL); 1031 setup_apic_nmi_watchdog(NULL);
911 apic_pm_activate(); 1032 apic_pm_activate();
912} 1033}
@@ -1108,6 +1229,8 @@ void __init init_apic_mappings(void)
1108 * This initializes the IO-APIC and APIC hardware if this is 1229 * This initializes the IO-APIC and APIC hardware if this is
1109 * a UP kernel. 1230 * a UP kernel.
1110 */ 1231 */
1232int apic_version[MAX_APICS];
1233
1111int __init APIC_init_uniprocessor(void) 1234int __init APIC_init_uniprocessor(void)
1112{ 1235{
1113 if (disable_apic) { 1236 if (disable_apic) {
@@ -1209,17 +1332,57 @@ asmlinkage void smp_error_interrupt(void)
1209} 1332}
1210 1333
1211/** 1334/**
1212 * * connect_bsp_APIC - attach the APIC to the interrupt system 1335 * connect_bsp_APIC - attach the APIC to the interrupt system
1213 * */ 1336 */
1214void __init connect_bsp_APIC(void) 1337void __init connect_bsp_APIC(void)
1215{ 1338{
1339#ifdef CONFIG_X86_32
1340 if (pic_mode) {
1341 /*
1342 * Do not trust the local APIC being empty at bootup.
1343 */
1344 clear_local_APIC();
1345 /*
1346 * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
1347 * local APIC to INT and NMI lines.
1348 */
1349 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1350 "enabling APIC mode.\n");
1351 outb(0x70, 0x22);
1352 outb(0x01, 0x23);
1353 }
1354#endif
1216 enable_apic_mode(); 1355 enable_apic_mode();
1217} 1356}
1218 1357
1358/**
1359 * disconnect_bsp_APIC - detach the APIC from the interrupt system
1360 * @virt_wire_setup: indicates, whether virtual wire mode is selected
1361 *
1362 * Virtual wire mode is necessary to deliver legacy interrupts even when the
1363 * APIC is disabled.
1364 */
1219void disconnect_bsp_APIC(int virt_wire_setup) 1365void disconnect_bsp_APIC(int virt_wire_setup)
1220{ 1366{
1367 unsigned int value;
1368
1369#ifdef CONFIG_X86_32
1370 if (pic_mode) {
1371 /*
1372 * Put the board back into PIC mode (has an effect only on
1373 * certain older boards). Note that APIC interrupts, including
1374 * IPIs, won't work beyond this point! The only exception are
1375 * INIT IPIs.
1376 */
1377 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1378 "entering PIC mode.\n");
1379 outb(0x70, 0x22);
1380 outb(0x00, 0x23);
1381 return;
1382 }
1383#endif
1384
1221 /* Go back to Virtual Wire compatibility mode */ 1385 /* Go back to Virtual Wire compatibility mode */
1222 unsigned long value;
1223 1386
1224 /* For the spurious interrupt use vector F, and enable it */ 1387 /* For the spurious interrupt use vector F, and enable it */
1225 value = apic_read(APIC_SPIV); 1388 value = apic_read(APIC_SPIV);
@@ -1245,7 +1408,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1245 apic_write(APIC_LVT0, APIC_LVT_MASKED); 1408 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1246 } 1409 }
1247 1410
1248 /* For LVT1 make it edge triggered, active high, nmi and enabled */ 1411 /*
1412 * For LVT1 make it edge triggered, active high,
1413 * nmi and enabled
1414 */
1249 value = apic_read(APIC_LVT1); 1415 value = apic_read(APIC_LVT1);
1250 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | 1416 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1251 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1417 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
@@ -1260,9 +1426,20 @@ void __cpuinit generic_processor_info(int apicid, int version)
1260 int cpu; 1426 int cpu;
1261 cpumask_t tmp_map; 1427 cpumask_t tmp_map;
1262 1428
1429 /*
1430 * Validate version
1431 */
1432 if (version == 0x0) {
1433 printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
1434 "fixing up to 0x10. (tell your hw vendor)\n",
1435 version);
1436 version = 0x10;
1437 }
1438 apic_version[apicid] = version;
1439
1263 if (num_processors >= NR_CPUS) { 1440 if (num_processors >= NR_CPUS) {
1264 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1441 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1265 " Processor ignored.\n", NR_CPUS); 1442 " Processor ignored.\n", NR_CPUS);
1266 return; 1443 return;
1267 } 1444 }
1268 1445
@@ -1282,6 +1459,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
1282 if (apicid > max_physical_apicid) 1459 if (apicid > max_physical_apicid)
1283 max_physical_apicid = apicid; 1460 max_physical_apicid = apicid;
1284 1461
1462#ifdef CONFIG_X86_32
1463 /*
1464 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1465 * but we need to work other dependencies like SMP_SUSPEND etc
1466 * before this can be done without some confusion.
1467 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
1468 * - Ashok Raj <ashok.raj@intel.com>
1469 */
1470 if (max_physical_apicid >= 8) {
1471 switch (boot_cpu_data.x86_vendor) {
1472 case X86_VENDOR_INTEL:
1473 if (!APIC_XAPIC(version)) {
1474 def_to_bigsmp = 0;
1475 break;
1476 }
1477 /* If P4 and above fall through */
1478 case X86_VENDOR_AMD:
1479 def_to_bigsmp = 1;
1480 }
1481 }
1482#endif
1483
1484#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1285 /* are we being called early in kernel startup? */ 1485 /* are we being called early in kernel startup? */
1286 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1486 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1287 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 1487 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1293,6 +1493,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
1293 per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1493 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1294 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1494 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1295 } 1495 }
1496#endif
1296 1497
1297 cpu_set(cpu, cpu_possible_map); 1498 cpu_set(cpu, cpu_possible_map);
1298 cpu_set(cpu, cpu_present_map); 1499 cpu_set(cpu, cpu_present_map);
@@ -1309,9 +1510,11 @@ int hard_smp_processor_id(void)
1309#ifdef CONFIG_PM 1510#ifdef CONFIG_PM
1310 1511
1311static struct { 1512static struct {
1312 /* 'active' is true if the local APIC was enabled by us and 1513 /*
1313 not the BIOS; this signifies that we are also responsible 1514 * 'active' is true if the local APIC was enabled by us and
1314 for disabling it before entering apm/acpi suspend */ 1515 * not the BIOS; this signifies that we are also responsible
1516 * for disabling it before entering apm/acpi suspend
1517 */
1315 int active; 1518 int active;
1316 /* r/w apic fields */ 1519 /* r/w apic fields */
1317 unsigned int apic_id; 1520 unsigned int apic_id;
@@ -1352,10 +1555,11 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1352 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1555 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1353 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1556 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1354 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1557 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1355#ifdef CONFIG_X86_MCE_INTEL 1558#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1356 if (maxlvt >= 5) 1559 if (maxlvt >= 5)
1357 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1560 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1358#endif 1561#endif
1562
1359 local_irq_save(flags); 1563 local_irq_save(flags);
1360 disable_local_APIC(); 1564 disable_local_APIC();
1361 local_irq_restore(flags); 1565 local_irq_restore(flags);
@@ -1374,13 +1578,24 @@ static int lapic_resume(struct sys_device *dev)
1374 maxlvt = lapic_get_maxlvt(); 1578 maxlvt = lapic_get_maxlvt();
1375 1579
1376 local_irq_save(flags); 1580 local_irq_save(flags);
1377 if (!x2apic) { 1581
1582#ifdef CONFIG_X86_64
1583 if (x2apic)
1584 enable_x2apic();
1585 else
1586#endif
1587 {
1588 /*
1589 * Make sure the APICBASE points to the right address
1590 *
1591 * FIXME! This will be wrong if we ever support suspend on
1592 * SMP! We'll need to do this as part of the CPU restore!
1593 */
1378 rdmsr(MSR_IA32_APICBASE, l, h); 1594 rdmsr(MSR_IA32_APICBASE, l, h);
1379 l &= ~MSR_IA32_APICBASE_BASE; 1595 l &= ~MSR_IA32_APICBASE_BASE;
1380 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 1596 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1381 wrmsr(MSR_IA32_APICBASE, l, h); 1597 wrmsr(MSR_IA32_APICBASE, l, h);
1382 } else 1598 }
1383 enable_x2apic();
1384 1599
1385 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 1600 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1386 apic_write(APIC_ID, apic_pm_state.apic_id); 1601 apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1390,7 +1605,7 @@ static int lapic_resume(struct sys_device *dev)
1390 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 1605 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1391 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 1606 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1392 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 1607 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1393#ifdef CONFIG_X86_MCE_INTEL 1608#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1394 if (maxlvt >= 5) 1609 if (maxlvt >= 5)
1395 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 1610 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1396#endif 1611#endif
@@ -1404,10 +1619,17 @@ static int lapic_resume(struct sys_device *dev)
1404 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 1619 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1405 apic_write(APIC_ESR, 0); 1620 apic_write(APIC_ESR, 0);
1406 apic_read(APIC_ESR); 1621 apic_read(APIC_ESR);
1622
1407 local_irq_restore(flags); 1623 local_irq_restore(flags);
1624
1408 return 0; 1625 return 0;
1409} 1626}
1410 1627
1628/*
1629 * This device has no shutdown method - fully functioning local APICs
1630 * are needed on every CPU up until machine_halt/restart/poweroff.
1631 */
1632
1411static struct sysdev_class lapic_sysclass = { 1633static struct sysdev_class lapic_sysclass = {
1412 .name = "lapic", 1634 .name = "lapic",
1413 .resume = lapic_resume, 1635 .resume = lapic_resume,
@@ -1533,28 +1755,7 @@ early_param("nox2apic", setup_nox2apic);
1533/* 1755/*
1534 * APIC command line parameters 1756 * APIC command line parameters
1535 */ 1757 */
1536static int __init apic_set_verbosity(char *str) 1758static int __init setup_disableapic(char *arg)
1537{
1538 if (str == NULL) {
1539 skip_ioapic_setup = 0;
1540 ioapic_force = 1;
1541 return 0;
1542 }
1543 if (strcmp("debug", str) == 0)
1544 apic_verbosity = APIC_DEBUG;
1545 else if (strcmp("verbose", str) == 0)
1546 apic_verbosity = APIC_VERBOSE;
1547 else {
1548 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1549 " use apic=verbose or apic=debug\n", str);
1550 return -EINVAL;
1551 }
1552
1553 return 0;
1554}
1555early_param("apic", apic_set_verbosity);
1556
1557static __init int setup_disableapic(char *str)
1558{ 1759{
1559 disable_apic = 1; 1760 disable_apic = 1;
1560 setup_clear_cpu_cap(X86_FEATURE_APIC); 1761 setup_clear_cpu_cap(X86_FEATURE_APIC);
@@ -1563,9 +1764,9 @@ static __init int setup_disableapic(char *str)
1563early_param("disableapic", setup_disableapic); 1764early_param("disableapic", setup_disableapic);
1564 1765
1565/* same as disableapic, for compatibility */ 1766/* same as disableapic, for compatibility */
1566static __init int setup_nolapic(char *str) 1767static int __init setup_nolapic(char *arg)
1567{ 1768{
1568 return setup_disableapic(str); 1769 return setup_disableapic(arg);
1569} 1770}
1570early_param("nolapic", setup_nolapic); 1771early_param("nolapic", setup_nolapic);
1571 1772
@@ -1576,14 +1777,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1576} 1777}
1577early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1778early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1578 1779
1579static __init int setup_noapictimer(char *str) 1780static int __init parse_disable_apic_timer(char *arg)
1580{ 1781{
1581 if (str[0] != ' ' && str[0] != 0)
1582 return 0;
1583 disable_apic_timer = 1; 1782 disable_apic_timer = 1;
1584 return 1; 1783 return 0;
1784}
1785early_param("noapictimer", parse_disable_apic_timer);
1786
1787static int __init parse_nolapic_timer(char *arg)
1788{
1789 disable_apic_timer = 1;
1790 return 0;
1585} 1791}
1586__setup("noapictimer", setup_noapictimer); 1792early_param("nolapic_timer", parse_nolapic_timer);
1587 1793
1588static __init int setup_apicpmtimer(char *s) 1794static __init int setup_apicpmtimer(char *s)
1589{ 1795{
@@ -1593,6 +1799,31 @@ static __init int setup_apicpmtimer(char *s)
1593} 1799}
1594__setup("apicpmtimer", setup_apicpmtimer); 1800__setup("apicpmtimer", setup_apicpmtimer);
1595 1801
1802static int __init apic_set_verbosity(char *arg)
1803{
1804 if (!arg) {
1805#ifdef CONFIG_X86_64
1806 skip_ioapic_setup = 0;
1807 ioapic_force = 1;
1808 return 0;
1809#endif
1810 return -EINVAL;
1811 }
1812
1813 if (strcmp("debug", arg) == 0)
1814 apic_verbosity = APIC_DEBUG;
1815 else if (strcmp("verbose", arg) == 0)
1816 apic_verbosity = APIC_VERBOSE;
1817 else {
1818 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1819 " use apic=verbose or apic=debug\n", arg);
1820 return -EINVAL;
1821 }
1822
1823 return 0;
1824}
1825early_param("apic", apic_set_verbosity);
1826
1596static int __init lapic_insert_resource(void) 1827static int __init lapic_insert_resource(void)
1597{ 1828{
1598 if (!apic_phys) 1829 if (!apic_phys)
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 9ee24e6bc4b0..5145a6e72bbb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,12 +228,12 @@
228#include <linux/suspend.h> 228#include <linux/suspend.h>
229#include <linux/kthread.h> 229#include <linux/kthread.h>
230#include <linux/jiffies.h> 230#include <linux/jiffies.h>
231#include <linux/smp_lock.h>
232 231
233#include <asm/system.h> 232#include <asm/system.h>
234#include <asm/uaccess.h> 233#include <asm/uaccess.h>
235#include <asm/desc.h> 234#include <asm/desc.h>
236#include <asm/i8253.h> 235#include <asm/i8253.h>
236#include <asm/olpc.h>
237#include <asm/paravirt.h> 237#include <asm/paravirt.h>
238#include <asm/reboot.h> 238#include <asm/reboot.h>
239 239
@@ -2217,7 +2217,7 @@ static int __init apm_init(void)
2217 2217
2218 dmi_check_system(apm_dmi_table); 2218 dmi_check_system(apm_dmi_table);
2219 2219
2220 if (apm_info.bios.version == 0 || paravirt_enabled()) { 2220 if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
2221 printk(KERN_INFO "apm: BIOS not found.\n"); 2221 printk(KERN_INFO "apm: BIOS not found.\n");
2222 return -ENODEV; 2222 return -ENODEV;
2223 } 2223 }
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index c639bd55391c..fdd585f9c53d 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -25,11 +25,11 @@ x86_bios_strerror(long status)
25{ 25{
26 const char *str; 26 const char *str;
27 switch (status) { 27 switch (status) {
28 case 0: str = "Call completed without error"; break; 28 case 0: str = "Call completed without error"; break;
29 case -1: str = "Not implemented"; break; 29 case -1: str = "Not implemented"; break;
30 case -2: str = "Invalid argument"; break; 30 case -2: str = "Invalid argument"; break;
31 case -3: str = "Call completed with error"; break; 31 case -3: str = "Call completed with error"; break;
32 default: str = "Unknown BIOS status code"; break; 32 default: str = "Unknown BIOS status code"; break;
33 } 33 }
34 return str; 34 return str;
35} 35}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 403e689df0b8..7f0b45a5d788 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,15 +3,13 @@
3# 3#
4 4
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 5obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += proc.o capflags.o powerflags.o 6obj-y += proc.o capflags.o powerflags.o common.o
7 7
8obj-$(CONFIG_X86_32) += common.o bugs.o cmpxchg.o 8obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
9obj-$(CONFIG_X86_64) += common_64.o bugs_64.o 9obj-$(CONFIG_X86_64) += bugs_64.o
10 10
11obj-$(CONFIG_CPU_SUP_INTEL_32) += intel.o 11obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
12obj-$(CONFIG_CPU_SUP_INTEL_64) += intel_64.o 12obj-$(CONFIG_CPU_SUP_AMD) += amd.o
13obj-$(CONFIG_CPU_SUP_AMD_32) += amd.o
14obj-$(CONFIG_CPU_SUP_AMD_64) += amd_64.o
15obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 13obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
16obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o 14obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o
17obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o 15obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index d64ea6097ca7..32e73520adf7 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,13 +1,22 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/bitops.h> 2#include <linux/bitops.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4
4#include <asm/io.h> 5#include <asm/io.h>
5#include <asm/processor.h> 6#include <asm/processor.h>
6#include <asm/apic.h> 7#include <asm/apic.h>
7 8
9#ifdef CONFIG_X86_64
10# include <asm/numa_64.h>
11# include <asm/mmconfig.h>
12# include <asm/cacheflush.h>
13#endif
14
8#include <mach_apic.h> 15#include <mach_apic.h>
16
9#include "cpu.h" 17#include "cpu.h"
10 18
19#ifdef CONFIG_X86_32
11/* 20/*
12 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause 21 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
13 * misexecution of code under Linux. Owners of such processors should 22 * misexecution of code under Linux. Owners of such processors should
@@ -24,26 +33,273 @@
24extern void vide(void); 33extern void vide(void);
25__asm__(".align 4\nvide: ret"); 34__asm__(".align 4\nvide: ret");
26 35
27static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 36static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
28{ 37{
29 if (cpuid_eax(0x80000000) >= 0x80000007) { 38/*
30 c->x86_power = cpuid_edx(0x80000007); 39 * General Systems BIOSen alias the cpu frequency registers
31 if (c->x86_power & (1<<8)) 40 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
32 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 41 * drivers subsequently pokes it, and changes the CPU speed.
42 * Workaround : Remove the unneeded alias.
43 */
44#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
45#define CBAR_ENB (0x80000000)
46#define CBAR_KEY (0X000000CB)
47 if (c->x86_model == 9 || c->x86_model == 10) {
48 if (inl (CBAR) & CBAR_ENB)
49 outl (0 | CBAR_KEY, CBAR);
33 } 50 }
34
35 /* Set MTRR capability flag if appropriate */
36 if (c->x86_model == 13 || c->x86_model == 9 ||
37 (c->x86_model == 8 && c->x86_mask >= 8))
38 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
39} 51}
40 52
41static void __cpuinit init_amd(struct cpuinfo_x86 *c) 53
54static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
42{ 55{
43 u32 l, h; 56 u32 l, h;
44 int mbytes = num_physpages >> (20-PAGE_SHIFT); 57 int mbytes = num_physpages >> (20-PAGE_SHIFT);
45 int r;
46 58
59 if (c->x86_model < 6) {
60 /* Based on AMD doc 20734R - June 2000 */
61 if (c->x86_model == 0) {
62 clear_cpu_cap(c, X86_FEATURE_APIC);
63 set_cpu_cap(c, X86_FEATURE_PGE);
64 }
65 return;
66 }
67
68 if (c->x86_model == 6 && c->x86_mask == 1) {
69 const int K6_BUG_LOOP = 1000000;
70 int n;
71 void (*f_vide)(void);
72 unsigned long d, d2;
73
74 printk(KERN_INFO "AMD K6 stepping B detected - ");
75
76 /*
77 * It looks like AMD fixed the 2.6.2 bug and improved indirect
78 * calls at the same time.
79 */
80
81 n = K6_BUG_LOOP;
82 f_vide = vide;
83 rdtscl(d);
84 while (n--)
85 f_vide();
86 rdtscl(d2);
87 d = d2-d;
88
89 if (d > 20*K6_BUG_LOOP)
90 printk("system stability may be impaired when more than 32 MB are used.\n");
91 else
92 printk("probably OK (after B9730xxxx).\n");
93 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
94 }
95
96 /* K6 with old style WHCR */
97 if (c->x86_model < 8 ||
98 (c->x86_model == 8 && c->x86_mask < 8)) {
99 /* We can only write allocate on the low 508Mb */
100 if (mbytes > 508)
101 mbytes = 508;
102
103 rdmsr(MSR_K6_WHCR, l, h);
104 if ((l&0x0000FFFF) == 0) {
105 unsigned long flags;
106 l = (1<<0)|((mbytes/4)<<1);
107 local_irq_save(flags);
108 wbinvd();
109 wrmsr(MSR_K6_WHCR, l, h);
110 local_irq_restore(flags);
111 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
112 mbytes);
113 }
114 return;
115 }
116
117 if ((c->x86_model == 8 && c->x86_mask > 7) ||
118 c->x86_model == 9 || c->x86_model == 13) {
119 /* The more serious chips .. */
120
121 if (mbytes > 4092)
122 mbytes = 4092;
123
124 rdmsr(MSR_K6_WHCR, l, h);
125 if ((l&0xFFFF0000) == 0) {
126 unsigned long flags;
127 l = ((mbytes>>2)<<22)|(1<<16);
128 local_irq_save(flags);
129 wbinvd();
130 wrmsr(MSR_K6_WHCR, l, h);
131 local_irq_restore(flags);
132 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
133 mbytes);
134 }
135
136 return;
137 }
138
139 if (c->x86_model == 10) {
140 /* AMD Geode LX is model 10 */
141 /* placeholder for any needed mods */
142 return;
143 }
144}
145
146static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
147{
148 u32 l, h;
149
150 /*
151 * Bit 15 of Athlon specific MSR 15, needs to be 0
152 * to enable SSE on Palomino/Morgan/Barton CPU's.
153 * If the BIOS didn't enable it already, enable it here.
154 */
155 if (c->x86_model >= 6 && c->x86_model <= 10) {
156 if (!cpu_has(c, X86_FEATURE_XMM)) {
157 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
158 rdmsr(MSR_K7_HWCR, l, h);
159 l &= ~0x00008000;
160 wrmsr(MSR_K7_HWCR, l, h);
161 set_cpu_cap(c, X86_FEATURE_XMM);
162 }
163 }
164
165 /*
166 * It's been determined by AMD that Athlons since model 8 stepping 1
167 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
168 * As per AMD technical note 27212 0.2
169 */
170 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
171 rdmsr(MSR_K7_CLK_CTL, l, h);
172 if ((l & 0xfff00000) != 0x20000000) {
173 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
174 ((l & 0x000fffff)|0x20000000));
175 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
176 }
177 }
178
179 set_cpu_cap(c, X86_FEATURE_K7);
180}
181#endif
182
183#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
184static int __cpuinit nearby_node(int apicid)
185{
186 int i, node;
187
188 for (i = apicid - 1; i >= 0; i--) {
189 node = apicid_to_node[i];
190 if (node != NUMA_NO_NODE && node_online(node))
191 return node;
192 }
193 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
194 node = apicid_to_node[i];
195 if (node != NUMA_NO_NODE && node_online(node))
196 return node;
197 }
198 return first_node(node_online_map); /* Shouldn't happen */
199}
200#endif
201
202/*
203 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
204 * Assumes number of cores is a power of two.
205 */
206static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
207{
208#ifdef CONFIG_X86_HT
209 unsigned bits;
210
211 bits = c->x86_coreid_bits;
212
213 /* Low order bits define the core id (index of core in socket) */
214 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
215 /* Convert the initial APIC ID into the socket ID */
216 c->phys_proc_id = c->initial_apicid >> bits;
217#endif
218}
219
220static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
221{
222#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
223 int cpu = smp_processor_id();
224 int node;
225 unsigned apicid = hard_smp_processor_id();
226
227 node = c->phys_proc_id;
228 if (apicid_to_node[apicid] != NUMA_NO_NODE)
229 node = apicid_to_node[apicid];
230 if (!node_online(node)) {
231 /* Two possibilities here:
232 - The CPU is missing memory and no node was created.
233 In that case try picking one from a nearby CPU
234 - The APIC IDs differ from the HyperTransport node IDs
235 which the K8 northbridge parsing fills in.
236 Assume they are all increased by a constant offset,
237 but in the same order as the HT nodeids.
238 If that doesn't result in a usable node fall back to the
239 path for the previous case. */
240
241 int ht_nodeid = c->initial_apicid;
242
243 if (ht_nodeid >= 0 &&
244 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
245 node = apicid_to_node[ht_nodeid];
246 /* Pick a nearby node */
247 if (!node_online(node))
248 node = nearby_node(apicid);
249 }
250 numa_set_node(cpu, node);
251
252 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
253#endif
254}
255
256static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
257{
258#ifdef CONFIG_X86_HT
259 unsigned bits, ecx;
260
261 /* Multi core CPU? */
262 if (c->extended_cpuid_level < 0x80000008)
263 return;
264
265 ecx = cpuid_ecx(0x80000008);
266
267 c->x86_max_cores = (ecx & 0xff) + 1;
268
269 /* CPU telling us the core id bits shift? */
270 bits = (ecx >> 12) & 0xF;
271
272 /* Otherwise recompute */
273 if (bits == 0) {
274 while ((1 << bits) < c->x86_max_cores)
275 bits++;
276 }
277
278 c->x86_coreid_bits = bits;
279#endif
280}
281
282static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
283{
284 early_init_amd_mc(c);
285
286 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
287 if (c->x86_power & (1<<8))
288 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
289
290#ifdef CONFIG_X86_64
291 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
292#else
293 /* Set MTRR capability flag if appropriate */
294 if (c->x86 == 5)
295 if (c->x86_model == 13 || c->x86_model == 9 ||
296 (c->x86_model == 8 && c->x86_mask >= 8))
297 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
298#endif
299}
300
301static void __cpuinit init_amd(struct cpuinfo_x86 *c)
302{
47#ifdef CONFIG_SMP 303#ifdef CONFIG_SMP
48 unsigned long long value; 304 unsigned long long value;
49 305
@@ -54,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
54 * Errata 63 for SH-B3 steppings 310 * Errata 63 for SH-B3 steppings
55 * Errata 122 for all steppings (F+ have it disabled by default) 311 * Errata 122 for all steppings (F+ have it disabled by default)
56 */ 312 */
57 if (c->x86 == 15) { 313 if (c->x86 == 0xf) {
58 rdmsrl(MSR_K7_HWCR, value); 314 rdmsrl(MSR_K7_HWCR, value);
59 value |= 1 << 6; 315 value |= 1 << 6;
60 wrmsrl(MSR_K7_HWCR, value); 316 wrmsrl(MSR_K7_HWCR, value);
@@ -64,209 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
64 early_init_amd(c); 320 early_init_amd(c);
65 321
66 /* 322 /*
67 * FIXME: We should handle the K5 here. Set up the write
68 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
69 * no bus pipeline)
70 */
71
72 /*
73 * Bit 31 in normal CPUID used for nonstandard 3DNow ID; 323 * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
74 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 324 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
75 */ 325 */
76 clear_cpu_cap(c, 0*32+31); 326 clear_cpu_cap(c, 0*32+31);
77 327
78 r = get_model_name(c); 328#ifdef CONFIG_X86_64
329 /* On C+ stepping K8 rep microcode works well for copy/memset */
330 if (c->x86 == 0xf) {
331 u32 level;
79 332
80 switch (c->x86) { 333 level = cpuid_eax(1);
81 case 4: 334 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
82 /* 335 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
83 * General Systems BIOSen alias the cpu frequency registers
84 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
85 * drivers subsequently pokes it, and changes the CPU speed.
86 * Workaround : Remove the unneeded alias.
87 */
88#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
89#define CBAR_ENB (0x80000000)
90#define CBAR_KEY (0X000000CB)
91 if (c->x86_model == 9 || c->x86_model == 10) {
92 if (inl (CBAR) & CBAR_ENB)
93 outl (0 | CBAR_KEY, CBAR);
94 }
95 break;
96 case 5:
97 if (c->x86_model < 6) {
98 /* Based on AMD doc 20734R - June 2000 */
99 if (c->x86_model == 0) {
100 clear_cpu_cap(c, X86_FEATURE_APIC);
101 set_cpu_cap(c, X86_FEATURE_PGE);
102 }
103 break;
104 }
105
106 if (c->x86_model == 6 && c->x86_mask == 1) {
107 const int K6_BUG_LOOP = 1000000;
108 int n;
109 void (*f_vide)(void);
110 unsigned long d, d2;
111
112 printk(KERN_INFO "AMD K6 stepping B detected - ");
113
114 /*
115 * It looks like AMD fixed the 2.6.2 bug and improved indirect
116 * calls at the same time.
117 */
118
119 n = K6_BUG_LOOP;
120 f_vide = vide;
121 rdtscl(d);
122 while (n--)
123 f_vide();
124 rdtscl(d2);
125 d = d2-d;
126
127 if (d > 20*K6_BUG_LOOP)
128 printk("system stability may be impaired when more than 32 MB are used.\n");
129 else
130 printk("probably OK (after B9730xxxx).\n");
131 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
132 }
133
134 /* K6 with old style WHCR */
135 if (c->x86_model < 8 ||
136 (c->x86_model == 8 && c->x86_mask < 8)) {
137 /* We can only write allocate on the low 508Mb */
138 if (mbytes > 508)
139 mbytes = 508;
140
141 rdmsr(MSR_K6_WHCR, l, h);
142 if ((l&0x0000FFFF) == 0) {
143 unsigned long flags;
144 l = (1<<0)|((mbytes/4)<<1);
145 local_irq_save(flags);
146 wbinvd();
147 wrmsr(MSR_K6_WHCR, l, h);
148 local_irq_restore(flags);
149 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
150 mbytes);
151 }
152 break;
153 }
154
155 if ((c->x86_model == 8 && c->x86_mask > 7) ||
156 c->x86_model == 9 || c->x86_model == 13) {
157 /* The more serious chips .. */
158
159 if (mbytes > 4092)
160 mbytes = 4092;
161
162 rdmsr(MSR_K6_WHCR, l, h);
163 if ((l&0xFFFF0000) == 0) {
164 unsigned long flags;
165 l = ((mbytes>>2)<<22)|(1<<16);
166 local_irq_save(flags);
167 wbinvd();
168 wrmsr(MSR_K6_WHCR, l, h);
169 local_irq_restore(flags);
170 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
171 mbytes);
172 }
173
174 break;
175 }
176
177 if (c->x86_model == 10) {
178 /* AMD Geode LX is model 10 */
179 /* placeholder for any needed mods */
180 break;
181 }
182 break;
183 case 6: /* An Athlon/Duron */
184
185 /*
186 * Bit 15 of Athlon specific MSR 15, needs to be 0
187 * to enable SSE on Palomino/Morgan/Barton CPU's.
188 * If the BIOS didn't enable it already, enable it here.
189 */
190 if (c->x86_model >= 6 && c->x86_model <= 10) {
191 if (!cpu_has(c, X86_FEATURE_XMM)) {
192 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
193 rdmsr(MSR_K7_HWCR, l, h);
194 l &= ~0x00008000;
195 wrmsr(MSR_K7_HWCR, l, h);
196 set_cpu_cap(c, X86_FEATURE_XMM);
197 }
198 }
199
200 /*
201 * It's been determined by AMD that Athlons since model 8 stepping 1
202 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
203 * As per AMD technical note 27212 0.2
204 */
205 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
206 rdmsr(MSR_K7_CLK_CTL, l, h);
207 if ((l & 0xfff00000) != 0x20000000) {
208 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
209 ((l & 0x000fffff)|0x20000000));
210 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
211 }
212 }
213 break;
214 } 336 }
337 if (c->x86 == 0x10 || c->x86 == 0x11)
338 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
339#else
340
341 /*
342 * FIXME: We should handle the K5 here. Set up the write
343 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
344 * no bus pipeline)
345 */
215 346
216 switch (c->x86) { 347 switch (c->x86) {
217 case 15: 348 case 4:
218 /* Use K8 tuning for Fam10h and Fam11h */ 349 init_amd_k5(c);
219 case 0x10:
220 case 0x11:
221 set_cpu_cap(c, X86_FEATURE_K8);
222 break; 350 break;
223 case 6: 351 case 5:
224 set_cpu_cap(c, X86_FEATURE_K7); 352 init_amd_k6(c);
353 break;
354 case 6: /* An Athlon/Duron */
355 init_amd_k7(c);
225 break; 356 break;
226 } 357 }
358
359 /* K6s reports MCEs but don't actually have all the MSRs */
360 if (c->x86 < 6)
361 clear_cpu_cap(c, X86_FEATURE_MCE);
362#endif
363
364 /* Enable workaround for FXSAVE leak */
227 if (c->x86 >= 6) 365 if (c->x86 >= 6)
228 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); 366 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
229 367
230 display_cacheinfo(c); 368 if (!c->x86_model_id[0]) {
231 369 switch (c->x86) {
232 if (cpuid_eax(0x80000000) >= 0x80000008) 370 case 0xf:
233 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; 371 /* Should distinguish Models here, but this is only
372 a fallback anyways. */
373 strcpy(c->x86_model_id, "Hammer");
374 break;
375 }
376 }
234 377
235#ifdef CONFIG_X86_HT 378 display_cacheinfo(c);
236 /*
237 * On a AMD multi core setup the lower bits of the APIC id
238 * distinguish the cores.
239 */
240 if (c->x86_max_cores > 1) {
241 int cpu = smp_processor_id();
242 unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
243 379
244 if (bits == 0) { 380 /* Multi core CPU? */
245 while ((1 << bits) < c->x86_max_cores) 381 if (c->extended_cpuid_level >= 0x80000008) {
246 bits++; 382 amd_detect_cmp(c);
247 } 383 srat_detect_node(c);
248 c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
249 c->phys_proc_id >>= bits;
250 printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
251 cpu, c->x86_max_cores, c->cpu_core_id);
252 } 384 }
385
386#ifdef CONFIG_X86_32
387 detect_ht(c);
253#endif 388#endif
254 389
255 if (cpuid_eax(0x80000000) >= 0x80000006) { 390 if (c->extended_cpuid_level >= 0x80000006) {
256 if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) 391 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
257 num_cache_leaves = 4; 392 num_cache_leaves = 4;
258 else 393 else
259 num_cache_leaves = 3; 394 num_cache_leaves = 3;
260 } 395 }
261 396
262 /* K6s reports MCEs but don't actually have all the MSRs */ 397 if (c->x86 >= 0xf && c->x86 <= 0x11)
263 if (c->x86 < 6) 398 set_cpu_cap(c, X86_FEATURE_K8);
264 clear_cpu_cap(c, X86_FEATURE_MCE);
265 399
266 if (cpu_has_xmm2) 400 if (cpu_has_xmm2) {
401 /* MFENCE stops RDTSC speculation */
267 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); 402 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
403 }
404
405#ifdef CONFIG_X86_64
406 if (c->x86 == 0x10) {
407 /* do this for boot cpu */
408 if (c == &boot_cpu_data)
409 check_enable_amd_mmconf_dmi();
410
411 fam10h_check_enable_mmcfg();
412 }
413
414 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
415 unsigned long long tseg;
416
417 /*
418 * Split up direct mapping around the TSEG SMM area.
419 * Don't do it for gbpages because there seems very little
420 * benefit in doing so.
421 */
422 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
423 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
424 if ((tseg>>PMD_SHIFT) <
425 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
426 ((tseg>>PMD_SHIFT) <
427 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
428 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
429 set_memory_4k((unsigned long)__va(tseg), 1);
430 }
431 }
432#endif
268} 433}
269 434
435#ifdef CONFIG_X86_32
270static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) 436static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
271{ 437{
272 /* AMD errata T13 (order #21922) */ 438 /* AMD errata T13 (order #21922) */
@@ -279,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
279 } 445 }
280 return size; 446 return size;
281} 447}
448#endif
282 449
283static struct cpu_dev amd_cpu_dev __cpuinitdata = { 450static struct cpu_dev amd_cpu_dev __cpuinitdata = {
284 .c_vendor = "AMD", 451 .c_vendor = "AMD",
285 .c_ident = { "AuthenticAMD" }, 452 .c_ident = { "AuthenticAMD" },
453#ifdef CONFIG_X86_32
286 .c_models = { 454 .c_models = {
287 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = 455 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
288 { 456 {
@@ -295,9 +463,10 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = {
295 } 463 }
296 }, 464 },
297 }, 465 },
466 .c_size_cache = amd_size_cache,
467#endif
298 .c_early_init = early_init_amd, 468 .c_early_init = early_init_amd,
299 .c_init = init_amd, 469 .c_init = init_amd,
300 .c_size_cache = amd_size_cache,
301 .c_x86_vendor = X86_VENDOR_AMD, 470 .c_x86_vendor = X86_VENDOR_AMD,
302}; 471};
303 472
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
deleted file mode 100644
index d1c721c0c49f..000000000000
--- a/arch/x86/kernel/cpu/amd_64.c
+++ /dev/null
@@ -1,224 +0,0 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3
4#include <asm/numa_64.h>
5#include <asm/mmconfig.h>
6#include <asm/cacheflush.h>
7
8#include <mach_apic.h>
9
10#include "cpu.h"
11
12int force_mwait __cpuinitdata;
13
14#ifdef CONFIG_NUMA
15static int __cpuinit nearby_node(int apicid)
16{
17 int i, node;
18
19 for (i = apicid - 1; i >= 0; i--) {
20 node = apicid_to_node[i];
21 if (node != NUMA_NO_NODE && node_online(node))
22 return node;
23 }
24 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
25 node = apicid_to_node[i];
26 if (node != NUMA_NO_NODE && node_online(node))
27 return node;
28 }
29 return first_node(node_online_map); /* Shouldn't happen */
30}
31#endif
32
33/*
34 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
35 * Assumes number of cores is a power of two.
36 */
37static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
38{
39#ifdef CONFIG_SMP
40 unsigned bits;
41#ifdef CONFIG_NUMA
42 int cpu = smp_processor_id();
43 int node = 0;
44 unsigned apicid = hard_smp_processor_id();
45#endif
46 bits = c->x86_coreid_bits;
47
48 /* Low order bits define the core id (index of core in socket) */
49 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
50 /* Convert the initial APIC ID into the socket ID */
51 c->phys_proc_id = c->initial_apicid >> bits;
52
53#ifdef CONFIG_NUMA
54 node = c->phys_proc_id;
55 if (apicid_to_node[apicid] != NUMA_NO_NODE)
56 node = apicid_to_node[apicid];
57 if (!node_online(node)) {
58 /* Two possibilities here:
59 - The CPU is missing memory and no node was created.
60 In that case try picking one from a nearby CPU
61 - The APIC IDs differ from the HyperTransport node IDs
62 which the K8 northbridge parsing fills in.
63 Assume they are all increased by a constant offset,
64 but in the same order as the HT nodeids.
65 If that doesn't result in a usable node fall back to the
66 path for the previous case. */
67
68 int ht_nodeid = c->initial_apicid;
69
70 if (ht_nodeid >= 0 &&
71 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
72 node = apicid_to_node[ht_nodeid];
73 /* Pick a nearby node */
74 if (!node_online(node))
75 node = nearby_node(apicid);
76 }
77 numa_set_node(cpu, node);
78
79 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
80#endif
81#endif
82}
83
84static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
85{
86#ifdef CONFIG_SMP
87 unsigned bits, ecx;
88
89 /* Multi core CPU? */
90 if (c->extended_cpuid_level < 0x80000008)
91 return;
92
93 ecx = cpuid_ecx(0x80000008);
94
95 c->x86_max_cores = (ecx & 0xff) + 1;
96
97 /* CPU telling us the core id bits shift? */
98 bits = (ecx >> 12) & 0xF;
99
100 /* Otherwise recompute */
101 if (bits == 0) {
102 while ((1 << bits) < c->x86_max_cores)
103 bits++;
104 }
105
106 c->x86_coreid_bits = bits;
107
108#endif
109}
110
111static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
112{
113 early_init_amd_mc(c);
114
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
120}
121
122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
123{
124 unsigned level;
125
126#ifdef CONFIG_SMP
127 unsigned long value;
128
129 /*
130 * Disable TLB flush filter by setting HWCR.FFDIS on K8
131 * bit 6 of msr C001_0015
132 *
133 * Errata 63 for SH-B3 steppings
134 * Errata 122 for all steppings (F+ have it disabled by default)
135 */
136 if (c->x86 == 0xf) {
137 rdmsrl(MSR_K8_HWCR, value);
138 value |= 1 << 6;
139 wrmsrl(MSR_K8_HWCR, value);
140 }
141#endif
142
143 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
144 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
145 clear_cpu_cap(c, 0*32+31);
146
147 /* On C+ stepping K8 rep microcode works well for copy/memset */
148 if (c->x86 == 0xf) {
149 level = cpuid_eax(1);
150 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
151 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
152 }
153 if (c->x86 == 0x10 || c->x86 == 0x11)
154 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
155
156 /* Enable workaround for FXSAVE leak */
157 if (c->x86 >= 6)
158 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
159
160 level = get_model_name(c);
161 if (!level) {
162 switch (c->x86) {
163 case 0xf:
164 /* Should distinguish Models here, but this is only
165 a fallback anyways. */
166 strcpy(c->x86_model_id, "Hammer");
167 break;
168 }
169 }
170 display_cacheinfo(c);
171
172 /* Multi core CPU? */
173 if (c->extended_cpuid_level >= 0x80000008)
174 amd_detect_cmp(c);
175
176 if (c->extended_cpuid_level >= 0x80000006 &&
177 (cpuid_edx(0x80000006) & 0xf000))
178 num_cache_leaves = 4;
179 else
180 num_cache_leaves = 3;
181
182 if (c->x86 >= 0xf && c->x86 <= 0x11)
183 set_cpu_cap(c, X86_FEATURE_K8);
184
185 /* MFENCE stops RDTSC speculation */
186 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
187
188 if (c->x86 == 0x10) {
189 /* do this for boot cpu */
190 if (c == &boot_cpu_data)
191 check_enable_amd_mmconf_dmi();
192
193 fam10h_check_enable_mmcfg();
194 }
195
196 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
197 unsigned long long tseg;
198
199 /*
200 * Split up direct mapping around the TSEG SMM area.
201 * Don't do it for gbpages because there seems very little
202 * benefit in doing so.
203 */
204 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
205 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
206 if ((tseg>>PMD_SHIFT) <
207 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
208 ((tseg>>PMD_SHIFT) <
209 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
210 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
211 set_memory_4k((unsigned long)__va(tseg), 1);
212 }
213 }
214}
215
216static struct cpu_dev amd_cpu_dev __cpuinitdata = {
217 .c_vendor = "AMD",
218 .c_ident = { "AuthenticAMD" },
219 .c_early_init = early_init_amd,
220 .c_init = init_amd,
221 .c_x86_vendor = X86_VENDOR_AMD,
222};
223
224cpu_dev_register(amd_cpu_dev);
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e5f6d89521bf..89bfdd9cacc6 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291 291
292 get_model_name(c);
293 display_cacheinfo(c); 292 display_cacheinfo(c);
294} 293}
295 294
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 49cfc6d2f2fb..a1625f5a1e78 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -16,9 +16,10 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
16 16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{ 18{
19 early_init_centaur(c);
20
19 if (c->x86 == 0x6 && c->x86_model >= 0xf) { 21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
20 c->x86_cache_alignment = c->x86_clflush_size * 2; 22 c->x86_cache_alignment = c->x86_clflush_size * 2;
21 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
22 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
23 } 24 }
24 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7d5a07f0fd24..7581b62df184 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,29 +1,61 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
2#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
3#include <linux/delay.h> 10#include <linux/delay.h>
4#include <linux/smp.h> 11#include <linux/smp.h>
5#include <linux/module.h>
6#include <linux/percpu.h> 12#include <linux/percpu.h>
7#include <linux/bootmem.h>
8#include <asm/processor.h>
9#include <asm/i387.h> 13#include <asm/i387.h>
10#include <asm/msr.h> 14#include <asm/msr.h>
11#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
12#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
13#include <asm/mtrr.h> 18#include <asm/mtrr.h>
14#include <asm/mce.h> 19#include <asm/mce.h>
15#include <asm/pat.h> 20#include <asm/pat.h>
16#include <asm/asm.h> 21#include <asm/asm.h>
22#include <asm/numa.h>
17#ifdef CONFIG_X86_LOCAL_APIC 23#ifdef CONFIG_X86_LOCAL_APIC
18#include <asm/mpspec.h> 24#include <asm/mpspec.h>
19#include <asm/apic.h> 25#include <asm/apic.h>
20#include <mach_apic.h> 26#include <mach_apic.h>
27#include <asm/genapic.h>
21#endif 28#endif
22 29
30#include <asm/pda.h>
31#include <asm/pgtable.h>
32#include <asm/processor.h>
33#include <asm/desc.h>
34#include <asm/atomic.h>
35#include <asm/proto.h>
36#include <asm/sections.h>
37#include <asm/setup.h>
38
23#include "cpu.h" 39#include "cpu.h"
24 40
25static struct cpu_dev *this_cpu __cpuinitdata; 41static struct cpu_dev *this_cpu __cpuinitdata;
26 42
43#ifdef CONFIG_X86_64
44/* We need valid kernel segments for data and code in long mode too
45 * IRET will check the segment types kkeil 2000/10/28
46 * Also sysret mandates a special GDT layout
47 */
48/* The TLS descriptors are currently at a different place compared to i386.
49 Hopefully nobody expects them at a fixed place (Wine?) */
50DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
51 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
52 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
53 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
54 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
55 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
56 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
57} };
58#else
27DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { 59DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
28 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 60 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
29 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 61 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
@@ -58,8 +90,10 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
58 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 90 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
59 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 91 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
60} }; 92} };
93#endif
61EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 94EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
62 95
96#ifdef CONFIG_X86_32
63static int cachesize_override __cpuinitdata = -1; 97static int cachesize_override __cpuinitdata = -1;
64static int disable_x86_serial_nr __cpuinitdata = 1; 98static int disable_x86_serial_nr __cpuinitdata = 1;
65 99
@@ -70,34 +104,6 @@ static int __init cachesize_setup(char *str)
70} 104}
71__setup("cachesize=", cachesize_setup); 105__setup("cachesize=", cachesize_setup);
72 106
73/*
74 * Naming convention should be: <Name> [(<Codename>)]
75 * This table only is used unless init_<vendor>() below doesn't set it;
76 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
77 *
78 */
79
80/* Look up CPU names by table lookup. */
81static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
82{
83 struct cpu_model_info *info;
84
85 if (c->x86_model >= 16)
86 return NULL; /* Range check */
87
88 if (!this_cpu)
89 return NULL;
90
91 info = this_cpu->c_models;
92
93 while (info && info->family) {
94 if (info->family == c->x86)
95 return info->model_names[c->x86_model];
96 info++;
97 }
98 return NULL; /* Not found */
99}
100
101static int __init x86_fxsr_setup(char *s) 107static int __init x86_fxsr_setup(char *s)
102{ 108{
103 setup_clear_cpu_cap(X86_FEATURE_FXSR); 109 setup_clear_cpu_cap(X86_FEATURE_FXSR);
@@ -162,6 +168,48 @@ static int __init x86_serial_nr_setup(char *s)
162 return 1; 168 return 1;
163} 169}
164__setup("serialnumber", x86_serial_nr_setup); 170__setup("serialnumber", x86_serial_nr_setup);
171#else
172static inline int flag_is_changeable_p(u32 flag)
173{
174 return 1;
175}
176/* Probe for the CPUID instruction */
177static inline int have_cpuid_p(void)
178{
179 return 1;
180}
181static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
182{
183}
184#endif
185
186/*
187 * Naming convention should be: <Name> [(<Codename>)]
188 * This table only is used unless init_<vendor>() below doesn't set it;
189 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
190 *
191 */
192
193/* Look up CPU names by table lookup. */
194static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
195{
196 struct cpu_model_info *info;
197
198 if (c->x86_model >= 16)
199 return NULL; /* Range check */
200
201 if (!this_cpu)
202 return NULL;
203
204 info = this_cpu->c_models;
205
206 while (info && info->family) {
207 if (info->family == c->x86)
208 return info->model_names[c->x86_model];
209 info++;
210 }
211 return NULL; /* Not found */
212}
165 213
166__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 214__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
167 215
@@ -174,13 +222,18 @@ void switch_to_new_gdt(void)
174 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); 222 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
175 gdt_descr.size = GDT_SIZE - 1; 223 gdt_descr.size = GDT_SIZE - 1;
176 load_gdt(&gdt_descr); 224 load_gdt(&gdt_descr);
225#ifdef CONFIG_X86_32
177 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); 226 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
227#endif
178} 228}
179 229
180static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 230static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
181 231
182static void __cpuinit default_init(struct cpuinfo_x86 *c) 232static void __cpuinit default_init(struct cpuinfo_x86 *c)
183{ 233{
234#ifdef CONFIG_X86_64
235 display_cacheinfo(c);
236#else
184 /* Not much we can do here... */ 237 /* Not much we can do here... */
185 /* Check if at least it has cpuid */ 238 /* Check if at least it has cpuid */
186 if (c->cpuid_level == -1) { 239 if (c->cpuid_level == -1) {
@@ -190,6 +243,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
190 else if (c->x86 == 3) 243 else if (c->x86 == 3)
191 strcpy(c->x86_model_id, "386"); 244 strcpy(c->x86_model_id, "386");
192 } 245 }
246#endif
193} 247}
194 248
195static struct cpu_dev __cpuinitdata default_cpu = { 249static struct cpu_dev __cpuinitdata default_cpu = {
@@ -198,13 +252,13 @@ static struct cpu_dev __cpuinitdata default_cpu = {
198 .c_x86_vendor = X86_VENDOR_UNKNOWN, 252 .c_x86_vendor = X86_VENDOR_UNKNOWN,
199}; 253};
200 254
201int __cpuinit get_model_name(struct cpuinfo_x86 *c) 255static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
202{ 256{
203 unsigned int *v; 257 unsigned int *v;
204 char *p, *q; 258 char *p, *q;
205 259
206 if (c->extended_cpuid_level < 0x80000004) 260 if (c->extended_cpuid_level < 0x80000004)
207 return 0; 261 return;
208 262
209 v = (unsigned int *) c->x86_model_id; 263 v = (unsigned int *) c->x86_model_id;
210 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); 264 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
@@ -223,8 +277,6 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
223 while (q <= &c->x86_model_id[48]) 277 while (q <= &c->x86_model_id[48])
224 *q++ = '\0'; /* Zero-pad the rest */ 278 *q++ = '\0'; /* Zero-pad the rest */
225 } 279 }
226
227 return 1;
228} 280}
229 281
230void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 282void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
@@ -238,6 +290,10 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
238 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", 290 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
239 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); 291 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
240 c->x86_cache_size = (ecx>>24) + (edx>>24); 292 c->x86_cache_size = (ecx>>24) + (edx>>24);
293#ifdef CONFIG_X86_64
294 /* On K8 L1 TLB is inclusive, so don't count it */
295 c->x86_tlbsize = 0;
296#endif
241 } 297 }
242 298
243 if (n < 0x80000006) /* Some chips just has a large L1. */ 299 if (n < 0x80000006) /* Some chips just has a large L1. */
@@ -246,6 +302,9 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
246 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); 302 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
247 l2size = ecx >> 16; 303 l2size = ecx >> 16;
248 304
305#ifdef CONFIG_X86_64
306 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
307#else
249 /* do processor-specific cache resizing */ 308 /* do processor-specific cache resizing */
250 if (this_cpu->c_size_cache) 309 if (this_cpu->c_size_cache)
251 l2size = this_cpu->c_size_cache(c, l2size); 310 l2size = this_cpu->c_size_cache(c, l2size);
@@ -256,6 +315,7 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
256 315
257 if (l2size == 0) 316 if (l2size == 0)
258 return; /* Again, no L2 cache is possible */ 317 return; /* Again, no L2 cache is possible */
318#endif
259 319
260 c->x86_cache_size = l2size; 320 c->x86_cache_size = l2size;
261 321
@@ -263,9 +323,9 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
263 l2size, ecx & 0xFF); 323 l2size, ecx & 0xFF);
264} 324}
265 325
266#ifdef CONFIG_X86_HT
267void __cpuinit detect_ht(struct cpuinfo_x86 *c) 326void __cpuinit detect_ht(struct cpuinfo_x86 *c)
268{ 327{
328#ifdef CONFIG_X86_HT
269 u32 eax, ebx, ecx, edx; 329 u32 eax, ebx, ecx, edx;
270 int index_msb, core_bits; 330 int index_msb, core_bits;
271 331
@@ -275,6 +335,9 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
275 if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) 335 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
276 goto out; 336 goto out;
277 337
338 if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
339 return;
340
278 cpuid(1, &eax, &ebx, &ecx, &edx); 341 cpuid(1, &eax, &ebx, &ecx, &edx);
279 342
280 smp_num_siblings = (ebx & 0xff0000) >> 16; 343 smp_num_siblings = (ebx & 0xff0000) >> 16;
@@ -291,8 +354,11 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
291 } 354 }
292 355
293 index_msb = get_count_order(smp_num_siblings); 356 index_msb = get_count_order(smp_num_siblings);
357#ifdef CONFIG_X86_64
358 c->phys_proc_id = phys_pkg_id(index_msb);
359#else
294 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); 360 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
295 361#endif
296 362
297 smp_num_siblings = smp_num_siblings / c->x86_max_cores; 363 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
298 364
@@ -300,8 +366,13 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
300 366
301 core_bits = get_count_order(c->x86_max_cores); 367 core_bits = get_count_order(c->x86_max_cores);
302 368
369#ifdef CONFIG_X86_64
370 c->cpu_core_id = phys_pkg_id(index_msb) &
371 ((1 << core_bits) - 1);
372#else
303 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & 373 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
304 ((1 << core_bits) - 1); 374 ((1 << core_bits) - 1);
375#endif
305 } 376 }
306 377
307out: 378out:
@@ -311,8 +382,8 @@ out:
311 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 382 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
312 c->cpu_core_id); 383 c->cpu_core_id);
313 } 384 }
314}
315#endif 385#endif
386}
316 387
317static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) 388static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
318{ 389{
@@ -335,7 +406,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
335 406
336 if (!printed) { 407 if (!printed) {
337 printed++; 408 printed++;
338 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); 409 printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
339 printk(KERN_ERR "CPU: Your system may be unstable.\n"); 410 printk(KERN_ERR "CPU: Your system may be unstable.\n");
340 } 411 }
341 412
@@ -392,7 +463,47 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
392 c->x86_capability[6] = cpuid_ecx(0x80000001); 463 c->x86_capability[6] = cpuid_ecx(0x80000001);
393 } 464 }
394 } 465 }
466
467#ifdef CONFIG_X86_64
468 if (c->extended_cpuid_level >= 0x80000008) {
469 u32 eax = cpuid_eax(0x80000008);
470
471 c->x86_virt_bits = (eax >> 8) & 0xff;
472 c->x86_phys_bits = eax & 0xff;
473 }
474#endif
475
476 if (c->extended_cpuid_level >= 0x80000007)
477 c->x86_power = cpuid_edx(0x80000007);
478
395} 479}
480
481static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
482{
483#ifdef CONFIG_X86_32
484 int i;
485
486 /*
487 * First of all, decide if this is a 486 or higher
488 * It's a 486 if we can modify the AC flag
489 */
490 if (flag_is_changeable_p(X86_EFLAGS_AC))
491 c->x86 = 4;
492 else
493 c->x86 = 3;
494
495 for (i = 0; i < X86_VENDOR_NUM; i++)
496 if (cpu_devs[i] && cpu_devs[i]->c_identify) {
497 c->x86_vendor_id[0] = 0;
498 cpu_devs[i]->c_identify(c);
499 if (c->x86_vendor_id[0]) {
500 get_cpu_vendor(c);
501 break;
502 }
503 }
504#endif
505}
506
396/* 507/*
397 * Do minimum CPU detection early. 508 * Do minimum CPU detection early.
398 * Fields really needed: vendor, cpuid_level, family, model, mask, 509 * Fields really needed: vendor, cpuid_level, family, model, mask,
@@ -404,16 +515,23 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
404 */ 515 */
405static void __init early_identify_cpu(struct cpuinfo_x86 *c) 516static void __init early_identify_cpu(struct cpuinfo_x86 *c)
406{ 517{
518#ifdef CONFIG_X86_64
519 c->x86_clflush_size = 64;
520#else
407 c->x86_clflush_size = 32; 521 c->x86_clflush_size = 32;
522#endif
408 c->x86_cache_alignment = c->x86_clflush_size; 523 c->x86_cache_alignment = c->x86_clflush_size;
409 524
410 if (!have_cpuid_p())
411 return;
412
413 memset(&c->x86_capability, 0, sizeof c->x86_capability); 525 memset(&c->x86_capability, 0, sizeof c->x86_capability);
414
415 c->extended_cpuid_level = 0; 526 c->extended_cpuid_level = 0;
416 527
528 if (!have_cpuid_p())
529 identify_cpu_without_cpuid(c);
530
531 /* cyrix could have cpuid enabled via c_identify()*/
532 if (!have_cpuid_p())
533 return;
534
417 cpu_detect(c); 535 cpu_detect(c);
418 536
419 get_cpu_vendor(c); 537 get_cpu_vendor(c);
@@ -454,39 +572,27 @@ void __init early_cpu_init(void)
454 572
455/* 573/*
456 * The NOPL instruction is supposed to exist on all CPUs with 574 * The NOPL instruction is supposed to exist on all CPUs with
457 * family >= 6, unfortunately, that's not true in practice because 575 * family >= 6; unfortunately, that's not true in practice because
458 * of early VIA chips and (more importantly) broken virtualizers that 576 * of early VIA chips and (more importantly) broken virtualizers that
459 * are not easy to detect. Hence, probe for it based on first 577 * are not easy to detect. In the latter case it doesn't even *fail*
460 * principles. 578 * reliably, so probing for it doesn't even work. Disable it completely
579 * unless we can find a reliable way to detect all the broken cases.
461 */ 580 */
462static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) 581static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
463{ 582{
464 const u32 nopl_signature = 0x888c53b1; /* Random number */
465 u32 has_nopl = nopl_signature;
466
467 clear_cpu_cap(c, X86_FEATURE_NOPL); 583 clear_cpu_cap(c, X86_FEATURE_NOPL);
468 if (c->x86 >= 6) {
469 asm volatile("\n"
470 "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
471 "2:\n"
472 " .section .fixup,\"ax\"\n"
473 "3: xor %0,%0\n"
474 " jmp 2b\n"
475 " .previous\n"
476 _ASM_EXTABLE(1b,3b)
477 : "+a" (has_nopl));
478
479 if (has_nopl == nopl_signature)
480 set_cpu_cap(c, X86_FEATURE_NOPL);
481 }
482} 584}
483 585
484static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 586static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
485{ 587{
588 c->extended_cpuid_level = 0;
589
486 if (!have_cpuid_p()) 590 if (!have_cpuid_p())
487 return; 591 identify_cpu_without_cpuid(c);
488 592
489 c->extended_cpuid_level = 0; 593 /* cyrix could have cpuid enabled via c_identify()*/
594 if (!have_cpuid_p())
595 return;
490 596
491 cpu_detect(c); 597 cpu_detect(c);
492 598
@@ -496,16 +602,20 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
496 602
497 if (c->cpuid_level >= 0x00000001) { 603 if (c->cpuid_level >= 0x00000001) {
498 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; 604 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
499#ifdef CONFIG_X86_HT 605#ifdef CONFIG_X86_32
606# ifdef CONFIG_X86_HT
500 c->apicid = phys_pkg_id(c->initial_apicid, 0); 607 c->apicid = phys_pkg_id(c->initial_apicid, 0);
501 c->phys_proc_id = c->initial_apicid; 608# else
502#else
503 c->apicid = c->initial_apicid; 609 c->apicid = c->initial_apicid;
610# endif
611#endif
612
613#ifdef CONFIG_X86_HT
614 c->phys_proc_id = c->initial_apicid;
504#endif 615#endif
505 } 616 }
506 617
507 if (c->extended_cpuid_level >= 0x80000004) 618 get_model_name(c); /* Default name */
508 get_model_name(c); /* Default name */
509 619
510 init_scattered_cpuid_features(c); 620 init_scattered_cpuid_features(c);
511 detect_nopl(c); 621 detect_nopl(c);
@@ -521,30 +631,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
521 c->loops_per_jiffy = loops_per_jiffy; 631 c->loops_per_jiffy = loops_per_jiffy;
522 c->x86_cache_size = -1; 632 c->x86_cache_size = -1;
523 c->x86_vendor = X86_VENDOR_UNKNOWN; 633 c->x86_vendor = X86_VENDOR_UNKNOWN;
524 c->cpuid_level = -1; /* CPUID not detected */
525 c->x86_model = c->x86_mask = 0; /* So far unknown... */ 634 c->x86_model = c->x86_mask = 0; /* So far unknown... */
526 c->x86_vendor_id[0] = '\0'; /* Unset */ 635 c->x86_vendor_id[0] = '\0'; /* Unset */
527 c->x86_model_id[0] = '\0'; /* Unset */ 636 c->x86_model_id[0] = '\0'; /* Unset */
528 c->x86_max_cores = 1; 637 c->x86_max_cores = 1;
638 c->x86_coreid_bits = 0;
639#ifdef CONFIG_X86_64
640 c->x86_clflush_size = 64;
641#else
642 c->cpuid_level = -1; /* CPUID not detected */
529 c->x86_clflush_size = 32; 643 c->x86_clflush_size = 32;
644#endif
645 c->x86_cache_alignment = c->x86_clflush_size;
530 memset(&c->x86_capability, 0, sizeof c->x86_capability); 646 memset(&c->x86_capability, 0, sizeof c->x86_capability);
531 647
532 if (!have_cpuid_p()) {
533 /*
534 * First of all, decide if this is a 486 or higher
535 * It's a 486 if we can modify the AC flag
536 */
537 if (flag_is_changeable_p(X86_EFLAGS_AC))
538 c->x86 = 4;
539 else
540 c->x86 = 3;
541 }
542
543 generic_identify(c); 648 generic_identify(c);
544 649
545 if (this_cpu->c_identify) 650 if (this_cpu->c_identify)
546 this_cpu->c_identify(c); 651 this_cpu->c_identify(c);
547 652
653#ifdef CONFIG_X86_64
654 c->apicid = phys_pkg_id(0);
655#endif
656
548 /* 657 /*
549 * Vendor-specific initialization. In this section we 658 * Vendor-specific initialization. In this section we
550 * canonicalize the feature flags, meaning if there are 659 * canonicalize the feature flags, meaning if there are
@@ -578,6 +687,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
578 c->x86, c->x86_model); 687 c->x86, c->x86_model);
579 } 688 }
580 689
690#ifdef CONFIG_X86_64
691 detect_ht(c);
692#endif
693
581 /* 694 /*
582 * On SMP, boot_cpu_data holds the common feature set between 695 * On SMP, boot_cpu_data holds the common feature set between
583 * all CPUs; so make sure that we indicate which features are 696 * all CPUs; so make sure that we indicate which features are
@@ -594,24 +707,34 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
594 for (i = 0; i < NCAPINTS; i++) 707 for (i = 0; i < NCAPINTS; i++)
595 c->x86_capability[i] &= ~cleared_cpu_caps[i]; 708 c->x86_capability[i] &= ~cleared_cpu_caps[i];
596 709
710#ifdef CONFIG_X86_MCE
597 /* Init Machine Check Exception if available. */ 711 /* Init Machine Check Exception if available. */
598 mcheck_init(c); 712 mcheck_init(c);
713#endif
599 714
600 select_idle_routine(c); 715 select_idle_routine(c);
716
717#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
718 numa_add_cpu(smp_processor_id());
719#endif
601} 720}
602 721
603void __init identify_boot_cpu(void) 722void __init identify_boot_cpu(void)
604{ 723{
605 identify_cpu(&boot_cpu_data); 724 identify_cpu(&boot_cpu_data);
725#ifdef CONFIG_X86_32
606 sysenter_setup(); 726 sysenter_setup();
607 enable_sep_cpu(); 727 enable_sep_cpu();
728#endif
608} 729}
609 730
610void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 731void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
611{ 732{
612 BUG_ON(c == &boot_cpu_data); 733 BUG_ON(c == &boot_cpu_data);
613 identify_cpu(c); 734 identify_cpu(c);
735#ifdef CONFIG_X86_32
614 enable_sep_cpu(); 736 enable_sep_cpu();
737#endif
615 mtrr_ap_init(); 738 mtrr_ap_init();
616} 739}
617 740
@@ -709,6 +832,89 @@ __setup("clearcpuid=", setup_disablecpuid);
709 832
710cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 833cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
711 834
835#ifdef CONFIG_X86_64
836struct x8664_pda **_cpu_pda __read_mostly;
837EXPORT_SYMBOL(_cpu_pda);
838
839struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
840
841char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
842
843void __cpuinit pda_init(int cpu)
844{
845 struct x8664_pda *pda = cpu_pda(cpu);
846
847 /* Setup up data that may be needed in __get_free_pages early */
848 loadsegment(fs, 0);
849 loadsegment(gs, 0);
850 /* Memory clobbers used to order PDA accessed */
851 mb();
852 wrmsrl(MSR_GS_BASE, pda);
853 mb();
854
855 pda->cpunumber = cpu;
856 pda->irqcount = -1;
857 pda->kernelstack = (unsigned long)stack_thread_info() -
858 PDA_STACKOFFSET + THREAD_SIZE;
859 pda->active_mm = &init_mm;
860 pda->mmu_state = 0;
861
862 if (cpu == 0) {
863 /* others are initialized in smpboot.c */
864 pda->pcurrent = &init_task;
865 pda->irqstackptr = boot_cpu_stack;
866 pda->irqstackptr += IRQSTACKSIZE - 64;
867 } else {
868 if (!pda->irqstackptr) {
869 pda->irqstackptr = (char *)
870 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
871 if (!pda->irqstackptr)
872 panic("cannot allocate irqstack for cpu %d",
873 cpu);
874 pda->irqstackptr += IRQSTACKSIZE - 64;
875 }
876
877 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
878 pda->nodenumber = cpu_to_node(cpu);
879 }
880}
881
882char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
883 DEBUG_STKSZ] __page_aligned_bss;
884
885extern asmlinkage void ignore_sysret(void);
886
887/* May not be marked __init: used by software suspend */
888void syscall_init(void)
889{
890 /*
891 * LSTAR and STAR live in a bit strange symbiosis.
892 * They both write to the same internal register. STAR allows to
893 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
894 */
895 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
896 wrmsrl(MSR_LSTAR, system_call);
897 wrmsrl(MSR_CSTAR, ignore_sysret);
898
899#ifdef CONFIG_IA32_EMULATION
900 syscall32_cpu_init();
901#endif
902
903 /* Flags to clear on syscall */
904 wrmsrl(MSR_SYSCALL_MASK,
905 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
906}
907
908unsigned long kernel_eflags;
909
910/*
911 * Copies of the original ist values from the tss are only accessed during
912 * debugging, no special alignment required.
913 */
914DEFINE_PER_CPU(struct orig_ist, orig_ist);
915
916#else
917
712/* Make sure %fs is initialized properly in idle threads */ 918/* Make sure %fs is initialized properly in idle threads */
713struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) 919struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
714{ 920{
@@ -716,13 +922,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
716 regs->fs = __KERNEL_PERCPU; 922 regs->fs = __KERNEL_PERCPU;
717 return regs; 923 return regs;
718} 924}
925#endif
719 926
720/* 927/*
721 * cpu_init() initializes state that is per-CPU. Some data is already 928 * cpu_init() initializes state that is per-CPU. Some data is already
722 * initialized (naturally) in the bootstrap process, such as the GDT 929 * initialized (naturally) in the bootstrap process, such as the GDT
723 * and IDT. We reload them nevertheless, this function acts as a 930 * and IDT. We reload them nevertheless, this function acts as a
724 * 'CPU state barrier', nothing should get across. 931 * 'CPU state barrier', nothing should get across.
932 * A lot of state is already set up in PDA init for 64 bit
725 */ 933 */
934#ifdef CONFIG_X86_64
935void __cpuinit cpu_init(void)
936{
937 int cpu = stack_smp_processor_id();
938 struct tss_struct *t = &per_cpu(init_tss, cpu);
939 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
940 unsigned long v;
941 char *estacks = NULL;
942 struct task_struct *me;
943 int i;
944
945 /* CPU 0 is initialised in head64.c */
946 if (cpu != 0)
947 pda_init(cpu);
948 else
949 estacks = boot_exception_stacks;
950
951 me = current;
952
953 if (cpu_test_and_set(cpu, cpu_initialized))
954 panic("CPU#%d already initialized!\n", cpu);
955
956 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
957
958 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
959
960 /*
961 * Initialize the per-CPU GDT with the boot GDT,
962 * and set up the GDT descriptor:
963 */
964
965 switch_to_new_gdt();
966 load_idt((const struct desc_ptr *)&idt_descr);
967
968 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
969 syscall_init();
970
971 wrmsrl(MSR_FS_BASE, 0);
972 wrmsrl(MSR_KERNEL_GS_BASE, 0);
973 barrier();
974
975 check_efer();
976 if (cpu != 0 && x2apic)
977 enable_x2apic();
978
979 /*
980 * set up and load the per-CPU TSS
981 */
982 if (!orig_ist->ist[0]) {
983 static const unsigned int order[N_EXCEPTION_STACKS] = {
984 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
985 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
986 };
987 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
988 if (cpu) {
989 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
990 if (!estacks)
991 panic("Cannot allocate exception "
992 "stack %ld %d\n", v, cpu);
993 }
994 estacks += PAGE_SIZE << order[v];
995 orig_ist->ist[v] = t->x86_tss.ist[v] =
996 (unsigned long)estacks;
997 }
998 }
999
1000 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1001 /*
1002 * <= is required because the CPU will access up to
1003 * 8 bits beyond the end of the IO permission bitmap.
1004 */
1005 for (i = 0; i <= IO_BITMAP_LONGS; i++)
1006 t->io_bitmap[i] = ~0UL;
1007
1008 atomic_inc(&init_mm.mm_count);
1009 me->active_mm = &init_mm;
1010 if (me->mm)
1011 BUG();
1012 enter_lazy_tlb(&init_mm, me);
1013
1014 load_sp0(t, &current->thread);
1015 set_tss_desc(cpu, t);
1016 load_TR_desc();
1017 load_LDT(&init_mm.context);
1018
1019#ifdef CONFIG_KGDB
1020 /*
1021 * If the kgdb is connected no debug regs should be altered. This
1022 * is only applicable when KGDB and a KGDB I/O module are built
1023 * into the kernel and you are using early debugging with
1024 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1025 */
1026 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1027 arch_kgdb_ops.correct_hw_break();
1028 else {
1029#endif
1030 /*
1031 * Clear all 6 debug registers:
1032 */
1033
1034 set_debugreg(0UL, 0);
1035 set_debugreg(0UL, 1);
1036 set_debugreg(0UL, 2);
1037 set_debugreg(0UL, 3);
1038 set_debugreg(0UL, 6);
1039 set_debugreg(0UL, 7);
1040#ifdef CONFIG_KGDB
1041 /* If the kgdb is connected no debug regs should be altered. */
1042 }
1043#endif
1044
1045 fpu_init();
1046
1047 raw_local_save_flags(kernel_eflags);
1048
1049 if (is_uv_system())
1050 uv_cpu_init();
1051}
1052
1053#else
1054
726void __cpuinit cpu_init(void) 1055void __cpuinit cpu_init(void)
727{ 1056{
728 int cpu = smp_processor_id(); 1057 int cpu = smp_processor_id();
@@ -803,3 +1132,5 @@ void __cpuinit cpu_uninit(void)
803 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; 1132 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
804} 1133}
805#endif 1134#endif
1135
1136#endif
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
deleted file mode 100644
index bcb48ce05d23..000000000000
--- a/arch/x86/kernel/cpu/common_64.c
+++ /dev/null
@@ -1,816 +0,0 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
10#include <linux/delay.h>
11#include <linux/smp.h>
12#include <linux/percpu.h>
13#include <asm/i387.h>
14#include <asm/msr.h>
15#include <asm/io.h>
16#include <asm/linkage.h>
17#include <asm/mmu_context.h>
18#include <asm/mtrr.h>
19#include <asm/mce.h>
20#include <asm/pat.h>
21#include <asm/asm.h>
22#include <asm/numa.h>
23#ifdef CONFIG_X86_LOCAL_APIC
24#include <asm/mpspec.h>
25#include <asm/apic.h>
26#include <mach_apic.h>
27#endif
28#include <asm/pda.h>
29#include <asm/pgtable.h>
30#include <asm/processor.h>
31#include <asm/desc.h>
32#include <asm/atomic.h>
33#include <asm/proto.h>
34#include <asm/sections.h>
35#include <asm/setup.h>
36#include <asm/genapic.h>
37
38#include "cpu.h"
39
40static struct cpu_dev *this_cpu __cpuinitdata;
41
42/* We need valid kernel segments for data and code in long mode too
43 * IRET will check the segment types kkeil 2000/10/28
44 * Also sysret mandates a special GDT layout
45 */
46/* The TLS descriptors are currently at a different place compared to i386.
47 Hopefully nobody expects them at a fixed place (Wine?) */
48DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
49 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
50 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
51 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
52 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
53 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
54 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
55} };
56EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
57
58__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
59
60/* Current gdt points %fs at the "master" per-cpu area: after this,
61 * it's on the real one. */
62void switch_to_new_gdt(void)
63{
64 struct desc_ptr gdt_descr;
65
66 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
67 gdt_descr.size = GDT_SIZE - 1;
68 load_gdt(&gdt_descr);
69}
70
71static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
72
73static void __cpuinit default_init(struct cpuinfo_x86 *c)
74{
75 display_cacheinfo(c);
76}
77
78static struct cpu_dev __cpuinitdata default_cpu = {
79 .c_init = default_init,
80 .c_vendor = "Unknown",
81 .c_x86_vendor = X86_VENDOR_UNKNOWN,
82};
83
84int __cpuinit get_model_name(struct cpuinfo_x86 *c)
85{
86 unsigned int *v;
87 char *p, *q;
88
89 if (c->extended_cpuid_level < 0x80000004)
90 return 0;
91
92 v = (unsigned int *) c->x86_model_id;
93 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
94 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
95 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
96 c->x86_model_id[48] = 0;
97
98 /* Intel chips right-justify this string for some dumb reason;
99 undo that brain damage */
100 p = q = &c->x86_model_id[0];
101 while (*p == ' ')
102 p++;
103 if (p != q) {
104 while (*p)
105 *q++ = *p++;
106 while (q <= &c->x86_model_id[48])
107 *q++ = '\0'; /* Zero-pad the rest */
108 }
109
110 return 1;
111}
112
113
114void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
115{
116 unsigned int n, dummy, ebx, ecx, edx, l2size;
117
118 n = c->extended_cpuid_level;
119
120 if (n >= 0x80000005) {
121 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
122 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
123 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
124 c->x86_cache_size = (ecx>>24) + (edx>>24);
125 /* On K8 L1 TLB is inclusive, so don't count it */
126 c->x86_tlbsize = 0;
127 }
128
129 if (n < 0x80000006) /* Some chips just has a large L1. */
130 return;
131
132 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
133 l2size = ecx >> 16;
134 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
135
136 c->x86_cache_size = l2size;
137
138 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
139 l2size, ecx & 0xFF);
140}
141
142void __cpuinit detect_ht(struct cpuinfo_x86 *c)
143{
144#ifdef CONFIG_SMP
145 u32 eax, ebx, ecx, edx;
146 int index_msb, core_bits;
147
148 if (!cpu_has(c, X86_FEATURE_HT))
149 return;
150 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
151 goto out;
152
153 if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
154 return;
155
156 cpuid(1, &eax, &ebx, &ecx, &edx);
157
158 smp_num_siblings = (ebx & 0xff0000) >> 16;
159
160 if (smp_num_siblings == 1) {
161 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
162 } else if (smp_num_siblings > 1) {
163
164 if (smp_num_siblings > NR_CPUS) {
165 printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
166 smp_num_siblings);
167 smp_num_siblings = 1;
168 return;
169 }
170
171 index_msb = get_count_order(smp_num_siblings);
172 c->phys_proc_id = phys_pkg_id(index_msb);
173
174 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
175
176 index_msb = get_count_order(smp_num_siblings);
177
178 core_bits = get_count_order(c->x86_max_cores);
179
180 c->cpu_core_id = phys_pkg_id(index_msb) &
181 ((1 << core_bits) - 1);
182 }
183
184out:
185 if ((c->x86_max_cores * smp_num_siblings) > 1) {
186 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
187 c->phys_proc_id);
188 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
189 c->cpu_core_id);
190 }
191#endif
192}
193
194static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
195{
196 char *v = c->x86_vendor_id;
197 int i;
198 static int printed;
199
200 for (i = 0; i < X86_VENDOR_NUM; i++) {
201 if (!cpu_devs[i])
202 break;
203
204 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
205 (cpu_devs[i]->c_ident[1] &&
206 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
207 this_cpu = cpu_devs[i];
208 c->x86_vendor = this_cpu->c_x86_vendor;
209 return;
210 }
211 }
212
213 if (!printed) {
214 printed++;
215 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
216 printk(KERN_ERR "CPU: Your system may be unstable.\n");
217 }
218
219 c->x86_vendor = X86_VENDOR_UNKNOWN;
220 this_cpu = &default_cpu;
221}
222
223void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
224{
225 /* Get vendor name */
226 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
227 (unsigned int *)&c->x86_vendor_id[0],
228 (unsigned int *)&c->x86_vendor_id[8],
229 (unsigned int *)&c->x86_vendor_id[4]);
230
231 c->x86 = 4;
232 /* Intel-defined flags: level 0x00000001 */
233 if (c->cpuid_level >= 0x00000001) {
234 u32 junk, tfms, cap0, misc;
235 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
236 c->x86 = (tfms >> 8) & 0xf;
237 c->x86_model = (tfms >> 4) & 0xf;
238 c->x86_mask = tfms & 0xf;
239 if (c->x86 == 0xf)
240 c->x86 += (tfms >> 20) & 0xff;
241 if (c->x86 >= 0x6)
242 c->x86_model += ((tfms >> 16) & 0xf) << 4;
243 if (cap0 & (1<<19)) {
244 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
245 c->x86_cache_alignment = c->x86_clflush_size;
246 }
247 }
248}
249
250
251static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
252{
253 u32 tfms, xlvl;
254 u32 ebx;
255
256 /* Intel-defined flags: level 0x00000001 */
257 if (c->cpuid_level >= 0x00000001) {
258 u32 capability, excap;
259
260 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
261 c->x86_capability[0] = capability;
262 c->x86_capability[4] = excap;
263 }
264
265 /* AMD-defined flags: level 0x80000001 */
266 xlvl = cpuid_eax(0x80000000);
267 c->extended_cpuid_level = xlvl;
268 if ((xlvl & 0xffff0000) == 0x80000000) {
269 if (xlvl >= 0x80000001) {
270 c->x86_capability[1] = cpuid_edx(0x80000001);
271 c->x86_capability[6] = cpuid_ecx(0x80000001);
272 }
273 }
274
275 /* Transmeta-defined flags: level 0x80860001 */
276 xlvl = cpuid_eax(0x80860000);
277 if ((xlvl & 0xffff0000) == 0x80860000) {
278 /* Don't set x86_cpuid_level here for now to not confuse. */
279 if (xlvl >= 0x80860001)
280 c->x86_capability[2] = cpuid_edx(0x80860001);
281 }
282
283 if (c->extended_cpuid_level >= 0x80000007)
284 c->x86_power = cpuid_edx(0x80000007);
285
286 if (c->extended_cpuid_level >= 0x80000008) {
287 u32 eax = cpuid_eax(0x80000008);
288
289 c->x86_virt_bits = (eax >> 8) & 0xff;
290 c->x86_phys_bits = eax & 0xff;
291 }
292}
293
294/* Do some early cpuid on the boot CPU to get some parameter that are
295 needed before check_bugs. Everything advanced is in identify_cpu
296 below. */
297static void __init early_identify_cpu(struct cpuinfo_x86 *c)
298{
299
300 c->x86_clflush_size = 64;
301 c->x86_cache_alignment = c->x86_clflush_size;
302
303 memset(&c->x86_capability, 0, sizeof c->x86_capability);
304
305 c->extended_cpuid_level = 0;
306
307 cpu_detect(c);
308
309 get_cpu_vendor(c);
310
311 get_cpu_cap(c);
312
313 if (this_cpu->c_early_init)
314 this_cpu->c_early_init(c);
315
316 validate_pat_support(c);
317}
318
319void __init early_cpu_init(void)
320{
321 struct cpu_dev **cdev;
322 int count = 0;
323
324 printk("KERNEL supported cpus:\n");
325 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
326 struct cpu_dev *cpudev = *cdev;
327 unsigned int j;
328
329 if (count >= X86_VENDOR_NUM)
330 break;
331 cpu_devs[count] = cpudev;
332 count++;
333
334 for (j = 0; j < 2; j++) {
335 if (!cpudev->c_ident[j])
336 continue;
337 printk(" %s %s\n", cpudev->c_vendor,
338 cpudev->c_ident[j]);
339 }
340 }
341
342 early_identify_cpu(&boot_cpu_data);
343}
344
345/*
346 * The NOPL instruction is supposed to exist on all CPUs with
347 * family >= 6, unfortunately, that's not true in practice because
348 * of early VIA chips and (more importantly) broken virtualizers that
349 * are not easy to detect. Hence, probe for it based on first
350 * principles.
351 *
352 * Note: no 64-bit chip is known to lack these, but put the code here
353 * for consistency with 32 bits, and to make it utterly trivial to
354 * diagnose the problem should it ever surface.
355 */
356static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
357{
358 const u32 nopl_signature = 0x888c53b1; /* Random number */
359 u32 has_nopl = nopl_signature;
360
361 clear_cpu_cap(c, X86_FEATURE_NOPL);
362 if (c->x86 >= 6) {
363 asm volatile("\n"
364 "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
365 "2:\n"
366 " .section .fixup,\"ax\"\n"
367 "3: xor %0,%0\n"
368 " jmp 2b\n"
369 " .previous\n"
370 _ASM_EXTABLE(1b,3b)
371 : "+a" (has_nopl));
372
373 if (has_nopl == nopl_signature)
374 set_cpu_cap(c, X86_FEATURE_NOPL);
375 }
376}
377
378static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
379{
380 c->extended_cpuid_level = 0;
381
382 cpu_detect(c);
383
384 get_cpu_vendor(c);
385
386 get_cpu_cap(c);
387
388 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
389#ifdef CONFIG_SMP
390 c->phys_proc_id = c->initial_apicid;
391#endif
392
393 if (c->extended_cpuid_level >= 0x80000004)
394 get_model_name(c); /* Default name */
395
396 init_scattered_cpuid_features(c);
397 detect_nopl(c);
398}
399
400/*
401 * This does the hard work of actually picking apart the CPU stuff...
402 */
403static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
404{
405 int i;
406
407 c->loops_per_jiffy = loops_per_jiffy;
408 c->x86_cache_size = -1;
409 c->x86_vendor = X86_VENDOR_UNKNOWN;
410 c->x86_model = c->x86_mask = 0; /* So far unknown... */
411 c->x86_vendor_id[0] = '\0'; /* Unset */
412 c->x86_model_id[0] = '\0'; /* Unset */
413 c->x86_max_cores = 1;
414 c->x86_coreid_bits = 0;
415 c->x86_clflush_size = 64;
416 c->x86_cache_alignment = c->x86_clflush_size;
417 memset(&c->x86_capability, 0, sizeof c->x86_capability);
418
419 generic_identify(c);
420
421 c->apicid = phys_pkg_id(0);
422
423 /*
424 * Vendor-specific initialization. In this section we
425 * canonicalize the feature flags, meaning if there are
426 * features a certain CPU supports which CPUID doesn't
427 * tell us, CPUID claiming incorrect flags, or other bugs,
428 * we handle them here.
429 *
430 * At the end of this section, c->x86_capability better
431 * indicate the features this CPU genuinely supports!
432 */
433 if (this_cpu->c_init)
434 this_cpu->c_init(c);
435
436 detect_ht(c);
437
438 /*
439 * On SMP, boot_cpu_data holds the common feature set between
440 * all CPUs; so make sure that we indicate which features are
441 * common between the CPUs. The first time this routine gets
442 * executed, c == &boot_cpu_data.
443 */
444 if (c != &boot_cpu_data) {
445 /* AND the already accumulated flags with these */
446 for (i = 0; i < NCAPINTS; i++)
447 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
448 }
449
450 /* Clear all flags overriden by options */
451 for (i = 0; i < NCAPINTS; i++)
452 c->x86_capability[i] &= ~cleared_cpu_caps[i];
453
454#ifdef CONFIG_X86_MCE
455 mcheck_init(c);
456#endif
457 select_idle_routine(c);
458
459#ifdef CONFIG_NUMA
460 numa_add_cpu(smp_processor_id());
461#endif
462
463}
464
465void __init identify_boot_cpu(void)
466{
467 identify_cpu(&boot_cpu_data);
468}
469
470void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
471{
472 BUG_ON(c == &boot_cpu_data);
473 identify_cpu(c);
474 mtrr_ap_init();
475}
476
477struct msr_range {
478 unsigned min;
479 unsigned max;
480};
481
482static struct msr_range msr_range_array[] __cpuinitdata = {
483 { 0x00000000, 0x00000418},
484 { 0xc0000000, 0xc000040b},
485 { 0xc0010000, 0xc0010142},
486 { 0xc0011000, 0xc001103b},
487};
488
489static void __cpuinit print_cpu_msr(void)
490{
491 unsigned index;
492 u64 val;
493 int i;
494 unsigned index_min, index_max;
495
496 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
497 index_min = msr_range_array[i].min;
498 index_max = msr_range_array[i].max;
499 for (index = index_min; index < index_max; index++) {
500 if (rdmsrl_amd_safe(index, &val))
501 continue;
502 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
503 }
504 }
505}
506
507static int show_msr __cpuinitdata;
508static __init int setup_show_msr(char *arg)
509{
510 int num;
511
512 get_option(&arg, &num);
513
514 if (num > 0)
515 show_msr = num;
516 return 1;
517}
518__setup("show_msr=", setup_show_msr);
519
520static __init int setup_noclflush(char *arg)
521{
522 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
523 return 1;
524}
525__setup("noclflush", setup_noclflush);
526
527void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
528{
529 if (c->x86_model_id[0])
530 printk(KERN_CONT "%s", c->x86_model_id);
531
532 if (c->x86_mask || c->cpuid_level >= 0)
533 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
534 else
535 printk(KERN_CONT "\n");
536
537#ifdef CONFIG_SMP
538 if (c->cpu_index < show_msr)
539 print_cpu_msr();
540#else
541 if (show_msr)
542 print_cpu_msr();
543#endif
544}
545
546static __init int setup_disablecpuid(char *arg)
547{
548 int bit;
549 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
550 setup_clear_cpu_cap(bit);
551 else
552 return 0;
553 return 1;
554}
555__setup("clearcpuid=", setup_disablecpuid);
556
557cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
558
559struct x8664_pda **_cpu_pda __read_mostly;
560EXPORT_SYMBOL(_cpu_pda);
561
562struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
563
564char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
565
566unsigned long __supported_pte_mask __read_mostly = ~0UL;
567EXPORT_SYMBOL_GPL(__supported_pte_mask);
568
569static int do_not_nx __cpuinitdata;
570
571/* noexec=on|off
572Control non executable mappings for 64bit processes.
573
574on Enable(default)
575off Disable
576*/
577static int __init nonx_setup(char *str)
578{
579 if (!str)
580 return -EINVAL;
581 if (!strncmp(str, "on", 2)) {
582 __supported_pte_mask |= _PAGE_NX;
583 do_not_nx = 0;
584 } else if (!strncmp(str, "off", 3)) {
585 do_not_nx = 1;
586 __supported_pte_mask &= ~_PAGE_NX;
587 }
588 return 0;
589}
590early_param("noexec", nonx_setup);
591
592int force_personality32;
593
594/* noexec32=on|off
595Control non executable heap for 32bit processes.
596To control the stack too use noexec=off
597
598on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
599off PROT_READ implies PROT_EXEC
600*/
601static int __init nonx32_setup(char *str)
602{
603 if (!strcmp(str, "on"))
604 force_personality32 &= ~READ_IMPLIES_EXEC;
605 else if (!strcmp(str, "off"))
606 force_personality32 |= READ_IMPLIES_EXEC;
607 return 1;
608}
609__setup("noexec32=", nonx32_setup);
610
611void pda_init(int cpu)
612{
613 struct x8664_pda *pda = cpu_pda(cpu);
614
615 /* Setup up data that may be needed in __get_free_pages early */
616 loadsegment(fs, 0);
617 loadsegment(gs, 0);
618 /* Memory clobbers used to order PDA accessed */
619 mb();
620 wrmsrl(MSR_GS_BASE, pda);
621 mb();
622
623 pda->cpunumber = cpu;
624 pda->irqcount = -1;
625 pda->kernelstack = (unsigned long)stack_thread_info() -
626 PDA_STACKOFFSET + THREAD_SIZE;
627 pda->active_mm = &init_mm;
628 pda->mmu_state = 0;
629
630 if (cpu == 0) {
631 /* others are initialized in smpboot.c */
632 pda->pcurrent = &init_task;
633 pda->irqstackptr = boot_cpu_stack;
634 pda->irqstackptr += IRQSTACKSIZE - 64;
635 } else {
636 if (!pda->irqstackptr) {
637 pda->irqstackptr = (char *)
638 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
639 if (!pda->irqstackptr)
640 panic("cannot allocate irqstack for cpu %d",
641 cpu);
642 pda->irqstackptr += IRQSTACKSIZE - 64;
643 }
644
645 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
646 pda->nodenumber = cpu_to_node(cpu);
647 }
648}
649
650char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
651 DEBUG_STKSZ] __page_aligned_bss;
652
653extern asmlinkage void ignore_sysret(void);
654
655/* May not be marked __init: used by software suspend */
656void syscall_init(void)
657{
658 /*
659 * LSTAR and STAR live in a bit strange symbiosis.
660 * They both write to the same internal register. STAR allows to
661 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
662 */
663 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
664 wrmsrl(MSR_LSTAR, system_call);
665 wrmsrl(MSR_CSTAR, ignore_sysret);
666
667#ifdef CONFIG_IA32_EMULATION
668 syscall32_cpu_init();
669#endif
670
671 /* Flags to clear on syscall */
672 wrmsrl(MSR_SYSCALL_MASK,
673 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
674}
675
676void __cpuinit check_efer(void)
677{
678 unsigned long efer;
679
680 rdmsrl(MSR_EFER, efer);
681 if (!(efer & EFER_NX) || do_not_nx)
682 __supported_pte_mask &= ~_PAGE_NX;
683}
684
685unsigned long kernel_eflags;
686
687/*
688 * Copies of the original ist values from the tss are only accessed during
689 * debugging, no special alignment required.
690 */
691DEFINE_PER_CPU(struct orig_ist, orig_ist);
692
693/*
694 * cpu_init() initializes state that is per-CPU. Some data is already
695 * initialized (naturally) in the bootstrap process, such as the GDT
696 * and IDT. We reload them nevertheless, this function acts as a
697 * 'CPU state barrier', nothing should get across.
698 * A lot of state is already set up in PDA init.
699 */
700void __cpuinit cpu_init(void)
701{
702 int cpu = stack_smp_processor_id();
703 struct tss_struct *t = &per_cpu(init_tss, cpu);
704 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
705 unsigned long v;
706 char *estacks = NULL;
707 struct task_struct *me;
708 int i;
709
710 /* CPU 0 is initialised in head64.c */
711 if (cpu != 0)
712 pda_init(cpu);
713 else
714 estacks = boot_exception_stacks;
715
716 me = current;
717
718 if (cpu_test_and_set(cpu, cpu_initialized))
719 panic("CPU#%d already initialized!\n", cpu);
720
721 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
722
723 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
724
725 /*
726 * Initialize the per-CPU GDT with the boot GDT,
727 * and set up the GDT descriptor:
728 */
729
730 switch_to_new_gdt();
731 load_idt((const struct desc_ptr *)&idt_descr);
732
733 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
734 syscall_init();
735
736 wrmsrl(MSR_FS_BASE, 0);
737 wrmsrl(MSR_KERNEL_GS_BASE, 0);
738 barrier();
739
740 check_efer();
741 if (cpu != 0 && x2apic)
742 enable_x2apic();
743
744 /*
745 * set up and load the per-CPU TSS
746 */
747 if (!orig_ist->ist[0]) {
748 static const unsigned int order[N_EXCEPTION_STACKS] = {
749 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
750 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
751 };
752 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
753 if (cpu) {
754 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
755 if (!estacks)
756 panic("Cannot allocate exception "
757 "stack %ld %d\n", v, cpu);
758 }
759 estacks += PAGE_SIZE << order[v];
760 orig_ist->ist[v] = t->x86_tss.ist[v] =
761 (unsigned long)estacks;
762 }
763 }
764
765 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
766 /*
767 * <= is required because the CPU will access up to
768 * 8 bits beyond the end of the IO permission bitmap.
769 */
770 for (i = 0; i <= IO_BITMAP_LONGS; i++)
771 t->io_bitmap[i] = ~0UL;
772
773 atomic_inc(&init_mm.mm_count);
774 me->active_mm = &init_mm;
775 if (me->mm)
776 BUG();
777 enter_lazy_tlb(&init_mm, me);
778
779 load_sp0(t, &current->thread);
780 set_tss_desc(cpu, t);
781 load_TR_desc();
782 load_LDT(&init_mm.context);
783
784#ifdef CONFIG_KGDB
785 /*
786 * If the kgdb is connected no debug regs should be altered. This
787 * is only applicable when KGDB and a KGDB I/O module are built
788 * into the kernel and you are using early debugging with
789 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
790 */
791 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
792 arch_kgdb_ops.correct_hw_break();
793 else {
794#endif
795 /*
796 * Clear all 6 debug registers:
797 */
798
799 set_debugreg(0UL, 0);
800 set_debugreg(0UL, 1);
801 set_debugreg(0UL, 2);
802 set_debugreg(0UL, 3);
803 set_debugreg(0UL, 6);
804 set_debugreg(0UL, 7);
805#ifdef CONFIG_KGDB
806 /* If the kgdb is connected no debug regs should be altered. */
807 }
808#endif
809
810 fpu_init();
811
812 raw_local_save_flags(kernel_eflags);
813
814 if (is_uv_system())
815 uv_cpu_init();
816}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 3cc9d92afd8f..de4094a39210 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -31,7 +31,6 @@ struct cpu_dev {
31 31
32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; 32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
33 33
34extern int get_model_name(struct cpuinfo_x86 *c);
35extern void display_cacheinfo(struct cpuinfo_x86 *c); 34extern void display_cacheinfo(struct cpuinfo_x86 *c);
36 35
37#endif 36#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index dd097b835839..c24c4a487b7c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask)
256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and 256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
257 * no meaning should be associated with absolute values of these MSRs. 257 * no meaning should be associated with absolute values of these MSRs.
258 */ 258 */
259static unsigned int get_measured_perf(unsigned int cpu) 259static unsigned int get_measured_perf(struct cpufreq_policy *policy,
260 unsigned int cpu)
260{ 261{
261 union { 262 union {
262 struct { 263 struct {
@@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
326 327
327#endif 328#endif
328 329
329 retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; 330 retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
330 331
331 put_cpu(); 332 put_cpu();
332 set_cpus_allowed_ptr(current, &saved_mask); 333 set_cpus_allowed_ptr(current, &saved_mask);
@@ -785,7 +786,11 @@ static int __init acpi_cpufreq_init(void)
785 if (ret) 786 if (ret)
786 return ret; 787 return ret;
787 788
788 return cpufreq_register_driver(&acpi_cpufreq_driver); 789 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
790 if (ret)
791 free_percpu(acpi_perf_data);
792
793 return ret;
789} 794}
790 795
791static void __exit acpi_cpufreq_exit(void) 796static void __exit acpi_cpufreq_exit(void)
@@ -795,8 +800,6 @@ static void __exit acpi_cpufreq_exit(void)
795 cpufreq_unregister_driver(&acpi_cpufreq_driver); 800 cpufreq_unregister_driver(&acpi_cpufreq_driver);
796 801
797 free_percpu(acpi_perf_data); 802 free_percpu(acpi_perf_data);
798
799 return;
800} 803}
801 804
802module_param(acpi_pstate_strict, uint, 0644); 805module_param(acpi_pstate_strict, uint, 0644);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index e4a4bf870e94..fe613c93b366 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -25,8 +25,8 @@
25#include <linux/cpufreq.h> 25#include <linux/cpufreq.h>
26 26
27#include <asm/msr.h> 27#include <asm/msr.h>
28#include <asm/timex.h> 28#include <linux/timex.h>
29#include <asm/io.h> 29#include <linux/io.h>
30 30
31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ 31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ 32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
@@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
82 u8 clockspeed_reg; /* Clock Speed Register */ 82 u8 clockspeed_reg; /* Clock Speed Register */
83 83
84 local_irq_disable(); 84 local_irq_disable();
85 outb_p(0x80,REG_CSCIR); 85 outb_p(0x80, REG_CSCIR);
86 clockspeed_reg = inb_p(REG_CSCDR); 86 clockspeed_reg = inb_p(REG_CSCDR);
87 local_irq_enable(); 87 local_irq_enable();
88 88
@@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
98 } 98 }
99 99
100 /* 33 MHz is not 32 MHz... */ 100 /* 33 MHz is not 32 MHz... */
101 if ((clockspeed_reg & 0xE0)==0xA0) 101 if ((clockspeed_reg & 0xE0) == 0xA0)
102 return 33000; 102 return 33000;
103 103
104 return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); 104 return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
105} 105}
106 106
107 107
@@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
117 * There is no return value. 117 * There is no return value.
118 */ 118 */
119 119
120static void elanfreq_set_cpu_state (unsigned int state) 120static void elanfreq_set_cpu_state(unsigned int state)
121{ 121{
122 struct cpufreq_freqs freqs; 122 struct cpufreq_freqs freqs;
123 123
@@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state)
144 */ 144 */
145 145
146 local_irq_disable(); 146 local_irq_disable();
147 outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ 147 outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
148 outb_p(0x00,REG_CSCDR); 148 outb_p(0x00, REG_CSCDR);
149 local_irq_enable(); /* wait till internal pipelines and */ 149 local_irq_enable(); /* wait till internal pipelines and */
150 udelay(1000); /* buffers have cleaned up */ 150 udelay(1000); /* buffers have cleaned up */
151 151
152 local_irq_disable(); 152 local_irq_disable();
153 153
154 /* now, set the CPU clock speed register (0x80) */ 154 /* now, set the CPU clock speed register (0x80) */
155 outb_p(0x80,REG_CSCIR); 155 outb_p(0x80, REG_CSCIR);
156 outb_p(elan_multiplier[state].val80h,REG_CSCDR); 156 outb_p(elan_multiplier[state].val80h, REG_CSCDR);
157 157
158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ 158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
159 outb_p(0x40,REG_CSCIR); 159 outb_p(0x40, REG_CSCIR);
160 outb_p(elan_multiplier[state].val40h,REG_CSCDR); 160 outb_p(elan_multiplier[state].val40h, REG_CSCDR);
161 udelay(10000); 161 udelay(10000);
162 local_irq_enable(); 162 local_irq_enable();
163 163
@@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state)
173 * for the hardware supported by the driver. 173 * for the hardware supported by the driver.
174 */ 174 */
175 175
176static int elanfreq_verify (struct cpufreq_policy *policy) 176static int elanfreq_verify(struct cpufreq_policy *policy)
177{ 177{
178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); 178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
179} 179}
180 180
181static int elanfreq_target (struct cpufreq_policy *policy, 181static int elanfreq_target(struct cpufreq_policy *policy,
182 unsigned int target_freq, 182 unsigned int target_freq,
183 unsigned int relation) 183 unsigned int relation)
184{ 184{
@@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
205 205
206 /* capability check */ 206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) || 207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model!=10)) 208 (c->x86 != 4) || (c->x86_model != 10))
209 return -ENODEV; 209 return -ENODEV;
210 210
211 /* max freq */ 211 /* max freq */
@@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
213 max_freq = elanfreq_get_cpu_frequency(0); 213 max_freq = elanfreq_get_cpu_frequency(0);
214 214
215 /* table init */ 215 /* table init */
216 for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { 216 for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq) 217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; 218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 } 219 }
@@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
224 224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); 225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result) 226 if (result)
227 return (result); 227 return result;
228 228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); 229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230 return 0; 230 return 0;
@@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup);
260#endif 260#endif
261 261
262 262
263static struct freq_attr* elanfreq_attr[] = { 263static struct freq_attr *elanfreq_attr[] = {
264 &cpufreq_freq_attr_scaling_available_freqs, 264 &cpufreq_freq_attr_scaling_available_freqs,
265 NULL, 265 NULL,
266}; 266};
@@ -284,9 +284,9 @@ static int __init elanfreq_init(void)
284 284
285 /* Test if we have the right hardware */ 285 /* Test if we have the right hardware */
286 if ((c->x86_vendor != X86_VENDOR_AMD) || 286 if ((c->x86_vendor != X86_VENDOR_AMD) ||
287 (c->x86 != 4) || (c->x86_model!=10)) { 287 (c->x86 != 4) || (c->x86_model != 10)) {
288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); 288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
289 return -ENODEV; 289 return -ENODEV;
290 } 290 }
291 return cpufreq_register_driver(&elanfreq_driver); 291 return cpufreq_register_driver(&elanfreq_driver);
292} 292}
@@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void)
298} 298}
299 299
300 300
301module_param (max_freq, int, 0444); 301module_param(max_freq, int, 0444);
302 302
303MODULE_LICENSE("GPL"); 303MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); 304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index f1685fb91fbd..b8e05ee4f736 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
171 } 171 }
172 172
173 if (c->x86 != 0xF) { 173 if (c->x86 != 0xF) {
174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n"); 174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
175 return 0; 175 return 0;
176 } 176 }
177 177
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index eb9b62b0830c..b5ced806a316 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -15,12 +15,11 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include <asm/msr.h> 17#include <asm/msr.h>
18#include <asm/timex.h> 18#include <linux/timex.h>
19#include <asm/io.h> 19#include <linux/io.h>
20 20
21 21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long 22 as it is unused */
23 as it is unused */
24 23
25static unsigned int busfreq; /* FSB, in 10 kHz */ 24static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier; 25static unsigned int max_multiplier;
@@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void)
53 52
54 msrval = POWERNOW_IOPORT + 0x1; 53 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 54 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue=inl(POWERNOW_IOPORT + 0x8); 55 invalue = inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0; 56 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 57 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59 58
@@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void)
67 * 66 *
68 * Tries to change the PowerNow! multiplier 67 * Tries to change the PowerNow! multiplier
69 */ 68 */
70static void powernow_k6_set_state (unsigned int best_i) 69static void powernow_k6_set_state(unsigned int best_i)
71{ 70{
72 unsigned long outvalue=0, invalue=0; 71 unsigned long outvalue = 0, invalue = 0;
73 unsigned long msrval; 72 unsigned long msrval;
74 struct cpufreq_freqs freqs; 73 struct cpufreq_freqs freqs;
75 74
@@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i)
90 89
91 msrval = POWERNOW_IOPORT + 0x1; 90 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 91 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue=inl(POWERNOW_IOPORT + 0x8); 92 invalue = inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf; 93 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue; 94 outvalue = outvalue | invalue;
96 outl(outvalue ,(POWERNOW_IOPORT + 0x8)); 95 outl(outvalue , (POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0; 96 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 97 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99 98
@@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
124 * 123 *
125 * sets a new CPUFreq policy 124 * sets a new CPUFreq policy
126 */ 125 */
127static int powernow_k6_target (struct cpufreq_policy *policy, 126static int powernow_k6_target(struct cpufreq_policy *policy,
128 unsigned int target_freq, 127 unsigned int target_freq,
129 unsigned int relation) 128 unsigned int relation)
130{ 129{
@@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
152 busfreq = cpu_khz / max_multiplier; 151 busfreq = cpu_khz / max_multiplier;
153 152
154 /* table init */ 153 /* table init */
155 for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { 154 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
156 if (clock_ratio[i].index > max_multiplier) 155 if (clock_ratio[i].index > max_multiplier)
157 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; 156 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
158 else 157 else
@@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
165 164
166 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 165 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
167 if (result) 166 if (result)
168 return (result); 167 return result;
169 168
170 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); 169 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
171 170
@@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
176static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) 175static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
177{ 176{
178 unsigned int i; 177 unsigned int i;
179 for (i=0; i<8; i++) { 178 for (i = 0; i < 8; i++) {
180 if (i==max_multiplier) 179 if (i == max_multiplier)
181 powernow_k6_set_state(i); 180 powernow_k6_set_state(i);
182 } 181 }
183 cpufreq_frequency_table_put_attr(policy->cpu); 182 cpufreq_frequency_table_put_attr(policy->cpu);
@@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu)
189 return busfreq * powernow_k6_get_cpu_multiplier(); 188 return busfreq * powernow_k6_get_cpu_multiplier();
190} 189}
191 190
192static struct freq_attr* powernow_k6_attr[] = { 191static struct freq_attr *powernow_k6_attr[] = {
193 &cpufreq_freq_attr_scaling_available_freqs, 192 &cpufreq_freq_attr_scaling_available_freqs,
194 NULL, 193 NULL,
195}; 194};
@@ -227,7 +226,7 @@ static int __init powernow_k6_init(void)
227 } 226 }
228 227
229 if (cpufreq_register_driver(&powernow_k6_driver)) { 228 if (cpufreq_register_driver(&powernow_k6_driver)) {
230 release_region (POWERNOW_IOPORT, 16); 229 release_region(POWERNOW_IOPORT, 16);
231 return -EINVAL; 230 return -EINVAL;
232 } 231 }
233 232
@@ -243,13 +242,13 @@ static int __init powernow_k6_init(void)
243static void __exit powernow_k6_exit(void) 242static void __exit powernow_k6_exit(void)
244{ 243{
245 cpufreq_unregister_driver(&powernow_k6_driver); 244 cpufreq_unregister_driver(&powernow_k6_driver);
246 release_region (POWERNOW_IOPORT, 16); 245 release_region(POWERNOW_IOPORT, 16);
247} 246}
248 247
249 248
250MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); 249MODULE_AUTHOR("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
251MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); 250MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
252MODULE_LICENSE ("GPL"); 251MODULE_LICENSE("GPL");
253 252
254module_init(powernow_k6_init); 253module_init(powernow_k6_init);
255module_exit(powernow_k6_exit); 254module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 15e13c01cc36..3b5f06423e77 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -26,7 +26,7 @@
26#include <asm/cpufeature.h> 26#include <asm/cpufeature.h>
27 27
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@vger.kernel.org"
30 30
31#define dprintk(msg...) \ 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 3f8c7283d816..ffd0f5ed071a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -301,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
301 */ 301 */
302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) 302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
303 geode_configure(); 303 geode_configure();
304 get_model_name(c); /* get CPU marketing name */
305 return; 304 return;
306 } else { /* MediaGX */ 305 } else { /* MediaGX */
307 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; 306 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 959417b8cd64..99468dbd08da 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -15,6 +15,11 @@
15#include <asm/ds.h> 15#include <asm/ds.h>
16#include <asm/bugs.h> 16#include <asm/bugs.h>
17 17
18#ifdef CONFIG_X86_64
19#include <asm/topology.h>
20#include <asm/numa_64.h>
21#endif
22
18#include "cpu.h" 23#include "cpu.h"
19 24
20#ifdef CONFIG_X86_LOCAL_APIC 25#ifdef CONFIG_X86_LOCAL_APIC
@@ -25,14 +30,20 @@
25 30
26static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 31static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
27{ 32{
28 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
29 if (c->x86 == 15 && c->x86_cache_alignment == 64)
30 c->x86_cache_alignment = 128;
31 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 33 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
32 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 34 (c->x86 == 0x6 && c->x86_model >= 0x0e))
33 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 35 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
36
37#ifdef CONFIG_X86_64
38 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
39#else
40 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
41 if (c->x86 == 15 && c->x86_cache_alignment == 64)
42 c->x86_cache_alignment = 128;
43#endif
34} 44}
35 45
46#ifdef CONFIG_X86_32
36/* 47/*
37 * Early probe support logic for ppro memory erratum #50 48 * Early probe support logic for ppro memory erratum #50
38 * 49 *
@@ -52,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void)
52 return 0; 63 return 0;
53} 64}
54 65
66#ifdef CONFIG_X86_F00F_BUG
67static void __cpuinit trap_init_f00f_bug(void)
68{
69 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
55 70
56/* 71 /*
57 * P4 Xeon errata 037 workaround. 72 * Update the IDT descriptor and reload the IDT so that
58 * Hardware prefetcher may cause stale data to be loaded into the cache. 73 * it uses the read-only mapped virtual address.
59 */ 74 */
60static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) 75 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
76 load_idt(&idt_descr);
77}
78#endif
79
80static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
61{ 81{
62 unsigned long lo, hi; 82 unsigned long lo, hi;
63 83
84#ifdef CONFIG_X86_F00F_BUG
85 /*
86 * All current models of Pentium and Pentium with MMX technology CPUs
87 * have the F0 0F bug, which lets nonprivileged users lock up the system.
88 * Note that the workaround only should be initialized once...
89 */
90 c->f00f_bug = 0;
91 if (!paravirt_enabled() && c->x86 == 5) {
92 static int f00f_workaround_enabled;
93
94 c->f00f_bug = 1;
95 if (!f00f_workaround_enabled) {
96 trap_init_f00f_bug();
97 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
98 f00f_workaround_enabled = 1;
99 }
100 }
101#endif
102
103 /*
104 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
105 * model 3 mask 3
106 */
107 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
108 clear_cpu_cap(c, X86_FEATURE_SEP);
109
110 /*
111 * P4 Xeon errata 037 workaround.
112 * Hardware prefetcher may cause stale data to be loaded into the cache.
113 */
64 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { 114 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
65 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); 115 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
66 if ((lo & (1<<9)) == 0) { 116 if ((lo & (1<<9)) == 0) {
@@ -70,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
70 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); 120 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
71 } 121 }
72 } 122 }
123
124 /*
125 * See if we have a good local APIC by checking for buggy Pentia,
126 * i.e. all B steppings and the C2 stepping of P54C when using their
127 * integrated APIC (see 11AP erratum in "Pentium Processor
128 * Specification Update").
129 */
130 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
131 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
132 set_cpu_cap(c, X86_FEATURE_11AP);
133
134
135#ifdef CONFIG_X86_INTEL_USERCOPY
136 /*
137 * Set up the preferred alignment for movsl bulk memory moves
138 */
139 switch (c->x86) {
140 case 4: /* 486: untested */
141 break;
142 case 5: /* Old Pentia: untested */
143 break;
144 case 6: /* PII/PIII only like movsl with 8-byte alignment */
145 movsl_mask.mask = 7;
146 break;
147 case 15: /* P4 is OK down to 8-byte alignment */
148 movsl_mask.mask = 7;
149 break;
150 }
151#endif
152
153#ifdef CONFIG_X86_NUMAQ
154 numaq_tsc_disable();
155#endif
156}
157#else
158static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
159{
73} 160}
161#endif
74 162
163static void __cpuinit srat_detect_node(void)
164{
165#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
166 unsigned node;
167 int cpu = smp_processor_id();
168 int apicid = hard_smp_processor_id();
169
170 /* Don't do the funky fallback heuristics the AMD version employs
171 for now. */
172 node = apicid_to_node[apicid];
173 if (node == NUMA_NO_NODE || !node_online(node))
174 node = first_node(node_online_map);
175 numa_set_node(cpu, node);
176
177 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
178#endif
179}
75 180
76/* 181/*
77 * find out the number of processor cores on the die 182 * find out the number of processor cores on the die
78 */ 183 */
79static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) 184static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
80{ 185{
81 unsigned int eax, ebx, ecx, edx; 186 unsigned int eax, ebx, ecx, edx;
82 187
@@ -91,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
91 return 1; 196 return 1;
92} 197}
93 198
94#ifdef CONFIG_X86_F00F_BUG 199static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
95static void __cpuinit trap_init_f00f_bug(void)
96{ 200{
97 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 201 /* Intel VMX MSR indicated features */
98 202#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
99 /* 203#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
100 * Update the IDT descriptor and reload the IDT so that 204#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
101 * it uses the read-only mapped virtual address. 205#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
102 */ 206#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
103 idt_descr.address = fix_to_virt(FIX_F00F_IDT); 207#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
104 load_idt(&idt_descr); 208
209 u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
210
211 clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
212 clear_cpu_cap(c, X86_FEATURE_VNMI);
213 clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
214 clear_cpu_cap(c, X86_FEATURE_EPT);
215 clear_cpu_cap(c, X86_FEATURE_VPID);
216
217 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
218 msr_ctl = vmx_msr_high | vmx_msr_low;
219 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
220 set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
221 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
222 set_cpu_cap(c, X86_FEATURE_VNMI);
223 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
224 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
225 vmx_msr_low, vmx_msr_high);
226 msr_ctl2 = vmx_msr_high | vmx_msr_low;
227 if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
228 (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
229 set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
230 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
231 set_cpu_cap(c, X86_FEATURE_EPT);
232 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
233 set_cpu_cap(c, X86_FEATURE_VPID);
234 }
105} 235}
106#endif
107 236
108static void __cpuinit init_intel(struct cpuinfo_x86 *c) 237static void __cpuinit init_intel(struct cpuinfo_x86 *c)
109{ 238{
110 unsigned int l2 = 0; 239 unsigned int l2 = 0;
111 char *p = NULL;
112 240
113 early_init_intel(c); 241 early_init_intel(c);
114 242
115#ifdef CONFIG_X86_F00F_BUG 243 intel_workarounds(c);
116 /*
117 * All current models of Pentium and Pentium with MMX technology CPUs
118 * have the F0 0F bug, which lets nonprivileged users lock up the system.
119 * Note that the workaround only should be initialized once...
120 */
121 c->f00f_bug = 0;
122 if (!paravirt_enabled() && c->x86 == 5) {
123 static int f00f_workaround_enabled;
124
125 c->f00f_bug = 1;
126 if (!f00f_workaround_enabled) {
127 trap_init_f00f_bug();
128 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
129 f00f_workaround_enabled = 1;
130 }
131 }
132#endif
133 244
134 l2 = init_intel_cacheinfo(c); 245 l2 = init_intel_cacheinfo(c);
135 if (c->cpuid_level > 9) { 246 if (c->cpuid_level > 9) {
@@ -139,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
139 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 250 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
140 } 251 }
141 252
142 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ 253 if (cpu_has_xmm2)
143 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) 254 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
144 clear_cpu_cap(c, X86_FEATURE_SEP); 255 if (cpu_has_ds) {
256 unsigned int l1;
257 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
258 if (!(l1 & (1<<11)))
259 set_cpu_cap(c, X86_FEATURE_BTS);
260 if (!(l1 & (1<<12)))
261 set_cpu_cap(c, X86_FEATURE_PEBS);
262 ds_init_intel(c);
263 }
145 264
265#ifdef CONFIG_X86_64
266 if (c->x86 == 15)
267 c->x86_cache_alignment = c->x86_clflush_size * 2;
268 if (c->x86 == 6)
269 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
270#else
146 /* 271 /*
147 * Names for the Pentium II/Celeron processors 272 * Names for the Pentium II/Celeron processors
148 * detectable only by also checking the cache size. 273 * detectable only by also checking the cache size.
149 * Dixon is NOT a Celeron. 274 * Dixon is NOT a Celeron.
150 */ 275 */
151 if (c->x86 == 6) { 276 if (c->x86 == 6) {
277 char *p = NULL;
278
152 switch (c->x86_model) { 279 switch (c->x86_model) {
153 case 5: 280 case 5:
154 if (c->x86_mask == 0) { 281 if (c->x86_mask == 0) {
@@ -171,77 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
171 p = "Celeron (Coppermine)"; 298 p = "Celeron (Coppermine)";
172 break; 299 break;
173 } 300 }
301
302 if (p)
303 strcpy(c->x86_model_id, p);
174 } 304 }
175 305
176 if (p) 306 if (c->x86 == 15)
177 strcpy(c->x86_model_id, p); 307 set_cpu_cap(c, X86_FEATURE_P4);
308 if (c->x86 == 6)
309 set_cpu_cap(c, X86_FEATURE_P3);
178 310
179 detect_extended_topology(c); 311 if (cpu_has_bts)
312 ptrace_bts_init_intel(c);
180 313
314#endif
315
316 detect_extended_topology(c);
181 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { 317 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
182 /* 318 /*
183 * let's use the legacy cpuid vector 0x1 and 0x4 for topology 319 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
184 * detection. 320 * detection.
185 */ 321 */
186 c->x86_max_cores = num_cpu_cores(c); 322 c->x86_max_cores = intel_num_cpu_cores(c);
323#ifdef CONFIG_X86_32
187 detect_ht(c); 324 detect_ht(c);
188 }
189
190 /* Work around errata */
191 Intel_errata_workarounds(c);
192
193#ifdef CONFIG_X86_INTEL_USERCOPY
194 /*
195 * Set up the preferred alignment for movsl bulk memory moves
196 */
197 switch (c->x86) {
198 case 4: /* 486: untested */
199 break;
200 case 5: /* Old Pentia: untested */
201 break;
202 case 6: /* PII/PIII only like movsl with 8-byte alignment */
203 movsl_mask.mask = 7;
204 break;
205 case 15: /* P4 is OK down to 8-byte alignment */
206 movsl_mask.mask = 7;
207 break;
208 }
209#endif 325#endif
210
211 if (cpu_has_xmm2)
212 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
213 if (c->x86 == 15) {
214 set_cpu_cap(c, X86_FEATURE_P4);
215 } 326 }
216 if (c->x86 == 6)
217 set_cpu_cap(c, X86_FEATURE_P3);
218 if (cpu_has_ds) {
219 unsigned int l1;
220 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
221 if (!(l1 & (1<<11)))
222 set_cpu_cap(c, X86_FEATURE_BTS);
223 if (!(l1 & (1<<12)))
224 set_cpu_cap(c, X86_FEATURE_PEBS);
225 }
226
227 if (cpu_has_bts)
228 ds_init_intel(c);
229 327
230 /* 328 /* Work around errata */
231 * See if we have a good local APIC by checking for buggy Pentia, 329 srat_detect_node();
232 * i.e. all B steppings and the C2 stepping of P54C when using their
233 * integrated APIC (see 11AP erratum in "Pentium Processor
234 * Specification Update").
235 */
236 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
237 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
238 set_cpu_cap(c, X86_FEATURE_11AP);
239 330
240#ifdef CONFIG_X86_NUMAQ 331 if (cpu_has(c, X86_FEATURE_VMX))
241 numaq_tsc_disable(); 332 detect_vmx_virtcap(c);
242#endif
243} 333}
244 334
335#ifdef CONFIG_X86_32
245static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) 336static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
246{ 337{
247 /* 338 /*
@@ -254,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
254 size = 256; 345 size = 256;
255 return size; 346 return size;
256} 347}
348#endif
257 349
258static struct cpu_dev intel_cpu_dev __cpuinitdata = { 350static struct cpu_dev intel_cpu_dev __cpuinitdata = {
259 .c_vendor = "Intel", 351 .c_vendor = "Intel",
260 .c_ident = { "GenuineIntel" }, 352 .c_ident = { "GenuineIntel" },
353#ifdef CONFIG_X86_32
261 .c_models = { 354 .c_models = {
262 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = 355 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
263 { 356 {
@@ -307,13 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
307 } 400 }
308 }, 401 },
309 }, 402 },
403 .c_size_cache = intel_size_cache,
404#endif
310 .c_early_init = early_init_intel, 405 .c_early_init = early_init_intel,
311 .c_init = init_intel, 406 .c_init = init_intel,
312 .c_size_cache = intel_size_cache,
313 .c_x86_vendor = X86_VENDOR_INTEL, 407 .c_x86_vendor = X86_VENDOR_INTEL,
314}; 408};
315 409
316cpu_dev_register(intel_cpu_dev); 410cpu_dev_register(intel_cpu_dev);
317 411
318/* arch_initcall(intel_cpu_init); */
319
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
deleted file mode 100644
index 0c0a58dfe099..000000000000
--- a/arch/x86/kernel/cpu/intel_64.c
+++ /dev/null
@@ -1,99 +0,0 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3#include <asm/processor.h>
4#include <asm/ptrace.h>
5#include <asm/topology.h>
6#include <asm/numa_64.h>
7
8#include "cpu.h"
9
10static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
11{
12 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
13 (c->x86 == 0x6 && c->x86_model >= 0x0e))
14 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15
16 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
17}
18
19/*
20 * find out the number of processor cores on the die
21 */
22static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
23{
24 unsigned int eax, t;
25
26 if (c->cpuid_level < 4)
27 return 1;
28
29 cpuid_count(4, 0, &eax, &t, &t, &t);
30
31 if (eax & 0x1f)
32 return ((eax >> 26) + 1);
33 else
34 return 1;
35}
36
37static void __cpuinit srat_detect_node(void)
38{
39#ifdef CONFIG_NUMA
40 unsigned node;
41 int cpu = smp_processor_id();
42 int apicid = hard_smp_processor_id();
43
44 /* Don't do the funky fallback heuristics the AMD version employs
45 for now. */
46 node = apicid_to_node[apicid];
47 if (node == NUMA_NO_NODE || !node_online(node))
48 node = first_node(node_online_map);
49 numa_set_node(cpu, node);
50
51 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
52#endif
53}
54
55static void __cpuinit init_intel(struct cpuinfo_x86 *c)
56{
57 init_intel_cacheinfo(c);
58 if (c->cpuid_level > 9) {
59 unsigned eax = cpuid_eax(10);
60 /* Check for version and the number of counters */
61 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
62 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
63 }
64
65 if (cpu_has_ds) {
66 unsigned int l1, l2;
67 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
68 if (!(l1 & (1<<11)))
69 set_cpu_cap(c, X86_FEATURE_BTS);
70 if (!(l1 & (1<<12)))
71 set_cpu_cap(c, X86_FEATURE_PEBS);
72 }
73
74
75 if (cpu_has_bts)
76 ds_init_intel(c);
77
78 if (c->x86 == 15)
79 c->x86_cache_alignment = c->x86_clflush_size * 2;
80 if (c->x86 == 6)
81 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
82 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
83
84 detect_extended_topology(c);
85 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY))
86 c->x86_max_cores = intel_num_cpu_cores(c);
87
88 srat_detect_node();
89}
90
91static struct cpu_dev intel_cpu_dev __cpuinitdata = {
92 .c_vendor = "Intel",
93 .c_ident = { "GenuineIntel" },
94 .c_early_init = early_init_intel,
95 .c_init = init_intel,
96 .c_x86_vendor = X86_VENDOR_INTEL,
97};
98
99cpu_dev_register(intel_cpu_dev);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 726a5fcdf341..4b031a4ac856 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -860,7 +860,7 @@ error:
860 return err; 860 return err;
861} 861}
862 862
863static void mce_remove_device(unsigned int cpu) 863static __cpuinit void mce_remove_device(unsigned int cpu)
864{ 864{
865 int i; 865 int i;
866 866
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index cb7d3b6a80eb..4e8d77f01eeb 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -401,12 +401,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
401 tmp |= ~((1<<(hi - 1)) - 1); 401 tmp |= ~((1<<(hi - 1)) - 1);
402 402
403 if (tmp != mask_lo) { 403 if (tmp != mask_lo) {
404 static int once = 1; 404 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
405
406 if (once) {
407 printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
408 once = 0;
409 }
410 mask_lo = tmp; 405 mask_lo = tmp;
411 } 406 }
412 } 407 }
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb3715..4c4214690dd1 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
405 } 405 }
406 /* RED-PEN: base can be > 32bit */ 406 /* RED-PEN: base can be > 32bit */
407 len += seq_printf(seq, 407 len += seq_printf(seq,
408 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", 408 "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
409 i, base, base >> (20 - PAGE_SHIFT), size, factor, 409 i, base, base >> (20 - PAGE_SHIFT), size, factor,
410 mtrr_attrib_to_str(type), mtrr_usage_table[i]); 410 mtrr_usage_table[i], mtrr_attrib_to_str(type));
411 } 411 }
412 } 412 }
413 return 0; 413 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 58ac5d3d4361..c78c04821ea1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
759 /* take out UC ranges */ 759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) { 760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type; 761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE) 762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
763 continue; 764 continue;
764 size = range_state[i].size_pfn; 765 size = range_state[i].size_pfn;
765 if (!size) 766 if (!size)
@@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str)
834 enable_mtrr_cleanup = 1; 835 enable_mtrr_cleanup = 1;
835 return 0; 836 return 0;
836} 837}
837early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); 838early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
839
840static int __init mtrr_cleanup_debug_setup(char *str)
841{
842 debug_print = 1;
843 return 0;
844}
845early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
838 846
839struct var_mtrr_state { 847struct var_mtrr_state {
840 unsigned long range_startk; 848 unsigned long range_startk;
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits)
898 } 906 }
899} 907}
900 908
909static unsigned long to_size_factor(unsigned long sizek, char *factorp)
910{
911 char factor;
912 unsigned long base = sizek;
913
914 if (base & ((1<<10) - 1)) {
915 /* not MB alignment */
916 factor = 'K';
917 } else if (base & ((1<<20) - 1)){
918 factor = 'M';
919 base >>= 10;
920 } else {
921 factor = 'G';
922 base >>= 20;
923 }
924
925 *factorp = factor;
926
927 return base;
928}
929
901static unsigned int __init 930static unsigned int __init
902range_to_mtrr(unsigned int reg, unsigned long range_startk, 931range_to_mtrr(unsigned int reg, unsigned long range_startk,
903 unsigned long range_sizek, unsigned char type) 932 unsigned long range_sizek, unsigned char type)
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
919 align = max_align; 948 align = max_align;
920 949
921 sizek = 1 << align; 950 sizek = 1 << align;
922 if (debug_print) 951 if (debug_print) {
952 char start_factor = 'K', size_factor = 'K';
953 unsigned long start_base, size_base;
954
955 start_base = to_size_factor(range_startk, &start_factor),
956 size_base = to_size_factor(sizek, &size_factor),
957
923 printk(KERN_DEBUG "Setting variable MTRR %d, " 958 printk(KERN_DEBUG "Setting variable MTRR %d, "
924 "base: %ldMB, range: %ldMB, type %s\n", 959 "base: %ld%cB, range: %ld%cB, type %s\n",
925 reg, range_startk >> 10, sizek >> 10, 960 reg, start_base, start_factor,
961 size_base, size_factor,
926 (type == MTRR_TYPE_UNCACHABLE)?"UC": 962 (type == MTRR_TYPE_UNCACHABLE)?"UC":
927 ((type == MTRR_TYPE_WRBACK)?"WB":"Other") 963 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
928 ); 964 );
965 }
929 save_var_mtrr(reg++, range_startk, sizek, type); 966 save_var_mtrr(reg++, range_startk, sizek, type);
930 range_startk += sizek; 967 range_startk += sizek;
931 range_sizek -= sizek; 968 range_sizek -= sizek;
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
970 /* try to append some small hole */ 1007 /* try to append some small hole */
971 range0_basek = state->range_startk; 1008 range0_basek = state->range_startk;
972 range0_sizek = ALIGN(state->range_sizek, chunk_sizek); 1009 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1010
1011 /* no increase */
973 if (range0_sizek == state->range_sizek) { 1012 if (range0_sizek == state->range_sizek) {
974 if (debug_print) 1013 if (debug_print)
975 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", 1014 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
980 return 0; 1019 return 0;
981 } 1020 }
982 1021
983 range0_sizek -= chunk_sizek; 1022 /* only cut back, when it is not the last */
984 if (range0_sizek && sizek) { 1023 if (sizek) {
985 while (range0_basek + range0_sizek > (basek + sizek)) { 1024 while (range0_basek + range0_sizek > (basek + sizek)) {
986 range0_sizek -= chunk_sizek; 1025 if (range0_sizek >= chunk_sizek)
987 if (!range0_sizek) 1026 range0_sizek -= chunk_sizek;
988 break; 1027 else
989 } 1028 range0_sizek = 0;
1029
1030 if (!range0_sizek)
1031 break;
1032 }
1033 }
1034
1035second_try:
1036 range_basek = range0_basek + range0_sizek;
1037
1038 /* one hole in the middle */
1039 if (range_basek > basek && range_basek <= (basek + sizek))
1040 second_sizek = range_basek - basek;
1041
1042 if (range0_sizek > state->range_sizek) {
1043
1044 /* one hole in middle or at end */
1045 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1046
1047 /* hole size should be less than half of range0 size */
1048 if (hole_sizek >= (range0_sizek >> 1) &&
1049 range0_sizek >= chunk_sizek) {
1050 range0_sizek -= chunk_sizek;
1051 second_sizek = 0;
1052 hole_sizek = 0;
1053
1054 goto second_try;
1055 }
990 } 1056 }
991 1057
992 if (range0_sizek) { 1058 if (range0_sizek) {
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
996 (range0_basek + range0_sizek)<<10); 1062 (range0_basek + range0_sizek)<<10);
997 state->reg = range_to_mtrr(state->reg, range0_basek, 1063 state->reg = range_to_mtrr(state->reg, range0_basek,
998 range0_sizek, MTRR_TYPE_WRBACK); 1064 range0_sizek, MTRR_TYPE_WRBACK);
999
1000 }
1001
1002 range_basek = range0_basek + range0_sizek;
1003 range_sizek = chunk_sizek;
1004
1005 if (range_basek + range_sizek > basek &&
1006 range_basek + range_sizek <= (basek + sizek)) {
1007 /* one hole */
1008 second_basek = basek;
1009 second_sizek = range_basek + range_sizek - basek;
1010 } 1065 }
1011 1066
1012 /* if last piece, only could one hole near end */ 1067 if (range0_sizek < state->range_sizek) {
1013 if ((second_basek || !basek) && 1068 /* need to handle left over */
1014 range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
1015 (chunk_sizek >> 1)) {
1016 /*
1017 * one hole in middle (second_sizek is 0) or at end
1018 * (second_sizek is 0 )
1019 */
1020 hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
1021 - second_sizek;
1022 hole_basek = range_basek + range_sizek - hole_sizek
1023 - second_sizek;
1024 } else {
1025 /* fallback for big hole, or several holes */
1026 range_sizek = state->range_sizek - range0_sizek; 1069 range_sizek = state->range_sizek - range0_sizek;
1027 second_basek = 0; 1070
1028 second_sizek = 0; 1071 if (debug_print)
1072 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1073 range_basek<<10,
1074 (range_basek + range_sizek)<<10);
1075 state->reg = range_to_mtrr(state->reg, range_basek,
1076 range_sizek, MTRR_TYPE_WRBACK);
1029 } 1077 }
1030 1078
1031 if (debug_print)
1032 printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
1033 (range_basek + range_sizek)<<10);
1034 state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
1035 MTRR_TYPE_WRBACK);
1036 if (hole_sizek) { 1079 if (hole_sizek) {
1080 hole_basek = range_basek - hole_sizek - second_sizek;
1037 if (debug_print) 1081 if (debug_print)
1038 printk(KERN_DEBUG "hole: %016lx - %016lx\n", 1082 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1039 hole_basek<<10, (hole_basek + hole_sizek)<<10); 1083 hole_basek<<10,
1040 state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, 1084 (hole_basek + hole_sizek)<<10);
1041 MTRR_TYPE_UNCACHABLE); 1085 state->reg = range_to_mtrr(state->reg, hole_basek,
1042 1086 hole_sizek, MTRR_TYPE_UNCACHABLE);
1043 } 1087 }
1044 1088
1045 return second_sizek; 1089 return second_sizek;
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result {
1154}; 1198};
1155 1199
1156/* 1200/*
1157 * gran_size: 1M, 2M, ..., 2G 1201 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1158 * chunk size: gran_size, ..., 4G 1202 * chunk size: gran_size, ..., 2G
1159 * so we need (2+13)*6 1203 * so we need (1+16)*8
1160 */ 1204 */
1161#define NUM_RESULT 90 1205#define NUM_RESULT 136
1162#define PSHIFT (PAGE_SHIFT - 10) 1206#define PSHIFT (PAGE_SHIFT - 10)
1163 1207
1164static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; 1208static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1168static int __init mtrr_cleanup(unsigned address_bits) 1212static int __init mtrr_cleanup(unsigned address_bits)
1169{ 1213{
1170 unsigned long extra_remove_base, extra_remove_size; 1214 unsigned long extra_remove_base, extra_remove_size;
1171 unsigned long i, base, size, def, dummy; 1215 unsigned long base, size, def, dummy;
1172 mtrr_type type; 1216 mtrr_type type;
1173 int nr_range, nr_range_new; 1217 int nr_range, nr_range_new;
1174 u64 chunk_size, gran_size; 1218 u64 chunk_size, gran_size;
1175 unsigned long range_sums, range_sums_new; 1219 unsigned long range_sums, range_sums_new;
1176 int index_good; 1220 int index_good;
1177 int num_reg_good; 1221 int num_reg_good;
1222 int i;
1178 1223
1179 /* extra one for all 0 */ 1224 /* extra one for all 0 */
1180 int num[MTRR_NUM_TYPES + 1]; 1225 int num[MTRR_NUM_TYPES + 1];
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1204 continue; 1249 continue;
1205 if (!size) 1250 if (!size)
1206 type = MTRR_NUM_TYPES; 1251 type = MTRR_NUM_TYPES;
1252 if (type == MTRR_TYPE_WRPROT)
1253 type = MTRR_TYPE_UNCACHABLE;
1207 num[type]++; 1254 num[type]++;
1208 } 1255 }
1209 1256
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits)
1216 num_var_ranges - num[MTRR_NUM_TYPES]) 1263 num_var_ranges - num[MTRR_NUM_TYPES])
1217 return 0; 1264 return 0;
1218 1265
1266 /* print original var MTRRs at first, for debugging: */
1267 printk(KERN_DEBUG "original variable MTRRs\n");
1268 for (i = 0; i < num_var_ranges; i++) {
1269 char start_factor = 'K', size_factor = 'K';
1270 unsigned long start_base, size_base;
1271
1272 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1273 if (!size_base)
1274 continue;
1275
1276 size_base = to_size_factor(size_base, &size_factor),
1277 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1278 start_base = to_size_factor(start_base, &start_factor),
1279 type = range_state[i].type;
1280
1281 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1282 i, start_base, start_factor,
1283 size_base, size_factor,
1284 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1285 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1286 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1287 );
1288 }
1289
1219 memset(range, 0, sizeof(range)); 1290 memset(range, 0, sizeof(range));
1220 extra_remove_size = 0; 1291 extra_remove_size = 0;
1221 if (mtrr_tom2) { 1292 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1222 extra_remove_base = 1 << (32 - PAGE_SHIFT); 1293 if (mtrr_tom2)
1223 extra_remove_size = 1294 extra_remove_size =
1224 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; 1295 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1225 }
1226 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, 1296 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1227 extra_remove_size); 1297 extra_remove_size);
1298 /*
1299 * [0, 1M) should always be coverred by var mtrr with WB
1300 * and fixed mtrrs should take effective before var mtrr for it
1301 */
1302 nr_range = add_range_with_merge(range, nr_range, 0,
1303 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1304 /* sort the ranges */
1305 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1306
1228 range_sums = sum_ranges(range, nr_range); 1307 range_sums = sum_ranges(range, nr_range);
1229 printk(KERN_INFO "total RAM coverred: %ldM\n", 1308 printk(KERN_INFO "total RAM coverred: %ldM\n",
1230 range_sums >> (20 - PAGE_SHIFT)); 1309 range_sums >> (20 - PAGE_SHIFT));
1231 1310
1232 if (mtrr_chunk_size && mtrr_gran_size) { 1311 if (mtrr_chunk_size && mtrr_gran_size) {
1233 int num_reg; 1312 int num_reg;
1313 char gran_factor, chunk_factor, lose_factor;
1314 unsigned long gran_base, chunk_base, lose_base;
1234 1315
1235 debug_print = 1; 1316 debug_print++;
1236 /* convert ranges to var ranges state */ 1317 /* convert ranges to var ranges state */
1237 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, 1318 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1238 mtrr_gran_size); 1319 mtrr_gran_size);
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits)
1256 result[i].lose_cover_sizek = 1337 result[i].lose_cover_sizek =
1257 (range_sums - range_sums_new) << PSHIFT; 1338 (range_sums - range_sums_new) << PSHIFT;
1258 1339
1259 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1340 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1260 result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, 1341 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1261 result[i].chunk_sizek >> 10); 1342 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1262 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", 1343 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1344 result[i].bad?"*BAD*":" ",
1345 gran_base, gran_factor, chunk_base, chunk_factor);
1346 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1263 result[i].num_reg, result[i].bad?"-":"", 1347 result[i].num_reg, result[i].bad?"-":"",
1264 result[i].lose_cover_sizek >> 10); 1348 lose_base, lose_factor);
1265 if (!result[i].bad) { 1349 if (!result[i].bad) {
1266 set_var_mtrr_all(address_bits); 1350 set_var_mtrr_all(address_bits);
1267 return 1; 1351 return 1;
1268 } 1352 }
1269 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " 1353 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1270 "will find optimal one\n"); 1354 "will find optimal one\n");
1271 debug_print = 0; 1355 debug_print--;
1272 memset(result, 0, sizeof(result[0])); 1356 memset(result, 0, sizeof(result[0]));
1273 } 1357 }
1274 1358
1275 i = 0; 1359 i = 0;
1276 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); 1360 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1277 memset(result, 0, sizeof(result)); 1361 memset(result, 0, sizeof(result));
1278 for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { 1362 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1279 for (chunk_size = gran_size; chunk_size < (1ULL<<33); 1363 char gran_factor;
1364 unsigned long gran_base;
1365
1366 if (debug_print)
1367 gran_base = to_size_factor(gran_size >> 10, &gran_factor);
1368
1369 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1280 chunk_size <<= 1) { 1370 chunk_size <<= 1) {
1281 int num_reg; 1371 int num_reg;
1282 1372
1283 if (debug_print) 1373 if (debug_print) {
1284 printk(KERN_INFO 1374 char chunk_factor;
1285 "\ngran_size: %lldM chunk_size_size: %lldM\n", 1375 unsigned long chunk_base;
1286 gran_size >> 20, chunk_size >> 20); 1376
1377 chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
1378 printk(KERN_INFO "\n");
1379 printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
1380 gran_base, gran_factor, chunk_base, chunk_factor);
1381 }
1287 if (i >= NUM_RESULT) 1382 if (i >= NUM_RESULT)
1288 continue; 1383 continue;
1289 1384
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits)
1326 1421
1327 /* print out all */ 1422 /* print out all */
1328 for (i = 0; i < NUM_RESULT; i++) { 1423 for (i = 0; i < NUM_RESULT; i++) {
1329 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1424 char gran_factor, chunk_factor, lose_factor;
1330 result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, 1425 unsigned long gran_base, chunk_base, lose_base;
1331 result[i].chunk_sizek >> 10); 1426
1332 printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", 1427 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1333 result[i].num_reg, result[i].bad?"-":"", 1428 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1334 result[i].lose_cover_sizek >> 10); 1429 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1430 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1431 result[i].bad?"*BAD*":" ",
1432 gran_base, gran_factor, chunk_base, chunk_factor);
1433 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1434 result[i].num_reg, result[i].bad?"-":"",
1435 lose_base, lose_factor);
1335 } 1436 }
1336 1437
1337 /* try to find the optimal index */ 1438 /* try to find the optimal index */
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1339 nr_mtrr_spare_reg = num_var_ranges - 1; 1440 nr_mtrr_spare_reg = num_var_ranges - 1;
1340 num_reg_good = -1; 1441 num_reg_good = -1;
1341 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { 1442 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1342 if (!min_loss_pfn[i]) { 1443 if (!min_loss_pfn[i])
1343 num_reg_good = i; 1444 num_reg_good = i;
1344 break;
1345 }
1346 } 1445 }
1347 1446
1348 index_good = -1; 1447 index_good = -1;
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits)
1358 } 1457 }
1359 1458
1360 if (index_good != -1) { 1459 if (index_good != -1) {
1460 char gran_factor, chunk_factor, lose_factor;
1461 unsigned long gran_base, chunk_base, lose_base;
1462
1361 printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); 1463 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1362 i = index_good; 1464 i = index_good;
1363 printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", 1465 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1364 result[i].gran_sizek >> 10, 1466 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1365 result[i].chunk_sizek >> 10); 1467 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1366 printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", 1468 printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
1367 result[i].num_reg, 1469 gran_base, gran_factor, chunk_base, chunk_factor);
1368 result[i].lose_cover_sizek >> 10); 1470 printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
1471 result[i].num_reg, lose_base, lose_factor);
1369 /* convert ranges to var ranges state */ 1472 /* convert ranges to var ranges state */
1370 chunk_size = result[i].chunk_sizek; 1473 chunk_size = result[i].chunk_sizek;
1371 chunk_size <<= 10; 1474 chunk_size <<= 10;
1372 gran_size = result[i].gran_sizek; 1475 gran_size = result[i].gran_sizek;
1373 gran_size <<= 10; 1476 gran_size <<= 10;
1374 debug_print = 1; 1477 debug_print++;
1375 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); 1478 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1479 debug_print--;
1376 set_var_mtrr_all(address_bits); 1480 set_var_mtrr_all(address_bits);
1377 return 1; 1481 return 1;
1378 } 1482 }
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 05cc22dbd4ff..6bff382094f5 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -295,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
295 /* setup the timer */ 295 /* setup the timer */
296 wrmsr(evntsel_msr, evntsel, 0); 296 wrmsr(evntsel_msr, evntsel, 0);
297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); 297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301 298
299 /* initialize the wd struct before enabling */
302 wd->perfctr_msr = perfctr_msr; 300 wd->perfctr_msr = perfctr_msr;
303 wd->evntsel_msr = evntsel_msr; 301 wd->evntsel_msr = evntsel_msr;
304 wd->cccr_msr = 0; /* unused */ 302 wd->cccr_msr = 0; /* unused */
303
304 /* ok, everything is initialized, announce that we're set */
305 cpu_nmi_set_wd_enabled();
306
307 apic_write(APIC_LVTPC, APIC_DM_NMI);
308 evntsel |= K7_EVNTSEL_ENABLE;
309 wrmsr(evntsel_msr, evntsel, 0);
310
305 return 1; 311 return 1;
306} 312}
307 313
@@ -379,13 +385,19 @@ static int setup_p6_watchdog(unsigned nmi_hz)
379 wrmsr(evntsel_msr, evntsel, 0); 385 wrmsr(evntsel_msr, evntsel, 0);
380 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 386 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
381 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); 387 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
382 apic_write(APIC_LVTPC, APIC_DM_NMI);
383 evntsel |= P6_EVNTSEL0_ENABLE;
384 wrmsr(evntsel_msr, evntsel, 0);
385 388
389 /* initialize the wd struct before enabling */
386 wd->perfctr_msr = perfctr_msr; 390 wd->perfctr_msr = perfctr_msr;
387 wd->evntsel_msr = evntsel_msr; 391 wd->evntsel_msr = evntsel_msr;
388 wd->cccr_msr = 0; /* unused */ 392 wd->cccr_msr = 0; /* unused */
393
394 /* ok, everything is initialized, announce that we're set */
395 cpu_nmi_set_wd_enabled();
396
397 apic_write(APIC_LVTPC, APIC_DM_NMI);
398 evntsel |= P6_EVNTSEL0_ENABLE;
399 wrmsr(evntsel_msr, evntsel, 0);
400
389 return 1; 401 return 1;
390} 402}
391 403
@@ -432,6 +444,27 @@ static const struct wd_ops p6_wd_ops = {
432#define P4_CCCR_ENABLE (1 << 12) 444#define P4_CCCR_ENABLE (1 << 12)
433#define P4_CCCR_OVF (1 << 31) 445#define P4_CCCR_OVF (1 << 31)
434 446
447#define P4_CONTROLS 18
448static unsigned int p4_controls[18] = {
449 MSR_P4_BPU_CCCR0,
450 MSR_P4_BPU_CCCR1,
451 MSR_P4_BPU_CCCR2,
452 MSR_P4_BPU_CCCR3,
453 MSR_P4_MS_CCCR0,
454 MSR_P4_MS_CCCR1,
455 MSR_P4_MS_CCCR2,
456 MSR_P4_MS_CCCR3,
457 MSR_P4_FLAME_CCCR0,
458 MSR_P4_FLAME_CCCR1,
459 MSR_P4_FLAME_CCCR2,
460 MSR_P4_FLAME_CCCR3,
461 MSR_P4_IQ_CCCR0,
462 MSR_P4_IQ_CCCR1,
463 MSR_P4_IQ_CCCR2,
464 MSR_P4_IQ_CCCR3,
465 MSR_P4_IQ_CCCR4,
466 MSR_P4_IQ_CCCR5,
467};
435/* 468/*
436 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 469 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
437 * CRU_ESCR0 (with any non-null event selector) through a complemented 470 * CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -473,6 +506,26 @@ static int setup_p4_watchdog(unsigned nmi_hz)
473 evntsel_msr = MSR_P4_CRU_ESCR0; 506 evntsel_msr = MSR_P4_CRU_ESCR0;
474 cccr_msr = MSR_P4_IQ_CCCR0; 507 cccr_msr = MSR_P4_IQ_CCCR0;
475 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); 508 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
509
510 /*
511 * If we're on the kdump kernel or other situation, we may
512 * still have other performance counter registers set to
513 * interrupt and they'll keep interrupting forever because
514 * of the P4_CCCR_OVF quirk. So we need to ACK all the
515 * pending interrupts and disable all the registers here,
516 * before reenabling the NMI delivery. Refer to p4_rearm()
517 * about the P4_CCCR_OVF quirk.
518 */
519 if (reset_devices) {
520 unsigned int low, high;
521 int i;
522
523 for (i = 0; i < P4_CONTROLS; i++) {
524 rdmsr(p4_controls[i], low, high);
525 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
526 wrmsr(p4_controls[i], low, high);
527 }
528 }
476 } else { 529 } else {
477 /* logical cpu 1 */ 530 /* logical cpu 1 */
478 perfctr_msr = MSR_P4_IQ_PERFCTR1; 531 perfctr_msr = MSR_P4_IQ_PERFCTR1;
@@ -499,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
499 wrmsr(evntsel_msr, evntsel, 0); 552 wrmsr(evntsel_msr, evntsel, 0);
500 wrmsr(cccr_msr, cccr_val, 0); 553 wrmsr(cccr_msr, cccr_val, 0);
501 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); 554 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
502 apic_write(APIC_LVTPC, APIC_DM_NMI); 555
503 cccr_val |= P4_CCCR_ENABLE;
504 wrmsr(cccr_msr, cccr_val, 0);
505 wd->perfctr_msr = perfctr_msr; 556 wd->perfctr_msr = perfctr_msr;
506 wd->evntsel_msr = evntsel_msr; 557 wd->evntsel_msr = evntsel_msr;
507 wd->cccr_msr = cccr_msr; 558 wd->cccr_msr = cccr_msr;
559
560 /* ok, everything is initialized, announce that we're set */
561 cpu_nmi_set_wd_enabled();
562
563 apic_write(APIC_LVTPC, APIC_DM_NMI);
564 cccr_val |= P4_CCCR_ENABLE;
565 wrmsr(cccr_msr, cccr_val, 0);
508 return 1; 566 return 1;
509} 567}
510 568
@@ -620,13 +678,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
620 wrmsr(evntsel_msr, evntsel, 0); 678 wrmsr(evntsel_msr, evntsel, 0);
621 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 679 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
622 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); 680 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
623 apic_write(APIC_LVTPC, APIC_DM_NMI);
624 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
625 wrmsr(evntsel_msr, evntsel, 0);
626 681
627 wd->perfctr_msr = perfctr_msr; 682 wd->perfctr_msr = perfctr_msr;
628 wd->evntsel_msr = evntsel_msr; 683 wd->evntsel_msr = evntsel_msr;
629 wd->cccr_msr = 0; /* unused */ 684 wd->cccr_msr = 0; /* unused */
685
686 /* ok, everything is initialized, announce that we're set */
687 cpu_nmi_set_wd_enabled();
688
689 apic_write(APIC_LVTPC, APIC_DM_NMI);
690 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
691 wrmsr(evntsel_msr, evntsel, 0);
630 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 692 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
631 return 1; 693 return 1;
632} 694}
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 7c46e6ecedca..52b3fefbd5af 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -5,6 +5,18 @@
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include "cpu.h" 6#include "cpu.h"
7 7
8static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
9{
10 u32 xlvl;
11
12 /* Transmeta-defined flags: level 0x80860001 */
13 xlvl = cpuid_eax(0x80860000);
14 if ((xlvl & 0xffff0000) == 0x80860000) {
15 if (xlvl >= 0x80860001)
16 c->x86_capability[2] = cpuid_edx(0x80860001);
17 }
18}
19
8static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) 20static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{ 21{
10 unsigned int cap_mask, uk, max, dummy; 22 unsigned int cap_mask, uk, max, dummy;
@@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; 24 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65]; 25 char cpu_info[65];
14 26
15 get_model_name(c); /* Same as AMD/Cyrix */ 27 early_init_transmeta(c);
28
16 display_cacheinfo(c); 29 display_cacheinfo(c);
17 30
18 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
@@ -85,23 +98,11 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
85#endif 98#endif
86} 99}
87 100
88static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c)
89{
90 u32 xlvl;
91
92 /* Transmeta-defined flags: level 0x80860001 */
93 xlvl = cpuid_eax(0x80860000);
94 if ((xlvl & 0xffff0000) == 0x80860000) {
95 if (xlvl >= 0x80860001)
96 c->x86_capability[2] = cpuid_edx(0x80860001);
97 }
98}
99
100static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { 101static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
101 .c_vendor = "Transmeta", 102 .c_vendor = "Transmeta",
102 .c_ident = { "GenuineTMx86", "TransmetaCPU" }, 103 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
104 .c_early_init = early_init_transmeta,
103 .c_init = init_transmeta, 105 .c_init = init_transmeta,
104 .c_identify = transmeta_identify,
105 .c_x86_vendor = X86_VENDOR_TRANSMETA, 106 .c_x86_vendor = X86_VENDOR_TRANSMETA,
106}; 107};
107 108
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 8e9cd6a8ec12..6a44d6465991 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -36,7 +36,6 @@
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/major.h> 37#include <linux/major.h>
38#include <linux/fs.h> 38#include <linux/fs.h>
39#include <linux/smp_lock.h>
40#include <linux/device.h> 39#include <linux/device.h>
41#include <linux/cpu.h> 40#include <linux/cpu.h>
42#include <linux/notifier.h> 41#include <linux/notifier.h>
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 15e6c6bc4a46..e90a60ef10c2 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -7,9 +7,8 @@
7 7
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/crash_dump.h> 9#include <linux/crash_dump.h>
10 10#include <linux/uaccess.h>
11#include <asm/uaccess.h> 11#include <linux/io.h>
12#include <asm/io.h>
13 12
14/** 13/**
15 * copy_oldmem_page - copy one page from "oldmem" 14 * copy_oldmem_page - copy one page from "oldmem"
@@ -25,7 +24,7 @@
25 * in the current kernel. We stitch up a pte, similar to kmap_atomic. 24 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
26 */ 25 */
27ssize_t copy_oldmem_page(unsigned long pfn, char *buf, 26ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
28 size_t csize, unsigned long offset, int userbuf) 27 size_t csize, unsigned long offset, int userbuf)
29{ 28{
30 void *vaddr; 29 void *vaddr;
31 30
@@ -33,14 +32,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
33 return 0; 32 return 0;
34 33
35 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 34 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
35 if (!vaddr)
36 return -ENOMEM;
36 37
37 if (userbuf) { 38 if (userbuf) {
38 if (copy_to_user(buf, (vaddr + offset), csize)) { 39 if (copy_to_user(buf, vaddr + offset, csize)) {
39 iounmap(vaddr); 40 iounmap(vaddr);
40 return -EFAULT; 41 return -EFAULT;
41 } 42 }
42 } else 43 } else
43 memcpy(buf, (vaddr + offset), csize); 44 memcpy(buf, vaddr + offset, csize);
44 45
45 iounmap(vaddr); 46 iounmap(vaddr);
46 return csize; 47 return csize;
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 11c11b8ec48d..2b69994fd3a8 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -2,26 +2,49 @@
2 * Debug Store support 2 * Debug Store support
3 * 3 *
4 * This provides a low-level interface to the hardware's Debug Store 4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for last branch recording (LBR) and 5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * Different architectures use a different DS layout/pointer size. 8 * It manages:
9 * The below functions therefore work on a void*. 9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional)
11 * - buffer overflow handling
12 * - buffer access
10 * 13 *
14 * It assumes:
15 * - get_task_struct on all parameter tasks
16 * - current is allowed to trace parameter tasks
11 * 17 *
12 * Since there is no user for PEBS, yet, only LBR (or branch
13 * trace store, BTS) is supported.
14 * 18 *
15 * 19 * Copyright (C) 2007-2008 Intel Corporation.
16 * Copyright (C) 2007 Intel Corporation. 20 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
17 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
18 */ 21 */
19 22
23
24#ifdef CONFIG_X86_DS
25
20#include <asm/ds.h> 26#include <asm/ds.h>
21 27
22#include <linux/errno.h> 28#include <linux/errno.h>
23#include <linux/string.h> 29#include <linux/string.h>
24#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/sched.h>
32#include <linux/mm.h>
33
34
35/*
36 * The configuration for a particular DS hardware implementation.
37 */
38struct ds_configuration {
39 /* the size of the DS structure in bytes */
40 unsigned char sizeof_ds;
41 /* the size of one pointer-typed field in the DS structure in bytes;
42 this covers the first 8 fields related to buffer management. */
43 unsigned char sizeof_field;
44 /* the size of a BTS/PEBS record in bytes */
45 unsigned char sizeof_rec[2];
46};
47static struct ds_configuration ds_cfg;
25 48
26 49
27/* 50/*
@@ -44,378 +67,747 @@
44 * (interrupt occurs when write pointer passes interrupt pointer) 67 * (interrupt occurs when write pointer passes interrupt pointer)
45 * - value to which counter is reset following counter overflow 68 * - value to which counter is reset following counter overflow
46 * 69 *
47 * On later architectures, the last branch recording hardware uses 70 * Later architectures use 64bit pointers throughout, whereas earlier
48 * 64bit pointers even in 32bit mode. 71 * architectures use 32bit pointers in 32bit mode.
49 *
50 *
51 * Branch Trace Store (BTS) records store information about control
52 * flow changes. They at least provide the following information:
53 * - source linear address
54 * - destination linear address
55 * 72 *
56 * Netburst supported a predicated bit that had been dropped in later
57 * architectures. We do not suppor it.
58 * 73 *
74 * We compute the base address for the first 8 fields based on:
75 * - the field size stored in the DS configuration
76 * - the relative field position
77 * - an offset giving the start of the respective region
59 * 78 *
60 * In order to abstract from the actual DS and BTS layout, we describe 79 * This offset is further used to index various arrays holding
61 * the access to the relevant fields. 80 * information for BTS and PEBS at the respective index.
62 * Thanks to Andi Kleen for proposing this design.
63 * 81 *
64 * The implementation, however, is not as general as it might seem. In 82 * On later 32bit processors, we only access the lower 32bit of the
65 * order to stay somewhat simple and efficient, we assume an 83 * 64bit pointer fields. The upper halves will be zeroed out.
66 * underlying unsigned type (mostly a pointer type) and we expect the
67 * field to be at least as big as that type.
68 */ 84 */
69 85
70/* 86enum ds_field {
71 * A special from_ip address to indicate that the BTS record is an 87 ds_buffer_base = 0,
72 * info record that needs to be interpreted or skipped. 88 ds_index,
73 */ 89 ds_absolute_maximum,
74#define BTS_ESCAPE_ADDRESS (-1) 90 ds_interrupt_threshold,
91};
75 92
76/* 93enum ds_qualifier {
77 * A field access descriptor 94 ds_bts = 0,
78 */ 95 ds_pebs
79struct access_desc {
80 unsigned char offset;
81 unsigned char size;
82}; 96};
83 97
98static inline unsigned long ds_get(const unsigned char *base,
99 enum ds_qualifier qual, enum ds_field field)
100{
101 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
102 return *(unsigned long *)base;
103}
104
105static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
106 enum ds_field field, unsigned long value)
107{
108 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
109 (*(unsigned long *)base) = value;
110}
111
112
84/* 113/*
85 * The configuration for a particular DS/BTS hardware implementation. 114 * Locking is done only for allocating BTS or PEBS resources and for
115 * guarding context and buffer memory allocation.
116 *
117 * Most functions require the current task to own the ds context part
118 * they are going to access. All the locking is done when validating
119 * access to the context.
86 */ 120 */
87struct ds_configuration { 121static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
88 /* the DS configuration */
89 unsigned char sizeof_ds;
90 struct access_desc bts_buffer_base;
91 struct access_desc bts_index;
92 struct access_desc bts_absolute_maximum;
93 struct access_desc bts_interrupt_threshold;
94 /* the BTS configuration */
95 unsigned char sizeof_bts;
96 struct access_desc from_ip;
97 struct access_desc to_ip;
98 /* BTS variants used to store additional information like
99 timestamps */
100 struct access_desc info_type;
101 struct access_desc info_data;
102 unsigned long debugctl_mask;
103};
104 122
105/* 123/*
106 * The global configuration used by the below accessor functions 124 * Validate that the current task is allowed to access the BTS/PEBS
125 * buffer of the parameter task.
126 *
127 * Returns 0, if access is granted; -Eerrno, otherwise.
107 */ 128 */
108static struct ds_configuration ds_cfg; 129static inline int ds_validate_access(struct ds_context *context,
130 enum ds_qualifier qual)
131{
132 if (!context)
133 return -EPERM;
134
135 if (context->owner[qual] == current)
136 return 0;
137
138 return -EPERM;
139}
140
109 141
110/* 142/*
111 * Accessor functions for some DS and BTS fields using the above 143 * We either support (system-wide) per-cpu or per-thread allocation.
112 * global ptrace_bts_cfg. 144 * We distinguish the two based on the task_struct pointer, where a
145 * NULL pointer indicates per-cpu allocation for the current cpu.
146 *
147 * Allocations are use-counted. As soon as resources are allocated,
148 * further allocations must be of the same type (per-cpu or
149 * per-thread). We model this by counting allocations (i.e. the number
150 * of tracers of a certain type) for one type negatively:
151 * =0 no tracers
152 * >0 number of per-thread tracers
153 * <0 number of per-cpu tracers
154 *
155 * The below functions to get and put tracers and to check the
156 * allocation type require the ds_lock to be held by the caller.
157 *
158 * Tracers essentially gives the number of ds contexts for a certain
159 * type of allocation.
113 */ 160 */
114static inline unsigned long get_bts_buffer_base(char *base) 161static long tracers;
162
163static inline void get_tracer(struct task_struct *task)
115{ 164{
116 return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); 165 tracers += (task ? 1 : -1);
117} 166}
118static inline void set_bts_buffer_base(char *base, unsigned long value) 167
168static inline void put_tracer(struct task_struct *task)
119{ 169{
120 (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; 170 tracers -= (task ? 1 : -1);
121} 171}
122static inline unsigned long get_bts_index(char *base) 172
173static inline int check_tracer(struct task_struct *task)
123{ 174{
124 return *(unsigned long *)(base + ds_cfg.bts_index.offset); 175 return (task ? (tracers >= 0) : (tracers <= 0));
125} 176}
126static inline void set_bts_index(char *base, unsigned long value) 177
178
179/*
180 * The DS context is either attached to a thread or to a cpu:
181 * - in the former case, the thread_struct contains a pointer to the
182 * attached context.
183 * - in the latter case, we use a static array of per-cpu context
184 * pointers.
185 *
186 * Contexts are use-counted. They are allocated on first access and
187 * deallocated when the last user puts the context.
188 *
189 * We distinguish between an allocating and a non-allocating get of a
190 * context:
191 * - the allocating get is used for requesting BTS/PEBS resources. It
192 * requires the caller to hold the global ds_lock.
193 * - the non-allocating get is used for all other cases. A
194 * non-existing context indicates an error. It acquires and releases
195 * the ds_lock itself for obtaining the context.
196 *
197 * A context and its DS configuration are allocated and deallocated
198 * together. A context always has a DS configuration of the
199 * appropriate size.
200 */
201static DEFINE_PER_CPU(struct ds_context *, system_context);
202
203#define this_system_context per_cpu(system_context, smp_processor_id())
204
205/*
206 * Returns the pointer to the parameter task's context or to the
207 * system-wide context, if task is NULL.
208 *
209 * Increases the use count of the returned context, if not NULL.
210 */
211static inline struct ds_context *ds_get_context(struct task_struct *task)
127{ 212{
128 (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; 213 struct ds_context *context;
214
215 spin_lock(&ds_lock);
216
217 context = (task ? task->thread.ds_ctx : this_system_context);
218 if (context)
219 context->count++;
220
221 spin_unlock(&ds_lock);
222
223 return context;
129} 224}
130static inline unsigned long get_bts_absolute_maximum(char *base) 225
226/*
227 * Same as ds_get_context, but allocates the context and it's DS
228 * structure, if necessary; returns NULL; if out of memory.
229 *
230 * pre: requires ds_lock to be held
231 */
232static inline struct ds_context *ds_alloc_context(struct task_struct *task)
131{ 233{
132 return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); 234 struct ds_context **p_context =
235 (task ? &task->thread.ds_ctx : &this_system_context);
236 struct ds_context *context = *p_context;
237
238 if (!context) {
239 context = kzalloc(sizeof(*context), GFP_KERNEL);
240
241 if (!context)
242 return NULL;
243
244 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
245 if (!context->ds) {
246 kfree(context);
247 return NULL;
248 }
249
250 *p_context = context;
251
252 context->this = p_context;
253 context->task = task;
254
255 if (task)
256 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
257
258 if (!task || (task == current))
259 wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
260
261 get_tracer(task);
262 }
263
264 context->count++;
265
266 return context;
133} 267}
134static inline void set_bts_absolute_maximum(char *base, unsigned long value) 268
269/*
270 * Decreases the use count of the parameter context, if not NULL.
271 * Deallocates the context, if the use count reaches zero.
272 */
273static inline void ds_put_context(struct ds_context *context)
135{ 274{
136 (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; 275 if (!context)
276 return;
277
278 spin_lock(&ds_lock);
279
280 if (--context->count)
281 goto out;
282
283 *(context->this) = NULL;
284
285 if (context->task)
286 clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
287
288 if (!context->task || (context->task == current))
289 wrmsrl(MSR_IA32_DS_AREA, 0);
290
291 put_tracer(context->task);
292
293 /* free any leftover buffers from tracers that did not
294 * deallocate them properly. */
295 kfree(context->buffer[ds_bts]);
296 kfree(context->buffer[ds_pebs]);
297 kfree(context->ds);
298 kfree(context);
299 out:
300 spin_unlock(&ds_lock);
137} 301}
138static inline unsigned long get_bts_interrupt_threshold(char *base) 302
303
304/*
305 * Handle a buffer overflow
306 *
307 * task: the task whose buffers are overflowing;
308 * NULL for a buffer overflow on the current cpu
309 * context: the ds context
310 * qual: the buffer type
311 */
312static void ds_overflow(struct task_struct *task, struct ds_context *context,
313 enum ds_qualifier qual)
139{ 314{
140 return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); 315 if (!context)
316 return;
317
318 if (context->callback[qual])
319 (*context->callback[qual])(task);
320
321 /* todo: do some more overflow handling */
141} 322}
142static inline void set_bts_interrupt_threshold(char *base, unsigned long value) 323
324
325/*
326 * Allocate a non-pageable buffer of the parameter size.
327 * Checks the memory and the locked memory rlimit.
328 *
329 * Returns the buffer, if successful;
330 * NULL, if out of memory or rlimit exceeded.
331 *
332 * size: the requested buffer size in bytes
333 * pages (out): if not NULL, contains the number of pages reserved
334 */
335static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
143{ 336{
144 (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; 337 unsigned long rlim, vm, pgsz;
338 void *buffer;
339
340 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
341
342 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
343 vm = current->mm->total_vm + pgsz;
344 if (rlim < vm)
345 return NULL;
346
347 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
348 vm = current->mm->locked_vm + pgsz;
349 if (rlim < vm)
350 return NULL;
351
352 buffer = kzalloc(size, GFP_KERNEL);
353 if (!buffer)
354 return NULL;
355
356 current->mm->total_vm += pgsz;
357 current->mm->locked_vm += pgsz;
358
359 if (pages)
360 *pages = pgsz;
361
362 return buffer;
145} 363}
146static inline unsigned long get_from_ip(char *base) 364
365static int ds_request(struct task_struct *task, void *base, size_t size,
366 ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
147{ 367{
148 return *(unsigned long *)(base + ds_cfg.from_ip.offset); 368 struct ds_context *context;
369 unsigned long buffer, adj;
370 const unsigned long alignment = (1 << 3);
371 int error = 0;
372
373 if (!ds_cfg.sizeof_ds)
374 return -EOPNOTSUPP;
375
376 /* we require some space to do alignment adjustments below */
377 if (size < (alignment + ds_cfg.sizeof_rec[qual]))
378 return -EINVAL;
379
380 /* buffer overflow notification is not yet implemented */
381 if (ovfl)
382 return -EOPNOTSUPP;
383
384
385 spin_lock(&ds_lock);
386
387 if (!check_tracer(task))
388 return -EPERM;
389
390 error = -ENOMEM;
391 context = ds_alloc_context(task);
392 if (!context)
393 goto out_unlock;
394
395 error = -EALREADY;
396 if (context->owner[qual] == current)
397 goto out_unlock;
398 error = -EPERM;
399 if (context->owner[qual] != NULL)
400 goto out_unlock;
401 context->owner[qual] = current;
402
403 spin_unlock(&ds_lock);
404
405
406 error = -ENOMEM;
407 if (!base) {
408 base = ds_allocate_buffer(size, &context->pages[qual]);
409 if (!base)
410 goto out_release;
411
412 context->buffer[qual] = base;
413 }
414 error = 0;
415
416 context->callback[qual] = ovfl;
417
418 /* adjust the buffer address and size to meet alignment
419 * constraints:
420 * - buffer is double-word aligned
421 * - size is multiple of record size
422 *
423 * We checked the size at the very beginning; we have enough
424 * space to do the adjustment.
425 */
426 buffer = (unsigned long)base;
427
428 adj = ALIGN(buffer, alignment) - buffer;
429 buffer += adj;
430 size -= adj;
431
432 size /= ds_cfg.sizeof_rec[qual];
433 size *= ds_cfg.sizeof_rec[qual];
434
435 ds_set(context->ds, qual, ds_buffer_base, buffer);
436 ds_set(context->ds, qual, ds_index, buffer);
437 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
438
439 if (ovfl) {
440 /* todo: select a suitable interrupt threshold */
441 } else
442 ds_set(context->ds, qual,
443 ds_interrupt_threshold, buffer + size + 1);
444
445 /* we keep the context until ds_release */
446 return error;
447
448 out_release:
449 context->owner[qual] = NULL;
450 ds_put_context(context);
451 return error;
452
453 out_unlock:
454 spin_unlock(&ds_lock);
455 ds_put_context(context);
456 return error;
149} 457}
150static inline void set_from_ip(char *base, unsigned long value) 458
459int ds_request_bts(struct task_struct *task, void *base, size_t size,
460 ds_ovfl_callback_t ovfl)
151{ 461{
152 (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; 462 return ds_request(task, base, size, ovfl, ds_bts);
153} 463}
154static inline unsigned long get_to_ip(char *base) 464
465int ds_request_pebs(struct task_struct *task, void *base, size_t size,
466 ds_ovfl_callback_t ovfl)
155{ 467{
156 return *(unsigned long *)(base + ds_cfg.to_ip.offset); 468 return ds_request(task, base, size, ovfl, ds_pebs);
157} 469}
158static inline void set_to_ip(char *base, unsigned long value) 470
471static int ds_release(struct task_struct *task, enum ds_qualifier qual)
159{ 472{
160 (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; 473 struct ds_context *context;
474 int error;
475
476 context = ds_get_context(task);
477 error = ds_validate_access(context, qual);
478 if (error < 0)
479 goto out;
480
481 kfree(context->buffer[qual]);
482 context->buffer[qual] = NULL;
483
484 current->mm->total_vm -= context->pages[qual];
485 current->mm->locked_vm -= context->pages[qual];
486 context->pages[qual] = 0;
487 context->owner[qual] = NULL;
488
489 /*
490 * we put the context twice:
491 * once for the ds_get_context
492 * once for the corresponding ds_request
493 */
494 ds_put_context(context);
495 out:
496 ds_put_context(context);
497 return error;
161} 498}
162static inline unsigned char get_info_type(char *base) 499
500int ds_release_bts(struct task_struct *task)
163{ 501{
164 return *(unsigned char *)(base + ds_cfg.info_type.offset); 502 return ds_release(task, ds_bts);
165} 503}
166static inline void set_info_type(char *base, unsigned char value) 504
505int ds_release_pebs(struct task_struct *task)
167{ 506{
168 (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; 507 return ds_release(task, ds_pebs);
169} 508}
170static inline unsigned long get_info_data(char *base) 509
510static int ds_get_index(struct task_struct *task, size_t *pos,
511 enum ds_qualifier qual)
171{ 512{
172 return *(unsigned long *)(base + ds_cfg.info_data.offset); 513 struct ds_context *context;
514 unsigned long base, index;
515 int error;
516
517 context = ds_get_context(task);
518 error = ds_validate_access(context, qual);
519 if (error < 0)
520 goto out;
521
522 base = ds_get(context->ds, qual, ds_buffer_base);
523 index = ds_get(context->ds, qual, ds_index);
524
525 error = ((index - base) / ds_cfg.sizeof_rec[qual]);
526 if (pos)
527 *pos = error;
528 out:
529 ds_put_context(context);
530 return error;
173} 531}
174static inline void set_info_data(char *base, unsigned long value) 532
533int ds_get_bts_index(struct task_struct *task, size_t *pos)
175{ 534{
176 (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; 535 return ds_get_index(task, pos, ds_bts);
177} 536}
178 537
538int ds_get_pebs_index(struct task_struct *task, size_t *pos)
539{
540 return ds_get_index(task, pos, ds_pebs);
541}
179 542
180int ds_allocate(void **dsp, size_t bts_size_in_bytes) 543static int ds_get_end(struct task_struct *task, size_t *pos,
544 enum ds_qualifier qual)
181{ 545{
182 size_t bts_size_in_records; 546 struct ds_context *context;
183 unsigned long bts; 547 unsigned long base, end;
184 void *ds; 548 int error;
549
550 context = ds_get_context(task);
551 error = ds_validate_access(context, qual);
552 if (error < 0)
553 goto out;
554
555 base = ds_get(context->ds, qual, ds_buffer_base);
556 end = ds_get(context->ds, qual, ds_absolute_maximum);
557
558 error = ((end - base) / ds_cfg.sizeof_rec[qual]);
559 if (pos)
560 *pos = error;
561 out:
562 ds_put_context(context);
563 return error;
564}
185 565
186 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 566int ds_get_bts_end(struct task_struct *task, size_t *pos)
187 return -EOPNOTSUPP; 567{
568 return ds_get_end(task, pos, ds_bts);
569}
188 570
189 if (bts_size_in_bytes < 0) 571int ds_get_pebs_end(struct task_struct *task, size_t *pos)
190 return -EINVAL; 572{
573 return ds_get_end(task, pos, ds_pebs);
574}
191 575
192 bts_size_in_records = 576static int ds_access(struct task_struct *task, size_t index,
193 bts_size_in_bytes / ds_cfg.sizeof_bts; 577 const void **record, enum ds_qualifier qual)
194 bts_size_in_bytes = 578{
195 bts_size_in_records * ds_cfg.sizeof_bts; 579 struct ds_context *context;
580 unsigned long base, idx;
581 int error;
196 582
197 if (bts_size_in_bytes <= 0) 583 if (!record)
198 return -EINVAL; 584 return -EINVAL;
199 585
200 bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); 586 context = ds_get_context(task);
201 587 error = ds_validate_access(context, qual);
202 if (!bts) 588 if (error < 0)
203 return -ENOMEM; 589 goto out;
204 590
205 ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); 591 base = ds_get(context->ds, qual, ds_buffer_base);
592 idx = base + (index * ds_cfg.sizeof_rec[qual]);
206 593
207 if (!ds) { 594 error = -EINVAL;
208 kfree((void *)bts); 595 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
209 return -ENOMEM; 596 goto out;
210 }
211
212 set_bts_buffer_base(ds, bts);
213 set_bts_index(ds, bts);
214 set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
215 set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
216 597
217 *dsp = ds; 598 *record = (const void *)idx;
218 return 0; 599 error = ds_cfg.sizeof_rec[qual];
600 out:
601 ds_put_context(context);
602 return error;
219} 603}
220 604
221int ds_free(void **dsp) 605int ds_access_bts(struct task_struct *task, size_t index, const void **record)
222{ 606{
223 if (*dsp) { 607 return ds_access(task, index, record, ds_bts);
224 kfree((void *)get_bts_buffer_base(*dsp));
225 kfree(*dsp);
226 *dsp = NULL;
227 }
228 return 0;
229} 608}
230 609
231int ds_get_bts_size(void *ds) 610int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
232{ 611{
233 int size_in_bytes; 612 return ds_access(task, index, record, ds_pebs);
234
235 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
236 return -EOPNOTSUPP;
237
238 if (!ds)
239 return 0;
240
241 size_in_bytes =
242 get_bts_absolute_maximum(ds) -
243 get_bts_buffer_base(ds);
244 return size_in_bytes;
245} 613}
246 614
247int ds_get_bts_end(void *ds) 615static int ds_write(struct task_struct *task, const void *record, size_t size,
616 enum ds_qualifier qual, int force)
248{ 617{
249 int size_in_bytes = ds_get_bts_size(ds); 618 struct ds_context *context;
250 619 int error;
251 if (size_in_bytes <= 0)
252 return size_in_bytes;
253 620
254 return size_in_bytes / ds_cfg.sizeof_bts; 621 if (!record)
255} 622 return -EINVAL;
256 623
257int ds_get_bts_index(void *ds) 624 error = -EPERM;
258{ 625 context = ds_get_context(task);
259 int index_offset_in_bytes; 626 if (!context)
627 goto out;
260 628
261 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 629 if (!force) {
262 return -EOPNOTSUPP; 630 error = ds_validate_access(context, qual);
631 if (error < 0)
632 goto out;
633 }
263 634
264 index_offset_in_bytes = 635 error = 0;
265 get_bts_index(ds) - 636 while (size) {
266 get_bts_buffer_base(ds); 637 unsigned long base, index, end, write_end, int_th;
638 unsigned long write_size, adj_write_size;
639
640 /*
641 * write as much as possible without producing an
642 * overflow interrupt.
643 *
644 * interrupt_threshold must either be
645 * - bigger than absolute_maximum or
646 * - point to a record between buffer_base and absolute_maximum
647 *
648 * index points to a valid record.
649 */
650 base = ds_get(context->ds, qual, ds_buffer_base);
651 index = ds_get(context->ds, qual, ds_index);
652 end = ds_get(context->ds, qual, ds_absolute_maximum);
653 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
654
655 write_end = min(end, int_th);
656
657 /* if we are already beyond the interrupt threshold,
658 * we fill the entire buffer */
659 if (write_end <= index)
660 write_end = end;
661
662 if (write_end <= index)
663 goto out;
664
665 write_size = min((unsigned long) size, write_end - index);
666 memcpy((void *)index, record, write_size);
667
668 record = (const char *)record + write_size;
669 size -= write_size;
670 error += write_size;
671
672 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
673 adj_write_size *= ds_cfg.sizeof_rec[qual];
674
675 /* zero out trailing bytes */
676 memset((char *)index + write_size, 0,
677 adj_write_size - write_size);
678 index += adj_write_size;
679
680 if (index >= end)
681 index = base;
682 ds_set(context->ds, qual, ds_index, index);
683
684 if (index >= int_th)
685 ds_overflow(task, context, qual);
686 }
267 687
268 return index_offset_in_bytes / ds_cfg.sizeof_bts; 688 out:
689 ds_put_context(context);
690 return error;
269} 691}
270 692
271int ds_set_overflow(void *ds, int method) 693int ds_write_bts(struct task_struct *task, const void *record, size_t size)
272{ 694{
273 switch (method) { 695 return ds_write(task, record, size, ds_bts, /* force = */ 0);
274 case DS_O_SIGNAL:
275 return -EOPNOTSUPP;
276 case DS_O_WRAP:
277 return 0;
278 default:
279 return -EINVAL;
280 }
281} 696}
282 697
283int ds_get_overflow(void *ds) 698int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
284{ 699{
285 return DS_O_WRAP; 700 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
286} 701}
287 702
288int ds_clear(void *ds) 703int ds_unchecked_write_bts(struct task_struct *task,
704 const void *record, size_t size)
289{ 705{
290 int bts_size = ds_get_bts_size(ds); 706 return ds_write(task, record, size, ds_bts, /* force = */ 1);
291 unsigned long bts_base;
292
293 if (bts_size <= 0)
294 return bts_size;
295
296 bts_base = get_bts_buffer_base(ds);
297 memset((void *)bts_base, 0, bts_size);
298
299 set_bts_index(ds, bts_base);
300 return 0;
301} 707}
302 708
303int ds_read_bts(void *ds, int index, struct bts_struct *out) 709int ds_unchecked_write_pebs(struct task_struct *task,
710 const void *record, size_t size)
304{ 711{
305 void *bts; 712 return ds_write(task, record, size, ds_pebs, /* force = */ 1);
713}
306 714
307 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 715static int ds_reset_or_clear(struct task_struct *task,
308 return -EOPNOTSUPP; 716 enum ds_qualifier qual, int clear)
717{
718 struct ds_context *context;
719 unsigned long base, end;
720 int error;
309 721
310 if (index < 0) 722 context = ds_get_context(task);
311 return -EINVAL; 723 error = ds_validate_access(context, qual);
724 if (error < 0)
725 goto out;
312 726
313 if (index >= ds_get_bts_size(ds)) 727 base = ds_get(context->ds, qual, ds_buffer_base);
314 return -EINVAL; 728 end = ds_get(context->ds, qual, ds_absolute_maximum);
315 729
316 bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); 730 if (clear)
731 memset((void *)base, 0, end - base);
317 732
318 memset(out, 0, sizeof(*out)); 733 ds_set(context->ds, qual, ds_index, base);
319 if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
320 out->qualifier = get_info_type(bts);
321 out->variant.jiffies = get_info_data(bts);
322 } else {
323 out->qualifier = BTS_BRANCH;
324 out->variant.lbr.from_ip = get_from_ip(bts);
325 out->variant.lbr.to_ip = get_to_ip(bts);
326 }
327 734
328 return sizeof(*out);; 735 error = 0;
736 out:
737 ds_put_context(context);
738 return error;
329} 739}
330 740
331int ds_write_bts(void *ds, const struct bts_struct *in) 741int ds_reset_bts(struct task_struct *task)
332{ 742{
333 unsigned long bts; 743 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
334 744}
335 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
336 return -EOPNOTSUPP;
337
338 if (ds_get_bts_size(ds) <= 0)
339 return -ENXIO;
340 745
341 bts = get_bts_index(ds); 746int ds_reset_pebs(struct task_struct *task)
747{
748 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
749}
342 750
343 memset((void *)bts, 0, ds_cfg.sizeof_bts); 751int ds_clear_bts(struct task_struct *task)
344 switch (in->qualifier) { 752{
345 case BTS_INVALID: 753 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
346 break; 754}
347 755
348 case BTS_BRANCH: 756int ds_clear_pebs(struct task_struct *task)
349 set_from_ip((void *)bts, in->variant.lbr.from_ip); 757{
350 set_to_ip((void *)bts, in->variant.lbr.to_ip); 758 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
351 break; 759}
352 760
353 case BTS_TASK_ARRIVES: 761int ds_get_pebs_reset(struct task_struct *task, u64 *value)
354 case BTS_TASK_DEPARTS: 762{
355 set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); 763 struct ds_context *context;
356 set_info_type((void *)bts, in->qualifier); 764 int error;
357 set_info_data((void *)bts, in->variant.jiffies);
358 break;
359 765
360 default: 766 if (!value)
361 return -EINVAL; 767 return -EINVAL;
362 }
363 768
364 bts = bts + ds_cfg.sizeof_bts; 769 context = ds_get_context(task);
365 if (bts >= get_bts_absolute_maximum(ds)) 770 error = ds_validate_access(context, ds_pebs);
366 bts = get_bts_buffer_base(ds); 771 if (error < 0)
367 set_bts_index(ds, bts); 772 goto out;
368 773
369 return ds_cfg.sizeof_bts; 774 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
775
776 error = 0;
777 out:
778 ds_put_context(context);
779 return error;
370} 780}
371 781
372unsigned long ds_debugctl_mask(void) 782int ds_set_pebs_reset(struct task_struct *task, u64 value)
373{ 783{
374 return ds_cfg.debugctl_mask; 784 struct ds_context *context;
375} 785 int error;
376 786
377#ifdef __i386__ 787 context = ds_get_context(task);
378static const struct ds_configuration ds_cfg_netburst = { 788 error = ds_validate_access(context, ds_pebs);
379 .sizeof_ds = 9 * 4, 789 if (error < 0)
380 .bts_buffer_base = { 0, 4 }, 790 goto out;
381 .bts_index = { 4, 4 },
382 .bts_absolute_maximum = { 8, 4 },
383 .bts_interrupt_threshold = { 12, 4 },
384 .sizeof_bts = 3 * 4,
385 .from_ip = { 0, 4 },
386 .to_ip = { 4, 4 },
387 .info_type = { 4, 1 },
388 .info_data = { 8, 4 },
389 .debugctl_mask = (1<<2)|(1<<3)
390};
391 791
392static const struct ds_configuration ds_cfg_pentium_m = { 792 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
393 .sizeof_ds = 9 * 4, 793
394 .bts_buffer_base = { 0, 4 }, 794 error = 0;
395 .bts_index = { 4, 4 }, 795 out:
396 .bts_absolute_maximum = { 8, 4 }, 796 ds_put_context(context);
397 .bts_interrupt_threshold = { 12, 4 }, 797 return error;
398 .sizeof_bts = 3 * 4, 798}
399 .from_ip = { 0, 4 }, 799
400 .to_ip = { 4, 4 }, 800static const struct ds_configuration ds_cfg_var = {
401 .info_type = { 4, 1 }, 801 .sizeof_ds = sizeof(long) * 12,
402 .info_data = { 8, 4 }, 802 .sizeof_field = sizeof(long),
403 .debugctl_mask = (1<<6)|(1<<7) 803 .sizeof_rec[ds_bts] = sizeof(long) * 3,
804 .sizeof_rec[ds_pebs] = sizeof(long) * 10
404}; 805};
405#endif /* _i386_ */ 806static const struct ds_configuration ds_cfg_64 = {
406 807 .sizeof_ds = 8 * 12,
407static const struct ds_configuration ds_cfg_core2 = { 808 .sizeof_field = 8,
408 .sizeof_ds = 9 * 8, 809 .sizeof_rec[ds_bts] = 8 * 3,
409 .bts_buffer_base = { 0, 8 }, 810 .sizeof_rec[ds_pebs] = 8 * 10
410 .bts_index = { 8, 8 },
411 .bts_absolute_maximum = { 16, 8 },
412 .bts_interrupt_threshold = { 24, 8 },
413 .sizeof_bts = 3 * 8,
414 .from_ip = { 0, 8 },
415 .to_ip = { 8, 8 },
416 .info_type = { 8, 1 },
417 .info_data = { 16, 8 },
418 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
419}; 811};
420 812
421static inline void 813static inline void
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
429 switch (c->x86) { 821 switch (c->x86) {
430 case 0x6: 822 case 0x6:
431 switch (c->x86_model) { 823 switch (c->x86_model) {
432#ifdef __i386__
433 case 0xD: 824 case 0xD:
434 case 0xE: /* Pentium M */ 825 case 0xE: /* Pentium M */
435 ds_configure(&ds_cfg_pentium_m); 826 ds_configure(&ds_cfg_var);
436 break; 827 break;
437#endif /* _i386_ */
438 case 0xF: /* Core2 */ 828 case 0xF: /* Core2 */
439 ds_configure(&ds_cfg_core2); 829 case 0x1C: /* Atom */
830 ds_configure(&ds_cfg_64);
440 break; 831 break;
441 default: 832 default:
442 /* sorry, don't know about them */ 833 /* sorry, don't know about them */
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
445 break; 836 break;
446 case 0xF: 837 case 0xF:
447 switch (c->x86_model) { 838 switch (c->x86_model) {
448#ifdef __i386__
449 case 0x0: 839 case 0x0:
450 case 0x1: 840 case 0x1:
451 case 0x2: /* Netburst */ 841 case 0x2: /* Netburst */
452 ds_configure(&ds_cfg_netburst); 842 ds_configure(&ds_cfg_var);
453 break; 843 break;
454#endif /* _i386_ */
455 default: 844 default:
456 /* sorry, don't know about them */ 845 /* sorry, don't know about them */
457 break; 846 break;
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
462 break; 851 break;
463 } 852 }
464} 853}
854
855void ds_free(struct ds_context *context)
856{
857 /* This is called when the task owning the parameter context
858 * is dying. There should not be any user of that context left
859 * to disturb us, anymore. */
860 unsigned long leftovers = context->count;
861 while (leftovers--)
862 ds_put_context(context);
863}
864#endif /* CONFIG_X86_DS */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e24d1bc47b46..78e642feac30 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1206,7 +1206,7 @@ static int __init parse_memmap_opt(char *p)
1206 if (!p) 1206 if (!p)
1207 return -EINVAL; 1207 return -EINVAL;
1208 1208
1209 if (!strcmp(p, "exactmap")) { 1209 if (!strncmp(p, "exactmap", 8)) {
1210#ifdef CONFIG_CRASH_DUMP 1210#ifdef CONFIG_CRASH_DUMP
1211 /* 1211 /*
1212 * If we are doing a crash dump, we still need to know 1212 * If we are doing a crash dump, we still need to know
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 4353cf5e6fac..24bb5faf5efa 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -95,6 +95,20 @@ static void __init nvidia_bugs(int num, int slot, int func)
95 95
96} 96}
97 97
98#ifdef CONFIG_DMAR
99static void __init intel_g33_dmar(int num, int slot, int func)
100{
101 struct acpi_table_header *dmar_tbl;
102 acpi_status status;
103
104 status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
105 if (ACPI_SUCCESS(status)) {
106 printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
107 dmar_disabled = 1;
108 }
109}
110#endif
111
98#define QFLAG_APPLY_ONCE 0x1 112#define QFLAG_APPLY_ONCE 0x1
99#define QFLAG_APPLIED 0x2 113#define QFLAG_APPLIED 0x2
100#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 114#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -114,6 +128,10 @@ static struct chipset early_qrk[] __initdata = {
114 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, 128 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
115 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 129 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
116 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, 130 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
131#ifdef CONFIG_DMAR
132 { PCI_VENDOR_ID_INTEL, 0x29c0,
133 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
134#endif
117 {} 135 {}
118}; 136};
119 137
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 06cc8d4254b1..945a31cdd81f 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -414,9 +414,11 @@ void __init efi_init(void)
414 if (memmap.map == NULL) 414 if (memmap.map == NULL)
415 printk(KERN_ERR "Could not map the EFI memory map!\n"); 415 printk(KERN_ERR "Could not map the EFI memory map!\n");
416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); 416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
417
417 if (memmap.desc_size != sizeof(efi_memory_desc_t)) 418 if (memmap.desc_size != sizeof(efi_memory_desc_t))
418 printk(KERN_WARNING "Kernel-defined memdesc" 419 printk(KERN_WARNING
419 "doesn't match the one from EFI!\n"); 420 "Kernel-defined memdesc doesn't match the one from EFI!\n");
421
420 if (add_efi_memmap) 422 if (add_efi_memmap)
421 do_add_efi_memmap(); 423 do_add_efi_memmap();
422 424
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 9bfc4d72fb2e..d16084f90649 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -108,12 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data)
108 } 108 }
109 load_idt((const struct desc_ptr *)&idt_descr); 109 load_idt((const struct desc_ptr *)&idt_descr);
110 110
111 early_printk("Kernel alive\n"); 111 if (console_loglevel == 10)
112 early_printk("Kernel alive\n");
112 113
113 x86_64_init_pda(); 114 x86_64_init_pda();
114 115
115 early_printk("Kernel really alive\n");
116
117 x86_64_start_reservations(real_mode_data); 116 x86_64_start_reservations(real_mode_data);
118} 117}
119 118
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index a7010c3a377a..e835b4eea70b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4
172 * 172 *
173 * Note that the stack is not yet set up! 173 * Note that the stack is not yet set up!
174 */ 174 */
175#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
176#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
177#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
178
179default_entry: 175default_entry:
180#ifdef CONFIG_X86_PAE 176#ifdef CONFIG_X86_PAE
181 177
@@ -196,9 +192,9 @@ default_entry:
196 movl $pa(pg0), %edi 192 movl $pa(pg0), %edi
197 movl %edi, pa(init_pg_tables_start) 193 movl %edi, pa(init_pg_tables_start)
198 movl $pa(swapper_pg_pmd), %edx 194 movl $pa(swapper_pg_pmd), %edx
199 movl $PTE_ATTR, %eax 195 movl $PTE_IDENT_ATTR, %eax
20010: 19610:
201 leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ 197 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
202 movl %ecx,(%edx) /* Store PMD entry */ 198 movl %ecx,(%edx) /* Store PMD entry */
203 /* Upper half already zero */ 199 /* Upper half already zero */
204 addl $8,%edx 200 addl $8,%edx
@@ -215,7 +211,7 @@ default_entry:
215 * End condition: we must map up to and including INIT_MAP_BEYOND_END 211 * End condition: we must map up to and including INIT_MAP_BEYOND_END
216 * bytes beyond the end of our own page tables. 212 * bytes beyond the end of our own page tables.
217 */ 213 */
218 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 214 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
219 cmpl %ebp,%eax 215 cmpl %ebp,%eax
220 jb 10b 216 jb 10b
2211: 2171:
@@ -224,7 +220,7 @@ default_entry:
224 movl %eax, pa(max_pfn_mapped) 220 movl %eax, pa(max_pfn_mapped)
225 221
226 /* Do early initialization of the fixmap area */ 222 /* Do early initialization of the fixmap area */
227 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 223 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
228 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 224 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
229#else /* Not PAE */ 225#else /* Not PAE */
230 226
@@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
233 movl $pa(pg0), %edi 229 movl $pa(pg0), %edi
234 movl %edi, pa(init_pg_tables_start) 230 movl %edi, pa(init_pg_tables_start)
235 movl $pa(swapper_pg_dir), %edx 231 movl $pa(swapper_pg_dir), %edx
236 movl $PTE_ATTR, %eax 232 movl $PTE_IDENT_ATTR, %eax
23710: 23310:
238 leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ 234 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
239 movl %ecx,(%edx) /* Store identity PDE entry */ 235 movl %ecx,(%edx) /* Store identity PDE entry */
240 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ 236 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
241 addl $4,%edx 237 addl $4,%edx
@@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
249 * bytes beyond the end of our own page tables; the +0x007 is 245 * bytes beyond the end of our own page tables; the +0x007 is
250 * the attribute bits 246 * the attribute bits
251 */ 247 */
252 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 248 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
253 cmpl %ebp,%eax 249 cmpl %ebp,%eax
254 jb 10b 250 jb 10b
255 movl %edi,pa(init_pg_tables_end) 251 movl %edi,pa(init_pg_tables_end)
@@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
257 movl %eax, pa(max_pfn_mapped) 253 movl %eax, pa(max_pfn_mapped)
258 254
259 /* Do early initialization of the fixmap area */ 255 /* Do early initialization of the fixmap area */
260 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 256 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
261 movl %eax,pa(swapper_pg_dir+0xffc) 257 movl %eax,pa(swapper_pg_dir+0xffc)
262#endif 258#endif
263 jmp 3f 259 jmp 3f
@@ -634,19 +630,19 @@ ENTRY(empty_zero_page)
634 /* Page-aligned for the benefit of paravirt? */ 630 /* Page-aligned for the benefit of paravirt? */
635 .align PAGE_SIZE_asm 631 .align PAGE_SIZE_asm
636ENTRY(swapper_pg_dir) 632ENTRY(swapper_pg_dir)
637 .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ 633 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
638# if KPMDS == 3 634# if KPMDS == 3
639 .long pa(swapper_pg_pmd+PGD_ATTR),0 635 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
640 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 636 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
641 .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 637 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
642# elif KPMDS == 2 638# elif KPMDS == 2
643 .long 0,0 639 .long 0,0
644 .long pa(swapper_pg_pmd+PGD_ATTR),0 640 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
645 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 641 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
646# elif KPMDS == 1 642# elif KPMDS == 1
647 .long 0,0 643 .long 0,0
648 .long 0,0 644 .long 0,0
649 .long pa(swapper_pg_pmd+PGD_ATTR),0 645 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
650# else 646# else
651# error "Kernel PMDs should be 1, 2 or 3" 647# error "Kernel PMDs should be 1, 2 or 3"
652# endif 648# endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index db3280afe886..26cfdc1d7c7f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -110,7 +110,7 @@ startup_64:
110 movq %rdi, %rax 110 movq %rdi, %rax
111 shrq $PMD_SHIFT, %rax 111 shrq $PMD_SHIFT, %rax
112 andq $(PTRS_PER_PMD - 1), %rax 112 andq $(PTRS_PER_PMD - 1), %rax
113 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx 113 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
114 leaq level2_spare_pgt(%rip), %rbx 114 leaq level2_spare_pgt(%rip), %rbx
115 movq %rdx, 0(%rbx, %rax, 8) 115 movq %rdx, 0(%rbx, %rax, 8)
116ident_complete: 116ident_complete:
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
374 /* Since I easily can, map the first 1G. 374 /* Since I easily can, map the first 1G.
375 * Don't set NX because code runs from these pages. 375 * Don't set NX because code runs from these pages.
376 */ 376 */
377 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) 377 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
378 378
379NEXT_PAGE(level2_kernel_pgt) 379NEXT_PAGE(level2_kernel_pgt)
380 /* 380 /*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 59fd3b6b1303..73deaffadd03 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -210,8 +210,8 @@ static void hpet_legacy_clockevent_register(void)
210 /* Calculate the min / max delta */ 210 /* Calculate the min / max delta */
211 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, 211 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
212 &hpet_clockevent); 212 &hpet_clockevent);
213 hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, 213 /* 5 usec minimum reprogramming delta. */
214 &hpet_clockevent); 214 hpet_clockevent.min_delta_ns = 5000;
215 215
216 /* 216 /*
217 * Start hpet with the boot cpu mask and make it 217 * Start hpet with the boot cpu mask and make it
@@ -270,15 +270,22 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode,
270} 270}
271 271
272static int hpet_legacy_next_event(unsigned long delta, 272static int hpet_legacy_next_event(unsigned long delta,
273 struct clock_event_device *evt) 273 struct clock_event_device *evt)
274{ 274{
275 unsigned long cnt; 275 u32 cnt;
276 276
277 cnt = hpet_readl(HPET_COUNTER); 277 cnt = hpet_readl(HPET_COUNTER);
278 cnt += delta; 278 cnt += (u32) delta;
279 hpet_writel(cnt, HPET_T0_CMP); 279 hpet_writel(cnt, HPET_T0_CMP);
280 280
281 return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; 281 /*
282 * We need to read back the CMP register to make sure that
283 * what we wrote hit the chip before we compare it to the
284 * counter.
285 */
286 WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt);
287
288 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
282} 289}
283 290
284/* 291/*
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 1c3a66a67f83..720d2607aacb 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
92 DMI_MATCH(DMI_BOARD_NAME, "30BF") 92 DMI_MATCH(DMI_BOARD_NAME, "30BF")
93 } 93 }
94 }, 94 },
95 {
96 .callback = dmi_io_delay_0xed_port,
97 .ident = "Presario F700",
98 .matches = {
99 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
100 DMI_MATCH(DMI_BOARD_NAME, "30D3")
101 }
102 },
95 { } 103 { }
96}; 104};
97 105
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 1cf8c1fcc088..b71e02d42f4f 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -325,7 +325,7 @@ skip:
325 for_each_online_cpu(j) 325 for_each_online_cpu(j)
326 seq_printf(p, "%10u ", 326 seq_printf(p, "%10u ",
327 per_cpu(irq_stat,j).irq_call_count); 327 per_cpu(irq_stat,j).irq_call_count);
328 seq_printf(p, " function call interrupts\n"); 328 seq_printf(p, " Function call interrupts\n");
329 seq_printf(p, "TLB: "); 329 seq_printf(p, "TLB: ");
330 for_each_online_cpu(j) 330 for_each_online_cpu(j)
331 seq_printf(p, "%10u ", 331 seq_printf(p, "%10u ",
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1f78b238d8d2..f065fe9071b9 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -129,7 +129,7 @@ skip:
129 seq_printf(p, "CAL: "); 129 seq_printf(p, "CAL: ");
130 for_each_online_cpu(j) 130 for_each_online_cpu(j)
131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); 131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
132 seq_printf(p, " function call interrupts\n"); 132 seq_printf(p, " Function call interrupts\n");
133 seq_printf(p, "TLB: "); 133 seq_printf(p, "TLB: ");
134 for_each_online_cpu(j) 134 for_each_online_cpu(j)
135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); 135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index 7377ccb21335..304d8bad6559 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges);
16static u32 *flush_words; 16static u32 *flush_words;
17 17
18struct pci_device_id k8_nb_ids[] = { 18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, 19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, 20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
21 {} 22 {}
22}; 23};
23EXPORT_SYMBOL(k8_nb_ids); 24EXPORT_SYMBOL(k8_nb_ids);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index f2d43bc75514..ff7d3b0124f1 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -139,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
139 if (PageHighMem(pg)) { 139 if (PageHighMem(pg)) {
140 data = ioremap_cache(pa_data, sizeof(*data)); 140 data = ioremap_cache(pa_data, sizeof(*data));
141 if (!data) { 141 if (!data) {
142 kfree(node);
142 error = -ENXIO; 143 error = -ENXIO;
143 goto err_dir; 144 goto err_dir;
144 } 145 }
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b8..10435a120d22 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1;
69 */ 69 */
70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) 70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
71{ 71{
72#ifndef CONFIG_X86_32
73 u32 *gdb_regs32 = (u32 *)gdb_regs;
74#endif
72 gdb_regs[GDB_AX] = regs->ax; 75 gdb_regs[GDB_AX] = regs->ax;
73 gdb_regs[GDB_BX] = regs->bx; 76 gdb_regs[GDB_BX] = regs->bx;
74 gdb_regs[GDB_CX] = regs->cx; 77 gdb_regs[GDB_CX] = regs->cx;
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
76 gdb_regs[GDB_SI] = regs->si; 79 gdb_regs[GDB_SI] = regs->si;
77 gdb_regs[GDB_DI] = regs->di; 80 gdb_regs[GDB_DI] = regs->di;
78 gdb_regs[GDB_BP] = regs->bp; 81 gdb_regs[GDB_BP] = regs->bp;
79 gdb_regs[GDB_PS] = regs->flags;
80 gdb_regs[GDB_PC] = regs->ip; 82 gdb_regs[GDB_PC] = regs->ip;
81#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
84 gdb_regs[GDB_PS] = regs->flags;
82 gdb_regs[GDB_DS] = regs->ds; 85 gdb_regs[GDB_DS] = regs->ds;
83 gdb_regs[GDB_ES] = regs->es; 86 gdb_regs[GDB_ES] = regs->es;
84 gdb_regs[GDB_CS] = regs->cs; 87 gdb_regs[GDB_CS] = regs->cs;
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
94 gdb_regs[GDB_R13] = regs->r13; 97 gdb_regs[GDB_R13] = regs->r13;
95 gdb_regs[GDB_R14] = regs->r14; 98 gdb_regs[GDB_R14] = regs->r14;
96 gdb_regs[GDB_R15] = regs->r15; 99 gdb_regs[GDB_R15] = regs->r15;
100 gdb_regs32[GDB_PS] = regs->flags;
101 gdb_regs32[GDB_CS] = regs->cs;
102 gdb_regs32[GDB_SS] = regs->ss;
97#endif 103#endif
98 gdb_regs[GDB_SP] = regs->sp; 104 gdb_regs[GDB_SP] = regs->sp;
99} 105}
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
112 */ 118 */
113void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) 119void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
114{ 120{
121#ifndef CONFIG_X86_32
122 u32 *gdb_regs32 = (u32 *)gdb_regs;
123#endif
115 gdb_regs[GDB_AX] = 0; 124 gdb_regs[GDB_AX] = 0;
116 gdb_regs[GDB_BX] = 0; 125 gdb_regs[GDB_BX] = 0;
117 gdb_regs[GDB_CX] = 0; 126 gdb_regs[GDB_CX] = 0;
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
129 gdb_regs[GDB_FS] = 0xFFFF; 138 gdb_regs[GDB_FS] = 0xFFFF;
130 gdb_regs[GDB_GS] = 0xFFFF; 139 gdb_regs[GDB_GS] = 0xFFFF;
131#else 140#else
132 gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 141 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
133 gdb_regs[GDB_PC] = 0; 142 gdb_regs32[GDB_CS] = __KERNEL_CS;
143 gdb_regs32[GDB_SS] = __KERNEL_DS;
144 gdb_regs[GDB_PC] = p->thread.ip;
134 gdb_regs[GDB_R8] = 0; 145 gdb_regs[GDB_R8] = 0;
135 gdb_regs[GDB_R9] = 0; 146 gdb_regs[GDB_R9] = 0;
136 gdb_regs[GDB_R10] = 0; 147 gdb_regs[GDB_R10] = 0;
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
153 */ 164 */
154void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) 165void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
155{ 166{
167#ifndef CONFIG_X86_32
168 u32 *gdb_regs32 = (u32 *)gdb_regs;
169#endif
156 regs->ax = gdb_regs[GDB_AX]; 170 regs->ax = gdb_regs[GDB_AX];
157 regs->bx = gdb_regs[GDB_BX]; 171 regs->bx = gdb_regs[GDB_BX];
158 regs->cx = gdb_regs[GDB_CX]; 172 regs->cx = gdb_regs[GDB_CX];
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
160 regs->si = gdb_regs[GDB_SI]; 174 regs->si = gdb_regs[GDB_SI];
161 regs->di = gdb_regs[GDB_DI]; 175 regs->di = gdb_regs[GDB_DI];
162 regs->bp = gdb_regs[GDB_BP]; 176 regs->bp = gdb_regs[GDB_BP];
163 regs->flags = gdb_regs[GDB_PS];
164 regs->ip = gdb_regs[GDB_PC]; 177 regs->ip = gdb_regs[GDB_PC];
165#ifdef CONFIG_X86_32 178#ifdef CONFIG_X86_32
179 regs->flags = gdb_regs[GDB_PS];
166 regs->ds = gdb_regs[GDB_DS]; 180 regs->ds = gdb_regs[GDB_DS];
167 regs->es = gdb_regs[GDB_ES]; 181 regs->es = gdb_regs[GDB_ES];
168 regs->cs = gdb_regs[GDB_CS]; 182 regs->cs = gdb_regs[GDB_CS];
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
175 regs->r13 = gdb_regs[GDB_R13]; 189 regs->r13 = gdb_regs[GDB_R13];
176 regs->r14 = gdb_regs[GDB_R14]; 190 regs->r14 = gdb_regs[GDB_R14];
177 regs->r15 = gdb_regs[GDB_R15]; 191 regs->r15 = gdb_regs[GDB_R15];
192 regs->flags = gdb_regs32[GDB_PS];
193 regs->cs = gdb_regs32[GDB_CS];
194 regs->ss = gdb_regs32[GDB_SS];
178#endif 195#endif
179} 196}
180 197
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
378 if (remcomInBuffer[0] == 's') { 395 if (remcomInBuffer[0] == 's') {
379 linux_regs->flags |= X86_EFLAGS_TF; 396 linux_regs->flags |= X86_EFLAGS_TF;
380 kgdb_single_step = 1; 397 kgdb_single_step = 1;
381 if (kgdb_contthread) { 398 atomic_set(&kgdb_cpu_doing_single_step,
382 atomic_set(&kgdb_cpu_doing_single_step, 399 raw_smp_processor_id());
383 raw_smp_processor_id());
384 }
385 } 400 }
386 401
387 get_debugreg(dr6, 6); 402 get_debugreg(dr6, 6);
@@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
440 return NOTIFY_DONE; 455 return NOTIFY_DONE;
441 456
442 case DIE_NMI_IPI: 457 case DIE_NMI_IPI:
443 if (atomic_read(&kgdb_active) != -1) { 458 /* Just ignore, we will handle the roundup on DIE_NMI. */
444 /* KGDB CPU roundup */
445 kgdb_nmicallback(raw_smp_processor_id(), regs);
446 was_in_debug_nmi[raw_smp_processor_id()] = 1;
447 touch_nmi_watchdog();
448 }
449 return NOTIFY_DONE; 459 return NOTIFY_DONE;
450 460
451 case DIE_NMIUNKNOWN: 461 case DIE_NMIUNKNOWN:
@@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
466 476
467 case DIE_DEBUG: 477 case DIE_DEBUG:
468 if (atomic_read(&kgdb_cpu_doing_single_step) == 478 if (atomic_read(&kgdb_cpu_doing_single_step) ==
469 raw_smp_processor_id() && 479 raw_smp_processor_id()) {
470 user_mode(regs)) 480 if (user_mode(regs))
471 return single_step_cont(regs, args); 481 return single_step_cont(regs, args);
482 break;
483 } else if (test_thread_flag(TIF_SINGLESTEP))
484 /* This means a user thread is single stepping
485 * a system call which should be ignored
486 */
487 return NOTIFY_DONE;
472 /* fall through */ 488 /* fall through */
473 default: 489 default:
474 if (user_mode(regs)) 490 if (user_mode(regs))
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b7a3cf37d2b..478bca986eca 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb); 178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
179} 179}
180 180
181static void kvm_release_pt(u32 pfn) 181static void kvm_release_pt(unsigned long pfn)
182{ 182{
183 struct kvm_mmu_op_release_pt rpt = { 183 struct kvm_mmu_op_release_pt rpt = {
184 .header.op = KVM_MMU_OP_RELEASE_PT, 184 .header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index abb78a2cc4ad..2c97f07f1c2c 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -299,6 +299,15 @@ void acpi_nmi_disable(void)
299 on_each_cpu(__acpi_nmi_disable, NULL, 1); 299 on_each_cpu(__acpi_nmi_disable, NULL, 1);
300} 300}
301 301
302/*
303 * This function is called as soon the LAPIC NMI watchdog driver has everything
304 * in place and it's ready to check if the NMIs belong to the NMI watchdog
305 */
306void cpu_nmi_set_wd_enabled(void)
307{
308 __get_cpu_var(wd_enabled) = 1;
309}
310
302void setup_apic_nmi_watchdog(void *unused) 311void setup_apic_nmi_watchdog(void *unused)
303{ 312{
304 if (__get_cpu_var(wd_enabled)) 313 if (__get_cpu_var(wd_enabled))
@@ -311,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused)
311 320
312 switch (nmi_watchdog) { 321 switch (nmi_watchdog) {
313 case NMI_LOCAL_APIC: 322 case NMI_LOCAL_APIC:
314 /* enable it before to avoid race with handler */
315 __get_cpu_var(wd_enabled) = 1;
316 if (lapic_watchdog_init(nmi_hz) < 0) { 323 if (lapic_watchdog_init(nmi_hz) < 0) {
317 __get_cpu_var(wd_enabled) = 0; 324 __get_cpu_var(wd_enabled) = 0;
318 return; 325 return;
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 3e6672274807..7a13fac63a1f 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190static void __init platform_detect(void) 190static void __init platform_detect(void)
191{ 191{
192 size_t propsize; 192 size_t propsize;
193 u32 rev; 193 __be32 rev;
194 194
195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
196 &propsize) || propsize != 4) { 196 &propsize) || propsize != 4) {
197 printk(KERN_ERR "ofw: getprop call failed!\n"); 197 printk(KERN_ERR "ofw: getprop call failed!\n");
198 rev = 0; 198 rev = cpu_to_be32(0);
199 } 199 }
200 olpc_platform_info.boardrev = be32_to_cpu(rev); 200 olpc_platform_info.boardrev = be32_to_cpu(rev);
201} 201}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
203static void __init platform_detect(void) 203static void __init platform_detect(void)
204{ 204{
205 /* stopgap until OFW support is added to the kernel */ 205 /* stopgap until OFW support is added to the kernel */
206 olpc_platform_info.boardrev = be32_to_cpu(0xc2); 206 olpc_platform_info.boardrev = 0xc2;
207} 207}
208#endif 208#endif
209 209
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 58262218781b..9fe644f4861d 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
23 start = start_##ops##_##x; \ 23 start = start_##ops##_##x; \
24 end = end_##ops##_##x; \ 24 end = end_##ops##_##x; \
25 goto patch_site 25 goto patch_site
26 switch(type) { 26 switch (type) {
27 PATCH_SITE(pv_irq_ops, irq_disable); 27 PATCH_SITE(pv_irq_ops, irq_disable);
28 PATCH_SITE(pv_irq_ops, irq_enable); 28 PATCH_SITE(pv_irq_ops, irq_enable);
29 PATCH_SITE(pv_irq_ops, restore_fl); 29 PATCH_SITE(pv_irq_ops, restore_fl);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index dcdac6c826e9..080d1d27f37a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -261,7 +261,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
261 badbit, tbl, start_addr, npages); 261 badbit, tbl, start_addr, npages);
262 } 262 }
263 263
264 set_bit_string(tbl->it_map, index, npages); 264 iommu_area_reserve(tbl->it_map, index, npages);
265 265
266 spin_unlock_irqrestore(&tbl->it_lock, flags); 266 spin_unlock_irqrestore(&tbl->it_lock, flags);
267} 267}
@@ -491,6 +491,8 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
491 npages = size >> PAGE_SHIFT; 491 npages = size >> PAGE_SHIFT;
492 order = get_order(size); 492 order = get_order(size);
493 493
494 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
495
494 /* alloc enough pages (and possibly more) */ 496 /* alloc enough pages (and possibly more) */
495 ret = (void *)__get_free_pages(flag, order); 497 ret = (void *)__get_free_pages(flag, order);
496 if (!ret) 498 if (!ret)
@@ -510,8 +512,22 @@ error:
510 return ret; 512 return ret;
511} 513}
512 514
515static void calgary_free_coherent(struct device *dev, size_t size,
516 void *vaddr, dma_addr_t dma_handle)
517{
518 unsigned int npages;
519 struct iommu_table *tbl = find_iommu_table(dev);
520
521 size = PAGE_ALIGN(size);
522 npages = size >> PAGE_SHIFT;
523
524 iommu_free(tbl, dma_handle, npages);
525 free_pages((unsigned long)vaddr, get_order(size));
526}
527
513static struct dma_mapping_ops calgary_dma_ops = { 528static struct dma_mapping_ops calgary_dma_ops = {
514 .alloc_coherent = calgary_alloc_coherent, 529 .alloc_coherent = calgary_alloc_coherent,
530 .free_coherent = calgary_free_coherent,
515 .map_single = calgary_map_single, 531 .map_single = calgary_map_single,
516 .unmap_single = calgary_unmap_single, 532 .unmap_single = calgary_unmap_single,
517 .map_sg = calgary_map_sg, 533 .map_sg = calgary_map_sg,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 87d4d6964ec2..0a3824e837b4 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -41,11 +41,12 @@ EXPORT_SYMBOL(bad_dma_address);
41/* Dummy device used for NULL arguments (normally ISA). Better would 41/* Dummy device used for NULL arguments (normally ISA). Better would
42 be probably a smaller DMA mask, but this is bug-to-bug compatible 42 be probably a smaller DMA mask, but this is bug-to-bug compatible
43 to older i386. */ 43 to older i386. */
44struct device fallback_dev = { 44struct device x86_dma_fallback_dev = {
45 .bus_id = "fallback device", 45 .bus_id = "fallback device",
46 .coherent_dma_mask = DMA_32BIT_MASK, 46 .coherent_dma_mask = DMA_32BIT_MASK,
47 .dma_mask = &fallback_dev.coherent_dma_mask, 47 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
48}; 48};
49EXPORT_SYMBOL(x86_dma_fallback_dev);
49 50
50int dma_set_mask(struct device *dev, u64 mask) 51int dma_set_mask(struct device *dev, u64 mask)
51{ 52{
@@ -82,7 +83,7 @@ void __init dma32_reserve_bootmem(void)
82 * using 512M as goal 83 * using 512M as goal
83 */ 84 */
84 align = 64ULL<<20; 85 align = 64ULL<<20;
85 size = round_up(dma32_bootmem_size, align); 86 size = roundup(dma32_bootmem_size, align);
86 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 87 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
87 512ULL<<20); 88 512ULL<<20);
88 if (dma32_bootmem_ptr) 89 if (dma32_bootmem_ptr)
@@ -133,6 +134,37 @@ unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
133EXPORT_SYMBOL(iommu_num_pages); 134EXPORT_SYMBOL(iommu_num_pages);
134#endif 135#endif
135 136
137void *dma_generic_alloc_coherent(struct device *dev, size_t size,
138 dma_addr_t *dma_addr, gfp_t flag)
139{
140 unsigned long dma_mask;
141 struct page *page;
142 dma_addr_t addr;
143
144 dma_mask = dma_alloc_coherent_mask(dev, flag);
145
146 flag |= __GFP_ZERO;
147again:
148 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
149 if (!page)
150 return NULL;
151
152 addr = page_to_phys(page);
153 if (!is_buffer_dma_capable(dma_mask, addr, size)) {
154 __free_pages(page, get_order(size));
155
156 if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
157 flag = (flag & ~GFP_DMA32) | GFP_DMA;
158 goto again;
159 }
160
161 return NULL;
162 }
163
164 *dma_addr = addr;
165 return page_address(page);
166}
167
136/* 168/*
137 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter 169 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
138 * documentation. 170 * documentation.
@@ -241,147 +273,6 @@ int dma_supported(struct device *dev, u64 mask)
241} 273}
242EXPORT_SYMBOL(dma_supported); 274EXPORT_SYMBOL(dma_supported);
243 275
244/* Allocate DMA memory on node near device */
245static noinline struct page *
246dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
247{
248 int node;
249
250 node = dev_to_node(dev);
251
252 return alloc_pages_node(node, gfp, order);
253}
254
255/*
256 * Allocate memory for a coherent mapping.
257 */
258void *
259dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
260 gfp_t gfp)
261{
262 struct dma_mapping_ops *ops = get_dma_ops(dev);
263 void *memory = NULL;
264 struct page *page;
265 unsigned long dma_mask = 0;
266 dma_addr_t bus;
267 int noretry = 0;
268
269 /* ignore region specifiers */
270 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
271
272 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
273 return memory;
274
275 if (!dev) {
276 dev = &fallback_dev;
277 gfp |= GFP_DMA;
278 }
279 dma_mask = dev->coherent_dma_mask;
280 if (dma_mask == 0)
281 dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
282
283 /* Device not DMA able */
284 if (dev->dma_mask == NULL)
285 return NULL;
286
287 /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
288 if (gfp & __GFP_DMA)
289 noretry = 1;
290
291#ifdef CONFIG_X86_64
292 /* Why <=? Even when the mask is smaller than 4GB it is often
293 larger than 16MB and in this case we have a chance of
294 finding fitting memory in the next higher zone first. If
295 not retry with true GFP_DMA. -AK */
296 if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
297 gfp |= GFP_DMA32;
298 if (dma_mask < DMA_32BIT_MASK)
299 noretry = 1;
300 }
301#endif
302
303 again:
304 page = dma_alloc_pages(dev,
305 noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
306 if (page == NULL)
307 return NULL;
308
309 {
310 int high, mmu;
311 bus = page_to_phys(page);
312 memory = page_address(page);
313 high = (bus + size) >= dma_mask;
314 mmu = high;
315 if (force_iommu && !(gfp & GFP_DMA))
316 mmu = 1;
317 else if (high) {
318 free_pages((unsigned long)memory,
319 get_order(size));
320
321 /* Don't use the 16MB ZONE_DMA unless absolutely
322 needed. It's better to use remapping first. */
323 if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
324 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
325 goto again;
326 }
327
328 /* Let low level make its own zone decisions */
329 gfp &= ~(GFP_DMA32|GFP_DMA);
330
331 if (ops->alloc_coherent)
332 return ops->alloc_coherent(dev, size,
333 dma_handle, gfp);
334 return NULL;
335 }
336
337 memset(memory, 0, size);
338 if (!mmu) {
339 *dma_handle = bus;
340 return memory;
341 }
342 }
343
344 if (ops->alloc_coherent) {
345 free_pages((unsigned long)memory, get_order(size));
346 gfp &= ~(GFP_DMA|GFP_DMA32);
347 return ops->alloc_coherent(dev, size, dma_handle, gfp);
348 }
349
350 if (ops->map_simple) {
351 *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
352 size,
353 PCI_DMA_BIDIRECTIONAL);
354 if (*dma_handle != bad_dma_address)
355 return memory;
356 }
357
358 if (panic_on_overflow)
359 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
360 (unsigned long)size);
361 free_pages((unsigned long)memory, get_order(size));
362 return NULL;
363}
364EXPORT_SYMBOL(dma_alloc_coherent);
365
366/*
367 * Unmap coherent memory.
368 * The caller must ensure that the device has finished accessing the mapping.
369 */
370void dma_free_coherent(struct device *dev, size_t size,
371 void *vaddr, dma_addr_t bus)
372{
373 struct dma_mapping_ops *ops = get_dma_ops(dev);
374
375 int order = get_order(size);
376 WARN_ON(irqs_disabled()); /* for portability */
377 if (dma_release_from_coherent(dev, order, vaddr))
378 return;
379 if (ops->unmap_single)
380 ops->unmap_single(dev, bus, size, 0);
381 free_pages((unsigned long)vaddr, order);
382}
383EXPORT_SYMBOL(dma_free_coherent);
384
385static int __init pci_iommu_init(void) 276static int __init pci_iommu_init(void)
386{ 277{
387 calgary_iommu_init(); 278 calgary_iommu_init();
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 49285f8fd4d5..145f1c83369f 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,8 +27,8 @@
27#include <linux/scatterlist.h> 27#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h> 28#include <linux/iommu-helper.h>
29#include <linux/sysdev.h> 29#include <linux/sysdev.h>
30#include <linux/io.h>
30#include <asm/atomic.h> 31#include <asm/atomic.h>
31#include <asm/io.h>
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
@@ -80,9 +80,10 @@ AGPEXTERN int agp_memory_reserved;
80AGPEXTERN __u32 *agp_gatt_table; 80AGPEXTERN __u32 *agp_gatt_table;
81 81
82static unsigned long next_bit; /* protected by iommu_bitmap_lock */ 82static unsigned long next_bit; /* protected by iommu_bitmap_lock */
83static int need_flush; /* global flush state. set for each gart wrap */ 83static bool need_flush; /* global flush state. set for each gart wrap */
84 84
85static unsigned long alloc_iommu(struct device *dev, int size) 85static unsigned long alloc_iommu(struct device *dev, int size,
86 unsigned long align_mask)
86{ 87{
87 unsigned long offset, flags; 88 unsigned long offset, flags;
88 unsigned long boundary_size; 89 unsigned long boundary_size;
@@ -90,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size)
90 91
91 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 92 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
92 PAGE_SIZE) >> PAGE_SHIFT; 93 PAGE_SIZE) >> PAGE_SHIFT;
93 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 94 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
94 PAGE_SIZE) >> PAGE_SHIFT; 95 PAGE_SIZE) >> PAGE_SHIFT;
95 96
96 spin_lock_irqsave(&iommu_bitmap_lock, flags); 97 spin_lock_irqsave(&iommu_bitmap_lock, flags);
97 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, 98 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
98 size, base_index, boundary_size, 0); 99 size, base_index, boundary_size, align_mask);
99 if (offset == -1) { 100 if (offset == -1) {
100 need_flush = 1; 101 need_flush = true;
101 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, 102 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
102 size, base_index, boundary_size, 0); 103 size, base_index, boundary_size,
104 align_mask);
103 } 105 }
104 if (offset != -1) { 106 if (offset != -1) {
105 next_bit = offset+size; 107 next_bit = offset+size;
106 if (next_bit >= iommu_pages) { 108 if (next_bit >= iommu_pages) {
107 next_bit = 0; 109 next_bit = 0;
108 need_flush = 1; 110 need_flush = true;
109 } 111 }
110 } 112 }
111 if (iommu_fullflush) 113 if (iommu_fullflush)
112 need_flush = 1; 114 need_flush = true;
113 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 115 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
114 116
115 return offset; 117 return offset;
@@ -134,7 +136,7 @@ static void flush_gart(void)
134 spin_lock_irqsave(&iommu_bitmap_lock, flags); 136 spin_lock_irqsave(&iommu_bitmap_lock, flags);
135 if (need_flush) { 137 if (need_flush) {
136 k8_flush_garts(); 138 k8_flush_garts();
137 need_flush = 0; 139 need_flush = false;
138 } 140 }
139 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 141 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
140} 142}
@@ -173,7 +175,8 @@ static void dump_leak(void)
173 iommu_leak_pages); 175 iommu_leak_pages);
174 for (i = 0; i < iommu_leak_pages; i += 2) { 176 for (i = 0; i < iommu_leak_pages; i += 2) {
175 printk(KERN_DEBUG "%lu: ", iommu_pages-i); 177 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
176 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); 178 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
179 0);
177 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); 180 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
178 } 181 }
179 printk(KERN_DEBUG "\n"); 182 printk(KERN_DEBUG "\n");
@@ -212,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir)
212static inline int 215static inline int
213need_iommu(struct device *dev, unsigned long addr, size_t size) 216need_iommu(struct device *dev, unsigned long addr, size_t size)
214{ 217{
215 u64 mask = *dev->dma_mask; 218 return force_iommu ||
216 int high = addr + size > mask; 219 !is_buffer_dma_capable(*dev->dma_mask, addr, size);
217 int mmu = high;
218
219 if (force_iommu)
220 mmu = 1;
221
222 return mmu;
223} 220}
224 221
225static inline int 222static inline int
226nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 223nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
227{ 224{
228 u64 mask = *dev->dma_mask; 225 return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
229 int high = addr + size > mask;
230 int mmu = high;
231
232 return mmu;
233} 226}
234 227
235/* Map a single continuous physical area into the IOMMU. 228/* Map a single continuous physical area into the IOMMU.
236 * Caller needs to check if the iommu is needed and flush. 229 * Caller needs to check if the iommu is needed and flush.
237 */ 230 */
238static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 231static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
239 size_t size, int dir) 232 size_t size, int dir, unsigned long align_mask)
240{ 233{
241 unsigned long npages = iommu_num_pages(phys_mem, size); 234 unsigned long npages = iommu_num_pages(phys_mem, size);
242 unsigned long iommu_page = alloc_iommu(dev, npages); 235 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
243 int i; 236 int i;
244 237
245 if (iommu_page == -1) { 238 if (iommu_page == -1) {
@@ -259,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
259 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 252 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
260} 253}
261 254
262static dma_addr_t
263gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
264{
265 dma_addr_t map = dma_map_area(dev, paddr, size, dir);
266
267 flush_gart();
268
269 return map;
270}
271
272/* Map a single area into the IOMMU */ 255/* Map a single area into the IOMMU */
273static dma_addr_t 256static dma_addr_t
274gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) 257gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
@@ -276,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
276 unsigned long bus; 259 unsigned long bus;
277 260
278 if (!dev) 261 if (!dev)
279 dev = &fallback_dev; 262 dev = &x86_dma_fallback_dev;
280 263
281 if (!need_iommu(dev, paddr, size)) 264 if (!need_iommu(dev, paddr, size))
282 return paddr; 265 return paddr;
283 266
284 bus = gart_map_simple(dev, paddr, size, dir); 267 bus = dma_map_area(dev, paddr, size, dir, 0);
268 flush_gart();
285 269
286 return bus; 270 return bus;
287} 271}
@@ -340,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
340 unsigned long addr = sg_phys(s); 324 unsigned long addr = sg_phys(s);
341 325
342 if (nonforced_iommu(dev, addr, s->length)) { 326 if (nonforced_iommu(dev, addr, s->length)) {
343 addr = dma_map_area(dev, addr, s->length, dir); 327 addr = dma_map_area(dev, addr, s->length, dir, 0);
344 if (addr == bad_dma_address) { 328 if (addr == bad_dma_address) {
345 if (i > 0) 329 if (i > 0)
346 gart_unmap_sg(dev, sg, i, dir); 330 gart_unmap_sg(dev, sg, i, dir);
@@ -362,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
362 int nelems, struct scatterlist *sout, 346 int nelems, struct scatterlist *sout,
363 unsigned long pages) 347 unsigned long pages)
364{ 348{
365 unsigned long iommu_start = alloc_iommu(dev, pages); 349 unsigned long iommu_start = alloc_iommu(dev, pages, 0);
366 unsigned long iommu_page = iommu_start; 350 unsigned long iommu_page = iommu_start;
367 struct scatterlist *s; 351 struct scatterlist *s;
368 int i; 352 int i;
@@ -427,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
427 return 0; 411 return 0;
428 412
429 if (!dev) 413 if (!dev)
430 dev = &fallback_dev; 414 dev = &x86_dma_fallback_dev;
431 415
432 out = 0; 416 out = 0;
433 start = 0; 417 start = 0;
@@ -499,6 +483,46 @@ error:
499 return 0; 483 return 0;
500} 484}
501 485
486/* allocate and map a coherent mapping */
487static void *
488gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
489 gfp_t flag)
490{
491 dma_addr_t paddr;
492 unsigned long align_mask;
493 struct page *page;
494
495 if (force_iommu && !(flag & GFP_DMA)) {
496 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
497 page = alloc_pages(flag | __GFP_ZERO, get_order(size));
498 if (!page)
499 return NULL;
500
501 align_mask = (1UL << get_order(size)) - 1;
502 paddr = dma_map_area(dev, page_to_phys(page), size,
503 DMA_BIDIRECTIONAL, align_mask);
504
505 flush_gart();
506 if (paddr != bad_dma_address) {
507 *dma_addr = paddr;
508 return page_address(page);
509 }
510 __free_pages(page, get_order(size));
511 } else
512 return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
513
514 return NULL;
515}
516
517/* free a coherent mapping */
518static void
519gart_free_coherent(struct device *dev, size_t size, void *vaddr,
520 dma_addr_t dma_addr)
521{
522 gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL);
523 free_pages((unsigned long)vaddr, get_order(size));
524}
525
502static int no_agp; 526static int no_agp;
503 527
504static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 528static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -626,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
626 struct pci_dev *dev; 650 struct pci_dev *dev;
627 void *gatt; 651 void *gatt;
628 int i, error; 652 int i, error;
629 unsigned long start_pfn, end_pfn;
630 653
631 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 654 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
632 aper_size = aper_base = info->aper_size = 0; 655 aper_size = aper_base = info->aper_size = 0;
@@ -650,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
650 info->aper_size = aper_size >> 20; 673 info->aper_size = aper_size >> 20;
651 674
652 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 675 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
653 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 676 gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
677 get_order(gatt_size));
654 if (!gatt) 678 if (!gatt)
655 panic("Cannot allocate GATT table"); 679 panic("Cannot allocate GATT table");
656 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) 680 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
657 panic("Could not set GART PTEs to uncacheable pages"); 681 panic("Could not set GART PTEs to uncacheable pages");
658 682
659 memset(gatt, 0, gatt_size);
660 agp_gatt_table = gatt; 683 agp_gatt_table = gatt;
661 684
662 enable_gart_translations(); 685 enable_gart_translations();
@@ -665,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
665 if (!error) 688 if (!error)
666 error = sysdev_register(&device_gart); 689 error = sysdev_register(&device_gart);
667 if (error) 690 if (error)
668 panic("Could not register gart_sysdev -- would corrupt data on next suspend"); 691 panic("Could not register gart_sysdev -- "
692 "would corrupt data on next suspend");
669 693
670 flush_gart(); 694 flush_gart();
671 695
672 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 696 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
673 aper_base, aper_size>>10); 697 aper_base, aper_size>>10);
674 698
675 /* need to map that range */
676 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
677 if (end_pfn > max_low_pfn_mapped) {
678 start_pfn = (aper_base>>PAGE_SHIFT);
679 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
680 }
681 return 0; 699 return 0;
682 700
683 nommu: 701 nommu:
@@ -687,20 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
687 return -1; 705 return -1;
688} 706}
689 707
690extern int agp_amd64_init(void);
691
692static struct dma_mapping_ops gart_dma_ops = { 708static struct dma_mapping_ops gart_dma_ops = {
693 .map_single = gart_map_single, 709 .map_single = gart_map_single,
694 .map_simple = gart_map_simple,
695 .unmap_single = gart_unmap_single, 710 .unmap_single = gart_unmap_single,
696 .sync_single_for_cpu = NULL,
697 .sync_single_for_device = NULL,
698 .sync_single_range_for_cpu = NULL,
699 .sync_single_range_for_device = NULL,
700 .sync_sg_for_cpu = NULL,
701 .sync_sg_for_device = NULL,
702 .map_sg = gart_map_sg, 711 .map_sg = gart_map_sg,
703 .unmap_sg = gart_unmap_sg, 712 .unmap_sg = gart_unmap_sg,
713 .alloc_coherent = gart_alloc_coherent,
714 .free_coherent = gart_free_coherent,
704}; 715};
705 716
706void gart_iommu_shutdown(void) 717void gart_iommu_shutdown(void)
@@ -727,7 +738,8 @@ void __init gart_iommu_init(void)
727{ 738{
728 struct agp_kern_info info; 739 struct agp_kern_info info;
729 unsigned long iommu_start; 740 unsigned long iommu_start;
730 unsigned long aper_size; 741 unsigned long aper_base, aper_size;
742 unsigned long start_pfn, end_pfn;
731 unsigned long scratch; 743 unsigned long scratch;
732 long i; 744 long i;
733 745
@@ -759,30 +771,35 @@ void __init gart_iommu_init(void)
759 (no_agp && init_k8_gatt(&info) < 0)) { 771 (no_agp && init_k8_gatt(&info) < 0)) {
760 if (max_pfn > MAX_DMA32_PFN) { 772 if (max_pfn > MAX_DMA32_PFN) {
761 printk(KERN_WARNING "More than 4GB of memory " 773 printk(KERN_WARNING "More than 4GB of memory "
762 "but GART IOMMU not available.\n" 774 "but GART IOMMU not available.\n");
763 KERN_WARNING "falling back to iommu=soft.\n"); 775 printk(KERN_WARNING "falling back to iommu=soft.\n");
764 } 776 }
765 return; 777 return;
766 } 778 }
767 779
780 /* need to map that range */
781 aper_size = info.aper_size << 20;
782 aper_base = info.aper_base;
783 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
784 if (end_pfn > max_low_pfn_mapped) {
785 start_pfn = (aper_base>>PAGE_SHIFT);
786 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
787 }
788
768 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 789 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
769 aper_size = info.aper_size * 1024 * 1024;
770 iommu_size = check_iommu_size(info.aper_base, aper_size); 790 iommu_size = check_iommu_size(info.aper_base, aper_size);
771 iommu_pages = iommu_size >> PAGE_SHIFT; 791 iommu_pages = iommu_size >> PAGE_SHIFT;
772 792
773 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, 793 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
774 get_order(iommu_pages/8)); 794 get_order(iommu_pages/8));
775 if (!iommu_gart_bitmap) 795 if (!iommu_gart_bitmap)
776 panic("Cannot allocate iommu bitmap\n"); 796 panic("Cannot allocate iommu bitmap\n");
777 memset(iommu_gart_bitmap, 0, iommu_pages/8);
778 797
779#ifdef CONFIG_IOMMU_LEAK 798#ifdef CONFIG_IOMMU_LEAK
780 if (leak_trace) { 799 if (leak_trace) {
781 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 800 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
782 get_order(iommu_pages*sizeof(void *))); 801 get_order(iommu_pages*sizeof(void *)));
783 if (iommu_leak_tab) 802 if (!iommu_leak_tab)
784 memset(iommu_leak_tab, 0, iommu_pages * 8);
785 else
786 printk(KERN_DEBUG 803 printk(KERN_DEBUG
787 "PCI-DMA: Cannot allocate leak trace area\n"); 804 "PCI-DMA: Cannot allocate leak trace area\n");
788 } 805 }
@@ -792,7 +809,7 @@ void __init gart_iommu_init(void)
792 * Out of IOMMU space handling. 809 * Out of IOMMU space handling.
793 * Reserve some invalid pages at the beginning of the GART. 810 * Reserve some invalid pages at the beginning of the GART.
794 */ 811 */
795 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 812 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
796 813
797 agp_memory_reserved = iommu_size; 814 agp_memory_reserved = iommu_size;
798 printk(KERN_INFO 815 printk(KERN_INFO
@@ -850,7 +867,8 @@ void __init gart_parse_options(char *p)
850 if (!strncmp(p, "leak", 4)) { 867 if (!strncmp(p, "leak", 4)) {
851 leak_trace = 1; 868 leak_trace = 1;
852 p += 4; 869 p += 4;
853 if (*p == '=') ++p; 870 if (*p == '=')
871 ++p;
854 if (isdigit(*p) && get_option(&p, &arg)) 872 if (isdigit(*p) && get_option(&p, &arg))
855 iommu_leak_pages = arg; 873 iommu_leak_pages = arg;
856 } 874 }
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 3f91f71cdc3e..c70ab5a5d4c8 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
14static int 14static int
15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) 15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
16{ 16{
17 if (hwdev && bus + size > *hwdev->dma_mask) { 17 if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
18 if (*hwdev->dma_mask >= DMA_32BIT_MASK) 18 if (*hwdev->dma_mask >= DMA_32BIT_MASK)
19 printk(KERN_ERR 19 printk(KERN_ERR
20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", 20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -72,7 +72,15 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
76 dma_addr_t dma_addr)
77{
78 free_pages((unsigned long)vaddr, get_order(size));
79}
80
75struct dma_mapping_ops nommu_dma_ops = { 81struct dma_mapping_ops nommu_dma_ops = {
82 .alloc_coherent = dma_generic_alloc_coherent,
83 .free_coherent = nommu_free_coherent,
76 .map_single = nommu_map_single, 84 .map_single = nommu_map_single,
77 .map_sg = nommu_map_sg, 85 .map_sg = nommu_map_sg,
78 .is_phys = 1, 86 .is_phys = 1,
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea277..a311ffcaad16 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
1#include <linux/platform_device.h> 1#include <linux/platform_device.h>
2#include <linux/errno.h> 2#include <linux/err.h>
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5static __init int add_pcspkr(void) 5static __init int add_pcspkr(void)
6{ 6{
7 struct platform_device *pd; 7 struct platform_device *pd;
8 int ret;
9 8
10 pd = platform_device_alloc("pcspkr", -1); 9 pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
11 if (!pd)
12 return -ENOMEM;
13 10
14 ret = platform_device_add(pd); 11 return IS_ERR(pd) ? PTR_ERR(pd) : 0;
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19} 12}
20device_initcall(add_pcspkr); 13device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 7fc4d5b0a6a0..c622772744d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,7 +15,6 @@ unsigned long idle_nomwait;
15EXPORT_SYMBOL(idle_nomwait); 15EXPORT_SYMBOL(idle_nomwait);
16 16
17struct kmem_cache *task_xstate_cachep; 17struct kmem_cache *task_xstate_cachep;
18static int force_mwait __cpuinitdata;
19 18
20int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 19int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
21{ 20{
@@ -185,7 +184,8 @@ static void mwait_idle(void)
185static void poll_idle(void) 184static void poll_idle(void)
186{ 185{
187 local_irq_enable(); 186 local_irq_enable();
188 cpu_relax(); 187 while (!need_resched())
188 cpu_relax();
189} 189}
190 190
191/* 191/*
@@ -246,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
246 return 1; 246 return 1;
247} 247}
248 248
249static cpumask_t c1e_mask = CPU_MASK_NONE;
250static int c1e_detected;
251
252void c1e_remove_cpu(int cpu)
253{
254 cpu_clear(cpu, c1e_mask);
255}
256
249/* 257/*
250 * C1E aware idle routine. We check for C1E active in the interrupt 258 * C1E aware idle routine. We check for C1E active in the interrupt
251 * pending message MSR. If we detect C1E, then we handle it the same 259 * pending message MSR. If we detect C1E, then we handle it the same
@@ -253,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
253 */ 261 */
254static void c1e_idle(void) 262static void c1e_idle(void)
255{ 263{
256 static cpumask_t c1e_mask = CPU_MASK_NONE;
257 static int c1e_detected;
258
259 if (need_resched()) 264 if (need_resched())
260 return; 265 return;
261 266
@@ -265,8 +270,10 @@ static void c1e_idle(void)
265 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 270 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
266 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 271 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
267 c1e_detected = 1; 272 c1e_detected = 1;
268 mark_tsc_unstable("TSC halt in C1E"); 273 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
269 printk(KERN_INFO "System has C1E enabled\n"); 274 mark_tsc_unstable("TSC halt in AMD C1E");
275 printk(KERN_INFO "System has AMD C1E enabled\n");
276 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
270 } 277 }
271 } 278 }
272 279
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 2c9abc95e026..205188db9626 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
37#include <linux/tick.h> 37#include <linux/tick.h>
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
@@ -55,6 +56,7 @@
55#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
56#include <asm/cpu.h> 57#include <asm/cpu.h>
57#include <asm/kdebug.h> 58#include <asm/kdebug.h>
59#include <asm/idle.h>
58#include <asm/syscalls.h> 60#include <asm/syscalls.h>
59#include <asm/smp.h> 61#include <asm/smp.h>
60 62
@@ -90,6 +92,7 @@ static void cpu_exit_clear(void)
90 cpu_clear(cpu, cpu_callin_map); 92 cpu_clear(cpu, cpu_callin_map);
91 93
92 numa_remove_cpu(cpu); 94 numa_remove_cpu(cpu);
95 c1e_remove_cpu(cpu);
93} 96}
94 97
95/* We don't actually take CPU down, just spin without interrupts. */ 98/* We don't actually take CPU down, just spin without interrupts. */
@@ -161,6 +164,7 @@ void __show_registers(struct pt_regs *regs, int all)
161 unsigned long d0, d1, d2, d3, d6, d7; 164 unsigned long d0, d1, d2, d3, d6, d7;
162 unsigned long sp; 165 unsigned long sp;
163 unsigned short ss, gs; 166 unsigned short ss, gs;
167 const char *board;
164 168
165 if (user_mode_vm(regs)) { 169 if (user_mode_vm(regs)) {
166 sp = regs->sp; 170 sp = regs->sp;
@@ -173,11 +177,15 @@ void __show_registers(struct pt_regs *regs, int all)
173 } 177 }
174 178
175 printk("\n"); 179 printk("\n");
176 printk("Pid: %d, comm: %s %s (%s %.*s)\n", 180
181 board = dmi_get_system_info(DMI_PRODUCT_NAME);
182 if (!board)
183 board = "";
184 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
177 task_pid_nr(current), current->comm, 185 task_pid_nr(current), current->comm,
178 print_tainted(), init_utsname()->release, 186 print_tainted(), init_utsname()->release,
179 (int)strcspn(init_utsname()->version, " "), 187 (int)strcspn(init_utsname()->version, " "),
180 init_utsname()->version); 188 init_utsname()->version, board);
181 189
182 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 190 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
183 (u16)regs->cs, regs->ip, regs->flags, 191 (u16)regs->cs, regs->ip, regs->flags,
@@ -277,6 +285,14 @@ void exit_thread(void)
277 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 285 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
278 put_cpu(); 286 put_cpu();
279 } 287 }
288#ifdef CONFIG_X86_DS
289 /* Free any DS contexts that have not been properly released. */
290 if (unlikely(current->thread.ds_ctx)) {
291 /* we clear debugctl to make sure DS is not used. */
292 update_debugctlmsr(0);
293 ds_free(current->thread.ds_ctx);
294 }
295#endif /* CONFIG_X86_DS */
280} 296}
281 297
282void flush_thread(void) 298void flush_thread(void)
@@ -438,6 +454,35 @@ int set_tsc_mode(unsigned int val)
438 return 0; 454 return 0;
439} 455}
440 456
457#ifdef CONFIG_X86_DS
458static int update_debugctl(struct thread_struct *prev,
459 struct thread_struct *next, unsigned long debugctl)
460{
461 unsigned long ds_prev = 0;
462 unsigned long ds_next = 0;
463
464 if (prev->ds_ctx)
465 ds_prev = (unsigned long)prev->ds_ctx->ds;
466 if (next->ds_ctx)
467 ds_next = (unsigned long)next->ds_ctx->ds;
468
469 if (ds_next != ds_prev) {
470 /* we clear debugctl to make sure DS
471 * is not in use when we change it */
472 debugctl = 0;
473 update_debugctlmsr(0);
474 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
475 }
476 return debugctl;
477}
478#else
479static int update_debugctl(struct thread_struct *prev,
480 struct thread_struct *next, unsigned long debugctl)
481{
482 return debugctl;
483}
484#endif /* CONFIG_X86_DS */
485
441static noinline void 486static noinline void
442__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 487__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
443 struct tss_struct *tss) 488 struct tss_struct *tss)
@@ -448,14 +493,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
448 prev = &prev_p->thread; 493 prev = &prev_p->thread;
449 next = &next_p->thread; 494 next = &next_p->thread;
450 495
451 debugctl = prev->debugctlmsr; 496 debugctl = update_debugctl(prev, next, prev->debugctlmsr);
452 if (next->ds_area_msr != prev->ds_area_msr) {
453 /* we clear debugctl to make sure DS
454 * is not in use when we change it */
455 debugctl = 0;
456 update_debugctlmsr(0);
457 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
458 }
459 497
460 if (next->debugctlmsr != debugctl) 498 if (next->debugctlmsr != debugctl)
461 update_debugctlmsr(next->debugctlmsr); 499 update_debugctlmsr(next->debugctlmsr);
@@ -479,13 +517,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
479 hard_enable_TSC(); 517 hard_enable_TSC();
480 } 518 }
481 519
482#ifdef X86_BTS 520#ifdef CONFIG_X86_PTRACE_BTS
483 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 521 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
484 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 522 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
485 523
486 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 524 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
487 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 525 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
488#endif 526#endif /* CONFIG_X86_PTRACE_BTS */
489 527
490 528
491 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 529 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 00263c9e6500..2a8ccb9238b4 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,11 +37,11 @@
37#include <linux/kdebug.h> 37#include <linux/kdebug.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/uaccess.h>
41#include <linux/io.h>
40 42
41#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
43#include <asm/system.h> 44#include <asm/system.h>
44#include <asm/io.h>
45#include <asm/processor.h> 45#include <asm/processor.h>
46#include <asm/i387.h> 46#include <asm/i387.h>
47#include <asm/mmu_context.h> 47#include <asm/mmu_context.h>
@@ -89,11 +89,13 @@ void exit_idle(void)
89#ifdef CONFIG_HOTPLUG_CPU 89#ifdef CONFIG_HOTPLUG_CPU
90DECLARE_PER_CPU(int, cpu_state); 90DECLARE_PER_CPU(int, cpu_state);
91 91
92#include <asm/nmi.h> 92#include <linux/nmi.h>
93/* We halt the CPU with physical CPU hotplug */ 93/* We halt the CPU with physical CPU hotplug */
94static inline void play_dead(void) 94static inline void play_dead(void)
95{ 95{
96 idle_task_exit(); 96 idle_task_exit();
97 c1e_remove_cpu(raw_smp_processor_id());
98
97 mb(); 99 mb();
98 /* Ack it */ 100 /* Ack it */
99 __get_cpu_var(cpu_state) = CPU_DEAD; 101 __get_cpu_var(cpu_state) = CPU_DEAD;
@@ -152,7 +154,7 @@ void cpu_idle(void)
152} 154}
153 155
154/* Prints also some state that isn't saved in the pt_regs */ 156/* Prints also some state that isn't saved in the pt_regs */
155void __show_regs(struct pt_regs * regs) 157void __show_regs(struct pt_regs *regs)
156{ 158{
157 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 159 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
158 unsigned long d0, d1, d2, d3, d6, d7; 160 unsigned long d0, d1, d2, d3, d6, d7;
@@ -161,59 +163,61 @@ void __show_regs(struct pt_regs * regs)
161 163
162 printk("\n"); 164 printk("\n");
163 print_modules(); 165 print_modules();
164 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 166 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
165 current->pid, current->comm, print_tainted(), 167 current->pid, current->comm, print_tainted(),
166 init_utsname()->release, 168 init_utsname()->release,
167 (int)strcspn(init_utsname()->version, " "), 169 (int)strcspn(init_utsname()->version, " "),
168 init_utsname()->version); 170 init_utsname()->version);
169 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); 171 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
170 printk_address(regs->ip, 1); 172 printk_address(regs->ip, 1);
171 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, 173 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
172 regs->flags); 174 regs->sp, regs->flags);
173 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", 175 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
174 regs->ax, regs->bx, regs->cx); 176 regs->ax, regs->bx, regs->cx);
175 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", 177 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
176 regs->dx, regs->si, regs->di); 178 regs->dx, regs->si, regs->di);
177 printk("RBP: %016lx R08: %016lx R09: %016lx\n", 179 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
178 regs->bp, regs->r8, regs->r9); 180 regs->bp, regs->r8, regs->r9);
179 printk("R10: %016lx R11: %016lx R12: %016lx\n", 181 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
180 regs->r10, regs->r11, regs->r12); 182 regs->r10, regs->r11, regs->r12);
181 printk("R13: %016lx R14: %016lx R15: %016lx\n", 183 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
182 regs->r13, regs->r14, regs->r15); 184 regs->r13, regs->r14, regs->r15);
183 185
184 asm("movl %%ds,%0" : "=r" (ds)); 186 asm("movl %%ds,%0" : "=r" (ds));
185 asm("movl %%cs,%0" : "=r" (cs)); 187 asm("movl %%cs,%0" : "=r" (cs));
186 asm("movl %%es,%0" : "=r" (es)); 188 asm("movl %%es,%0" : "=r" (es));
187 asm("movl %%fs,%0" : "=r" (fsindex)); 189 asm("movl %%fs,%0" : "=r" (fsindex));
188 asm("movl %%gs,%0" : "=r" (gsindex)); 190 asm("movl %%gs,%0" : "=r" (gsindex));
189 191
190 rdmsrl(MSR_FS_BASE, fs); 192 rdmsrl(MSR_FS_BASE, fs);
191 rdmsrl(MSR_GS_BASE, gs); 193 rdmsrl(MSR_GS_BASE, gs);
192 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 194 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
193 195
194 cr0 = read_cr0(); 196 cr0 = read_cr0();
195 cr2 = read_cr2(); 197 cr2 = read_cr2();
196 cr3 = read_cr3(); 198 cr3 = read_cr3();
197 cr4 = read_cr4(); 199 cr4 = read_cr4();
198 200
199 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 201 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
200 fs,fsindex,gs,gsindex,shadowgs); 202 fs, fsindex, gs, gsindex, shadowgs);
201 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 203 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
202 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); 204 es, cr0);
205 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
206 cr4);
203 207
204 get_debugreg(d0, 0); 208 get_debugreg(d0, 0);
205 get_debugreg(d1, 1); 209 get_debugreg(d1, 1);
206 get_debugreg(d2, 2); 210 get_debugreg(d2, 2);
207 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 211 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
208 get_debugreg(d3, 3); 212 get_debugreg(d3, 3);
209 get_debugreg(d6, 6); 213 get_debugreg(d6, 6);
210 get_debugreg(d7, 7); 214 get_debugreg(d7, 7);
211 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 215 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
212} 216}
213 217
214void show_regs(struct pt_regs *regs) 218void show_regs(struct pt_regs *regs)
215{ 219{
216 printk("CPU %d:", smp_processor_id()); 220 printk(KERN_INFO "CPU %d:", smp_processor_id());
217 __show_regs(regs); 221 __show_regs(regs);
218 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 222 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
219} 223}
@@ -239,6 +243,14 @@ void exit_thread(void)
239 t->io_bitmap_max = 0; 243 t->io_bitmap_max = 0;
240 put_cpu(); 244 put_cpu();
241 } 245 }
246#ifdef CONFIG_X86_DS
247 /* Free any DS contexts that have not been properly released. */
248 if (unlikely(t->ds_ctx)) {
249 /* we clear debugctl to make sure DS is not used. */
250 update_debugctlmsr(0);
251 ds_free(t->ds_ctx);
252 }
253#endif /* CONFIG_X86_DS */
242} 254}
243 255
244void flush_thread(void) 256void flush_thread(void)
@@ -314,10 +326,10 @@ void prepare_to_copy(struct task_struct *tsk)
314 326
315int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, 327int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
316 unsigned long unused, 328 unsigned long unused,
317 struct task_struct * p, struct pt_regs * regs) 329 struct task_struct *p, struct pt_regs *regs)
318{ 330{
319 int err; 331 int err;
320 struct pt_regs * childregs; 332 struct pt_regs *childregs;
321 struct task_struct *me = current; 333 struct task_struct *me = current;
322 334
323 childregs = ((struct pt_regs *) 335 childregs = ((struct pt_regs *)
@@ -362,10 +374,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
362 if (test_thread_flag(TIF_IA32)) 374 if (test_thread_flag(TIF_IA32))
363 err = do_set_thread_area(p, -1, 375 err = do_set_thread_area(p, -1,
364 (struct user_desc __user *)childregs->si, 0); 376 (struct user_desc __user *)childregs->si, 0);
365 else 377 else
366#endif 378#endif
367 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 379 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
368 if (err) 380 if (err)
369 goto out; 381 goto out;
370 } 382 }
371 err = 0; 383 err = 0;
@@ -472,13 +484,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
472 next = &next_p->thread; 484 next = &next_p->thread;
473 485
474 debugctl = prev->debugctlmsr; 486 debugctl = prev->debugctlmsr;
475 if (next->ds_area_msr != prev->ds_area_msr) { 487
476 /* we clear debugctl to make sure DS 488#ifdef CONFIG_X86_DS
477 * is not in use when we change it */ 489 {
478 debugctl = 0; 490 unsigned long ds_prev = 0, ds_next = 0;
479 update_debugctlmsr(0); 491
480 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); 492 if (prev->ds_ctx)
493 ds_prev = (unsigned long)prev->ds_ctx->ds;
494 if (next->ds_ctx)
495 ds_next = (unsigned long)next->ds_ctx->ds;
496
497 if (ds_next != ds_prev) {
498 /*
499 * We clear debugctl to make sure DS
500 * is not in use when we change it:
501 */
502 debugctl = 0;
503 update_debugctlmsr(0);
504 wrmsrl(MSR_IA32_DS_AREA, ds_next);
505 }
481 } 506 }
507#endif /* CONFIG_X86_DS */
482 508
483 if (next->debugctlmsr != debugctl) 509 if (next->debugctlmsr != debugctl)
484 update_debugctlmsr(next->debugctlmsr); 510 update_debugctlmsr(next->debugctlmsr);
@@ -516,13 +542,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
516 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 542 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
517 } 543 }
518 544
519#ifdef X86_BTS 545#ifdef CONFIG_X86_PTRACE_BTS
520 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 546 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
521 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 547 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
522 548
523 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 549 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
524 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 550 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
525#endif 551#endif /* CONFIG_X86_PTRACE_BTS */
526} 552}
527 553
528/* 554/*
@@ -544,7 +570,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
544 unsigned fsindex, gsindex; 570 unsigned fsindex, gsindex;
545 571
546 /* we're going to use this soon, after a few expensive things */ 572 /* we're going to use this soon, after a few expensive things */
547 if (next_p->fpu_counter>5) 573 if (next_p->fpu_counter > 5)
548 prefetch(next->xstate); 574 prefetch(next->xstate);
549 575
550 /* 576 /*
@@ -552,13 +578,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
552 */ 578 */
553 load_sp0(tss, next); 579 load_sp0(tss, next);
554 580
555 /* 581 /*
556 * Switch DS and ES. 582 * Switch DS and ES.
557 * This won't pick up thread selector changes, but I guess that is ok. 583 * This won't pick up thread selector changes, but I guess that is ok.
558 */ 584 */
559 savesegment(es, prev->es); 585 savesegment(es, prev->es);
560 if (unlikely(next->es | prev->es)) 586 if (unlikely(next->es | prev->es))
561 loadsegment(es, next->es); 587 loadsegment(es, next->es);
562 588
563 savesegment(ds, prev->ds); 589 savesegment(ds, prev->ds);
564 if (unlikely(next->ds | prev->ds)) 590 if (unlikely(next->ds | prev->ds))
@@ -584,7 +610,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
584 */ 610 */
585 arch_leave_lazy_cpu_mode(); 611 arch_leave_lazy_cpu_mode();
586 612
587 /* 613 /*
588 * Switch FS and GS. 614 * Switch FS and GS.
589 * 615 *
590 * Segment register != 0 always requires a reload. Also 616 * Segment register != 0 always requires a reload. Also
@@ -593,13 +619,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
593 */ 619 */
594 if (unlikely(fsindex | next->fsindex | prev->fs)) { 620 if (unlikely(fsindex | next->fsindex | prev->fs)) {
595 loadsegment(fs, next->fsindex); 621 loadsegment(fs, next->fsindex);
596 /* 622 /*
597 * Check if the user used a selector != 0; if yes 623 * Check if the user used a selector != 0; if yes
598 * clear 64bit base, since overloaded base is always 624 * clear 64bit base, since overloaded base is always
599 * mapped to the Null selector 625 * mapped to the Null selector
600 */ 626 */
601 if (fsindex) 627 if (fsindex)
602 prev->fs = 0; 628 prev->fs = 0;
603 } 629 }
604 /* when next process has a 64bit base use it */ 630 /* when next process has a 64bit base use it */
605 if (next->fs) 631 if (next->fs)
@@ -609,7 +635,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
609 if (unlikely(gsindex | next->gsindex | prev->gs)) { 635 if (unlikely(gsindex | next->gsindex | prev->gs)) {
610 load_gs_index(next->gsindex); 636 load_gs_index(next->gsindex);
611 if (gsindex) 637 if (gsindex)
612 prev->gs = 0; 638 prev->gs = 0;
613 } 639 }
614 if (next->gs) 640 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 641 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
@@ -618,12 +644,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
618 /* Must be after DS reload */ 644 /* Must be after DS reload */
619 unlazy_fpu(prev_p); 645 unlazy_fpu(prev_p);
620 646
621 /* 647 /*
622 * Switch the PDA and FPU contexts. 648 * Switch the PDA and FPU contexts.
623 */ 649 */
624 prev->usersp = read_pda(oldrsp); 650 prev->usersp = read_pda(oldrsp);
625 write_pda(oldrsp, next->usersp); 651 write_pda(oldrsp, next->usersp);
626 write_pda(pcurrent, next_p); 652 write_pda(pcurrent, next_p);
627 653
628 write_pda(kernelstack, 654 write_pda(kernelstack,
629 (unsigned long)task_stack_page(next_p) + 655 (unsigned long)task_stack_page(next_p) +
@@ -664,7 +690,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
664 char __user * __user *envp, struct pt_regs *regs) 690 char __user * __user *envp, struct pt_regs *regs)
665{ 691{
666 long error; 692 long error;
667 char * filename; 693 char *filename;
668 694
669 filename = getname(name); 695 filename = getname(name);
670 error = PTR_ERR(filename); 696 error = PTR_ERR(filename);
@@ -722,55 +748,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
722unsigned long get_wchan(struct task_struct *p) 748unsigned long get_wchan(struct task_struct *p)
723{ 749{
724 unsigned long stack; 750 unsigned long stack;
725 u64 fp,ip; 751 u64 fp, ip;
726 int count = 0; 752 int count = 0;
727 753
728 if (!p || p == current || p->state==TASK_RUNNING) 754 if (!p || p == current || p->state == TASK_RUNNING)
729 return 0; 755 return 0;
730 stack = (unsigned long)task_stack_page(p); 756 stack = (unsigned long)task_stack_page(p);
731 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) 757 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
732 return 0; 758 return 0;
733 fp = *(u64 *)(p->thread.sp); 759 fp = *(u64 *)(p->thread.sp);
734 do { 760 do {
735 if (fp < (unsigned long)stack || 761 if (fp < (unsigned long)stack ||
736 fp > (unsigned long)stack+THREAD_SIZE) 762 fp > (unsigned long)stack+THREAD_SIZE)
737 return 0; 763 return 0;
738 ip = *(u64 *)(fp+8); 764 ip = *(u64 *)(fp+8);
739 if (!in_sched_functions(ip)) 765 if (!in_sched_functions(ip))
740 return ip; 766 return ip;
741 fp = *(u64 *)fp; 767 fp = *(u64 *)fp;
742 } while (count++ < 16); 768 } while (count++ < 16);
743 return 0; 769 return 0;
744} 770}
745 771
746long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) 772long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
747{ 773{
748 int ret = 0; 774 int ret = 0;
749 int doit = task == current; 775 int doit = task == current;
750 int cpu; 776 int cpu;
751 777
752 switch (code) { 778 switch (code) {
753 case ARCH_SET_GS: 779 case ARCH_SET_GS:
754 if (addr >= TASK_SIZE_OF(task)) 780 if (addr >= TASK_SIZE_OF(task))
755 return -EPERM; 781 return -EPERM;
756 cpu = get_cpu(); 782 cpu = get_cpu();
757 /* handle small bases via the GDT because that's faster to 783 /* handle small bases via the GDT because that's faster to
758 switch. */ 784 switch. */
759 if (addr <= 0xffffffff) { 785 if (addr <= 0xffffffff) {
760 set_32bit_tls(task, GS_TLS, addr); 786 set_32bit_tls(task, GS_TLS, addr);
761 if (doit) { 787 if (doit) {
762 load_TLS(&task->thread, cpu); 788 load_TLS(&task->thread, cpu);
763 load_gs_index(GS_TLS_SEL); 789 load_gs_index(GS_TLS_SEL);
764 } 790 }
765 task->thread.gsindex = GS_TLS_SEL; 791 task->thread.gsindex = GS_TLS_SEL;
766 task->thread.gs = 0; 792 task->thread.gs = 0;
767 } else { 793 } else {
768 task->thread.gsindex = 0; 794 task->thread.gsindex = 0;
769 task->thread.gs = addr; 795 task->thread.gs = addr;
770 if (doit) { 796 if (doit) {
771 load_gs_index(0); 797 load_gs_index(0);
772 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 798 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
773 } 799 }
774 } 800 }
775 put_cpu(); 801 put_cpu();
776 break; 802 break;
@@ -824,8 +850,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
824 rdmsrl(MSR_KERNEL_GS_BASE, base); 850 rdmsrl(MSR_KERNEL_GS_BASE, base);
825 else 851 else
826 base = task->thread.gs; 852 base = task->thread.gs;
827 } 853 } else
828 else
829 base = task->thread.gs; 854 base = task->thread.gs;
830 ret = put_user(base, (unsigned long __user *)addr); 855 ret = put_user(base, (unsigned long __user *)addr);
831 break; 856 break;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index fc3e8dcd9da6..e375b658efc3 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -14,6 +14,7 @@
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/ptrace.h> 15#include <linux/ptrace.h>
16#include <linux/regset.h> 16#include <linux/regset.h>
17#include <linux/tracehook.h>
17#include <linux/user.h> 18#include <linux/user.h>
18#include <linux/elf.h> 19#include <linux/elf.h>
19#include <linux/security.h> 20#include <linux/security.h>
@@ -554,45 +555,115 @@ static int ptrace_set_debugreg(struct task_struct *child,
554 return 0; 555 return 0;
555} 556}
556 557
557#ifdef X86_BTS 558#ifdef CONFIG_X86_PTRACE_BTS
559/*
560 * The configuration for a particular BTS hardware implementation.
561 */
562struct bts_configuration {
563 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
564 unsigned char sizeof_bts;
565 /* the size of a field in the BTS record in bytes */
566 unsigned char sizeof_field;
567 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
568 unsigned long debugctl_mask;
569};
570static struct bts_configuration bts_cfg;
571
572#define BTS_MAX_RECORD_SIZE (8 * 3)
573
574
575/*
576 * Branch Trace Store (BTS) uses the following format. Different
577 * architectures vary in the size of those fields.
578 * - source linear address
579 * - destination linear address
580 * - flags
581 *
582 * Later architectures use 64bit pointers throughout, whereas earlier
583 * architectures use 32bit pointers in 32bit mode.
584 *
585 * We compute the base address for the first 8 fields based on:
586 * - the field size stored in the DS configuration
587 * - the relative field position
588 *
589 * In order to store additional information in the BTS buffer, we use
590 * a special source address to indicate that the record requires
591 * special interpretation.
592 *
593 * Netburst indicated via a bit in the flags field whether the branch
594 * was predicted; this is ignored.
595 */
596
597enum bts_field {
598 bts_from = 0,
599 bts_to,
600 bts_flags,
601
602 bts_escape = (unsigned long)-1,
603 bts_qual = bts_to,
604 bts_jiffies = bts_flags
605};
606
607static inline unsigned long bts_get(const char *base, enum bts_field field)
608{
609 base += (bts_cfg.sizeof_field * field);
610 return *(unsigned long *)base;
611}
558 612
559static int ptrace_bts_get_size(struct task_struct *child) 613static inline void bts_set(char *base, enum bts_field field, unsigned long val)
560{ 614{
561 if (!child->thread.ds_area_msr) 615 base += (bts_cfg.sizeof_field * field);;
562 return -ENXIO; 616 (*(unsigned long *)base) = val;
617}
563 618
564 return ds_get_bts_index((void *)child->thread.ds_area_msr); 619/*
620 * Translate a BTS record from the raw format into the bts_struct format
621 *
622 * out (out): bts_struct interpretation
623 * raw: raw BTS record
624 */
625static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
626{
627 memset(out, 0, sizeof(*out));
628 if (bts_get(raw, bts_from) == bts_escape) {
629 out->qualifier = bts_get(raw, bts_qual);
630 out->variant.jiffies = bts_get(raw, bts_jiffies);
631 } else {
632 out->qualifier = BTS_BRANCH;
633 out->variant.lbr.from_ip = bts_get(raw, bts_from);
634 out->variant.lbr.to_ip = bts_get(raw, bts_to);
635 }
565} 636}
566 637
567static int ptrace_bts_read_record(struct task_struct *child, 638static int ptrace_bts_read_record(struct task_struct *child, size_t index,
568 long index,
569 struct bts_struct __user *out) 639 struct bts_struct __user *out)
570{ 640{
571 struct bts_struct ret; 641 struct bts_struct ret;
572 int retval; 642 const void *bts_record;
573 int bts_end; 643 size_t bts_index, bts_end;
574 int bts_index; 644 int error;
575
576 if (!child->thread.ds_area_msr)
577 return -ENXIO;
578 645
579 if (index < 0) 646 error = ds_get_bts_end(child, &bts_end);
580 return -EINVAL; 647 if (error < 0)
648 return error;
581 649
582 bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
583 if (bts_end <= index) 650 if (bts_end <= index)
584 return -EINVAL; 651 return -EINVAL;
585 652
653 error = ds_get_bts_index(child, &bts_index);
654 if (error < 0)
655 return error;
656
586 /* translate the ptrace bts index into the ds bts index */ 657 /* translate the ptrace bts index into the ds bts index */
587 bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); 658 bts_index += bts_end - (index + 1);
588 bts_index -= (index + 1); 659 if (bts_end <= bts_index)
589 if (bts_index < 0) 660 bts_index -= bts_end;
590 bts_index += bts_end;
591 661
592 retval = ds_read_bts((void *)child->thread.ds_area_msr, 662 error = ds_access_bts(child, bts_index, &bts_record);
593 bts_index, &ret); 663 if (error < 0)
594 if (retval < 0) 664 return error;
595 return retval; 665
666 ptrace_bts_translate_record(&ret, bts_record);
596 667
597 if (copy_to_user(out, &ret, sizeof(ret))) 668 if (copy_to_user(out, &ret, sizeof(ret)))
598 return -EFAULT; 669 return -EFAULT;
@@ -600,101 +671,106 @@ static int ptrace_bts_read_record(struct task_struct *child,
600 return sizeof(ret); 671 return sizeof(ret);
601} 672}
602 673
603static int ptrace_bts_clear(struct task_struct *child)
604{
605 if (!child->thread.ds_area_msr)
606 return -ENXIO;
607
608 return ds_clear((void *)child->thread.ds_area_msr);
609}
610
611static int ptrace_bts_drain(struct task_struct *child, 674static int ptrace_bts_drain(struct task_struct *child,
612 long size, 675 long size,
613 struct bts_struct __user *out) 676 struct bts_struct __user *out)
614{ 677{
615 int end, i; 678 struct bts_struct ret;
616 void *ds = (void *)child->thread.ds_area_msr; 679 const unsigned char *raw;
617 680 size_t end, i;
618 if (!ds) 681 int error;
619 return -ENXIO;
620 682
621 end = ds_get_bts_index(ds); 683 error = ds_get_bts_index(child, &end);
622 if (end <= 0) 684 if (error < 0)
623 return end; 685 return error;
624 686
625 if (size < (end * sizeof(struct bts_struct))) 687 if (size < (end * sizeof(struct bts_struct)))
626 return -EIO; 688 return -EIO;
627 689
628 for (i = 0; i < end; i++, out++) { 690 error = ds_access_bts(child, 0, (const void **)&raw);
629 struct bts_struct ret; 691 if (error < 0)
630 int retval; 692 return error;
631 693
632 retval = ds_read_bts(ds, i, &ret); 694 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
633 if (retval < 0) 695 ptrace_bts_translate_record(&ret, raw);
634 return retval;
635 696
636 if (copy_to_user(out, &ret, sizeof(ret))) 697 if (copy_to_user(out, &ret, sizeof(ret)))
637 return -EFAULT; 698 return -EFAULT;
638 } 699 }
639 700
640 ds_clear(ds); 701 error = ds_clear_bts(child);
702 if (error < 0)
703 return error;
641 704
642 return end; 705 return end;
643} 706}
644 707
708static void ptrace_bts_ovfl(struct task_struct *child)
709{
710 send_sig(child->thread.bts_ovfl_signal, child, 0);
711}
712
645static int ptrace_bts_config(struct task_struct *child, 713static int ptrace_bts_config(struct task_struct *child,
646 long cfg_size, 714 long cfg_size,
647 const struct ptrace_bts_config __user *ucfg) 715 const struct ptrace_bts_config __user *ucfg)
648{ 716{
649 struct ptrace_bts_config cfg; 717 struct ptrace_bts_config cfg;
650 int bts_size, ret = 0; 718 int error = 0;
651 void *ds; 719
720 error = -EOPNOTSUPP;
721 if (!bts_cfg.sizeof_bts)
722 goto errout;
652 723
724 error = -EIO;
653 if (cfg_size < sizeof(cfg)) 725 if (cfg_size < sizeof(cfg))
654 return -EIO; 726 goto errout;
655 727
728 error = -EFAULT;
656 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 729 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
657 return -EFAULT; 730 goto errout;
658 731
659 if ((int)cfg.size < 0) 732 error = -EINVAL;
660 return -EINVAL; 733 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
734 !(cfg.flags & PTRACE_BTS_O_ALLOC))
735 goto errout;
661 736
662 bts_size = 0; 737 if (cfg.flags & PTRACE_BTS_O_ALLOC) {
663 ds = (void *)child->thread.ds_area_msr; 738 ds_ovfl_callback_t ovfl = NULL;
664 if (ds) { 739 unsigned int sig = 0;
665 bts_size = ds_get_bts_size(ds); 740
666 if (bts_size < 0) 741 /* we ignore the error in case we were not tracing child */
667 return bts_size; 742 (void)ds_release_bts(child);
668 }
669 cfg.size = PAGE_ALIGN(cfg.size);
670 743
671 if (bts_size != cfg.size) { 744 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
672 ret = ptrace_bts_realloc(child, cfg.size, 745 if (!cfg.signal)
673 cfg.flags & PTRACE_BTS_O_CUT_SIZE); 746 goto errout;
674 if (ret < 0) 747
748 sig = cfg.signal;
749 ovfl = ptrace_bts_ovfl;
750 }
751
752 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
753 if (error < 0)
675 goto errout; 754 goto errout;
676 755
677 ds = (void *)child->thread.ds_area_msr; 756 child->thread.bts_ovfl_signal = sig;
678 } 757 }
679 758
680 if (cfg.flags & PTRACE_BTS_O_SIGNAL) 759 error = -EINVAL;
681 ret = ds_set_overflow(ds, DS_O_SIGNAL); 760 if (!child->thread.ds_ctx && cfg.flags)
682 else
683 ret = ds_set_overflow(ds, DS_O_WRAP);
684 if (ret < 0)
685 goto errout; 761 goto errout;
686 762
687 if (cfg.flags & PTRACE_BTS_O_TRACE) 763 if (cfg.flags & PTRACE_BTS_O_TRACE)
688 child->thread.debugctlmsr |= ds_debugctl_mask(); 764 child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
689 else 765 else
690 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 766 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
691 767
692 if (cfg.flags & PTRACE_BTS_O_SCHED) 768 if (cfg.flags & PTRACE_BTS_O_SCHED)
693 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 769 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
694 else 770 else
695 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 771 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
696 772
697 ret = sizeof(cfg); 773 error = sizeof(cfg);
698 774
699out: 775out:
700 if (child->thread.debugctlmsr) 776 if (child->thread.debugctlmsr)
@@ -702,10 +778,10 @@ out:
702 else 778 else
703 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 779 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
704 780
705 return ret; 781 return error;
706 782
707errout: 783errout:
708 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 784 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
709 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 785 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
710 goto out; 786 goto out;
711} 787}
@@ -714,29 +790,40 @@ static int ptrace_bts_status(struct task_struct *child,
714 long cfg_size, 790 long cfg_size,
715 struct ptrace_bts_config __user *ucfg) 791 struct ptrace_bts_config __user *ucfg)
716{ 792{
717 void *ds = (void *)child->thread.ds_area_msr;
718 struct ptrace_bts_config cfg; 793 struct ptrace_bts_config cfg;
794 size_t end;
795 const void *base, *max;
796 int error;
719 797
720 if (cfg_size < sizeof(cfg)) 798 if (cfg_size < sizeof(cfg))
721 return -EIO; 799 return -EIO;
722 800
723 memset(&cfg, 0, sizeof(cfg)); 801 error = ds_get_bts_end(child, &end);
802 if (error < 0)
803 return error;
724 804
725 if (ds) { 805 error = ds_access_bts(child, /* index = */ 0, &base);
726 cfg.size = ds_get_bts_size(ds); 806 if (error < 0)
807 return error;
727 808
728 if (ds_get_overflow(ds) == DS_O_SIGNAL) 809 error = ds_access_bts(child, /* index = */ end, &max);
729 cfg.flags |= PTRACE_BTS_O_SIGNAL; 810 if (error < 0)
811 return error;
730 812
731 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 813 memset(&cfg, 0, sizeof(cfg));
732 child->thread.debugctlmsr & ds_debugctl_mask()) 814 cfg.size = (max - base);
733 cfg.flags |= PTRACE_BTS_O_TRACE; 815 cfg.signal = child->thread.bts_ovfl_signal;
816 cfg.bts_size = sizeof(struct bts_struct);
734 817
735 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 818 if (cfg.signal)
736 cfg.flags |= PTRACE_BTS_O_SCHED; 819 cfg.flags |= PTRACE_BTS_O_SIGNAL;
737 }
738 820
739 cfg.bts_size = sizeof(struct bts_struct); 821 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
822 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
823 cfg.flags |= PTRACE_BTS_O_TRACE;
824
825 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
826 cfg.flags |= PTRACE_BTS_O_SCHED;
740 827
741 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 828 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
742 return -EFAULT; 829 return -EFAULT;
@@ -744,89 +831,38 @@ static int ptrace_bts_status(struct task_struct *child,
744 return sizeof(cfg); 831 return sizeof(cfg);
745} 832}
746 833
747
748static int ptrace_bts_write_record(struct task_struct *child, 834static int ptrace_bts_write_record(struct task_struct *child,
749 const struct bts_struct *in) 835 const struct bts_struct *in)
750{ 836{
751 int retval; 837 unsigned char bts_record[BTS_MAX_RECORD_SIZE];
752 838
753 if (!child->thread.ds_area_msr) 839 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
754 return -ENXIO;
755 840
756 retval = ds_write_bts((void *)child->thread.ds_area_msr, in); 841 memset(bts_record, 0, bts_cfg.sizeof_bts);
757 if (retval) 842 switch (in->qualifier) {
758 return retval; 843 case BTS_INVALID:
844 break;
759 845
760 return sizeof(*in); 846 case BTS_BRANCH:
761} 847 bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
848 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
849 break;
762 850
763static int ptrace_bts_realloc(struct task_struct *child, 851 case BTS_TASK_ARRIVES:
764 int size, int reduce_size) 852 case BTS_TASK_DEPARTS:
765{ 853 bts_set(bts_record, bts_from, bts_escape);
766 unsigned long rlim, vm; 854 bts_set(bts_record, bts_qual, in->qualifier);
767 int ret, old_size; 855 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
856 break;
768 857
769 if (size < 0) 858 default:
770 return -EINVAL; 859 return -EINVAL;
771
772 old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
773 if (old_size < 0)
774 return old_size;
775
776 ret = ds_free((void **)&child->thread.ds_area_msr);
777 if (ret < 0)
778 goto out;
779
780 size >>= PAGE_SHIFT;
781 old_size >>= PAGE_SHIFT;
782
783 current->mm->total_vm -= old_size;
784 current->mm->locked_vm -= old_size;
785
786 if (size == 0)
787 goto out;
788
789 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
790 vm = current->mm->total_vm + size;
791 if (rlim < vm) {
792 ret = -ENOMEM;
793
794 if (!reduce_size)
795 goto out;
796
797 size = rlim - current->mm->total_vm;
798 if (size <= 0)
799 goto out;
800 }
801
802 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
803 vm = current->mm->locked_vm + size;
804 if (rlim < vm) {
805 ret = -ENOMEM;
806
807 if (!reduce_size)
808 goto out;
809
810 size = rlim - current->mm->locked_vm;
811 if (size <= 0)
812 goto out;
813 } 860 }
814 861
815 ret = ds_allocate((void **)&child->thread.ds_area_msr, 862 /* The writing task will be the switched-to task on a context
816 size << PAGE_SHIFT); 863 * switch. It needs to write into the switched-from task's BTS
817 if (ret < 0) 864 * buffer. */
818 goto out; 865 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
819
820 current->mm->total_vm += size;
821 current->mm->locked_vm += size;
822
823out:
824 if (child->thread.ds_area_msr)
825 set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
826 else
827 clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
828
829 return ret;
830} 866}
831 867
832void ptrace_bts_take_timestamp(struct task_struct *tsk, 868void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -839,7 +875,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
839 875
840 ptrace_bts_write_record(tsk, &rec); 876 ptrace_bts_write_record(tsk, &rec);
841} 877}
842#endif /* X86_BTS */ 878
879static const struct bts_configuration bts_cfg_netburst = {
880 .sizeof_bts = sizeof(long) * 3,
881 .sizeof_field = sizeof(long),
882 .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
883};
884
885static const struct bts_configuration bts_cfg_pentium_m = {
886 .sizeof_bts = sizeof(long) * 3,
887 .sizeof_field = sizeof(long),
888 .debugctl_mask = (1<<6)|(1<<7)
889};
890
891static const struct bts_configuration bts_cfg_core2 = {
892 .sizeof_bts = 8 * 3,
893 .sizeof_field = 8,
894 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
895};
896
897static inline void bts_configure(const struct bts_configuration *cfg)
898{
899 bts_cfg = *cfg;
900}
901
902void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
903{
904 switch (c->x86) {
905 case 0x6:
906 switch (c->x86_model) {
907 case 0xD:
908 case 0xE: /* Pentium M */
909 bts_configure(&bts_cfg_pentium_m);
910 break;
911 case 0xF: /* Core2 */
912 case 0x1C: /* Atom */
913 bts_configure(&bts_cfg_core2);
914 break;
915 default:
916 /* sorry, don't know about them */
917 break;
918 }
919 break;
920 case 0xF:
921 switch (c->x86_model) {
922 case 0x0:
923 case 0x1:
924 case 0x2: /* Netburst */
925 bts_configure(&bts_cfg_netburst);
926 break;
927 default:
928 /* sorry, don't know about them */
929 break;
930 }
931 break;
932 default:
933 /* sorry, don't know about them */
934 break;
935 }
936}
937#endif /* CONFIG_X86_PTRACE_BTS */
843 938
844/* 939/*
845 * Called by kernel/ptrace.c when detaching.. 940 * Called by kernel/ptrace.c when detaching..
@@ -852,15 +947,15 @@ void ptrace_disable(struct task_struct *child)
852#ifdef TIF_SYSCALL_EMU 947#ifdef TIF_SYSCALL_EMU
853 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 948 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
854#endif 949#endif
855 if (child->thread.ds_area_msr) { 950#ifdef CONFIG_X86_PTRACE_BTS
856#ifdef X86_BTS 951 (void)ds_release_bts(child);
857 ptrace_bts_realloc(child, 0, 0); 952
858#endif 953 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
859 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 954 if (!child->thread.debugctlmsr)
860 if (!child->thread.debugctlmsr) 955 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
861 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 956
862 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 957 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
863 } 958#endif /* CONFIG_X86_PTRACE_BTS */
864} 959}
865 960
866#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 961#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -980,7 +1075,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
980 /* 1075 /*
981 * These bits need more cooking - not enabled yet: 1076 * These bits need more cooking - not enabled yet:
982 */ 1077 */
983#ifdef X86_BTS 1078#ifdef CONFIG_X86_PTRACE_BTS
984 case PTRACE_BTS_CONFIG: 1079 case PTRACE_BTS_CONFIG:
985 ret = ptrace_bts_config 1080 ret = ptrace_bts_config
986 (child, data, (struct ptrace_bts_config __user *)addr); 1081 (child, data, (struct ptrace_bts_config __user *)addr);
@@ -992,7 +1087,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
992 break; 1087 break;
993 1088
994 case PTRACE_BTS_SIZE: 1089 case PTRACE_BTS_SIZE:
995 ret = ptrace_bts_get_size(child); 1090 ret = ds_get_bts_index(child, /* pos = */ NULL);
996 break; 1091 break;
997 1092
998 case PTRACE_BTS_GET: 1093 case PTRACE_BTS_GET:
@@ -1001,14 +1096,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1001 break; 1096 break;
1002 1097
1003 case PTRACE_BTS_CLEAR: 1098 case PTRACE_BTS_CLEAR:
1004 ret = ptrace_bts_clear(child); 1099 ret = ds_clear_bts(child);
1005 break; 1100 break;
1006 1101
1007 case PTRACE_BTS_DRAIN: 1102 case PTRACE_BTS_DRAIN:
1008 ret = ptrace_bts_drain 1103 ret = ptrace_bts_drain
1009 (child, data, (struct bts_struct __user *) addr); 1104 (child, data, (struct bts_struct __user *) addr);
1010 break; 1105 break;
1011#endif 1106#endif /* CONFIG_X86_PTRACE_BTS */
1012 1107
1013 default: 1108 default:
1014 ret = ptrace_request(child, request, addr, data); 1109 ret = ptrace_request(child, request, addr, data);
@@ -1375,30 +1470,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1375 force_sig_info(SIGTRAP, &info, tsk); 1470 force_sig_info(SIGTRAP, &info, tsk);
1376} 1471}
1377 1472
1378static void syscall_trace(struct pt_regs *regs)
1379{
1380 if (!(current->ptrace & PT_PTRACED))
1381 return;
1382
1383#if 0
1384 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
1385 current->comm,
1386 regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
1387 current_thread_info()->flags, current->ptrace);
1388#endif
1389
1390 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
1391 ? 0x80 : 0));
1392 /*
1393 * this isn't the same as continuing with a signal, but it will do
1394 * for normal use. strace only continues with a signal if the
1395 * stopping signal is not SIGTRAP. -brl
1396 */
1397 if (current->exit_code) {
1398 send_sig(current->exit_code, current, 1);
1399 current->exit_code = 0;
1400 }
1401}
1402 1473
1403#ifdef CONFIG_X86_32 1474#ifdef CONFIG_X86_32
1404# define IS_IA32 1 1475# define IS_IA32 1
@@ -1432,8 +1503,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1432 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) 1503 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1433 ret = -1L; 1504 ret = -1L;
1434 1505
1435 if (ret || test_thread_flag(TIF_SYSCALL_TRACE)) 1506 if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
1436 syscall_trace(regs); 1507 tracehook_report_syscall_entry(regs))
1508 ret = -1L;
1437 1509
1438 if (unlikely(current->audit_context)) { 1510 if (unlikely(current->audit_context)) {
1439 if (IS_IA32) 1511 if (IS_IA32)
@@ -1459,7 +1531,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1459 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1531 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1460 1532
1461 if (test_thread_flag(TIF_SYSCALL_TRACE)) 1533 if (test_thread_flag(TIF_SYSCALL_TRACE))
1462 syscall_trace(regs); 1534 tracehook_report_syscall_exit(regs, 0);
1463 1535
1464 /* 1536 /*
1465 * If TIF_SYSCALL_EMU is set, we only get here because of 1537 * If TIF_SYSCALL_EMU is set, we only get here because of
@@ -1475,6 +1547,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1475 * system call instruction. 1547 * system call instruction.
1476 */ 1548 */
1477 if (test_thread_flag(TIF_SINGLESTEP) && 1549 if (test_thread_flag(TIF_SINGLESTEP) &&
1478 (current->ptrace & PT_PTRACED)) 1550 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
1479 send_sigtrap(current, regs, 0); 1551 send_sigtrap(current, regs, 0);
1480} 1552}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 724adfc63cb9..f4c93f1cfc19 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off);
29 29
30static const struct desc_ptr no_idt = {}; 30static const struct desc_ptr no_idt = {};
31static int reboot_mode; 31static int reboot_mode;
32enum reboot_type reboot_type = BOOT_KBD; 32/*
33 * Keyboard reset and triple fault may result in INIT, not RESET, which
34 * doesn't work when we're in vmx root mode. Try ACPI first.
35 */
36enum reboot_type reboot_type = BOOT_ACPI;
33int reboot_force; 37int reboot_force;
34 38
35#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 39#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 673f12cf6eb0..46c98efbbf8d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -223,6 +223,9 @@ unsigned long saved_video_mode;
223#define RAMDISK_LOAD_FLAG 0x4000 223#define RAMDISK_LOAD_FLAG 0x4000
224 224
225static char __initdata command_line[COMMAND_LINE_SIZE]; 225static char __initdata command_line[COMMAND_LINE_SIZE];
226#ifdef CONFIG_CMDLINE_BOOL
227static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
228#endif
226 229
227#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 230#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
228struct edd edd; 231struct edd edd;
@@ -665,11 +668,28 @@ void __init setup_arch(char **cmdline_p)
665 bss_resource.start = virt_to_phys(&__bss_start); 668 bss_resource.start = virt_to_phys(&__bss_start);
666 bss_resource.end = virt_to_phys(&__bss_stop)-1; 669 bss_resource.end = virt_to_phys(&__bss_stop)-1;
667 670
671#ifdef CONFIG_CMDLINE_BOOL
672#ifdef CONFIG_CMDLINE_OVERRIDE
673 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
674#else
675 if (builtin_cmdline[0]) {
676 /* append boot loader cmdline to builtin */
677 strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
678 strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
679 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
680 }
681#endif
682#endif
683
668 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 684 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
669 *cmdline_p = command_line; 685 *cmdline_p = command_line;
670 686
671 parse_early_param(); 687 parse_early_param();
672 688
689#ifdef CONFIG_X86_64
690 check_efer();
691#endif
692
673#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) 693#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
674 /* 694 /*
675 * Must be before kernel pagetables are setup 695 * Must be before kernel pagetables are setup
@@ -738,7 +758,6 @@ void __init setup_arch(char **cmdline_p)
738#else 758#else
739 num_physpages = max_pfn; 759 num_physpages = max_pfn;
740 760
741 check_efer();
742 if (cpu_has_x2apic) 761 if (cpu_has_x2apic)
743 check_x2apic(); 762 check_x2apic();
744 763
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 6dd7e2b70a4b..cc673aa55ce4 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -34,4 +34,9 @@ struct rt_sigframe {
34 struct siginfo info; 34 struct siginfo info;
35 /* fp state follows here */ 35 /* fp state follows here */
36}; 36};
37
38int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
39 sigset_t *set, struct pt_regs *regs);
40int ia32_setup_frame(int sig, struct k_sigaction *ka,
41 sigset_t *set, struct pt_regs *regs);
37#endif 42#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 8d380b699c0c..b21070ea33a4 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/wait.h> 19#include <linux/wait.h>
20#include <linux/tracehook.h>
20#include <linux/elf.h> 21#include <linux/elf.h>
21#include <linux/smp.h> 22#include <linux/smp.h>
22#include <linux/mm.h> 23#include <linux/mm.h>
@@ -556,8 +557,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
556 * handler too. 557 * handler too.
557 */ 558 */
558 regs->flags &= ~X86_EFLAGS_TF; 559 regs->flags &= ~X86_EFLAGS_TF;
559 if (test_thread_flag(TIF_SINGLESTEP))
560 ptrace_notify(SIGTRAP);
561 560
562 spin_lock_irq(&current->sighand->siglock); 561 spin_lock_irq(&current->sighand->siglock);
563 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); 562 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
@@ -566,6 +565,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
566 recalc_sigpending(); 565 recalc_sigpending();
567 spin_unlock_irq(&current->sighand->siglock); 566 spin_unlock_irq(&current->sighand->siglock);
568 567
568 tracehook_signal_handler(sig, info, ka, regs,
569 test_thread_flag(TIF_SINGLESTEP));
570
569 return 0; 571 return 0;
570} 572}
571 573
@@ -659,5 +661,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
659 if (thread_info_flags & _TIF_SIGPENDING) 661 if (thread_info_flags & _TIF_SIGPENDING)
660 do_signal(regs); 662 do_signal(regs);
661 663
664 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
665 clear_thread_flag(TIF_NOTIFY_RESUME);
666 tracehook_notify_resume(regs);
667 }
668
662 clear_thread_flag(TIF_IRET); 669 clear_thread_flag(TIF_IRET);
663} 670}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 4665b598a376..823a55bf8c39 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -15,17 +15,20 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/ptrace.h> 17#include <linux/ptrace.h>
18#include <linux/tracehook.h>
18#include <linux/unistd.h> 19#include <linux/unistd.h>
19#include <linux/stddef.h> 20#include <linux/stddef.h>
20#include <linux/personality.h> 21#include <linux/personality.h>
21#include <linux/compiler.h> 22#include <linux/compiler.h>
23#include <linux/uaccess.h>
24
22#include <asm/processor.h> 25#include <asm/processor.h>
23#include <asm/ucontext.h> 26#include <asm/ucontext.h>
24#include <asm/uaccess.h>
25#include <asm/i387.h> 27#include <asm/i387.h>
26#include <asm/proto.h> 28#include <asm/proto.h>
27#include <asm/ia32_unistd.h> 29#include <asm/ia32_unistd.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
31#include <asm/syscall.h>
29#include <asm/syscalls.h> 32#include <asm/syscalls.h>
30#include "sigframe.h" 33#include "sigframe.h"
31 34
@@ -42,11 +45,6 @@
42# define FIX_EFLAGS __FIX_EFLAGS 45# define FIX_EFLAGS __FIX_EFLAGS
43#endif 46#endif
44 47
45int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
46 sigset_t *set, struct pt_regs * regs);
47int ia32_setup_frame(int sig, struct k_sigaction *ka,
48 sigset_t *set, struct pt_regs * regs);
49
50asmlinkage long 48asmlinkage long
51sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 49sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
52 struct pt_regs *regs) 50 struct pt_regs *regs)
@@ -66,7 +64,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
66 /* Always make any pending restarted system calls return -EINTR */ 64 /* Always make any pending restarted system calls return -EINTR */
67 current_thread_info()->restart_block.fn = do_no_restart_syscall; 65 current_thread_info()->restart_block.fn = do_no_restart_syscall;
68 66
69#define COPY(x) err |= __get_user(regs->x, &sc->x) 67#define COPY(x) (err |= __get_user(regs->x, &sc->x))
70 68
71 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 69 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
72 COPY(dx); COPY(cx); COPY(ip); 70 COPY(dx); COPY(cx); COPY(ip);
@@ -96,7 +94,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
96 } 94 }
97 95
98 { 96 {
99 struct _fpstate __user * buf; 97 struct _fpstate __user *buf;
100 err |= __get_user(buf, &sc->fpstate); 98 err |= __get_user(buf, &sc->fpstate);
101 err |= restore_i387_xstate(buf); 99 err |= restore_i387_xstate(buf);
102 } 100 }
@@ -122,7 +120,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
122 current->blocked = set; 120 current->blocked = set;
123 recalc_sigpending(); 121 recalc_sigpending();
124 spin_unlock_irq(&current->sighand->siglock); 122 spin_unlock_irq(&current->sighand->siglock);
125 123
126 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 124 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
127 goto badframe; 125 goto badframe;
128 126
@@ -132,16 +130,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
132 return ax; 130 return ax;
133 131
134badframe: 132badframe:
135 signal_fault(regs,frame,"sigreturn"); 133 signal_fault(regs, frame, "sigreturn");
136 return 0; 134 return 0;
137} 135}
138 136
139/* 137/*
140 * Set up a signal frame. 138 * Set up a signal frame.
141 */ 139 */
142 140
143static inline int 141static inline int
144setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) 142setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
143 unsigned long mask, struct task_struct *me)
145{ 144{
146 int err = 0; 145 int err = 0;
147 146
@@ -197,7 +196,7 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
197} 196}
198 197
199static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 198static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
200 sigset_t *set, struct pt_regs * regs) 199 sigset_t *set, struct pt_regs *regs)
201{ 200{
202 struct rt_sigframe __user *frame; 201 struct rt_sigframe __user *frame;
203 void __user *fp = NULL; 202 void __user *fp = NULL;
@@ -210,19 +209,19 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
210 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; 209 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
211 210
212 if (save_i387_xstate(fp) < 0) 211 if (save_i387_xstate(fp) < 0)
213 err |= -1; 212 err |= -1;
214 } else 213 } else
215 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; 214 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
216 215
217 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 216 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
218 goto give_sigsegv; 217 goto give_sigsegv;
219 218
220 if (ka->sa.sa_flags & SA_SIGINFO) { 219 if (ka->sa.sa_flags & SA_SIGINFO) {
221 err |= copy_siginfo_to_user(&frame->info, info); 220 err |= copy_siginfo_to_user(&frame->info, info);
222 if (err) 221 if (err)
223 goto give_sigsegv; 222 goto give_sigsegv;
224 } 223 }
225 224
226 /* Create the ucontext. */ 225 /* Create the ucontext. */
227 if (cpu_has_xsave) 226 if (cpu_has_xsave)
228 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); 227 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
@@ -235,9 +234,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
235 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); 234 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
236 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); 235 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
237 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); 236 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
238 if (sizeof(*set) == 16) { 237 if (sizeof(*set) == 16) {
239 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); 238 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
240 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); 239 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
241 } else 240 } else
242 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 241 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
243 242
@@ -248,7 +247,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
248 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); 247 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
249 } else { 248 } else {
250 /* could use a vstub here */ 249 /* could use a vstub here */
251 goto give_sigsegv; 250 goto give_sigsegv;
252 } 251 }
253 252
254 if (err) 253 if (err)
@@ -256,7 +255,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
256 255
257 /* Set up registers for signal handler */ 256 /* Set up registers for signal handler */
258 regs->di = sig; 257 regs->di = sig;
259 /* In case the signal handler was declared without prototypes */ 258 /* In case the signal handler was declared without prototypes */
260 regs->ax = 0; 259 regs->ax = 0;
261 260
262 /* This also works for non SA_SIGINFO handlers because they expect the 261 /* This also works for non SA_SIGINFO handlers because they expect the
@@ -279,37 +278,8 @@ give_sigsegv:
279} 278}
280 279
281/* 280/*
282 * Return -1L or the syscall number that @regs is executing.
283 */
284static long current_syscall(struct pt_regs *regs)
285{
286 /*
287 * We always sign-extend a -1 value being set here,
288 * so this is always either -1L or a syscall number.
289 */
290 return regs->orig_ax;
291}
292
293/*
294 * Return a value that is -EFOO if the system call in @regs->orig_ax
295 * returned an error. This only works for @regs from @current.
296 */
297static long current_syscall_ret(struct pt_regs *regs)
298{
299#ifdef CONFIG_IA32_EMULATION
300 if (test_thread_flag(TIF_IA32))
301 /*
302 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
303 * and will match correctly in comparisons.
304 */
305 return (int) regs->ax;
306#endif
307 return regs->ax;
308}
309
310/*
311 * OK, we're invoking a handler 281 * OK, we're invoking a handler
312 */ 282 */
313 283
314static int 284static int
315handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 285handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -318,9 +288,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
318 int ret; 288 int ret;
319 289
320 /* Are we from a system call? */ 290 /* Are we from a system call? */
321 if (current_syscall(regs) >= 0) { 291 if (syscall_get_nr(current, regs) >= 0) {
322 /* If so, check system call restarting.. */ 292 /* If so, check system call restarting.. */
323 switch (current_syscall_ret(regs)) { 293 switch (syscall_get_error(current, regs)) {
324 case -ERESTART_RESTARTBLOCK: 294 case -ERESTART_RESTARTBLOCK:
325 case -ERESTARTNOHAND: 295 case -ERESTARTNOHAND:
326 regs->ax = -EINTR; 296 regs->ax = -EINTR;
@@ -353,7 +323,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
353 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); 323 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
354 else 324 else
355 ret = ia32_setup_frame(sig, ka, oldset, regs); 325 ret = ia32_setup_frame(sig, ka, oldset, regs);
356 } else 326 } else
357#endif 327#endif
358 ret = setup_rt_frame(sig, ka, info, oldset, regs); 328 ret = setup_rt_frame(sig, ka, info, oldset, regs);
359 329
@@ -377,15 +347,16 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
377 * handler too. 347 * handler too.
378 */ 348 */
379 regs->flags &= ~X86_EFLAGS_TF; 349 regs->flags &= ~X86_EFLAGS_TF;
380 if (test_thread_flag(TIF_SINGLESTEP))
381 ptrace_notify(SIGTRAP);
382 350
383 spin_lock_irq(&current->sighand->siglock); 351 spin_lock_irq(&current->sighand->siglock);
384 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask); 352 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
385 if (!(ka->sa.sa_flags & SA_NODEFER)) 353 if (!(ka->sa.sa_flags & SA_NODEFER))
386 sigaddset(&current->blocked,sig); 354 sigaddset(&current->blocked, sig);
387 recalc_sigpending(); 355 recalc_sigpending();
388 spin_unlock_irq(&current->sighand->siglock); 356 spin_unlock_irq(&current->sighand->siglock);
357
358 tracehook_signal_handler(sig, info, ka, regs,
359 test_thread_flag(TIF_SINGLESTEP));
389 } 360 }
390 361
391 return ret; 362 return ret;
@@ -442,9 +413,9 @@ static void do_signal(struct pt_regs *regs)
442 } 413 }
443 414
444 /* Did we come from a system call? */ 415 /* Did we come from a system call? */
445 if (current_syscall(regs) >= 0) { 416 if (syscall_get_nr(current, regs) >= 0) {
446 /* Restart the system call - no handlers present */ 417 /* Restart the system call - no handlers present */
447 switch (current_syscall_ret(regs)) { 418 switch (syscall_get_error(current, regs)) {
448 case -ERESTARTNOHAND: 419 case -ERESTARTNOHAND:
449 case -ERESTARTSYS: 420 case -ERESTARTSYS:
450 case -ERESTARTNOINTR: 421 case -ERESTARTNOINTR:
@@ -482,17 +453,23 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
482 /* deal with pending signal delivery */ 453 /* deal with pending signal delivery */
483 if (thread_info_flags & _TIF_SIGPENDING) 454 if (thread_info_flags & _TIF_SIGPENDING)
484 do_signal(regs); 455 do_signal(regs);
456
457 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
458 clear_thread_flag(TIF_NOTIFY_RESUME);
459 tracehook_notify_resume(regs);
460 }
485} 461}
486 462
487void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 463void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
488{ 464{
489 struct task_struct *me = current; 465 struct task_struct *me = current;
490 if (show_unhandled_signals && printk_ratelimit()) { 466 if (show_unhandled_signals && printk_ratelimit()) {
491 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", 467 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
492 me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); 468 me->comm, me->pid, where, frame, regs->ip,
469 regs->sp, regs->orig_ax);
493 print_vma_addr(" in ", regs->ip); 470 print_vma_addr(" in ", regs->ip);
494 printk("\n"); 471 printk("\n");
495 } 472 }
496 473
497 force_sig(SIGSEGV, me); 474 force_sig(SIGSEGV, me);
498} 475}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index aa804c64b167..9056f7e272c0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -258,6 +258,7 @@ static void __cpuinit smp_callin(void)
258 end_local_APIC_setup(); 258 end_local_APIC_setup();
259 map_cpu_to_logical_apicid(); 259 map_cpu_to_logical_apicid();
260 260
261 notify_cpu_starting(cpuid);
261 /* 262 /*
262 * Get our bogomips. 263 * Get our bogomips.
263 * 264 *
@@ -1313,16 +1314,13 @@ __init void prefill_possible_map(void)
1313 if (!num_processors) 1314 if (!num_processors)
1314 num_processors = 1; 1315 num_processors = 1;
1315 1316
1316#ifdef CONFIG_HOTPLUG_CPU
1317 if (additional_cpus == -1) { 1317 if (additional_cpus == -1) {
1318 if (disabled_cpus > 0) 1318 if (disabled_cpus > 0)
1319 additional_cpus = disabled_cpus; 1319 additional_cpus = disabled_cpus;
1320 else 1320 else
1321 additional_cpus = 0; 1321 additional_cpus = 0;
1322 } 1322 }
1323#else 1323
1324 additional_cpus = 0;
1325#endif
1326 possible = num_processors + additional_cpus; 1324 possible = num_processors + additional_cpus;
1327 if (possible > NR_CPUS) 1325 if (possible > NR_CPUS)
1328 possible = NR_CPUS; 1326 possible = NR_CPUS;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index c9288c883e20..6bc211accf08 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -13,16 +13,17 @@
13#include <linux/utsname.h> 13#include <linux/utsname.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/uaccess.h>
16 17
17#include <asm/uaccess.h>
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/syscalls.h> 19#include <asm/syscalls.h>
20 20
21asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, 21asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
22 unsigned long fd, unsigned long off) 22 unsigned long prot, unsigned long flags,
23 unsigned long fd, unsigned long off)
23{ 24{
24 long error; 25 long error;
25 struct file * file; 26 struct file *file;
26 27
27 error = -EINVAL; 28 error = -EINVAL;
28 if (off & ~PAGE_MASK) 29 if (off & ~PAGE_MASK)
@@ -57,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
57 unmapped base down for this case. This can give 58 unmapped base down for this case. This can give
58 conflicts with the heap, but we assume that glibc 59 conflicts with the heap, but we assume that glibc
59 malloc knows how to fall back to mmap. Give it 1GB 60 malloc knows how to fall back to mmap. Give it 1GB
60 of playground for now. -AK */ 61 of playground for now. -AK */
61 *begin = 0x40000000; 62 *begin = 0x40000000;
62 *end = 0x80000000; 63 *end = 0x80000000;
63 if (current->flags & PF_RANDOMIZE) { 64 if (current->flags & PF_RANDOMIZE) {
64 new_begin = randomize_range(*begin, *begin + 0x02000000, 0); 65 new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
65 if (new_begin) 66 if (new_begin)
@@ -67,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
67 } 68 }
68 } else { 69 } else {
69 *begin = TASK_UNMAPPED_BASE; 70 *begin = TASK_UNMAPPED_BASE;
70 *end = TASK_SIZE; 71 *end = TASK_SIZE;
71 } 72 }
72} 73}
73 74
74unsigned long 75unsigned long
75arch_get_unmapped_area(struct file *filp, unsigned long addr, 76arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -79,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
79 struct vm_area_struct *vma; 80 struct vm_area_struct *vma;
80 unsigned long start_addr; 81 unsigned long start_addr;
81 unsigned long begin, end; 82 unsigned long begin, end;
82 83
83 if (flags & MAP_FIXED) 84 if (flags & MAP_FIXED)
84 return addr; 85 return addr;
85 86
86 find_start_end(flags, &begin, &end); 87 find_start_end(flags, &begin, &end);
87 88
88 if (len > end) 89 if (len > end)
89 return -ENOMEM; 90 return -ENOMEM;
@@ -97,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
97 } 98 }
98 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) 99 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
99 && len <= mm->cached_hole_size) { 100 && len <= mm->cached_hole_size) {
100 mm->cached_hole_size = 0; 101 mm->cached_hole_size = 0;
101 mm->free_area_cache = begin; 102 mm->free_area_cache = begin;
102 } 103 }
103 addr = mm->free_area_cache; 104 addr = mm->free_area_cache;
104 if (addr < begin) 105 if (addr < begin)
105 addr = begin; 106 addr = begin;
106 start_addr = addr; 107 start_addr = addr;
107 108
108full_search: 109full_search:
@@ -128,7 +129,7 @@ full_search:
128 return addr; 129 return addr;
129 } 130 }
130 if (addr + mm->cached_hole_size < vma->vm_start) 131 if (addr + mm->cached_hole_size < vma->vm_start)
131 mm->cached_hole_size = vma->vm_start - addr; 132 mm->cached_hole_size = vma->vm_start - addr;
132 133
133 addr = vma->vm_end; 134 addr = vma->vm_end;
134 } 135 }
@@ -178,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
178 vma = find_vma(mm, addr-len); 179 vma = find_vma(mm, addr-len);
179 if (!vma || addr <= vma->vm_start) 180 if (!vma || addr <= vma->vm_start)
180 /* remember the address as a hint for next time */ 181 /* remember the address as a hint for next time */
181 return (mm->free_area_cache = addr-len); 182 return mm->free_area_cache = addr-len;
182 } 183 }
183 184
184 if (mm->mmap_base < len) 185 if (mm->mmap_base < len)
@@ -195,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
195 vma = find_vma(mm, addr); 196 vma = find_vma(mm, addr);
196 if (!vma || addr+len <= vma->vm_start) 197 if (!vma || addr+len <= vma->vm_start)
197 /* remember the address as a hint for next time */ 198 /* remember the address as a hint for next time */
198 return (mm->free_area_cache = addr); 199 return mm->free_area_cache = addr;
199 200
200 /* remember the largest hole we saw so far */ 201 /* remember the largest hole we saw so far */
201 if (addr + mm->cached_hole_size < vma->vm_start) 202 if (addr + mm->cached_hole_size < vma->vm_start)
@@ -225,13 +226,13 @@ bottomup:
225} 226}
226 227
227 228
228asmlinkage long sys_uname(struct new_utsname __user * name) 229asmlinkage long sys_uname(struct new_utsname __user *name)
229{ 230{
230 int err; 231 int err;
231 down_read(&uts_sem); 232 down_read(&uts_sem);
232 err = copy_to_user(name, utsname(), sizeof (*name)); 233 err = copy_to_user(name, utsname(), sizeof(*name));
233 up_read(&uts_sem); 234 up_read(&uts_sem);
234 if (personality(current->personality) == PER_LINUX32) 235 if (personality(current->personality) == PER_LINUX32)
235 err |= copy_to_user(&name->machine, "i686", 5); 236 err |= copy_to_user(&name->machine, "i686", 5);
236 return err ? -EFAULT : 0; 237 return err ? -EFAULT : 0;
237} 238}
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index b42068fb7b76..2887a789e38f 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -32,6 +32,8 @@
32#include <linux/bug.h> 32#include <linux/bug.h>
33#include <linux/nmi.h> 33#include <linux/nmi.h>
34#include <linux/mm.h> 34#include <linux/mm.h>
35#include <linux/smp.h>
36#include <linux/io.h>
35 37
36#if defined(CONFIG_EDAC) 38#if defined(CONFIG_EDAC)
37#include <linux/edac.h> 39#include <linux/edac.h>
@@ -45,9 +47,6 @@
45#include <asm/unwind.h> 47#include <asm/unwind.h>
46#include <asm/desc.h> 48#include <asm/desc.h>
47#include <asm/i387.h> 49#include <asm/i387.h>
48#include <asm/nmi.h>
49#include <asm/smp.h>
50#include <asm/io.h>
51#include <asm/pgalloc.h> 50#include <asm/pgalloc.h>
52#include <asm/proto.h> 51#include <asm/proto.h>
53#include <asm/pda.h> 52#include <asm/pda.h>
@@ -85,7 +84,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
85 84
86void printk_address(unsigned long address, int reliable) 85void printk_address(unsigned long address, int reliable)
87{ 86{
88 printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address); 87 printk(" [<%016lx>] %s%pS\n",
88 address, reliable ? "" : "? ", (void *) address);
89} 89}
90 90
91static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 91static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -98,7 +98,8 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
98 [STACKFAULT_STACK - 1] = "#SS", 98 [STACKFAULT_STACK - 1] = "#SS",
99 [MCE_STACK - 1] = "#MC", 99 [MCE_STACK - 1] = "#MC",
100#if DEBUG_STKSZ > EXCEPTION_STKSZ 100#if DEBUG_STKSZ > EXCEPTION_STKSZ
101 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 101 [N_EXCEPTION_STACKS ...
102 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
102#endif 103#endif
103 }; 104 };
104 unsigned k; 105 unsigned k;
@@ -163,7 +164,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
163} 164}
164 165
165/* 166/*
166 * x86-64 can have up to three kernel stacks: 167 * x86-64 can have up to three kernel stacks:
167 * process stack 168 * process stack
168 * interrupt stack 169 * interrupt stack
169 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 170 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
@@ -219,7 +220,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
219 const struct stacktrace_ops *ops, void *data) 220 const struct stacktrace_ops *ops, void *data)
220{ 221{
221 const unsigned cpu = get_cpu(); 222 const unsigned cpu = get_cpu();
222 unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; 223 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
223 unsigned used = 0; 224 unsigned used = 0;
224 struct thread_info *tinfo; 225 struct thread_info *tinfo;
225 226
@@ -237,7 +238,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
237 if (!bp) { 238 if (!bp) {
238 if (task == current) { 239 if (task == current) {
239 /* Grab bp right from our regs */ 240 /* Grab bp right from our regs */
240 asm("movq %%rbp, %0" : "=r" (bp) :); 241 asm("movq %%rbp, %0" : "=r" (bp) : );
241 } else { 242 } else {
242 /* bp is the last reg pushed by switch_to */ 243 /* bp is the last reg pushed by switch_to */
243 bp = *(unsigned long *) task->thread.sp; 244 bp = *(unsigned long *) task->thread.sp;
@@ -356,11 +357,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
356 unsigned long *stack; 357 unsigned long *stack;
357 int i; 358 int i;
358 const int cpu = smp_processor_id(); 359 const int cpu = smp_processor_id();
359 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); 360 unsigned long *irqstack_end =
360 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); 361 (unsigned long *) (cpu_pda(cpu)->irqstackptr);
362 unsigned long *irqstack =
363 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
361 364
362 // debugging aid: "show_stack(NULL, NULL);" prints the 365 /*
363 // back trace for this cpu. 366 * debugging aid: "show_stack(NULL, NULL);" prints the
367 * back trace for this cpu.
368 */
364 369
365 if (sp == NULL) { 370 if (sp == NULL) {
366 if (task) 371 if (task)
@@ -404,7 +409,7 @@ void dump_stack(void)
404 409
405#ifdef CONFIG_FRAME_POINTER 410#ifdef CONFIG_FRAME_POINTER
406 if (!bp) 411 if (!bp)
407 asm("movq %%rbp, %0" : "=r" (bp):); 412 asm("movq %%rbp, %0" : "=r" (bp) : );
408#endif 413#endif
409 414
410 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 415 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
@@ -414,7 +419,6 @@ void dump_stack(void)
414 init_utsname()->version); 419 init_utsname()->version);
415 show_trace(NULL, NULL, &stack, bp); 420 show_trace(NULL, NULL, &stack, bp);
416} 421}
417
418EXPORT_SYMBOL(dump_stack); 422EXPORT_SYMBOL(dump_stack);
419 423
420void show_registers(struct pt_regs *regs) 424void show_registers(struct pt_regs *regs)
@@ -492,7 +496,7 @@ unsigned __kprobes long oops_begin(void)
492 raw_local_irq_save(flags); 496 raw_local_irq_save(flags);
493 cpu = smp_processor_id(); 497 cpu = smp_processor_id();
494 if (!__raw_spin_trylock(&die_lock)) { 498 if (!__raw_spin_trylock(&die_lock)) {
495 if (cpu == die_owner) 499 if (cpu == die_owner)
496 /* nested oops. should stop eventually */; 500 /* nested oops. should stop eventually */;
497 else 501 else
498 __raw_spin_lock(&die_lock); 502 __raw_spin_lock(&die_lock);
@@ -637,7 +641,7 @@ kernel_trap:
637} 641}
638 642
639#define DO_ERROR(trapnr, signr, str, name) \ 643#define DO_ERROR(trapnr, signr, str, name) \
640asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 644asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
641{ \ 645{ \
642 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 646 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
643 == NOTIFY_STOP) \ 647 == NOTIFY_STOP) \
@@ -647,7 +651,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
647} 651}
648 652
649#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 653#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
650asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 654asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
651{ \ 655{ \
652 siginfo_t info; \ 656 siginfo_t info; \
653 info.si_signo = signr; \ 657 info.si_signo = signr; \
@@ -682,7 +686,7 @@ asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
682 preempt_conditional_cli(regs); 686 preempt_conditional_cli(regs);
683} 687}
684 688
685asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) 689asmlinkage void do_double_fault(struct pt_regs *regs, long error_code)
686{ 690{
687 static const char str[] = "double fault"; 691 static const char str[] = "double fault";
688 struct task_struct *tsk = current; 692 struct task_struct *tsk = current;
@@ -777,9 +781,10 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
777} 781}
778 782
779static notrace __kprobes void 783static notrace __kprobes void
780unknown_nmi_error(unsigned char reason, struct pt_regs * regs) 784unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
781{ 785{
782 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 786 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
787 NOTIFY_STOP)
783 return; 788 return;
784 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", 789 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
785 reason); 790 reason);
@@ -881,7 +886,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
881 else if (user_mode(eregs)) 886 else if (user_mode(eregs))
882 regs = task_pt_regs(current); 887 regs = task_pt_regs(current);
883 /* Exception from kernel and interrupts are enabled. Move to 888 /* Exception from kernel and interrupts are enabled. Move to
884 kernel process stack. */ 889 kernel process stack. */
885 else if (eregs->flags & X86_EFLAGS_IF) 890 else if (eregs->flags & X86_EFLAGS_IF)
886 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); 891 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
887 if (eregs != regs) 892 if (eregs != regs)
@@ -890,7 +895,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
890} 895}
891 896
892/* runs on IST stack. */ 897/* runs on IST stack. */
893asmlinkage void __kprobes do_debug(struct pt_regs * regs, 898asmlinkage void __kprobes do_debug(struct pt_regs *regs,
894 unsigned long error_code) 899 unsigned long error_code)
895{ 900{
896 struct task_struct *tsk = current; 901 struct task_struct *tsk = current;
@@ -1034,7 +1039,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
1034 1039
1035asmlinkage void bad_intr(void) 1040asmlinkage void bad_intr(void)
1036{ 1041{
1037 printk("bad interrupt"); 1042 printk("bad interrupt");
1038} 1043}
1039 1044
1040asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) 1045asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
@@ -1046,7 +1051,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1046 1051
1047 conditional_sti(regs); 1052 conditional_sti(regs);
1048 if (!user_mode(regs) && 1053 if (!user_mode(regs) &&
1049 kernel_math_error(regs, "kernel simd math error", 19)) 1054 kernel_math_error(regs, "kernel simd math error", 19))
1050 return; 1055 return;
1051 1056
1052 /* 1057 /*
@@ -1091,7 +1096,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1091 force_sig_info(SIGFPE, &info, task); 1096 force_sig_info(SIGFPE, &info, task);
1092} 1097}
1093 1098
1094asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) 1099asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs)
1095{ 1100{
1096} 1101}
1097 1102
@@ -1148,8 +1153,10 @@ void __init trap_init(void)
1148 set_intr_gate(0, &divide_error); 1153 set_intr_gate(0, &divide_error);
1149 set_intr_gate_ist(1, &debug, DEBUG_STACK); 1154 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1150 set_intr_gate_ist(2, &nmi, NMI_STACK); 1155 set_intr_gate_ist(2, &nmi, NMI_STACK);
1151 set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */ 1156 /* int3 can be called from all */
1152 set_system_gate(4, &overflow); /* int4 can be called from all */ 1157 set_system_gate_ist(3, &int3, DEBUG_STACK);
1158 /* int4 can be called from all */
1159 set_system_gate(4, &overflow);
1153 set_intr_gate(5, &bounds); 1160 set_intr_gate(5, &bounds);
1154 set_intr_gate(6, &invalid_op); 1161 set_intr_gate(6, &invalid_op);
1155 set_intr_gate(7, &device_not_available); 1162 set_intr_gate(7, &device_not_available);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 346cae5ac423..161bb850fc47 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
104/* 104/*
105 * Read TSC and the reference counters. Take care of SMI disturbance 105 * Read TSC and the reference counters. Take care of SMI disturbance
106 */ 106 */
107static u64 tsc_read_refs(u64 *pm, u64 *hpet) 107static u64 tsc_read_refs(u64 *p, int hpet)
108{ 108{
109 u64 t1, t2; 109 u64 t1, t2;
110 int i; 110 int i;
@@ -112,9 +112,9 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
112 for (i = 0; i < MAX_RETRIES; i++) { 112 for (i = 0; i < MAX_RETRIES; i++) {
113 t1 = get_cycles(); 113 t1 = get_cycles();
114 if (hpet) 114 if (hpet)
115 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; 115 *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
116 else 116 else
117 *pm = acpi_pm_read_early(); 117 *p = acpi_pm_read_early();
118 t2 = get_cycles(); 118 t2 = get_cycles();
119 if ((t2 - t1) < SMI_TRESHOLD) 119 if ((t2 - t1) < SMI_TRESHOLD)
120 return t2; 120 return t2;
@@ -123,13 +123,59 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
123} 123}
124 124
125/* 125/*
126 * Calculate the TSC frequency from HPET reference
127 */
128static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
129{
130 u64 tmp;
131
132 if (hpet2 < hpet1)
133 hpet2 += 0x100000000ULL;
134 hpet2 -= hpet1;
135 tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
136 do_div(tmp, 1000000);
137 do_div(deltatsc, tmp);
138
139 return (unsigned long) deltatsc;
140}
141
142/*
143 * Calculate the TSC frequency from PMTimer reference
144 */
145static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
146{
147 u64 tmp;
148
149 if (!pm1 && !pm2)
150 return ULONG_MAX;
151
152 if (pm2 < pm1)
153 pm2 += (u64)ACPI_PM_OVRRUN;
154 pm2 -= pm1;
155 tmp = pm2 * 1000000000LL;
156 do_div(tmp, PMTMR_TICKS_PER_SEC);
157 do_div(deltatsc, tmp);
158
159 return (unsigned long) deltatsc;
160}
161
162#define CAL_MS 10
163#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
164#define CAL_PIT_LOOPS 1000
165
166#define CAL2_MS 50
167#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
168#define CAL2_PIT_LOOPS 5000
169
170
171/*
126 * Try to calibrate the TSC against the Programmable 172 * Try to calibrate the TSC against the Programmable
127 * Interrupt Timer and return the frequency of the TSC 173 * Interrupt Timer and return the frequency of the TSC
128 * in kHz. 174 * in kHz.
129 * 175 *
130 * Return ULONG_MAX on failure to calibrate. 176 * Return ULONG_MAX on failure to calibrate.
131 */ 177 */
132static unsigned long pit_calibrate_tsc(void) 178static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
133{ 179{
134 u64 tsc, t1, t2, delta; 180 u64 tsc, t1, t2, delta;
135 unsigned long tscmin, tscmax; 181 unsigned long tscmin, tscmax;
@@ -144,8 +190,8 @@ static unsigned long pit_calibrate_tsc(void)
144 * (LSB then MSB) to begin countdown. 190 * (LSB then MSB) to begin countdown.
145 */ 191 */
146 outb(0xb0, 0x43); 192 outb(0xb0, 0x43);
147 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); 193 outb(latch & 0xff, 0x42);
148 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); 194 outb(latch >> 8, 0x42);
149 195
150 tsc = t1 = t2 = get_cycles(); 196 tsc = t1 = t2 = get_cycles();
151 197
@@ -166,31 +212,154 @@ static unsigned long pit_calibrate_tsc(void)
166 /* 212 /*
167 * Sanity checks: 213 * Sanity checks:
168 * 214 *
169 * If we were not able to read the PIT more than 5000 215 * If we were not able to read the PIT more than loopmin
170 * times, then we have been hit by a massive SMI 216 * times, then we have been hit by a massive SMI
171 * 217 *
172 * If the maximum is 10 times larger than the minimum, 218 * If the maximum is 10 times larger than the minimum,
173 * then we got hit by an SMI as well. 219 * then we got hit by an SMI as well.
174 */ 220 */
175 if (pitcnt < 5000 || tscmax > 10 * tscmin) 221 if (pitcnt < loopmin || tscmax > 10 * tscmin)
176 return ULONG_MAX; 222 return ULONG_MAX;
177 223
178 /* Calculate the PIT value */ 224 /* Calculate the PIT value */
179 delta = t2 - t1; 225 delta = t2 - t1;
180 do_div(delta, 50); 226 do_div(delta, ms);
181 return delta; 227 return delta;
182} 228}
183 229
230/*
231 * This reads the current MSB of the PIT counter, and
232 * checks if we are running on sufficiently fast and
233 * non-virtualized hardware.
234 *
235 * Our expectations are:
236 *
237 * - the PIT is running at roughly 1.19MHz
238 *
239 * - each IO is going to take about 1us on real hardware,
240 * but we allow it to be much faster (by a factor of 10) or
241 * _slightly_ slower (ie we allow up to a 2us read+counter
242 * update - anything else implies a unacceptably slow CPU
243 * or PIT for the fast calibration to work.
244 *
245 * - with 256 PIT ticks to read the value, we have 214us to
246 * see the same MSB (and overhead like doing a single TSC
247 * read per MSB value etc).
248 *
249 * - We're doing 2 reads per loop (LSB, MSB), and we expect
250 * them each to take about a microsecond on real hardware.
251 * So we expect a count value of around 100. But we'll be
252 * generous, and accept anything over 50.
253 *
254 * - if the PIT is stuck, and we see *many* more reads, we
255 * return early (and the next caller of pit_expect_msb()
256 * then consider it a failure when they don't see the
257 * next expected value).
258 *
259 * These expectations mean that we know that we have seen the
260 * transition from one expected value to another with a fairly
261 * high accuracy, and we didn't miss any events. We can thus
262 * use the TSC value at the transitions to calculate a pretty
263 * good value for the TSC frequencty.
264 */
265static inline int pit_expect_msb(unsigned char val)
266{
267 int count = 0;
268
269 for (count = 0; count < 50000; count++) {
270 /* Ignore LSB */
271 inb(0x42);
272 if (inb(0x42) != val)
273 break;
274 }
275 return count > 50;
276}
277
278/*
279 * How many MSB values do we want to see? We aim for a
280 * 15ms calibration, which assuming a 2us counter read
281 * error should give us roughly 150 ppm precision for
282 * the calibration.
283 */
284#define QUICK_PIT_MS 15
285#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
286
287static unsigned long quick_pit_calibrate(void)
288{
289 /* Set the Gate high, disable speaker */
290 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
291
292 /*
293 * Counter 2, mode 0 (one-shot), binary count
294 *
295 * NOTE! Mode 2 decrements by two (and then the
296 * output is flipped each time, giving the same
297 * final output frequency as a decrement-by-one),
298 * so mode 0 is much better when looking at the
299 * individual counts.
300 */
301 outb(0xb0, 0x43);
302
303 /* Start at 0xffff */
304 outb(0xff, 0x42);
305 outb(0xff, 0x42);
306
307 if (pit_expect_msb(0xff)) {
308 int i;
309 u64 t1, t2, delta;
310 unsigned char expect = 0xfe;
311
312 t1 = get_cycles();
313 for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
314 if (!pit_expect_msb(expect))
315 goto failed;
316 }
317 t2 = get_cycles();
318
319 /*
320 * Make sure we can rely on the second TSC timestamp:
321 */
322 if (!pit_expect_msb(expect))
323 goto failed;
324
325 /*
326 * Ok, if we get here, then we've seen the
327 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
328 * times, and each MSB had many hits, so we never
329 * had any sudden jumps.
330 *
331 * As a result, we can depend on there not being
332 * any odd delays anywhere, and the TSC reads are
333 * reliable.
334 *
335 * kHz = ticks / time-in-seconds / 1000;
336 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
337 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
338 */
339 delta = (t2 - t1)*PIT_TICK_RATE;
340 do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
341 printk("Fast TSC calibration using PIT\n");
342 return delta;
343 }
344failed:
345 return 0;
346}
184 347
185/** 348/**
186 * native_calibrate_tsc - calibrate the tsc on boot 349 * native_calibrate_tsc - calibrate the tsc on boot
187 */ 350 */
188unsigned long native_calibrate_tsc(void) 351unsigned long native_calibrate_tsc(void)
189{ 352{
190 u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2; 353 u64 tsc1, tsc2, delta, ref1, ref2;
191 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 354 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
192 unsigned long flags; 355 unsigned long flags, latch, ms, fast_calibrate;
193 int hpet = is_hpet_enabled(), i; 356 int hpet = is_hpet_enabled(), i, loopmin;
357
358 local_irq_save(flags);
359 fast_calibrate = quick_pit_calibrate();
360 local_irq_restore(flags);
361 if (fast_calibrate)
362 return fast_calibrate;
194 363
195 /* 364 /*
196 * Run 5 calibration loops to get the lowest frequency value 365 * Run 5 calibration loops to get the lowest frequency value
@@ -216,7 +385,13 @@ unsigned long native_calibrate_tsc(void)
216 * calibration delay loop as we have to wait for a certain 385 * calibration delay loop as we have to wait for a certain
217 * amount of time anyway. 386 * amount of time anyway.
218 */ 387 */
219 for (i = 0; i < 5; i++) { 388
389 /* Preset PIT loop values */
390 latch = CAL_LATCH;
391 ms = CAL_MS;
392 loopmin = CAL_PIT_LOOPS;
393
394 for (i = 0; i < 3; i++) {
220 unsigned long tsc_pit_khz; 395 unsigned long tsc_pit_khz;
221 396
222 /* 397 /*
@@ -226,16 +401,16 @@ unsigned long native_calibrate_tsc(void)
226 * read the end value. 401 * read the end value.
227 */ 402 */
228 local_irq_save(flags); 403 local_irq_save(flags);
229 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); 404 tsc1 = tsc_read_refs(&ref1, hpet);
230 tsc_pit_khz = pit_calibrate_tsc(); 405 tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
231 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); 406 tsc2 = tsc_read_refs(&ref2, hpet);
232 local_irq_restore(flags); 407 local_irq_restore(flags);
233 408
234 /* Pick the lowest PIT TSC calibration so far */ 409 /* Pick the lowest PIT TSC calibration so far */
235 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); 410 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
236 411
237 /* hpet or pmtimer available ? */ 412 /* hpet or pmtimer available ? */
238 if (!hpet && !pm1 && !pm2) 413 if (!hpet && !ref1 && !ref2)
239 continue; 414 continue;
240 415
241 /* Check, whether the sampling was disturbed by an SMI */ 416 /* Check, whether the sampling was disturbed by an SMI */
@@ -243,23 +418,41 @@ unsigned long native_calibrate_tsc(void)
243 continue; 418 continue;
244 419
245 tsc2 = (tsc2 - tsc1) * 1000000LL; 420 tsc2 = (tsc2 - tsc1) * 1000000LL;
421 if (hpet)
422 tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
423 else
424 tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
246 425
247 if (hpet) { 426 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
248 if (hpet2 < hpet1) 427
249 hpet2 += 0x100000000ULL; 428 /* Check the reference deviation */
250 hpet2 -= hpet1; 429 delta = ((u64) tsc_pit_min) * 100;
251 tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); 430 do_div(delta, tsc_ref_min);
252 do_div(tsc1, 1000000); 431
253 } else { 432 /*
254 if (pm2 < pm1) 433 * If both calibration results are inside a 10% window
255 pm2 += (u64)ACPI_PM_OVRRUN; 434 * then we can be sure, that the calibration
256 pm2 -= pm1; 435 * succeeded. We break out of the loop right away. We
257 tsc1 = pm2 * 1000000000LL; 436 * use the reference value, as it is more precise.
258 do_div(tsc1, PMTMR_TICKS_PER_SEC); 437 */
438 if (delta >= 90 && delta <= 110) {
439 printk(KERN_INFO
440 "TSC: PIT calibration matches %s. %d loops\n",
441 hpet ? "HPET" : "PMTIMER", i + 1);
442 return tsc_ref_min;
259 } 443 }
260 444
261 do_div(tsc2, tsc1); 445 /*
262 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); 446 * Check whether PIT failed more than once. This
447 * happens in virtualized environments. We need to
448 * give the virtual PC a slightly longer timeframe for
449 * the HPET/PMTIMER to make the result precise.
450 */
451 if (i == 1 && tsc_pit_min == ULONG_MAX) {
452 latch = CAL2_LATCH;
453 ms = CAL2_MS;
454 loopmin = CAL2_PIT_LOOPS;
455 }
263 } 456 }
264 457
265 /* 458 /*
@@ -267,11 +460,10 @@ unsigned long native_calibrate_tsc(void)
267 */ 460 */
268 if (tsc_pit_min == ULONG_MAX) { 461 if (tsc_pit_min == ULONG_MAX) {
269 /* PIT gave no useful value */ 462 /* PIT gave no useful value */
270 printk(KERN_WARNING "TSC: PIT calibration failed due to " 463 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
271 "SMI disturbance.\n");
272 464
273 /* We don't have an alternative source, disable TSC */ 465 /* We don't have an alternative source, disable TSC */
274 if (!hpet && !pm1 && !pm2) { 466 if (!hpet && !ref1 && !ref2) {
275 printk("TSC: No reference (HPET/PMTIMER) available\n"); 467 printk("TSC: No reference (HPET/PMTIMER) available\n");
276 return 0; 468 return 0;
277 } 469 }
@@ -279,7 +471,7 @@ unsigned long native_calibrate_tsc(void)
279 /* The alternative source failed as well, disable TSC */ 471 /* The alternative source failed as well, disable TSC */
280 if (tsc_ref_min == ULONG_MAX) { 472 if (tsc_ref_min == ULONG_MAX) {
281 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " 473 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
282 "failed due to SMI disturbance.\n"); 474 "failed.\n");
283 return 0; 475 return 0;
284 } 476 }
285 477
@@ -291,44 +483,25 @@ unsigned long native_calibrate_tsc(void)
291 } 483 }
292 484
293 /* We don't have an alternative source, use the PIT calibration value */ 485 /* We don't have an alternative source, use the PIT calibration value */
294 if (!hpet && !pm1 && !pm2) { 486 if (!hpet && !ref1 && !ref2) {
295 printk(KERN_INFO "TSC: Using PIT calibration value\n"); 487 printk(KERN_INFO "TSC: Using PIT calibration value\n");
296 return tsc_pit_min; 488 return tsc_pit_min;
297 } 489 }
298 490
299 /* The alternative source failed, use the PIT calibration value */ 491 /* The alternative source failed, use the PIT calibration value */
300 if (tsc_ref_min == ULONG_MAX) { 492 if (tsc_ref_min == ULONG_MAX) {
301 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due " 493 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
302 "to SMI disturbance. Using PIT calibration\n"); 494 "Using PIT calibration\n");
303 return tsc_pit_min; 495 return tsc_pit_min;
304 } 496 }
305 497
306 /* Check the reference deviation */
307 delta = ((u64) tsc_pit_min) * 100;
308 do_div(delta, tsc_ref_min);
309
310 /*
311 * If both calibration results are inside a 5% window, the we
312 * use the lower frequency of those as it is probably the
313 * closest estimate.
314 */
315 if (delta >= 95 && delta <= 105) {
316 printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n",
317 hpet ? "HPET" : "PMTIMER");
318 printk(KERN_INFO "TSC: using %s calibration value\n",
319 tsc_pit_min <= tsc_ref_min ? "PIT" :
320 hpet ? "HPET" : "PMTIMER");
321 return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min;
322 }
323
324 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
325 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
326
327 /* 498 /*
328 * The calibration values differ too much. In doubt, we use 499 * The calibration values differ too much. In doubt, we use
329 * the PIT value as we know that there are PMTIMERs around 500 * the PIT value as we know that there are PMTIMERs around
330 * running at double speed. 501 * running at double speed. At least we let the user know:
331 */ 502 */
503 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
504 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
332 printk(KERN_INFO "TSC: Using PIT calibration value\n"); 505 printk(KERN_INFO "TSC: Using PIT calibration value\n");
333 return tsc_pit_min; 506 return tsc_pit_min;
334} 507}
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 594ef47f0a63..61a97e616f70 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -25,45 +25,31 @@
25#include <asm/visws/cobalt.h> 25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h> 26#include <asm/visws/piix4.h>
27#include <asm/arch_hooks.h> 27#include <asm/arch_hooks.h>
28#include <asm/io_apic.h>
28#include <asm/fixmap.h> 29#include <asm/fixmap.h>
29#include <asm/reboot.h> 30#include <asm/reboot.h>
30#include <asm/setup.h> 31#include <asm/setup.h>
31#include <asm/e820.h> 32#include <asm/e820.h>
32#include <asm/smp.h>
33#include <asm/io.h> 33#include <asm/io.h>
34 34
35#include <mach_ipi.h> 35#include <mach_ipi.h>
36 36
37#include "mach_apic.h" 37#include "mach_apic.h"
38 38
39#include <linux/init.h>
40#include <linux/smp.h>
41
42#include <linux/kernel_stat.h> 39#include <linux/kernel_stat.h>
43#include <linux/interrupt.h>
44#include <linux/init.h>
45 40
46#include <asm/io.h>
47#include <asm/apic.h>
48#include <asm/i8259.h> 41#include <asm/i8259.h>
49#include <asm/irq_vectors.h> 42#include <asm/irq_vectors.h>
50#include <asm/visws/cobalt.h>
51#include <asm/visws/lithium.h> 43#include <asm/visws/lithium.h>
52#include <asm/visws/piix4.h>
53 44
54#include <linux/sched.h> 45#include <linux/sched.h>
55#include <linux/kernel.h> 46#include <linux/kernel.h>
56#include <linux/init.h>
57#include <linux/pci.h> 47#include <linux/pci.h>
58#include <linux/pci_ids.h> 48#include <linux/pci_ids.h>
59 49
60extern int no_broadcast; 50extern int no_broadcast;
61 51
62#include <asm/io.h>
63#include <asm/apic.h> 52#include <asm/apic.h>
64#include <asm/arch_hooks.h>
65#include <asm/visws/cobalt.h>
66#include <asm/visws/lithium.h>
67 53
68char visws_board_type = -1; 54char visws_board_type = -1;
69char visws_board_rev = -1; 55char visws_board_rev = -1;
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 61531d5c9507..8b6c393ab9fd 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -235,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
235 const void *desc) 235 const void *desc)
236{ 236{
237 u32 *ldt_entry = (u32 *)desc; 237 u32 *ldt_entry = (u32 *)desc;
238 vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); 238 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
239} 239}
240 240
241static void vmi_load_sp0(struct tss_struct *tss, 241static void vmi_load_sp0(struct tss_struct *tss,
@@ -393,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
393} 393}
394#endif 394#endif
395 395
396static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) 396static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
397{ 397{
398 vmi_set_page_type(pfn, VMI_PAGE_L1); 398 vmi_set_page_type(pfn, VMI_PAGE_L1);
399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
400} 400}
401 401
402static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) 402static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
403{ 403{
404 /* 404 /*
405 * This call comes in very early, before mem_map is setup. 405 * This call comes in very early, before mem_map is setup.
@@ -410,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
411} 411}
412 412
413static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) 413static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
414{ 414{
415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); 415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
416 vmi_check_page_type(clonepfn, VMI_PAGE_L2); 416 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
418} 418}
419 419
420static void vmi_release_pte(u32 pfn) 420static void vmi_release_pte(unsigned long pfn)
421{ 421{
422 vmi_ops.release_page(pfn, VMI_PAGE_L1); 422 vmi_ops.release_page(pfn, VMI_PAGE_L1);
423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
424} 424}
425 425
426static void vmi_release_pmd(u32 pfn) 426static void vmi_release_pmd(unsigned long pfn)
427{ 427{
428 vmi_ops.release_page(pfn, VMI_PAGE_L2); 428 vmi_ops.release_page(pfn, VMI_PAGE_L2);
429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 0c029e8959c7..7766d36983fc 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -61,7 +61,7 @@ static void vsmp_irq_enable(void)
61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); 61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
62} 62}
63 63
64static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, 64static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
65 unsigned long addr, unsigned len) 65 unsigned long addr, unsigned len)
66{ 66{
67 switch (type) { 67 switch (type) {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bfe2bd305eb..3da2508eb22a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -711,6 +711,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
711 u64 *spte; 711 u64 *spte;
712 int young = 0; 712 int young = 0;
713 713
714 /* always return old for EPT */
715 if (!shadow_accessed_mask)
716 return 0;
717
714 spte = rmap_next(kvm, rmapp, NULL); 718 spte = rmap_next(kvm, rmapp, NULL);
715 while (spte) { 719 while (spte) {
716 int _young; 720 int _young;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e2ee264740c7..8233b86c778c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -62,6 +62,7 @@ static int npt = 1;
62module_param(npt, int, S_IRUGO); 62module_param(npt, int, S_IRUGO);
63 63
64static void kvm_reput_irq(struct vcpu_svm *svm); 64static void kvm_reput_irq(struct vcpu_svm *svm);
65static void svm_flush_tlb(struct kvm_vcpu *vcpu);
65 66
66static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 67static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
67{ 68{
@@ -878,6 +879,10 @@ set:
878static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 879static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
879{ 880{
880 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; 881 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
882 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
883
884 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
885 force_new_asid(vcpu);
881 886
882 vcpu->arch.cr4 = cr4; 887 vcpu->arch.cr4 = cr4;
883 if (!npt_enabled) 888 if (!npt_enabled)
@@ -1027,6 +1032,13 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1027 KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, 1032 KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
1028 (u32)fault_address, (u32)(fault_address >> 32), 1033 (u32)fault_address, (u32)(fault_address >> 32),
1029 handler); 1034 handler);
1035 /*
1036 * FIXME: Tis shouldn't be necessary here, but there is a flush
1037 * missing in the MMU code. Until we find this bug, flush the
1038 * complete TLB here on an NPF
1039 */
1040 if (npt_enabled)
1041 svm_flush_tlb(&svm->vcpu);
1030 1042
1031 if (event_injection) 1043 if (event_injection)
1032 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1044 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2a69773e3b26..7041cc52b562 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3301,8 +3301,7 @@ static int __init vmx_init(void)
3301 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | 3301 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3302 VMX_EPT_WRITABLE_MASK | 3302 VMX_EPT_WRITABLE_MASK |
3303 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); 3303 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3304 kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK, 3304 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3305 VMX_EPT_FAKE_DIRTY_MASK, 0ull,
3306 VMX_EPT_EXECUTABLE_MASK); 3305 VMX_EPT_EXECUTABLE_MASK);
3307 kvm_enable_tdp(); 3306 kvm_enable_tdp();
3308 } else 3307 } else
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 425a13436b3f..17e25995b65b 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,21 +331,6 @@ enum vmcs_field {
331 331
332#define AR_RESERVD_MASK 0xfffe0f00 332#define AR_RESERVD_MASK 0xfffe0f00
333 333
334#define MSR_IA32_VMX_BASIC 0x480
335#define MSR_IA32_VMX_PINBASED_CTLS 0x481
336#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
337#define MSR_IA32_VMX_EXIT_CTLS 0x483
338#define MSR_IA32_VMX_ENTRY_CTLS 0x484
339#define MSR_IA32_VMX_MISC 0x485
340#define MSR_IA32_VMX_CR0_FIXED0 0x486
341#define MSR_IA32_VMX_CR0_FIXED1 0x487
342#define MSR_IA32_VMX_CR4_FIXED0 0x488
343#define MSR_IA32_VMX_CR4_FIXED1 0x489
344#define MSR_IA32_VMX_VMCS_ENUM 0x48a
345#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
346#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c
347
348#define MSR_IA32_FEATURE_CONTROL 0x3a
349#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 334#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
350#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 335#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
351 336
@@ -370,8 +355,6 @@ enum vmcs_field {
370#define VMX_EPT_READABLE_MASK 0x1ull 355#define VMX_EPT_READABLE_MASK 0x1ull
371#define VMX_EPT_WRITABLE_MASK 0x2ull 356#define VMX_EPT_WRITABLE_MASK 0x2ull
372#define VMX_EPT_EXECUTABLE_MASK 0x4ull 357#define VMX_EPT_EXECUTABLE_MASK 0x4ull
373#define VMX_EPT_FAKE_ACCESSED_MASK (1ull << 62)
374#define VMX_EPT_FAKE_DIRTY_MASK (1ull << 63)
375 358
376#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 359#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
377 360
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
index 01b868ba82f8..321cf720dbb6 100644
--- a/arch/x86/lib/msr-on-cpu.c
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -16,37 +16,46 @@ static void __rdmsr_on_cpu(void *info)
16 rdmsr(rv->msr_no, rv->l, rv->h); 16 rdmsr(rv->msr_no, rv->l, rv->h);
17} 17}
18 18
19static void __rdmsr_safe_on_cpu(void *info) 19static void __wrmsr_on_cpu(void *info)
20{ 20{
21 struct msr_info *rv = info; 21 struct msr_info *rv = info;
22 22
23 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h); 23 wrmsr(rv->msr_no, rv->l, rv->h);
24} 24}
25 25
26static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe) 26int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
27{ 27{
28 int err = 0; 28 int err;
29 struct msr_info rv; 29 struct msr_info rv;
30 30
31 rv.msr_no = msr_no; 31 rv.msr_no = msr_no;
32 if (safe) { 32 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
33 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu,
34 &rv, 1);
35 err = err ? err : rv.err;
36 } else {
37 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
38 }
39 *l = rv.l; 33 *l = rv.l;
40 *h = rv.h; 34 *h = rv.h;
41 35
42 return err; 36 return err;
43} 37}
44 38
45static void __wrmsr_on_cpu(void *info) 39int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
40{
41 int err;
42 struct msr_info rv;
43
44 rv.msr_no = msr_no;
45 rv.l = l;
46 rv.h = h;
47 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
48
49 return err;
50}
51
52/* These "safe" variants are slower and should be used when the target MSR
53 may not actually exist. */
54static void __rdmsr_safe_on_cpu(void *info)
46{ 55{
47 struct msr_info *rv = info; 56 struct msr_info *rv = info;
48 57
49 wrmsr(rv->msr_no, rv->l, rv->h); 58 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
50} 59}
51 60
52static void __wrmsr_safe_on_cpu(void *info) 61static void __wrmsr_safe_on_cpu(void *info)
@@ -56,45 +65,30 @@ static void __wrmsr_safe_on_cpu(void *info)
56 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h); 65 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
57} 66}
58 67
59static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe) 68int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
60{ 69{
61 int err = 0; 70 int err;
62 struct msr_info rv; 71 struct msr_info rv;
63 72
64 rv.msr_no = msr_no; 73 rv.msr_no = msr_no;
65 rv.l = l; 74 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
66 rv.h = h; 75 *l = rv.l;
67 if (safe) { 76 *h = rv.h;
68 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu,
69 &rv, 1);
70 err = err ? err : rv.err;
71 } else {
72 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
73 }
74
75 return err;
76}
77 77
78int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) 78 return err ? err : rv.err;
79{
80 return _wrmsr_on_cpu(cpu, msr_no, l, h, 0);
81} 79}
82 80
83int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
84{
85 return _rdmsr_on_cpu(cpu, msr_no, l, h, 0);
86}
87
88/* These "safe" variants are slower and should be used when the target MSR
89 may not actually exist. */
90int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) 81int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
91{ 82{
92 return _wrmsr_on_cpu(cpu, msr_no, l, h, 1); 83 int err;
93} 84 struct msr_info rv;
94 85
95int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) 86 rv.msr_no = msr_no;
96{ 87 rv.l = l;
97 return _rdmsr_on_cpu(cpu, msr_no, l, h, 1); 88 rv.h = h;
89 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
90
91 return err ? err : rv.err;
98} 92}
99 93
100EXPORT_SYMBOL(rdmsr_on_cpu); 94EXPORT_SYMBOL(rdmsr_on_cpu);
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 94972e7c094d..82004d2bf05e 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -22,7 +22,7 @@ char *strcpy(char *dest, const char *src)
22 "testb %%al,%%al\n\t" 22 "testb %%al,%%al\n\t"
23 "jne 1b" 23 "jne 1b"
24 : "=&S" (d0), "=&D" (d1), "=&a" (d2) 24 : "=&S" (d0), "=&D" (d1), "=&a" (d2)
25 :"0" (src), "1" (dest) : "memory"); 25 : "0" (src), "1" (dest) : "memory");
26 return dest; 26 return dest;
27} 27}
28EXPORT_SYMBOL(strcpy); 28EXPORT_SYMBOL(strcpy);
@@ -42,7 +42,7 @@ char *strncpy(char *dest, const char *src, size_t count)
42 "stosb\n" 42 "stosb\n"
43 "2:" 43 "2:"
44 : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) 44 : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
45 :"0" (src), "1" (dest), "2" (count) : "memory"); 45 : "0" (src), "1" (dest), "2" (count) : "memory");
46 return dest; 46 return dest;
47} 47}
48EXPORT_SYMBOL(strncpy); 48EXPORT_SYMBOL(strncpy);
@@ -60,7 +60,7 @@ char *strcat(char *dest, const char *src)
60 "testb %%al,%%al\n\t" 60 "testb %%al,%%al\n\t"
61 "jne 1b" 61 "jne 1b"
62 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) 62 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
63 : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory"); 63 : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory");
64 return dest; 64 return dest;
65} 65}
66EXPORT_SYMBOL(strcat); 66EXPORT_SYMBOL(strcat);
@@ -105,9 +105,9 @@ int strcmp(const char *cs, const char *ct)
105 "2:\tsbbl %%eax,%%eax\n\t" 105 "2:\tsbbl %%eax,%%eax\n\t"
106 "orb $1,%%al\n" 106 "orb $1,%%al\n"
107 "3:" 107 "3:"
108 :"=a" (res), "=&S" (d0), "=&D" (d1) 108 : "=a" (res), "=&S" (d0), "=&D" (d1)
109 :"1" (cs), "2" (ct) 109 : "1" (cs), "2" (ct)
110 :"memory"); 110 : "memory");
111 return res; 111 return res;
112} 112}
113EXPORT_SYMBOL(strcmp); 113EXPORT_SYMBOL(strcmp);
@@ -130,9 +130,9 @@ int strncmp(const char *cs, const char *ct, size_t count)
130 "3:\tsbbl %%eax,%%eax\n\t" 130 "3:\tsbbl %%eax,%%eax\n\t"
131 "orb $1,%%al\n" 131 "orb $1,%%al\n"
132 "4:" 132 "4:"
133 :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) 133 : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
134 :"1" (cs), "2" (ct), "3" (count) 134 : "1" (cs), "2" (ct), "3" (count)
135 :"memory"); 135 : "memory");
136 return res; 136 return res;
137} 137}
138EXPORT_SYMBOL(strncmp); 138EXPORT_SYMBOL(strncmp);
@@ -152,9 +152,9 @@ char *strchr(const char *s, int c)
152 "movl $1,%1\n" 152 "movl $1,%1\n"
153 "2:\tmovl %1,%0\n\t" 153 "2:\tmovl %1,%0\n\t"
154 "decl %0" 154 "decl %0"
155 :"=a" (res), "=&S" (d0) 155 : "=a" (res), "=&S" (d0)
156 :"1" (s), "0" (c) 156 : "1" (s), "0" (c)
157 :"memory"); 157 : "memory");
158 return res; 158 return res;
159} 159}
160EXPORT_SYMBOL(strchr); 160EXPORT_SYMBOL(strchr);
@@ -169,9 +169,9 @@ size_t strlen(const char *s)
169 "scasb\n\t" 169 "scasb\n\t"
170 "notl %0\n\t" 170 "notl %0\n\t"
171 "decl %0" 171 "decl %0"
172 :"=c" (res), "=&D" (d0) 172 : "=c" (res), "=&D" (d0)
173 :"1" (s), "a" (0), "0" (0xffffffffu) 173 : "1" (s), "a" (0), "0" (0xffffffffu)
174 :"memory"); 174 : "memory");
175 return res; 175 return res;
176} 176}
177EXPORT_SYMBOL(strlen); 177EXPORT_SYMBOL(strlen);
@@ -189,9 +189,9 @@ void *memchr(const void *cs, int c, size_t count)
189 "je 1f\n\t" 189 "je 1f\n\t"
190 "movl $1,%0\n" 190 "movl $1,%0\n"
191 "1:\tdecl %0" 191 "1:\tdecl %0"
192 :"=D" (res), "=&c" (d0) 192 : "=D" (res), "=&c" (d0)
193 :"a" (c), "0" (cs), "1" (count) 193 : "a" (c), "0" (cs), "1" (count)
194 :"memory"); 194 : "memory");
195 return res; 195 return res;
196} 196}
197EXPORT_SYMBOL(memchr); 197EXPORT_SYMBOL(memchr);
@@ -228,9 +228,9 @@ size_t strnlen(const char *s, size_t count)
228 "cmpl $-1,%1\n\t" 228 "cmpl $-1,%1\n\t"
229 "jne 1b\n" 229 "jne 1b\n"
230 "3:\tsubl %2,%0" 230 "3:\tsubl %2,%0"
231 :"=a" (res), "=&d" (d0) 231 : "=a" (res), "=&d" (d0)
232 :"c" (s), "1" (count) 232 : "c" (s), "1" (count)
233 :"memory"); 233 : "memory");
234 return res; 234 return res;
235} 235}
236EXPORT_SYMBOL(strnlen); 236EXPORT_SYMBOL(strnlen);
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
index 42e8a50303f3..8e2d55f754bf 100644
--- a/arch/x86/lib/strstr_32.c
+++ b/arch/x86/lib/strstr_32.c
@@ -23,9 +23,9 @@ __asm__ __volatile__(
23 "jne 1b\n\t" 23 "jne 1b\n\t"
24 "xorl %%eax,%%eax\n\t" 24 "xorl %%eax,%%eax\n\t"
25 "2:" 25 "2:"
26 :"=a" (__res), "=&c" (d0), "=&S" (d1) 26 : "=a" (__res), "=&c" (d0), "=&S" (d1)
27 :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) 27 : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
28 :"dx", "di"); 28 : "dx", "di");
29return __res; 29return __res;
30} 30}
31 31
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index ee0fba092157..199a5f4a873c 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -448,6 +448,8 @@ static void __init start_secondary(void *unused)
448 448
449 VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); 449 VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
450 450
451 notify_cpu_starting(cpuid);
452
451 /* enable interrupts */ 453 /* enable interrupts */
452 local_irq_enable(); 454 local_irq_enable();
453 455
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 62fa440678d8..847c164725f4 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -328,7 +328,7 @@ void __init initmem_init(unsigned long start_pfn,
328 328
329 get_memcfg_numa(); 329 get_memcfg_numa();
330 330
331 kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); 331 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
332 332
333 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); 333 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
334 do { 334 do {
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a20d1fa64b4e..e7277cbcfb40 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
148 * we have now. "break" is either changing perms, levels or 148 * we have now. "break" is either changing perms, levels or
149 * address space marker. 149 * address space marker.
150 */ 150 */
151 prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK); 151 prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
152 cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK); 152 cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
153 153
154 if (!st->level) { 154 if (!st->level) {
155 /* First entry */ 155 /* First entry */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 4974e97dedfe..c3789bb19308 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -195,11 +195,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
195 pgd_t *pgd; 195 pgd_t *pgd;
196 pmd_t *pmd; 196 pmd_t *pmd;
197 pte_t *pte; 197 pte_t *pte;
198 unsigned pages_2m = 0, pages_4k = 0; 198 unsigned pages_2m, pages_4k;
199 int mapping_iter;
200
201 /*
202 * First iteration will setup identity mapping using large/small pages
203 * based on use_pse, with other attributes same as set by
204 * the early code in head_32.S
205 *
206 * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
207 * as desired for the kernel identity mapping.
208 *
209 * This two pass mechanism conforms to the TLB app note which says:
210 *
211 * "Software should not write to a paging-structure entry in a way
212 * that would change, for any linear address, both the page size
213 * and either the page frame or attributes."
214 */
215 mapping_iter = 1;
199 216
200 if (!cpu_has_pse) 217 if (!cpu_has_pse)
201 use_pse = 0; 218 use_pse = 0;
202 219
220repeat:
221 pages_2m = pages_4k = 0;
203 pfn = start_pfn; 222 pfn = start_pfn;
204 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); 223 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
205 pgd = pgd_base + pgd_idx; 224 pgd = pgd_base + pgd_idx;
@@ -225,6 +244,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
225 if (use_pse) { 244 if (use_pse) {
226 unsigned int addr2; 245 unsigned int addr2;
227 pgprot_t prot = PAGE_KERNEL_LARGE; 246 pgprot_t prot = PAGE_KERNEL_LARGE;
247 /*
248 * first pass will use the same initial
249 * identity mapping attribute + _PAGE_PSE.
250 */
251 pgprot_t init_prot =
252 __pgprot(PTE_IDENT_ATTR |
253 _PAGE_PSE);
228 254
229 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + 255 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
230 PAGE_OFFSET + PAGE_SIZE-1; 256 PAGE_OFFSET + PAGE_SIZE-1;
@@ -234,7 +260,10 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
234 prot = PAGE_KERNEL_LARGE_EXEC; 260 prot = PAGE_KERNEL_LARGE_EXEC;
235 261
236 pages_2m++; 262 pages_2m++;
237 set_pmd(pmd, pfn_pmd(pfn, prot)); 263 if (mapping_iter == 1)
264 set_pmd(pmd, pfn_pmd(pfn, init_prot));
265 else
266 set_pmd(pmd, pfn_pmd(pfn, prot));
238 267
239 pfn += PTRS_PER_PTE; 268 pfn += PTRS_PER_PTE;
240 continue; 269 continue;
@@ -246,17 +275,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
246 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; 275 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
247 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 276 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
248 pgprot_t prot = PAGE_KERNEL; 277 pgprot_t prot = PAGE_KERNEL;
278 /*
279 * first pass will use the same initial
280 * identity mapping attribute.
281 */
282 pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
249 283
250 if (is_kernel_text(addr)) 284 if (is_kernel_text(addr))
251 prot = PAGE_KERNEL_EXEC; 285 prot = PAGE_KERNEL_EXEC;
252 286
253 pages_4k++; 287 pages_4k++;
254 set_pte(pte, pfn_pte(pfn, prot)); 288 if (mapping_iter == 1)
289 set_pte(pte, pfn_pte(pfn, init_prot));
290 else
291 set_pte(pte, pfn_pte(pfn, prot));
255 } 292 }
256 } 293 }
257 } 294 }
258 update_page_count(PG_LEVEL_2M, pages_2m); 295 if (mapping_iter == 1) {
259 update_page_count(PG_LEVEL_4K, pages_4k); 296 /*
297 * update direct mapping page count only in the first
298 * iteration.
299 */
300 update_page_count(PG_LEVEL_2M, pages_2m);
301 update_page_count(PG_LEVEL_4K, pages_4k);
302
303 /*
304 * local global flush tlb, which will flush the previous
305 * mappings present in both small and large page TLB's.
306 */
307 __flush_tlb_all();
308
309 /*
310 * Second iteration will set the actual desired PTE attributes.
311 */
312 mapping_iter = 2;
313 goto repeat;
314 }
260} 315}
261 316
262/* 317/*
@@ -459,11 +514,7 @@ static void __init pagetable_init(void)
459{ 514{
460 pgd_t *pgd_base = swapper_pg_dir; 515 pgd_t *pgd_base = swapper_pg_dir;
461 516
462 paravirt_pagetable_setup_start(pgd_base);
463
464 permanent_kmaps_init(pgd_base); 517 permanent_kmaps_init(pgd_base);
465
466 paravirt_pagetable_setup_done(pgd_base);
467} 518}
468 519
469#ifdef CONFIG_ACPI_SLEEP 520#ifdef CONFIG_ACPI_SLEEP
@@ -723,7 +774,7 @@ void __init setup_bootmem_allocator(void)
723 after_init_bootmem = 1; 774 after_init_bootmem = 1;
724} 775}
725 776
726static void __init find_early_table_space(unsigned long end) 777static void __init find_early_table_space(unsigned long end, int use_pse)
727{ 778{
728 unsigned long puds, pmds, ptes, tables, start; 779 unsigned long puds, pmds, ptes, tables, start;
729 780
@@ -733,7 +784,7 @@ static void __init find_early_table_space(unsigned long end)
733 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 784 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
734 tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); 785 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
735 786
736 if (cpu_has_pse) { 787 if (use_pse) {
737 unsigned long extra; 788 unsigned long extra;
738 789
739 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 790 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
@@ -773,12 +824,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
773 pgd_t *pgd_base = swapper_pg_dir; 824 pgd_t *pgd_base = swapper_pg_dir;
774 unsigned long start_pfn, end_pfn; 825 unsigned long start_pfn, end_pfn;
775 unsigned long big_page_start; 826 unsigned long big_page_start;
827#ifdef CONFIG_DEBUG_PAGEALLOC
828 /*
829 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
830 * This will simplify cpa(), which otherwise needs to support splitting
831 * large pages into small in interrupt context, etc.
832 */
833 int use_pse = 0;
834#else
835 int use_pse = cpu_has_pse;
836#endif
776 837
777 /* 838 /*
778 * Find space for the kernel direct mapping tables. 839 * Find space for the kernel direct mapping tables.
779 */ 840 */
780 if (!after_init_bootmem) 841 if (!after_init_bootmem)
781 find_early_table_space(end); 842 find_early_table_space(end, use_pse);
782 843
783#ifdef CONFIG_X86_PAE 844#ifdef CONFIG_X86_PAE
784 set_nx(); 845 set_nx();
@@ -824,7 +885,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
824 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); 885 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
825 if (start_pfn < end_pfn) 886 if (start_pfn < end_pfn)
826 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 887 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
827 cpu_has_pse); 888 use_pse);
828 889
829 /* tail is not big page alignment ? */ 890 /* tail is not big page alignment ? */
830 start_pfn = end_pfn; 891 start_pfn = end_pfn;
@@ -987,7 +1048,6 @@ void __init mem_init(void)
987 if (boot_cpu_data.wp_works_ok < 0) 1048 if (boot_cpu_data.wp_works_ok < 0)
988 test_wp_bit(); 1049 test_wp_bit();
989 1050
990 cpa_init();
991 save_pg_dir(); 1051 save_pg_dir();
992 zap_low_mappings(); 1052 zap_low_mappings();
993} 1053}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d3746efb060d..83e13f2d53d2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -88,6 +88,62 @@ early_param("gbpages", parse_direct_gbpages_on);
88 88
89int after_bootmem; 89int after_bootmem;
90 90
91unsigned long __supported_pte_mask __read_mostly = ~0UL;
92EXPORT_SYMBOL_GPL(__supported_pte_mask);
93
94static int do_not_nx __cpuinitdata;
95
96/*
97 * noexec=on|off
98 * Control non-executable mappings for 64-bit processes.
99 *
100 * on Enable (default)
101 * off Disable
102 */
103static int __init nonx_setup(char *str)
104{
105 if (!str)
106 return -EINVAL;
107 if (!strncmp(str, "on", 2)) {
108 __supported_pte_mask |= _PAGE_NX;
109 do_not_nx = 0;
110 } else if (!strncmp(str, "off", 3)) {
111 do_not_nx = 1;
112 __supported_pte_mask &= ~_PAGE_NX;
113 }
114 return 0;
115}
116early_param("noexec", nonx_setup);
117
118void __cpuinit check_efer(void)
119{
120 unsigned long efer;
121
122 rdmsrl(MSR_EFER, efer);
123 if (!(efer & EFER_NX) || do_not_nx)
124 __supported_pte_mask &= ~_PAGE_NX;
125}
126
127int force_personality32;
128
129/*
130 * noexec32=on|off
131 * Control non executable heap for 32bit processes.
132 * To control the stack too use noexec=off
133 *
134 * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
135 * off PROT_READ implies PROT_EXEC
136 */
137static int __init nonx32_setup(char *str)
138{
139 if (!strcmp(str, "on"))
140 force_personality32 &= ~READ_IMPLIES_EXEC;
141 else if (!strcmp(str, "off"))
142 force_personality32 |= READ_IMPLIES_EXEC;
143 return 1;
144}
145__setup("noexec32=", nonx32_setup);
146
91/* 147/*
92 * NOTE: This function is marked __ref because it calls __init function 148 * NOTE: This function is marked __ref because it calls __init function
93 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 149 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -225,7 +281,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
225void __init cleanup_highmap(void) 281void __init cleanup_highmap(void)
226{ 282{
227 unsigned long vaddr = __START_KERNEL_map; 283 unsigned long vaddr = __START_KERNEL_map;
228 unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; 284 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
229 pmd_t *pmd = level2_kernel_pgt; 285 pmd_t *pmd = level2_kernel_pgt;
230 pmd_t *last_pmd = pmd + PTRS_PER_PMD; 286 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
231 287
@@ -271,7 +327,8 @@ static __ref void unmap_low_page(void *adr)
271} 327}
272 328
273static unsigned long __meminit 329static unsigned long __meminit
274phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) 330phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
331 pgprot_t prot)
275{ 332{
276 unsigned pages = 0; 333 unsigned pages = 0;
277 unsigned long last_map_addr = end; 334 unsigned long last_map_addr = end;
@@ -289,36 +346,43 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
289 break; 346 break;
290 } 347 }
291 348
349 /*
350 * We will re-use the existing mapping.
351 * Xen for example has some special requirements, like mapping
352 * pagetable pages as RO. So assume someone who pre-setup
353 * these mappings are more intelligent.
354 */
292 if (pte_val(*pte)) 355 if (pte_val(*pte))
293 continue; 356 continue;
294 357
295 if (0) 358 if (0)
296 printk(" pte=%p addr=%lx pte=%016lx\n", 359 printk(" pte=%p addr=%lx pte=%016lx\n",
297 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); 360 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
298 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
299 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
300 pages++; 361 pages++;
362 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
363 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
301 } 364 }
365
302 update_page_count(PG_LEVEL_4K, pages); 366 update_page_count(PG_LEVEL_4K, pages);
303 367
304 return last_map_addr; 368 return last_map_addr;
305} 369}
306 370
307static unsigned long __meminit 371static unsigned long __meminit
308phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) 372phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
373 pgprot_t prot)
309{ 374{
310 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); 375 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
311 376
312 return phys_pte_init(pte, address, end); 377 return phys_pte_init(pte, address, end, prot);
313} 378}
314 379
315static unsigned long __meminit 380static unsigned long __meminit
316phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 381phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
317 unsigned long page_size_mask) 382 unsigned long page_size_mask, pgprot_t prot)
318{ 383{
319 unsigned long pages = 0; 384 unsigned long pages = 0;
320 unsigned long last_map_addr = end; 385 unsigned long last_map_addr = end;
321 unsigned long start = address;
322 386
323 int i = pmd_index(address); 387 int i = pmd_index(address);
324 388
@@ -326,6 +390,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
326 unsigned long pte_phys; 390 unsigned long pte_phys;
327 pmd_t *pmd = pmd_page + pmd_index(address); 391 pmd_t *pmd = pmd_page + pmd_index(address);
328 pte_t *pte; 392 pte_t *pte;
393 pgprot_t new_prot = prot;
329 394
330 if (address >= end) { 395 if (address >= end) {
331 if (!after_bootmem) { 396 if (!after_bootmem) {
@@ -339,27 +404,40 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
339 if (!pmd_large(*pmd)) { 404 if (!pmd_large(*pmd)) {
340 spin_lock(&init_mm.page_table_lock); 405 spin_lock(&init_mm.page_table_lock);
341 last_map_addr = phys_pte_update(pmd, address, 406 last_map_addr = phys_pte_update(pmd, address,
342 end); 407 end, prot);
343 spin_unlock(&init_mm.page_table_lock); 408 spin_unlock(&init_mm.page_table_lock);
409 continue;
344 } 410 }
345 /* Count entries we're using from level2_ident_pgt */ 411 /*
346 if (start == 0) 412 * If we are ok with PG_LEVEL_2M mapping, then we will
347 pages++; 413 * use the existing mapping,
348 continue; 414 *
415 * Otherwise, we will split the large page mapping but
416 * use the same existing protection bits except for
417 * large page, so that we don't violate Intel's TLB
418 * Application note (317080) which says, while changing
419 * the page sizes, new and old translations should
420 * not differ with respect to page frame and
421 * attributes.
422 */
423 if (page_size_mask & (1 << PG_LEVEL_2M))
424 continue;
425 new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
349 } 426 }
350 427
351 if (page_size_mask & (1<<PG_LEVEL_2M)) { 428 if (page_size_mask & (1<<PG_LEVEL_2M)) {
352 pages++; 429 pages++;
353 spin_lock(&init_mm.page_table_lock); 430 spin_lock(&init_mm.page_table_lock);
354 set_pte((pte_t *)pmd, 431 set_pte((pte_t *)pmd,
355 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 432 pfn_pte(address >> PAGE_SHIFT,
433 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
356 spin_unlock(&init_mm.page_table_lock); 434 spin_unlock(&init_mm.page_table_lock);
357 last_map_addr = (address & PMD_MASK) + PMD_SIZE; 435 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
358 continue; 436 continue;
359 } 437 }
360 438
361 pte = alloc_low_page(&pte_phys); 439 pte = alloc_low_page(&pte_phys);
362 last_map_addr = phys_pte_init(pte, address, end); 440 last_map_addr = phys_pte_init(pte, address, end, new_prot);
363 unmap_low_page(pte); 441 unmap_low_page(pte);
364 442
365 spin_lock(&init_mm.page_table_lock); 443 spin_lock(&init_mm.page_table_lock);
@@ -372,12 +450,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
372 450
373static unsigned long __meminit 451static unsigned long __meminit
374phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, 452phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
375 unsigned long page_size_mask) 453 unsigned long page_size_mask, pgprot_t prot)
376{ 454{
377 pmd_t *pmd = pmd_offset(pud, 0); 455 pmd_t *pmd = pmd_offset(pud, 0);
378 unsigned long last_map_addr; 456 unsigned long last_map_addr;
379 457
380 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); 458 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
381 __flush_tlb_all(); 459 __flush_tlb_all();
382 return last_map_addr; 460 return last_map_addr;
383} 461}
@@ -394,6 +472,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
394 unsigned long pmd_phys; 472 unsigned long pmd_phys;
395 pud_t *pud = pud_page + pud_index(addr); 473 pud_t *pud = pud_page + pud_index(addr);
396 pmd_t *pmd; 474 pmd_t *pmd;
475 pgprot_t prot = PAGE_KERNEL;
397 476
398 if (addr >= end) 477 if (addr >= end)
399 break; 478 break;
@@ -405,10 +484,26 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
405 } 484 }
406 485
407 if (pud_val(*pud)) { 486 if (pud_val(*pud)) {
408 if (!pud_large(*pud)) 487 if (!pud_large(*pud)) {
409 last_map_addr = phys_pmd_update(pud, addr, end, 488 last_map_addr = phys_pmd_update(pud, addr, end,
410 page_size_mask); 489 page_size_mask, prot);
411 continue; 490 continue;
491 }
492 /*
493 * If we are ok with PG_LEVEL_1G mapping, then we will
494 * use the existing mapping.
495 *
496 * Otherwise, we will split the gbpage mapping but use
497 * the same existing protection bits except for large
498 * page, so that we don't violate Intel's TLB
499 * Application note (317080) which says, while changing
500 * the page sizes, new and old translations should
501 * not differ with respect to page frame and
502 * attributes.
503 */
504 if (page_size_mask & (1 << PG_LEVEL_1G))
505 continue;
506 prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
412 } 507 }
413 508
414 if (page_size_mask & (1<<PG_LEVEL_1G)) { 509 if (page_size_mask & (1<<PG_LEVEL_1G)) {
@@ -422,7 +517,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
422 } 517 }
423 518
424 pmd = alloc_low_page(&pmd_phys); 519 pmd = alloc_low_page(&pmd_phys);
425 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); 520 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
521 prot);
426 unmap_low_page(pmd); 522 unmap_low_page(pmd);
427 523
428 spin_lock(&init_mm.page_table_lock); 524 spin_lock(&init_mm.page_table_lock);
@@ -430,6 +526,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
430 spin_unlock(&init_mm.page_table_lock); 526 spin_unlock(&init_mm.page_table_lock);
431 } 527 }
432 __flush_tlb_all(); 528 __flush_tlb_all();
529
433 update_page_count(PG_LEVEL_1G, pages); 530 update_page_count(PG_LEVEL_1G, pages);
434 531
435 return last_map_addr; 532 return last_map_addr;
@@ -446,27 +543,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
446 return phys_pud_init(pud, addr, end, page_size_mask); 543 return phys_pud_init(pud, addr, end, page_size_mask);
447} 544}
448 545
449static void __init find_early_table_space(unsigned long end) 546static void __init find_early_table_space(unsigned long end, int use_pse,
547 int use_gbpages)
450{ 548{
451 unsigned long puds, pmds, ptes, tables, start; 549 unsigned long puds, pmds, ptes, tables, start;
452 550
453 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 551 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
454 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 552 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
455 if (direct_gbpages) { 553 if (use_gbpages) {
456 unsigned long extra; 554 unsigned long extra;
457 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); 555 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
458 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; 556 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
459 } else 557 } else
460 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 558 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
461 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 559 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
462 560
463 if (cpu_has_pse) { 561 if (use_pse) {
464 unsigned long extra; 562 unsigned long extra;
465 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 563 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
466 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; 564 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
467 } else 565 } else
468 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; 566 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
469 tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); 567 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
470 568
471 /* 569 /*
472 * RED-PEN putting page tables only on node 0 could 570 * RED-PEN putting page tables only on node 0 could
@@ -528,6 +626,7 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
528 pgd_populate(&init_mm, pgd, __va(pud_phys)); 626 pgd_populate(&init_mm, pgd, __va(pud_phys));
529 spin_unlock(&init_mm.page_table_lock); 627 spin_unlock(&init_mm.page_table_lock);
530 } 628 }
629 __flush_tlb_all();
531 630
532 return last_map_addr; 631 return last_map_addr;
533} 632}
@@ -571,6 +670,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
571 670
572 struct map_range mr[NR_RANGE_MR]; 671 struct map_range mr[NR_RANGE_MR];
573 int nr_range, i; 672 int nr_range, i;
673 int use_pse, use_gbpages;
574 674
575 printk(KERN_INFO "init_memory_mapping\n"); 675 printk(KERN_INFO "init_memory_mapping\n");
576 676
@@ -584,9 +684,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
584 if (!after_bootmem) 684 if (!after_bootmem)
585 init_gbpages(); 685 init_gbpages();
586 686
587 if (direct_gbpages) 687#ifdef CONFIG_DEBUG_PAGEALLOC
688 /*
689 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
690 * This will simplify cpa(), which otherwise needs to support splitting
691 * large pages into small in interrupt context, etc.
692 */
693 use_pse = use_gbpages = 0;
694#else
695 use_pse = cpu_has_pse;
696 use_gbpages = direct_gbpages;
697#endif
698
699 if (use_gbpages)
588 page_size_mask |= 1 << PG_LEVEL_1G; 700 page_size_mask |= 1 << PG_LEVEL_1G;
589 if (cpu_has_pse) 701 if (use_pse)
590 page_size_mask |= 1 << PG_LEVEL_2M; 702 page_size_mask |= 1 << PG_LEVEL_2M;
591 703
592 memset(mr, 0, sizeof(mr)); 704 memset(mr, 0, sizeof(mr));
@@ -647,7 +759,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
647 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); 759 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
648 760
649 if (!after_bootmem) 761 if (!after_bootmem)
650 find_early_table_space(end); 762 find_early_table_space(end, use_pse, use_gbpages);
651 763
652 for (i = 0; i < nr_range; i++) 764 for (i = 0; i < nr_range; i++)
653 last_map_addr = kernel_physical_mapping_init( 765 last_map_addr = kernel_physical_mapping_init(
@@ -806,8 +918,6 @@ void __init mem_init(void)
806 reservedpages << (PAGE_SHIFT-10), 918 reservedpages << (PAGE_SHIFT-10),
807 datasize >> 10, 919 datasize >> 10,
808 initsize >> 10); 920 initsize >> 10);
809
810 cpa_init();
811} 921}
812 922
813void free_init_pages(char *what, unsigned long begin, unsigned long end) 923void free_init_pages(char *what, unsigned long begin, unsigned long end)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index cac6da54203b..6ab3196d12b4 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -83,6 +83,25 @@ int page_is_ram(unsigned long pagenr)
83 return 0; 83 return 0;
84} 84}
85 85
86int pagerange_is_ram(unsigned long start, unsigned long end)
87{
88 int ram_page = 0, not_rampage = 0;
89 unsigned long page_nr;
90
91 for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
92 ++page_nr) {
93 if (page_is_ram(page_nr))
94 ram_page = 1;
95 else
96 not_rampage = 1;
97
98 if (ram_page == not_rampage)
99 return -1;
100 }
101
102 return ram_page;
103}
104
86/* 105/*
87 * Fix up the linear direct mapping of the kernel to avoid cache attribute 106 * Fix up the linear direct mapping of the kernel to avoid cache attribute
88 * conflicts. 107 * conflicts.
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a4dd793d6003..cebcbf152d46 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -79,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
79 return 0; 79 return 0;
80 80
81 addr = 0x8000; 81 addr = 0x8000;
82 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 82 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
83 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, 83 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
84 nodemap_size, L1_CACHE_BYTES); 84 nodemap_size, L1_CACHE_BYTES);
85 if (nodemap_addr == -1UL) { 85 if (nodemap_addr == -1UL) {
@@ -176,10 +176,10 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
176 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; 176 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
177 unsigned long bootmap_start, nodedata_phys; 177 unsigned long bootmap_start, nodedata_phys;
178 void *bootmap; 178 void *bootmap;
179 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 179 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
180 int nid; 180 int nid;
181 181
182 start = round_up(start, ZONE_ALIGN); 182 start = roundup(start, ZONE_ALIGN);
183 183
184 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 184 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
185 start, end); 185 start, end);
@@ -210,9 +210,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
210 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); 210 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
211 nid = phys_to_nid(nodedata_phys); 211 nid = phys_to_nid(nodedata_phys);
212 if (nid == nodeid) 212 if (nid == nodeid)
213 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 213 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
214 else 214 else
215 bootmap_start = round_up(start, PAGE_SIZE); 215 bootmap_start = roundup(start, PAGE_SIZE);
216 /* 216 /*
217 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like 217 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
218 * to use that to align to PAGE_SIZE 218 * to use that to align to PAGE_SIZE
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index d4aa503caaa2..e1d106909218 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -32,7 +32,7 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1) 35#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
36 36
37static int pte_testbit(pte_t pte) 37static int pte_testbit(pte_t pte)
38{ 38{
@@ -118,6 +118,7 @@ static int pageattr_test(void)
118 unsigned int level; 118 unsigned int level;
119 int i, k; 119 int i, k;
120 int err; 120 int err;
121 unsigned long test_addr;
121 122
122 if (print) 123 if (print)
123 printk(KERN_INFO "CPA self-test:\n"); 124 printk(KERN_INFO "CPA self-test:\n");
@@ -172,7 +173,8 @@ static int pageattr_test(void)
172 continue; 173 continue;
173 } 174 }
174 175
175 err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT); 176 test_addr = addr[i];
177 err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
176 if (err < 0) { 178 if (err < 0) {
177 printk(KERN_ERR "CPA %d failed %d\n", i, err); 179 printk(KERN_ERR "CPA %d failed %d\n", i, err);
178 failed++; 180 failed++;
@@ -204,7 +206,8 @@ static int pageattr_test(void)
204 failed++; 206 failed++;
205 continue; 207 continue;
206 } 208 }
207 err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT); 209 test_addr = addr[i];
210 err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
208 if (err < 0) { 211 if (err < 0) {
209 printk(KERN_ERR "CPA reverting failed: %d\n", err); 212 printk(KERN_ERR "CPA reverting failed: %d\n", err);
210 failed++; 213 failed++;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 43e2f8483e4f..a9ec89c3fbca 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -25,15 +25,27 @@
25 * The current flushing context - we pass it instead of 5 arguments: 25 * The current flushing context - we pass it instead of 5 arguments:
26 */ 26 */
27struct cpa_data { 27struct cpa_data {
28 unsigned long vaddr; 28 unsigned long *vaddr;
29 pgprot_t mask_set; 29 pgprot_t mask_set;
30 pgprot_t mask_clr; 30 pgprot_t mask_clr;
31 int numpages; 31 int numpages;
32 int flushtlb; 32 int flags;
33 unsigned long pfn; 33 unsigned long pfn;
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35 int curpage;
35}; 36};
36 37
38/*
39 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
40 * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
41 * entries change the page attribute in parallel to some other cpu
42 * splitting a large page entry along with changing the attribute.
43 */
44static DEFINE_SPINLOCK(cpa_lock);
45
46#define CPA_FLUSHTLB 1
47#define CPA_ARRAY 2
48
37#ifdef CONFIG_PROC_FS 49#ifdef CONFIG_PROC_FS
38static unsigned long direct_pages_count[PG_LEVEL_NUM]; 50static unsigned long direct_pages_count[PG_LEVEL_NUM];
39 51
@@ -84,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void)
84 96
85static inline unsigned long highmap_end_pfn(void) 97static inline unsigned long highmap_end_pfn(void)
86{ 98{
87 return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; 99 return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
88} 100}
89 101
90#endif 102#endif
@@ -190,6 +202,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
190 } 202 }
191} 203}
192 204
205static void cpa_flush_array(unsigned long *start, int numpages, int cache)
206{
207 unsigned int i, level;
208 unsigned long *addr;
209
210 BUG_ON(irqs_disabled());
211
212 on_each_cpu(__cpa_flush_range, NULL, 1);
213
214 if (!cache)
215 return;
216
217 /* 4M threshold */
218 if (numpages >= 1024) {
219 if (boot_cpu_data.x86_model >= 4)
220 wbinvd();
221 return;
222 }
223 /*
224 * We only need to flush on one CPU,
225 * clflush is a MESI-coherent instruction that
226 * will cause all other CPUs to flush the same
227 * cachelines:
228 */
229 for (i = 0, addr = start; i < numpages; i++, addr++) {
230 pte_t *pte = lookup_address(*addr, &level);
231
232 /*
233 * Only flush present addresses:
234 */
235 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
236 clflush_cache_range((void *) *addr, PAGE_SIZE);
237 }
238}
239
193/* 240/*
194 * Certain areas of memory on x86 require very specific protection flags, 241 * Certain areas of memory on x86 require very specific protection flags,
195 * for example the BIOS area or kernel text. Callers don't always get this 242 * for example the BIOS area or kernel text. Callers don't always get this
@@ -398,7 +445,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
398 */ 445 */
399 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); 446 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
400 __set_pmd_pte(kpte, address, new_pte); 447 __set_pmd_pte(kpte, address, new_pte);
401 cpa->flushtlb = 1; 448 cpa->flags |= CPA_FLUSHTLB;
402 do_split = 0; 449 do_split = 0;
403 } 450 }
404 451
@@ -408,84 +455,6 @@ out_unlock:
408 return do_split; 455 return do_split;
409} 456}
410 457
411static LIST_HEAD(page_pool);
412static unsigned long pool_size, pool_pages, pool_low;
413static unsigned long pool_used, pool_failed;
414
415static void cpa_fill_pool(struct page **ret)
416{
417 gfp_t gfp = GFP_KERNEL;
418 unsigned long flags;
419 struct page *p;
420
421 /*
422 * Avoid recursion (on debug-pagealloc) and also signal
423 * our priority to get to these pagetables:
424 */
425 if (current->flags & PF_MEMALLOC)
426 return;
427 current->flags |= PF_MEMALLOC;
428
429 /*
430 * Allocate atomically from atomic contexts:
431 */
432 if (in_atomic() || irqs_disabled() || debug_pagealloc)
433 gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
434
435 while (pool_pages < pool_size || (ret && !*ret)) {
436 p = alloc_pages(gfp, 0);
437 if (!p) {
438 pool_failed++;
439 break;
440 }
441 /*
442 * If the call site needs a page right now, provide it:
443 */
444 if (ret && !*ret) {
445 *ret = p;
446 continue;
447 }
448 spin_lock_irqsave(&pgd_lock, flags);
449 list_add(&p->lru, &page_pool);
450 pool_pages++;
451 spin_unlock_irqrestore(&pgd_lock, flags);
452 }
453
454 current->flags &= ~PF_MEMALLOC;
455}
456
457#define SHIFT_MB (20 - PAGE_SHIFT)
458#define ROUND_MB_GB ((1 << 10) - 1)
459#define SHIFT_MB_GB 10
460#define POOL_PAGES_PER_GB 16
461
462void __init cpa_init(void)
463{
464 struct sysinfo si;
465 unsigned long gb;
466
467 si_meminfo(&si);
468 /*
469 * Calculate the number of pool pages:
470 *
471 * Convert totalram (nr of pages) to MiB and round to the next
472 * GiB. Shift MiB to Gib and multiply the result by
473 * POOL_PAGES_PER_GB:
474 */
475 if (debug_pagealloc) {
476 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
477 pool_size = POOL_PAGES_PER_GB * gb;
478 } else {
479 pool_size = 1;
480 }
481 pool_low = pool_size;
482
483 cpa_fill_pool(NULL);
484 printk(KERN_DEBUG
485 "CPA: page pool initialized %lu of %lu pages preallocated\n",
486 pool_pages, pool_size);
487}
488
489static int split_large_page(pte_t *kpte, unsigned long address) 458static int split_large_page(pte_t *kpte, unsigned long address)
490{ 459{
491 unsigned long flags, pfn, pfninc = 1; 460 unsigned long flags, pfn, pfninc = 1;
@@ -494,28 +463,15 @@ static int split_large_page(pte_t *kpte, unsigned long address)
494 pgprot_t ref_prot; 463 pgprot_t ref_prot;
495 struct page *base; 464 struct page *base;
496 465
497 /* 466 if (!debug_pagealloc)
498 * Get a page from the pool. The pool list is protected by the 467 spin_unlock(&cpa_lock);
499 * pgd_lock, which we have to take anyway for the split 468 base = alloc_pages(GFP_KERNEL, 0);
500 * operation: 469 if (!debug_pagealloc)
501 */ 470 spin_lock(&cpa_lock);
502 spin_lock_irqsave(&pgd_lock, flags); 471 if (!base)
503 if (list_empty(&page_pool)) { 472 return -ENOMEM;
504 spin_unlock_irqrestore(&pgd_lock, flags);
505 base = NULL;
506 cpa_fill_pool(&base);
507 if (!base)
508 return -ENOMEM;
509 spin_lock_irqsave(&pgd_lock, flags);
510 } else {
511 base = list_first_entry(&page_pool, struct page, lru);
512 list_del(&base->lru);
513 pool_pages--;
514
515 if (pool_pages < pool_low)
516 pool_low = pool_pages;
517 }
518 473
474 spin_lock_irqsave(&pgd_lock, flags);
519 /* 475 /*
520 * Check for races, another CPU might have split this page 476 * Check for races, another CPU might have split this page
521 * up for us already: 477 * up for us already:
@@ -572,11 +528,8 @@ out_unlock:
572 * If we dropped out via the lookup_address check under 528 * If we dropped out via the lookup_address check under
573 * pgd_lock then stick the page back into the pool: 529 * pgd_lock then stick the page back into the pool:
574 */ 530 */
575 if (base) { 531 if (base)
576 list_add(&base->lru, &page_pool); 532 __free_page(base);
577 pool_pages++;
578 } else
579 pool_used++;
580 spin_unlock_irqrestore(&pgd_lock, flags); 533 spin_unlock_irqrestore(&pgd_lock, flags);
581 534
582 return 0; 535 return 0;
@@ -584,11 +537,16 @@ out_unlock:
584 537
585static int __change_page_attr(struct cpa_data *cpa, int primary) 538static int __change_page_attr(struct cpa_data *cpa, int primary)
586{ 539{
587 unsigned long address = cpa->vaddr; 540 unsigned long address;
588 int do_split, err; 541 int do_split, err;
589 unsigned int level; 542 unsigned int level;
590 pte_t *kpte, old_pte; 543 pte_t *kpte, old_pte;
591 544
545 if (cpa->flags & CPA_ARRAY)
546 address = cpa->vaddr[cpa->curpage];
547 else
548 address = *cpa->vaddr;
549
592repeat: 550repeat:
593 kpte = lookup_address(address, &level); 551 kpte = lookup_address(address, &level);
594 if (!kpte) 552 if (!kpte)
@@ -600,7 +558,7 @@ repeat:
600 return 0; 558 return 0;
601 WARN(1, KERN_WARNING "CPA: called for zero pte. " 559 WARN(1, KERN_WARNING "CPA: called for zero pte. "
602 "vaddr = %lx cpa->vaddr = %lx\n", address, 560 "vaddr = %lx cpa->vaddr = %lx\n", address,
603 cpa->vaddr); 561 *cpa->vaddr);
604 return -EINVAL; 562 return -EINVAL;
605 } 563 }
606 564
@@ -626,7 +584,7 @@ repeat:
626 */ 584 */
627 if (pte_val(old_pte) != pte_val(new_pte)) { 585 if (pte_val(old_pte) != pte_val(new_pte)) {
628 set_pte_atomic(kpte, new_pte); 586 set_pte_atomic(kpte, new_pte);
629 cpa->flushtlb = 1; 587 cpa->flags |= CPA_FLUSHTLB;
630 } 588 }
631 cpa->numpages = 1; 589 cpa->numpages = 1;
632 return 0; 590 return 0;
@@ -650,7 +608,25 @@ repeat:
650 */ 608 */
651 err = split_large_page(kpte, address); 609 err = split_large_page(kpte, address);
652 if (!err) { 610 if (!err) {
653 cpa->flushtlb = 1; 611 /*
612 * Do a global flush tlb after splitting the large page
613 * and before we do the actual change page attribute in the PTE.
614 *
615 * With out this, we violate the TLB application note, that says
616 * "The TLBs may contain both ordinary and large-page
617 * translations for a 4-KByte range of linear addresses. This
618 * may occur if software modifies the paging structures so that
619 * the page size used for the address range changes. If the two
620 * translations differ with respect to page frame or attributes
621 * (e.g., permissions), processor behavior is undefined and may
622 * be implementation-specific."
623 *
624 * We do this global tlb flush inside the cpa_lock, so that we
625 * don't allow any other cpu, with stale tlb entries change the
626 * page attribute in parallel, that also falls into the
627 * just split large page entry.
628 */
629 flush_tlb_all();
654 goto repeat; 630 goto repeat;
655 } 631 }
656 632
@@ -663,6 +639,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
663{ 639{
664 struct cpa_data alias_cpa; 640 struct cpa_data alias_cpa;
665 int ret = 0; 641 int ret = 0;
642 unsigned long temp_cpa_vaddr, vaddr;
666 643
667 if (cpa->pfn >= max_pfn_mapped) 644 if (cpa->pfn >= max_pfn_mapped)
668 return 0; 645 return 0;
@@ -675,16 +652,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
675 * No need to redo, when the primary call touched the direct 652 * No need to redo, when the primary call touched the direct
676 * mapping already: 653 * mapping already:
677 */ 654 */
678 if (!(within(cpa->vaddr, PAGE_OFFSET, 655 if (cpa->flags & CPA_ARRAY)
656 vaddr = cpa->vaddr[cpa->curpage];
657 else
658 vaddr = *cpa->vaddr;
659
660 if (!(within(vaddr, PAGE_OFFSET,
679 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) 661 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
680#ifdef CONFIG_X86_64 662#ifdef CONFIG_X86_64
681 || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32), 663 || within(vaddr, PAGE_OFFSET + (1UL<<32),
682 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) 664 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
683#endif 665#endif
684 )) { 666 )) {
685 667
686 alias_cpa = *cpa; 668 alias_cpa = *cpa;
687 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 669 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
670 alias_cpa.vaddr = &temp_cpa_vaddr;
671 alias_cpa.flags &= ~CPA_ARRAY;
672
688 673
689 ret = __change_page_attr_set_clr(&alias_cpa, 0); 674 ret = __change_page_attr_set_clr(&alias_cpa, 0);
690 } 675 }
@@ -696,7 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
696 * No need to redo, when the primary call touched the high 681 * No need to redo, when the primary call touched the high
697 * mapping already: 682 * mapping already:
698 */ 683 */
699 if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) 684 if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
700 return 0; 685 return 0;
701 686
702 /* 687 /*
@@ -707,8 +692,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
707 return 0; 692 return 0;
708 693
709 alias_cpa = *cpa; 694 alias_cpa = *cpa;
710 alias_cpa.vaddr = 695 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
711 (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; 696 alias_cpa.vaddr = &temp_cpa_vaddr;
697 alias_cpa.flags &= ~CPA_ARRAY;
712 698
713 /* 699 /*
714 * The high mapping range is imprecise, so ignore the return value. 700 * The high mapping range is imprecise, so ignore the return value.
@@ -728,8 +714,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
728 * preservation check. 714 * preservation check.
729 */ 715 */
730 cpa->numpages = numpages; 716 cpa->numpages = numpages;
717 /* for array changes, we can't use large page */
718 if (cpa->flags & CPA_ARRAY)
719 cpa->numpages = 1;
731 720
721 if (!debug_pagealloc)
722 spin_lock(&cpa_lock);
732 ret = __change_page_attr(cpa, checkalias); 723 ret = __change_page_attr(cpa, checkalias);
724 if (!debug_pagealloc)
725 spin_unlock(&cpa_lock);
733 if (ret) 726 if (ret)
734 return ret; 727 return ret;
735 728
@@ -746,7 +739,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
746 */ 739 */
747 BUG_ON(cpa->numpages > numpages); 740 BUG_ON(cpa->numpages > numpages);
748 numpages -= cpa->numpages; 741 numpages -= cpa->numpages;
749 cpa->vaddr += cpa->numpages * PAGE_SIZE; 742 if (cpa->flags & CPA_ARRAY)
743 cpa->curpage++;
744 else
745 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
746
750 } 747 }
751 return 0; 748 return 0;
752} 749}
@@ -757,9 +754,9 @@ static inline int cache_attr(pgprot_t attr)
757 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); 754 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
758} 755}
759 756
760static int change_page_attr_set_clr(unsigned long addr, int numpages, 757static int change_page_attr_set_clr(unsigned long *addr, int numpages,
761 pgprot_t mask_set, pgprot_t mask_clr, 758 pgprot_t mask_set, pgprot_t mask_clr,
762 int force_split) 759 int force_split, int array)
763{ 760{
764 struct cpa_data cpa; 761 struct cpa_data cpa;
765 int ret, cache, checkalias; 762 int ret, cache, checkalias;
@@ -774,21 +771,38 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
774 return 0; 771 return 0;
775 772
776 /* Ensure we are PAGE_SIZE aligned */ 773 /* Ensure we are PAGE_SIZE aligned */
777 if (addr & ~PAGE_MASK) { 774 if (!array) {
778 addr &= PAGE_MASK; 775 if (*addr & ~PAGE_MASK) {
779 /* 776 *addr &= PAGE_MASK;
780 * People should not be passing in unaligned addresses: 777 /*
781 */ 778 * People should not be passing in unaligned addresses:
782 WARN_ON_ONCE(1); 779 */
780 WARN_ON_ONCE(1);
781 }
782 } else {
783 int i;
784 for (i = 0; i < numpages; i++) {
785 if (addr[i] & ~PAGE_MASK) {
786 addr[i] &= PAGE_MASK;
787 WARN_ON_ONCE(1);
788 }
789 }
783 } 790 }
784 791
792 /* Must avoid aliasing mappings in the highmem code */
793 kmap_flush_unused();
794
785 cpa.vaddr = addr; 795 cpa.vaddr = addr;
786 cpa.numpages = numpages; 796 cpa.numpages = numpages;
787 cpa.mask_set = mask_set; 797 cpa.mask_set = mask_set;
788 cpa.mask_clr = mask_clr; 798 cpa.mask_clr = mask_clr;
789 cpa.flushtlb = 0; 799 cpa.flags = 0;
800 cpa.curpage = 0;
790 cpa.force_split = force_split; 801 cpa.force_split = force_split;
791 802
803 if (array)
804 cpa.flags |= CPA_ARRAY;
805
792 /* No alias checking for _NX bit modifications */ 806 /* No alias checking for _NX bit modifications */
793 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 807 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
794 808
@@ -797,7 +811,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
797 /* 811 /*
798 * Check whether we really changed something: 812 * Check whether we really changed something:
799 */ 813 */
800 if (!cpa.flushtlb) 814 if (!(cpa.flags & CPA_FLUSHTLB))
801 goto out; 815 goto out;
802 816
803 /* 817 /*
@@ -812,27 +826,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
812 * error case we fall back to cpa_flush_all (which uses 826 * error case we fall back to cpa_flush_all (which uses
813 * wbindv): 827 * wbindv):
814 */ 828 */
815 if (!ret && cpu_has_clflush) 829 if (!ret && cpu_has_clflush) {
816 cpa_flush_range(addr, numpages, cache); 830 if (cpa.flags & CPA_ARRAY)
817 else 831 cpa_flush_array(addr, numpages, cache);
832 else
833 cpa_flush_range(*addr, numpages, cache);
834 } else
818 cpa_flush_all(cache); 835 cpa_flush_all(cache);
819 836
820out: 837out:
821 cpa_fill_pool(NULL);
822
823 return ret; 838 return ret;
824} 839}
825 840
826static inline int change_page_attr_set(unsigned long addr, int numpages, 841static inline int change_page_attr_set(unsigned long *addr, int numpages,
827 pgprot_t mask) 842 pgprot_t mask, int array)
828{ 843{
829 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); 844 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
845 array);
830} 846}
831 847
832static inline int change_page_attr_clear(unsigned long addr, int numpages, 848static inline int change_page_attr_clear(unsigned long *addr, int numpages,
833 pgprot_t mask) 849 pgprot_t mask, int array)
834{ 850{
835 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); 851 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
852 array);
836} 853}
837 854
838int _set_memory_uc(unsigned long addr, int numpages) 855int _set_memory_uc(unsigned long addr, int numpages)
@@ -840,8 +857,8 @@ int _set_memory_uc(unsigned long addr, int numpages)
840 /* 857 /*
841 * for now UC MINUS. see comments in ioremap_nocache() 858 * for now UC MINUS. see comments in ioremap_nocache()
842 */ 859 */
843 return change_page_attr_set(addr, numpages, 860 return change_page_attr_set(&addr, numpages,
844 __pgprot(_PAGE_CACHE_UC_MINUS)); 861 __pgprot(_PAGE_CACHE_UC_MINUS), 0);
845} 862}
846 863
847int set_memory_uc(unsigned long addr, int numpages) 864int set_memory_uc(unsigned long addr, int numpages)
@@ -857,10 +874,48 @@ int set_memory_uc(unsigned long addr, int numpages)
857} 874}
858EXPORT_SYMBOL(set_memory_uc); 875EXPORT_SYMBOL(set_memory_uc);
859 876
877int set_memory_array_uc(unsigned long *addr, int addrinarray)
878{
879 unsigned long start;
880 unsigned long end;
881 int i;
882 /*
883 * for now UC MINUS. see comments in ioremap_nocache()
884 */
885 for (i = 0; i < addrinarray; i++) {
886 start = __pa(addr[i]);
887 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
888 if (end != __pa(addr[i + 1]))
889 break;
890 i++;
891 }
892 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
893 goto out;
894 }
895
896 return change_page_attr_set(addr, addrinarray,
897 __pgprot(_PAGE_CACHE_UC_MINUS), 1);
898out:
899 for (i = 0; i < addrinarray; i++) {
900 unsigned long tmp = __pa(addr[i]);
901
902 if (tmp == start)
903 break;
904 for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
905 if (end != __pa(addr[i + 1]))
906 break;
907 i++;
908 }
909 free_memtype(tmp, end);
910 }
911 return -EINVAL;
912}
913EXPORT_SYMBOL(set_memory_array_uc);
914
860int _set_memory_wc(unsigned long addr, int numpages) 915int _set_memory_wc(unsigned long addr, int numpages)
861{ 916{
862 return change_page_attr_set(addr, numpages, 917 return change_page_attr_set(&addr, numpages,
863 __pgprot(_PAGE_CACHE_WC)); 918 __pgprot(_PAGE_CACHE_WC), 0);
864} 919}
865 920
866int set_memory_wc(unsigned long addr, int numpages) 921int set_memory_wc(unsigned long addr, int numpages)
@@ -878,8 +933,8 @@ EXPORT_SYMBOL(set_memory_wc);
878 933
879int _set_memory_wb(unsigned long addr, int numpages) 934int _set_memory_wb(unsigned long addr, int numpages)
880{ 935{
881 return change_page_attr_clear(addr, numpages, 936 return change_page_attr_clear(&addr, numpages,
882 __pgprot(_PAGE_CACHE_MASK)); 937 __pgprot(_PAGE_CACHE_MASK), 0);
883} 938}
884 939
885int set_memory_wb(unsigned long addr, int numpages) 940int set_memory_wb(unsigned long addr, int numpages)
@@ -890,37 +945,59 @@ int set_memory_wb(unsigned long addr, int numpages)
890} 945}
891EXPORT_SYMBOL(set_memory_wb); 946EXPORT_SYMBOL(set_memory_wb);
892 947
948int set_memory_array_wb(unsigned long *addr, int addrinarray)
949{
950 int i;
951
952 for (i = 0; i < addrinarray; i++) {
953 unsigned long start = __pa(addr[i]);
954 unsigned long end;
955
956 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
957 if (end != __pa(addr[i + 1]))
958 break;
959 i++;
960 }
961 free_memtype(start, end);
962 }
963 return change_page_attr_clear(addr, addrinarray,
964 __pgprot(_PAGE_CACHE_MASK), 1);
965}
966EXPORT_SYMBOL(set_memory_array_wb);
967
893int set_memory_x(unsigned long addr, int numpages) 968int set_memory_x(unsigned long addr, int numpages)
894{ 969{
895 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); 970 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
896} 971}
897EXPORT_SYMBOL(set_memory_x); 972EXPORT_SYMBOL(set_memory_x);
898 973
899int set_memory_nx(unsigned long addr, int numpages) 974int set_memory_nx(unsigned long addr, int numpages)
900{ 975{
901 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); 976 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
902} 977}
903EXPORT_SYMBOL(set_memory_nx); 978EXPORT_SYMBOL(set_memory_nx);
904 979
905int set_memory_ro(unsigned long addr, int numpages) 980int set_memory_ro(unsigned long addr, int numpages)
906{ 981{
907 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); 982 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
908} 983}
984EXPORT_SYMBOL_GPL(set_memory_ro);
909 985
910int set_memory_rw(unsigned long addr, int numpages) 986int set_memory_rw(unsigned long addr, int numpages)
911{ 987{
912 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); 988 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
913} 989}
990EXPORT_SYMBOL_GPL(set_memory_rw);
914 991
915int set_memory_np(unsigned long addr, int numpages) 992int set_memory_np(unsigned long addr, int numpages)
916{ 993{
917 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); 994 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
918} 995}
919 996
920int set_memory_4k(unsigned long addr, int numpages) 997int set_memory_4k(unsigned long addr, int numpages)
921{ 998{
922 return change_page_attr_set_clr(addr, numpages, __pgprot(0), 999 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
923 __pgprot(0), 1); 1000 __pgprot(0), 1, 0);
924} 1001}
925 1002
926int set_pages_uc(struct page *page, int numpages) 1003int set_pages_uc(struct page *page, int numpages)
@@ -973,22 +1050,38 @@ int set_pages_rw(struct page *page, int numpages)
973 1050
974static int __set_pages_p(struct page *page, int numpages) 1051static int __set_pages_p(struct page *page, int numpages)
975{ 1052{
976 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1053 unsigned long tempaddr = (unsigned long) page_address(page);
1054 struct cpa_data cpa = { .vaddr = &tempaddr,
977 .numpages = numpages, 1055 .numpages = numpages,
978 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1056 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
979 .mask_clr = __pgprot(0)}; 1057 .mask_clr = __pgprot(0),
1058 .flags = 0};
980 1059
981 return __change_page_attr_set_clr(&cpa, 1); 1060 /*
1061 * No alias checking needed for setting present flag. otherwise,
1062 * we may need to break large pages for 64-bit kernel text
1063 * mappings (this adds to complexity if we want to do this from
1064 * atomic context especially). Let's keep it simple!
1065 */
1066 return __change_page_attr_set_clr(&cpa, 0);
982} 1067}
983 1068
984static int __set_pages_np(struct page *page, int numpages) 1069static int __set_pages_np(struct page *page, int numpages)
985{ 1070{
986 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1071 unsigned long tempaddr = (unsigned long) page_address(page);
1072 struct cpa_data cpa = { .vaddr = &tempaddr,
987 .numpages = numpages, 1073 .numpages = numpages,
988 .mask_set = __pgprot(0), 1074 .mask_set = __pgprot(0),
989 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; 1075 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1076 .flags = 0};
990 1077
991 return __change_page_attr_set_clr(&cpa, 1); 1078 /*
1079 * No alias checking needed for setting not present flag. otherwise,
1080 * we may need to break large pages for 64-bit kernel text
1081 * mappings (this adds to complexity if we want to do this from
1082 * atomic context especially). Let's keep it simple!
1083 */
1084 return __change_page_attr_set_clr(&cpa, 0);
992} 1085}
993 1086
994void kernel_map_pages(struct page *page, int numpages, int enable) 1087void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -1008,11 +1101,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
1008 1101
1009 /* 1102 /*
1010 * The return value is ignored as the calls cannot fail. 1103 * The return value is ignored as the calls cannot fail.
1011 * Large pages are kept enabled at boot time, and are 1104 * Large pages for identity mappings are not used at boot time
1012 * split up quickly with DEBUG_PAGEALLOC. If a splitup 1105 * and hence no memory allocations during large page split.
1013 * fails here (due to temporary memory shortage) no damage
1014 * is done because we just keep the largepage intact up
1015 * to the next attempt when it will likely be split up:
1016 */ 1106 */
1017 if (enable) 1107 if (enable)
1018 __set_pages_p(page, numpages); 1108 __set_pages_p(page, numpages);
@@ -1024,53 +1114,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
1024 * but that can deadlock->flush only current cpu: 1114 * but that can deadlock->flush only current cpu:
1025 */ 1115 */
1026 __flush_tlb_all(); 1116 __flush_tlb_all();
1027
1028 /*
1029 * Try to refill the page pool here. We can do this only after
1030 * the tlb flush.
1031 */
1032 cpa_fill_pool(NULL);
1033} 1117}
1034 1118
1035#ifdef CONFIG_DEBUG_FS
1036static int dpa_show(struct seq_file *m, void *v)
1037{
1038 seq_puts(m, "DEBUG_PAGEALLOC\n");
1039 seq_printf(m, "pool_size : %lu\n", pool_size);
1040 seq_printf(m, "pool_pages : %lu\n", pool_pages);
1041 seq_printf(m, "pool_low : %lu\n", pool_low);
1042 seq_printf(m, "pool_used : %lu\n", pool_used);
1043 seq_printf(m, "pool_failed : %lu\n", pool_failed);
1044
1045 return 0;
1046}
1047
1048static int dpa_open(struct inode *inode, struct file *filp)
1049{
1050 return single_open(filp, dpa_show, NULL);
1051}
1052
1053static const struct file_operations dpa_fops = {
1054 .open = dpa_open,
1055 .read = seq_read,
1056 .llseek = seq_lseek,
1057 .release = single_release,
1058};
1059
1060static int __init debug_pagealloc_proc_init(void)
1061{
1062 struct dentry *de;
1063
1064 de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
1065 &dpa_fops);
1066 if (!de)
1067 return -ENOMEM;
1068
1069 return 0;
1070}
1071__initcall(debug_pagealloc_proc_init);
1072#endif
1073
1074#ifdef CONFIG_HIBERNATION 1119#ifdef CONFIG_HIBERNATION
1075 1120
1076bool kernel_page_present(struct page *page) 1121bool kernel_page_present(struct page *page)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2a50e0fa64a5..738fd0f24958 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -7,24 +7,24 @@
7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. 7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
8 */ 8 */
9 9
10#include <linux/mm.h> 10#include <linux/seq_file.h>
11#include <linux/bootmem.h>
12#include <linux/debugfs.h>
11#include <linux/kernel.h> 13#include <linux/kernel.h>
12#include <linux/gfp.h> 14#include <linux/gfp.h>
15#include <linux/mm.h>
13#include <linux/fs.h> 16#include <linux/fs.h>
14#include <linux/bootmem.h>
15#include <linux/debugfs.h>
16#include <linux/seq_file.h>
17 17
18#include <asm/msr.h> 18#include <asm/cacheflush.h>
19#include <asm/tlbflush.h>
20#include <asm/processor.h> 19#include <asm/processor.h>
21#include <asm/page.h> 20#include <asm/tlbflush.h>
22#include <asm/pgtable.h> 21#include <asm/pgtable.h>
23#include <asm/pat.h>
24#include <asm/e820.h>
25#include <asm/cacheflush.h>
26#include <asm/fcntl.h> 22#include <asm/fcntl.h>
23#include <asm/e820.h>
27#include <asm/mtrr.h> 24#include <asm/mtrr.h>
25#include <asm/page.h>
26#include <asm/msr.h>
27#include <asm/pat.h>
28#include <asm/io.h> 28#include <asm/io.h>
29 29
30#ifdef CONFIG_X86_PAT 30#ifdef CONFIG_X86_PAT
@@ -46,6 +46,7 @@ early_param("nopat", nopat);
46 46
47 47
48static int debug_enable; 48static int debug_enable;
49
49static int __init pat_debug_setup(char *str) 50static int __init pat_debug_setup(char *str)
50{ 51{
51 debug_enable = 1; 52 debug_enable = 1;
@@ -145,14 +146,14 @@ static char *cattr_name(unsigned long flags)
145 */ 146 */
146 147
147struct memtype { 148struct memtype {
148 u64 start; 149 u64 start;
149 u64 end; 150 u64 end;
150 unsigned long type; 151 unsigned long type;
151 struct list_head nd; 152 struct list_head nd;
152}; 153};
153 154
154static LIST_HEAD(memtype_list); 155static LIST_HEAD(memtype_list);
155static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ 156static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
156 157
157/* 158/*
158 * Does intersection of PAT memory type and MTRR memory type and returns 159 * Does intersection of PAT memory type and MTRR memory type and returns
@@ -180,8 +181,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
180 return req_type; 181 return req_type;
181} 182}
182 183
183static int chk_conflict(struct memtype *new, struct memtype *entry, 184static int
184 unsigned long *type) 185chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
185{ 186{
186 if (new->type != entry->type) { 187 if (new->type != entry->type) {
187 if (type) { 188 if (type) {
@@ -211,6 +212,66 @@ static struct memtype *cached_entry;
211static u64 cached_start; 212static u64 cached_start;
212 213
213/* 214/*
215 * For RAM pages, mark the pages as non WB memory type using
216 * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
217 * set_memory_wc() on a RAM page at a time before marking it as WB again.
218 * This is ok, because only one driver will be owning the page and
219 * doing set_memory_*() calls.
220 *
221 * For now, we use PageNonWB to track that the RAM page is being mapped
222 * as non WB. In future, we will have to use one more flag
223 * (or some other mechanism in page_struct) to distinguish between
224 * UC and WC mapping.
225 */
226static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
227 unsigned long *new_type)
228{
229 struct page *page;
230 u64 pfn, end_pfn;
231
232 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
233 page = pfn_to_page(pfn);
234 if (page_mapped(page) || PageNonWB(page))
235 goto out;
236
237 SetPageNonWB(page);
238 }
239 return 0;
240
241out:
242 end_pfn = pfn;
243 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
244 page = pfn_to_page(pfn);
245 ClearPageNonWB(page);
246 }
247
248 return -EINVAL;
249}
250
251static int free_ram_pages_type(u64 start, u64 end)
252{
253 struct page *page;
254 u64 pfn, end_pfn;
255
256 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
257 page = pfn_to_page(pfn);
258 if (page_mapped(page) || !PageNonWB(page))
259 goto out;
260
261 ClearPageNonWB(page);
262 }
263 return 0;
264
265out:
266 end_pfn = pfn;
267 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
268 page = pfn_to_page(pfn);
269 SetPageNonWB(page);
270 }
271 return -EINVAL;
272}
273
274/*
214 * req_type typically has one of the: 275 * req_type typically has one of the:
215 * - _PAGE_CACHE_WB 276 * - _PAGE_CACHE_WB
216 * - _PAGE_CACHE_WC 277 * - _PAGE_CACHE_WC
@@ -226,14 +287,15 @@ static u64 cached_start;
226 * it will return a negative return value. 287 * it will return a negative return value.
227 */ 288 */
228int reserve_memtype(u64 start, u64 end, unsigned long req_type, 289int reserve_memtype(u64 start, u64 end, unsigned long req_type,
229 unsigned long *new_type) 290 unsigned long *new_type)
230{ 291{
231 struct memtype *new, *entry; 292 struct memtype *new, *entry;
232 unsigned long actual_type; 293 unsigned long actual_type;
233 struct list_head *where; 294 struct list_head *where;
295 int is_range_ram;
234 int err = 0; 296 int err = 0;
235 297
236 BUG_ON(start >= end); /* end is exclusive */ 298 BUG_ON(start >= end); /* end is exclusive */
237 299
238 if (!pat_enabled) { 300 if (!pat_enabled) {
239 /* This is identical to page table setting without PAT */ 301 /* This is identical to page table setting without PAT */
@@ -266,17 +328,24 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
266 actual_type = _PAGE_CACHE_WB; 328 actual_type = _PAGE_CACHE_WB;
267 else 329 else
268 actual_type = _PAGE_CACHE_UC_MINUS; 330 actual_type = _PAGE_CACHE_UC_MINUS;
269 } else 331 } else {
270 actual_type = pat_x_mtrr_type(start, end, 332 actual_type = pat_x_mtrr_type(start, end,
271 req_type & _PAGE_CACHE_MASK); 333 req_type & _PAGE_CACHE_MASK);
334 }
335
336 is_range_ram = pagerange_is_ram(start, end);
337 if (is_range_ram == 1)
338 return reserve_ram_pages_type(start, end, req_type, new_type);
339 else if (is_range_ram < 0)
340 return -EINVAL;
272 341
273 new = kmalloc(sizeof(struct memtype), GFP_KERNEL); 342 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
274 if (!new) 343 if (!new)
275 return -ENOMEM; 344 return -ENOMEM;
276 345
277 new->start = start; 346 new->start = start;
278 new->end = end; 347 new->end = end;
279 new->type = actual_type; 348 new->type = actual_type;
280 349
281 if (new_type) 350 if (new_type)
282 *new_type = actual_type; 351 *new_type = actual_type;
@@ -335,6 +404,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
335 start, end, cattr_name(new->type), cattr_name(req_type)); 404 start, end, cattr_name(new->type), cattr_name(req_type));
336 kfree(new); 405 kfree(new);
337 spin_unlock(&memtype_lock); 406 spin_unlock(&memtype_lock);
407
338 return err; 408 return err;
339 } 409 }
340 410
@@ -358,6 +428,7 @@ int free_memtype(u64 start, u64 end)
358{ 428{
359 struct memtype *entry; 429 struct memtype *entry;
360 int err = -EINVAL; 430 int err = -EINVAL;
431 int is_range_ram;
361 432
362 if (!pat_enabled) 433 if (!pat_enabled)
363 return 0; 434 return 0;
@@ -366,6 +437,12 @@ int free_memtype(u64 start, u64 end)
366 if (is_ISA_range(start, end - 1)) 437 if (is_ISA_range(start, end - 1))
367 return 0; 438 return 0;
368 439
440 is_range_ram = pagerange_is_ram(start, end);
441 if (is_range_ram == 1)
442 return free_ram_pages_type(start, end);
443 else if (is_range_ram < 0)
444 return -EINVAL;
445
369 spin_lock(&memtype_lock); 446 spin_lock(&memtype_lock);
370 list_for_each_entry(entry, &memtype_list, nd) { 447 list_for_each_entry(entry, &memtype_list, nd) {
371 if (entry->start == start && entry->end == end) { 448 if (entry->start == start && entry->end == end) {
@@ -386,6 +463,7 @@ int free_memtype(u64 start, u64 end)
386 } 463 }
387 464
388 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); 465 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
466
389 return err; 467 return err;
390} 468}
391 469
@@ -492,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
492 570
493void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) 571void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
494{ 572{
573 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
495 u64 addr = (u64)pfn << PAGE_SHIFT; 574 u64 addr = (u64)pfn << PAGE_SHIFT;
496 unsigned long flags; 575 unsigned long flags;
497 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
498 576
499 reserve_memtype(addr, addr + size, want_flags, &flags); 577 reserve_memtype(addr, addr + size, want_flags, &flags);
500 if (flags != want_flags) { 578 if (flags != want_flags) {
@@ -514,7 +592,7 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
514 free_memtype(addr, addr + size); 592 free_memtype(addr, addr + size);
515} 593}
516 594
517#if defined(CONFIG_DEBUG_FS) 595#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
518 596
519/* get Nth element of the linked list */ 597/* get Nth element of the linked list */
520static struct memtype *memtype_get_idx(loff_t pos) 598static struct memtype *memtype_get_idx(loff_t pos)
@@ -537,6 +615,7 @@ static struct memtype *memtype_get_idx(loff_t pos)
537 } 615 }
538 spin_unlock(&memtype_lock); 616 spin_unlock(&memtype_lock);
539 kfree(print_entry); 617 kfree(print_entry);
618
540 return NULL; 619 return NULL;
541} 620}
542 621
@@ -567,6 +646,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v)
567 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), 646 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
568 print_entry->start, print_entry->end); 647 print_entry->start, print_entry->end);
569 kfree(print_entry); 648 kfree(print_entry);
649
570 return 0; 650 return 0;
571} 651}
572 652
@@ -598,4 +678,4 @@ static int __init pat_memtype_list_init(void)
598 678
599late_initcall(pat_memtype_list_init); 679late_initcall(pat_memtype_list_init);
600 680
601#endif /* CONFIG_DEBUG_FS */ 681#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d50302774fe2..86f2ffc43c3d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -63,10 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd)
63#define UNSHARED_PTRS_PER_PGD \ 63#define UNSHARED_PTRS_PER_PGD \
64 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 64 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
65 65
66static void pgd_ctor(void *p) 66static void pgd_ctor(pgd_t *pgd)
67{ 67{
68 pgd_t *pgd = p;
69
70 /* If the pgd points to a shared pagetable level (either the 68 /* If the pgd points to a shared pagetable level (either the
71 ptes in non-PAE, or shared PMD in PAE), then just copy the 69 ptes in non-PAE, or shared PMD in PAE), then just copy the
72 references from swapper_pg_dir. */ 70 references from swapper_pg_dir. */
@@ -87,7 +85,7 @@ static void pgd_ctor(void *p)
87 pgd_list_add(pgd); 85 pgd_list_add(pgd);
88} 86}
89 87
90static void pgd_dtor(void *pgd) 88static void pgd_dtor(pgd_t *pgd)
91{ 89{
92 unsigned long flags; /* can be called from interrupt context */ 90 unsigned long flags; /* can be called from interrupt context */
93 91
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index cab0abbd1ebe..0951db9ee519 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -123,7 +123,8 @@ static int __init parse_vmalloc(char *arg)
123 if (!arg) 123 if (!arg)
124 return -EINVAL; 124 return -EINVAL;
125 125
126 __VMALLOC_RESERVE = memparse(arg, &arg); 126 /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
127 __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
127 return 0; 128 return 0;
128} 129}
129early_param("vmalloc", parse_vmalloc); 130early_param("vmalloc", parse_vmalloc);
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 0227694f7dab..8a5f1614a3d5 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -295,10 +295,12 @@ static void nmi_cpu_shutdown(void *dummy)
295 295
296static void nmi_shutdown(void) 296static void nmi_shutdown(void)
297{ 297{
298 struct op_msrs *msrs = &get_cpu_var(cpu_msrs); 298 struct op_msrs *msrs;
299
299 nmi_enabled = 0; 300 nmi_enabled = 0;
300 on_each_cpu(nmi_cpu_shutdown, NULL, 1); 301 on_each_cpu(nmi_cpu_shutdown, NULL, 1);
301 unregister_die_notifier(&profile_exceptions_nb); 302 unregister_die_notifier(&profile_exceptions_nb);
303 msrs = &get_cpu_var(cpu_msrs);
302 model->shutdown(msrs); 304 model->shutdown(msrs);
303 free_msrs(); 305 free_msrs();
304 put_cpu_var(cpu_msrs); 306 put_cpu_var(cpu_msrs);
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 56b4757a1f47..43ac5af338d8 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -10,11 +10,12 @@
10 10
11#include <linux/oprofile.h> 11#include <linux/oprofile.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/ptrace.h>
14#include <linux/nmi.h>
13#include <asm/msr.h> 15#include <asm/msr.h>
14#include <asm/ptrace.h>
15#include <asm/fixmap.h> 16#include <asm/fixmap.h>
16#include <asm/apic.h> 17#include <asm/apic.h>
17#include <asm/nmi.h> 18
18 19
19#include "op_x86_model.h" 20#include "op_x86_model.h"
20#include "op_counter.h" 21#include "op_counter.h"
@@ -40,7 +41,7 @@ static unsigned int num_controls = NUM_CONTROLS_NON_HT;
40static inline void setup_num_counters(void) 41static inline void setup_num_counters(void)
41{ 42{
42#ifdef CONFIG_SMP 43#ifdef CONFIG_SMP
43 if (smp_num_siblings == 2){ 44 if (smp_num_siblings == 2) {
44 num_counters = NUM_COUNTERS_HT2; 45 num_counters = NUM_COUNTERS_HT2;
45 num_controls = NUM_CONTROLS_HT2; 46 num_controls = NUM_CONTROLS_HT2;
46 } 47 }
@@ -86,7 +87,7 @@ struct p4_event_binding {
86#define CTR_FLAME_2 (1 << 6) 87#define CTR_FLAME_2 (1 << 6)
87#define CTR_IQ_5 (1 << 7) 88#define CTR_IQ_5 (1 << 7)
88 89
89static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { 90static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
90 { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, 91 { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
91 { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, 92 { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
92 { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, 93 { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
@@ -97,32 +98,32 @@ static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
97 { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } 98 { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
98}; 99};
99 100
100#define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT 101#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
101 102
102/* p4 event codes in libop/op_event.h are indices into this table. */ 103/* p4 event codes in libop/op_event.h are indices into this table. */
103 104
104static struct p4_event_binding p4_events[NUM_EVENTS] = { 105static struct p4_event_binding p4_events[NUM_EVENTS] = {
105 106
106 { /* BRANCH_RETIRED */ 107 { /* BRANCH_RETIRED */
107 0x05, 0x06, 108 0x05, 0x06,
108 { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, 109 { {CTR_IQ_4, MSR_P4_CRU_ESCR2},
109 {CTR_IQ_5, MSR_P4_CRU_ESCR3} } 110 {CTR_IQ_5, MSR_P4_CRU_ESCR3} }
110 }, 111 },
111 112
112 { /* MISPRED_BRANCH_RETIRED */ 113 { /* MISPRED_BRANCH_RETIRED */
113 0x04, 0x03, 114 0x04, 0x03,
114 { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, 115 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
115 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 116 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
116 }, 117 },
117 118
118 { /* TC_DELIVER_MODE */ 119 { /* TC_DELIVER_MODE */
119 0x01, 0x01, 120 0x01, 0x01,
120 { { CTR_MS_0, MSR_P4_TC_ESCR0}, 121 { { CTR_MS_0, MSR_P4_TC_ESCR0},
121 { CTR_MS_2, MSR_P4_TC_ESCR1} } 122 { CTR_MS_2, MSR_P4_TC_ESCR1} }
122 }, 123 },
123 124
124 { /* BPU_FETCH_REQUEST */ 125 { /* BPU_FETCH_REQUEST */
125 0x00, 0x03, 126 0x00, 0x03,
126 { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, 127 { { CTR_BPU_0, MSR_P4_BPU_ESCR0},
127 { CTR_BPU_2, MSR_P4_BPU_ESCR1} } 128 { CTR_BPU_2, MSR_P4_BPU_ESCR1} }
128 }, 129 },
@@ -146,7 +147,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
146 }, 147 },
147 148
148 { /* LOAD_PORT_REPLAY */ 149 { /* LOAD_PORT_REPLAY */
149 0x02, 0x04, 150 0x02, 0x04,
150 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, 151 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
151 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } 152 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
152 }, 153 },
@@ -170,43 +171,43 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
170 }, 171 },
171 172
172 { /* BSQ_CACHE_REFERENCE */ 173 { /* BSQ_CACHE_REFERENCE */
173 0x07, 0x0c, 174 0x07, 0x0c,
174 { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, 175 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
175 { CTR_BPU_2, MSR_P4_BSU_ESCR1} } 176 { CTR_BPU_2, MSR_P4_BSU_ESCR1} }
176 }, 177 },
177 178
178 { /* IOQ_ALLOCATION */ 179 { /* IOQ_ALLOCATION */
179 0x06, 0x03, 180 0x06, 0x03,
180 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 181 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
181 { 0, 0 } } 182 { 0, 0 } }
182 }, 183 },
183 184
184 { /* IOQ_ACTIVE_ENTRIES */ 185 { /* IOQ_ACTIVE_ENTRIES */
185 0x06, 0x1a, 186 0x06, 0x1a,
186 { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, 187 { { CTR_BPU_2, MSR_P4_FSB_ESCR1},
187 { 0, 0 } } 188 { 0, 0 } }
188 }, 189 },
189 190
190 { /* FSB_DATA_ACTIVITY */ 191 { /* FSB_DATA_ACTIVITY */
191 0x06, 0x17, 192 0x06, 0x17,
192 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 193 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
193 { CTR_BPU_2, MSR_P4_FSB_ESCR1} } 194 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
194 }, 195 },
195 196
196 { /* BSQ_ALLOCATION */ 197 { /* BSQ_ALLOCATION */
197 0x07, 0x05, 198 0x07, 0x05,
198 { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, 199 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
199 { 0, 0 } } 200 { 0, 0 } }
200 }, 201 },
201 202
202 { /* BSQ_ACTIVE_ENTRIES */ 203 { /* BSQ_ACTIVE_ENTRIES */
203 0x07, 0x06, 204 0x07, 0x06,
204 { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, 205 { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
205 { 0, 0 } } 206 { 0, 0 } }
206 }, 207 },
207 208
208 { /* X87_ASSIST */ 209 { /* X87_ASSIST */
209 0x05, 0x03, 210 0x05, 0x03,
210 { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, 211 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
211 { CTR_IQ_5, MSR_P4_CRU_ESCR3} } 212 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
212 }, 213 },
@@ -216,21 +217,21 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
216 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 217 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
217 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 218 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
218 }, 219 },
219 220
220 { /* PACKED_SP_UOP */ 221 { /* PACKED_SP_UOP */
221 0x01, 0x08, 222 0x01, 0x08,
222 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 223 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
223 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 224 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
224 }, 225 },
225 226
226 { /* PACKED_DP_UOP */ 227 { /* PACKED_DP_UOP */
227 0x01, 0x0c, 228 0x01, 0x0c,
228 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 229 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
229 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 230 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
230 }, 231 },
231 232
232 { /* SCALAR_SP_UOP */ 233 { /* SCALAR_SP_UOP */
233 0x01, 0x0a, 234 0x01, 0x0a,
234 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 235 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
235 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 236 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
236 }, 237 },
@@ -242,31 +243,31 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
242 }, 243 },
243 244
244 { /* 64BIT_MMX_UOP */ 245 { /* 64BIT_MMX_UOP */
245 0x01, 0x02, 246 0x01, 0x02,
246 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 247 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
247 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 248 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
248 }, 249 },
249 250
250 { /* 128BIT_MMX_UOP */ 251 { /* 128BIT_MMX_UOP */
251 0x01, 0x1a, 252 0x01, 0x1a,
252 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 253 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
253 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 254 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
254 }, 255 },
255 256
256 { /* X87_FP_UOP */ 257 { /* X87_FP_UOP */
257 0x01, 0x04, 258 0x01, 0x04,
258 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 259 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
259 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 260 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
260 }, 261 },
261 262
262 { /* X87_SIMD_MOVES_UOP */ 263 { /* X87_SIMD_MOVES_UOP */
263 0x01, 0x2e, 264 0x01, 0x2e,
264 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 265 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
265 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 266 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
266 }, 267 },
267 268
268 { /* MACHINE_CLEAR */ 269 { /* MACHINE_CLEAR */
269 0x05, 0x02, 270 0x05, 0x02,
270 { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, 271 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
271 { CTR_IQ_5, MSR_P4_CRU_ESCR3} } 272 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
272 }, 273 },
@@ -276,9 +277,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
276 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 277 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
277 { CTR_BPU_2, MSR_P4_FSB_ESCR1} } 278 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
278 }, 279 },
279 280
280 { /* TC_MS_XFER */ 281 { /* TC_MS_XFER */
281 0x00, 0x05, 282 0x00, 0x05,
282 { { CTR_MS_0, MSR_P4_MS_ESCR0}, 283 { { CTR_MS_0, MSR_P4_MS_ESCR0},
283 { CTR_MS_2, MSR_P4_MS_ESCR1} } 284 { CTR_MS_2, MSR_P4_MS_ESCR1} }
284 }, 285 },
@@ -308,7 +309,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
308 }, 309 },
309 310
310 { /* INSTR_RETIRED */ 311 { /* INSTR_RETIRED */
311 0x04, 0x02, 312 0x04, 0x02,
312 { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, 313 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
313 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 314 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
314 }, 315 },
@@ -319,14 +320,14 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
319 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 320 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
320 }, 321 },
321 322
322 { /* UOP_TYPE */ 323 { /* UOP_TYPE */
323 0x02, 0x02, 324 0x02, 0x02,
324 { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, 325 { { CTR_IQ_4, MSR_P4_RAT_ESCR0},
325 { CTR_IQ_5, MSR_P4_RAT_ESCR1} } 326 { CTR_IQ_5, MSR_P4_RAT_ESCR1} }
326 }, 327 },
327 328
328 { /* RETIRED_MISPRED_BRANCH_TYPE */ 329 { /* RETIRED_MISPRED_BRANCH_TYPE */
329 0x02, 0x05, 330 0x02, 0x05,
330 { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, 331 { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
331 { CTR_MS_2, MSR_P4_TBPU_ESCR1} } 332 { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
332 }, 333 },
@@ -349,8 +350,8 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
349#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) 350#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
350#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) 351#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
351#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) 352#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
352#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) 353#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
353#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) 354#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
354 355
355#define CCCR_RESERVED_BITS 0x38030FFF 356#define CCCR_RESERVED_BITS 0x38030FFF
356#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) 357#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
@@ -360,15 +361,15 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
360#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) 361#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
361#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) 362#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
362#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) 363#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
363#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) 364#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
364#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) 365#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
365#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) 366#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
366#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) 367#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
367 368
368#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) 369#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
369#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0) 370#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
370#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0) 371#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0)
371#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0) 372#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0)
372#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) 373#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
373 374
374 375
@@ -380,7 +381,7 @@ static unsigned int get_stagger(void)
380#ifdef CONFIG_SMP 381#ifdef CONFIG_SMP
381 int cpu = smp_processor_id(); 382 int cpu = smp_processor_id();
382 return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu))); 383 return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu)));
383#endif 384#endif
384 return 0; 385 return 0;
385} 386}
386 387
@@ -395,25 +396,23 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT];
395 396
396static void p4_fill_in_addresses(struct op_msrs * const msrs) 397static void p4_fill_in_addresses(struct op_msrs * const msrs)
397{ 398{
398 unsigned int i; 399 unsigned int i;
399 unsigned int addr, cccraddr, stag; 400 unsigned int addr, cccraddr, stag;
400 401
401 setup_num_counters(); 402 setup_num_counters();
402 stag = get_stagger(); 403 stag = get_stagger();
403 404
404 /* initialize some registers */ 405 /* initialize some registers */
405 for (i = 0; i < num_counters; ++i) { 406 for (i = 0; i < num_counters; ++i)
406 msrs->counters[i].addr = 0; 407 msrs->counters[i].addr = 0;
407 } 408 for (i = 0; i < num_controls; ++i)
408 for (i = 0; i < num_controls; ++i) {
409 msrs->controls[i].addr = 0; 409 msrs->controls[i].addr = 0;
410 } 410
411
412 /* the counter & cccr registers we pay attention to */ 411 /* the counter & cccr registers we pay attention to */
413 for (i = 0; i < num_counters; ++i) { 412 for (i = 0; i < num_counters; ++i) {
414 addr = p4_counters[VIRT_CTR(stag, i)].counter_address; 413 addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
415 cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; 414 cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
416 if (reserve_perfctr_nmi(addr)){ 415 if (reserve_perfctr_nmi(addr)) {
417 msrs->counters[i].addr = addr; 416 msrs->counters[i].addr = addr;
418 msrs->controls[i].addr = cccraddr; 417 msrs->controls[i].addr = cccraddr;
419 } 418 }
@@ -447,22 +446,22 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
447 if (reserve_evntsel_nmi(addr)) 446 if (reserve_evntsel_nmi(addr))
448 msrs->controls[i].addr = addr; 447 msrs->controls[i].addr = addr;
449 } 448 }
450 449
451 for (addr = MSR_P4_MS_ESCR0 + stag; 450 for (addr = MSR_P4_MS_ESCR0 + stag;
452 addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { 451 addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
453 if (reserve_evntsel_nmi(addr)) 452 if (reserve_evntsel_nmi(addr))
454 msrs->controls[i].addr = addr; 453 msrs->controls[i].addr = addr;
455 } 454 }
456 455
457 for (addr = MSR_P4_IX_ESCR0 + stag; 456 for (addr = MSR_P4_IX_ESCR0 + stag;
458 addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { 457 addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
459 if (reserve_evntsel_nmi(addr)) 458 if (reserve_evntsel_nmi(addr))
460 msrs->controls[i].addr = addr; 459 msrs->controls[i].addr = addr;
461 } 460 }
462 461
463 /* there are 2 remaining non-contiguously located ESCRs */ 462 /* there are 2 remaining non-contiguously located ESCRs */
464 463
465 if (num_counters == NUM_COUNTERS_NON_HT) { 464 if (num_counters == NUM_COUNTERS_NON_HT) {
466 /* standard non-HT CPUs handle both remaining ESCRs*/ 465 /* standard non-HT CPUs handle both remaining ESCRs*/
467 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) 466 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
468 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; 467 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
@@ -498,20 +497,20 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
498 unsigned int stag; 497 unsigned int stag;
499 498
500 stag = get_stagger(); 499 stag = get_stagger();
501 500
502 /* convert from counter *number* to counter *bit* */ 501 /* convert from counter *number* to counter *bit* */
503 counter_bit = 1 << VIRT_CTR(stag, ctr); 502 counter_bit = 1 << VIRT_CTR(stag, ctr);
504 503
505 /* find our event binding structure. */ 504 /* find our event binding structure. */
506 if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { 505 if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
507 printk(KERN_ERR 506 printk(KERN_ERR
508 "oprofile: P4 event code 0x%lx out of range\n", 507 "oprofile: P4 event code 0x%lx out of range\n",
509 counter_config[ctr].event); 508 counter_config[ctr].event);
510 return; 509 return;
511 } 510 }
512 511
513 ev = &(p4_events[counter_config[ctr].event - 1]); 512 ev = &(p4_events[counter_config[ctr].event - 1]);
514 513
515 for (i = 0; i < maxbind; i++) { 514 for (i = 0; i < maxbind; i++) {
516 if (ev->bindings[i].virt_counter & counter_bit) { 515 if (ev->bindings[i].virt_counter & counter_bit) {
517 516
@@ -526,25 +525,24 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
526 ESCR_SET_OS_1(escr, counter_config[ctr].kernel); 525 ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
527 } 526 }
528 ESCR_SET_EVENT_SELECT(escr, ev->event_select); 527 ESCR_SET_EVENT_SELECT(escr, ev->event_select);
529 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); 528 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
530 ESCR_WRITE(escr, high, ev, i); 529 ESCR_WRITE(escr, high, ev, i);
531 530
532 /* modify CCCR */ 531 /* modify CCCR */
533 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); 532 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr));
534 CCCR_CLEAR(cccr); 533 CCCR_CLEAR(cccr);
535 CCCR_SET_REQUIRED_BITS(cccr); 534 CCCR_SET_REQUIRED_BITS(cccr);
536 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); 535 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
537 if (stag == 0) { 536 if (stag == 0)
538 CCCR_SET_PMI_OVF_0(cccr); 537 CCCR_SET_PMI_OVF_0(cccr);
539 } else { 538 else
540 CCCR_SET_PMI_OVF_1(cccr); 539 CCCR_SET_PMI_OVF_1(cccr);
541 }
542 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); 540 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr));
543 return; 541 return;
544 } 542 }
545 } 543 }
546 544
547 printk(KERN_ERR 545 printk(KERN_ERR
548 "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", 546 "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
549 counter_config[ctr].event, stag, ctr); 547 counter_config[ctr].event, stag, ctr);
550} 548}
@@ -559,14 +557,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
559 stag = get_stagger(); 557 stag = get_stagger();
560 558
561 rdmsr(MSR_IA32_MISC_ENABLE, low, high); 559 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
562 if (! MISC_PMC_ENABLED_P(low)) { 560 if (!MISC_PMC_ENABLED_P(low)) {
563 printk(KERN_ERR "oprofile: P4 PMC not available\n"); 561 printk(KERN_ERR "oprofile: P4 PMC not available\n");
564 return; 562 return;
565 } 563 }
566 564
567 /* clear the cccrs we will use */ 565 /* clear the cccrs we will use */
568 for (i = 0 ; i < num_counters ; i++) { 566 for (i = 0 ; i < num_counters ; i++) {
569 if (unlikely(!CTRL_IS_RESERVED(msrs,i))) 567 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
570 continue; 568 continue;
571 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); 569 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
572 CCCR_CLEAR(low); 570 CCCR_CLEAR(low);
@@ -576,14 +574,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
576 574
577 /* clear all escrs (including those outside our concern) */ 575 /* clear all escrs (including those outside our concern) */
578 for (i = num_counters; i < num_controls; i++) { 576 for (i = num_counters; i < num_controls; i++) {
579 if (unlikely(!CTRL_IS_RESERVED(msrs,i))) 577 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
580 continue; 578 continue;
581 wrmsr(msrs->controls[i].addr, 0, 0); 579 wrmsr(msrs->controls[i].addr, 0, 0);
582 } 580 }
583 581
584 /* setup all counters */ 582 /* setup all counters */
585 for (i = 0 ; i < num_counters ; ++i) { 583 for (i = 0 ; i < num_counters ; ++i) {
586 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) { 584 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) {
587 reset_value[i] = counter_config[i].count; 585 reset_value[i] = counter_config[i].count;
588 pmc_setup_one_p4_counter(i); 586 pmc_setup_one_p4_counter(i);
589 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); 587 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i));
@@ -603,11 +601,11 @@ static int p4_check_ctrs(struct pt_regs * const regs,
603 stag = get_stagger(); 601 stag = get_stagger();
604 602
605 for (i = 0; i < num_counters; ++i) { 603 for (i = 0; i < num_counters; ++i) {
606 604
607 if (!reset_value[i]) 605 if (!reset_value[i])
608 continue; 606 continue;
609 607
610 /* 608 /*
611 * there is some eccentricity in the hardware which 609 * there is some eccentricity in the hardware which
612 * requires that we perform 2 extra corrections: 610 * requires that we perform 2 extra corrections:
613 * 611 *
@@ -616,24 +614,24 @@ static int p4_check_ctrs(struct pt_regs * const regs,
616 * 614 *
617 * - write the counter back twice to ensure it gets 615 * - write the counter back twice to ensure it gets
618 * updated properly. 616 * updated properly.
619 * 617 *
620 * the former seems to be related to extra NMIs happening 618 * the former seems to be related to extra NMIs happening
621 * during the current NMI; the latter is reported as errata 619 * during the current NMI; the latter is reported as errata
622 * N15 in intel doc 249199-029, pentium 4 specification 620 * N15 in intel doc 249199-029, pentium 4 specification
623 * update, though their suggested work-around does not 621 * update, though their suggested work-around does not
624 * appear to solve the problem. 622 * appear to solve the problem.
625 */ 623 */
626 624
627 real = VIRT_CTR(stag, i); 625 real = VIRT_CTR(stag, i);
628 626
629 CCCR_READ(low, high, real); 627 CCCR_READ(low, high, real);
630 CTR_READ(ctr, high, real); 628 CTR_READ(ctr, high, real);
631 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { 629 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) {
632 oprofile_add_sample(regs, i); 630 oprofile_add_sample(regs, i);
633 CTR_WRITE(reset_value[i], real); 631 CTR_WRITE(reset_value[i], real);
634 CCCR_CLEAR_OVF(low); 632 CCCR_CLEAR_OVF(low);
635 CCCR_WRITE(low, high, real); 633 CCCR_WRITE(low, high, real);
636 CTR_WRITE(reset_value[i], real); 634 CTR_WRITE(reset_value[i], real);
637 } 635 }
638 } 636 }
639 637
@@ -683,15 +681,16 @@ static void p4_shutdown(struct op_msrs const * const msrs)
683 int i; 681 int i;
684 682
685 for (i = 0 ; i < num_counters ; ++i) { 683 for (i = 0 ; i < num_counters ; ++i) {
686 if (CTR_IS_RESERVED(msrs,i)) 684 if (CTR_IS_RESERVED(msrs, i))
687 release_perfctr_nmi(msrs->counters[i].addr); 685 release_perfctr_nmi(msrs->counters[i].addr);
688 } 686 }
689 /* some of the control registers are specially reserved in 687 /*
688 * some of the control registers are specially reserved in
690 * conjunction with the counter registers (hence the starting offset). 689 * conjunction with the counter registers (hence the starting offset).
691 * This saves a few bits. 690 * This saves a few bits.
692 */ 691 */
693 for (i = num_counters ; i < num_controls ; ++i) { 692 for (i = num_counters ; i < num_controls ; ++i) {
694 if (CTRL_IS_RESERVED(msrs,i)) 693 if (CTRL_IS_RESERVED(msrs, i))
695 release_evntsel_nmi(msrs->controls[i].addr); 694 release_evntsel_nmi(msrs->controls[i].addr);
696 } 695 }
697} 696}
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 6a0fca78c362..22e057665e55 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -580,7 +580,7 @@ static int __cpuinit amd_cpu_notify(struct notifier_block *self,
580 unsigned long action, void *hcpu) 580 unsigned long action, void *hcpu)
581{ 581{
582 int cpu = (long)hcpu; 582 int cpu = (long)hcpu;
583 switch(action) { 583 switch (action) {
584 case CPU_ONLINE: 584 case CPU_ONLINE:
585 case CPU_ONLINE_FROZEN: 585 case CPU_ONLINE_FROZEN:
586 smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0); 586 smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0);
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 8e077185e185..006599db0dc7 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1043,35 +1043,44 @@ static void __init pcibios_fixup_irqs(void)
1043 if (io_apic_assign_pci_irqs) { 1043 if (io_apic_assign_pci_irqs) {
1044 int irq; 1044 int irq;
1045 1045
1046 if (pin) { 1046 if (!pin)
1047 /* 1047 continue;
1048 * interrupt pins are numbered starting 1048
1049 * from 1 1049 /*
1050 */ 1050 * interrupt pins are numbered starting from 1
1051 pin--; 1051 */
1052 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, 1052 pin--;
1053 PCI_SLOT(dev->devfn), pin); 1053 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
1054 /* 1054 PCI_SLOT(dev->devfn), pin);
1055 * Busses behind bridges are typically not listed in the MP-table. 1055 /*
1056 * In this case we have to look up the IRQ based on the parent bus, 1056 * Busses behind bridges are typically not listed in the
1057 * parent slot, and pin number. The SMP code detects such bridged 1057 * MP-table. In this case we have to look up the IRQ
1058 * busses itself so we should get into this branch reliably. 1058 * based on the parent bus, parent slot, and pin number.
1059 */ 1059 * The SMP code detects such bridged busses itself so we
1060 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ 1060 * should get into this branch reliably.
1061 struct pci_dev *bridge = dev->bus->self; 1061 */
1062 1062 if (irq < 0 && dev->bus->parent) {
1063 pin = (pin + PCI_SLOT(dev->devfn)) % 4; 1063 /* go back to the bridge */
1064 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1064 struct pci_dev *bridge = dev->bus->self;
1065 PCI_SLOT(bridge->devfn), pin); 1065 int bus;
1066 if (irq >= 0) 1066
1067 dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n", 1067 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
1068 pci_name(bridge), 1068 bus = bridge->bus->number;
1069 'A' + pin, irq); 1069 irq = IO_APIC_get_PCI_irq_vector(bus,
1070 } 1070 PCI_SLOT(bridge->devfn), pin);
1071 if (irq >= 0) { 1071 if (irq >= 0)
1072 dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq); 1072 dev_warn(&dev->dev,
1073 dev->irq = irq; 1073 "using bridge %s INT %c to "
1074 } 1074 "get IRQ %d\n",
1075 pci_name(bridge),
1076 'A' + pin, irq);
1077 }
1078 if (irq >= 0) {
1079 dev_info(&dev->dev,
1080 "PCI->APIC IRQ transform: INT %c "
1081 "-> IRQ %d\n",
1082 'A' + pin, irq);
1083 dev->irq = irq;
1075 } 1084 }
1076 } 1085 }
1077#endif 1086#endif
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index 4fc7e872c85e..d1e9b53f9d33 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -1,5 +1,3 @@
1.text
2
3/* 1/*
4 * This may not use any stack, nor any variable that is not "NoSave": 2 * This may not use any stack, nor any variable that is not "NoSave":
5 * 3 *
@@ -12,17 +10,18 @@
12#include <asm/segment.h> 10#include <asm/segment.h>
13#include <asm/page.h> 11#include <asm/page.h>
14#include <asm/asm-offsets.h> 12#include <asm/asm-offsets.h>
13#include <asm/processor-flags.h>
15 14
16 .text 15.text
17 16
18ENTRY(swsusp_arch_suspend) 17ENTRY(swsusp_arch_suspend)
19
20 movl %esp, saved_context_esp 18 movl %esp, saved_context_esp
21 movl %ebx, saved_context_ebx 19 movl %ebx, saved_context_ebx
22 movl %ebp, saved_context_ebp 20 movl %ebp, saved_context_ebp
23 movl %esi, saved_context_esi 21 movl %esi, saved_context_esi
24 movl %edi, saved_context_edi 22 movl %edi, saved_context_edi
25 pushfl ; popl saved_context_eflags 23 pushfl
24 popl saved_context_eflags
26 25
27 call swsusp_save 26 call swsusp_save
28 ret 27 ret
@@ -59,7 +58,7 @@ done:
59 movl mmu_cr4_features, %ecx 58 movl mmu_cr4_features, %ecx
60 jecxz 1f # cr4 Pentium and higher, skip if zero 59 jecxz 1f # cr4 Pentium and higher, skip if zero
61 movl %ecx, %edx 60 movl %ecx, %edx
62 andl $~(1<<7), %edx; # PGE 61 andl $~(X86_CR4_PGE), %edx
63 movl %edx, %cr4; # turn off PGE 62 movl %edx, %cr4; # turn off PGE
641: 631:
65 movl %cr3, %eax; # flush TLB 64 movl %cr3, %eax; # flush TLB
@@ -74,7 +73,8 @@ done:
74 movl saved_context_esi, %esi 73 movl saved_context_esi, %esi
75 movl saved_context_edi, %edi 74 movl saved_context_edi, %edi
76 75
77 pushl saved_context_eflags ; popfl 76 pushl saved_context_eflags
77 popfl
78 78
79 xorl %eax, %eax 79 xorl %eax, %eax
80 80
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 8d28925ebed9..a27d562a9744 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -844,7 +844,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
844 844
845/* Early in boot, while setting up the initial pagetable, assume 845/* Early in boot, while setting up the initial pagetable, assume
846 everything is pinned. */ 846 everything is pinned. */
847static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 847static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
848{ 848{
849#ifdef CONFIG_FLATMEM 849#ifdef CONFIG_FLATMEM
850 BUG_ON(mem_map); /* should only be used early */ 850 BUG_ON(mem_map); /* should only be used early */
@@ -854,7 +854,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
854 854
855/* Early release_pte assumes that all pts are pinned, since there's 855/* Early release_pte assumes that all pts are pinned, since there's
856 only init_mm and anything attached to that is pinned. */ 856 only init_mm and anything attached to that is pinned. */
857static void xen_release_pte_init(u32 pfn) 857static void xen_release_pte_init(unsigned long pfn)
858{ 858{
859 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 859 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
860} 860}
@@ -870,7 +870,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
870 870
871/* This needs to make sure the new pte page is pinned iff its being 871/* This needs to make sure the new pte page is pinned iff its being
872 attached to a pinned pagetable. */ 872 attached to a pinned pagetable. */
873static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) 873static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
874{ 874{
875 struct page *page = pfn_to_page(pfn); 875 struct page *page = pfn_to_page(pfn);
876 876
@@ -888,12 +888,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
888 } 888 }
889} 889}
890 890
891static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) 891static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
892{ 892{
893 xen_alloc_ptpage(mm, pfn, PT_PTE); 893 xen_alloc_ptpage(mm, pfn, PT_PTE);
894} 894}
895 895
896static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) 896static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
897{ 897{
898 xen_alloc_ptpage(mm, pfn, PT_PMD); 898 xen_alloc_ptpage(mm, pfn, PT_PMD);
899} 899}
@@ -941,7 +941,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
941} 941}
942 942
943/* This should never happen until we're OK to use struct page */ 943/* This should never happen until we're OK to use struct page */
944static void xen_release_ptpage(u32 pfn, unsigned level) 944static void xen_release_ptpage(unsigned long pfn, unsigned level)
945{ 945{
946 struct page *page = pfn_to_page(pfn); 946 struct page *page = pfn_to_page(pfn);
947 947
@@ -955,23 +955,23 @@ static void xen_release_ptpage(u32 pfn, unsigned level)
955 } 955 }
956} 956}
957 957
958static void xen_release_pte(u32 pfn) 958static void xen_release_pte(unsigned long pfn)
959{ 959{
960 xen_release_ptpage(pfn, PT_PTE); 960 xen_release_ptpage(pfn, PT_PTE);
961} 961}
962 962
963static void xen_release_pmd(u32 pfn) 963static void xen_release_pmd(unsigned long pfn)
964{ 964{
965 xen_release_ptpage(pfn, PT_PMD); 965 xen_release_ptpage(pfn, PT_PMD);
966} 966}
967 967
968#if PAGETABLE_LEVELS == 4 968#if PAGETABLE_LEVELS == 4
969static void xen_alloc_pud(struct mm_struct *mm, u32 pfn) 969static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
970{ 970{
971 xen_alloc_ptpage(mm, pfn, PT_PUD); 971 xen_alloc_ptpage(mm, pfn, PT_PUD);
972} 972}
973 973
974static void xen_release_pud(u32 pfn) 974static void xen_release_pud(unsigned long pfn)
975{ 975{
976 xen_release_ptpage(pfn, PT_PUD); 976 xen_release_ptpage(pfn, PT_PUD);
977} 977}
@@ -1354,7 +1354,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1354 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 1354 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1355 1355
1356 .pte_val = xen_pte_val, 1356 .pte_val = xen_pte_val,
1357 .pte_flags = native_pte_val, 1357 .pte_flags = native_pte_flags,
1358 .pgd_val = xen_pgd_val, 1358 .pgd_val = xen_pgd_val,
1359 1359
1360 .make_pte = xen_make_pte, 1360 .make_pte = xen_make_pte,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b6acc3a0af46..d67901083888 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -42,7 +42,7 @@ char * __init xen_memory_setup(void)
42 42
43 e820.nr_map = 0; 43 e820.nr_map = 0;
44 44
45 e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM); 45 e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
46 46
47 /* 47 /*
48 * Even though this is normal, usable memory under Xen, reserve 48 * Even though this is normal, usable memory under Xen, reserve