aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorMike Travis <travis@sgi.com>2008-12-31 20:34:16 -0500
committerIngo Molnar <mingo@elte.hu>2009-01-03 12:53:31 -0500
commit7eb19553369c46cc1fa64caf120cbcab1b597f7c (patch)
treeef1a3beae706b9497c845d0a2557ceb4d2754998 /arch/x86
parent6092848a2a23b660150a38bc06f59d75838d70c8 (diff)
parent8c384cdee3e04d6194a2c2b192b624754f990835 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-cpumask into merge-rr-cpumask
Conflicts: arch/x86/kernel/io_apic.c kernel/rcuclassic.c kernel/sched.c kernel/time/tick-sched.c Signed-off-by: Mike Travis <travis@sgi.com> [ mingo@elte.hu: backmerged typo fix for io_apic.c ] Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig88
-rw-r--r--arch/x86/Kconfig.debug20
-rw-r--r--arch/x86/boot/video-vga.c4
-rw-r--r--arch/x86/boot/video.c2
-rw-r--r--arch/x86/configs/i386_defconfig4
-rw-r--r--arch/x86/configs/x86_64_defconfig4
-rw-r--r--arch/x86/crypto/crc32c-intel.c121
-rw-r--r--arch/x86/ia32/ia32_aout.c2
-rw-r--r--arch/x86/ia32/ia32_signal.c109
-rw-r--r--arch/x86/include/asm/bitops.h10
-rw-r--r--arch/x86/include/asm/bug.h2
-rw-r--r--arch/x86/include/asm/byteorder.h74
-rw-r--r--arch/x86/include/asm/cpufeature.h5
-rw-r--r--arch/x86/include/asm/dma-mapping.h4
-rw-r--r--arch/x86/include/asm/ds.h248
-rw-r--r--arch/x86/include/asm/dwarf2.h97
-rw-r--r--arch/x86/include/asm/elf.h2
-rw-r--r--arch/x86/include/asm/ftrace.h29
-rw-r--r--arch/x86/include/asm/gart.h33
-rw-r--r--arch/x86/include/asm/hardirq_32.h2
-rw-r--r--arch/x86/include/asm/hardirq_64.h2
-rw-r--r--arch/x86/include/asm/hw_irq.h4
-rw-r--r--arch/x86/include/asm/hypervisor.h26
-rw-r--r--arch/x86/include/asm/ia32.h18
-rw-r--r--arch/x86/include/asm/idle.h5
-rw-r--r--arch/x86/include/asm/io.h37
-rw-r--r--arch/x86/include/asm/io_64.h2
-rw-r--r--arch/x86/include/asm/io_apic.h10
-rw-r--r--arch/x86/include/asm/iommu.h35
-rw-r--r--arch/x86/include/asm/irq.h4
-rw-r--r--arch/x86/include/asm/irq_regs_32.h2
-rw-r--r--arch/x86/include/asm/linkage.h60
-rw-r--r--arch/x86/include/asm/mmu_context_32.h13
-rw-r--r--arch/x86/include/asm/msr-index.h2
-rw-r--r--arch/x86/include/asm/msr.h15
-rw-r--r--arch/x86/include/asm/pci.h14
-rw-r--r--arch/x86/include/asm/pci_64.h1
-rw-r--r--arch/x86/include/asm/pgtable-2level.h50
-rw-r--r--arch/x86/include/asm/pgtable-3level.h1
-rw-r--r--arch/x86/include/asm/pgtable.h28
-rw-r--r--arch/x86/include/asm/pgtable_32.h9
-rw-r--r--arch/x86/include/asm/pgtable_64.h28
-rw-r--r--arch/x86/include/asm/prctl.h3
-rw-r--r--arch/x86/include/asm/processor.h17
-rw-r--r--arch/x86/include/asm/ptrace.h43
-rw-r--r--arch/x86/include/asm/setup.h4
-rw-r--r--arch/x86/include/asm/sigframe.h70
-rw-r--r--arch/x86/include/asm/signal.h6
-rw-r--r--arch/x86/include/asm/sparsemem.h2
-rw-r--r--arch/x86/include/asm/syscalls.h16
-rw-r--r--arch/x86/include/asm/system.h4
-rw-r--r--arch/x86/include/asm/thread_info.h7
-rw-r--r--arch/x86/include/asm/topology.h36
-rw-r--r--arch/x86/include/asm/trampoline.h7
-rw-r--r--arch/x86/include/asm/traps.h11
-rw-r--r--arch/x86/include/asm/tsc.h8
-rw-r--r--arch/x86/include/asm/uaccess.h4
-rw-r--r--arch/x86/include/asm/uv/bios.h34
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h103
-rw-r--r--arch/x86/include/asm/vmware.h27
-rw-r--r--arch/x86/include/asm/xen/hypercall.h6
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h39
-rw-r--r--arch/x86/include/asm/xen/page.h5
-rw-r--r--arch/x86/kernel/Makefile8
-rw-r--r--arch/x86/kernel/amd_iommu.c6
-rw-r--r--arch/x86/kernel/amd_iommu_init.c8
-rw-r--r--arch/x86/kernel/aperture_64.c5
-rw-r--r--arch/x86/kernel/apic.c22
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c4
-rw-r--r--arch/x86/kernel/bios_uv.c58
-rw-r--r--arch/x86/kernel/check.c161
-rw-r--r--arch/x86/kernel/cpu/Makefile6
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c8
-rw-r--r--arch/x86/kernel/cpu/amd.c9
-rw-r--r--arch/x86/kernel/cpu/common.c8
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c58
-rw-r--r--arch/x86/kernel/cpu/intel.c22
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c17
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c346
-rw-r--r--arch/x86/kernel/cpu/vmware.c112
-rw-r--r--arch/x86/kernel/ds.c899
-rw-r--r--arch/x86/kernel/e820.c16
-rw-r--r--arch/x86/kernel/early-quirks.c1
-rw-r--r--arch/x86/kernel/early_printk.c47
-rw-r--r--arch/x86/kernel/entry_32.S477
-rw-r--r--arch/x86/kernel/entry_64.S1417
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c111
-rw-r--r--arch/x86/kernel/head.c1
-rw-r--r--arch/x86/kernel/head32.c3
-rw-r--r--arch/x86/kernel/head64.c3
-rw-r--r--arch/x86/kernel/hpet.c11
-rw-r--r--arch/x86/kernel/init_task.c1
-rw-r--r--arch/x86/kernel/io_apic.c5
-rw-r--r--arch/x86/kernel/irq_64.c27
-rw-r--r--arch/x86/kernel/irqinit_32.c2
-rw-r--r--arch/x86/kernel/irqinit_64.c66
-rw-r--r--arch/x86/kernel/microcode_amd.c232
-rw-r--r--arch/x86/kernel/microcode_core.c25
-rw-r--r--arch/x86/kernel/microcode_intel.c8
-rw-r--r--arch/x86/kernel/nmi.c15
-rw-r--r--arch/x86/kernel/pci-dma.c24
-rw-r--r--arch/x86/kernel/pci-gart_64.c4
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c29
-rw-r--r--arch/x86/kernel/process.c3
-rw-r--r--arch/x86/kernel/process_32.c63
-rw-r--r--arch/x86/kernel/process_64.c54
-rw-r--r--arch/x86/kernel/ptrace.c464
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/setup.c162
-rw-r--r--arch/x86/kernel/setup_percpu.c8
-rw-r--r--arch/x86/kernel/sigframe.h42
-rw-r--r--arch/x86/kernel/signal.c (renamed from arch/x86/kernel/signal_32.c)567
-rw-r--r--arch/x86/kernel/signal_64.c516
-rw-r--r--arch/x86/kernel/smp.c18
-rw-r--r--arch/x86/kernel/smpboot.c19
-rw-r--r--arch/x86/kernel/time_32.c2
-rw-r--r--arch/x86/kernel/time_64.c6
-rw-r--r--arch/x86/kernel/tlb_32.c13
-rw-r--r--arch/x86/kernel/tlb_64.c2
-rw-r--r--arch/x86/kernel/tlb_uv.c4
-rw-r--r--arch/x86/kernel/trampoline.c19
-rw-r--r--arch/x86/kernel/traps.c38
-rw-r--r--arch/x86/kernel/tsc.c42
-rw-r--r--arch/x86/kernel/tsc_sync.c8
-rw-r--r--arch/x86/kernel/vmi_32.c119
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S1
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S1
-rw-r--r--arch/x86/kernel/vsyscall_64.c9
-rw-r--r--arch/x86/lguest/boot.c3
-rw-r--r--arch/x86/lguest/i386_head.S15
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/x86/mm/init_32.c33
-rw-r--r--arch/x86/mm/init_64.c2
-rw-r--r--arch/x86/mm/ioremap.c3
-rw-r--r--arch/x86/mm/pat.c236
-rw-r--r--arch/x86/oprofile/op_model_amd.c89
-rw-r--r--arch/x86/pci/common.c17
-rw-r--r--arch/x86/scripts/strip-symbols1
-rw-r--r--arch/x86/vdso/vdso32-setup.c2
-rw-r--r--arch/x86/vdso/vma.c2
-rw-r--r--arch/x86/xen/enlighten.c17
-rw-r--r--arch/x86/xen/mmu.c17
-rw-r--r--arch/x86/xen/multicalls.c2
-rw-r--r--arch/x86/xen/setup.c9
148 files changed, 4560 insertions, 4041 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0ca2eb7573cd..249d1e0824b5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -19,6 +19,8 @@ config X86_64
19config X86 19config X86
20 def_bool y 20 def_bool y
21 select HAVE_AOUT if X86_32 21 select HAVE_AOUT if X86_32
22 select HAVE_READQ
23 select HAVE_WRITEQ
22 select HAVE_UNSTABLE_SCHED_CLOCK 24 select HAVE_UNSTABLE_SCHED_CLOCK
23 select HAVE_IDE 25 select HAVE_IDE
24 select HAVE_OPROFILE 26 select HAVE_OPROFILE
@@ -90,6 +92,10 @@ config GENERIC_IOMAP
90config GENERIC_BUG 92config GENERIC_BUG
91 def_bool y 93 def_bool y
92 depends on BUG 94 depends on BUG
95 select GENERIC_BUG_RELATIVE_POINTERS if X86_64
96
97config GENERIC_BUG_RELATIVE_POINTERS
98 bool
93 99
94config GENERIC_HWEIGHT 100config GENERIC_HWEIGHT
95 def_bool y 101 def_bool y
@@ -244,16 +250,19 @@ config X86_HAS_BOOT_CPU_ID
244config SPARSE_IRQ 250config SPARSE_IRQ
245 bool "Support sparse irq numbering" 251 bool "Support sparse irq numbering"
246 depends on PCI_MSI || HT_IRQ 252 depends on PCI_MSI || HT_IRQ
247 default y
248 help 253 help
249 This enables support for sparse irq, esp for msi/msi-x. You may need 254 This enables support for sparse irqs. This is useful for distro
250 if you have lots of cards supports msi-x installed. 255 kernels that want to define a high CONFIG_NR_CPUS value but still
256 want to have low kernel memory footprint on smaller machines.
251 257
252 If you don't know what to do here, say Y. 258 ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
259 out the irq_desc[] array in a more NUMA-friendly way. )
260
261 If you don't know what to do here, say N.
253 262
254config NUMA_MIGRATE_IRQ_DESC 263config NUMA_MIGRATE_IRQ_DESC
255 bool "Move irq desc when changing irq smp_affinity" 264 bool "Move irq desc when changing irq smp_affinity"
256 depends on SPARSE_IRQ && SMP 265 depends on SPARSE_IRQ && NUMA
257 default n 266 default n
258 help 267 help
259 This enables moving irq_desc to cpu/node that irq will use handled. 268 This enables moving irq_desc to cpu/node that irq will use handled.
@@ -264,21 +273,13 @@ config X86_FIND_SMP_CONFIG
264 def_bool y 273 def_bool y
265 depends on X86_MPPARSE || X86_VOYAGER 274 depends on X86_MPPARSE || X86_VOYAGER
266 275
267if ACPI
268config X86_MPPARSE 276config X86_MPPARSE
269 def_bool y 277 bool "Enable MPS table" if ACPI
270 bool "Enable MPS table" 278 default y
271 depends on X86_LOCAL_APIC 279 depends on X86_LOCAL_APIC
272 help 280 help
273 For old smp systems that do not have proper acpi support. Newer systems 281 For old smp systems that do not have proper acpi support. Newer systems
274 (esp with 64bit cpus) with acpi support, MADT and DSDT will override it 282 (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
275endif
276
277if !ACPI
278config X86_MPPARSE
279 def_bool y
280 depends on X86_LOCAL_APIC
281endif
282 283
283choice 284choice
284 prompt "Subarchitecture Type" 285 prompt "Subarchitecture Type"
@@ -500,7 +501,7 @@ config HPET_TIMER
500 The HPET provides a stable time base on SMP 501 The HPET provides a stable time base on SMP
501 systems, unlike the TSC, but it is more expensive to access, 502 systems, unlike the TSC, but it is more expensive to access,
502 as it is off-chip. You can find the HPET spec at 503 as it is off-chip. You can find the HPET spec at
503 <http://www.intel.com/hardwaredesign/hpetspec.htm>. 504 <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
504 505
505 You can safely choose Y here. However, HPET will only be 506 You can safely choose Y here. However, HPET will only be
506 activated if the platform and the BIOS support this feature. 507 activated if the platform and the BIOS support this feature.
@@ -587,7 +588,7 @@ config AMD_IOMMU
587 588
588# need this always selected by IOMMU for the VIA workaround 589# need this always selected by IOMMU for the VIA workaround
589config SWIOTLB 590config SWIOTLB
590 bool 591 def_bool y if X86_64
591 help 592 help
592 Support for software bounce buffers used on x86-64 systems 593 Support for software bounce buffers used on x86-64 systems
593 which don't have a hardware IOMMU (e.g. the current generation 594 which don't have a hardware IOMMU (e.g. the current generation
@@ -679,6 +680,30 @@ config X86_VISWS_APIC
679 def_bool y 680 def_bool y
680 depends on X86_32 && X86_VISWS 681 depends on X86_32 && X86_VISWS
681 682
683config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
684 bool "Reroute for broken boot IRQs"
685 default n
686 depends on X86_IO_APIC
687 help
688 This option enables a workaround that fixes a source of
689 spurious interrupts. This is recommended when threaded
690 interrupt handling is used on systems where the generation of
691 superfluous "boot interrupts" cannot be disabled.
692
693 Some chipsets generate a legacy INTx "boot IRQ" when the IRQ
694 entry in the chipset's IO-APIC is masked (as, e.g. the RT
695 kernel does during interrupt handling). On chipsets where this
696 boot IRQ generation cannot be disabled, this workaround keeps
697 the original IRQ line masked so that only the equivalent "boot
698 IRQ" is delivered to the CPUs. The workaround also tells the
699 kernel to set up the IRQ handler on the boot IRQ line. In this
700 way only one interrupt is delivered to the kernel. Otherwise
701 the spurious second interrupt may cause the kernel to bring
702 down (vital) interrupt lines.
703
704 Only affects "broken" chipsets. Interrupt sharing may be
705 increased on these systems.
706
682config X86_MCE 707config X86_MCE
683 bool "Machine Check Exception" 708 bool "Machine Check Exception"
684 depends on !X86_VOYAGER 709 depends on !X86_VOYAGER
@@ -975,24 +1000,37 @@ config X86_PAE
975config ARCH_PHYS_ADDR_T_64BIT 1000config ARCH_PHYS_ADDR_T_64BIT
976 def_bool X86_64 || X86_PAE 1001 def_bool X86_64 || X86_PAE
977 1002
1003config DIRECT_GBPAGES
1004 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
1005 default y
1006 depends on X86_64
1007 help
1008 Allow the kernel linear mapping to use 1GB pages on CPUs that
1009 support it. This can improve the kernel's performance a tiny bit by
1010 reducing TLB pressure. If in doubt, say "Y".
1011
978# Common NUMA Features 1012# Common NUMA Features
979config NUMA 1013config NUMA
980 bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" 1014 bool "Numa Memory Allocation and Scheduler Support"
981 depends on SMP 1015 depends on SMP
982 depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) 1016 depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
983 default n if X86_PC 1017 default n if X86_PC
984 default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) 1018 default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
985 help 1019 help
986 Enable NUMA (Non Uniform Memory Access) support. 1020 Enable NUMA (Non Uniform Memory Access) support.
1021
987 The kernel will try to allocate memory used by a CPU on the 1022 The kernel will try to allocate memory used by a CPU on the
988 local memory controller of the CPU and add some more 1023 local memory controller of the CPU and add some more
989 NUMA awareness to the kernel. 1024 NUMA awareness to the kernel.
990 1025
991 For 32-bit this is currently highly experimental and should be only 1026 For 64-bit this is recommended if the system is Intel Core i7
992 used for kernel development. It might also cause boot failures. 1027 (or later), AMD Opteron, or EM64T NUMA.
993 For 64-bit this is recommended on all multiprocessor Opteron systems. 1028
994 If the system is EM64T, you should say N unless your system is 1029 For 32-bit this is only needed on (rare) 32-bit-only platforms
995 EM64T NUMA. 1030 that support NUMA topologies, such as NUMAQ / Summit, or if you
1031 boot a 32-bit kernel on a 64-bit NUMA platform.
1032
1033 Otherwise, you should say N.
996 1034
997comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" 1035comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
998 depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) 1036 depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
@@ -1512,6 +1550,10 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
1512 def_bool y 1550 def_bool y
1513 depends on X86_64 || (X86_32 && HIGHMEM) 1551 depends on X86_64 || (X86_32 && HIGHMEM)
1514 1552
1553config ARCH_ENABLE_MEMORY_HOTREMOVE
1554 def_bool y
1555 depends on MEMORY_HOTPLUG
1556
1515config HAVE_ARCH_EARLY_PFN_TO_NID 1557config HAVE_ARCH_EARLY_PFN_TO_NID
1516 def_bool X86_64 1558 def_bool X86_64
1517 depends on NUMA 1559 depends on NUMA
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index fa013f529b74..10d6cc3fd052 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -114,18 +114,6 @@ config DEBUG_RODATA
114 data. This is recommended so that we can catch kernel bugs sooner. 114 data. This is recommended so that we can catch kernel bugs sooner.
115 If in doubt, say "Y". 115 If in doubt, say "Y".
116 116
117config DIRECT_GBPAGES
118 bool "Enable gbpages-mapped kernel pagetables"
119 depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64
120 help
121 Enable gigabyte pages support (if the CPU supports it). This can
122 improve the kernel's performance a tiny bit by reducing TLB
123 pressure.
124
125 This is experimental code.
126
127 If in doubt, say "N".
128
129config DEBUG_RODATA_TEST 117config DEBUG_RODATA_TEST
130 bool "Testcase for the DEBUG_RODATA feature" 118 bool "Testcase for the DEBUG_RODATA feature"
131 depends on DEBUG_RODATA 119 depends on DEBUG_RODATA
@@ -303,10 +291,10 @@ config OPTIMIZE_INLINING
303 developers have marked 'inline'. Doing so takes away freedom from gcc to 291 developers have marked 'inline'. Doing so takes away freedom from gcc to
304 do what it thinks is best, which is desirable for the gcc 3.x series of 292 do what it thinks is best, which is desirable for the gcc 3.x series of
305 compilers. The gcc 4.x series have a rewritten inlining algorithm and 293 compilers. The gcc 4.x series have a rewritten inlining algorithm and
306 disabling this option will generate a smaller kernel there. Hopefully 294 enabling this option will generate a smaller kernel there. Hopefully
307 this algorithm is so good that allowing gcc4 to make the decision can 295 this algorithm is so good that allowing gcc 4.x and above to make the
308 become the default in the future, until then this option is there to 296 decision will become the default in the future. Until then this option
309 test gcc for this. 297 is there to test gcc for this.
310 298
311 If unsure, say N. 299 If unsure, say N.
312 300
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index b939cb476dec..5d4742ed4aa2 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -34,7 +34,7 @@ static struct mode_info cga_modes[] = {
34 { VIDEO_80x25, 80, 25, 0 }, 34 { VIDEO_80x25, 80, 25, 0 },
35}; 35};
36 36
37__videocard video_vga; 37static __videocard video_vga;
38 38
39/* Set basic 80x25 mode */ 39/* Set basic 80x25 mode */
40static u8 vga_set_basic_mode(void) 40static u8 vga_set_basic_mode(void)
@@ -259,7 +259,7 @@ static int vga_probe(void)
259 return mode_count[adapter]; 259 return mode_count[adapter];
260} 260}
261 261
262__videocard video_vga = { 262static __videocard video_vga = {
263 .card_name = "VGA", 263 .card_name = "VGA",
264 .probe = vga_probe, 264 .probe = vga_probe,
265 .set_mode = vga_set_mode, 265 .set_mode = vga_set_mode,
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 83598b23093a..3bef2c1febe9 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -226,7 +226,7 @@ static unsigned int mode_menu(void)
226 226
227#ifdef CONFIG_VIDEO_RETAIN 227#ifdef CONFIG_VIDEO_RETAIN
228/* Save screen content to the heap */ 228/* Save screen content to the heap */
229struct saved_screen { 229static struct saved_screen {
230 int x, y; 230 int x, y;
231 int curx, cury; 231 int curx, cury;
232 u16 *data; 232 u16 *data;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 13b8c86ae985..b30a08ed8eb4 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -77,7 +77,7 @@ CONFIG_AUDIT=y
77CONFIG_AUDITSYSCALL=y 77CONFIG_AUDITSYSCALL=y
78CONFIG_AUDIT_TREE=y 78CONFIG_AUDIT_TREE=y
79# CONFIG_IKCONFIG is not set 79# CONFIG_IKCONFIG is not set
80CONFIG_LOG_BUF_SHIFT=17 80CONFIG_LOG_BUF_SHIFT=18
81CONFIG_CGROUPS=y 81CONFIG_CGROUPS=y
82# CONFIG_CGROUP_DEBUG is not set 82# CONFIG_CGROUP_DEBUG is not set
83CONFIG_CGROUP_NS=y 83CONFIG_CGROUP_NS=y
@@ -298,7 +298,7 @@ CONFIG_KEXEC=y
298CONFIG_CRASH_DUMP=y 298CONFIG_CRASH_DUMP=y
299# CONFIG_KEXEC_JUMP is not set 299# CONFIG_KEXEC_JUMP is not set
300CONFIG_PHYSICAL_START=0x1000000 300CONFIG_PHYSICAL_START=0x1000000
301CONFIG_RELOCATABLE=y 301# CONFIG_RELOCATABLE is not set
302CONFIG_PHYSICAL_ALIGN=0x200000 302CONFIG_PHYSICAL_ALIGN=0x200000
303CONFIG_HOTPLUG_CPU=y 303CONFIG_HOTPLUG_CPU=y
304# CONFIG_COMPAT_VDSO is not set 304# CONFIG_COMPAT_VDSO is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index f0a03d7a7d63..0e7dbc0a3e46 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -77,7 +77,7 @@ CONFIG_AUDIT=y
77CONFIG_AUDITSYSCALL=y 77CONFIG_AUDITSYSCALL=y
78CONFIG_AUDIT_TREE=y 78CONFIG_AUDIT_TREE=y
79# CONFIG_IKCONFIG is not set 79# CONFIG_IKCONFIG is not set
80CONFIG_LOG_BUF_SHIFT=17 80CONFIG_LOG_BUF_SHIFT=18
81CONFIG_CGROUPS=y 81CONFIG_CGROUPS=y
82# CONFIG_CGROUP_DEBUG is not set 82# CONFIG_CGROUP_DEBUG is not set
83CONFIG_CGROUP_NS=y 83CONFIG_CGROUP_NS=y
@@ -298,7 +298,7 @@ CONFIG_SCHED_HRTICK=y
298CONFIG_KEXEC=y 298CONFIG_KEXEC=y
299CONFIG_CRASH_DUMP=y 299CONFIG_CRASH_DUMP=y
300CONFIG_PHYSICAL_START=0x1000000 300CONFIG_PHYSICAL_START=0x1000000
301CONFIG_RELOCATABLE=y 301# CONFIG_RELOCATABLE is not set
302CONFIG_PHYSICAL_ALIGN=0x200000 302CONFIG_PHYSICAL_ALIGN=0x200000
303CONFIG_HOTPLUG_CPU=y 303CONFIG_HOTPLUG_CPU=y
304# CONFIG_COMPAT_VDSO is not set 304# CONFIG_COMPAT_VDSO is not set
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c
index 070afc5b6c94..b9d00261703c 100644
--- a/arch/x86/crypto/crc32c-intel.c
+++ b/arch/x86/crypto/crc32c-intel.c
@@ -6,13 +6,22 @@
6 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual 6 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
7 * Volume 2A: Instruction Set Reference, A-M 7 * Volume 2A: Instruction Set Reference, A-M
8 * 8 *
9 * Copyright (c) 2008 Austin Zhang <austin_zhang@linux.intel.com> 9 * Copyright (C) 2008 Intel Corporation
10 * Copyright (c) 2008 Kent Liu <kent.liu@intel.com> 10 * Authors: Austin Zhang <austin_zhang@linux.intel.com>
11 * Kent Liu <kent.liu@intel.com>
11 * 12 *
12 * This program is free software; you can redistribute it and/or modify it 13 * This program is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by the Free 14 * under the terms and conditions of the GNU General Public License,
14 * Software Foundation; either version 2 of the License, or (at your option) 15 * version 2, as published by the Free Software Foundation.
15 * any later version. 16 *
17 * This program is distributed in the hope it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 * more details.
21 *
22 * You should have received a copy of the GNU General Public License along with
23 * this program; if not, write to the Free Software Foundation, Inc.,
24 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
16 * 25 *
17 */ 26 */
18#include <linux/init.h> 27#include <linux/init.h>
@@ -75,99 +84,92 @@ static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len
75 * If your algorithm starts with ~0, then XOR with ~0 before you set 84 * If your algorithm starts with ~0, then XOR with ~0 before you set
76 * the seed. 85 * the seed.
77 */ 86 */
78static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key, 87static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
79 unsigned int keylen) 88 unsigned int keylen)
80{ 89{
81 u32 *mctx = crypto_ahash_ctx(hash); 90 u32 *mctx = crypto_shash_ctx(hash);
82 91
83 if (keylen != sizeof(u32)) { 92 if (keylen != sizeof(u32)) {
84 crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); 93 crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
85 return -EINVAL; 94 return -EINVAL;
86 } 95 }
87 *mctx = le32_to_cpup((__le32 *)key); 96 *mctx = le32_to_cpup((__le32 *)key);
88 return 0; 97 return 0;
89} 98}
90 99
91static int crc32c_intel_init(struct ahash_request *req) 100static int crc32c_intel_init(struct shash_desc *desc)
92{ 101{
93 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); 102 u32 *mctx = crypto_shash_ctx(desc->tfm);
94 u32 *crcp = ahash_request_ctx(req); 103 u32 *crcp = shash_desc_ctx(desc);
95 104
96 *crcp = *mctx; 105 *crcp = *mctx;
97 106
98 return 0; 107 return 0;
99} 108}
100 109
101static int crc32c_intel_update(struct ahash_request *req) 110static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
111 unsigned int len)
102{ 112{
103 struct crypto_hash_walk walk; 113 u32 *crcp = shash_desc_ctx(desc);
104 u32 *crcp = ahash_request_ctx(req);
105 u32 crc = *crcp;
106 int nbytes;
107
108 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
109 nbytes = crypto_hash_walk_done(&walk, 0))
110 crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
111 114
112 *crcp = crc; 115 *crcp = crc32c_intel_le_hw(*crcp, data, len);
113 return 0; 116 return 0;
114} 117}
115 118
116static int crc32c_intel_final(struct ahash_request *req) 119static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
120 u8 *out)
117{ 121{
118 u32 *crcp = ahash_request_ctx(req); 122 *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
119
120 *(__le32 *)req->result = ~cpu_to_le32p(crcp);
121 return 0; 123 return 0;
122} 124}
123 125
124static int crc32c_intel_digest(struct ahash_request *req) 126static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data,
127 unsigned int len, u8 *out)
125{ 128{
126 struct crypto_hash_walk walk; 129 return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out);
127 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); 130}
128 u32 crc = *mctx;
129 int nbytes;
130 131
131 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; 132static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
132 nbytes = crypto_hash_walk_done(&walk, 0)) 133{
133 crc = crc32c_intel_le_hw(crc, walk.data, nbytes); 134 u32 *crcp = shash_desc_ctx(desc);
134 135
135 *(__le32 *)req->result = ~cpu_to_le32(crc); 136 *(__le32 *)out = ~cpu_to_le32p(crcp);
136 return 0; 137 return 0;
137} 138}
138 139
140static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
141 unsigned int len, u8 *out)
142{
143 return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
144 out);
145}
146
139static int crc32c_intel_cra_init(struct crypto_tfm *tfm) 147static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
140{ 148{
141 u32 *key = crypto_tfm_ctx(tfm); 149 u32 *key = crypto_tfm_ctx(tfm);
142 150
143 *key = ~0; 151 *key = ~0;
144 152
145 tfm->crt_ahash.reqsize = sizeof(u32);
146
147 return 0; 153 return 0;
148} 154}
149 155
150static struct crypto_alg alg = { 156static struct shash_alg alg = {
151 .cra_name = "crc32c", 157 .setkey = crc32c_intel_setkey,
152 .cra_driver_name = "crc32c-intel", 158 .init = crc32c_intel_init,
153 .cra_priority = 200, 159 .update = crc32c_intel_update,
154 .cra_flags = CRYPTO_ALG_TYPE_AHASH, 160 .final = crc32c_intel_final,
155 .cra_blocksize = CHKSUM_BLOCK_SIZE, 161 .finup = crc32c_intel_finup,
156 .cra_alignmask = 3, 162 .digest = crc32c_intel_digest,
157 .cra_ctxsize = sizeof(u32), 163 .descsize = sizeof(u32),
158 .cra_module = THIS_MODULE, 164 .digestsize = CHKSUM_DIGEST_SIZE,
159 .cra_list = LIST_HEAD_INIT(alg.cra_list), 165 .base = {
160 .cra_init = crc32c_intel_cra_init, 166 .cra_name = "crc32c",
161 .cra_type = &crypto_ahash_type, 167 .cra_driver_name = "crc32c-intel",
162 .cra_u = { 168 .cra_priority = 200,
163 .ahash = { 169 .cra_blocksize = CHKSUM_BLOCK_SIZE,
164 .digestsize = CHKSUM_DIGEST_SIZE, 170 .cra_ctxsize = sizeof(u32),
165 .setkey = crc32c_intel_setkey, 171 .cra_module = THIS_MODULE,
166 .init = crc32c_intel_init, 172 .cra_init = crc32c_intel_cra_init,
167 .update = crc32c_intel_update,
168 .final = crc32c_intel_final,
169 .digest = crc32c_intel_digest,
170 }
171 } 173 }
172}; 174};
173 175
@@ -175,14 +177,14 @@ static struct crypto_alg alg = {
175static int __init crc32c_intel_mod_init(void) 177static int __init crc32c_intel_mod_init(void)
176{ 178{
177 if (cpu_has_xmm4_2) 179 if (cpu_has_xmm4_2)
178 return crypto_register_alg(&alg); 180 return crypto_register_shash(&alg);
179 else 181 else
180 return -ENODEV; 182 return -ENODEV;
181} 183}
182 184
183static void __exit crc32c_intel_mod_fini(void) 185static void __exit crc32c_intel_mod_fini(void)
184{ 186{
185 crypto_unregister_alg(&alg); 187 crypto_unregister_shash(&alg);
186} 188}
187 189
188module_init(crc32c_intel_mod_init); 190module_init(crc32c_intel_mod_init);
@@ -194,4 +196,3 @@ MODULE_LICENSE("GPL");
194 196
195MODULE_ALIAS("crc32c"); 197MODULE_ALIAS("crc32c");
196MODULE_ALIAS("crc32c-intel"); 198MODULE_ALIAS("crc32c-intel");
197
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 127ec3f07214..2a4d073d2cf1 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -327,7 +327,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
327 current->mm->cached_hole_size = 0; 327 current->mm->cached_hole_size = 0;
328 328
329 current->mm->mmap = NULL; 329 current->mm->mmap = NULL;
330 compute_creds(bprm); 330 install_exec_creds(bprm);
331 current->flags &= ~PF_FORKNOEXEC; 331 current->flags &= ~PF_FORKNOEXEC;
332 332
333 if (N_MAGIC(ex) == OMAGIC) { 333 if (N_MAGIC(ex) == OMAGIC) {
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 4bc02b23674b..b195f85526e3 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -32,6 +32,8 @@
32#include <asm/proto.h> 32#include <asm/proto.h>
33#include <asm/vdso.h> 33#include <asm/vdso.h>
34 34
35#include <asm/sigframe.h>
36
35#define DEBUG_SIG 0 37#define DEBUG_SIG 0
36 38
37#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 39#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -41,7 +43,6 @@
41 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ 43 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
42 X86_EFLAGS_CF) 44 X86_EFLAGS_CF)
43 45
44asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
45void signal_fault(struct pt_regs *regs, void __user *frame, char *where); 46void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
46 47
47int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) 48int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
@@ -173,47 +174,28 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
173/* 174/*
174 * Do a signal return; undo the signal stack. 175 * Do a signal return; undo the signal stack.
175 */ 176 */
177#define COPY(x) { \
178 err |= __get_user(regs->x, &sc->x); \
179}
176 180
177struct sigframe 181#define COPY_SEG_CPL3(seg) { \
178{ 182 unsigned short tmp; \
179 u32 pretcode; 183 err |= __get_user(tmp, &sc->seg); \
180 int sig; 184 regs->seg = tmp | 3; \
181 struct sigcontext_ia32 sc;
182 struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */
183 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
184 char retcode[8];
185 /* fp state follows here */
186};
187
188struct rt_sigframe
189{
190 u32 pretcode;
191 int sig;
192 u32 pinfo;
193 u32 puc;
194 compat_siginfo_t info;
195 struct ucontext_ia32 uc;
196 char retcode[8];
197 /* fp state follows here */
198};
199
200#define COPY(x) { \
201 unsigned int reg; \
202 err |= __get_user(reg, &sc->x); \
203 regs->x = reg; \
204} 185}
205 186
206#define RELOAD_SEG(seg,mask) \ 187#define RELOAD_SEG(seg) { \
207 { unsigned int cur; \ 188 unsigned int cur, pre; \
208 unsigned short pre; \ 189 err |= __get_user(pre, &sc->seg); \
209 err |= __get_user(pre, &sc->seg); \ 190 savesegment(seg, cur); \
210 savesegment(seg, cur); \ 191 pre |= 3; \
211 pre |= mask; \ 192 if (pre != cur) \
212 if (pre != cur) loadsegment(seg, pre); } 193 loadsegment(seg, pre); \
194}
213 195
214static int ia32_restore_sigcontext(struct pt_regs *regs, 196static int ia32_restore_sigcontext(struct pt_regs *regs,
215 struct sigcontext_ia32 __user *sc, 197 struct sigcontext_ia32 __user *sc,
216 unsigned int *peax) 198 unsigned int *pax)
217{ 199{
218 unsigned int tmpflags, gs, oldgs, err = 0; 200 unsigned int tmpflags, gs, oldgs, err = 0;
219 void __user *buf; 201 void __user *buf;
@@ -240,18 +222,16 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
240 if (gs != oldgs) 222 if (gs != oldgs)
241 load_gs_index(gs); 223 load_gs_index(gs);
242 224
243 RELOAD_SEG(fs, 3); 225 RELOAD_SEG(fs);
244 RELOAD_SEG(ds, 3); 226 RELOAD_SEG(ds);
245 RELOAD_SEG(es, 3); 227 RELOAD_SEG(es);
246 228
247 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 229 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
248 COPY(dx); COPY(cx); COPY(ip); 230 COPY(dx); COPY(cx); COPY(ip);
249 /* Don't touch extended registers */ 231 /* Don't touch extended registers */
250 232
251 err |= __get_user(regs->cs, &sc->cs); 233 COPY_SEG_CPL3(cs);
252 regs->cs |= 3; 234 COPY_SEG_CPL3(ss);
253 err |= __get_user(regs->ss, &sc->ss);
254 regs->ss |= 3;
255 235
256 err |= __get_user(tmpflags, &sc->flags); 236 err |= __get_user(tmpflags, &sc->flags);
257 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 237 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -262,15 +242,13 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
262 buf = compat_ptr(tmp); 242 buf = compat_ptr(tmp);
263 err |= restore_i387_xstate_ia32(buf); 243 err |= restore_i387_xstate_ia32(buf);
264 244
265 err |= __get_user(tmp, &sc->ax); 245 err |= __get_user(*pax, &sc->ax);
266 *peax = tmp;
267
268 return err; 246 return err;
269} 247}
270 248
271asmlinkage long sys32_sigreturn(struct pt_regs *regs) 249asmlinkage long sys32_sigreturn(struct pt_regs *regs)
272{ 250{
273 struct sigframe __user *frame = (struct sigframe __user *)(regs->sp-8); 251 struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
274 sigset_t set; 252 sigset_t set;
275 unsigned int ax; 253 unsigned int ax;
276 254
@@ -300,12 +278,12 @@ badframe:
300 278
301asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) 279asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
302{ 280{
303 struct rt_sigframe __user *frame; 281 struct rt_sigframe_ia32 __user *frame;
304 sigset_t set; 282 sigset_t set;
305 unsigned int ax; 283 unsigned int ax;
306 struct pt_regs tregs; 284 struct pt_regs tregs;
307 285
308 frame = (struct rt_sigframe __user *)(regs->sp - 4); 286 frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
309 287
310 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 288 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
311 goto badframe; 289 goto badframe;
@@ -359,20 +337,15 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
359 err |= __put_user(regs->dx, &sc->dx); 337 err |= __put_user(regs->dx, &sc->dx);
360 err |= __put_user(regs->cx, &sc->cx); 338 err |= __put_user(regs->cx, &sc->cx);
361 err |= __put_user(regs->ax, &sc->ax); 339 err |= __put_user(regs->ax, &sc->ax);
362 err |= __put_user(regs->cs, &sc->cs);
363 err |= __put_user(regs->ss, &sc->ss);
364 err |= __put_user(current->thread.trap_no, &sc->trapno); 340 err |= __put_user(current->thread.trap_no, &sc->trapno);
365 err |= __put_user(current->thread.error_code, &sc->err); 341 err |= __put_user(current->thread.error_code, &sc->err);
366 err |= __put_user(regs->ip, &sc->ip); 342 err |= __put_user(regs->ip, &sc->ip);
343 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
367 err |= __put_user(regs->flags, &sc->flags); 344 err |= __put_user(regs->flags, &sc->flags);
368 err |= __put_user(regs->sp, &sc->sp_at_signal); 345 err |= __put_user(regs->sp, &sc->sp_at_signal);
346 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
369 347
370 tmp = save_i387_xstate_ia32(fpstate); 348 err |= __put_user(ptr_to_compat(fpstate), &sc->fpstate);
371 if (tmp < 0)
372 err = -EFAULT;
373 else
374 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
375 &sc->fpstate);
376 349
377 /* non-iBCS2 extensions.. */ 350 /* non-iBCS2 extensions.. */
378 err |= __put_user(mask, &sc->oldmask); 351 err |= __put_user(mask, &sc->oldmask);
@@ -400,7 +373,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
400 } 373 }
401 374
402 /* This is the legacy signal stack switching. */ 375 /* This is the legacy signal stack switching. */
403 else if ((regs->ss & 0xffff) != __USER_DS && 376 else if ((regs->ss & 0xffff) != __USER32_DS &&
404 !(ka->sa.sa_flags & SA_RESTORER) && 377 !(ka->sa.sa_flags & SA_RESTORER) &&
405 ka->sa.sa_restorer) 378 ka->sa.sa_restorer)
406 sp = (unsigned long) ka->sa.sa_restorer; 379 sp = (unsigned long) ka->sa.sa_restorer;
@@ -408,6 +381,8 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
408 if (used_math()) { 381 if (used_math()) {
409 sp = sp - sig_xstate_ia32_size; 382 sp = sp - sig_xstate_ia32_size;
410 *fpstate = (struct _fpstate_ia32 *) sp; 383 *fpstate = (struct _fpstate_ia32 *) sp;
384 if (save_i387_xstate_ia32(*fpstate) < 0)
385 return (void __user *) -1L;
411 } 386 }
412 387
413 sp -= frame_size; 388 sp -= frame_size;
@@ -420,7 +395,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
420int ia32_setup_frame(int sig, struct k_sigaction *ka, 395int ia32_setup_frame(int sig, struct k_sigaction *ka,
421 compat_sigset_t *set, struct pt_regs *regs) 396 compat_sigset_t *set, struct pt_regs *regs)
422{ 397{
423 struct sigframe __user *frame; 398 struct sigframe_ia32 __user *frame;
424 void __user *restorer; 399 void __user *restorer;
425 int err = 0; 400 int err = 0;
426 void __user *fpstate = NULL; 401 void __user *fpstate = NULL;
@@ -430,12 +405,10 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
430 u16 poplmovl; 405 u16 poplmovl;
431 u32 val; 406 u32 val;
432 u16 int80; 407 u16 int80;
433 u16 pad;
434 } __attribute__((packed)) code = { 408 } __attribute__((packed)) code = {
435 0xb858, /* popl %eax ; movl $...,%eax */ 409 0xb858, /* popl %eax ; movl $...,%eax */
436 __NR_ia32_sigreturn, 410 __NR_ia32_sigreturn,
437 0x80cd, /* int $0x80 */ 411 0x80cd, /* int $0x80 */
438 0,
439 }; 412 };
440 413
441 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); 414 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
@@ -471,7 +444,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
471 * These are actually not used anymore, but left because some 444 * These are actually not used anymore, but left because some
472 * gdb versions depend on them as a marker. 445 * gdb versions depend on them as a marker.
473 */ 446 */
474 err |= __copy_to_user(frame->retcode, &code, 8); 447 err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode);
475 if (err) 448 if (err)
476 return -EFAULT; 449 return -EFAULT;
477 450
@@ -501,7 +474,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
501int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 474int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
502 compat_sigset_t *set, struct pt_regs *regs) 475 compat_sigset_t *set, struct pt_regs *regs)
503{ 476{
504 struct rt_sigframe __user *frame; 477 struct rt_sigframe_ia32 __user *frame;
505 void __user *restorer; 478 void __user *restorer;
506 int err = 0; 479 int err = 0;
507 void __user *fpstate = NULL; 480 void __user *fpstate = NULL;
@@ -511,8 +484,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
511 u8 movl; 484 u8 movl;
512 u32 val; 485 u32 val;
513 u16 int80; 486 u16 int80;
514 u16 pad; 487 u8 pad;
515 u8 pad2;
516 } __attribute__((packed)) code = { 488 } __attribute__((packed)) code = {
517 0xb8, 489 0xb8,
518 __NR_ia32_rt_sigreturn, 490 __NR_ia32_rt_sigreturn,
@@ -559,7 +531,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
559 * Not actually used anymore, but left because some gdb 531 * Not actually used anymore, but left because some gdb
560 * versions need it. 532 * versions need it.
561 */ 533 */
562 err |= __copy_to_user(frame->retcode, &code, 8); 534 err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode);
563 if (err) 535 if (err)
564 return -EFAULT; 536 return -EFAULT;
565 537
@@ -572,11 +544,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
572 regs->dx = (unsigned long) &frame->info; 544 regs->dx = (unsigned long) &frame->info;
573 regs->cx = (unsigned long) &frame->uc; 545 regs->cx = (unsigned long) &frame->uc;
574 546
575 /* Make -mregparm=3 work */
576 regs->ax = sig;
577 regs->dx = (unsigned long) &frame->info;
578 regs->cx = (unsigned long) &frame->uc;
579
580 loadsegment(ds, __USER32_DS); 547 loadsegment(ds, __USER32_DS);
581 loadsegment(es, __USER32_DS); 548 loadsegment(es, __USER32_DS);
582 549
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 360010322711..9fa9dcdf344b 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -168,7 +168,15 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
168 */ 168 */
169static inline void change_bit(int nr, volatile unsigned long *addr) 169static inline void change_bit(int nr, volatile unsigned long *addr)
170{ 170{
171 asm volatile(LOCK_PREFIX "btc %1,%0" : ADDR : "Ir" (nr)); 171 if (IS_IMMEDIATE(nr)) {
172 asm volatile(LOCK_PREFIX "xorb %1,%0"
173 : CONST_MASK_ADDR(nr, addr)
174 : "iq" ((u8)CONST_MASK(nr)));
175 } else {
176 asm volatile(LOCK_PREFIX "btc %1,%0"
177 : BITOP_ADDR(addr)
178 : "Ir" (nr));
179 }
172} 180}
173 181
174/** 182/**
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 3def2065fcea..d9cf1cd156d2 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -9,7 +9,7 @@
9#ifdef CONFIG_X86_32 9#ifdef CONFIG_X86_32
10# define __BUG_C0 "2:\t.long 1b, %c0\n" 10# define __BUG_C0 "2:\t.long 1b, %c0\n"
11#else 11#else
12# define __BUG_C0 "2:\t.quad 1b, %c0\n" 12# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n"
13#endif 13#endif
14 14
15#define BUG() \ 15#define BUG() \
diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/asm/byteorder.h
index e02ae2d89acf..f110ad417df3 100644
--- a/arch/x86/include/asm/byteorder.h
+++ b/arch/x86/include/asm/byteorder.h
@@ -4,26 +4,33 @@
4#include <asm/types.h> 4#include <asm/types.h>
5#include <linux/compiler.h> 5#include <linux/compiler.h>
6 6
7#ifdef __GNUC__ 7#define __LITTLE_ENDIAN
8 8
9#ifdef __i386__ 9static inline __attribute_const__ __u32 __arch_swab32(__u32 val)
10
11static inline __attribute_const__ __u32 ___arch__swab32(__u32 x)
12{ 10{
13#ifdef CONFIG_X86_BSWAP 11#ifdef __i386__
14 asm("bswap %0" : "=r" (x) : "0" (x)); 12# ifdef CONFIG_X86_BSWAP
15#else 13 asm("bswap %0" : "=r" (val) : "0" (val));
14# else
16 asm("xchgb %b0,%h0\n\t" /* swap lower bytes */ 15 asm("xchgb %b0,%h0\n\t" /* swap lower bytes */
17 "rorl $16,%0\n\t" /* swap words */ 16 "rorl $16,%0\n\t" /* swap words */
18 "xchgb %b0,%h0" /* swap higher bytes */ 17 "xchgb %b0,%h0" /* swap higher bytes */
19 : "=q" (x) 18 : "=q" (val)
20 : "0" (x)); 19 : "0" (val));
20# endif
21
22#else /* __i386__ */
23 asm("bswapl %0"
24 : "=r" (val)
25 : "0" (val));
21#endif 26#endif
22 return x; 27 return val;
23} 28}
29#define __arch_swab32 __arch_swab32
24 30
25static inline __attribute_const__ __u64 ___arch__swab64(__u64 val) 31static inline __attribute_const__ __u64 __arch_swab64(__u64 val)
26{ 32{
33#ifdef __i386__
27 union { 34 union {
28 struct { 35 struct {
29 __u32 a; 36 __u32 a;
@@ -32,50 +39,27 @@ static inline __attribute_const__ __u64 ___arch__swab64(__u64 val)
32 __u64 u; 39 __u64 u;
33 } v; 40 } v;
34 v.u = val; 41 v.u = val;
35#ifdef CONFIG_X86_BSWAP 42# ifdef CONFIG_X86_BSWAP
36 asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1" 43 asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1"
37 : "=r" (v.s.a), "=r" (v.s.b) 44 : "=r" (v.s.a), "=r" (v.s.b)
38 : "0" (v.s.a), "1" (v.s.b)); 45 : "0" (v.s.a), "1" (v.s.b));
39#else 46# else
40 v.s.a = ___arch__swab32(v.s.a); 47 v.s.a = __arch_swab32(v.s.a);
41 v.s.b = ___arch__swab32(v.s.b); 48 v.s.b = __arch_swab32(v.s.b);
42 asm("xchgl %0,%1" 49 asm("xchgl %0,%1"
43 : "=r" (v.s.a), "=r" (v.s.b) 50 : "=r" (v.s.a), "=r" (v.s.b)
44 : "0" (v.s.a), "1" (v.s.b)); 51 : "0" (v.s.a), "1" (v.s.b));
45#endif 52# endif
46 return v.u; 53 return v.u;
47}
48
49#else /* __i386__ */ 54#else /* __i386__ */
50
51static inline __attribute_const__ __u64 ___arch__swab64(__u64 x)
52{
53 asm("bswapq %0" 55 asm("bswapq %0"
54 : "=r" (x) 56 : "=r" (val)
55 : "0" (x)); 57 : "0" (val));
56 return x; 58 return val;
57}
58
59static inline __attribute_const__ __u32 ___arch__swab32(__u32 x)
60{
61 asm("bswapl %0"
62 : "=r" (x)
63 : "0" (x));
64 return x;
65}
66
67#endif 59#endif
60}
61#define __arch_swab64 __arch_swab64
68 62
69/* Do not define swab16. Gcc is smart enough to recognize "C" version and 63#include <linux/byteorder.h>
70 convert it into rotation or exhange. */
71
72#define __arch__swab64(x) ___arch__swab64(x)
73#define __arch__swab32(x) ___arch__swab32(x)
74
75#define __BYTEORDER_HAS_U64__
76
77#endif /* __GNUC__ */
78
79#include <linux/byteorder/little_endian.h>
80 64
81#endif /* _ASM_X86_BYTEORDER_H */ 65#endif /* _ASM_X86_BYTEORDER_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index cfdf8c2c5c31..ea408dcba513 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -80,7 +80,6 @@
80#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */ 80#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */
81#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */ 81#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */
82#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ 82#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
83#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
84#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ 83#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */
85#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ 84#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */
86#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */ 85#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */
@@ -92,6 +91,8 @@
92#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ 91#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
93#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ 92#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */
94#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ 93#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
95 96
96/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ 97/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
97#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ 98#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
@@ -117,6 +118,7 @@
117#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ 118#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
118#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ 119#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
119#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ 120#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
121#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
120 122
121/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ 123/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
122#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */ 124#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */
@@ -237,6 +239,7 @@ extern const char * const x86_power_flags[32];
237#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) 239#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
238#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) 240#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
239#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) 241#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
242#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
240 243
241#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) 244#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
242# define cpu_has_invlpg 1 245# define cpu_has_invlpg 1
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 097794ff6b79..4035357f5b9d 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -65,18 +65,16 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
65 return dma_ops; 65 return dma_ops;
66 else 66 else
67 return dev->archdata.dma_ops; 67 return dev->archdata.dma_ops;
68#endif /* _ASM_X86_DMA_MAPPING_H */ 68#endif
69} 69}
70 70
71/* Make sure we keep the same behaviour */ 71/* Make sure we keep the same behaviour */
72static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) 72static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
73{ 73{
74#ifdef CONFIG_X86_64
75 struct dma_mapping_ops *ops = get_dma_ops(dev); 74 struct dma_mapping_ops *ops = get_dma_ops(dev);
76 if (ops->mapping_error) 75 if (ops->mapping_error)
77 return ops->mapping_error(dev, dma_addr); 76 return ops->mapping_error(dev, dma_addr);
78 77
79#endif
80 return (dma_addr == bad_dma_address); 78 return (dma_addr == bad_dma_address);
81} 79}
82 80
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index 99b6c39774a4..a8f672ba100c 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -6,13 +6,13 @@
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * It manages: 8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS 9 * - DS and BTS hardware configuration
10 * - buffer overflow handling (to be done) 10 * - buffer overflow handling (to be done)
11 * - buffer access 11 * - buffer access
12 * 12 *
13 * It assumes: 13 * It does not do:
14 * - get_task_struct on all traced tasks 14 * - security checking (is the caller allowed to trace the task)
15 * - current is allowed to trace tasks 15 * - buffer allocation (memory accounting)
16 * 16 *
17 * 17 *
18 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2008 Intel Corporation.
@@ -31,6 +31,7 @@
31#ifdef CONFIG_X86_DS 31#ifdef CONFIG_X86_DS
32 32
33struct task_struct; 33struct task_struct;
34struct ds_context;
34struct ds_tracer; 35struct ds_tracer;
35struct bts_tracer; 36struct bts_tracer;
36struct pebs_tracer; 37struct pebs_tracer;
@@ -38,6 +39,38 @@ struct pebs_tracer;
38typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); 39typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
39typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); 40typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
40 41
42
43/*
44 * A list of features plus corresponding macros to talk about them in
45 * the ds_request function's flags parameter.
46 *
47 * We use the enum to index an array of corresponding control bits;
48 * we use the macro to index a flags bit-vector.
49 */
50enum ds_feature {
51 dsf_bts = 0,
52 dsf_bts_kernel,
53#define BTS_KERNEL (1 << dsf_bts_kernel)
54 /* trace kernel-mode branches */
55
56 dsf_bts_user,
57#define BTS_USER (1 << dsf_bts_user)
58 /* trace user-mode branches */
59
60 dsf_bts_overflow,
61 dsf_bts_max,
62 dsf_pebs = dsf_bts_max,
63
64 dsf_pebs_max,
65 dsf_ctl_max = dsf_pebs_max,
66 dsf_bts_timestamps = dsf_ctl_max,
67#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
68 /* add timestamps into BTS trace */
69
70#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
71};
72
73
41/* 74/*
42 * Request BTS or PEBS 75 * Request BTS or PEBS
43 * 76 *
@@ -58,92 +91,135 @@ typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
58 * NULL if cyclic buffer requested 91 * NULL if cyclic buffer requested
59 * th: the interrupt threshold in records from the end of the buffer; 92 * th: the interrupt threshold in records from the end of the buffer;
60 * -1 if no interrupt threshold is requested. 93 * -1 if no interrupt threshold is requested.
94 * flags: a bit-mask of the above flags
61 */ 95 */
62extern struct bts_tracer *ds_request_bts(struct task_struct *task, 96extern struct bts_tracer *ds_request_bts(struct task_struct *task,
63 void *base, size_t size, 97 void *base, size_t size,
64 bts_ovfl_callback_t ovfl, size_t th); 98 bts_ovfl_callback_t ovfl,
99 size_t th, unsigned int flags);
65extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, 100extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
66 void *base, size_t size, 101 void *base, size_t size,
67 pebs_ovfl_callback_t ovfl, 102 pebs_ovfl_callback_t ovfl,
68 size_t th); 103 size_t th, unsigned int flags);
69 104
70/* 105/*
71 * Release BTS or PEBS resources 106 * Release BTS or PEBS resources
72 * 107 * Suspend and resume BTS or PEBS tracing
73 * Returns 0 on success; -Eerrno otherwise
74 * 108 *
75 * tracer: the tracer handle returned from ds_request_~() 109 * tracer: the tracer handle returned from ds_request_~()
76 */ 110 */
77extern int ds_release_bts(struct bts_tracer *tracer); 111extern void ds_release_bts(struct bts_tracer *tracer);
78extern int ds_release_pebs(struct pebs_tracer *tracer); 112extern void ds_suspend_bts(struct bts_tracer *tracer);
113extern void ds_resume_bts(struct bts_tracer *tracer);
114extern void ds_release_pebs(struct pebs_tracer *tracer);
115extern void ds_suspend_pebs(struct pebs_tracer *tracer);
116extern void ds_resume_pebs(struct pebs_tracer *tracer);
117
79 118
80/* 119/*
81 * Get the (array) index of the write pointer. 120 * The raw DS buffer state as it is used for BTS and PEBS recording.
82 * (assuming an array of BTS/PEBS records)
83 *
84 * Returns 0 on success; -Eerrno on error
85 * 121 *
86 * tracer: the tracer handle returned from ds_request_~() 122 * This is the low-level, arch-dependent interface for working
87 * pos (out): will hold the result 123 * directly on the raw trace data.
88 */ 124 */
89extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos); 125struct ds_trace {
90extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos); 126 /* the number of bts/pebs records */
127 size_t n;
128 /* the size of a bts/pebs record in bytes */
129 size_t size;
130 /* pointers into the raw buffer:
131 - to the first entry */
132 void *begin;
133 /* - one beyond the last entry */
134 void *end;
135 /* - one beyond the newest entry */
136 void *top;
137 /* - the interrupt threshold */
138 void *ith;
139 /* flags given on ds_request() */
140 unsigned int flags;
141};
91 142
92/* 143/*
93 * Get the (array) index one record beyond the end of the array. 144 * An arch-independent view on branch trace data.
94 * (assuming an array of BTS/PEBS records)
95 *
96 * Returns 0 on success; -Eerrno on error
97 *
98 * tracer: the tracer handle returned from ds_request_~()
99 * pos (out): will hold the result
100 */ 145 */
101extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos); 146enum bts_qualifier {
102extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos); 147 bts_invalid,
148#define BTS_INVALID bts_invalid
149
150 bts_branch,
151#define BTS_BRANCH bts_branch
152
153 bts_task_arrives,
154#define BTS_TASK_ARRIVES bts_task_arrives
155
156 bts_task_departs,
157#define BTS_TASK_DEPARTS bts_task_departs
158
159 bts_qual_bit_size = 4,
160 bts_qual_max = (1 << bts_qual_bit_size),
161};
162
163struct bts_struct {
164 __u64 qualifier;
165 union {
166 /* BTS_BRANCH */
167 struct {
168 __u64 from;
169 __u64 to;
170 } lbr;
171 /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
172 struct {
173 __u64 jiffies;
174 pid_t pid;
175 } timestamp;
176 } variant;
177};
178
103 179
104/* 180/*
105 * Provide a pointer to the BTS/PEBS record at parameter index. 181 * The BTS state.
106 * (assuming an array of BTS/PEBS records)
107 *
108 * The pointer points directly into the buffer. The user is
109 * responsible for copying the record.
110 * 182 *
111 * Returns the size of a single record on success; -Eerrno on error 183 * This gives access to the raw DS state and adds functions to provide
112 * 184 * an arch-independent view of the BTS data.
113 * tracer: the tracer handle returned from ds_request_~()
114 * index: the index of the requested record
115 * record (out): pointer to the requested record
116 */ 185 */
117extern int ds_access_bts(struct bts_tracer *tracer, 186struct bts_trace {
118 size_t index, const void **record); 187 struct ds_trace ds;
119extern int ds_access_pebs(struct pebs_tracer *tracer, 188
120 size_t index, const void **record); 189 int (*read)(struct bts_tracer *tracer, const void *at,
190 struct bts_struct *out);
191 int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
192};
193
121 194
122/* 195/*
123 * Write one or more BTS/PEBS records at the write pointer index and 196 * The PEBS state.
124 * advance the write pointer.
125 *
126 * If size is not a multiple of the record size, trailing bytes are
127 * zeroed out.
128 * 197 *
129 * May result in one or more overflow notifications. 198 * This gives access to the raw DS state and the PEBS-specific counter
130 * 199 * reset value.
131 * If called during overflow handling, that is, with index >= 200 */
132 * interrupt threshold, the write will wrap around. 201struct pebs_trace {
202 struct ds_trace ds;
203
204 /* the PEBS reset value */
205 unsigned long long reset_value;
206};
207
208
209/*
210 * Read the BTS or PEBS trace.
133 * 211 *
134 * An overflow notification is given if and when the interrupt 212 * Returns a view on the trace collected for the parameter tracer.
135 * threshold is reached during or after the write.
136 * 213 *
137 * Returns the number of bytes written or -Eerrno. 214 * The view remains valid as long as the traced task is not running or
215 * the tracer is suspended.
216 * Writes into the trace buffer are not reflected.
138 * 217 *
139 * tracer: the tracer handle returned from ds_request_~() 218 * tracer: the tracer handle returned from ds_request_~()
140 * buffer: the buffer to write
141 * size: the size of the buffer
142 */ 219 */
143extern int ds_write_bts(struct bts_tracer *tracer, 220extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
144 const void *buffer, size_t size); 221extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
145extern int ds_write_pebs(struct pebs_tracer *tracer, 222
146 const void *buffer, size_t size);
147 223
148/* 224/*
149 * Reset the write pointer of the BTS/PEBS buffer. 225 * Reset the write pointer of the BTS/PEBS buffer.
@@ -156,27 +232,6 @@ extern int ds_reset_bts(struct bts_tracer *tracer);
156extern int ds_reset_pebs(struct pebs_tracer *tracer); 232extern int ds_reset_pebs(struct pebs_tracer *tracer);
157 233
158/* 234/*
159 * Clear the BTS/PEBS buffer and reset the write pointer.
160 * The entire buffer will be zeroed out.
161 *
162 * Returns 0 on success; -Eerrno on error
163 *
164 * tracer: the tracer handle returned from ds_request_~()
165 */
166extern int ds_clear_bts(struct bts_tracer *tracer);
167extern int ds_clear_pebs(struct pebs_tracer *tracer);
168
169/*
170 * Provide the PEBS counter reset value.
171 *
172 * Returns 0 on success; -Eerrno on error
173 *
174 * tracer: the tracer handle returned from ds_request_pebs()
175 * value (out): the counter reset value
176 */
177extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value);
178
179/*
180 * Set the PEBS counter reset value. 235 * Set the PEBS counter reset value.
181 * 236 *
182 * Returns 0 on success; -Eerrno on error 237 * Returns 0 on success; -Eerrno on error
@@ -192,35 +247,26 @@ extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
192struct cpuinfo_x86; 247struct cpuinfo_x86;
193extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); 248extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
194 249
195
196
197/* 250/*
198 * The DS context - part of struct thread_struct. 251 * Context switch work
199 */ 252 */
200#define MAX_SIZEOF_DS (12 * 8) 253extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
201
202struct ds_context {
203 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
204 unsigned char ds[MAX_SIZEOF_DS];
205 /* the owner of the BTS and PEBS configuration, respectively */
206 struct ds_tracer *owner[2];
207 /* use count */
208 unsigned long count;
209 /* a pointer to the context location inside the thread_struct
210 * or the per_cpu context array */
211 struct ds_context **this;
212 /* a pointer to the task owning this context, or NULL, if the
213 * context is owned by a cpu */
214 struct task_struct *task;
215};
216 254
217/* called by exit_thread() to free leftover contexts */ 255/*
218extern void ds_free(struct ds_context *context); 256 * Task clone/init and cleanup work
257 */
258extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
259extern void ds_exit_thread(struct task_struct *tsk);
219 260
220#else /* CONFIG_X86_DS */ 261#else /* CONFIG_X86_DS */
221 262
222struct cpuinfo_x86; 263struct cpuinfo_x86;
223static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} 264static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
265static inline void ds_switch_to(struct task_struct *prev,
266 struct task_struct *next) {}
267static inline void ds_copy_thread(struct task_struct *tsk,
268 struct task_struct *father) {}
269static inline void ds_exit_thread(struct task_struct *tsk) {}
224 270
225#endif /* CONFIG_X86_DS */ 271#endif /* CONFIG_X86_DS */
226#endif /* _ASM_X86_DS_H */ 272#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 804b6e6be929..3afc5e87cfdd 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -6,56 +6,91 @@
6#endif 6#endif
7 7
8/* 8/*
9 Macros for dwarf2 CFI unwind table entries. 9 * Macros for dwarf2 CFI unwind table entries.
10 See "as.info" for details on these pseudo ops. Unfortunately 10 * See "as.info" for details on these pseudo ops. Unfortunately
11 they are only supported in very new binutils, so define them 11 * they are only supported in very new binutils, so define them
12 away for older version. 12 * away for older version.
13 */ 13 */
14 14
15#ifdef CONFIG_AS_CFI 15#ifdef CONFIG_AS_CFI
16 16
17#define CFI_STARTPROC .cfi_startproc 17#define CFI_STARTPROC .cfi_startproc
18#define CFI_ENDPROC .cfi_endproc 18#define CFI_ENDPROC .cfi_endproc
19#define CFI_DEF_CFA .cfi_def_cfa 19#define CFI_DEF_CFA .cfi_def_cfa
20#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register 20#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register
21#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset 21#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset
22#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset 22#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset
23#define CFI_OFFSET .cfi_offset 23#define CFI_OFFSET .cfi_offset
24#define CFI_REL_OFFSET .cfi_rel_offset 24#define CFI_REL_OFFSET .cfi_rel_offset
25#define CFI_REGISTER .cfi_register 25#define CFI_REGISTER .cfi_register
26#define CFI_RESTORE .cfi_restore 26#define CFI_RESTORE .cfi_restore
27#define CFI_REMEMBER_STATE .cfi_remember_state 27#define CFI_REMEMBER_STATE .cfi_remember_state
28#define CFI_RESTORE_STATE .cfi_restore_state 28#define CFI_RESTORE_STATE .cfi_restore_state
29#define CFI_UNDEFINED .cfi_undefined 29#define CFI_UNDEFINED .cfi_undefined
30 30
31#ifdef CONFIG_AS_CFI_SIGNAL_FRAME 31#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
32#define CFI_SIGNAL_FRAME .cfi_signal_frame 32#define CFI_SIGNAL_FRAME .cfi_signal_frame
33#else 33#else
34#define CFI_SIGNAL_FRAME 34#define CFI_SIGNAL_FRAME
35#endif 35#endif
36 36
37#else 37#else
38 38
39/* Due to the structure of pre-exisiting code, don't use assembler line 39/*
40 comment character # to ignore the arguments. Instead, use a dummy macro. */ 40 * Due to the structure of pre-exisiting code, don't use assembler line
41 * comment character # to ignore the arguments. Instead, use a dummy macro.
42 */
41.macro cfi_ignore a=0, b=0, c=0, d=0 43.macro cfi_ignore a=0, b=0, c=0, d=0
42.endm 44.endm
43 45
44#define CFI_STARTPROC cfi_ignore 46#define CFI_STARTPROC cfi_ignore
45#define CFI_ENDPROC cfi_ignore 47#define CFI_ENDPROC cfi_ignore
46#define CFI_DEF_CFA cfi_ignore 48#define CFI_DEF_CFA cfi_ignore
47#define CFI_DEF_CFA_REGISTER cfi_ignore 49#define CFI_DEF_CFA_REGISTER cfi_ignore
48#define CFI_DEF_CFA_OFFSET cfi_ignore 50#define CFI_DEF_CFA_OFFSET cfi_ignore
49#define CFI_ADJUST_CFA_OFFSET cfi_ignore 51#define CFI_ADJUST_CFA_OFFSET cfi_ignore
50#define CFI_OFFSET cfi_ignore 52#define CFI_OFFSET cfi_ignore
51#define CFI_REL_OFFSET cfi_ignore 53#define CFI_REL_OFFSET cfi_ignore
52#define CFI_REGISTER cfi_ignore 54#define CFI_REGISTER cfi_ignore
53#define CFI_RESTORE cfi_ignore 55#define CFI_RESTORE cfi_ignore
54#define CFI_REMEMBER_STATE cfi_ignore 56#define CFI_REMEMBER_STATE cfi_ignore
55#define CFI_RESTORE_STATE cfi_ignore 57#define CFI_RESTORE_STATE cfi_ignore
56#define CFI_UNDEFINED cfi_ignore 58#define CFI_UNDEFINED cfi_ignore
57#define CFI_SIGNAL_FRAME cfi_ignore 59#define CFI_SIGNAL_FRAME cfi_ignore
58 60
59#endif 61#endif
60 62
63/*
64 * An attempt to make CFI annotations more or less
65 * correct and shorter. It is implied that you know
66 * what you're doing if you use them.
67 */
68#ifdef __ASSEMBLY__
69#ifdef CONFIG_X86_64
70 .macro pushq_cfi reg
71 pushq \reg
72 CFI_ADJUST_CFA_OFFSET 8
73 .endm
74
75 .macro popq_cfi reg
76 popq \reg
77 CFI_ADJUST_CFA_OFFSET -8
78 .endm
79
80 .macro movq_cfi reg offset=0
81 movq %\reg, \offset(%rsp)
82 CFI_REL_OFFSET \reg, \offset
83 .endm
84
85 .macro movq_cfi_restore offset reg
86 movq \offset(%rsp), %\reg
87 CFI_RESTORE \reg
88 .endm
89#else /*!CONFIG_X86_64*/
90
91 /* 32bit defenitions are missed yet */
92
93#endif /*!CONFIG_X86_64*/
94#endif /*__ASSEMBLY__*/
95
61#endif /* _ASM_X86_DWARF2_H */ 96#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 40ca1bea7916..f51a3ddde01a 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -325,7 +325,7 @@ struct linux_binprm;
325 325
326#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 326#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
327extern int arch_setup_additional_pages(struct linux_binprm *bprm, 327extern int arch_setup_additional_pages(struct linux_binprm *bprm,
328 int executable_stack); 328 int uses_interp);
329 329
330extern int syscall32_setup_pages(struct linux_binprm *, int exstack); 330extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
331#define compat_arch_setup_additional_pages syscall32_setup_pages 331#define compat_arch_setup_additional_pages syscall32_setup_pages
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 7e61b4ceb9a4..b55b4a7fbefd 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -1,6 +1,33 @@
1#ifndef _ASM_X86_FTRACE_H 1#ifndef _ASM_X86_FTRACE_H
2#define _ASM_X86_FTRACE_H 2#define _ASM_X86_FTRACE_H
3 3
4#ifdef __ASSEMBLY__
5
6 .macro MCOUNT_SAVE_FRAME
7 /* taken from glibc */
8 subq $0x38, %rsp
9 movq %rax, (%rsp)
10 movq %rcx, 8(%rsp)
11 movq %rdx, 16(%rsp)
12 movq %rsi, 24(%rsp)
13 movq %rdi, 32(%rsp)
14 movq %r8, 40(%rsp)
15 movq %r9, 48(%rsp)
16 .endm
17
18 .macro MCOUNT_RESTORE_FRAME
19 movq 48(%rsp), %r9
20 movq 40(%rsp), %r8
21 movq 32(%rsp), %rdi
22 movq 24(%rsp), %rsi
23 movq 16(%rsp), %rdx
24 movq 8(%rsp), %rcx
25 movq (%rsp), %rax
26 addq $0x38, %rsp
27 .endm
28
29#endif
30
4#ifdef CONFIG_FUNCTION_TRACER 31#ifdef CONFIG_FUNCTION_TRACER
5#define MCOUNT_ADDR ((long)(mcount)) 32#define MCOUNT_ADDR ((long)(mcount))
6#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ 33#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
@@ -46,7 +73,7 @@ struct ftrace_ret_stack {
46/* 73/*
47 * Primary handler of a function return. 74 * Primary handler of a function return.
48 * It relays on ftrace_return_to_handler. 75 * It relays on ftrace_return_to_handler.
49 * Defined in entry32.S 76 * Defined in entry_32/64.S
50 */ 77 */
51extern void return_to_handler(void); 78extern void return_to_handler(void);
52 79
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 74252264433d..6cfdafa409d8 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -29,6 +29,39 @@ extern int fix_aperture;
29#define AMD64_GARTCACHECTL 0x9c 29#define AMD64_GARTCACHECTL 0x9c
30#define AMD64_GARTEN (1<<0) 30#define AMD64_GARTEN (1<<0)
31 31
32#ifdef CONFIG_GART_IOMMU
33extern int gart_iommu_aperture;
34extern int gart_iommu_aperture_allowed;
35extern int gart_iommu_aperture_disabled;
36
37extern void early_gart_iommu_check(void);
38extern void gart_iommu_init(void);
39extern void gart_iommu_shutdown(void);
40extern void __init gart_parse_options(char *);
41extern void gart_iommu_hole_init(void);
42
43#else
44#define gart_iommu_aperture 0
45#define gart_iommu_aperture_allowed 0
46#define gart_iommu_aperture_disabled 1
47
48static inline void early_gart_iommu_check(void)
49{
50}
51static inline void gart_iommu_init(void)
52{
53}
54static inline void gart_iommu_shutdown(void)
55{
56}
57static inline void gart_parse_options(char *options)
58{
59}
60static inline void gart_iommu_hole_init(void)
61{
62}
63#endif
64
32extern int agp_amd64_init(void); 65extern int agp_amd64_init(void);
33 66
34static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) 67static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
index 5ca135e72f2b..cf7954d1405f 100644
--- a/arch/x86/include/asm/hardirq_32.h
+++ b/arch/x86/include/asm/hardirq_32.h
@@ -22,6 +22,8 @@ DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
22#define __ARCH_IRQ_STAT 22#define __ARCH_IRQ_STAT
23#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member) 23#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member)
24 24
25#define inc_irq_stat(member) (__get_cpu_var(irq_stat).member++)
26
25void ack_bad_irq(unsigned int irq); 27void ack_bad_irq(unsigned int irq);
26#include <linux/irq_cpustat.h> 28#include <linux/irq_cpustat.h>
27 29
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
index 1ba381fc51d3..b5a6b5d56704 100644
--- a/arch/x86/include/asm/hardirq_64.h
+++ b/arch/x86/include/asm/hardirq_64.h
@@ -11,6 +11,8 @@
11 11
12#define __ARCH_IRQ_STAT 1 12#define __ARCH_IRQ_STAT 1
13 13
14#define inc_irq_stat(member) add_pda(member, 1)
15
14#define local_softirq_pending() read_pda(__softirq_pending) 16#define local_softirq_pending() read_pda(__softirq_pending)
15 17
16#define __ARCH_SET_SOFTIRQ_PENDING 1 18#define __ARCH_SET_SOFTIRQ_PENDING 1
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b97aecb0b61d..8de644b6b959 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -109,9 +109,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
109#endif 109#endif
110#endif 110#endif
111 111
112#ifdef CONFIG_X86_32 112extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
113extern void (*const interrupt[NR_VECTORS])(void);
114#endif
115 113
116typedef int vector_irq_t[NR_VECTORS]; 114typedef int vector_irq_t[NR_VECTORS];
117DECLARE_PER_CPU(vector_irq_t, vector_irq); 115DECLARE_PER_CPU(vector_irq_t, vector_irq);
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
new file mode 100644
index 000000000000..369f5c5d09a1
--- /dev/null
+++ b/arch/x86/include/asm/hypervisor.h
@@ -0,0 +1,26 @@
1/*
2 * Copyright (C) 2008, VMware, Inc.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
12 * NON INFRINGEMENT. See the GNU General Public License for more
13 * details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20#ifndef ASM_X86__HYPERVISOR_H
21#define ASM_X86__HYPERVISOR_H
22
23extern unsigned long get_hypervisor_tsc_freq(void);
24extern void init_hypervisor(struct cpuinfo_x86 *c);
25
26#endif
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 97989c0e534c..50ca486fd88c 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -129,24 +129,6 @@ typedef struct compat_siginfo {
129 } _sifields; 129 } _sifields;
130} compat_siginfo_t; 130} compat_siginfo_t;
131 131
132struct sigframe32 {
133 u32 pretcode;
134 int sig;
135 struct sigcontext_ia32 sc;
136 struct _fpstate_ia32 fpstate;
137 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
138};
139
140struct rt_sigframe32 {
141 u32 pretcode;
142 int sig;
143 u32 pinfo;
144 u32 puc;
145 compat_siginfo_t info;
146 struct ucontext_ia32 uc;
147 struct _fpstate_ia32 fpstate;
148};
149
150struct ustat32 { 132struct ustat32 {
151 __u32 f_tfree; 133 __u32 f_tfree;
152 compat_ino_t f_tinode; 134 compat_ino_t f_tinode;
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index 44c89c3a23e9..38d87379e270 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -8,8 +8,13 @@ struct notifier_block;
8void idle_notifier_register(struct notifier_block *n); 8void idle_notifier_register(struct notifier_block *n);
9void idle_notifier_unregister(struct notifier_block *n); 9void idle_notifier_unregister(struct notifier_block *n);
10 10
11#ifdef CONFIG_X86_64
11void enter_idle(void); 12void enter_idle(void);
12void exit_idle(void); 13void exit_idle(void);
14#else /* !CONFIG_X86_64 */
15static inline void enter_idle(void) { }
16static inline void exit_idle(void) { }
17#endif /* CONFIG_X86_64 */
13 18
14void c1e_remove_cpu(int cpu); 19void c1e_remove_cpu(int cpu);
15 20
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index ac2abc88cd95..05cfed4485fa 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -4,6 +4,7 @@
4#define ARCH_HAS_IOREMAP_WC 4#define ARCH_HAS_IOREMAP_WC
5 5
6#include <linux/compiler.h> 6#include <linux/compiler.h>
7#include <asm-generic/int-ll64.h>
7 8
8#define build_mmio_read(name, size, type, reg, barrier) \ 9#define build_mmio_read(name, size, type, reg, barrier) \
9static inline type name(const volatile void __iomem *addr) \ 10static inline type name(const volatile void __iomem *addr) \
@@ -45,21 +46,39 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
45#define mmiowb() barrier() 46#define mmiowb() barrier()
46 47
47#ifdef CONFIG_X86_64 48#ifdef CONFIG_X86_64
49
48build_mmio_read(readq, "q", unsigned long, "=r", :"memory") 50build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
49build_mmio_read(__readq, "q", unsigned long, "=r", )
50build_mmio_write(writeq, "q", unsigned long, "r", :"memory") 51build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
51build_mmio_write(__writeq, "q", unsigned long, "r", )
52 52
53#define readq_relaxed(a) __readq(a) 53#else
54#define __raw_readq __readq 54
55#define __raw_writeq writeq 55static inline __u64 readq(const volatile void __iomem *addr)
56{
57 const volatile u32 __iomem *p = addr;
58 u32 low, high;
59
60 low = readl(p);
61 high = readl(p + 1);
62
63 return low + ((u64)high << 32);
64}
65
66static inline void writeq(__u64 val, volatile void __iomem *addr)
67{
68 writel(val, addr);
69 writel(val >> 32, addr+4);
70}
56 71
57/* Let people know we have them */
58#define readq readq
59#define writeq writeq
60#endif 72#endif
61 73
62extern int iommu_bio_merge; 74#define readq_relaxed(a) readq(a)
75
76#define __raw_readq(a) readq(a)
77#define __raw_writeq(val, addr) writeq(val, addr)
78
79/* Let people know that we have them */
80#define readq readq
81#define writeq writeq
63 82
64#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
65# include "io_32.h" 84# include "io_32.h"
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
index fea325a1122f..563c16270ba6 100644
--- a/arch/x86/include/asm/io_64.h
+++ b/arch/x86/include/asm/io_64.h
@@ -232,8 +232,6 @@ void memset_io(volatile void __iomem *a, int b, size_t c);
232 232
233#define flush_write_buffers() 233#define flush_write_buffers()
234 234
235#define BIO_VMERGE_BOUNDARY iommu_bio_merge
236
237/* 235/*
238 * Convert a virtual cached pointer to an uncached pointer 236 * Convert a virtual cached pointer to an uncached pointer
239 */ 237 */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 25d527ca1362..7a1f44ac1f17 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -156,11 +156,21 @@ extern int sis_apic_bug;
156/* 1 if "noapic" boot option passed */ 156/* 1 if "noapic" boot option passed */
157extern int skip_ioapic_setup; 157extern int skip_ioapic_setup;
158 158
159/* 1 if "noapic" boot option passed */
160extern int noioapicquirk;
161
162/* -1 if "noapic" boot option passed */
163extern int noioapicreroute;
164
159/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ 165/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
160extern int timer_through_8259; 166extern int timer_through_8259;
161 167
162static inline void disable_ioapic_setup(void) 168static inline void disable_ioapic_setup(void)
163{ 169{
170#ifdef CONFIG_PCI
171 noioapicquirk = 1;
172 noioapicreroute = -1;
173#endif
164 skip_ioapic_setup = 1; 174 skip_ioapic_setup = 1;
165} 175}
166 176
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 0b500c5b6446..a6ee9e6f530f 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -7,42 +7,7 @@ extern struct dma_mapping_ops nommu_dma_ops;
7extern int force_iommu, no_iommu; 7extern int force_iommu, no_iommu;
8extern int iommu_detected; 8extern int iommu_detected;
9 9
10extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len);
11
12/* 10 seconds */ 10/* 10 seconds */
13#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) 11#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
14 12
15#ifdef CONFIG_GART_IOMMU
16extern int gart_iommu_aperture;
17extern int gart_iommu_aperture_allowed;
18extern int gart_iommu_aperture_disabled;
19
20extern void early_gart_iommu_check(void);
21extern void gart_iommu_init(void);
22extern void gart_iommu_shutdown(void);
23extern void __init gart_parse_options(char *);
24extern void gart_iommu_hole_init(void);
25
26#else
27#define gart_iommu_aperture 0
28#define gart_iommu_aperture_allowed 0
29#define gart_iommu_aperture_disabled 1
30
31static inline void early_gart_iommu_check(void)
32{
33}
34static inline void gart_iommu_init(void)
35{
36}
37static inline void gart_iommu_shutdown(void)
38{
39}
40static inline void gart_parse_options(char *options)
41{
42}
43static inline void gart_iommu_hole_init(void)
44{
45}
46#endif
47
48#endif /* _ASM_X86_IOMMU_H */ 13#endif /* _ASM_X86_IOMMU_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 4bb732e45a85..592688ed04d3 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -31,10 +31,6 @@ static inline int irq_canonicalize(int irq)
31# endif 31# endif
32#endif 32#endif
33 33
34#ifdef CONFIG_IRQBALANCE
35extern int irqbalance_disable(char *str);
36#endif
37
38#ifdef CONFIG_HOTPLUG_CPU 34#ifdef CONFIG_HOTPLUG_CPU
39#include <linux/cpumask.h> 35#include <linux/cpumask.h>
40extern void fixup_irqs(void); 36extern void fixup_irqs(void);
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
index af2f02d27fc7..86afd7473457 100644
--- a/arch/x86/include/asm/irq_regs_32.h
+++ b/arch/x86/include/asm/irq_regs_32.h
@@ -9,6 +9,8 @@
9 9
10#include <asm/percpu.h> 10#include <asm/percpu.h>
11 11
12#define ARCH_HAS_OWN_IRQ_REGS
13
12DECLARE_PER_CPU(struct pt_regs *, irq_regs); 14DECLARE_PER_CPU(struct pt_regs *, irq_regs);
13 15
14static inline struct pt_regs *get_irq_regs(void) 16static inline struct pt_regs *get_irq_regs(void)
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index f61ee8f937e4..5d98d0b68ffc 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -57,5 +57,65 @@
57#define __ALIGN_STR ".align 16,0x90" 57#define __ALIGN_STR ".align 16,0x90"
58#endif 58#endif
59 59
60/*
61 * to check ENTRY_X86/END_X86 and
62 * KPROBE_ENTRY_X86/KPROBE_END_X86
63 * unbalanced-missed-mixed appearance
64 */
65#define __set_entry_x86 .set ENTRY_X86_IN, 0
66#define __unset_entry_x86 .set ENTRY_X86_IN, 1
67#define __set_kprobe_x86 .set KPROBE_X86_IN, 0
68#define __unset_kprobe_x86 .set KPROBE_X86_IN, 1
69
70#define __macro_err_x86 .error "ENTRY_X86/KPROBE_X86 unbalanced,missed,mixed"
71
72#define __check_entry_x86 \
73 .ifdef ENTRY_X86_IN; \
74 .ifeq ENTRY_X86_IN; \
75 __macro_err_x86; \
76 .abort; \
77 .endif; \
78 .endif
79
80#define __check_kprobe_x86 \
81 .ifdef KPROBE_X86_IN; \
82 .ifeq KPROBE_X86_IN; \
83 __macro_err_x86; \
84 .abort; \
85 .endif; \
86 .endif
87
88#define __check_entry_kprobe_x86 \
89 __check_entry_x86; \
90 __check_kprobe_x86
91
92#define ENTRY_KPROBE_FINAL_X86 __check_entry_kprobe_x86
93
94#define ENTRY_X86(name) \
95 __check_entry_kprobe_x86; \
96 __set_entry_x86; \
97 .globl name; \
98 __ALIGN; \
99 name:
100
101#define END_X86(name) \
102 __unset_entry_x86; \
103 __check_entry_kprobe_x86; \
104 .size name, .-name
105
106#define KPROBE_ENTRY_X86(name) \
107 __check_entry_kprobe_x86; \
108 __set_kprobe_x86; \
109 .pushsection .kprobes.text, "ax"; \
110 .globl name; \
111 __ALIGN; \
112 name:
113
114#define KPROBE_END_X86(name) \
115 __unset_kprobe_x86; \
116 __check_entry_kprobe_x86; \
117 .size name, .-name; \
118 .popsection
119
60#endif /* _ASM_X86_LINKAGE_H */ 120#endif /* _ASM_X86_LINKAGE_H */
61 121
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
index 8e10015781fb..7e98ce1d2c0e 100644
--- a/arch/x86/include/asm/mmu_context_32.h
+++ b/arch/x86/include/asm/mmu_context_32.h
@@ -4,9 +4,8 @@
4static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 4static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
5{ 5{
6#ifdef CONFIG_SMP 6#ifdef CONFIG_SMP
7 unsigned cpu = smp_processor_id(); 7 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
8 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) 8 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
9 per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
10#endif 9#endif
11} 10}
12 11
@@ -20,8 +19,8 @@ static inline void switch_mm(struct mm_struct *prev,
20 /* stop flush ipis for the previous mm */ 19 /* stop flush ipis for the previous mm */
21 cpu_clear(cpu, prev->cpu_vm_mask); 20 cpu_clear(cpu, prev->cpu_vm_mask);
22#ifdef CONFIG_SMP 21#ifdef CONFIG_SMP
23 per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; 22 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
24 per_cpu(cpu_tlbstate, cpu).active_mm = next; 23 x86_write_percpu(cpu_tlbstate.active_mm, next);
25#endif 24#endif
26 cpu_set(cpu, next->cpu_vm_mask); 25 cpu_set(cpu, next->cpu_vm_mask);
27 26
@@ -36,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev,
36 } 35 }
37#ifdef CONFIG_SMP 36#ifdef CONFIG_SMP
38 else { 37 else {
39 per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; 38 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
40 BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); 39 BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
41 40
42 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { 41 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
43 /* We were in lazy tlb mode and leave_mm disabled 42 /* We were in lazy tlb mode and leave_mm disabled
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e38859d577a1..cb58643947b9 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -85,7 +85,9 @@
85/* AMD64 MSRs. Not complete. See the architecture manual for a more 85/* AMD64 MSRs. Not complete. See the architecture manual for a more
86 complete list. */ 86 complete list. */
87 87
88#define MSR_AMD64_PATCH_LEVEL 0x0000008b
88#define MSR_AMD64_NB_CFG 0xc001001f 89#define MSR_AMD64_NB_CFG 0xc001001f
90#define MSR_AMD64_PATCH_LOADER 0xc0010020
89#define MSR_AMD64_IBSFETCHCTL 0xc0011030 91#define MSR_AMD64_IBSFETCHCTL 0xc0011030
90#define MSR_AMD64_IBSFETCHLINAD 0xc0011031 92#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
91#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 93#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index c2a812ebde89..638bf6241807 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -22,10 +22,10 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
22} 22}
23 23
24/* 24/*
25 * i386 calling convention returns 64-bit value in edx:eax, while 25 * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A"
26 * x86_64 returns at rax. Also, the "A" constraint does not really 26 * constraint has different meanings. For i386, "A" means exactly
27 * mean rdx:rax in x86_64, so we need specialized behaviour for each 27 * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead,
28 * architecture 28 * it means rax *or* rdx.
29 */ 29 */
30#ifdef CONFIG_X86_64 30#ifdef CONFIG_X86_64
31#define DECLARE_ARGS(val, low, high) unsigned low, high 31#define DECLARE_ARGS(val, low, high) unsigned low, high
@@ -85,7 +85,8 @@ static inline void native_write_msr(unsigned int msr,
85 asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); 85 asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
86} 86}
87 87
88static inline int native_write_msr_safe(unsigned int msr, 88/* Can be uninlined because referenced by paravirt */
89notrace static inline int native_write_msr_safe(unsigned int msr,
89 unsigned low, unsigned high) 90 unsigned low, unsigned high)
90{ 91{
91 int err; 92 int err;
@@ -181,10 +182,10 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
181} 182}
182 183
183#define rdtscl(low) \ 184#define rdtscl(low) \
184 ((low) = (u32)native_read_tsc()) 185 ((low) = (u32)__native_read_tsc())
185 186
186#define rdtscll(val) \ 187#define rdtscll(val) \
187 ((val) = native_read_tsc()) 188 ((val) = __native_read_tsc())
188 189
189#define rdpmc(counter, low, high) \ 190#define rdpmc(counter, low, high) \
190do { \ 191do { \
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 875b38edf193..a977de23cb4d 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -19,6 +19,8 @@ struct pci_sysdata {
19}; 19};
20 20
21extern int pci_routeirq; 21extern int pci_routeirq;
22extern int noioapicquirk;
23extern int noioapicreroute;
22 24
23/* scan a bus after allocating a pci_sysdata for it */ 25/* scan a bus after allocating a pci_sysdata for it */
24extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, 26extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
@@ -82,6 +84,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
82static inline void early_quirks(void) { } 84static inline void early_quirks(void) { }
83#endif 85#endif
84 86
87extern void pci_iommu_alloc(void);
88
85#endif /* __KERNEL__ */ 89#endif /* __KERNEL__ */
86 90
87#ifdef CONFIG_X86_32 91#ifdef CONFIG_X86_32
@@ -98,9 +102,9 @@ static inline void early_quirks(void) { }
98 102
99#ifdef CONFIG_NUMA 103#ifdef CONFIG_NUMA
100/* Returns the node based on pci bus */ 104/* Returns the node based on pci bus */
101static inline int __pcibus_to_node(struct pci_bus *bus) 105static inline int __pcibus_to_node(const struct pci_bus *bus)
102{ 106{
103 struct pci_sysdata *sd = bus->sysdata; 107 const struct pci_sysdata *sd = bus->sysdata;
104 108
105 return sd->node; 109 return sd->node;
106} 110}
@@ -109,6 +113,12 @@ static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
109{ 113{
110 return node_to_cpumask(__pcibus_to_node(bus)); 114 return node_to_cpumask(__pcibus_to_node(bus));
111} 115}
116
117static inline const struct cpumask *
118cpumask_of_pcibus(const struct pci_bus *bus)
119{
120 return cpumask_of_node(__pcibus_to_node(bus));
121}
112#endif 122#endif
113 123
114#endif /* _ASM_X86_PCI_H */ 124#endif /* _ASM_X86_PCI_H */
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index d02d936840a3..4da207982777 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -23,7 +23,6 @@ extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
23 int reg, int len, u32 value); 23 int reg, int len, u32 value);
24 24
25extern void dma32_reserve_bootmem(void); 25extern void dma32_reserve_bootmem(void);
26extern void pci_iommu_alloc(void);
27 26
28/* The PCI address space does equal the physical memory 27/* The PCI address space does equal the physical memory
29 * address space. The networking and block device layers use 28 * address space. The networking and block device layers use
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index b17edfd23628..e0d199fe1d83 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -56,23 +56,55 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
56#define pte_none(x) (!(x).pte_low) 56#define pte_none(x) (!(x).pte_low)
57 57
58/* 58/*
59 * Bits 0, 6 and 7 are taken, split up the 29 bits of offset 59 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
60 * into this range: 60 * split up the 29 bits of offset into this range:
61 */ 61 */
62#define PTE_FILE_MAX_BITS 29 62#define PTE_FILE_MAX_BITS 29
63#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
64#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
65#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1)
66#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1)
67#else
68#define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1)
69#define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1)
70#endif
71#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
72#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
63 73
64#define pte_to_pgoff(pte) \ 74#define pte_to_pgoff(pte) \
65 ((((pte).pte_low >> 1) & 0x1f) + (((pte).pte_low >> 8) << 5)) 75 ((((pte).pte_low >> PTE_FILE_SHIFT1) \
76 & ((1U << PTE_FILE_BITS1) - 1)) \
77 + ((((pte).pte_low >> PTE_FILE_SHIFT2) \
78 & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \
79 + (((pte).pte_low >> PTE_FILE_SHIFT3) \
80 << (PTE_FILE_BITS1 + PTE_FILE_BITS2)))
66 81
67#define pgoff_to_pte(off) \ 82#define pgoff_to_pte(off) \
68 ((pte_t) { .pte_low = (((off) & 0x1f) << 1) + \ 83 ((pte_t) { .pte_low = \
69 (((off) >> 5) << 8) + _PAGE_FILE }) 84 (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \
85 + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \
86 << PTE_FILE_SHIFT2) \
87 + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \
88 << PTE_FILE_SHIFT3) \
89 + _PAGE_FILE })
70 90
71/* Encode and de-code a swap entry */ 91/* Encode and de-code a swap entry */
72#define __swp_type(x) (((x).val >> 1) & 0x1f) 92#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
73#define __swp_offset(x) ((x).val >> 8) 93#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
74#define __swp_entry(type, offset) \ 94#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
75 ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) 95#else
96#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
97#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
98#endif
99
100#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
101
102#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
103 & ((1U << SWP_TYPE_BITS) - 1))
104#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
105#define __swp_entry(type, offset) ((swp_entry_t) { \
106 ((type) << (_PAGE_BIT_PRESENT + 1)) \
107 | ((offset) << SWP_OFFSET_SHIFT) })
76#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) 108#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
77#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) 109#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
78 110
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 52597aeadfff..447da43cddb3 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -166,6 +166,7 @@ static inline int pte_none(pte_t pte)
166#define PTE_FILE_MAX_BITS 32 166#define PTE_FILE_MAX_BITS 32
167 167
168/* Encode and de-code a swap entry */ 168/* Encode and de-code a swap entry */
169#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
169#define __swp_type(x) (((x).val) & 0x1f) 170#define __swp_type(x) (((x).val) & 0x1f)
170#define __swp_offset(x) ((x).val >> 5) 171#define __swp_offset(x) ((x).val >> 5)
171#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) 172#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index c012f3b11671..83e69f4a37f0 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -10,7 +10,6 @@
10#define _PAGE_BIT_PCD 4 /* page cache disabled */ 10#define _PAGE_BIT_PCD 4 /* page cache disabled */
11#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ 11#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
12#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ 12#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
13#define _PAGE_BIT_FILE 6
14#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ 13#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
15#define _PAGE_BIT_PAT 7 /* on 4KB pages */ 14#define _PAGE_BIT_PAT 7 /* on 4KB pages */
16#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ 15#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
@@ -22,6 +21,12 @@
22#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 21#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
23#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 22#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
24 23
24/* If _PAGE_BIT_PRESENT is clear, we use these: */
25/* - if the user mapped it with PROT_NONE; pte_present gives true */
26#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
27/* - set: nonlinear file mapping, saved PTE; unset:swap */
28#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
29
25#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) 30#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
26#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) 31#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
27#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) 32#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
@@ -46,11 +51,8 @@
46#define _PAGE_NX (_AT(pteval_t, 0)) 51#define _PAGE_NX (_AT(pteval_t, 0))
47#endif 52#endif
48 53
49/* If _PAGE_PRESENT is clear, we use these: */ 54#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
50#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, 55#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
51 * saved PTE; unset:swap */
52#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
53 pte_present gives true */
54 56
55#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 57#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
56 _PAGE_ACCESSED | _PAGE_DIRTY) 58 _PAGE_ACCESSED | _PAGE_DIRTY)
@@ -158,8 +160,19 @@
158#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ 160#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
159#endif 161#endif
160 162
163/*
164 * Macro to mark a page protection value as UC-
165 */
166#define pgprot_noncached(prot) \
167 ((boot_cpu_data.x86 > 3) \
168 ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \
169 : (prot))
170
161#ifndef __ASSEMBLY__ 171#ifndef __ASSEMBLY__
162 172
173#define pgprot_writecombine pgprot_writecombine
174extern pgprot_t pgprot_writecombine(pgprot_t prot);
175
163/* 176/*
164 * ZERO_PAGE is a global shared page that is always zero: used 177 * ZERO_PAGE is a global shared page that is always zero: used
165 * for zero-mapped memory areas etc.. 178 * for zero-mapped memory areas etc..
@@ -329,6 +342,9 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
329#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) 342#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
330 343
331#ifndef __ASSEMBLY__ 344#ifndef __ASSEMBLY__
345/* Indicate that x86 has its own track and untrack pfn vma functions */
346#define __HAVE_PFNMAP_TRACKING
347
332#define __HAVE_PHYS_MEM_ACCESS_PROT 348#define __HAVE_PHYS_MEM_ACCESS_PROT
333struct file; 349struct file;
334pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 350pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index f9d5889b336b..72b020deb46b 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -101,15 +101,6 @@ extern unsigned long pg0[];
101#endif 101#endif
102 102
103/* 103/*
104 * Macro to mark a page protection value as "uncacheable".
105 * On processors which do not support it, this is a no-op.
106 */
107#define pgprot_noncached(prot) \
108 ((boot_cpu_data.x86 > 3) \
109 ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
110 : (prot))
111
112/*
113 * Conversion functions: convert a page and protection to a page entry, 104 * Conversion functions: convert a page and protection to a page entry,
114 * and a page entry and page directory to the page they refer to. 105 * and a page entry and page directory to the page they refer to.
115 */ 106 */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 545a0e042bb2..ba09289accaa 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -146,7 +146,7 @@ static inline void native_pgd_clear(pgd_t *pgd)
146#define PGDIR_MASK (~(PGDIR_SIZE - 1)) 146#define PGDIR_MASK (~(PGDIR_SIZE - 1))
147 147
148 148
149#define MAXMEM _AC(0x00003fffffffffff, UL) 149#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
150#define VMALLOC_START _AC(0xffffc20000000000, UL) 150#define VMALLOC_START _AC(0xffffc20000000000, UL)
151#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) 151#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
152#define VMEMMAP_START _AC(0xffffe20000000000, UL) 152#define VMEMMAP_START _AC(0xffffe20000000000, UL)
@@ -177,12 +177,6 @@ static inline int pmd_bad(pmd_t pmd)
177#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */ 177#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
178 178
179/* 179/*
180 * Macro to mark a page protection value as "uncacheable".
181 */
182#define pgprot_noncached(prot) \
183 (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
184
185/*
186 * Conversion functions: convert a page and protection to a page entry, 180 * Conversion functions: convert a page and protection to a page entry,
187 * and a page entry and page directory to the page they refer to. 181 * and a page entry and page directory to the page they refer to.
188 */ 182 */
@@ -250,10 +244,22 @@ static inline int pud_large(pud_t pte)
250extern int direct_gbpages; 244extern int direct_gbpages;
251 245
252/* Encode and de-code a swap entry */ 246/* Encode and de-code a swap entry */
253#define __swp_type(x) (((x).val >> 1) & 0x3f) 247#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
254#define __swp_offset(x) ((x).val >> 8) 248#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
255#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | \ 249#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
256 ((offset) << 8) }) 250#else
251#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
252#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
253#endif
254
255#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
256
257#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
258 & ((1U << SWP_TYPE_BITS) - 1))
259#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
260#define __swp_entry(type, offset) ((swp_entry_t) { \
261 ((type) << (_PAGE_BIT_PRESENT + 1)) \
262 | ((offset) << SWP_OFFSET_SHIFT) })
257#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) 263#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
258#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) 264#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
259 265
diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h
index fe681147a4f7..a8894647dd9a 100644
--- a/arch/x86/include/asm/prctl.h
+++ b/arch/x86/include/asm/prctl.h
@@ -6,5 +6,8 @@
6#define ARCH_GET_FS 0x1003 6#define ARCH_GET_FS 0x1003
7#define ARCH_GET_GS 0x1004 7#define ARCH_GET_GS 0x1004
8 8
9#ifdef CONFIG_X86_64
10extern long sys_arch_prctl(int, unsigned long);
11#endif /* CONFIG_X86_64 */
9 12
10#endif /* _ASM_X86_PRCTL_H */ 13#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5ca01e383269..091cd8855f2e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -110,6 +110,7 @@ struct cpuinfo_x86 {
110 /* Index into per_cpu list: */ 110 /* Index into per_cpu list: */
111 u16 cpu_index; 111 u16 cpu_index;
112#endif 112#endif
113 unsigned int x86_hyper_vendor;
113} __attribute__((__aligned__(SMP_CACHE_BYTES))); 114} __attribute__((__aligned__(SMP_CACHE_BYTES)));
114 115
115#define X86_VENDOR_INTEL 0 116#define X86_VENDOR_INTEL 0
@@ -123,6 +124,9 @@ struct cpuinfo_x86 {
123 124
124#define X86_VENDOR_UNKNOWN 0xff 125#define X86_VENDOR_UNKNOWN 0xff
125 126
127#define X86_HYPER_VENDOR_NONE 0
128#define X86_HYPER_VENDOR_VMWARE 1
129
126/* 130/*
127 * capabilities of CPUs 131 * capabilities of CPUs
128 */ 132 */
@@ -752,6 +756,19 @@ extern void switch_to_new_gdt(void);
752extern void cpu_init(void); 756extern void cpu_init(void);
753extern void init_gdt(int cpu); 757extern void init_gdt(int cpu);
754 758
759static inline unsigned long get_debugctlmsr(void)
760{
761 unsigned long debugctlmsr = 0;
762
763#ifndef CONFIG_X86_DEBUGCTLMSR
764 if (boot_cpu_data.x86 < 6)
765 return 0;
766#endif
767 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
768
769 return debugctlmsr;
770}
771
755static inline void update_debugctlmsr(unsigned long debugctlmsr) 772static inline void update_debugctlmsr(unsigned long debugctlmsr)
756{ 773{
757#ifndef CONFIG_X86_DEBUGCTLMSR 774#ifndef CONFIG_X86_DEBUGCTLMSR
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index eefb0594b058..6d34d954c228 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -6,7 +6,6 @@
6#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7 7
8#ifdef __KERNEL__ 8#ifdef __KERNEL__
9#include <asm/ds.h> /* the DS BTS struct is used for ptrace too */
10#include <asm/segment.h> 9#include <asm/segment.h>
11#endif 10#endif
12 11
@@ -128,34 +127,6 @@ struct pt_regs {
128#endif /* !__i386__ */ 127#endif /* !__i386__ */
129 128
130 129
131#ifdef CONFIG_X86_PTRACE_BTS
132/* a branch trace record entry
133 *
134 * In order to unify the interface between various processor versions,
135 * we use the below data structure for all processors.
136 */
137enum bts_qualifier {
138 BTS_INVALID = 0,
139 BTS_BRANCH,
140 BTS_TASK_ARRIVES,
141 BTS_TASK_DEPARTS
142};
143
144struct bts_struct {
145 __u64 qualifier;
146 union {
147 /* BTS_BRANCH */
148 struct {
149 __u64 from_ip;
150 __u64 to_ip;
151 } lbr;
152 /* BTS_TASK_ARRIVES or
153 BTS_TASK_DEPARTS */
154 __u64 jiffies;
155 } variant;
156};
157#endif /* CONFIG_X86_PTRACE_BTS */
158
159#ifdef __KERNEL__ 130#ifdef __KERNEL__
160 131
161#include <linux/init.h> 132#include <linux/init.h>
@@ -163,13 +134,6 @@ struct bts_struct {
163struct cpuinfo_x86; 134struct cpuinfo_x86;
164struct task_struct; 135struct task_struct;
165 136
166#ifdef CONFIG_X86_PTRACE_BTS
167extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *);
168extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier);
169#else
170#define ptrace_bts_init_intel(config) do {} while (0)
171#endif /* CONFIG_X86_PTRACE_BTS */
172
173extern unsigned long profile_pc(struct pt_regs *regs); 137extern unsigned long profile_pc(struct pt_regs *regs);
174 138
175extern unsigned long 139extern unsigned long
@@ -271,6 +235,13 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
271extern int do_set_thread_area(struct task_struct *p, int idx, 235extern int do_set_thread_area(struct task_struct *p, int idx,
272 struct user_desc __user *info, int can_allocate); 236 struct user_desc __user *info, int can_allocate);
273 237
238extern void x86_ptrace_untrace(struct task_struct *);
239extern void x86_ptrace_fork(struct task_struct *child,
240 unsigned long clone_flags);
241
242#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk)
243#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags)
244
274#endif /* __KERNEL__ */ 245#endif /* __KERNEL__ */
275 246
276#endif /* !__ASSEMBLY__ */ 247#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 294daeb3a006..4fcd53fd5f43 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -8,6 +8,10 @@
8/* Interrupt control for vSMPowered x86_64 systems */ 8/* Interrupt control for vSMPowered x86_64 systems */
9void vsmp_init(void); 9void vsmp_init(void);
10 10
11
12void setup_bios_corruption_check(void);
13
14
11#ifdef CONFIG_X86_VISWS 15#ifdef CONFIG_X86_VISWS
12extern void visws_early_detect(void); 16extern void visws_early_detect(void);
13extern int is_visws_box(void); 17extern int is_visws_box(void);
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
new file mode 100644
index 000000000000..4e0fe26d27d3
--- /dev/null
+++ b/arch/x86/include/asm/sigframe.h
@@ -0,0 +1,70 @@
1#ifndef _ASM_X86_SIGFRAME_H
2#define _ASM_X86_SIGFRAME_H
3
4#include <asm/sigcontext.h>
5#include <asm/siginfo.h>
6#include <asm/ucontext.h>
7
8#ifdef CONFIG_X86_32
9#define sigframe_ia32 sigframe
10#define rt_sigframe_ia32 rt_sigframe
11#define sigcontext_ia32 sigcontext
12#define _fpstate_ia32 _fpstate
13#define ucontext_ia32 ucontext
14#else /* !CONFIG_X86_32 */
15
16#ifdef CONFIG_IA32_EMULATION
17#include <asm/ia32.h>
18#endif /* CONFIG_IA32_EMULATION */
19
20#endif /* CONFIG_X86_32 */
21
22#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
23struct sigframe_ia32 {
24 u32 pretcode;
25 int sig;
26 struct sigcontext_ia32 sc;
27 /*
28 * fpstate is unused. fpstate is moved/allocated after
29 * retcode[] below. This movement allows to have the FP state and the
30 * future state extensions (xsave) stay together.
31 * And at the same time retaining the unused fpstate, prevents changing
32 * the offset of extramask[] in the sigframe and thus prevent any
33 * legacy application accessing/modifying it.
34 */
35 struct _fpstate_ia32 fpstate_unused;
36#ifdef CONFIG_IA32_EMULATION
37 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
38#else /* !CONFIG_IA32_EMULATION */
39 unsigned long extramask[_NSIG_WORDS-1];
40#endif /* CONFIG_IA32_EMULATION */
41 char retcode[8];
42 /* fp state follows here */
43};
44
45struct rt_sigframe_ia32 {
46 u32 pretcode;
47 int sig;
48 u32 pinfo;
49 u32 puc;
50#ifdef CONFIG_IA32_EMULATION
51 compat_siginfo_t info;
52#else /* !CONFIG_IA32_EMULATION */
53 struct siginfo info;
54#endif /* CONFIG_IA32_EMULATION */
55 struct ucontext_ia32 uc;
56 char retcode[8];
57 /* fp state follows here */
58};
59#endif /* defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) */
60
61#ifdef CONFIG_X86_64
62struct rt_sigframe {
63 char __user *pretcode;
64 struct ucontext uc;
65 struct siginfo info;
66 /* fp state follows here */
67};
68#endif /* CONFIG_X86_64 */
69
70#endif /* _ASM_X86_SIGFRAME_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 96ac44f275da..7761a5d554bb 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -121,6 +121,10 @@ typedef unsigned long sigset_t;
121 121
122#ifndef __ASSEMBLY__ 122#ifndef __ASSEMBLY__
123 123
124# ifdef __KERNEL__
125extern void do_notify_resume(struct pt_regs *, void *, __u32);
126# endif /* __KERNEL__ */
127
124#ifdef __i386__ 128#ifdef __i386__
125# ifdef __KERNEL__ 129# ifdef __KERNEL__
126struct old_sigaction { 130struct old_sigaction {
@@ -141,8 +145,6 @@ struct k_sigaction {
141 struct sigaction sa; 145 struct sigaction sa;
142}; 146};
143 147
144extern void do_notify_resume(struct pt_regs *, void *, __u32);
145
146# else /* __KERNEL__ */ 148# else /* __KERNEL__ */
147/* Here we must cater to libcs that poke about in kernel headers. */ 149/* Here we must cater to libcs that poke about in kernel headers. */
148 150
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index be44f7dab395..e3cc3c063ec5 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,7 +27,7 @@
27#else /* CONFIG_X86_32 */ 27#else /* CONFIG_X86_32 */
28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ 28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
29# define MAX_PHYSADDR_BITS 44 29# define MAX_PHYSADDR_BITS 44
30# define MAX_PHYSMEM_BITS 44 30# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */
31#endif 31#endif
32 32
33#endif /* CONFIG_SPARSEMEM */ 33#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 87803da44010..9c6797c3e56c 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -19,6 +19,13 @@
19/* kernel/ioport.c */ 19/* kernel/ioport.c */
20asmlinkage long sys_ioperm(unsigned long, unsigned long, int); 20asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
21 21
22/* kernel/ldt.c */
23asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
24
25/* kernel/tls.c */
26asmlinkage int sys_set_thread_area(struct user_desc __user *);
27asmlinkage int sys_get_thread_area(struct user_desc __user *);
28
22/* X86_32 only */ 29/* X86_32 only */
23#ifdef CONFIG_X86_32 30#ifdef CONFIG_X86_32
24/* kernel/process_32.c */ 31/* kernel/process_32.c */
@@ -33,14 +40,11 @@ asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
33 struct old_sigaction __user *); 40 struct old_sigaction __user *);
34asmlinkage int sys_sigaltstack(unsigned long); 41asmlinkage int sys_sigaltstack(unsigned long);
35asmlinkage unsigned long sys_sigreturn(unsigned long); 42asmlinkage unsigned long sys_sigreturn(unsigned long);
36asmlinkage int sys_rt_sigreturn(unsigned long); 43asmlinkage int sys_rt_sigreturn(struct pt_regs);
37 44
38/* kernel/ioport.c */ 45/* kernel/ioport.c */
39asmlinkage long sys_iopl(unsigned long); 46asmlinkage long sys_iopl(unsigned long);
40 47
41/* kernel/ldt.c */
42asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
43
44/* kernel/sys_i386_32.c */ 48/* kernel/sys_i386_32.c */
45asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, 49asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
46 unsigned long, unsigned long, unsigned long); 50 unsigned long, unsigned long, unsigned long);
@@ -54,10 +58,6 @@ asmlinkage int sys_uname(struct old_utsname __user *);
54struct oldold_utsname; 58struct oldold_utsname;
55asmlinkage int sys_olduname(struct oldold_utsname __user *); 59asmlinkage int sys_olduname(struct oldold_utsname __user *);
56 60
57/* kernel/tls.c */
58asmlinkage int sys_set_thread_area(struct user_desc __user *);
59asmlinkage int sys_get_thread_area(struct user_desc __user *);
60
61/* kernel/vm86_32.c */ 61/* kernel/vm86_32.c */
62asmlinkage int sys_vm86old(struct pt_regs); 62asmlinkage int sys_vm86old(struct pt_regs);
63asmlinkage int sys_vm86(struct pt_regs); 63asmlinkage int sys_vm86(struct pt_regs);
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 07c3e4048991..8e626ea33a1a 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -17,12 +17,12 @@
17# define AT_VECTOR_SIZE_ARCH 1 17# define AT_VECTOR_SIZE_ARCH 1
18#endif 18#endif
19 19
20#ifdef CONFIG_X86_32
21
22struct task_struct; /* one of the stranger aspects of C forward declarations */ 20struct task_struct; /* one of the stranger aspects of C forward declarations */
23struct task_struct *__switch_to(struct task_struct *prev, 21struct task_struct *__switch_to(struct task_struct *prev,
24 struct task_struct *next); 22 struct task_struct *next);
25 23
24#ifdef CONFIG_X86_32
25
26/* 26/*
27 * Saving eflags is important. It switches not only IOPL between tasks, 27 * Saving eflags is important. It switches not only IOPL between tasks,
28 * it also protects other tasks from NT leaking through sysenter etc. 28 * it also protects other tasks from NT leaking through sysenter etc.
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 0921b4018c11..98789647baa9 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -26,7 +26,7 @@ struct exec_domain;
26struct thread_info { 26struct thread_info {
27 struct task_struct *task; /* main task structure */ 27 struct task_struct *task; /* main task structure */
28 struct exec_domain *exec_domain; /* execution domain */ 28 struct exec_domain *exec_domain; /* execution domain */
29 unsigned long flags; /* low level flags */ 29 __u32 flags; /* low level flags */
30 __u32 status; /* thread synchronous flags */ 30 __u32 status; /* thread synchronous flags */
31 __u32 cpu; /* current CPU */ 31 __u32 cpu; /* current CPU */
32 int preempt_count; /* 0 => preemptable, 32 int preempt_count; /* 0 => preemptable,
@@ -93,7 +93,6 @@ struct thread_info {
93#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 93#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
94#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 94#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
95#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ 95#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
96#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
97 96
98#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 97#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
99#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 98#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -115,7 +114,6 @@ struct thread_info {
115#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 114#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
116#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 115#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
117#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) 116#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
118#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
119 117
120/* work to do in syscall_trace_enter() */ 118/* work to do in syscall_trace_enter() */
121#define _TIF_WORK_SYSCALL_ENTRY \ 119#define _TIF_WORK_SYSCALL_ENTRY \
@@ -141,8 +139,7 @@ struct thread_info {
141 139
142/* flags to check in __switch_to() */ 140/* flags to check in __switch_to() */
143#define _TIF_WORK_CTXSW \ 141#define _TIF_WORK_CTXSW \
144 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ 142 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
145 _TIF_NOTSC)
146 143
147#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW 144#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
148#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) 145#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 79e31e9dcdda..4e2f2e0aab27 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -61,13 +61,19 @@ static inline int cpu_to_node(int cpu)
61 * 61 *
62 * Side note: this function creates the returned cpumask on the stack 62 * Side note: this function creates the returned cpumask on the stack
63 * so with a high NR_CPUS count, excessive stack space is used. The 63 * so with a high NR_CPUS count, excessive stack space is used. The
64 * node_to_cpumask_ptr function should be used whenever possible. 64 * cpumask_of_node function should be used whenever possible.
65 */ 65 */
66static inline cpumask_t node_to_cpumask(int node) 66static inline cpumask_t node_to_cpumask(int node)
67{ 67{
68 return node_to_cpumask_map[node]; 68 return node_to_cpumask_map[node];
69} 69}
70 70
71/* Returns a bitmask of CPUs on Node 'node'. */
72static inline const struct cpumask *cpumask_of_node(int node)
73{
74 return &node_to_cpumask_map[node];
75}
76
71#else /* CONFIG_X86_64 */ 77#else /* CONFIG_X86_64 */
72 78
73/* Mappings between node number and cpus on that node. */ 79/* Mappings between node number and cpus on that node. */
@@ -82,7 +88,7 @@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
82#ifdef CONFIG_DEBUG_PER_CPU_MAPS 88#ifdef CONFIG_DEBUG_PER_CPU_MAPS
83extern int cpu_to_node(int cpu); 89extern int cpu_to_node(int cpu);
84extern int early_cpu_to_node(int cpu); 90extern int early_cpu_to_node(int cpu);
85extern const cpumask_t *_node_to_cpumask_ptr(int node); 91extern const cpumask_t *cpumask_of_node(int node);
86extern cpumask_t node_to_cpumask(int node); 92extern cpumask_t node_to_cpumask(int node);
87 93
88#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ 94#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
@@ -103,7 +109,7 @@ static inline int early_cpu_to_node(int cpu)
103} 109}
104 110
105/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ 111/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
106static inline const cpumask_t *_node_to_cpumask_ptr(int node) 112static inline const cpumask_t *cpumask_of_node(int node)
107{ 113{
108 return &node_to_cpumask_map[node]; 114 return &node_to_cpumask_map[node];
109} 115}
@@ -116,12 +122,15 @@ static inline cpumask_t node_to_cpumask(int node)
116 122
117#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 123#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
118 124
119/* Replace default node_to_cpumask_ptr with optimized version */ 125/*
126 * Replace default node_to_cpumask_ptr with optimized version
127 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
128 */
120#define node_to_cpumask_ptr(v, node) \ 129#define node_to_cpumask_ptr(v, node) \
121 const cpumask_t *v = _node_to_cpumask_ptr(node) 130 const cpumask_t *v = cpumask_of_node(node)
122 131
123#define node_to_cpumask_ptr_next(v, node) \ 132#define node_to_cpumask_ptr_next(v, node) \
124 v = _node_to_cpumask_ptr(node) 133 v = cpumask_of_node(node)
125 134
126#endif /* CONFIG_X86_64 */ 135#endif /* CONFIG_X86_64 */
127 136
@@ -187,7 +196,7 @@ extern int __node_distance(int, int);
187#define cpu_to_node(cpu) 0 196#define cpu_to_node(cpu) 0
188#define early_cpu_to_node(cpu) 0 197#define early_cpu_to_node(cpu) 0
189 198
190static inline const cpumask_t *_node_to_cpumask_ptr(int node) 199static inline const cpumask_t *cpumask_of_node(int node)
191{ 200{
192 return &cpu_online_map; 201 return &cpu_online_map;
193} 202}
@@ -200,12 +209,15 @@ static inline int node_to_first_cpu(int node)
200 return first_cpu(cpu_online_map); 209 return first_cpu(cpu_online_map);
201} 210}
202 211
203/* Replace default node_to_cpumask_ptr with optimized version */ 212/*
213 * Replace default node_to_cpumask_ptr with optimized version
214 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
215 */
204#define node_to_cpumask_ptr(v, node) \ 216#define node_to_cpumask_ptr(v, node) \
205 const cpumask_t *v = _node_to_cpumask_ptr(node) 217 const cpumask_t *v = cpumask_of_node(node)
206 218
207#define node_to_cpumask_ptr_next(v, node) \ 219#define node_to_cpumask_ptr_next(v, node) \
208 v = _node_to_cpumask_ptr(node) 220 v = cpumask_of_node(node)
209#endif 221#endif
210 222
211#include <asm-generic/topology.h> 223#include <asm-generic/topology.h>
@@ -214,12 +226,12 @@ static inline int node_to_first_cpu(int node)
214/* Returns the number of the first CPU on Node 'node'. */ 226/* Returns the number of the first CPU on Node 'node'. */
215static inline int node_to_first_cpu(int node) 227static inline int node_to_first_cpu(int node)
216{ 228{
217 node_to_cpumask_ptr(mask, node); 229 return cpumask_first(cpumask_of_node(node));
218 return first_cpu(*mask);
219} 230}
220#endif 231#endif
221 232
222extern cpumask_t cpu_coregroup_map(int cpu); 233extern cpumask_t cpu_coregroup_map(int cpu);
234extern const struct cpumask *cpu_coregroup_mask(int cpu);
223 235
224#ifdef ENABLE_TOPO_DEFINES 236#ifdef ENABLE_TOPO_DEFINES
225#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) 237#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index fa0d79facdbc..780ba0ab94f9 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -3,6 +3,7 @@
3 3
4#ifndef __ASSEMBLY__ 4#ifndef __ASSEMBLY__
5 5
6#ifdef CONFIG_X86_TRAMPOLINE
6/* 7/*
7 * Trampoline 80x86 program as an array. 8 * Trampoline 80x86 program as an array.
8 */ 9 */
@@ -13,8 +14,14 @@ extern unsigned char *trampoline_base;
13extern unsigned long init_rsp; 14extern unsigned long init_rsp;
14extern unsigned long initial_code; 15extern unsigned long initial_code;
15 16
17#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
16#define TRAMPOLINE_BASE 0x6000 18#define TRAMPOLINE_BASE 0x6000
19
17extern unsigned long setup_trampoline(void); 20extern unsigned long setup_trampoline(void);
21extern void __init reserve_trampoline_memory(void);
22#else
23static inline void reserve_trampoline_memory(void) {};
24#endif /* CONFIG_X86_TRAMPOLINE */
18 25
19#endif /* __ASSEMBLY__ */ 26#endif /* __ASSEMBLY__ */
20 27
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 45dee286e45c..2ee0a3bceedf 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -46,6 +46,10 @@ dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long);
46dotraplinkage void do_invalid_TSS(struct pt_regs *, long); 46dotraplinkage void do_invalid_TSS(struct pt_regs *, long);
47dotraplinkage void do_segment_not_present(struct pt_regs *, long); 47dotraplinkage void do_segment_not_present(struct pt_regs *, long);
48dotraplinkage void do_stack_segment(struct pt_regs *, long); 48dotraplinkage void do_stack_segment(struct pt_regs *, long);
49#ifdef CONFIG_X86_64
50dotraplinkage void do_double_fault(struct pt_regs *, long);
51asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *);
52#endif
49dotraplinkage void do_general_protection(struct pt_regs *, long); 53dotraplinkage void do_general_protection(struct pt_regs *, long);
50dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); 54dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
51dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); 55dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
@@ -72,10 +76,13 @@ static inline int get_si_code(unsigned long condition)
72extern int panic_on_unrecovered_nmi; 76extern int panic_on_unrecovered_nmi;
73extern int kstack_depth_to_print; 77extern int kstack_depth_to_print;
74 78
75#ifdef CONFIG_X86_32
76void math_error(void __user *); 79void math_error(void __user *);
77unsigned long patch_espfix_desc(unsigned long, unsigned long);
78asmlinkage void math_emulate(long); 80asmlinkage void math_emulate(long);
81#ifdef CONFIG_X86_32
82unsigned long patch_espfix_desc(unsigned long, unsigned long);
83#else
84asmlinkage void smp_thermal_interrupt(void);
85asmlinkage void mce_threshold_interrupt(void);
79#endif 86#endif
80 87
81#endif /* _ASM_X86_TRAPS_H */ 88#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 9cd83a8e40d5..38ae163cc91b 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -34,8 +34,6 @@ static inline cycles_t get_cycles(void)
34 34
35static __always_inline cycles_t vget_cycles(void) 35static __always_inline cycles_t vget_cycles(void)
36{ 36{
37 cycles_t cycles;
38
39 /* 37 /*
40 * We only do VDSOs on TSC capable CPUs, so this shouldnt 38 * We only do VDSOs on TSC capable CPUs, so this shouldnt
41 * access boot_cpu_data (which is not VDSO-safe): 39 * access boot_cpu_data (which is not VDSO-safe):
@@ -44,11 +42,7 @@ static __always_inline cycles_t vget_cycles(void)
44 if (!cpu_has_tsc) 42 if (!cpu_has_tsc)
45 return 0; 43 return 0;
46#endif 44#endif
47 rdtsc_barrier(); 45 return (cycles_t)__native_read_tsc();
48 cycles = (cycles_t)__native_read_tsc();
49 rdtsc_barrier();
50
51 return cycles;
52} 46}
53 47
54extern void tsc_init(void); 48extern void tsc_init(void);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 99192bb55a53..4340055b7559 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -352,14 +352,14 @@ do { \
352 352
353#define __put_user_nocheck(x, ptr, size) \ 353#define __put_user_nocheck(x, ptr, size) \
354({ \ 354({ \
355 long __pu_err; \ 355 int __pu_err; \
356 __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ 356 __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
357 __pu_err; \ 357 __pu_err; \
358}) 358})
359 359
360#define __get_user_nocheck(x, ptr, size) \ 360#define __get_user_nocheck(x, ptr, size) \
361({ \ 361({ \
362 long __gu_err; \ 362 int __gu_err; \
363 unsigned long __gu_val; \ 363 unsigned long __gu_val; \
364 __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ 364 __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
365 (x) = (__force __typeof__(*(ptr)))__gu_val; \ 365 (x) = (__force __typeof__(*(ptr)))__gu_val; \
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index d931d3b7e6f7..7ed17ff502b9 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -32,13 +32,18 @@
32enum uv_bios_cmd { 32enum uv_bios_cmd {
33 UV_BIOS_COMMON, 33 UV_BIOS_COMMON,
34 UV_BIOS_GET_SN_INFO, 34 UV_BIOS_GET_SN_INFO,
35 UV_BIOS_FREQ_BASE 35 UV_BIOS_FREQ_BASE,
36 UV_BIOS_WATCHLIST_ALLOC,
37 UV_BIOS_WATCHLIST_FREE,
38 UV_BIOS_MEMPROTECT,
39 UV_BIOS_GET_PARTITION_ADDR
36}; 40};
37 41
38/* 42/*
39 * Status values returned from a BIOS call. 43 * Status values returned from a BIOS call.
40 */ 44 */
41enum { 45enum {
46 BIOS_STATUS_MORE_PASSES = 1,
42 BIOS_STATUS_SUCCESS = 0, 47 BIOS_STATUS_SUCCESS = 0,
43 BIOS_STATUS_UNIMPLEMENTED = -ENOSYS, 48 BIOS_STATUS_UNIMPLEMENTED = -ENOSYS,
44 BIOS_STATUS_EINVAL = -EINVAL, 49 BIOS_STATUS_EINVAL = -EINVAL,
@@ -71,6 +76,21 @@ union partition_info_u {
71 }; 76 };
72}; 77};
73 78
79union uv_watchlist_u {
80 u64 val;
81 struct {
82 u64 blade : 16,
83 size : 32,
84 filler : 16;
85 };
86};
87
88enum uv_memprotect {
89 UV_MEMPROT_RESTRICT_ACCESS,
90 UV_MEMPROT_ALLOW_AMO,
91 UV_MEMPROT_ALLOW_RW
92};
93
74/* 94/*
75 * bios calls have 6 parameters 95 * bios calls have 6 parameters
76 */ 96 */
@@ -80,14 +100,20 @@ extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
80 100
81extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); 101extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *);
82extern s64 uv_bios_freq_base(u64, u64 *); 102extern s64 uv_bios_freq_base(u64, u64 *);
103extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int,
104 unsigned long *);
105extern int uv_bios_mq_watchlist_free(int, int);
106extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
107extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
83 108
84extern void uv_bios_init(void); 109extern void uv_bios_init(void);
85 110
111extern unsigned long sn_rtc_cycles_per_second;
86extern int uv_type; 112extern int uv_type;
87extern long sn_partition_id; 113extern long sn_partition_id;
88extern long uv_coherency_id; 114extern long sn_coherency_id;
89extern long uv_region_size; 115extern long sn_region_size;
90#define partition_coherence_id() (uv_coherency_id) 116#define partition_coherence_id() (sn_coherency_id)
91 117
92extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ 118extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
93 119
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 7a5782610b2b..777327ef05c1 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -113,25 +113,37 @@
113 */ 113 */
114#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2) 114#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2)
115 115
116struct uv_scir_s {
117 struct timer_list timer;
118 unsigned long offset;
119 unsigned long last;
120 unsigned long idle_on;
121 unsigned long idle_off;
122 unsigned char state;
123 unsigned char enabled;
124};
125
116/* 126/*
117 * The following defines attributes of the HUB chip. These attributes are 127 * The following defines attributes of the HUB chip. These attributes are
118 * frequently referenced and are kept in the per-cpu data areas of each cpu. 128 * frequently referenced and are kept in the per-cpu data areas of each cpu.
119 * They are kept together in a struct to minimize cache misses. 129 * They are kept together in a struct to minimize cache misses.
120 */ 130 */
121struct uv_hub_info_s { 131struct uv_hub_info_s {
122 unsigned long global_mmr_base; 132 unsigned long global_mmr_base;
123 unsigned long gpa_mask; 133 unsigned long gpa_mask;
124 unsigned long gnode_upper; 134 unsigned long gnode_upper;
125 unsigned long lowmem_remap_top; 135 unsigned long lowmem_remap_top;
126 unsigned long lowmem_remap_base; 136 unsigned long lowmem_remap_base;
127 unsigned short pnode; 137 unsigned short pnode;
128 unsigned short pnode_mask; 138 unsigned short pnode_mask;
129 unsigned short coherency_domain_number; 139 unsigned short coherency_domain_number;
130 unsigned short numa_blade_id; 140 unsigned short numa_blade_id;
131 unsigned char blade_processor_id; 141 unsigned char blade_processor_id;
132 unsigned char m_val; 142 unsigned char m_val;
133 unsigned char n_val; 143 unsigned char n_val;
144 struct uv_scir_s scir;
134}; 145};
146
135DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 147DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
136#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) 148#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
137#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) 149#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
@@ -163,6 +175,30 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
163 175
164#define UV_APIC_PNODE_SHIFT 6 176#define UV_APIC_PNODE_SHIFT 6
165 177
178/* Local Bus from cpu's perspective */
179#define LOCAL_BUS_BASE 0x1c00000
180#define LOCAL_BUS_SIZE (4 * 1024 * 1024)
181
182/*
183 * System Controller Interface Reg
184 *
185 * Note there are NO leds on a UV system. This register is only
186 * used by the system controller to monitor system-wide operation.
187 * There are 64 regs per node. With Nahelem cpus (2 cores per node,
188 * 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on
189 * a node.
190 *
191 * The window is located at top of ACPI MMR space
192 */
193#define SCIR_WINDOW_COUNT 64
194#define SCIR_LOCAL_MMR_BASE (LOCAL_BUS_BASE + \
195 LOCAL_BUS_SIZE - \
196 SCIR_WINDOW_COUNT)
197
198#define SCIR_CPU_HEARTBEAT 0x01 /* timer interrupt */
199#define SCIR_CPU_ACTIVITY 0x02 /* not idle */
200#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */
201
166/* 202/*
167 * Macros for converting between kernel virtual addresses, socket local physical 203 * Macros for converting between kernel virtual addresses, socket local physical
168 * addresses, and UV global physical addresses. 204 * addresses, and UV global physical addresses.
@@ -174,7 +210,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
174static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) 210static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
175{ 211{
176 if (paddr < uv_hub_info->lowmem_remap_top) 212 if (paddr < uv_hub_info->lowmem_remap_top)
177 paddr += uv_hub_info->lowmem_remap_base; 213 paddr |= uv_hub_info->lowmem_remap_base;
178 return paddr | uv_hub_info->gnode_upper; 214 return paddr | uv_hub_info->gnode_upper;
179} 215}
180 216
@@ -182,19 +218,7 @@ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
182/* socket virtual --> UV global physical address */ 218/* socket virtual --> UV global physical address */
183static inline unsigned long uv_gpa(void *v) 219static inline unsigned long uv_gpa(void *v)
184{ 220{
185 return __pa(v) | uv_hub_info->gnode_upper; 221 return uv_soc_phys_ram_to_gpa(__pa(v));
186}
187
188/* socket virtual --> UV global physical address */
189static inline void *uv_vgpa(void *v)
190{
191 return (void *)uv_gpa(v);
192}
193
194/* UV global physical address --> socket virtual */
195static inline void *uv_va(unsigned long gpa)
196{
197 return __va(gpa & uv_hub_info->gpa_mask);
198} 222}
199 223
200/* pnode, offset --> socket virtual */ 224/* pnode, offset --> socket virtual */
@@ -277,6 +301,16 @@ static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
277 *uv_local_mmr_address(offset) = val; 301 *uv_local_mmr_address(offset) = val;
278} 302}
279 303
304static inline unsigned char uv_read_local_mmr8(unsigned long offset)
305{
306 return *((unsigned char *)uv_local_mmr_address(offset));
307}
308
309static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val)
310{
311 *((unsigned char *)uv_local_mmr_address(offset)) = val;
312}
313
280/* 314/*
281 * Structures and definitions for converting between cpu, node, pnode, and blade 315 * Structures and definitions for converting between cpu, node, pnode, and blade
282 * numbers. 316 * numbers.
@@ -351,5 +385,20 @@ static inline int uv_num_possible_blades(void)
351 return uv_possible_blades; 385 return uv_possible_blades;
352} 386}
353 387
354#endif /* _ASM_X86_UV_UV_HUB_H */ 388/* Update SCIR state */
389static inline void uv_set_scir_bits(unsigned char value)
390{
391 if (uv_hub_info->scir.state != value) {
392 uv_hub_info->scir.state = value;
393 uv_write_local_mmr8(uv_hub_info->scir.offset, value);
394 }
395}
396static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
397{
398 if (uv_cpu_hub_info(cpu)->scir.state != value) {
399 uv_cpu_hub_info(cpu)->scir.state = value;
400 uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value);
401 }
402}
355 403
404#endif /* _ASM_X86_UV_UV_HUB_H */
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
new file mode 100644
index 000000000000..c11b7e100d83
--- /dev/null
+++ b/arch/x86/include/asm/vmware.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2008, VMware, Inc.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
12 * NON INFRINGEMENT. See the GNU General Public License for more
13 * details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20#ifndef ASM_X86__VMWARE_H
21#define ASM_X86__VMWARE_H
22
23extern unsigned long vmware_get_tsc_khz(void);
24extern int vmware_platform(void);
25extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
26
27#endif
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 3f6000d95fe2..5e79ca694326 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -33,8 +33,14 @@
33#ifndef _ASM_X86_XEN_HYPERCALL_H 33#ifndef _ASM_X86_XEN_HYPERCALL_H
34#define _ASM_X86_XEN_HYPERCALL_H 34#define _ASM_X86_XEN_HYPERCALL_H
35 35
36#include <linux/kernel.h>
37#include <linux/spinlock.h>
36#include <linux/errno.h> 38#include <linux/errno.h>
37#include <linux/string.h> 39#include <linux/string.h>
40#include <linux/types.h>
41
42#include <asm/page.h>
43#include <asm/pgtable.h>
38 44
39#include <xen/interface/xen.h> 45#include <xen/interface/xen.h>
40#include <xen/interface/sched.h> 46#include <xen/interface/sched.h>
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index a38d25ac87d2..81fbd735aec4 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -33,39 +33,10 @@
33#ifndef _ASM_X86_XEN_HYPERVISOR_H 33#ifndef _ASM_X86_XEN_HYPERVISOR_H
34#define _ASM_X86_XEN_HYPERVISOR_H 34#define _ASM_X86_XEN_HYPERVISOR_H
35 35
36#include <linux/types.h>
37#include <linux/kernel.h>
38
39#include <xen/interface/xen.h>
40#include <xen/interface/version.h>
41
42#include <asm/ptrace.h>
43#include <asm/page.h>
44#include <asm/desc.h>
45#if defined(__i386__)
46# ifdef CONFIG_X86_PAE
47# include <asm-generic/pgtable-nopud.h>
48# else
49# include <asm-generic/pgtable-nopmd.h>
50# endif
51#endif
52#include <asm/xen/hypercall.h>
53
54/* arch/i386/kernel/setup.c */ 36/* arch/i386/kernel/setup.c */
55extern struct shared_info *HYPERVISOR_shared_info; 37extern struct shared_info *HYPERVISOR_shared_info;
56extern struct start_info *xen_start_info; 38extern struct start_info *xen_start_info;
57 39
58/* arch/i386/mach-xen/evtchn.c */
59/* Force a proper event-channel callback from Xen. */
60extern void force_evtchn_callback(void);
61
62/* Turn jiffies into Xen system time. */
63u64 jiffies_to_st(unsigned long jiffies);
64
65
66#define MULTI_UVMFLAGS_INDEX 3
67#define MULTI_UVMDOMID_INDEX 4
68
69enum xen_domain_type { 40enum xen_domain_type {
70 XEN_NATIVE, 41 XEN_NATIVE,
71 XEN_PV_DOMAIN, 42 XEN_PV_DOMAIN,
@@ -74,9 +45,15 @@ enum xen_domain_type {
74 45
75extern enum xen_domain_type xen_domain_type; 46extern enum xen_domain_type xen_domain_type;
76 47
48#ifdef CONFIG_XEN
77#define xen_domain() (xen_domain_type != XEN_NATIVE) 49#define xen_domain() (xen_domain_type != XEN_NATIVE)
78#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) 50#else
51#define xen_domain() (0)
52#endif
53
54#define xen_pv_domain() (xen_domain() && xen_domain_type == XEN_PV_DOMAIN)
55#define xen_hvm_domain() (xen_domain() && xen_domain_type == XEN_HVM_DOMAIN)
56
79#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN) 57#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN)
80#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN)
81 58
82#endif /* _ASM_X86_XEN_HYPERVISOR_H */ 59#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index bc628998a1b9..7ef617ef1df3 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -1,11 +1,16 @@
1#ifndef _ASM_X86_XEN_PAGE_H 1#ifndef _ASM_X86_XEN_PAGE_H
2#define _ASM_X86_XEN_PAGE_H 2#define _ASM_X86_XEN_PAGE_H
3 3
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/spinlock.h>
4#include <linux/pfn.h> 7#include <linux/pfn.h>
5 8
6#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <asm/page.h>
7#include <asm/pgtable.h> 11#include <asm/pgtable.h>
8 12
13#include <xen/interface/xen.h>
9#include <xen/features.h> 14#include <xen/features.h>
10 15
11/* Xen machine address */ 16/* Xen machine address */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 1cad9318d217..d364df03c1d6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -12,6 +12,7 @@ CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg 12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt-spinlocks.o = -pg 13CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
14CFLAGS_REMOVE_ftrace.o = -pg 14CFLAGS_REMOVE_ftrace.o = -pg
15CFLAGS_REMOVE_early_printk.o = -pg
15endif 16endif
16 17
17# 18#
@@ -23,7 +24,7 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
23CFLAGS_hpet.o := $(nostackp) 24CFLAGS_hpet.o := $(nostackp)
24CFLAGS_tsc.o := $(nostackp) 25CFLAGS_tsc.o := $(nostackp)
25 26
26obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o 27obj-y := process_$(BITS).o signal.o entry_$(BITS).o
27obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 28obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
28obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o 29obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
29obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o 30obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
@@ -106,6 +107,10 @@ microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
106microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o 107microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
107obj-$(CONFIG_MICROCODE) += microcode.o 108obj-$(CONFIG_MICROCODE) += microcode.o
108 109
110obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
111
112obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
113
109### 114###
110# 64 bit specific files 115# 64 bit specific files
111ifeq ($(CONFIG_X86_64),y) 116ifeq ($(CONFIG_X86_64),y)
@@ -119,7 +124,6 @@ ifeq ($(CONFIG_X86_64),y)
119 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 124 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
120 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o 125 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
121 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o 126 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
122 obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
123 127
124 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o 128 obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
125endif 129endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a7b6dec6fc3f..2e2da717b350 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -24,6 +24,7 @@
24#include <linux/iommu-helper.h> 24#include <linux/iommu-helper.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/iommu.h> 26#include <asm/iommu.h>
27#include <asm/gart.h>
27#include <asm/amd_iommu_types.h> 28#include <asm/amd_iommu_types.h>
28#include <asm/amd_iommu.h> 29#include <asm/amd_iommu.h>
29 30
@@ -235,8 +236,9 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
235 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 236 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
236 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 237 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
237 238
238 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) 239 if (unlikely(i == EXIT_LOOP_COUNT))
239 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); 240 panic("AMD IOMMU: Completion wait loop failed\n");
241
240out: 242out:
241 spin_unlock_irqrestore(&iommu->lock, flags); 243 spin_unlock_irqrestore(&iommu->lock, flags);
242 244
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 30ae2701b3df..c625800c55ca 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -28,6 +28,7 @@
28#include <asm/amd_iommu_types.h> 28#include <asm/amd_iommu_types.h>
29#include <asm/amd_iommu.h> 29#include <asm/amd_iommu.h>
30#include <asm/iommu.h> 30#include <asm/iommu.h>
31#include <asm/gart.h>
31 32
32/* 33/*
33 * definitions for the ACPI scanning code 34 * definitions for the ACPI scanning code
@@ -427,6 +428,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
427 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 428 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
428 &entry, sizeof(entry)); 429 &entry, sizeof(entry));
429 430
431 /* set head and tail to zero manually */
432 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
433 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
434
430 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); 435 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
431 436
432 return cmd_buf; 437 return cmd_buf;
@@ -1074,7 +1079,8 @@ int __init amd_iommu_init(void)
1074 goto free; 1079 goto free;
1075 1080
1076 /* IOMMU rlookup table - find the IOMMU for a specific device */ 1081 /* IOMMU rlookup table - find the IOMMU for a specific device */
1077 amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL, 1082 amd_iommu_rlookup_table = (void *)__get_free_pages(
1083 GFP_KERNEL | __GFP_ZERO,
1078 get_order(rlookup_table_size)); 1084 get_order(rlookup_table_size));
1079 if (amd_iommu_rlookup_table == NULL) 1085 if (amd_iommu_rlookup_table == NULL)
1080 goto free; 1086 goto free;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9a32b37ee2ee..676debfc1702 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -1,8 +1,9 @@
1/* 1/*
2 * Firmware replacement code. 2 * Firmware replacement code.
3 * 3 *
4 * Work around broken BIOSes that don't set an aperture or only set the 4 * Work around broken BIOSes that don't set an aperture, only set the
5 * aperture in the AGP bridge. 5 * aperture in the AGP bridge, or set too small aperture.
6 *
6 * If all fails map the aperture over some low memory. This is cheaper than 7 * If all fails map the aperture over some low memory. This is cheaper than
7 * doing bounce buffering. The memory is lost. This is done at early boot 8 * doing bounce buffering. The memory is lost. This is done at early boot
8 * because only the bootmem allocator can allocate 32+MB. 9 * because only the bootmem allocator can allocate 32+MB.
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index b9019271af62..6b7f824db160 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -30,6 +30,7 @@
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/dmi.h> 31#include <linux/dmi.h>
32#include <linux/dmar.h> 32#include <linux/dmar.h>
33#include <linux/ftrace.h>
33 34
34#include <asm/atomic.h> 35#include <asm/atomic.h>
35#include <asm/smp.h> 36#include <asm/smp.h>
@@ -775,11 +776,7 @@ static void local_apic_timer_interrupt(void)
775 /* 776 /*
776 * the NMI deadlock-detector uses this. 777 * the NMI deadlock-detector uses this.
777 */ 778 */
778#ifdef CONFIG_X86_64 779 inc_irq_stat(apic_timer_irqs);
779 add_pda(apic_timer_irqs, 1);
780#else
781 per_cpu(irq_stat, cpu).apic_timer_irqs++;
782#endif
783 780
784 evt->event_handler(evt); 781 evt->event_handler(evt);
785} 782}
@@ -792,7 +789,7 @@ static void local_apic_timer_interrupt(void)
792 * [ if a single-CPU system runs an SMP kernel then we call the local 789 * [ if a single-CPU system runs an SMP kernel then we call the local
793 * interrupt as well. Thus we cannot inline the local irq ... ] 790 * interrupt as well. Thus we cannot inline the local irq ... ]
794 */ 791 */
795void smp_apic_timer_interrupt(struct pt_regs *regs) 792void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
796{ 793{
797 struct pt_regs *old_regs = set_irq_regs(regs); 794 struct pt_regs *old_regs = set_irq_regs(regs);
798 795
@@ -806,9 +803,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
806 * Besides, if we don't timer interrupts ignore the global 803 * Besides, if we don't timer interrupts ignore the global
807 * interrupt lock, which is the WrongThing (tm) to do. 804 * interrupt lock, which is the WrongThing (tm) to do.
808 */ 805 */
809#ifdef CONFIG_X86_64
810 exit_idle(); 806 exit_idle();
811#endif
812 irq_enter(); 807 irq_enter();
813 local_apic_timer_interrupt(); 808 local_apic_timer_interrupt();
814 irq_exit(); 809 irq_exit();
@@ -1666,9 +1661,7 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1666{ 1661{
1667 u32 v; 1662 u32 v;
1668 1663
1669#ifdef CONFIG_X86_64
1670 exit_idle(); 1664 exit_idle();
1671#endif
1672 irq_enter(); 1665 irq_enter();
1673 /* 1666 /*
1674 * Check if this really is a spurious interrupt and ACK it 1667 * Check if this really is a spurious interrupt and ACK it
@@ -1679,14 +1672,11 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1679 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) 1672 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1680 ack_APIC_irq(); 1673 ack_APIC_irq();
1681 1674
1682#ifdef CONFIG_X86_64 1675 inc_irq_stat(irq_spurious_count);
1683 add_pda(irq_spurious_count, 1); 1676
1684#else
1685 /* see sw-dev-man vol 3, chapter 7.4.13.5 */ 1677 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
1686 pr_info("spurious APIC interrupt on CPU#%d, " 1678 pr_info("spurious APIC interrupt on CPU#%d, "
1687 "should never happen.\n", smp_processor_id()); 1679 "should never happen.\n", smp_processor_id());
1688 __get_cpu_var(irq_stat).irq_spurious_count++;
1689#endif
1690 irq_exit(); 1680 irq_exit();
1691} 1681}
1692 1682
@@ -1697,9 +1687,7 @@ void smp_error_interrupt(struct pt_regs *regs)
1697{ 1687{
1698 u32 v, v1; 1688 u32 v, v1;
1699 1689
1700#ifdef CONFIG_X86_64
1701 exit_idle(); 1690 exit_idle();
1702#endif
1703 irq_enter(); 1691 irq_enter();
1704 /* First tickle the hardware, only then report what went on. -- REW */ 1692 /* First tickle the hardware, only then report what went on. -- REW */
1705 v = apic_read(APIC_ESR); 1693 v = apic_read(APIC_ESR);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6649d09ad88f..ee4df08feee6 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -11,7 +11,7 @@
11#include <linux/suspend.h> 11#include <linux/suspend.h>
12#include <linux/kbuild.h> 12#include <linux/kbuild.h>
13#include <asm/ucontext.h> 13#include <asm/ucontext.h>
14#include "sigframe.h" 14#include <asm/sigframe.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/processor.h> 17#include <asm/processor.h>
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 7fcf63d22f8b..1d41d3f1edbc 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -20,6 +20,8 @@
20 20
21#include <xen/interface/xen.h> 21#include <xen/interface/xen.h>
22 22
23#include <asm/sigframe.h>
24
23#define __NO_STUBS 1 25#define __NO_STUBS 1
24#undef __SYSCALL 26#undef __SYSCALL
25#undef _ASM_X86_UNISTD_64_H 27#undef _ASM_X86_UNISTD_64_H
@@ -87,7 +89,7 @@ int main(void)
87 BLANK(); 89 BLANK();
88#undef ENTRY 90#undef ENTRY
89 DEFINE(IA32_RT_SIGFRAME_sigcontext, 91 DEFINE(IA32_RT_SIGFRAME_sigcontext,
90 offsetof (struct rt_sigframe32, uc.uc_mcontext)); 92 offsetof (struct rt_sigframe_ia32, uc.uc_mcontext));
91 BLANK(); 93 BLANK();
92#endif 94#endif
93 DEFINE(pbe_address, offsetof(struct pbe, address)); 95 DEFINE(pbe_address, offsetof(struct pbe, address));
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index f0dfe6f17e7e..2a0a2a3cac26 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -69,10 +69,10 @@ s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
69 69
70long sn_partition_id; 70long sn_partition_id;
71EXPORT_SYMBOL_GPL(sn_partition_id); 71EXPORT_SYMBOL_GPL(sn_partition_id);
72long uv_coherency_id; 72long sn_coherency_id;
73EXPORT_SYMBOL_GPL(uv_coherency_id); 73EXPORT_SYMBOL_GPL(sn_coherency_id);
74long uv_region_size; 74long sn_region_size;
75EXPORT_SYMBOL_GPL(uv_region_size); 75EXPORT_SYMBOL_GPL(sn_region_size);
76int uv_type; 76int uv_type;
77 77
78 78
@@ -100,6 +100,56 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
100 return ret; 100 return ret;
101} 101}
102 102
103int
104uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size,
105 unsigned long *intr_mmr_offset)
106{
107 union uv_watchlist_u size_blade;
108 u64 watchlist;
109 s64 ret;
110
111 size_blade.size = mq_size;
112 size_blade.blade = blade;
113
114 /*
115 * bios returns watchlist number or negative error number.
116 */
117 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
118 size_blade.val, (u64)intr_mmr_offset,
119 (u64)&watchlist, 0);
120 if (ret < BIOS_STATUS_SUCCESS)
121 return ret;
122
123 return watchlist;
124}
125EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc);
126
127int
128uv_bios_mq_watchlist_free(int blade, int watchlist_num)
129{
130 return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE,
131 blade, watchlist_num, 0, 0, 0);
132}
133EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free);
134
135s64
136uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms)
137{
138 return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len,
139 perms, 0, 0);
140}
141EXPORT_SYMBOL_GPL(uv_bios_change_memprotect);
142
143s64
144uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
145{
146 s64 ret;
147
148 ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
149 (u64)addr, buf, (u64)len, 0);
150 return ret;
151}
152EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa);
103 153
104s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) 154s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
105{ 155{
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
new file mode 100644
index 000000000000..2ac0ab71412a
--- /dev/null
+++ b/arch/x86/kernel/check.c
@@ -0,0 +1,161 @@
1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/kthread.h>
4#include <linux/workqueue.h>
5#include <asm/e820.h>
6#include <asm/proto.h>
7
8/*
9 * Some BIOSes seem to corrupt the low 64k of memory during events
10 * like suspend/resume and unplugging an HDMI cable. Reserve all
11 * remaining free memory in that area and fill it with a distinct
12 * pattern.
13 */
14#define MAX_SCAN_AREAS 8
15
16static int __read_mostly memory_corruption_check = -1;
17
18static unsigned __read_mostly corruption_check_size = 64*1024;
19static unsigned __read_mostly corruption_check_period = 60; /* seconds */
20
21static struct e820entry scan_areas[MAX_SCAN_AREAS];
22static int num_scan_areas;
23
24
25static __init int set_corruption_check(char *arg)
26{
27 char *end;
28
29 memory_corruption_check = simple_strtol(arg, &end, 10);
30
31 return (*end == 0) ? 0 : -EINVAL;
32}
33early_param("memory_corruption_check", set_corruption_check);
34
35static __init int set_corruption_check_period(char *arg)
36{
37 char *end;
38
39 corruption_check_period = simple_strtoul(arg, &end, 10);
40
41 return (*end == 0) ? 0 : -EINVAL;
42}
43early_param("memory_corruption_check_period", set_corruption_check_period);
44
45static __init int set_corruption_check_size(char *arg)
46{
47 char *end;
48 unsigned size;
49
50 size = memparse(arg, &end);
51
52 if (*end == '\0')
53 corruption_check_size = size;
54
55 return (size == corruption_check_size) ? 0 : -EINVAL;
56}
57early_param("memory_corruption_check_size", set_corruption_check_size);
58
59
60void __init setup_bios_corruption_check(void)
61{
62 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
63
64 if (memory_corruption_check == -1) {
65 memory_corruption_check =
66#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
67 1
68#else
69 0
70#endif
71 ;
72 }
73
74 if (corruption_check_size == 0)
75 memory_corruption_check = 0;
76
77 if (!memory_corruption_check)
78 return;
79
80 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
81
82 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
83 u64 size;
84 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
85
86 if (addr == 0)
87 break;
88
89 if ((addr + size) > corruption_check_size)
90 size = corruption_check_size - addr;
91
92 if (size == 0)
93 break;
94
95 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
96 scan_areas[num_scan_areas].addr = addr;
97 scan_areas[num_scan_areas].size = size;
98 num_scan_areas++;
99
100 /* Assume we've already mapped this early memory */
101 memset(__va(addr), 0, size);
102
103 addr += size;
104 }
105
106 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
107 num_scan_areas);
108 update_e820();
109}
110
111
112void check_for_bios_corruption(void)
113{
114 int i;
115 int corruption = 0;
116
117 if (!memory_corruption_check)
118 return;
119
120 for (i = 0; i < num_scan_areas; i++) {
121 unsigned long *addr = __va(scan_areas[i].addr);
122 unsigned long size = scan_areas[i].size;
123
124 for (; size; addr++, size -= sizeof(unsigned long)) {
125 if (!*addr)
126 continue;
127 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
128 addr, __pa(addr), *addr);
129 corruption = 1;
130 *addr = 0;
131 }
132 }
133
134 WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n");
135}
136
137static void check_corruption(struct work_struct *dummy);
138static DECLARE_DELAYED_WORK(bios_check_work, check_corruption);
139
140static void check_corruption(struct work_struct *dummy)
141{
142 check_for_bios_corruption();
143 schedule_delayed_work(&bios_check_work,
144 round_jiffies_relative(corruption_check_period*HZ));
145}
146
147static int start_periodic_check_for_corruption(void)
148{
149 if (!memory_corruption_check || corruption_check_period == 0)
150 return 0;
151
152 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
153 corruption_check_period);
154
155 /* First time we run the checks right away */
156 schedule_delayed_work(&bios_check_work, 0);
157 return 0;
158}
159
160module_init(start_periodic_check_for_corruption);
161
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82ec6075c057..82db7f45e2de 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -2,8 +2,14 @@
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot
6ifdef CONFIG_FUNCTION_TRACER
7CFLAGS_REMOVE_common.o = -pg
8endif
9
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 10obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += proc.o capflags.o powerflags.o common.o 11obj-y += proc.o capflags.o powerflags.o common.o
12obj-y += vmware.o hypervisor.o
7 13
8obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 14obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
9obj-$(CONFIG_X86_64) += bugs_64.o 15obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index ef8f831af823..2cf23634b6d9 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -120,9 +120,17 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width) 120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
121 & core_select_mask; 121 & core_select_mask;
122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); 122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
123 /*
124 * Reinit the apicid, now that we have extended initial_apicid.
125 */
126 c->apicid = phys_pkg_id(c->initial_apicid, 0);
123#else 127#else
124 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask; 128 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
125 c->phys_proc_id = phys_pkg_id(core_plus_mask_width); 129 c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
130 /*
131 * Reinit the apicid, now that we have extended initial_apicid.
132 */
133 c->apicid = phys_pkg_id(0);
126#endif 134#endif
127 c->x86_max_cores = (core_level_siblings / smp_num_siblings); 135 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
128 136
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 8f1e31db2ad5..7c878f6aa919 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -283,9 +283,14 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
283{ 283{
284 early_init_amd_mc(c); 284 early_init_amd_mc(c);
285 285
286 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ 286 /*
287 if (c->x86_power & (1<<8)) 287 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
288 * with P/T states and does not stop in deep C-states
289 */
290 if (c->x86_power & (1 << 8)) {
288 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 291 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
292 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
293 }
289 294
290#ifdef CONFIG_X86_64 295#ifdef CONFIG_X86_64
291 set_cpu_cap(c, X86_FEATURE_SYSCALL32); 296 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b9c9ea0217a9..42e0853030cb 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -36,6 +36,7 @@
36#include <asm/proto.h> 36#include <asm/proto.h>
37#include <asm/sections.h> 37#include <asm/sections.h>
38#include <asm/setup.h> 38#include <asm/setup.h>
39#include <asm/hypervisor.h>
39 40
40#include "cpu.h" 41#include "cpu.h"
41 42
@@ -703,6 +704,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
703 detect_ht(c); 704 detect_ht(c);
704#endif 705#endif
705 706
707 init_hypervisor(c);
706 /* 708 /*
707 * On SMP, boot_cpu_data holds the common feature set between 709 * On SMP, boot_cpu_data holds the common feature set between
708 * all CPUs; so make sure that we indicate which features are 710 * all CPUs; so make sure that we indicate which features are
@@ -862,7 +864,7 @@ EXPORT_SYMBOL(_cpu_pda);
862 864
863struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; 865struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
864 866
865char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; 867static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
866 868
867void __cpuinit pda_init(int cpu) 869void __cpuinit pda_init(int cpu)
868{ 870{
@@ -903,8 +905,8 @@ void __cpuinit pda_init(int cpu)
903 } 905 }
904} 906}
905 907
906char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + 908static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
907 DEBUG_STKSZ] __page_aligned_bss; 909 DEBUG_STKSZ] __page_aligned_bss;
908 910
909extern asmlinkage void ignore_sysret(void); 911extern asmlinkage void ignore_sysret(void);
910 912
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
new file mode 100644
index 000000000000..fb5b86af0b01
--- /dev/null
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -0,0 +1,58 @@
1/*
2 * Common hypervisor code
3 *
4 * Copyright (C) 2008, VMware, Inc.
5 * Author : Alok N Kataria <akataria@vmware.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
15 * NON INFRINGEMENT. See the GNU General Public License for more
16 * details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 */
23
24#include <asm/processor.h>
25#include <asm/vmware.h>
26#include <asm/hypervisor.h>
27
28static inline void __cpuinit
29detect_hypervisor_vendor(struct cpuinfo_x86 *c)
30{
31 if (vmware_platform()) {
32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
33 } else {
34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
35 }
36}
37
38unsigned long get_hypervisor_tsc_freq(void)
39{
40 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
41 return vmware_get_tsc_khz();
42 return 0;
43}
44
45static inline void __cpuinit
46hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
47{
48 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
49 vmware_set_feature_bits(c);
50 return;
51 }
52}
53
54void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
55{
56 detect_hypervisor_vendor(c);
57 hypervisor_set_feature_bits(c);
58}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 816f27f289b1..8ea6929e974c 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -11,7 +11,6 @@
11#include <asm/pgtable.h> 11#include <asm/pgtable.h>
12#include <asm/msr.h> 12#include <asm/msr.h>
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14#include <asm/ptrace.h>
15#include <asm/ds.h> 14#include <asm/ds.h>
16#include <asm/bugs.h> 15#include <asm/bugs.h>
17 16
@@ -41,6 +40,16 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
41 if (c->x86 == 15 && c->x86_cache_alignment == 64) 40 if (c->x86 == 15 && c->x86_cache_alignment == 64)
42 c->x86_cache_alignment = 128; 41 c->x86_cache_alignment = 128;
43#endif 42#endif
43
44 /*
45 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
46 * with P/T states and does not stop in deep C-states
47 */
48 if (c->x86_power & (1 << 8)) {
49 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
50 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
51 }
52
44} 53}
45 54
46#ifdef CONFIG_X86_32 55#ifdef CONFIG_X86_32
@@ -242,6 +251,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
242 251
243 intel_workarounds(c); 252 intel_workarounds(c);
244 253
254 /*
255 * Detect the extended topology information if available. This
256 * will reinitialise the initial_apicid which will be used
257 * in init_intel_cacheinfo()
258 */
259 detect_extended_topology(c);
260
245 l2 = init_intel_cacheinfo(c); 261 l2 = init_intel_cacheinfo(c);
246 if (c->cpuid_level > 9) { 262 if (c->cpuid_level > 9) {
247 unsigned eax = cpuid_eax(10); 263 unsigned eax = cpuid_eax(10);
@@ -309,10 +325,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
309 set_cpu_cap(c, X86_FEATURE_P3); 325 set_cpu_cap(c, X86_FEATURE_P3);
310#endif 326#endif
311 327
312 if (cpu_has_bts)
313 ptrace_bts_init_intel(c);
314
315 detect_extended_topology(c);
316 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { 328 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
317 /* 329 /*
318 * let's use the legacy cpuid vector 0x1 and 0x4 for topology 330 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7bd00a565672..48533d77be78 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -641,20 +641,17 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf)
641 return show_shared_cpu_map_func(leaf, 1, buf); 641 return show_shared_cpu_map_func(leaf, 1, buf);
642} 642}
643 643
644static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { 644static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
645 switch(this_leaf->eax.split.type) { 645{
646 case CACHE_TYPE_DATA: 646 switch (this_leaf->eax.split.type) {
647 case CACHE_TYPE_DATA:
647 return sprintf(buf, "Data\n"); 648 return sprintf(buf, "Data\n");
648 break; 649 case CACHE_TYPE_INST:
649 case CACHE_TYPE_INST:
650 return sprintf(buf, "Instruction\n"); 650 return sprintf(buf, "Instruction\n");
651 break; 651 case CACHE_TYPE_UNIFIED:
652 case CACHE_TYPE_UNIFIED:
653 return sprintf(buf, "Unified\n"); 652 return sprintf(buf, "Unified\n");
654 break; 653 default:
655 default:
656 return sprintf(buf, "Unknown\n"); 654 return sprintf(buf, "Unknown\n");
657 break;
658 } 655 }
659} 656}
660 657
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 4b031a4ac856..1c838032fd37 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -510,12 +510,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
510 */ 510 */
511void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 511void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
512{ 512{
513 static cpumask_t mce_cpus = CPU_MASK_NONE;
514
515 mce_cpu_quirks(c); 513 mce_cpu_quirks(c);
516 514
517 if (mce_dont_init || 515 if (mce_dont_init ||
518 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
519 !mce_available(c)) 516 !mce_available(c))
520 return; 517 return;
521 518
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index a1de80f368f1..a5a5e0530370 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -248,7 +248,7 @@ asmlinkage void mce_threshold_interrupt(void)
248 } 248 }
249 } 249 }
250out: 250out:
251 add_pda(irq_threshold_count, 1); 251 inc_irq_stat(irq_threshold_count);
252 irq_exit(); 252 irq_exit();
253} 253}
254 254
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index c17eaf5dd6dd..4b48f251fd39 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -26,7 +26,7 @@ asmlinkage void smp_thermal_interrupt(void)
26 if (therm_throt_process(msr_val & 1)) 26 if (therm_throt_process(msr_val & 1))
27 mce_log_therm_throt_event(smp_processor_id(), msr_val); 27 mce_log_therm_throt_event(smp_processor_id(), msr_val);
28 28
29 add_pda(irq_thermal_count, 1); 29 inc_irq_stat(irq_thermal_count);
30 irq_exit(); 30 irq_exit();
31} 31}
32 32
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index c78c04821ea1..1159e269e596 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -803,6 +803,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
803} 803}
804 804
805static struct res_range __initdata range[RANGE_NUM]; 805static struct res_range __initdata range[RANGE_NUM];
806static int __initdata nr_range;
806 807
807#ifdef CONFIG_MTRR_SANITIZER 808#ifdef CONFIG_MTRR_SANITIZER
808 809
@@ -1206,39 +1207,43 @@ struct mtrr_cleanup_result {
1206#define PSHIFT (PAGE_SHIFT - 10) 1207#define PSHIFT (PAGE_SHIFT - 10)
1207 1208
1208static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; 1209static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
1209static struct res_range __initdata range_new[RANGE_NUM];
1210static unsigned long __initdata min_loss_pfn[RANGE_NUM]; 1210static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1211 1211
1212static int __init mtrr_cleanup(unsigned address_bits) 1212static void __init print_out_mtrr_range_state(void)
1213{ 1213{
1214 unsigned long extra_remove_base, extra_remove_size;
1215 unsigned long base, size, def, dummy;
1216 mtrr_type type;
1217 int nr_range, nr_range_new;
1218 u64 chunk_size, gran_size;
1219 unsigned long range_sums, range_sums_new;
1220 int index_good;
1221 int num_reg_good;
1222 int i; 1214 int i;
1215 char start_factor = 'K', size_factor = 'K';
1216 unsigned long start_base, size_base;
1217 mtrr_type type;
1223 1218
1224 /* extra one for all 0 */ 1219 for (i = 0; i < num_var_ranges; i++) {
1225 int num[MTRR_NUM_TYPES + 1];
1226 1220
1227 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) 1221 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1228 return 0; 1222 if (!size_base)
1229 rdmsr(MTRRdefType_MSR, def, dummy); 1223 continue;
1230 def &= 0xff;
1231 if (def != MTRR_TYPE_UNCACHABLE)
1232 return 0;
1233 1224
1234 /* get it and store it aside */ 1225 size_base = to_size_factor(size_base, &size_factor),
1235 memset(range_state, 0, sizeof(range_state)); 1226 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1236 for (i = 0; i < num_var_ranges; i++) { 1227 start_base = to_size_factor(start_base, &start_factor),
1237 mtrr_if->get(i, &base, &size, &type); 1228 type = range_state[i].type;
1238 range_state[i].base_pfn = base; 1229
1239 range_state[i].size_pfn = size; 1230 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1240 range_state[i].type = type; 1231 i, start_base, start_factor,
1232 size_base, size_factor,
1233 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1234 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1235 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1236 );
1241 } 1237 }
1238}
1239
1240static int __init mtrr_need_cleanup(void)
1241{
1242 int i;
1243 mtrr_type type;
1244 unsigned long size;
1245 /* extra one for all 0 */
1246 int num[MTRR_NUM_TYPES + 1];
1242 1247
1243 /* check entries number */ 1248 /* check entries number */
1244 memset(num, 0, sizeof(num)); 1249 memset(num, 0, sizeof(num));
@@ -1263,29 +1268,133 @@ static int __init mtrr_cleanup(unsigned address_bits)
1263 num_var_ranges - num[MTRR_NUM_TYPES]) 1268 num_var_ranges - num[MTRR_NUM_TYPES])
1264 return 0; 1269 return 0;
1265 1270
1266 /* print original var MTRRs at first, for debugging: */ 1271 return 1;
1267 printk(KERN_DEBUG "original variable MTRRs\n"); 1272}
1268 for (i = 0; i < num_var_ranges; i++) {
1269 char start_factor = 'K', size_factor = 'K';
1270 unsigned long start_base, size_base;
1271 1273
1272 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); 1274static unsigned long __initdata range_sums;
1273 if (!size_base) 1275static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
1274 continue; 1276 unsigned long extra_remove_base,
1277 unsigned long extra_remove_size,
1278 int i)
1279{
1280 int num_reg;
1281 static struct res_range range_new[RANGE_NUM];
1282 static int nr_range_new;
1283 unsigned long range_sums_new;
1284
1285 /* convert ranges to var ranges state */
1286 num_reg = x86_setup_var_mtrrs(range, nr_range,
1287 chunk_size, gran_size);
1288
1289 /* we got new setting in range_state, check it */
1290 memset(range_new, 0, sizeof(range_new));
1291 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1292 extra_remove_base, extra_remove_size);
1293 range_sums_new = sum_ranges(range_new, nr_range_new);
1294
1295 result[i].chunk_sizek = chunk_size >> 10;
1296 result[i].gran_sizek = gran_size >> 10;
1297 result[i].num_reg = num_reg;
1298 if (range_sums < range_sums_new) {
1299 result[i].lose_cover_sizek =
1300 (range_sums_new - range_sums) << PSHIFT;
1301 result[i].bad = 1;
1302 } else
1303 result[i].lose_cover_sizek =
1304 (range_sums - range_sums_new) << PSHIFT;
1275 1305
1276 size_base = to_size_factor(size_base, &size_factor), 1306 /* double check it */
1277 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); 1307 if (!result[i].bad && !result[i].lose_cover_sizek) {
1278 start_base = to_size_factor(start_base, &start_factor), 1308 if (nr_range_new != nr_range ||
1279 type = range_state[i].type; 1309 memcmp(range, range_new, sizeof(range)))
1310 result[i].bad = 1;
1311 }
1280 1312
1281 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", 1313 if (!result[i].bad && (range_sums - range_sums_new <
1282 i, start_base, start_factor, 1314 min_loss_pfn[num_reg])) {
1283 size_base, size_factor, 1315 min_loss_pfn[num_reg] =
1284 (type == MTRR_TYPE_UNCACHABLE) ? "UC" : 1316 range_sums - range_sums_new;
1285 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1286 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1287 );
1288 } 1317 }
1318}
1319
1320static void __init mtrr_print_out_one_result(int i)
1321{
1322 char gran_factor, chunk_factor, lose_factor;
1323 unsigned long gran_base, chunk_base, lose_base;
1324
1325 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1326 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1327 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1328 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1329 result[i].bad ? "*BAD*" : " ",
1330 gran_base, gran_factor, chunk_base, chunk_factor);
1331 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1332 result[i].num_reg, result[i].bad ? "-" : "",
1333 lose_base, lose_factor);
1334}
1335
1336static int __init mtrr_search_optimal_index(void)
1337{
1338 int i;
1339 int num_reg_good;
1340 int index_good;
1341
1342 if (nr_mtrr_spare_reg >= num_var_ranges)
1343 nr_mtrr_spare_reg = num_var_ranges - 1;
1344 num_reg_good = -1;
1345 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1346 if (!min_loss_pfn[i])
1347 num_reg_good = i;
1348 }
1349
1350 index_good = -1;
1351 if (num_reg_good != -1) {
1352 for (i = 0; i < NUM_RESULT; i++) {
1353 if (!result[i].bad &&
1354 result[i].num_reg == num_reg_good &&
1355 !result[i].lose_cover_sizek) {
1356 index_good = i;
1357 break;
1358 }
1359 }
1360 }
1361
1362 return index_good;
1363}
1364
1365
1366static int __init mtrr_cleanup(unsigned address_bits)
1367{
1368 unsigned long extra_remove_base, extra_remove_size;
1369 unsigned long base, size, def, dummy;
1370 mtrr_type type;
1371 u64 chunk_size, gran_size;
1372 int index_good;
1373 int i;
1374
1375 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
1376 return 0;
1377 rdmsr(MTRRdefType_MSR, def, dummy);
1378 def &= 0xff;
1379 if (def != MTRR_TYPE_UNCACHABLE)
1380 return 0;
1381
1382 /* get it and store it aside */
1383 memset(range_state, 0, sizeof(range_state));
1384 for (i = 0; i < num_var_ranges; i++) {
1385 mtrr_if->get(i, &base, &size, &type);
1386 range_state[i].base_pfn = base;
1387 range_state[i].size_pfn = size;
1388 range_state[i].type = type;
1389 }
1390
1391 /* check if we need handle it and can handle it */
1392 if (!mtrr_need_cleanup())
1393 return 0;
1394
1395 /* print original var MTRRs at first, for debugging: */
1396 printk(KERN_DEBUG "original variable MTRRs\n");
1397 print_out_mtrr_range_state();
1289 1398
1290 memset(range, 0, sizeof(range)); 1399 memset(range, 0, sizeof(range));
1291 extra_remove_size = 0; 1400 extra_remove_size = 0;
@@ -1309,176 +1418,64 @@ static int __init mtrr_cleanup(unsigned address_bits)
1309 range_sums >> (20 - PAGE_SHIFT)); 1418 range_sums >> (20 - PAGE_SHIFT));
1310 1419
1311 if (mtrr_chunk_size && mtrr_gran_size) { 1420 if (mtrr_chunk_size && mtrr_gran_size) {
1312 int num_reg; 1421 i = 0;
1313 char gran_factor, chunk_factor, lose_factor; 1422 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
1314 unsigned long gran_base, chunk_base, lose_base; 1423 extra_remove_base, extra_remove_size, i);
1315
1316 debug_print++;
1317 /* convert ranges to var ranges state */
1318 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1319 mtrr_gran_size);
1320 1424
1321 /* we got new setting in range_state, check it */ 1425 mtrr_print_out_one_result(i);
1322 memset(range_new, 0, sizeof(range_new));
1323 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1324 extra_remove_base,
1325 extra_remove_size);
1326 range_sums_new = sum_ranges(range_new, nr_range_new);
1327 1426
1328 i = 0;
1329 result[i].chunk_sizek = mtrr_chunk_size >> 10;
1330 result[i].gran_sizek = mtrr_gran_size >> 10;
1331 result[i].num_reg = num_reg;
1332 if (range_sums < range_sums_new) {
1333 result[i].lose_cover_sizek =
1334 (range_sums_new - range_sums) << PSHIFT;
1335 result[i].bad = 1;
1336 } else
1337 result[i].lose_cover_sizek =
1338 (range_sums - range_sums_new) << PSHIFT;
1339
1340 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1341 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1342 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1343 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1344 result[i].bad?"*BAD*":" ",
1345 gran_base, gran_factor, chunk_base, chunk_factor);
1346 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1347 result[i].num_reg, result[i].bad?"-":"",
1348 lose_base, lose_factor);
1349 if (!result[i].bad) { 1427 if (!result[i].bad) {
1350 set_var_mtrr_all(address_bits); 1428 set_var_mtrr_all(address_bits);
1351 return 1; 1429 return 1;
1352 } 1430 }
1353 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " 1431 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1354 "will find optimal one\n"); 1432 "will find optimal one\n");
1355 debug_print--;
1356 memset(result, 0, sizeof(result[0]));
1357 } 1433 }
1358 1434
1359 i = 0; 1435 i = 0;
1360 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); 1436 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1361 memset(result, 0, sizeof(result)); 1437 memset(result, 0, sizeof(result));
1362 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { 1438 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1363 char gran_factor;
1364 unsigned long gran_base;
1365
1366 if (debug_print)
1367 gran_base = to_size_factor(gran_size >> 10, &gran_factor);
1368 1439
1369 for (chunk_size = gran_size; chunk_size < (1ULL<<32); 1440 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1370 chunk_size <<= 1) { 1441 chunk_size <<= 1) {
1371 int num_reg;
1372 1442
1373 if (debug_print) {
1374 char chunk_factor;
1375 unsigned long chunk_base;
1376
1377 chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
1378 printk(KERN_INFO "\n");
1379 printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
1380 gran_base, gran_factor, chunk_base, chunk_factor);
1381 }
1382 if (i >= NUM_RESULT) 1443 if (i >= NUM_RESULT)
1383 continue; 1444 continue;
1384 1445
1385 /* convert ranges to var ranges state */ 1446 mtrr_calc_range_state(chunk_size, gran_size,
1386 num_reg = x86_setup_var_mtrrs(range, nr_range, 1447 extra_remove_base, extra_remove_size, i);
1387 chunk_size, gran_size); 1448 if (debug_print) {
1388 1449 mtrr_print_out_one_result(i);
1389 /* we got new setting in range_state, check it */ 1450 printk(KERN_INFO "\n");
1390 memset(range_new, 0, sizeof(range_new));
1391 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1392 extra_remove_base, extra_remove_size);
1393 range_sums_new = sum_ranges(range_new, nr_range_new);
1394
1395 result[i].chunk_sizek = chunk_size >> 10;
1396 result[i].gran_sizek = gran_size >> 10;
1397 result[i].num_reg = num_reg;
1398 if (range_sums < range_sums_new) {
1399 result[i].lose_cover_sizek =
1400 (range_sums_new - range_sums) << PSHIFT;
1401 result[i].bad = 1;
1402 } else
1403 result[i].lose_cover_sizek =
1404 (range_sums - range_sums_new) << PSHIFT;
1405
1406 /* double check it */
1407 if (!result[i].bad && !result[i].lose_cover_sizek) {
1408 if (nr_range_new != nr_range ||
1409 memcmp(range, range_new, sizeof(range)))
1410 result[i].bad = 1;
1411 } 1451 }
1412 1452
1413 if (!result[i].bad && (range_sums - range_sums_new <
1414 min_loss_pfn[num_reg])) {
1415 min_loss_pfn[num_reg] =
1416 range_sums - range_sums_new;
1417 }
1418 i++; 1453 i++;
1419 } 1454 }
1420 } 1455 }
1421 1456
1422 /* print out all */
1423 for (i = 0; i < NUM_RESULT; i++) {
1424 char gran_factor, chunk_factor, lose_factor;
1425 unsigned long gran_base, chunk_base, lose_base;
1426
1427 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1428 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1429 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1430 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1431 result[i].bad?"*BAD*":" ",
1432 gran_base, gran_factor, chunk_base, chunk_factor);
1433 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1434 result[i].num_reg, result[i].bad?"-":"",
1435 lose_base, lose_factor);
1436 }
1437
1438 /* try to find the optimal index */ 1457 /* try to find the optimal index */
1439 if (nr_mtrr_spare_reg >= num_var_ranges) 1458 index_good = mtrr_search_optimal_index();
1440 nr_mtrr_spare_reg = num_var_ranges - 1;
1441 num_reg_good = -1;
1442 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1443 if (!min_loss_pfn[i])
1444 num_reg_good = i;
1445 }
1446
1447 index_good = -1;
1448 if (num_reg_good != -1) {
1449 for (i = 0; i < NUM_RESULT; i++) {
1450 if (!result[i].bad &&
1451 result[i].num_reg == num_reg_good &&
1452 !result[i].lose_cover_sizek) {
1453 index_good = i;
1454 break;
1455 }
1456 }
1457 }
1458 1459
1459 if (index_good != -1) { 1460 if (index_good != -1) {
1460 char gran_factor, chunk_factor, lose_factor;
1461 unsigned long gran_base, chunk_base, lose_base;
1462
1463 printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); 1461 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1464 i = index_good; 1462 i = index_good;
1465 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 1463 mtrr_print_out_one_result(i);
1466 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 1464
1467 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1468 printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
1469 gran_base, gran_factor, chunk_base, chunk_factor);
1470 printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
1471 result[i].num_reg, lose_base, lose_factor);
1472 /* convert ranges to var ranges state */ 1465 /* convert ranges to var ranges state */
1473 chunk_size = result[i].chunk_sizek; 1466 chunk_size = result[i].chunk_sizek;
1474 chunk_size <<= 10; 1467 chunk_size <<= 10;
1475 gran_size = result[i].gran_sizek; 1468 gran_size = result[i].gran_sizek;
1476 gran_size <<= 10; 1469 gran_size <<= 10;
1477 debug_print++;
1478 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); 1470 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1479 debug_print--;
1480 set_var_mtrr_all(address_bits); 1471 set_var_mtrr_all(address_bits);
1472 printk(KERN_DEBUG "New variable MTRRs\n");
1473 print_out_mtrr_range_state();
1481 return 1; 1474 return 1;
1475 } else {
1476 /* print out all */
1477 for (i = 0; i < NUM_RESULT; i++)
1478 mtrr_print_out_one_result(i);
1482 } 1479 }
1483 1480
1484 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); 1481 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
@@ -1562,7 +1559,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1562{ 1559{
1563 unsigned long i, base, size, highest_pfn = 0, def, dummy; 1560 unsigned long i, base, size, highest_pfn = 0, def, dummy;
1564 mtrr_type type; 1561 mtrr_type type;
1565 int nr_range;
1566 u64 total_trim_size; 1562 u64 total_trim_size;
1567 1563
1568 /* extra one for all 0 */ 1564 /* extra one for all 0 */
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
new file mode 100644
index 000000000000..284c399e3234
--- /dev/null
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -0,0 +1,112 @@
1/*
2 * VMware Detection code.
3 *
4 * Copyright (C) 2008, VMware, Inc.
5 * Author : Alok N Kataria <akataria@vmware.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
15 * NON INFRINGEMENT. See the GNU General Public License for more
16 * details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 */
23
24#include <linux/dmi.h>
25#include <asm/div64.h>
26#include <asm/vmware.h>
27
28#define CPUID_VMWARE_INFO_LEAF 0x40000000
29#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
30#define VMWARE_HYPERVISOR_PORT 0x5658
31
32#define VMWARE_PORT_CMD_GETVERSION 10
33#define VMWARE_PORT_CMD_GETHZ 45
34
35#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
36 __asm__("inl (%%dx)" : \
37 "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
38 "0"(VMWARE_HYPERVISOR_MAGIC), \
39 "1"(VMWARE_PORT_CMD_##cmd), \
40 "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
41 "memory");
42
43static inline int __vmware_platform(void)
44{
45 uint32_t eax, ebx, ecx, edx;
46 VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx);
47 return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
48}
49
50static unsigned long __vmware_get_tsc_khz(void)
51{
52 uint64_t tsc_hz;
53 uint32_t eax, ebx, ecx, edx;
54
55 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
56
57 if (ebx == UINT_MAX)
58 return 0;
59 tsc_hz = eax | (((uint64_t)ebx) << 32);
60 do_div(tsc_hz, 1000);
61 BUG_ON(tsc_hz >> 32);
62 return tsc_hz;
63}
64
65/*
66 * While checking the dmi string infomation, just checking the product
67 * serial key should be enough, as this will always have a VMware
68 * specific string when running under VMware hypervisor.
69 */
70int vmware_platform(void)
71{
72 if (cpu_has_hypervisor) {
73 unsigned int eax, ebx, ecx, edx;
74 char hyper_vendor_id[13];
75
76 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
77 memcpy(hyper_vendor_id + 0, &ebx, 4);
78 memcpy(hyper_vendor_id + 4, &ecx, 4);
79 memcpy(hyper_vendor_id + 8, &edx, 4);
80 hyper_vendor_id[12] = '\0';
81 if (!strcmp(hyper_vendor_id, "VMwareVMware"))
82 return 1;
83 } else if (dmi_available && dmi_name_in_serial("VMware") &&
84 __vmware_platform())
85 return 1;
86
87 return 0;
88}
89
90unsigned long vmware_get_tsc_khz(void)
91{
92 BUG_ON(!vmware_platform());
93 return __vmware_get_tsc_khz();
94}
95
96/*
97 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
98 * Still, due to timing difference when running on virtual cpus, the TSC can
99 * be marked as unstable in some cases. For example, the TSC sync check at
100 * bootup can fail due to a marginal offset between vcpus' TSCs (though the
101 * TSCs do not drift from each other). Also, the ACPI PM timer clocksource
102 * is not suitable as a watchdog when running on a hypervisor because the
103 * kernel may miss a wrap of the counter if the vcpu is descheduled for a
104 * long time. To skip these checks at runtime we set these capability bits,
105 * so that the kernel could just trust the hypervisor with providing a
106 * reliable virtual TSC that is suitable for timekeeping.
107 */
108void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
109{
110 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
111 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
112}
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 19a8c2c0389f..da91701a2348 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -6,13 +6,13 @@
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * It manages: 8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS 9 * - DS and BTS hardware configuration
10 * - buffer overflow handling (to be done) 10 * - buffer overflow handling (to be done)
11 * - buffer access 11 * - buffer access
12 * 12 *
13 * It assumes: 13 * It does not do:
14 * - get_task_struct on all traced tasks 14 * - security checking (is the caller allowed to trace the task)
15 * - current is allowed to trace tasks 15 * - buffer allocation (memory accounting)
16 * 16 *
17 * 17 *
18 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2008 Intel Corporation.
@@ -34,15 +34,30 @@
34 * The configuration for a particular DS hardware implementation. 34 * The configuration for a particular DS hardware implementation.
35 */ 35 */
36struct ds_configuration { 36struct ds_configuration {
37 /* the size of the DS structure in bytes */ 37 /* the name of the configuration */
38 unsigned char sizeof_ds; 38 const char *name;
39 /* the size of one pointer-typed field in the DS structure in bytes; 39 /* the size of one pointer-typed field in the DS structure and
40 this covers the first 8 fields related to buffer management. */ 40 in the BTS and PEBS buffers in bytes;
41 this covers the first 8 DS fields related to buffer management. */
41 unsigned char sizeof_field; 42 unsigned char sizeof_field;
42 /* the size of a BTS/PEBS record in bytes */ 43 /* the size of a BTS/PEBS record in bytes */
43 unsigned char sizeof_rec[2]; 44 unsigned char sizeof_rec[2];
45 /* a series of bit-masks to control various features indexed
46 * by enum ds_feature */
47 unsigned long ctl[dsf_ctl_max];
44}; 48};
45static struct ds_configuration ds_cfg; 49static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
50
51#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
52
53#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */
54#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */
55#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
56
57#define BTS_CONTROL \
58 (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
59 ds_cfg.ctl[dsf_bts_overflow])
60
46 61
47/* 62/*
48 * A BTS or PEBS tracer. 63 * A BTS or PEBS tracer.
@@ -61,6 +76,8 @@ struct ds_tracer {
61struct bts_tracer { 76struct bts_tracer {
62 /* the common DS part */ 77 /* the common DS part */
63 struct ds_tracer ds; 78 struct ds_tracer ds;
79 /* the trace including the DS configuration */
80 struct bts_trace trace;
64 /* buffer overflow notification function */ 81 /* buffer overflow notification function */
65 bts_ovfl_callback_t ovfl; 82 bts_ovfl_callback_t ovfl;
66}; 83};
@@ -68,6 +85,8 @@ struct bts_tracer {
68struct pebs_tracer { 85struct pebs_tracer {
69 /* the common DS part */ 86 /* the common DS part */
70 struct ds_tracer ds; 87 struct ds_tracer ds;
88 /* the trace including the DS configuration */
89 struct pebs_trace trace;
71 /* buffer overflow notification function */ 90 /* buffer overflow notification function */
72 pebs_ovfl_callback_t ovfl; 91 pebs_ovfl_callback_t ovfl;
73}; 92};
@@ -134,13 +153,11 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
134 (*(unsigned long *)base) = value; 153 (*(unsigned long *)base) = value;
135} 154}
136 155
137#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
138
139 156
140/* 157/*
141 * Locking is done only for allocating BTS or PEBS resources. 158 * Locking is done only for allocating BTS or PEBS resources.
142 */ 159 */
143static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); 160static DEFINE_SPINLOCK(ds_lock);
144 161
145 162
146/* 163/*
@@ -156,27 +173,32 @@ static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
156 * >0 number of per-thread tracers 173 * >0 number of per-thread tracers
157 * <0 number of per-cpu tracers 174 * <0 number of per-cpu tracers
158 * 175 *
159 * The below functions to get and put tracers and to check the
160 * allocation type require the ds_lock to be held by the caller.
161 *
162 * Tracers essentially gives the number of ds contexts for a certain 176 * Tracers essentially gives the number of ds contexts for a certain
163 * type of allocation. 177 * type of allocation.
164 */ 178 */
165static long tracers; 179static atomic_t tracers = ATOMIC_INIT(0);
166 180
167static inline void get_tracer(struct task_struct *task) 181static inline void get_tracer(struct task_struct *task)
168{ 182{
169 tracers += (task ? 1 : -1); 183 if (task)
184 atomic_inc(&tracers);
185 else
186 atomic_dec(&tracers);
170} 187}
171 188
172static inline void put_tracer(struct task_struct *task) 189static inline void put_tracer(struct task_struct *task)
173{ 190{
174 tracers -= (task ? 1 : -1); 191 if (task)
192 atomic_dec(&tracers);
193 else
194 atomic_inc(&tracers);
175} 195}
176 196
177static inline int check_tracer(struct task_struct *task) 197static inline int check_tracer(struct task_struct *task)
178{ 198{
179 return (task ? (tracers >= 0) : (tracers <= 0)); 199 return task ?
200 (atomic_read(&tracers) >= 0) :
201 (atomic_read(&tracers) <= 0);
180} 202}
181 203
182 204
@@ -190,46 +212,66 @@ static inline int check_tracer(struct task_struct *task)
190 * Contexts are use-counted. They are allocated on first access and 212 * Contexts are use-counted. They are allocated on first access and
191 * deallocated when the last user puts the context. 213 * deallocated when the last user puts the context.
192 */ 214 */
193static DEFINE_PER_CPU(struct ds_context *, system_context); 215struct ds_context {
216 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
217 unsigned char ds[MAX_SIZEOF_DS];
218 /* the owner of the BTS and PEBS configuration, respectively */
219 struct bts_tracer *bts_master;
220 struct pebs_tracer *pebs_master;
221 /* use count */
222 unsigned long count;
223 /* a pointer to the context location inside the thread_struct
224 * or the per_cpu context array */
225 struct ds_context **this;
226 /* a pointer to the task owning this context, or NULL, if the
227 * context is owned by a cpu */
228 struct task_struct *task;
229};
230
231static DEFINE_PER_CPU(struct ds_context *, system_context_array);
232
233#define system_context per_cpu(system_context_array, smp_processor_id())
194 234
195#define this_system_context per_cpu(system_context, smp_processor_id())
196 235
197static inline struct ds_context *ds_get_context(struct task_struct *task) 236static inline struct ds_context *ds_get_context(struct task_struct *task)
198{ 237{
199 struct ds_context **p_context = 238 struct ds_context **p_context =
200 (task ? &task->thread.ds_ctx : &this_system_context); 239 (task ? &task->thread.ds_ctx : &system_context);
201 struct ds_context *context = *p_context; 240 struct ds_context *context = NULL;
241 struct ds_context *new_context = NULL;
202 unsigned long irq; 242 unsigned long irq;
203 243
204 if (!context) { 244 /* Chances are small that we already have a context. */
205 context = kzalloc(sizeof(*context), GFP_KERNEL); 245 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
206 if (!context) 246 if (!new_context)
207 return NULL; 247 return NULL;
208 248
209 spin_lock_irqsave(&ds_lock, irq); 249 spin_lock_irqsave(&ds_lock, irq);
210 250
211 if (*p_context) { 251 context = *p_context;
212 kfree(context); 252 if (!context) {
253 context = new_context;
213 254
214 context = *p_context; 255 context->this = p_context;
215 } else { 256 context->task = task;
216 *p_context = context; 257 context->count = 0;
217 258
218 context->this = p_context; 259 if (task)
219 context->task = task; 260 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
220 261
221 if (task) 262 if (!task || (task == current))
222 set_tsk_thread_flag(task, TIF_DS_AREA_MSR); 263 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
223 264
224 if (!task || (task == current)) 265 *p_context = context;
225 wrmsrl(MSR_IA32_DS_AREA,
226 (unsigned long)context->ds);
227 }
228 spin_unlock_irqrestore(&ds_lock, irq);
229 } 266 }
230 267
231 context->count++; 268 context->count++;
232 269
270 spin_unlock_irqrestore(&ds_lock, irq);
271
272 if (context != new_context)
273 kfree(new_context);
274
233 return context; 275 return context;
234} 276}
235 277
@@ -242,8 +284,10 @@ static inline void ds_put_context(struct ds_context *context)
242 284
243 spin_lock_irqsave(&ds_lock, irq); 285 spin_lock_irqsave(&ds_lock, irq);
244 286
245 if (--context->count) 287 if (--context->count) {
246 goto out; 288 spin_unlock_irqrestore(&ds_lock, irq);
289 return;
290 }
247 291
248 *(context->this) = NULL; 292 *(context->this) = NULL;
249 293
@@ -253,14 +297,14 @@ static inline void ds_put_context(struct ds_context *context)
253 if (!context->task || (context->task == current)) 297 if (!context->task || (context->task == current))
254 wrmsrl(MSR_IA32_DS_AREA, 0); 298 wrmsrl(MSR_IA32_DS_AREA, 0);
255 299
256 kfree(context);
257 out:
258 spin_unlock_irqrestore(&ds_lock, irq); 300 spin_unlock_irqrestore(&ds_lock, irq);
301
302 kfree(context);
259} 303}
260 304
261 305
262/* 306/*
263 * Handle a buffer overflow 307 * Call the tracer's callback on a buffer overflow.
264 * 308 *
265 * context: the ds context 309 * context: the ds context
266 * qual: the buffer type 310 * qual: the buffer type
@@ -268,30 +312,247 @@ static inline void ds_put_context(struct ds_context *context)
268static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) 312static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
269{ 313{
270 switch (qual) { 314 switch (qual) {
271 case ds_bts: { 315 case ds_bts:
272 struct bts_tracer *tracer = 316 if (context->bts_master &&
273 container_of(context->owner[qual], 317 context->bts_master->ovfl)
274 struct bts_tracer, ds); 318 context->bts_master->ovfl(context->bts_master);
275 if (tracer->ovfl)
276 tracer->ovfl(tracer);
277 }
278 break; 319 break;
279 case ds_pebs: { 320 case ds_pebs:
280 struct pebs_tracer *tracer = 321 if (context->pebs_master &&
281 container_of(context->owner[qual], 322 context->pebs_master->ovfl)
282 struct pebs_tracer, ds); 323 context->pebs_master->ovfl(context->pebs_master);
283 if (tracer->ovfl) 324 break;
284 tracer->ovfl(tracer); 325 }
326}
327
328
329/*
330 * Write raw data into the BTS or PEBS buffer.
331 *
332 * The remainder of any partially written record is zeroed out.
333 *
334 * context: the DS context
335 * qual: the buffer type
336 * record: the data to write
337 * size: the size of the data
338 */
339static int ds_write(struct ds_context *context, enum ds_qualifier qual,
340 const void *record, size_t size)
341{
342 int bytes_written = 0;
343
344 if (!record)
345 return -EINVAL;
346
347 while (size) {
348 unsigned long base, index, end, write_end, int_th;
349 unsigned long write_size, adj_write_size;
350
351 /*
352 * write as much as possible without producing an
353 * overflow interrupt.
354 *
355 * interrupt_threshold must either be
356 * - bigger than absolute_maximum or
357 * - point to a record between buffer_base and absolute_maximum
358 *
359 * index points to a valid record.
360 */
361 base = ds_get(context->ds, qual, ds_buffer_base);
362 index = ds_get(context->ds, qual, ds_index);
363 end = ds_get(context->ds, qual, ds_absolute_maximum);
364 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
365
366 write_end = min(end, int_th);
367
368 /* if we are already beyond the interrupt threshold,
369 * we fill the entire buffer */
370 if (write_end <= index)
371 write_end = end;
372
373 if (write_end <= index)
374 break;
375
376 write_size = min((unsigned long) size, write_end - index);
377 memcpy((void *)index, record, write_size);
378
379 record = (const char *)record + write_size;
380 size -= write_size;
381 bytes_written += write_size;
382
383 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
384 adj_write_size *= ds_cfg.sizeof_rec[qual];
385
386 /* zero out trailing bytes */
387 memset((char *)index + write_size, 0,
388 adj_write_size - write_size);
389 index += adj_write_size;
390
391 if (index >= end)
392 index = base;
393 ds_set(context->ds, qual, ds_index, index);
394
395 if (index >= int_th)
396 ds_overflow(context, qual);
397 }
398
399 return bytes_written;
400}
401
402
403/*
404 * Branch Trace Store (BTS) uses the following format. Different
405 * architectures vary in the size of those fields.
406 * - source linear address
407 * - destination linear address
408 * - flags
409 *
410 * Later architectures use 64bit pointers throughout, whereas earlier
411 * architectures use 32bit pointers in 32bit mode.
412 *
413 * We compute the base address for the first 8 fields based on:
414 * - the field size stored in the DS configuration
415 * - the relative field position
416 *
417 * In order to store additional information in the BTS buffer, we use
418 * a special source address to indicate that the record requires
419 * special interpretation.
420 *
421 * Netburst indicated via a bit in the flags field whether the branch
422 * was predicted; this is ignored.
423 *
424 * We use two levels of abstraction:
425 * - the raw data level defined here
426 * - an arch-independent level defined in ds.h
427 */
428
429enum bts_field {
430 bts_from,
431 bts_to,
432 bts_flags,
433
434 bts_qual = bts_from,
435 bts_jiffies = bts_to,
436 bts_pid = bts_flags,
437
438 bts_qual_mask = (bts_qual_max - 1),
439 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
440};
441
442static inline unsigned long bts_get(const char *base, enum bts_field field)
443{
444 base += (ds_cfg.sizeof_field * field);
445 return *(unsigned long *)base;
446}
447
448static inline void bts_set(char *base, enum bts_field field, unsigned long val)
449{
450 base += (ds_cfg.sizeof_field * field);;
451 (*(unsigned long *)base) = val;
452}
453
454
455/*
456 * The raw BTS data is architecture dependent.
457 *
458 * For higher-level users, we give an arch-independent view.
459 * - ds.h defines struct bts_struct
460 * - bts_read translates one raw bts record into a bts_struct
461 * - bts_write translates one bts_struct into the raw format and
462 * writes it into the top of the parameter tracer's buffer.
463 *
464 * return: bytes read/written on success; -Eerrno, otherwise
465 */
466static int bts_read(struct bts_tracer *tracer, const void *at,
467 struct bts_struct *out)
468{
469 if (!tracer)
470 return -EINVAL;
471
472 if (at < tracer->trace.ds.begin)
473 return -EINVAL;
474
475 if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
476 return -EINVAL;
477
478 memset(out, 0, sizeof(*out));
479 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
480 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
481 out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
482 out->variant.timestamp.pid = bts_get(at, bts_pid);
483 } else {
484 out->qualifier = bts_branch;
485 out->variant.lbr.from = bts_get(at, bts_from);
486 out->variant.lbr.to = bts_get(at, bts_to);
487
488 if (!out->variant.lbr.from && !out->variant.lbr.to)
489 out->qualifier = bts_invalid;
285 } 490 }
491
492 return ds_cfg.sizeof_rec[ds_bts];
493}
494
495static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
496{
497 unsigned char raw[MAX_SIZEOF_BTS];
498
499 if (!tracer)
500 return -EINVAL;
501
502 if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
503 return -EOVERFLOW;
504
505 switch (in->qualifier) {
506 case bts_invalid:
507 bts_set(raw, bts_from, 0);
508 bts_set(raw, bts_to, 0);
509 bts_set(raw, bts_flags, 0);
510 break;
511 case bts_branch:
512 bts_set(raw, bts_from, in->variant.lbr.from);
513 bts_set(raw, bts_to, in->variant.lbr.to);
514 bts_set(raw, bts_flags, 0);
286 break; 515 break;
516 case bts_task_arrives:
517 case bts_task_departs:
518 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
519 bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
520 bts_set(raw, bts_pid, in->variant.timestamp.pid);
521 break;
522 default:
523 return -EINVAL;
287 } 524 }
525
526 return ds_write(tracer->ds.context, ds_bts, raw,
527 ds_cfg.sizeof_rec[ds_bts]);
288} 528}
289 529
290 530
291static void ds_install_ds_config(struct ds_context *context, 531static void ds_write_config(struct ds_context *context,
292 enum ds_qualifier qual, 532 struct ds_trace *cfg, enum ds_qualifier qual)
293 void *base, size_t size, size_t ith)
294{ 533{
534 unsigned char *ds = context->ds;
535
536 ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
537 ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
538 ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
539 ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
540}
541
542static void ds_read_config(struct ds_context *context,
543 struct ds_trace *cfg, enum ds_qualifier qual)
544{
545 unsigned char *ds = context->ds;
546
547 cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
548 cfg->top = (void *)ds_get(ds, qual, ds_index);
549 cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
550 cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
551}
552
553static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
554 void *base, size_t size, size_t ith,
555 unsigned int flags) {
295 unsigned long buffer, adj; 556 unsigned long buffer, adj;
296 557
297 /* adjust the buffer address and size to meet alignment 558 /* adjust the buffer address and size to meet alignment
@@ -308,32 +569,30 @@ static void ds_install_ds_config(struct ds_context *context,
308 buffer += adj; 569 buffer += adj;
309 size -= adj; 570 size -= adj;
310 571
311 size /= ds_cfg.sizeof_rec[qual]; 572 trace->n = size / ds_cfg.sizeof_rec[qual];
312 size *= ds_cfg.sizeof_rec[qual]; 573 trace->size = ds_cfg.sizeof_rec[qual];
313 574
314 ds_set(context->ds, qual, ds_buffer_base, buffer); 575 size = (trace->n * trace->size);
315 ds_set(context->ds, qual, ds_index, buffer);
316 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
317 576
577 trace->begin = (void *)buffer;
578 trace->top = trace->begin;
579 trace->end = (void *)(buffer + size);
318 /* The value for 'no threshold' is -1, which will set the 580 /* The value for 'no threshold' is -1, which will set the
319 * threshold outside of the buffer, just like we want it. 581 * threshold outside of the buffer, just like we want it.
320 */ 582 */
321 ds_set(context->ds, qual, 583 trace->ith = (void *)(buffer + size - ith);
322 ds_interrupt_threshold, buffer + size - ith); 584
585 trace->flags = flags;
323} 586}
324 587
325static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, 588
326 struct task_struct *task, 589static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
327 void *base, size_t size, size_t th) 590 enum ds_qualifier qual, struct task_struct *task,
591 void *base, size_t size, size_t th, unsigned int flags)
328{ 592{
329 struct ds_context *context; 593 struct ds_context *context;
330 unsigned long irq;
331 int error; 594 int error;
332 595
333 error = -EOPNOTSUPP;
334 if (!ds_cfg.sizeof_ds)
335 goto out;
336
337 error = -EINVAL; 596 error = -EINVAL;
338 if (!base) 597 if (!base)
339 goto out; 598 goto out;
@@ -360,43 +619,26 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual,
360 goto out; 619 goto out;
361 tracer->context = context; 620 tracer->context = context;
362 621
622 ds_init_ds_trace(trace, qual, base, size, th, flags);
363 623
364 spin_lock_irqsave(&ds_lock, irq); 624 error = 0;
365
366 error = -EPERM;
367 if (!check_tracer(task))
368 goto out_unlock;
369 get_tracer(task);
370
371 error = -EPERM;
372 if (context->owner[qual])
373 goto out_put_tracer;
374 context->owner[qual] = tracer;
375
376 spin_unlock_irqrestore(&ds_lock, irq);
377
378
379 ds_install_ds_config(context, qual, base, size, th);
380
381 return 0;
382
383 out_put_tracer:
384 put_tracer(task);
385 out_unlock:
386 spin_unlock_irqrestore(&ds_lock, irq);
387 ds_put_context(context);
388 tracer->context = NULL;
389 out: 625 out:
390 return error; 626 return error;
391} 627}
392 628
393struct bts_tracer *ds_request_bts(struct task_struct *task, 629struct bts_tracer *ds_request_bts(struct task_struct *task,
394 void *base, size_t size, 630 void *base, size_t size,
395 bts_ovfl_callback_t ovfl, size_t th) 631 bts_ovfl_callback_t ovfl, size_t th,
632 unsigned int flags)
396{ 633{
397 struct bts_tracer *tracer; 634 struct bts_tracer *tracer;
635 unsigned long irq;
398 int error; 636 int error;
399 637
638 error = -EOPNOTSUPP;
639 if (!ds_cfg.ctl[dsf_bts])
640 goto out;
641
400 /* buffer overflow notification is not yet implemented */ 642 /* buffer overflow notification is not yet implemented */
401 error = -EOPNOTSUPP; 643 error = -EOPNOTSUPP;
402 if (ovfl) 644 if (ovfl)
@@ -408,12 +650,40 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
408 goto out; 650 goto out;
409 tracer->ovfl = ovfl; 651 tracer->ovfl = ovfl;
410 652
411 error = ds_request(&tracer->ds, ds_bts, task, base, size, th); 653 error = ds_request(&tracer->ds, &tracer->trace.ds,
654 ds_bts, task, base, size, th, flags);
412 if (error < 0) 655 if (error < 0)
413 goto out_tracer; 656 goto out_tracer;
414 657
658
659 spin_lock_irqsave(&ds_lock, irq);
660
661 error = -EPERM;
662 if (!check_tracer(task))
663 goto out_unlock;
664 get_tracer(task);
665
666 error = -EPERM;
667 if (tracer->ds.context->bts_master)
668 goto out_put_tracer;
669 tracer->ds.context->bts_master = tracer;
670
671 spin_unlock_irqrestore(&ds_lock, irq);
672
673
674 tracer->trace.read = bts_read;
675 tracer->trace.write = bts_write;
676
677 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
678 ds_resume_bts(tracer);
679
415 return tracer; 680 return tracer;
416 681
682 out_put_tracer:
683 put_tracer(task);
684 out_unlock:
685 spin_unlock_irqrestore(&ds_lock, irq);
686 ds_put_context(tracer->ds.context);
417 out_tracer: 687 out_tracer:
418 kfree(tracer); 688 kfree(tracer);
419 out: 689 out:
@@ -422,9 +692,11 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
422 692
423struct pebs_tracer *ds_request_pebs(struct task_struct *task, 693struct pebs_tracer *ds_request_pebs(struct task_struct *task,
424 void *base, size_t size, 694 void *base, size_t size,
425 pebs_ovfl_callback_t ovfl, size_t th) 695 pebs_ovfl_callback_t ovfl, size_t th,
696 unsigned int flags)
426{ 697{
427 struct pebs_tracer *tracer; 698 struct pebs_tracer *tracer;
699 unsigned long irq;
428 int error; 700 int error;
429 701
430 /* buffer overflow notification is not yet implemented */ 702 /* buffer overflow notification is not yet implemented */
@@ -438,300 +710,171 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
438 goto out; 710 goto out;
439 tracer->ovfl = ovfl; 711 tracer->ovfl = ovfl;
440 712
441 error = ds_request(&tracer->ds, ds_pebs, task, base, size, th); 713 error = ds_request(&tracer->ds, &tracer->trace.ds,
714 ds_pebs, task, base, size, th, flags);
442 if (error < 0) 715 if (error < 0)
443 goto out_tracer; 716 goto out_tracer;
444 717
718 spin_lock_irqsave(&ds_lock, irq);
719
720 error = -EPERM;
721 if (!check_tracer(task))
722 goto out_unlock;
723 get_tracer(task);
724
725 error = -EPERM;
726 if (tracer->ds.context->pebs_master)
727 goto out_put_tracer;
728 tracer->ds.context->pebs_master = tracer;
729
730 spin_unlock_irqrestore(&ds_lock, irq);
731
732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
733 ds_resume_pebs(tracer);
734
445 return tracer; 735 return tracer;
446 736
737 out_put_tracer:
738 put_tracer(task);
739 out_unlock:
740 spin_unlock_irqrestore(&ds_lock, irq);
741 ds_put_context(tracer->ds.context);
447 out_tracer: 742 out_tracer:
448 kfree(tracer); 743 kfree(tracer);
449 out: 744 out:
450 return ERR_PTR(error); 745 return ERR_PTR(error);
451} 746}
452 747
453static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) 748void ds_release_bts(struct bts_tracer *tracer)
454{
455 BUG_ON(tracer->context->owner[qual] != tracer);
456 tracer->context->owner[qual] = NULL;
457
458 put_tracer(tracer->context->task);
459 ds_put_context(tracer->context);
460}
461
462int ds_release_bts(struct bts_tracer *tracer)
463{ 749{
464 if (!tracer) 750 if (!tracer)
465 return -EINVAL; 751 return;
466 752
467 ds_release(&tracer->ds, ds_bts); 753 ds_suspend_bts(tracer);
468 kfree(tracer);
469 754
470 return 0; 755 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
471} 756 tracer->ds.context->bts_master = NULL;
472 757
473int ds_release_pebs(struct pebs_tracer *tracer) 758 put_tracer(tracer->ds.context->task);
474{ 759 ds_put_context(tracer->ds.context);
475 if (!tracer)
476 return -EINVAL;
477 760
478 ds_release(&tracer->ds, ds_pebs);
479 kfree(tracer); 761 kfree(tracer);
480
481 return 0;
482} 762}
483 763
484static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual) 764void ds_suspend_bts(struct bts_tracer *tracer)
485{ 765{
486 unsigned long base, index; 766 struct task_struct *task;
487
488 base = ds_get(context->ds, qual, ds_buffer_base);
489 index = ds_get(context->ds, qual, ds_index);
490
491 return (index - base) / ds_cfg.sizeof_rec[qual];
492}
493 767
494int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos)
495{
496 if (!tracer) 768 if (!tracer)
497 return -EINVAL; 769 return;
498
499 if (!pos)
500 return -EINVAL;
501
502 *pos = ds_get_index(tracer->ds.context, ds_bts);
503
504 return 0;
505}
506 770
507int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos) 771 task = tracer->ds.context->task;
508{
509 if (!tracer)
510 return -EINVAL;
511 772
512 if (!pos) 773 if (!task || (task == current))
513 return -EINVAL; 774 update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
514 775
515 *pos = ds_get_index(tracer->ds.context, ds_pebs); 776 if (task) {
777 task->thread.debugctlmsr &= ~BTS_CONTROL;
516 778
517 return 0; 779 if (!task->thread.debugctlmsr)
780 clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
781 }
518} 782}
519 783
520static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual) 784void ds_resume_bts(struct bts_tracer *tracer)
521{ 785{
522 unsigned long base, max; 786 struct task_struct *task;
787 unsigned long control;
523 788
524 base = ds_get(context->ds, qual, ds_buffer_base);
525 max = ds_get(context->ds, qual, ds_absolute_maximum);
526
527 return (max - base) / ds_cfg.sizeof_rec[qual];
528}
529
530int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos)
531{
532 if (!tracer) 789 if (!tracer)
533 return -EINVAL; 790 return;
534
535 if (!pos)
536 return -EINVAL;
537
538 *pos = ds_get_end(tracer->ds.context, ds_bts);
539
540 return 0;
541}
542
543int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos)
544{
545 if (!tracer)
546 return -EINVAL;
547
548 if (!pos)
549 return -EINVAL;
550
551 *pos = ds_get_end(tracer->ds.context, ds_pebs);
552
553 return 0;
554}
555
556static int ds_access(struct ds_context *context, enum ds_qualifier qual,
557 size_t index, const void **record)
558{
559 unsigned long base, idx;
560
561 if (!record)
562 return -EINVAL;
563
564 base = ds_get(context->ds, qual, ds_buffer_base);
565 idx = base + (index * ds_cfg.sizeof_rec[qual]);
566
567 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
568 return -EINVAL;
569 791
570 *record = (const void *)idx; 792 task = tracer->ds.context->task;
571 793
572 return ds_cfg.sizeof_rec[qual]; 794 control = ds_cfg.ctl[dsf_bts];
573} 795 if (!(tracer->trace.ds.flags & BTS_KERNEL))
796 control |= ds_cfg.ctl[dsf_bts_kernel];
797 if (!(tracer->trace.ds.flags & BTS_USER))
798 control |= ds_cfg.ctl[dsf_bts_user];
574 799
575int ds_access_bts(struct bts_tracer *tracer, size_t index, 800 if (task) {
576 const void **record) 801 task->thread.debugctlmsr |= control;
577{ 802 set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
578 if (!tracer) 803 }
579 return -EINVAL;
580 804
581 return ds_access(tracer->ds.context, ds_bts, index, record); 805 if (!task || (task == current))
806 update_debugctlmsr(get_debugctlmsr() | control);
582} 807}
583 808
584int ds_access_pebs(struct pebs_tracer *tracer, size_t index, 809void ds_release_pebs(struct pebs_tracer *tracer)
585 const void **record)
586{ 810{
587 if (!tracer) 811 if (!tracer)
588 return -EINVAL; 812 return;
589
590 return ds_access(tracer->ds.context, ds_pebs, index, record);
591}
592
593static int ds_write(struct ds_context *context, enum ds_qualifier qual,
594 const void *record, size_t size)
595{
596 int bytes_written = 0;
597
598 if (!record)
599 return -EINVAL;
600
601 while (size) {
602 unsigned long base, index, end, write_end, int_th;
603 unsigned long write_size, adj_write_size;
604
605 /*
606 * write as much as possible without producing an
607 * overflow interrupt.
608 *
609 * interrupt_threshold must either be
610 * - bigger than absolute_maximum or
611 * - point to a record between buffer_base and absolute_maximum
612 *
613 * index points to a valid record.
614 */
615 base = ds_get(context->ds, qual, ds_buffer_base);
616 index = ds_get(context->ds, qual, ds_index);
617 end = ds_get(context->ds, qual, ds_absolute_maximum);
618 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
619
620 write_end = min(end, int_th);
621
622 /* if we are already beyond the interrupt threshold,
623 * we fill the entire buffer */
624 if (write_end <= index)
625 write_end = end;
626
627 if (write_end <= index)
628 break;
629
630 write_size = min((unsigned long) size, write_end - index);
631 memcpy((void *)index, record, write_size);
632
633 record = (const char *)record + write_size;
634 size -= write_size;
635 bytes_written += write_size;
636
637 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
638 adj_write_size *= ds_cfg.sizeof_rec[qual];
639
640 /* zero out trailing bytes */
641 memset((char *)index + write_size, 0,
642 adj_write_size - write_size);
643 index += adj_write_size;
644
645 if (index >= end)
646 index = base;
647 ds_set(context->ds, qual, ds_index, index);
648 813
649 if (index >= int_th) 814 ds_suspend_pebs(tracer);
650 ds_overflow(context, qual);
651 }
652 815
653 return bytes_written; 816 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
654} 817 tracer->ds.context->pebs_master = NULL;
655 818
656int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size) 819 put_tracer(tracer->ds.context->task);
657{ 820 ds_put_context(tracer->ds.context);
658 if (!tracer)
659 return -EINVAL;
660 821
661 return ds_write(tracer->ds.context, ds_bts, record, size); 822 kfree(tracer);
662} 823}
663 824
664int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size) 825void ds_suspend_pebs(struct pebs_tracer *tracer)
665{ 826{
666 if (!tracer)
667 return -EINVAL;
668 827
669 return ds_write(tracer->ds.context, ds_pebs, record, size);
670} 828}
671 829
672static void ds_reset_or_clear(struct ds_context *context, 830void ds_resume_pebs(struct pebs_tracer *tracer)
673 enum ds_qualifier qual, int clear)
674{ 831{
675 unsigned long base, end;
676
677 base = ds_get(context->ds, qual, ds_buffer_base);
678 end = ds_get(context->ds, qual, ds_absolute_maximum);
679 832
680 if (clear)
681 memset((void *)base, 0, end - base);
682
683 ds_set(context->ds, qual, ds_index, base);
684} 833}
685 834
686int ds_reset_bts(struct bts_tracer *tracer) 835const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
687{ 836{
688 if (!tracer) 837 if (!tracer)
689 return -EINVAL; 838 return NULL;
690 839
691 ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0); 840 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
692 841 return &tracer->trace;
693 return 0;
694} 842}
695 843
696int ds_reset_pebs(struct pebs_tracer *tracer) 844const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
697{ 845{
698 if (!tracer) 846 if (!tracer)
699 return -EINVAL; 847 return NULL;
700 848
701 ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0); 849 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
850 tracer->trace.reset_value =
851 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
702 852
703 return 0; 853 return &tracer->trace;
704} 854}
705 855
706int ds_clear_bts(struct bts_tracer *tracer) 856int ds_reset_bts(struct bts_tracer *tracer)
707{ 857{
708 if (!tracer) 858 if (!tracer)
709 return -EINVAL; 859 return -EINVAL;
710 860
711 ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1); 861 tracer->trace.ds.top = tracer->trace.ds.begin;
712
713 return 0;
714}
715
716int ds_clear_pebs(struct pebs_tracer *tracer)
717{
718 if (!tracer)
719 return -EINVAL;
720 862
721 ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1); 863 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
864 (unsigned long)tracer->trace.ds.top);
722 865
723 return 0; 866 return 0;
724} 867}
725 868
726int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value) 869int ds_reset_pebs(struct pebs_tracer *tracer)
727{ 870{
728 if (!tracer) 871 if (!tracer)
729 return -EINVAL; 872 return -EINVAL;
730 873
731 if (!value) 874 tracer->trace.ds.top = tracer->trace.ds.begin;
732 return -EINVAL;
733 875
734 *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); 876 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
877 (unsigned long)tracer->trace.ds.top);
735 878
736 return 0; 879 return 0;
737} 880}
@@ -746,35 +889,59 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
746 return 0; 889 return 0;
747} 890}
748 891
749static const struct ds_configuration ds_cfg_var = { 892static const struct ds_configuration ds_cfg_netburst = {
750 .sizeof_ds = sizeof(long) * 12, 893 .name = "netburst",
751 .sizeof_field = sizeof(long), 894 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
752 .sizeof_rec[ds_bts] = sizeof(long) * 3, 895 .ctl[dsf_bts_kernel] = (1 << 5),
896 .ctl[dsf_bts_user] = (1 << 6),
897
898 .sizeof_field = sizeof(long),
899 .sizeof_rec[ds_bts] = sizeof(long) * 3,
753#ifdef __i386__ 900#ifdef __i386__
754 .sizeof_rec[ds_pebs] = sizeof(long) * 10 901 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
755#else 902#else
756 .sizeof_rec[ds_pebs] = sizeof(long) * 18 903 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
757#endif 904#endif
758}; 905};
759static const struct ds_configuration ds_cfg_64 = { 906static const struct ds_configuration ds_cfg_pentium_m = {
760 .sizeof_ds = 8 * 12, 907 .name = "pentium m",
761 .sizeof_field = 8, 908 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
762 .sizeof_rec[ds_bts] = 8 * 3, 909
910 .sizeof_field = sizeof(long),
911 .sizeof_rec[ds_bts] = sizeof(long) * 3,
763#ifdef __i386__ 912#ifdef __i386__
764 .sizeof_rec[ds_pebs] = 8 * 10 913 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
765#else 914#else
766 .sizeof_rec[ds_pebs] = 8 * 18 915 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
767#endif 916#endif
768}; 917};
918static const struct ds_configuration ds_cfg_core2 = {
919 .name = "core 2",
920 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
921 .ctl[dsf_bts_kernel] = (1 << 9),
922 .ctl[dsf_bts_user] = (1 << 10),
923
924 .sizeof_field = 8,
925 .sizeof_rec[ds_bts] = 8 * 3,
926 .sizeof_rec[ds_pebs] = 8 * 18,
927};
769 928
770static inline void 929static void
771ds_configure(const struct ds_configuration *cfg) 930ds_configure(const struct ds_configuration *cfg)
772{ 931{
932 memset(&ds_cfg, 0, sizeof(ds_cfg));
773 ds_cfg = *cfg; 933 ds_cfg = *cfg;
774 934
775 printk(KERN_INFO "DS available\n"); 935 printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
936
937 if (!cpu_has_bts) {
938 ds_cfg.ctl[dsf_bts] = 0;
939 printk(KERN_INFO "[ds] bts not available\n");
940 }
941 if (!cpu_has_pebs)
942 printk(KERN_INFO "[ds] pebs not available\n");
776 943
777 BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); 944 WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
778} 945}
779 946
780void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) 947void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -787,10 +954,10 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
787 break; 954 break;
788 case 0xD: 955 case 0xD:
789 case 0xE: /* Pentium M */ 956 case 0xE: /* Pentium M */
790 ds_configure(&ds_cfg_var); 957 ds_configure(&ds_cfg_pentium_m);
791 break; 958 break;
792 default: /* Core2, Atom, ... */ 959 default: /* Core2, Atom, ... */
793 ds_configure(&ds_cfg_64); 960 ds_configure(&ds_cfg_core2);
794 break; 961 break;
795 } 962 }
796 break; 963 break;
@@ -799,7 +966,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
799 case 0x0: 966 case 0x0:
800 case 0x1: 967 case 0x1:
801 case 0x2: /* Netburst */ 968 case 0x2: /* Netburst */
802 ds_configure(&ds_cfg_var); 969 ds_configure(&ds_cfg_netburst);
803 break; 970 break;
804 default: 971 default:
805 /* sorry, don't know about them */ 972 /* sorry, don't know about them */
@@ -812,14 +979,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
812 } 979 }
813} 980}
814 981
815void ds_free(struct ds_context *context) 982/*
983 * Change the DS configuration from tracing prev to tracing next.
984 */
985void ds_switch_to(struct task_struct *prev, struct task_struct *next)
816{ 986{
817 /* This is called when the task owning the parameter context 987 struct ds_context *prev_ctx = prev->thread.ds_ctx;
818 * is dying. There should not be any user of that context left 988 struct ds_context *next_ctx = next->thread.ds_ctx;
819 * to disturb us, anymore. */ 989
820 unsigned long leftovers = context->count; 990 if (prev_ctx) {
821 while (leftovers--) { 991 update_debugctlmsr(0);
822 put_tracer(context->task); 992
823 ds_put_context(context); 993 if (prev_ctx->bts_master &&
994 (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
995 struct bts_struct ts = {
996 .qualifier = bts_task_departs,
997 .variant.timestamp.jiffies = jiffies_64,
998 .variant.timestamp.pid = prev->pid
999 };
1000 bts_write(prev_ctx->bts_master, &ts);
1001 }
1002 }
1003
1004 if (next_ctx) {
1005 if (next_ctx->bts_master &&
1006 (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
1007 struct bts_struct ts = {
1008 .qualifier = bts_task_arrives,
1009 .variant.timestamp.jiffies = jiffies_64,
1010 .variant.timestamp.pid = next->pid
1011 };
1012 bts_write(next_ctx->bts_master, &ts);
1013 }
1014
1015 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
824 } 1016 }
1017
1018 update_debugctlmsr(next->thread.debugctlmsr);
1019}
1020
1021void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
1022{
1023 clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
1024 tsk->thread.ds_ctx = NULL;
1025}
1026
1027void ds_exit_thread(struct task_struct *tsk)
1028{
1029 WARN_ON(tsk->thread.ds_ctx);
825} 1030}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7aafeb5263ef..65a13943e098 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -677,22 +677,6 @@ struct early_res {
677}; 677};
678static struct early_res early_res[MAX_EARLY_RES] __initdata = { 678static struct early_res early_res[MAX_EARLY_RES] __initdata = {
679 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ 679 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
680#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
681 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
682#endif
683#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
684 /*
685 * But first pinch a few for the stack/trampoline stuff
686 * FIXME: Don't need the extra page at 4K, but need to fix
687 * trampoline before removing it. (see the GDT stuff)
688 */
689 { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
690 /*
691 * Has to be in very low memory so we can execute
692 * real-mode AP code.
693 */
694 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
695#endif
696 {} 680 {}
697}; 681};
698 682
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 1b894b72c0f5..744aa7fc49d5 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -17,6 +17,7 @@
17#include <asm/io_apic.h> 17#include <asm/io_apic.h>
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <asm/iommu.h> 19#include <asm/iommu.h>
20#include <asm/gart.h>
20 21
21static void __init fix_hypertransport_config(int num, int slot, int func) 22static void __init fix_hypertransport_config(int num, int slot, int func)
22{ 23{
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 34ad997d3834..23b138e31e9c 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -875,49 +875,6 @@ static struct console early_dbgp_console = {
875}; 875};
876#endif 876#endif
877 877
878/* Console interface to a host file on AMD's SimNow! */
879
880static int simnow_fd;
881
882enum {
883 MAGIC1 = 0xBACCD00A,
884 MAGIC2 = 0xCA110000,
885 XOPEN = 5,
886 XWRITE = 4,
887};
888
889static noinline long simnow(long cmd, long a, long b, long c)
890{
891 long ret;
892
893 asm volatile("cpuid" :
894 "=a" (ret) :
895 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
896 return ret;
897}
898
899static void __init simnow_init(char *str)
900{
901 char *fn = "klog";
902
903 if (*str == '=')
904 fn = ++str;
905 /* error ignored */
906 simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
907}
908
909static void simnow_write(struct console *con, const char *s, unsigned n)
910{
911 simnow(XWRITE, simnow_fd, (unsigned long)s, n);
912}
913
914static struct console simnow_console = {
915 .name = "simnow",
916 .write = simnow_write,
917 .flags = CON_PRINTBUFFER,
918 .index = -1,
919};
920
921/* Direct interface for emergencies */ 878/* Direct interface for emergencies */
922static struct console *early_console = &early_vga_console; 879static struct console *early_console = &early_vga_console;
923static int __initdata early_console_initialized; 880static int __initdata early_console_initialized;
@@ -960,10 +917,6 @@ static int __init setup_early_printk(char *buf)
960 max_ypos = boot_params.screen_info.orig_video_lines; 917 max_ypos = boot_params.screen_info.orig_video_lines;
961 current_ypos = boot_params.screen_info.orig_y; 918 current_ypos = boot_params.screen_info.orig_y;
962 early_console = &early_vga_console; 919 early_console = &early_vga_console;
963 } else if (!strncmp(buf, "simnow", 6)) {
964 simnow_init(buf + 6);
965 early_console = &simnow_console;
966 keep_early = 1;
967#ifdef CONFIG_EARLY_PRINTK_DBGP 920#ifdef CONFIG_EARLY_PRINTK_DBGP
968 } else if (!strncmp(buf, "dbgp", 4)) { 921 } else if (!strncmp(buf, "dbgp", 4)) {
969 if (early_dbgp_init(buf+4) < 0) 922 if (early_dbgp_init(buf+4) < 0)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 43ceb3f454bf..d6f0490a7391 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -619,28 +619,37 @@ END(syscall_badsys)
61927:; 61927:;
620 620
621/* 621/*
622 * Build the entry stubs and pointer table with 622 * Build the entry stubs and pointer table with some assembler magic.
623 * some assembler magic. 623 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
624 * single cache line on all modern x86 implementations.
624 */ 625 */
625.section .rodata,"a" 626.section .init.rodata,"a"
626ENTRY(interrupt) 627ENTRY(interrupt)
627.text 628.text
628 629 .p2align 5
630 .p2align CONFIG_X86_L1_CACHE_SHIFT
629ENTRY(irq_entries_start) 631ENTRY(irq_entries_start)
630 RING0_INT_FRAME 632 RING0_INT_FRAME
631vector=0 633vector=FIRST_EXTERNAL_VECTOR
632.rept NR_VECTORS 634.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
633 ALIGN 635 .balign 32
634 .if vector 636 .rept 7
637 .if vector < NR_VECTORS
638 .if vector <> FIRST_EXTERNAL_VECTOR
635 CFI_ADJUST_CFA_OFFSET -4 639 CFI_ADJUST_CFA_OFFSET -4
636 .endif 640 .endif
6371: pushl $~(vector) 6411: pushl $(~vector+0x80) /* Note: always in signed byte range */
638 CFI_ADJUST_CFA_OFFSET 4 642 CFI_ADJUST_CFA_OFFSET 4
639 jmp common_interrupt 643 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
640 .previous 644 jmp 2f
645 .endif
646 .previous
641 .long 1b 647 .long 1b
642 .text 648 .text
643vector=vector+1 649vector=vector+1
650 .endif
651 .endr
6522: jmp common_interrupt
644.endr 653.endr
645END(irq_entries_start) 654END(irq_entries_start)
646 655
@@ -652,8 +661,9 @@ END(interrupt)
652 * the CPU automatically disables interrupts when executing an IRQ vector, 661 * the CPU automatically disables interrupts when executing an IRQ vector,
653 * so IRQ-flags tracing has to follow that: 662 * so IRQ-flags tracing has to follow that:
654 */ 663 */
655 ALIGN 664 .p2align CONFIG_X86_L1_CACHE_SHIFT
656common_interrupt: 665common_interrupt:
666 addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
657 SAVE_ALL 667 SAVE_ALL
658 TRACE_IRQS_OFF 668 TRACE_IRQS_OFF
659 movl %esp,%eax 669 movl %esp,%eax
@@ -678,65 +688,6 @@ ENDPROC(name)
678/* The include is where all of the SMP etc. interrupts come from */ 688/* The include is where all of the SMP etc. interrupts come from */
679#include "entry_arch.h" 689#include "entry_arch.h"
680 690
681KPROBE_ENTRY(page_fault)
682 RING0_EC_FRAME
683 pushl $do_page_fault
684 CFI_ADJUST_CFA_OFFSET 4
685 ALIGN
686error_code:
687 /* the function address is in %fs's slot on the stack */
688 pushl %es
689 CFI_ADJUST_CFA_OFFSET 4
690 /*CFI_REL_OFFSET es, 0*/
691 pushl %ds
692 CFI_ADJUST_CFA_OFFSET 4
693 /*CFI_REL_OFFSET ds, 0*/
694 pushl %eax
695 CFI_ADJUST_CFA_OFFSET 4
696 CFI_REL_OFFSET eax, 0
697 pushl %ebp
698 CFI_ADJUST_CFA_OFFSET 4
699 CFI_REL_OFFSET ebp, 0
700 pushl %edi
701 CFI_ADJUST_CFA_OFFSET 4
702 CFI_REL_OFFSET edi, 0
703 pushl %esi
704 CFI_ADJUST_CFA_OFFSET 4
705 CFI_REL_OFFSET esi, 0
706 pushl %edx
707 CFI_ADJUST_CFA_OFFSET 4
708 CFI_REL_OFFSET edx, 0
709 pushl %ecx
710 CFI_ADJUST_CFA_OFFSET 4
711 CFI_REL_OFFSET ecx, 0
712 pushl %ebx
713 CFI_ADJUST_CFA_OFFSET 4
714 CFI_REL_OFFSET ebx, 0
715 cld
716 pushl %fs
717 CFI_ADJUST_CFA_OFFSET 4
718 /*CFI_REL_OFFSET fs, 0*/
719 movl $(__KERNEL_PERCPU), %ecx
720 movl %ecx, %fs
721 UNWIND_ESPFIX_STACK
722 popl %ecx
723 CFI_ADJUST_CFA_OFFSET -4
724 /*CFI_REGISTER es, ecx*/
725 movl PT_FS(%esp), %edi # get the function address
726 movl PT_ORIG_EAX(%esp), %edx # get the error code
727 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
728 mov %ecx, PT_FS(%esp)
729 /*CFI_REL_OFFSET fs, ES*/
730 movl $(__USER_DS), %ecx
731 movl %ecx, %ds
732 movl %ecx, %es
733 TRACE_IRQS_OFF
734 movl %esp,%eax # pt_regs pointer
735 call *%edi
736 jmp ret_from_exception
737 CFI_ENDPROC
738KPROBE_END(page_fault)
739
740ENTRY(coprocessor_error) 691ENTRY(coprocessor_error)
741 RING0_INT_FRAME 692 RING0_INT_FRAME
742 pushl $0 693 pushl $0
@@ -767,140 +718,6 @@ ENTRY(device_not_available)
767 CFI_ENDPROC 718 CFI_ENDPROC
768END(device_not_available) 719END(device_not_available)
769 720
770/*
771 * Debug traps and NMI can happen at the one SYSENTER instruction
772 * that sets up the real kernel stack. Check here, since we can't
773 * allow the wrong stack to be used.
774 *
775 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
776 * already pushed 3 words if it hits on the sysenter instruction:
777 * eflags, cs and eip.
778 *
779 * We just load the right stack, and push the three (known) values
780 * by hand onto the new stack - while updating the return eip past
781 * the instruction that would have done it for sysenter.
782 */
783#define FIX_STACK(offset, ok, label) \
784 cmpw $__KERNEL_CS,4(%esp); \
785 jne ok; \
786label: \
787 movl TSS_sysenter_sp0+offset(%esp),%esp; \
788 CFI_DEF_CFA esp, 0; \
789 CFI_UNDEFINED eip; \
790 pushfl; \
791 CFI_ADJUST_CFA_OFFSET 4; \
792 pushl $__KERNEL_CS; \
793 CFI_ADJUST_CFA_OFFSET 4; \
794 pushl $sysenter_past_esp; \
795 CFI_ADJUST_CFA_OFFSET 4; \
796 CFI_REL_OFFSET eip, 0
797
798KPROBE_ENTRY(debug)
799 RING0_INT_FRAME
800 cmpl $ia32_sysenter_target,(%esp)
801 jne debug_stack_correct
802 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
803debug_stack_correct:
804 pushl $-1 # mark this as an int
805 CFI_ADJUST_CFA_OFFSET 4
806 SAVE_ALL
807 TRACE_IRQS_OFF
808 xorl %edx,%edx # error code 0
809 movl %esp,%eax # pt_regs pointer
810 call do_debug
811 jmp ret_from_exception
812 CFI_ENDPROC
813KPROBE_END(debug)
814
815/*
816 * NMI is doubly nasty. It can happen _while_ we're handling
817 * a debug fault, and the debug fault hasn't yet been able to
818 * clear up the stack. So we first check whether we got an
819 * NMI on the sysenter entry path, but after that we need to
820 * check whether we got an NMI on the debug path where the debug
821 * fault happened on the sysenter path.
822 */
823KPROBE_ENTRY(nmi)
824 RING0_INT_FRAME
825 pushl %eax
826 CFI_ADJUST_CFA_OFFSET 4
827 movl %ss, %eax
828 cmpw $__ESPFIX_SS, %ax
829 popl %eax
830 CFI_ADJUST_CFA_OFFSET -4
831 je nmi_espfix_stack
832 cmpl $ia32_sysenter_target,(%esp)
833 je nmi_stack_fixup
834 pushl %eax
835 CFI_ADJUST_CFA_OFFSET 4
836 movl %esp,%eax
837 /* Do not access memory above the end of our stack page,
838 * it might not exist.
839 */
840 andl $(THREAD_SIZE-1),%eax
841 cmpl $(THREAD_SIZE-20),%eax
842 popl %eax
843 CFI_ADJUST_CFA_OFFSET -4
844 jae nmi_stack_correct
845 cmpl $ia32_sysenter_target,12(%esp)
846 je nmi_debug_stack_check
847nmi_stack_correct:
848 /* We have a RING0_INT_FRAME here */
849 pushl %eax
850 CFI_ADJUST_CFA_OFFSET 4
851 SAVE_ALL
852 TRACE_IRQS_OFF
853 xorl %edx,%edx # zero error code
854 movl %esp,%eax # pt_regs pointer
855 call do_nmi
856 jmp restore_nocheck_notrace
857 CFI_ENDPROC
858
859nmi_stack_fixup:
860 RING0_INT_FRAME
861 FIX_STACK(12,nmi_stack_correct, 1)
862 jmp nmi_stack_correct
863
864nmi_debug_stack_check:
865 /* We have a RING0_INT_FRAME here */
866 cmpw $__KERNEL_CS,16(%esp)
867 jne nmi_stack_correct
868 cmpl $debug,(%esp)
869 jb nmi_stack_correct
870 cmpl $debug_esp_fix_insn,(%esp)
871 ja nmi_stack_correct
872 FIX_STACK(24,nmi_stack_correct, 1)
873 jmp nmi_stack_correct
874
875nmi_espfix_stack:
876 /* We have a RING0_INT_FRAME here.
877 *
878 * create the pointer to lss back
879 */
880 pushl %ss
881 CFI_ADJUST_CFA_OFFSET 4
882 pushl %esp
883 CFI_ADJUST_CFA_OFFSET 4
884 addw $4, (%esp)
885 /* copy the iret frame of 12 bytes */
886 .rept 3
887 pushl 16(%esp)
888 CFI_ADJUST_CFA_OFFSET 4
889 .endr
890 pushl %eax
891 CFI_ADJUST_CFA_OFFSET 4
892 SAVE_ALL
893 TRACE_IRQS_OFF
894 FIXUP_ESPFIX_STACK # %eax == %esp
895 xorl %edx,%edx # zero error code
896 call do_nmi
897 RESTORE_REGS
898 lss 12+4(%esp), %esp # back to espfix stack
899 CFI_ADJUST_CFA_OFFSET -24
900 jmp irq_return
901 CFI_ENDPROC
902KPROBE_END(nmi)
903
904#ifdef CONFIG_PARAVIRT 721#ifdef CONFIG_PARAVIRT
905ENTRY(native_iret) 722ENTRY(native_iret)
906 iret 723 iret
@@ -916,19 +733,6 @@ ENTRY(native_irq_enable_sysexit)
916END(native_irq_enable_sysexit) 733END(native_irq_enable_sysexit)
917#endif 734#endif
918 735
919KPROBE_ENTRY(int3)
920 RING0_INT_FRAME
921 pushl $-1 # mark this as an int
922 CFI_ADJUST_CFA_OFFSET 4
923 SAVE_ALL
924 TRACE_IRQS_OFF
925 xorl %edx,%edx # zero error code
926 movl %esp,%eax # pt_regs pointer
927 call do_int3
928 jmp ret_from_exception
929 CFI_ENDPROC
930KPROBE_END(int3)
931
932ENTRY(overflow) 736ENTRY(overflow)
933 RING0_INT_FRAME 737 RING0_INT_FRAME
934 pushl $0 738 pushl $0
@@ -993,14 +797,6 @@ ENTRY(stack_segment)
993 CFI_ENDPROC 797 CFI_ENDPROC
994END(stack_segment) 798END(stack_segment)
995 799
996KPROBE_ENTRY(general_protection)
997 RING0_EC_FRAME
998 pushl $do_general_protection
999 CFI_ADJUST_CFA_OFFSET 4
1000 jmp error_code
1001 CFI_ENDPROC
1002KPROBE_END(general_protection)
1003
1004ENTRY(alignment_check) 800ENTRY(alignment_check)
1005 RING0_EC_FRAME 801 RING0_EC_FRAME
1006 pushl $do_alignment_check 802 pushl $do_alignment_check
@@ -1051,6 +847,7 @@ ENTRY(kernel_thread_helper)
1051 push %eax 847 push %eax
1052 CFI_ADJUST_CFA_OFFSET 4 848 CFI_ADJUST_CFA_OFFSET 4
1053 call do_exit 849 call do_exit
850 ud2 # padding for call trace
1054 CFI_ENDPROC 851 CFI_ENDPROC
1055ENDPROC(kernel_thread_helper) 852ENDPROC(kernel_thread_helper)
1056 853
@@ -1259,3 +1056,227 @@ return_to_handler:
1259#include "syscall_table_32.S" 1056#include "syscall_table_32.S"
1260 1057
1261syscall_table_size=(.-sys_call_table) 1058syscall_table_size=(.-sys_call_table)
1059
1060/*
1061 * Some functions should be protected against kprobes
1062 */
1063 .pushsection .kprobes.text, "ax"
1064
1065ENTRY(page_fault)
1066 RING0_EC_FRAME
1067 pushl $do_page_fault
1068 CFI_ADJUST_CFA_OFFSET 4
1069 ALIGN
1070error_code:
1071 /* the function address is in %fs's slot on the stack */
1072 pushl %es
1073 CFI_ADJUST_CFA_OFFSET 4
1074 /*CFI_REL_OFFSET es, 0*/
1075 pushl %ds
1076 CFI_ADJUST_CFA_OFFSET 4
1077 /*CFI_REL_OFFSET ds, 0*/
1078 pushl %eax
1079 CFI_ADJUST_CFA_OFFSET 4
1080 CFI_REL_OFFSET eax, 0
1081 pushl %ebp
1082 CFI_ADJUST_CFA_OFFSET 4
1083 CFI_REL_OFFSET ebp, 0
1084 pushl %edi
1085 CFI_ADJUST_CFA_OFFSET 4
1086 CFI_REL_OFFSET edi, 0
1087 pushl %esi
1088 CFI_ADJUST_CFA_OFFSET 4
1089 CFI_REL_OFFSET esi, 0
1090 pushl %edx
1091 CFI_ADJUST_CFA_OFFSET 4
1092 CFI_REL_OFFSET edx, 0
1093 pushl %ecx
1094 CFI_ADJUST_CFA_OFFSET 4
1095 CFI_REL_OFFSET ecx, 0
1096 pushl %ebx
1097 CFI_ADJUST_CFA_OFFSET 4
1098 CFI_REL_OFFSET ebx, 0
1099 cld
1100 pushl %fs
1101 CFI_ADJUST_CFA_OFFSET 4
1102 /*CFI_REL_OFFSET fs, 0*/
1103 movl $(__KERNEL_PERCPU), %ecx
1104 movl %ecx, %fs
1105 UNWIND_ESPFIX_STACK
1106 popl %ecx
1107 CFI_ADJUST_CFA_OFFSET -4
1108 /*CFI_REGISTER es, ecx*/
1109 movl PT_FS(%esp), %edi # get the function address
1110 movl PT_ORIG_EAX(%esp), %edx # get the error code
1111 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
1112 mov %ecx, PT_FS(%esp)
1113 /*CFI_REL_OFFSET fs, ES*/
1114 movl $(__USER_DS), %ecx
1115 movl %ecx, %ds
1116 movl %ecx, %es
1117 TRACE_IRQS_OFF
1118 movl %esp,%eax # pt_regs pointer
1119 call *%edi
1120 jmp ret_from_exception
1121 CFI_ENDPROC
1122END(page_fault)
1123
1124/*
1125 * Debug traps and NMI can happen at the one SYSENTER instruction
1126 * that sets up the real kernel stack. Check here, since we can't
1127 * allow the wrong stack to be used.
1128 *
1129 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
1130 * already pushed 3 words if it hits on the sysenter instruction:
1131 * eflags, cs and eip.
1132 *
1133 * We just load the right stack, and push the three (known) values
1134 * by hand onto the new stack - while updating the return eip past
1135 * the instruction that would have done it for sysenter.
1136 */
1137#define FIX_STACK(offset, ok, label) \
1138 cmpw $__KERNEL_CS,4(%esp); \
1139 jne ok; \
1140label: \
1141 movl TSS_sysenter_sp0+offset(%esp),%esp; \
1142 CFI_DEF_CFA esp, 0; \
1143 CFI_UNDEFINED eip; \
1144 pushfl; \
1145 CFI_ADJUST_CFA_OFFSET 4; \
1146 pushl $__KERNEL_CS; \
1147 CFI_ADJUST_CFA_OFFSET 4; \
1148 pushl $sysenter_past_esp; \
1149 CFI_ADJUST_CFA_OFFSET 4; \
1150 CFI_REL_OFFSET eip, 0
1151
1152ENTRY(debug)
1153 RING0_INT_FRAME
1154 cmpl $ia32_sysenter_target,(%esp)
1155 jne debug_stack_correct
1156 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
1157debug_stack_correct:
1158 pushl $-1 # mark this as an int
1159 CFI_ADJUST_CFA_OFFSET 4
1160 SAVE_ALL
1161 TRACE_IRQS_OFF
1162 xorl %edx,%edx # error code 0
1163 movl %esp,%eax # pt_regs pointer
1164 call do_debug
1165 jmp ret_from_exception
1166 CFI_ENDPROC
1167END(debug)
1168
1169/*
1170 * NMI is doubly nasty. It can happen _while_ we're handling
1171 * a debug fault, and the debug fault hasn't yet been able to
1172 * clear up the stack. So we first check whether we got an
1173 * NMI on the sysenter entry path, but after that we need to
1174 * check whether we got an NMI on the debug path where the debug
1175 * fault happened on the sysenter path.
1176 */
1177ENTRY(nmi)
1178 RING0_INT_FRAME
1179 pushl %eax
1180 CFI_ADJUST_CFA_OFFSET 4
1181 movl %ss, %eax
1182 cmpw $__ESPFIX_SS, %ax
1183 popl %eax
1184 CFI_ADJUST_CFA_OFFSET -4
1185 je nmi_espfix_stack
1186 cmpl $ia32_sysenter_target,(%esp)
1187 je nmi_stack_fixup
1188 pushl %eax
1189 CFI_ADJUST_CFA_OFFSET 4
1190 movl %esp,%eax
1191 /* Do not access memory above the end of our stack page,
1192 * it might not exist.
1193 */
1194 andl $(THREAD_SIZE-1),%eax
1195 cmpl $(THREAD_SIZE-20),%eax
1196 popl %eax
1197 CFI_ADJUST_CFA_OFFSET -4
1198 jae nmi_stack_correct
1199 cmpl $ia32_sysenter_target,12(%esp)
1200 je nmi_debug_stack_check
1201nmi_stack_correct:
1202 /* We have a RING0_INT_FRAME here */
1203 pushl %eax
1204 CFI_ADJUST_CFA_OFFSET 4
1205 SAVE_ALL
1206 TRACE_IRQS_OFF
1207 xorl %edx,%edx # zero error code
1208 movl %esp,%eax # pt_regs pointer
1209 call do_nmi
1210 jmp restore_nocheck_notrace
1211 CFI_ENDPROC
1212
1213nmi_stack_fixup:
1214 RING0_INT_FRAME
1215 FIX_STACK(12,nmi_stack_correct, 1)
1216 jmp nmi_stack_correct
1217
1218nmi_debug_stack_check:
1219 /* We have a RING0_INT_FRAME here */
1220 cmpw $__KERNEL_CS,16(%esp)
1221 jne nmi_stack_correct
1222 cmpl $debug,(%esp)
1223 jb nmi_stack_correct
1224 cmpl $debug_esp_fix_insn,(%esp)
1225 ja nmi_stack_correct
1226 FIX_STACK(24,nmi_stack_correct, 1)
1227 jmp nmi_stack_correct
1228
1229nmi_espfix_stack:
1230 /* We have a RING0_INT_FRAME here.
1231 *
1232 * create the pointer to lss back
1233 */
1234 pushl %ss
1235 CFI_ADJUST_CFA_OFFSET 4
1236 pushl %esp
1237 CFI_ADJUST_CFA_OFFSET 4
1238 addw $4, (%esp)
1239 /* copy the iret frame of 12 bytes */
1240 .rept 3
1241 pushl 16(%esp)
1242 CFI_ADJUST_CFA_OFFSET 4
1243 .endr
1244 pushl %eax
1245 CFI_ADJUST_CFA_OFFSET 4
1246 SAVE_ALL
1247 TRACE_IRQS_OFF
1248 FIXUP_ESPFIX_STACK # %eax == %esp
1249 xorl %edx,%edx # zero error code
1250 call do_nmi
1251 RESTORE_REGS
1252 lss 12+4(%esp), %esp # back to espfix stack
1253 CFI_ADJUST_CFA_OFFSET -24
1254 jmp irq_return
1255 CFI_ENDPROC
1256END(nmi)
1257
1258ENTRY(int3)
1259 RING0_INT_FRAME
1260 pushl $-1 # mark this as an int
1261 CFI_ADJUST_CFA_OFFSET 4
1262 SAVE_ALL
1263 TRACE_IRQS_OFF
1264 xorl %edx,%edx # zero error code
1265 movl %esp,%eax # pt_regs pointer
1266 call do_int3
1267 jmp ret_from_exception
1268 CFI_ENDPROC
1269END(int3)
1270
1271ENTRY(general_protection)
1272 RING0_EC_FRAME
1273 pushl $do_general_protection
1274 CFI_ADJUST_CFA_OFFSET 4
1275 jmp error_code
1276 CFI_ENDPROC
1277END(general_protection)
1278
1279/*
1280 * End of kprobes section
1281 */
1282 .popsection
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 54e0bbdccb99..e28c7a987793 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -11,15 +11,15 @@
11 * 11 *
12 * NOTE: This code handles signal-recognition, which happens every time 12 * NOTE: This code handles signal-recognition, which happens every time
13 * after an interrupt and after each system call. 13 * after an interrupt and after each system call.
14 * 14 *
15 * Normal syscalls and interrupts don't save a full stack frame, this is 15 * Normal syscalls and interrupts don't save a full stack frame, this is
16 * only done for syscall tracing, signals or fork/exec et.al. 16 * only done for syscall tracing, signals or fork/exec et.al.
17 * 17 *
18 * A note on terminology: 18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP 19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack. 20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers upto R11. 21 * - partial stack frame: partially saved registers upto R11.
22 * - full stack frame: Like partial stack frame, but all register saved. 22 * - full stack frame: Like partial stack frame, but all register saved.
23 * 23 *
24 * Some macro usage: 24 * Some macro usage:
25 * - CFI macros are used to generate dwarf2 unwind information for better 25 * - CFI macros are used to generate dwarf2 unwind information for better
@@ -60,7 +60,6 @@
60#define __AUDIT_ARCH_LE 0x40000000 60#define __AUDIT_ARCH_LE 0x40000000
61 61
62 .code64 62 .code64
63
64#ifdef CONFIG_FUNCTION_TRACER 63#ifdef CONFIG_FUNCTION_TRACER
65#ifdef CONFIG_DYNAMIC_FTRACE 64#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount) 65ENTRY(mcount)
@@ -71,15 +70,7 @@ ENTRY(ftrace_caller)
71 cmpl $0, function_trace_stop 70 cmpl $0, function_trace_stop
72 jne ftrace_stub 71 jne ftrace_stub
73 72
74 /* taken from glibc */ 73 MCOUNT_SAVE_FRAME
75 subq $0x38, %rsp
76 movq %rax, (%rsp)
77 movq %rcx, 8(%rsp)
78 movq %rdx, 16(%rsp)
79 movq %rsi, 24(%rsp)
80 movq %rdi, 32(%rsp)
81 movq %r8, 40(%rsp)
82 movq %r9, 48(%rsp)
83 74
84 movq 0x38(%rsp), %rdi 75 movq 0x38(%rsp), %rdi
85 movq 8(%rbp), %rsi 76 movq 8(%rbp), %rsi
@@ -89,14 +80,7 @@ ENTRY(ftrace_caller)
89ftrace_call: 80ftrace_call:
90 call ftrace_stub 81 call ftrace_stub
91 82
92 movq 48(%rsp), %r9 83 MCOUNT_RESTORE_FRAME
93 movq 40(%rsp), %r8
94 movq 32(%rsp), %rdi
95 movq 24(%rsp), %rsi
96 movq 16(%rsp), %rdx
97 movq 8(%rsp), %rcx
98 movq (%rsp), %rax
99 addq $0x38, %rsp
100 84
101#ifdef CONFIG_FUNCTION_GRAPH_TRACER 85#ifdef CONFIG_FUNCTION_GRAPH_TRACER
102.globl ftrace_graph_call 86.globl ftrace_graph_call
@@ -130,15 +114,7 @@ ftrace_stub:
130 retq 114 retq
131 115
132trace: 116trace:
133 /* taken from glibc */ 117 MCOUNT_SAVE_FRAME
134 subq $0x38, %rsp
135 movq %rax, (%rsp)
136 movq %rcx, 8(%rsp)
137 movq %rdx, 16(%rsp)
138 movq %rsi, 24(%rsp)
139 movq %rdi, 32(%rsp)
140 movq %r8, 40(%rsp)
141 movq %r9, 48(%rsp)
142 118
143 movq 0x38(%rsp), %rdi 119 movq 0x38(%rsp), %rdi
144 movq 8(%rbp), %rsi 120 movq 8(%rbp), %rsi
@@ -146,14 +122,7 @@ trace:
146 122
147 call *ftrace_trace_function 123 call *ftrace_trace_function
148 124
149 movq 48(%rsp), %r9 125 MCOUNT_RESTORE_FRAME
150 movq 40(%rsp), %r8
151 movq 32(%rsp), %rdi
152 movq 24(%rsp), %rsi
153 movq 16(%rsp), %rdx
154 movq 8(%rsp), %rcx
155 movq (%rsp), %rax
156 addq $0x38, %rsp
157 126
158 jmp ftrace_stub 127 jmp ftrace_stub
159END(mcount) 128END(mcount)
@@ -165,14 +134,7 @@ ENTRY(ftrace_graph_caller)
165 cmpl $0, function_trace_stop 134 cmpl $0, function_trace_stop
166 jne ftrace_stub 135 jne ftrace_stub
167 136
168 subq $0x38, %rsp 137 MCOUNT_SAVE_FRAME
169 movq %rax, (%rsp)
170 movq %rcx, 8(%rsp)
171 movq %rdx, 16(%rsp)
172 movq %rsi, 24(%rsp)
173 movq %rdi, 32(%rsp)
174 movq %r8, 40(%rsp)
175 movq %r9, 48(%rsp)
176 138
177 leaq 8(%rbp), %rdi 139 leaq 8(%rbp), %rdi
178 movq 0x38(%rsp), %rsi 140 movq 0x38(%rsp), %rsi
@@ -180,14 +142,8 @@ ENTRY(ftrace_graph_caller)
180 142
181 call prepare_ftrace_return 143 call prepare_ftrace_return
182 144
183 movq 48(%rsp), %r9 145 MCOUNT_RESTORE_FRAME
184 movq 40(%rsp), %r8 146
185 movq 32(%rsp), %rdi
186 movq 24(%rsp), %rsi
187 movq 16(%rsp), %rdx
188 movq 8(%rsp), %rcx
189 movq (%rsp), %rax
190 addq $0x38, %rsp
191 retq 147 retq
192END(ftrace_graph_caller) 148END(ftrace_graph_caller)
193 149
@@ -225,7 +181,7 @@ return_to_handler:
225 181
226#ifndef CONFIG_PREEMPT 182#ifndef CONFIG_PREEMPT
227#define retint_kernel retint_restore_args 183#define retint_kernel retint_restore_args
228#endif 184#endif
229 185
230#ifdef CONFIG_PARAVIRT 186#ifdef CONFIG_PARAVIRT
231ENTRY(native_usergs_sysret64) 187ENTRY(native_usergs_sysret64)
@@ -244,29 +200,29 @@ ENTRY(native_usergs_sysret64)
244.endm 200.endm
245 201
246/* 202/*
247 * C code is not supposed to know about undefined top of stack. Every time 203 * C code is not supposed to know about undefined top of stack. Every time
248 * a C function with an pt_regs argument is called from the SYSCALL based 204 * a C function with an pt_regs argument is called from the SYSCALL based
249 * fast path FIXUP_TOP_OF_STACK is needed. 205 * fast path FIXUP_TOP_OF_STACK is needed.
250 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs 206 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
251 * manipulation. 207 * manipulation.
252 */ 208 */
253 209
254 /* %rsp:at FRAMEEND */ 210 /* %rsp:at FRAMEEND */
255 .macro FIXUP_TOP_OF_STACK tmp 211 .macro FIXUP_TOP_OF_STACK tmp offset=0
256 movq %gs:pda_oldrsp,\tmp 212 movq %gs:pda_oldrsp,\tmp
257 movq \tmp,RSP(%rsp) 213 movq \tmp,RSP+\offset(%rsp)
258 movq $__USER_DS,SS(%rsp) 214 movq $__USER_DS,SS+\offset(%rsp)
259 movq $__USER_CS,CS(%rsp) 215 movq $__USER_CS,CS+\offset(%rsp)
260 movq $-1,RCX(%rsp) 216 movq $-1,RCX+\offset(%rsp)
261 movq R11(%rsp),\tmp /* get eflags */ 217 movq R11+\offset(%rsp),\tmp /* get eflags */
262 movq \tmp,EFLAGS(%rsp) 218 movq \tmp,EFLAGS+\offset(%rsp)
263 .endm 219 .endm
264 220
265 .macro RESTORE_TOP_OF_STACK tmp,offset=0 221 .macro RESTORE_TOP_OF_STACK tmp offset=0
266 movq RSP-\offset(%rsp),\tmp 222 movq RSP+\offset(%rsp),\tmp
267 movq \tmp,%gs:pda_oldrsp 223 movq \tmp,%gs:pda_oldrsp
268 movq EFLAGS-\offset(%rsp),\tmp 224 movq EFLAGS+\offset(%rsp),\tmp
269 movq \tmp,R11-\offset(%rsp) 225 movq \tmp,R11+\offset(%rsp)
270 .endm 226 .endm
271 227
272 .macro FAKE_STACK_FRAME child_rip 228 .macro FAKE_STACK_FRAME child_rip
@@ -278,7 +234,7 @@ ENTRY(native_usergs_sysret64)
278 pushq %rax /* rsp */ 234 pushq %rax /* rsp */
279 CFI_ADJUST_CFA_OFFSET 8 235 CFI_ADJUST_CFA_OFFSET 8
280 CFI_REL_OFFSET rsp,0 236 CFI_REL_OFFSET rsp,0
281 pushq $(1<<9) /* eflags - interrupts on */ 237 pushq $X86_EFLAGS_IF /* eflags - interrupts on */
282 CFI_ADJUST_CFA_OFFSET 8 238 CFI_ADJUST_CFA_OFFSET 8
283 /*CFI_REL_OFFSET rflags,0*/ 239 /*CFI_REL_OFFSET rflags,0*/
284 pushq $__KERNEL_CS /* cs */ 240 pushq $__KERNEL_CS /* cs */
@@ -296,62 +252,184 @@ ENTRY(native_usergs_sysret64)
296 CFI_ADJUST_CFA_OFFSET -(6*8) 252 CFI_ADJUST_CFA_OFFSET -(6*8)
297 .endm 253 .endm
298 254
299 .macro CFI_DEFAULT_STACK start=1 255/*
256 * initial frame state for interrupts (and exceptions without error code)
257 */
258 .macro EMPTY_FRAME start=1 offset=0
300 .if \start 259 .if \start
301 CFI_STARTPROC simple 260 CFI_STARTPROC simple
302 CFI_SIGNAL_FRAME 261 CFI_SIGNAL_FRAME
303 CFI_DEF_CFA rsp,SS+8 262 CFI_DEF_CFA rsp,8+\offset
304 .else 263 .else
305 CFI_DEF_CFA_OFFSET SS+8 264 CFI_DEF_CFA_OFFSET 8+\offset
306 .endif 265 .endif
307 CFI_REL_OFFSET r15,R15
308 CFI_REL_OFFSET r14,R14
309 CFI_REL_OFFSET r13,R13
310 CFI_REL_OFFSET r12,R12
311 CFI_REL_OFFSET rbp,RBP
312 CFI_REL_OFFSET rbx,RBX
313 CFI_REL_OFFSET r11,R11
314 CFI_REL_OFFSET r10,R10
315 CFI_REL_OFFSET r9,R9
316 CFI_REL_OFFSET r8,R8
317 CFI_REL_OFFSET rax,RAX
318 CFI_REL_OFFSET rcx,RCX
319 CFI_REL_OFFSET rdx,RDX
320 CFI_REL_OFFSET rsi,RSI
321 CFI_REL_OFFSET rdi,RDI
322 CFI_REL_OFFSET rip,RIP
323 /*CFI_REL_OFFSET cs,CS*/
324 /*CFI_REL_OFFSET rflags,EFLAGS*/
325 CFI_REL_OFFSET rsp,RSP
326 /*CFI_REL_OFFSET ss,SS*/
327 .endm 266 .endm
267
328/* 268/*
329 * A newly forked process directly context switches into this. 269 * initial frame state for interrupts (and exceptions without error code)
330 */ 270 */
331/* rdi: prev */ 271 .macro INTR_FRAME start=1 offset=0
272 EMPTY_FRAME \start, SS+8+\offset-RIP
273 /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
274 CFI_REL_OFFSET rsp, RSP+\offset-RIP
275 /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
276 /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
277 CFI_REL_OFFSET rip, RIP+\offset-RIP
278 .endm
279
280/*
281 * initial frame state for exceptions with error code (and interrupts
282 * with vector already pushed)
283 */
284 .macro XCPT_FRAME start=1 offset=0
285 INTR_FRAME \start, RIP+\offset-ORIG_RAX
286 /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
287 .endm
288
289/*
290 * frame that enables calling into C.
291 */
292 .macro PARTIAL_FRAME start=1 offset=0
293 XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
294 CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
295 CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
296 CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
297 CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
298 CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
299 CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
300 CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
301 CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
302 CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
303 .endm
304
305/*
306 * frame that enables passing a complete pt_regs to a C function.
307 */
308 .macro DEFAULT_FRAME start=1 offset=0
309 PARTIAL_FRAME \start, R11+\offset-R15
310 CFI_REL_OFFSET rbx, RBX+\offset
311 CFI_REL_OFFSET rbp, RBP+\offset
312 CFI_REL_OFFSET r12, R12+\offset
313 CFI_REL_OFFSET r13, R13+\offset
314 CFI_REL_OFFSET r14, R14+\offset
315 CFI_REL_OFFSET r15, R15+\offset
316 .endm
317
318/* save partial stack frame */
319ENTRY(save_args)
320 XCPT_FRAME
321 cld
322 movq_cfi rdi, RDI+16-ARGOFFSET
323 movq_cfi rsi, RSI+16-ARGOFFSET
324 movq_cfi rdx, RDX+16-ARGOFFSET
325 movq_cfi rcx, RCX+16-ARGOFFSET
326 movq_cfi rax, RAX+16-ARGOFFSET
327 movq_cfi r8, R8+16-ARGOFFSET
328 movq_cfi r9, R9+16-ARGOFFSET
329 movq_cfi r10, R10+16-ARGOFFSET
330 movq_cfi r11, R11+16-ARGOFFSET
331
332 leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
333 movq_cfi rbp, 8 /* push %rbp */
334 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
335 testl $3, CS(%rdi)
336 je 1f
337 SWAPGS
338 /*
339 * irqcount is used to check if a CPU is already on an interrupt stack
340 * or not. While this is essentially redundant with preempt_count it is
341 * a little cheaper to use a separate counter in the PDA (short of
342 * moving irq_enter into assembly, which would be too much work)
343 */
3441: incl %gs:pda_irqcount
345 jne 2f
346 popq_cfi %rax /* move return address... */
347 mov %gs:pda_irqstackptr,%rsp
348 EMPTY_FRAME 0
349 pushq_cfi %rax /* ... to the new stack */
350 /*
351 * We entered an interrupt context - irqs are off:
352 */
3532: TRACE_IRQS_OFF
354 ret
355 CFI_ENDPROC
356END(save_args)
357
358ENTRY(save_rest)
359 PARTIAL_FRAME 1 REST_SKIP+8
360 movq 5*8+16(%rsp), %r11 /* save return address */
361 movq_cfi rbx, RBX+16
362 movq_cfi rbp, RBP+16
363 movq_cfi r12, R12+16
364 movq_cfi r13, R13+16
365 movq_cfi r14, R14+16
366 movq_cfi r15, R15+16
367 movq %r11, 8(%rsp) /* return address */
368 FIXUP_TOP_OF_STACK %r11, 16
369 ret
370 CFI_ENDPROC
371END(save_rest)
372
373/* save complete stack frame */
374ENTRY(save_paranoid)
375 XCPT_FRAME 1 RDI+8
376 cld
377 movq_cfi rdi, RDI+8
378 movq_cfi rsi, RSI+8
379 movq_cfi rdx, RDX+8
380 movq_cfi rcx, RCX+8
381 movq_cfi rax, RAX+8
382 movq_cfi r8, R8+8
383 movq_cfi r9, R9+8
384 movq_cfi r10, R10+8
385 movq_cfi r11, R11+8
386 movq_cfi rbx, RBX+8
387 movq_cfi rbp, RBP+8
388 movq_cfi r12, R12+8
389 movq_cfi r13, R13+8
390 movq_cfi r14, R14+8
391 movq_cfi r15, R15+8
392 movl $1,%ebx
393 movl $MSR_GS_BASE,%ecx
394 rdmsr
395 testl %edx,%edx
396 js 1f /* negative -> in kernel */
397 SWAPGS
398 xorl %ebx,%ebx
3991: ret
400 CFI_ENDPROC
401END(save_paranoid)
402
403/*
404 * A newly forked process directly context switches into this address.
405 *
406 * rdi: prev task we switched from
407 */
332ENTRY(ret_from_fork) 408ENTRY(ret_from_fork)
333 CFI_DEFAULT_STACK 409 DEFAULT_FRAME
410
334 push kernel_eflags(%rip) 411 push kernel_eflags(%rip)
335 CFI_ADJUST_CFA_OFFSET 8 412 CFI_ADJUST_CFA_OFFSET 8
336 popf # reset kernel eflags 413 popf # reset kernel eflags
337 CFI_ADJUST_CFA_OFFSET -8 414 CFI_ADJUST_CFA_OFFSET -8
338 call schedule_tail 415
416 call schedule_tail # rdi: 'prev' task parameter
417
339 GET_THREAD_INFO(%rcx) 418 GET_THREAD_INFO(%rcx)
340 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 419
341 jnz rff_trace 420 CFI_REMEMBER_STATE
342rff_action:
343 RESTORE_REST 421 RESTORE_REST
344 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? 422
423 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
345 je int_ret_from_sys_call 424 je int_ret_from_sys_call
346 testl $_TIF_IA32,TI_flags(%rcx) 425
426 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
347 jnz int_ret_from_sys_call 427 jnz int_ret_from_sys_call
348 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET 428
349 jmp ret_from_sys_call 429 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
350rff_trace: 430 jmp ret_from_sys_call # go to the SYSRET fastpath
351 movq %rsp,%rdi 431
352 call syscall_trace_leave 432 CFI_RESTORE_STATE
353 GET_THREAD_INFO(%rcx)
354 jmp rff_action
355 CFI_ENDPROC 433 CFI_ENDPROC
356END(ret_from_fork) 434END(ret_from_fork)
357 435
@@ -361,20 +439,20 @@ END(ret_from_fork)
361 * SYSCALL does not save anything on the stack and does not change the 439 * SYSCALL does not save anything on the stack and does not change the
362 * stack pointer. 440 * stack pointer.
363 */ 441 */
364 442
365/* 443/*
366 * Register setup: 444 * Register setup:
367 * rax system call number 445 * rax system call number
368 * rdi arg0 446 * rdi arg0
369 * rcx return address for syscall/sysret, C arg3 447 * rcx return address for syscall/sysret, C arg3
370 * rsi arg1 448 * rsi arg1
371 * rdx arg2 449 * rdx arg2
372 * r10 arg3 (--> moved to rcx for C) 450 * r10 arg3 (--> moved to rcx for C)
373 * r8 arg4 451 * r8 arg4
374 * r9 arg5 452 * r9 arg5
375 * r11 eflags for syscall/sysret, temporary for C 453 * r11 eflags for syscall/sysret, temporary for C
376 * r12-r15,rbp,rbx saved by C code, not touched. 454 * r12-r15,rbp,rbx saved by C code, not touched.
377 * 455 *
378 * Interrupts are off on entry. 456 * Interrupts are off on entry.
379 * Only called from user space. 457 * Only called from user space.
380 * 458 *
@@ -384,7 +462,7 @@ END(ret_from_fork)
384 * When user can change the frames always force IRET. That is because 462 * When user can change the frames always force IRET. That is because
385 * it deals with uncanonical addresses better. SYSRET has trouble 463 * it deals with uncanonical addresses better. SYSRET has trouble
386 * with them due to bugs in both AMD and Intel CPUs. 464 * with them due to bugs in both AMD and Intel CPUs.
387 */ 465 */
388 466
389ENTRY(system_call) 467ENTRY(system_call)
390 CFI_STARTPROC simple 468 CFI_STARTPROC simple
@@ -400,7 +478,7 @@ ENTRY(system_call)
400 */ 478 */
401ENTRY(system_call_after_swapgs) 479ENTRY(system_call_after_swapgs)
402 480
403 movq %rsp,%gs:pda_oldrsp 481 movq %rsp,%gs:pda_oldrsp
404 movq %gs:pda_kernelstack,%rsp 482 movq %gs:pda_kernelstack,%rsp
405 /* 483 /*
406 * No need to follow this irqs off/on section - it's straight 484 * No need to follow this irqs off/on section - it's straight
@@ -408,7 +486,7 @@ ENTRY(system_call_after_swapgs)
408 */ 486 */
409 ENABLE_INTERRUPTS(CLBR_NONE) 487 ENABLE_INTERRUPTS(CLBR_NONE)
410 SAVE_ARGS 8,1 488 SAVE_ARGS 8,1
411 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 489 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
412 movq %rcx,RIP-ARGOFFSET(%rsp) 490 movq %rcx,RIP-ARGOFFSET(%rsp)
413 CFI_REL_OFFSET rip,RIP-ARGOFFSET 491 CFI_REL_OFFSET rip,RIP-ARGOFFSET
414 GET_THREAD_INFO(%rcx) 492 GET_THREAD_INFO(%rcx)
@@ -422,19 +500,19 @@ system_call_fastpath:
422 movq %rax,RAX-ARGOFFSET(%rsp) 500 movq %rax,RAX-ARGOFFSET(%rsp)
423/* 501/*
424 * Syscall return path ending with SYSRET (fast path) 502 * Syscall return path ending with SYSRET (fast path)
425 * Has incomplete stack frame and undefined top of stack. 503 * Has incomplete stack frame and undefined top of stack.
426 */ 504 */
427ret_from_sys_call: 505ret_from_sys_call:
428 movl $_TIF_ALLWORK_MASK,%edi 506 movl $_TIF_ALLWORK_MASK,%edi
429 /* edi: flagmask */ 507 /* edi: flagmask */
430sysret_check: 508sysret_check:
431 LOCKDEP_SYS_EXIT 509 LOCKDEP_SYS_EXIT
432 GET_THREAD_INFO(%rcx) 510 GET_THREAD_INFO(%rcx)
433 DISABLE_INTERRUPTS(CLBR_NONE) 511 DISABLE_INTERRUPTS(CLBR_NONE)
434 TRACE_IRQS_OFF 512 TRACE_IRQS_OFF
435 movl TI_flags(%rcx),%edx 513 movl TI_flags(%rcx),%edx
436 andl %edi,%edx 514 andl %edi,%edx
437 jnz sysret_careful 515 jnz sysret_careful
438 CFI_REMEMBER_STATE 516 CFI_REMEMBER_STATE
439 /* 517 /*
440 * sysretq will re-enable interrupts: 518 * sysretq will re-enable interrupts:
@@ -449,7 +527,7 @@ sysret_check:
449 527
450 CFI_RESTORE_STATE 528 CFI_RESTORE_STATE
451 /* Handle reschedules */ 529 /* Handle reschedules */
452 /* edx: work, edi: workmask */ 530 /* edx: work, edi: workmask */
453sysret_careful: 531sysret_careful:
454 bt $TIF_NEED_RESCHED,%edx 532 bt $TIF_NEED_RESCHED,%edx
455 jnc sysret_signal 533 jnc sysret_signal
@@ -462,7 +540,7 @@ sysret_careful:
462 CFI_ADJUST_CFA_OFFSET -8 540 CFI_ADJUST_CFA_OFFSET -8
463 jmp sysret_check 541 jmp sysret_check
464 542
465 /* Handle a signal */ 543 /* Handle a signal */
466sysret_signal: 544sysret_signal:
467 TRACE_IRQS_ON 545 TRACE_IRQS_ON
468 ENABLE_INTERRUPTS(CLBR_NONE) 546 ENABLE_INTERRUPTS(CLBR_NONE)
@@ -471,17 +549,20 @@ sysret_signal:
471 jc sysret_audit 549 jc sysret_audit
472#endif 550#endif
473 /* edx: work flags (arg3) */ 551 /* edx: work flags (arg3) */
474 leaq do_notify_resume(%rip),%rax
475 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 552 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
476 xorl %esi,%esi # oldset -> arg2 553 xorl %esi,%esi # oldset -> arg2
477 call ptregscall_common 554 SAVE_REST
555 FIXUP_TOP_OF_STACK %r11
556 call do_notify_resume
557 RESTORE_TOP_OF_STACK %r11
558 RESTORE_REST
478 movl $_TIF_WORK_MASK,%edi 559 movl $_TIF_WORK_MASK,%edi
479 /* Use IRET because user could have changed frame. This 560 /* Use IRET because user could have changed frame. This
480 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 561 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
481 DISABLE_INTERRUPTS(CLBR_NONE) 562 DISABLE_INTERRUPTS(CLBR_NONE)
482 TRACE_IRQS_OFF 563 TRACE_IRQS_OFF
483 jmp int_with_check 564 jmp int_with_check
484 565
485badsys: 566badsys:
486 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 567 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
487 jmp ret_from_sys_call 568 jmp ret_from_sys_call
@@ -520,7 +601,7 @@ sysret_audit:
520#endif /* CONFIG_AUDITSYSCALL */ 601#endif /* CONFIG_AUDITSYSCALL */
521 602
522 /* Do syscall tracing */ 603 /* Do syscall tracing */
523tracesys: 604tracesys:
524#ifdef CONFIG_AUDITSYSCALL 605#ifdef CONFIG_AUDITSYSCALL
525 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 606 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
526 jz auditsys 607 jz auditsys
@@ -543,8 +624,8 @@ tracesys:
543 call *sys_call_table(,%rax,8) 624 call *sys_call_table(,%rax,8)
544 movq %rax,RAX-ARGOFFSET(%rsp) 625 movq %rax,RAX-ARGOFFSET(%rsp)
545 /* Use IRET because user could have changed frame */ 626 /* Use IRET because user could have changed frame */
546 627
547/* 628/*
548 * Syscall return path ending with IRET. 629 * Syscall return path ending with IRET.
549 * Has correct top of stack, but partial stack frame. 630 * Has correct top of stack, but partial stack frame.
550 */ 631 */
@@ -588,18 +669,18 @@ int_very_careful:
588 TRACE_IRQS_ON 669 TRACE_IRQS_ON
589 ENABLE_INTERRUPTS(CLBR_NONE) 670 ENABLE_INTERRUPTS(CLBR_NONE)
590 SAVE_REST 671 SAVE_REST
591 /* Check for syscall exit trace */ 672 /* Check for syscall exit trace */
592 testl $_TIF_WORK_SYSCALL_EXIT,%edx 673 testl $_TIF_WORK_SYSCALL_EXIT,%edx
593 jz int_signal 674 jz int_signal
594 pushq %rdi 675 pushq %rdi
595 CFI_ADJUST_CFA_OFFSET 8 676 CFI_ADJUST_CFA_OFFSET 8
596 leaq 8(%rsp),%rdi # &ptregs -> arg1 677 leaq 8(%rsp),%rdi # &ptregs -> arg1
597 call syscall_trace_leave 678 call syscall_trace_leave
598 popq %rdi 679 popq %rdi
599 CFI_ADJUST_CFA_OFFSET -8 680 CFI_ADJUST_CFA_OFFSET -8
600 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi 681 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
601 jmp int_restore_rest 682 jmp int_restore_rest
602 683
603int_signal: 684int_signal:
604 testl $_TIF_DO_NOTIFY_MASK,%edx 685 testl $_TIF_DO_NOTIFY_MASK,%edx
605 jz 1f 686 jz 1f
@@ -614,22 +695,24 @@ int_restore_rest:
614 jmp int_with_check 695 jmp int_with_check
615 CFI_ENDPROC 696 CFI_ENDPROC
616END(system_call) 697END(system_call)
617 698
618/* 699/*
619 * Certain special system calls that need to save a complete full stack frame. 700 * Certain special system calls that need to save a complete full stack frame.
620 */ 701 */
621
622 .macro PTREGSCALL label,func,arg 702 .macro PTREGSCALL label,func,arg
623 .globl \label 703ENTRY(\label)
624\label: 704 PARTIAL_FRAME 1 8 /* offset 8: return address */
625 leaq \func(%rip),%rax 705 subq $REST_SKIP, %rsp
626 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ 706 CFI_ADJUST_CFA_OFFSET REST_SKIP
627 jmp ptregscall_common 707 call save_rest
708 DEFAULT_FRAME 0 8 /* offset 8: return address */
709 leaq 8(%rsp), \arg /* pt_regs pointer */
710 call \func
711 jmp ptregscall_common
712 CFI_ENDPROC
628END(\label) 713END(\label)
629 .endm 714 .endm
630 715
631 CFI_STARTPROC
632
633 PTREGSCALL stub_clone, sys_clone, %r8 716 PTREGSCALL stub_clone, sys_clone, %r8
634 PTREGSCALL stub_fork, sys_fork, %rdi 717 PTREGSCALL stub_fork, sys_fork, %rdi
635 PTREGSCALL stub_vfork, sys_vfork, %rdi 718 PTREGSCALL stub_vfork, sys_vfork, %rdi
@@ -637,25 +720,18 @@ END(\label)
637 PTREGSCALL stub_iopl, sys_iopl, %rsi 720 PTREGSCALL stub_iopl, sys_iopl, %rsi
638 721
639ENTRY(ptregscall_common) 722ENTRY(ptregscall_common)
640 popq %r11 723 DEFAULT_FRAME 1 8 /* offset 8: return address */
641 CFI_ADJUST_CFA_OFFSET -8 724 RESTORE_TOP_OF_STACK %r11, 8
642 CFI_REGISTER rip, r11 725 movq_cfi_restore R15+8, r15
643 SAVE_REST 726 movq_cfi_restore R14+8, r14
644 movq %r11, %r15 727 movq_cfi_restore R13+8, r13
645 CFI_REGISTER rip, r15 728 movq_cfi_restore R12+8, r12
646 FIXUP_TOP_OF_STACK %r11 729 movq_cfi_restore RBP+8, rbp
647 call *%rax 730 movq_cfi_restore RBX+8, rbx
648 RESTORE_TOP_OF_STACK %r11 731 ret $REST_SKIP /* pop extended registers */
649 movq %r15, %r11
650 CFI_REGISTER rip, r11
651 RESTORE_REST
652 pushq %r11
653 CFI_ADJUST_CFA_OFFSET 8
654 CFI_REL_OFFSET rip, 0
655 ret
656 CFI_ENDPROC 732 CFI_ENDPROC
657END(ptregscall_common) 733END(ptregscall_common)
658 734
659ENTRY(stub_execve) 735ENTRY(stub_execve)
660 CFI_STARTPROC 736 CFI_STARTPROC
661 popq %r11 737 popq %r11
@@ -671,11 +747,11 @@ ENTRY(stub_execve)
671 jmp int_ret_from_sys_call 747 jmp int_ret_from_sys_call
672 CFI_ENDPROC 748 CFI_ENDPROC
673END(stub_execve) 749END(stub_execve)
674 750
675/* 751/*
676 * sigreturn is special because it needs to restore all registers on return. 752 * sigreturn is special because it needs to restore all registers on return.
677 * This cannot be done with SYSRET, so use the IRET return path instead. 753 * This cannot be done with SYSRET, so use the IRET return path instead.
678 */ 754 */
679ENTRY(stub_rt_sigreturn) 755ENTRY(stub_rt_sigreturn)
680 CFI_STARTPROC 756 CFI_STARTPROC
681 addq $8, %rsp 757 addq $8, %rsp
@@ -691,70 +767,70 @@ ENTRY(stub_rt_sigreturn)
691END(stub_rt_sigreturn) 767END(stub_rt_sigreturn)
692 768
693/* 769/*
694 * initial frame state for interrupts and exceptions 770 * Build the entry stubs and pointer table with some assembler magic.
771 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
772 * single cache line on all modern x86 implementations.
695 */ 773 */
696 .macro _frame ref 774 .section .init.rodata,"a"
697 CFI_STARTPROC simple 775ENTRY(interrupt)
698 CFI_SIGNAL_FRAME 776 .text
699 CFI_DEF_CFA rsp,SS+8-\ref 777 .p2align 5
700 /*CFI_REL_OFFSET ss,SS-\ref*/ 778 .p2align CONFIG_X86_L1_CACHE_SHIFT
701 CFI_REL_OFFSET rsp,RSP-\ref 779ENTRY(irq_entries_start)
702 /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ 780 INTR_FRAME
703 /*CFI_REL_OFFSET cs,CS-\ref*/ 781vector=FIRST_EXTERNAL_VECTOR
704 CFI_REL_OFFSET rip,RIP-\ref 782.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
705 .endm 783 .balign 32
784 .rept 7
785 .if vector < NR_VECTORS
786 .if vector <> FIRST_EXTERNAL_VECTOR
787 CFI_ADJUST_CFA_OFFSET -8
788 .endif
7891: pushq $(~vector+0x80) /* Note: always in signed byte range */
790 CFI_ADJUST_CFA_OFFSET 8
791 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
792 jmp 2f
793 .endif
794 .previous
795 .quad 1b
796 .text
797vector=vector+1
798 .endif
799 .endr
8002: jmp common_interrupt
801.endr
802 CFI_ENDPROC
803END(irq_entries_start)
706 804
707/* initial frame state for interrupts (and exceptions without error code) */ 805.previous
708#define INTR_FRAME _frame RIP 806END(interrupt)
709/* initial frame state for exceptions with error code (and interrupts with 807.previous
710 vector already pushed) */
711#define XCPT_FRAME _frame ORIG_RAX
712 808
713/* 809/*
714 * Interrupt entry/exit. 810 * Interrupt entry/exit.
715 * 811 *
716 * Interrupt entry points save only callee clobbered registers in fast path. 812 * Interrupt entry points save only callee clobbered registers in fast path.
717 * 813 *
718 * Entry runs with interrupts off. 814 * Entry runs with interrupts off.
719 */ 815 */
720 816
721/* 0(%rsp): interrupt number */ 817/* 0(%rsp): ~(interrupt number) */
722 .macro interrupt func 818 .macro interrupt func
723 cld 819 subq $10*8, %rsp
724 SAVE_ARGS 820 CFI_ADJUST_CFA_OFFSET 10*8
725 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler 821 call save_args
726 pushq %rbp 822 PARTIAL_FRAME 0
727 /*
728 * Save rbp twice: One is for marking the stack frame, as usual, and the
729 * other, to fill pt_regs properly. This is because bx comes right
730 * before the last saved register in that structure, and not bp. If the
731 * base pointer were in the place bx is today, this would not be needed.
732 */
733 movq %rbp, -8(%rsp)
734 CFI_ADJUST_CFA_OFFSET 8
735 CFI_REL_OFFSET rbp, 0
736 movq %rsp,%rbp
737 CFI_DEF_CFA_REGISTER rbp
738 testl $3,CS(%rdi)
739 je 1f
740 SWAPGS
741 /* irqcount is used to check if a CPU is already on an interrupt
742 stack or not. While this is essentially redundant with preempt_count
743 it is a little cheaper to use a separate counter in the PDA
744 (short of moving irq_enter into assembly, which would be too
745 much work) */
7461: incl %gs:pda_irqcount
747 cmoveq %gs:pda_irqstackptr,%rsp
748 push %rbp # backlink for old unwinder
749 /*
750 * We entered an interrupt context - irqs are off:
751 */
752 TRACE_IRQS_OFF
753 call \func 823 call \func
754 .endm 824 .endm
755 825
756ENTRY(common_interrupt) 826 /*
827 * The interrupt stubs push (~vector+0x80) onto the stack and
828 * then jump to common_interrupt.
829 */
830 .p2align CONFIG_X86_L1_CACHE_SHIFT
831common_interrupt:
757 XCPT_FRAME 832 XCPT_FRAME
833 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
758 interrupt do_IRQ 834 interrupt do_IRQ
759 /* 0(%rsp): oldrsp-ARGOFFSET */ 835 /* 0(%rsp): oldrsp-ARGOFFSET */
760ret_from_intr: 836ret_from_intr:
@@ -768,12 +844,12 @@ exit_intr:
768 GET_THREAD_INFO(%rcx) 844 GET_THREAD_INFO(%rcx)
769 testl $3,CS-ARGOFFSET(%rsp) 845 testl $3,CS-ARGOFFSET(%rsp)
770 je retint_kernel 846 je retint_kernel
771 847
772 /* Interrupt came from user space */ 848 /* Interrupt came from user space */
773 /* 849 /*
774 * Has a correct top of stack, but a partial stack frame 850 * Has a correct top of stack, but a partial stack frame
775 * %rcx: thread info. Interrupts off. 851 * %rcx: thread info. Interrupts off.
776 */ 852 */
777retint_with_reschedule: 853retint_with_reschedule:
778 movl $_TIF_WORK_MASK,%edi 854 movl $_TIF_WORK_MASK,%edi
779retint_check: 855retint_check:
@@ -846,20 +922,20 @@ retint_careful:
846 pushq %rdi 922 pushq %rdi
847 CFI_ADJUST_CFA_OFFSET 8 923 CFI_ADJUST_CFA_OFFSET 8
848 call schedule 924 call schedule
849 popq %rdi 925 popq %rdi
850 CFI_ADJUST_CFA_OFFSET -8 926 CFI_ADJUST_CFA_OFFSET -8
851 GET_THREAD_INFO(%rcx) 927 GET_THREAD_INFO(%rcx)
852 DISABLE_INTERRUPTS(CLBR_NONE) 928 DISABLE_INTERRUPTS(CLBR_NONE)
853 TRACE_IRQS_OFF 929 TRACE_IRQS_OFF
854 jmp retint_check 930 jmp retint_check
855 931
856retint_signal: 932retint_signal:
857 testl $_TIF_DO_NOTIFY_MASK,%edx 933 testl $_TIF_DO_NOTIFY_MASK,%edx
858 jz retint_swapgs 934 jz retint_swapgs
859 TRACE_IRQS_ON 935 TRACE_IRQS_ON
860 ENABLE_INTERRUPTS(CLBR_NONE) 936 ENABLE_INTERRUPTS(CLBR_NONE)
861 SAVE_REST 937 SAVE_REST
862 movq $-1,ORIG_RAX(%rsp) 938 movq $-1,ORIG_RAX(%rsp)
863 xorl %esi,%esi # oldset 939 xorl %esi,%esi # oldset
864 movq %rsp,%rdi # &pt_regs 940 movq %rsp,%rdi # &pt_regs
865 call do_notify_resume 941 call do_notify_resume
@@ -881,324 +957,211 @@ ENTRY(retint_kernel)
881 jnc retint_restore_args 957 jnc retint_restore_args
882 call preempt_schedule_irq 958 call preempt_schedule_irq
883 jmp exit_intr 959 jmp exit_intr
884#endif 960#endif
885 961
886 CFI_ENDPROC 962 CFI_ENDPROC
887END(common_interrupt) 963END(common_interrupt)
888 964
889/* 965/*
890 * APIC interrupts. 966 * APIC interrupts.
891 */ 967 */
892 .macro apicinterrupt num,func 968.macro apicinterrupt num sym do_sym
969ENTRY(\sym)
893 INTR_FRAME 970 INTR_FRAME
894 pushq $~(\num) 971 pushq $~(\num)
895 CFI_ADJUST_CFA_OFFSET 8 972 CFI_ADJUST_CFA_OFFSET 8
896 interrupt \func 973 interrupt \do_sym
897 jmp ret_from_intr 974 jmp ret_from_intr
898 CFI_ENDPROC 975 CFI_ENDPROC
899 .endm 976END(\sym)
900 977.endm
901ENTRY(thermal_interrupt)
902 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
903END(thermal_interrupt)
904
905ENTRY(threshold_interrupt)
906 apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
907END(threshold_interrupt)
908
909#ifdef CONFIG_SMP
910ENTRY(reschedule_interrupt)
911 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
912END(reschedule_interrupt)
913
914 .macro INVALIDATE_ENTRY num
915ENTRY(invalidate_interrupt\num)
916 apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
917END(invalidate_interrupt\num)
918 .endm
919 978
920 INVALIDATE_ENTRY 0 979#ifdef CONFIG_SMP
921 INVALIDATE_ENTRY 1 980apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
922 INVALIDATE_ENTRY 2 981 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
923 INVALIDATE_ENTRY 3
924 INVALIDATE_ENTRY 4
925 INVALIDATE_ENTRY 5
926 INVALIDATE_ENTRY 6
927 INVALIDATE_ENTRY 7
928
929ENTRY(call_function_interrupt)
930 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
931END(call_function_interrupt)
932ENTRY(call_function_single_interrupt)
933 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
934END(call_function_single_interrupt)
935ENTRY(irq_move_cleanup_interrupt)
936 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
937END(irq_move_cleanup_interrupt)
938#endif 982#endif
939 983
940ENTRY(apic_timer_interrupt) 984apicinterrupt UV_BAU_MESSAGE \
941 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt 985 uv_bau_message_intr1 uv_bau_message_interrupt
942END(apic_timer_interrupt) 986apicinterrupt LOCAL_TIMER_VECTOR \
987 apic_timer_interrupt smp_apic_timer_interrupt
988
989#ifdef CONFIG_SMP
990apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
991 invalidate_interrupt0 smp_invalidate_interrupt
992apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
993 invalidate_interrupt1 smp_invalidate_interrupt
994apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
995 invalidate_interrupt2 smp_invalidate_interrupt
996apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
997 invalidate_interrupt3 smp_invalidate_interrupt
998apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
999 invalidate_interrupt4 smp_invalidate_interrupt
1000apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
1001 invalidate_interrupt5 smp_invalidate_interrupt
1002apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
1003 invalidate_interrupt6 smp_invalidate_interrupt
1004apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
1005 invalidate_interrupt7 smp_invalidate_interrupt
1006#endif
943 1007
944ENTRY(uv_bau_message_intr1) 1008apicinterrupt THRESHOLD_APIC_VECTOR \
945 apicinterrupt 220,uv_bau_message_interrupt 1009 threshold_interrupt mce_threshold_interrupt
946END(uv_bau_message_intr1) 1010apicinterrupt THERMAL_APIC_VECTOR \
1011 thermal_interrupt smp_thermal_interrupt
1012
1013#ifdef CONFIG_SMP
1014apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1015 call_function_single_interrupt smp_call_function_single_interrupt
1016apicinterrupt CALL_FUNCTION_VECTOR \
1017 call_function_interrupt smp_call_function_interrupt
1018apicinterrupt RESCHEDULE_VECTOR \
1019 reschedule_interrupt smp_reschedule_interrupt
1020#endif
947 1021
948ENTRY(error_interrupt) 1022apicinterrupt ERROR_APIC_VECTOR \
949 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt 1023 error_interrupt smp_error_interrupt
950END(error_interrupt) 1024apicinterrupt SPURIOUS_APIC_VECTOR \
1025 spurious_interrupt smp_spurious_interrupt
951 1026
952ENTRY(spurious_interrupt)
953 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
954END(spurious_interrupt)
955
956/* 1027/*
957 * Exception entry points. 1028 * Exception entry points.
958 */ 1029 */
959 .macro zeroentry sym 1030.macro zeroentry sym do_sym
1031ENTRY(\sym)
960 INTR_FRAME 1032 INTR_FRAME
961 PARAVIRT_ADJUST_EXCEPTION_FRAME 1033 PARAVIRT_ADJUST_EXCEPTION_FRAME
962 pushq $0 /* push error code/oldrax */ 1034 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
963 CFI_ADJUST_CFA_OFFSET 8 1035 subq $15*8,%rsp
964 pushq %rax /* push real oldrax to the rdi slot */ 1036 CFI_ADJUST_CFA_OFFSET 15*8
965 CFI_ADJUST_CFA_OFFSET 8 1037 call error_entry
966 CFI_REL_OFFSET rax,0 1038 DEFAULT_FRAME 0
967 leaq \sym(%rip),%rax 1039 movq %rsp,%rdi /* pt_regs pointer */
968 jmp error_entry 1040 xorl %esi,%esi /* no error code */
1041 call \do_sym
1042 jmp error_exit /* %ebx: no swapgs flag */
969 CFI_ENDPROC 1043 CFI_ENDPROC
970 .endm 1044END(\sym)
1045.endm
971 1046
972 .macro errorentry sym 1047.macro paranoidzeroentry sym do_sym
973 XCPT_FRAME 1048ENTRY(\sym)
1049 INTR_FRAME
974 PARAVIRT_ADJUST_EXCEPTION_FRAME 1050 PARAVIRT_ADJUST_EXCEPTION_FRAME
975 pushq %rax 1051 pushq $-1 /* ORIG_RAX: no syscall to restart */
976 CFI_ADJUST_CFA_OFFSET 8 1052 CFI_ADJUST_CFA_OFFSET 8
977 CFI_REL_OFFSET rax,0 1053 subq $15*8, %rsp
978 leaq \sym(%rip),%rax 1054 call save_paranoid
979 jmp error_entry 1055 TRACE_IRQS_OFF
1056 movq %rsp,%rdi /* pt_regs pointer */
1057 xorl %esi,%esi /* no error code */
1058 call \do_sym
1059 jmp paranoid_exit /* %ebx: no swapgs flag */
980 CFI_ENDPROC 1060 CFI_ENDPROC
981 .endm 1061END(\sym)
1062.endm
982 1063
983 /* error code is on the stack already */ 1064.macro paranoidzeroentry_ist sym do_sym ist
984 /* handle NMI like exceptions that can happen everywhere */ 1065ENTRY(\sym)
985 .macro paranoidentry sym, ist=0, irqtrace=1 1066 INTR_FRAME
986 SAVE_ALL 1067 PARAVIRT_ADJUST_EXCEPTION_FRAME
987 cld 1068 pushq $-1 /* ORIG_RAX: no syscall to restart */
988 movl $1,%ebx 1069 CFI_ADJUST_CFA_OFFSET 8
989 movl $MSR_GS_BASE,%ecx 1070 subq $15*8, %rsp
990 rdmsr 1071 call save_paranoid
991 testl %edx,%edx
992 js 1f
993 SWAPGS
994 xorl %ebx,%ebx
9951:
996 .if \ist
997 movq %gs:pda_data_offset, %rbp
998 .endif
999 .if \irqtrace
1000 TRACE_IRQS_OFF
1001 .endif
1002 movq %rsp,%rdi
1003 movq ORIG_RAX(%rsp),%rsi
1004 movq $-1,ORIG_RAX(%rsp)
1005 .if \ist
1006 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1007 .endif
1008 call \sym
1009 .if \ist
1010 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1011 .endif
1012 DISABLE_INTERRUPTS(CLBR_NONE)
1013 .if \irqtrace
1014 TRACE_IRQS_OFF 1072 TRACE_IRQS_OFF
1015 .endif 1073 movq %rsp,%rdi /* pt_regs pointer */
1016 .endm 1074 xorl %esi,%esi /* no error code */
1075 movq %gs:pda_data_offset, %rbp
1076 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1077 call \do_sym
1078 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1079 jmp paranoid_exit /* %ebx: no swapgs flag */
1080 CFI_ENDPROC
1081END(\sym)
1082.endm
1017 1083
1018 /* 1084.macro errorentry sym do_sym
1019 * "Paranoid" exit path from exception stack. 1085ENTRY(\sym)
1020 * Paranoid because this is used by NMIs and cannot take 1086 XCPT_FRAME
1021 * any kernel state for granted. 1087 PARAVIRT_ADJUST_EXCEPTION_FRAME
1022 * We don't do kernel preemption checks here, because only 1088 subq $15*8,%rsp
1023 * NMI should be common and it does not enable IRQs and 1089 CFI_ADJUST_CFA_OFFSET 15*8
1024 * cannot get reschedule ticks. 1090 call error_entry
1025 * 1091 DEFAULT_FRAME 0
1026 * "trace" is 0 for the NMI handler only, because irq-tracing 1092 movq %rsp,%rdi /* pt_regs pointer */
1027 * is fundamentally NMI-unsafe. (we cannot change the soft and 1093 movq ORIG_RAX(%rsp),%rsi /* get error code */
1028 * hard flags at once, atomically) 1094 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
1029 */ 1095 call \do_sym
1030 .macro paranoidexit trace=1 1096 jmp error_exit /* %ebx: no swapgs flag */
1031 /* ebx: no swapgs flag */
1032paranoid_exit\trace:
1033 testl %ebx,%ebx /* swapgs needed? */
1034 jnz paranoid_restore\trace
1035 testl $3,CS(%rsp)
1036 jnz paranoid_userspace\trace
1037paranoid_swapgs\trace:
1038 .if \trace
1039 TRACE_IRQS_IRETQ 0
1040 .endif
1041 SWAPGS_UNSAFE_STACK
1042paranoid_restore\trace:
1043 RESTORE_ALL 8
1044 jmp irq_return
1045paranoid_userspace\trace:
1046 GET_THREAD_INFO(%rcx)
1047 movl TI_flags(%rcx),%ebx
1048 andl $_TIF_WORK_MASK,%ebx
1049 jz paranoid_swapgs\trace
1050 movq %rsp,%rdi /* &pt_regs */
1051 call sync_regs
1052 movq %rax,%rsp /* switch stack for scheduling */
1053 testl $_TIF_NEED_RESCHED,%ebx
1054 jnz paranoid_schedule\trace
1055 movl %ebx,%edx /* arg3: thread flags */
1056 .if \trace
1057 TRACE_IRQS_ON
1058 .endif
1059 ENABLE_INTERRUPTS(CLBR_NONE)
1060 xorl %esi,%esi /* arg2: oldset */
1061 movq %rsp,%rdi /* arg1: &pt_regs */
1062 call do_notify_resume
1063 DISABLE_INTERRUPTS(CLBR_NONE)
1064 .if \trace
1065 TRACE_IRQS_OFF
1066 .endif
1067 jmp paranoid_userspace\trace
1068paranoid_schedule\trace:
1069 .if \trace
1070 TRACE_IRQS_ON
1071 .endif
1072 ENABLE_INTERRUPTS(CLBR_ANY)
1073 call schedule
1074 DISABLE_INTERRUPTS(CLBR_ANY)
1075 .if \trace
1076 TRACE_IRQS_OFF
1077 .endif
1078 jmp paranoid_userspace\trace
1079 CFI_ENDPROC 1097 CFI_ENDPROC
1080 .endm 1098END(\sym)
1099.endm
1081 1100
1082/* 1101 /* error code is on the stack already */
1083 * Exception entry point. This expects an error code/orig_rax on the stack 1102.macro paranoiderrorentry sym do_sym
1084 * and the exception handler in %rax. 1103ENTRY(\sym)
1085 */ 1104 XCPT_FRAME
1086KPROBE_ENTRY(error_entry) 1105 PARAVIRT_ADJUST_EXCEPTION_FRAME
1087 _frame RDI 1106 subq $15*8,%rsp
1088 CFI_REL_OFFSET rax,0 1107 CFI_ADJUST_CFA_OFFSET 15*8
1089 /* rdi slot contains rax, oldrax contains error code */ 1108 call save_paranoid
1090 cld 1109 DEFAULT_FRAME 0
1091 subq $14*8,%rsp
1092 CFI_ADJUST_CFA_OFFSET (14*8)
1093 movq %rsi,13*8(%rsp)
1094 CFI_REL_OFFSET rsi,RSI
1095 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
1096 CFI_REGISTER rax,rsi
1097 movq %rdx,12*8(%rsp)
1098 CFI_REL_OFFSET rdx,RDX
1099 movq %rcx,11*8(%rsp)
1100 CFI_REL_OFFSET rcx,RCX
1101 movq %rsi,10*8(%rsp) /* store rax */
1102 CFI_REL_OFFSET rax,RAX
1103 movq %r8, 9*8(%rsp)
1104 CFI_REL_OFFSET r8,R8
1105 movq %r9, 8*8(%rsp)
1106 CFI_REL_OFFSET r9,R9
1107 movq %r10,7*8(%rsp)
1108 CFI_REL_OFFSET r10,R10
1109 movq %r11,6*8(%rsp)
1110 CFI_REL_OFFSET r11,R11
1111 movq %rbx,5*8(%rsp)
1112 CFI_REL_OFFSET rbx,RBX
1113 movq %rbp,4*8(%rsp)
1114 CFI_REL_OFFSET rbp,RBP
1115 movq %r12,3*8(%rsp)
1116 CFI_REL_OFFSET r12,R12
1117 movq %r13,2*8(%rsp)
1118 CFI_REL_OFFSET r13,R13
1119 movq %r14,1*8(%rsp)
1120 CFI_REL_OFFSET r14,R14
1121 movq %r15,(%rsp)
1122 CFI_REL_OFFSET r15,R15
1123 xorl %ebx,%ebx
1124 testl $3,CS(%rsp)
1125 je error_kernelspace
1126error_swapgs:
1127 SWAPGS
1128error_sti:
1129 TRACE_IRQS_OFF
1130 movq %rdi,RDI(%rsp)
1131 CFI_REL_OFFSET rdi,RDI
1132 movq %rsp,%rdi
1133 movq ORIG_RAX(%rsp),%rsi /* get error code */
1134 movq $-1,ORIG_RAX(%rsp)
1135 call *%rax
1136 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
1137error_exit:
1138 movl %ebx,%eax
1139 RESTORE_REST
1140 DISABLE_INTERRUPTS(CLBR_NONE)
1141 TRACE_IRQS_OFF 1110 TRACE_IRQS_OFF
1142 GET_THREAD_INFO(%rcx) 1111 movq %rsp,%rdi /* pt_regs pointer */
1143 testl %eax,%eax 1112 movq ORIG_RAX(%rsp),%rsi /* get error code */
1144 jne retint_kernel 1113 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
1145 LOCKDEP_SYS_EXIT_IRQ 1114 call \do_sym
1146 movl TI_flags(%rcx),%edx 1115 jmp paranoid_exit /* %ebx: no swapgs flag */
1147 movl $_TIF_WORK_MASK,%edi
1148 andl %edi,%edx
1149 jnz retint_careful
1150 jmp retint_swapgs
1151 CFI_ENDPROC 1116 CFI_ENDPROC
1117END(\sym)
1118.endm
1152 1119
1153error_kernelspace: 1120zeroentry divide_error do_divide_error
1154 incl %ebx 1121zeroentry overflow do_overflow
1155 /* There are two places in the kernel that can potentially fault with 1122zeroentry bounds do_bounds
1156 usergs. Handle them here. The exception handlers after 1123zeroentry invalid_op do_invalid_op
1157 iret run with kernel gs again, so don't set the user space flag. 1124zeroentry device_not_available do_device_not_available
1158 B stepping K8s sometimes report an truncated RIP for IRET 1125paranoiderrorentry double_fault do_double_fault
1159 exceptions returning to compat mode. Check for these here too. */ 1126zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
1160 leaq irq_return(%rip),%rcx 1127errorentry invalid_TSS do_invalid_TSS
1161 cmpq %rcx,RIP(%rsp) 1128errorentry segment_not_present do_segment_not_present
1162 je error_swapgs 1129zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
1163 movl %ecx,%ecx /* zero extend */ 1130zeroentry coprocessor_error do_coprocessor_error
1164 cmpq %rcx,RIP(%rsp) 1131errorentry alignment_check do_alignment_check
1165 je error_swapgs 1132zeroentry simd_coprocessor_error do_simd_coprocessor_error
1166 cmpq $gs_change,RIP(%rsp) 1133
1167 je error_swapgs 1134 /* Reload gs selector with exception handling */
1168 jmp error_sti 1135 /* edi: new selector */
1169KPROBE_END(error_entry)
1170
1171 /* Reload gs selector with exception handling */
1172 /* edi: new selector */
1173ENTRY(native_load_gs_index) 1136ENTRY(native_load_gs_index)
1174 CFI_STARTPROC 1137 CFI_STARTPROC
1175 pushf 1138 pushf
1176 CFI_ADJUST_CFA_OFFSET 8 1139 CFI_ADJUST_CFA_OFFSET 8
1177 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) 1140 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
1178 SWAPGS 1141 SWAPGS
1179gs_change: 1142gs_change:
1180 movl %edi,%gs 1143 movl %edi,%gs
11812: mfence /* workaround */ 11442: mfence /* workaround */
1182 SWAPGS 1145 SWAPGS
1183 popf 1146 popf
1184 CFI_ADJUST_CFA_OFFSET -8 1147 CFI_ADJUST_CFA_OFFSET -8
1185 ret 1148 ret
1186 CFI_ENDPROC 1149 CFI_ENDPROC
1187ENDPROC(native_load_gs_index) 1150END(native_load_gs_index)
1188 1151
1189 .section __ex_table,"a" 1152 .section __ex_table,"a"
1190 .align 8 1153 .align 8
1191 .quad gs_change,bad_gs 1154 .quad gs_change,bad_gs
1192 .previous 1155 .previous
1193 .section .fixup,"ax" 1156 .section .fixup,"ax"
1194 /* running with kernelgs */ 1157 /* running with kernelgs */
1195bad_gs: 1158bad_gs:
1196 SWAPGS /* switch back to user gs */ 1159 SWAPGS /* switch back to user gs */
1197 xorl %eax,%eax 1160 xorl %eax,%eax
1198 movl %eax,%gs 1161 movl %eax,%gs
1199 jmp 2b 1162 jmp 2b
1200 .previous 1163 .previous
1201 1164
1202/* 1165/*
1203 * Create a kernel thread. 1166 * Create a kernel thread.
1204 * 1167 *
@@ -1221,7 +1184,7 @@ ENTRY(kernel_thread)
1221 1184
1222 xorl %r8d,%r8d 1185 xorl %r8d,%r8d
1223 xorl %r9d,%r9d 1186 xorl %r9d,%r9d
1224 1187
1225 # clone now 1188 # clone now
1226 call do_fork 1189 call do_fork
1227 movq %rax,RAX(%rsp) 1190 movq %rax,RAX(%rsp)
@@ -1232,15 +1195,15 @@ ENTRY(kernel_thread)
1232 * so internally to the x86_64 port you can rely on kernel_thread() 1195 * so internally to the x86_64 port you can rely on kernel_thread()
1233 * not to reschedule the child before returning, this avoids the need 1196 * not to reschedule the child before returning, this avoids the need
1234 * of hacks for example to fork off the per-CPU idle tasks. 1197 * of hacks for example to fork off the per-CPU idle tasks.
1235 * [Hopefully no generic code relies on the reschedule -AK] 1198 * [Hopefully no generic code relies on the reschedule -AK]
1236 */ 1199 */
1237 RESTORE_ALL 1200 RESTORE_ALL
1238 UNFAKE_STACK_FRAME 1201 UNFAKE_STACK_FRAME
1239 ret 1202 ret
1240 CFI_ENDPROC 1203 CFI_ENDPROC
1241ENDPROC(kernel_thread) 1204END(kernel_thread)
1242 1205
1243child_rip: 1206ENTRY(child_rip)
1244 pushq $0 # fake return address 1207 pushq $0 # fake return address
1245 CFI_STARTPROC 1208 CFI_STARTPROC
1246 /* 1209 /*
@@ -1253,8 +1216,9 @@ child_rip:
1253 # exit 1216 # exit
1254 mov %eax, %edi 1217 mov %eax, %edi
1255 call do_exit 1218 call do_exit
1219 ud2 # padding for call trace
1256 CFI_ENDPROC 1220 CFI_ENDPROC
1257ENDPROC(child_rip) 1221END(child_rip)
1258 1222
1259/* 1223/*
1260 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 1224 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1274,10 +1238,10 @@ ENDPROC(child_rip)
1274ENTRY(kernel_execve) 1238ENTRY(kernel_execve)
1275 CFI_STARTPROC 1239 CFI_STARTPROC
1276 FAKE_STACK_FRAME $0 1240 FAKE_STACK_FRAME $0
1277 SAVE_ALL 1241 SAVE_ALL
1278 movq %rsp,%rcx 1242 movq %rsp,%rcx
1279 call sys_execve 1243 call sys_execve
1280 movq %rax, RAX(%rsp) 1244 movq %rax, RAX(%rsp)
1281 RESTORE_REST 1245 RESTORE_REST
1282 testq %rax,%rax 1246 testq %rax,%rax
1283 je int_ret_from_sys_call 1247 je int_ret_from_sys_call
@@ -1285,129 +1249,7 @@ ENTRY(kernel_execve)
1285 UNFAKE_STACK_FRAME 1249 UNFAKE_STACK_FRAME
1286 ret 1250 ret
1287 CFI_ENDPROC 1251 CFI_ENDPROC
1288ENDPROC(kernel_execve) 1252END(kernel_execve)
1289
1290KPROBE_ENTRY(page_fault)
1291 errorentry do_page_fault
1292KPROBE_END(page_fault)
1293
1294ENTRY(coprocessor_error)
1295 zeroentry do_coprocessor_error
1296END(coprocessor_error)
1297
1298ENTRY(simd_coprocessor_error)
1299 zeroentry do_simd_coprocessor_error
1300END(simd_coprocessor_error)
1301
1302ENTRY(device_not_available)
1303 zeroentry do_device_not_available
1304END(device_not_available)
1305
1306 /* runs on exception stack */
1307KPROBE_ENTRY(debug)
1308 INTR_FRAME
1309 PARAVIRT_ADJUST_EXCEPTION_FRAME
1310 pushq $0
1311 CFI_ADJUST_CFA_OFFSET 8
1312 paranoidentry do_debug, DEBUG_STACK
1313 paranoidexit
1314KPROBE_END(debug)
1315
1316 /* runs on exception stack */
1317KPROBE_ENTRY(nmi)
1318 INTR_FRAME
1319 PARAVIRT_ADJUST_EXCEPTION_FRAME
1320 pushq $-1
1321 CFI_ADJUST_CFA_OFFSET 8
1322 paranoidentry do_nmi, 0, 0
1323#ifdef CONFIG_TRACE_IRQFLAGS
1324 paranoidexit 0
1325#else
1326 jmp paranoid_exit1
1327 CFI_ENDPROC
1328#endif
1329KPROBE_END(nmi)
1330
1331KPROBE_ENTRY(int3)
1332 INTR_FRAME
1333 PARAVIRT_ADJUST_EXCEPTION_FRAME
1334 pushq $0
1335 CFI_ADJUST_CFA_OFFSET 8
1336 paranoidentry do_int3, DEBUG_STACK
1337 jmp paranoid_exit1
1338 CFI_ENDPROC
1339KPROBE_END(int3)
1340
1341ENTRY(overflow)
1342 zeroentry do_overflow
1343END(overflow)
1344
1345ENTRY(bounds)
1346 zeroentry do_bounds
1347END(bounds)
1348
1349ENTRY(invalid_op)
1350 zeroentry do_invalid_op
1351END(invalid_op)
1352
1353ENTRY(coprocessor_segment_overrun)
1354 zeroentry do_coprocessor_segment_overrun
1355END(coprocessor_segment_overrun)
1356
1357 /* runs on exception stack */
1358ENTRY(double_fault)
1359 XCPT_FRAME
1360 PARAVIRT_ADJUST_EXCEPTION_FRAME
1361 paranoidentry do_double_fault
1362 jmp paranoid_exit1
1363 CFI_ENDPROC
1364END(double_fault)
1365
1366ENTRY(invalid_TSS)
1367 errorentry do_invalid_TSS
1368END(invalid_TSS)
1369
1370ENTRY(segment_not_present)
1371 errorentry do_segment_not_present
1372END(segment_not_present)
1373
1374 /* runs on exception stack */
1375ENTRY(stack_segment)
1376 XCPT_FRAME
1377 PARAVIRT_ADJUST_EXCEPTION_FRAME
1378 paranoidentry do_stack_segment
1379 jmp paranoid_exit1
1380 CFI_ENDPROC
1381END(stack_segment)
1382
1383KPROBE_ENTRY(general_protection)
1384 errorentry do_general_protection
1385KPROBE_END(general_protection)
1386
1387ENTRY(alignment_check)
1388 errorentry do_alignment_check
1389END(alignment_check)
1390
1391ENTRY(divide_error)
1392 zeroentry do_divide_error
1393END(divide_error)
1394
1395ENTRY(spurious_interrupt_bug)
1396 zeroentry do_spurious_interrupt_bug
1397END(spurious_interrupt_bug)
1398
1399#ifdef CONFIG_X86_MCE
1400 /* runs on exception stack */
1401ENTRY(machine_check)
1402 INTR_FRAME
1403 PARAVIRT_ADJUST_EXCEPTION_FRAME
1404 pushq $0
1405 CFI_ADJUST_CFA_OFFSET 8
1406 paranoidentry do_machine_check
1407 jmp paranoid_exit1
1408 CFI_ENDPROC
1409END(machine_check)
1410#endif
1411 1253
1412/* Call softirq on interrupt stack. Interrupts are off. */ 1254/* Call softirq on interrupt stack. Interrupts are off. */
1413ENTRY(call_softirq) 1255ENTRY(call_softirq)
@@ -1427,40 +1269,33 @@ ENTRY(call_softirq)
1427 decl %gs:pda_irqcount 1269 decl %gs:pda_irqcount
1428 ret 1270 ret
1429 CFI_ENDPROC 1271 CFI_ENDPROC
1430ENDPROC(call_softirq) 1272END(call_softirq)
1431
1432KPROBE_ENTRY(ignore_sysret)
1433 CFI_STARTPROC
1434 mov $-ENOSYS,%eax
1435 sysret
1436 CFI_ENDPROC
1437ENDPROC(ignore_sysret)
1438 1273
1439#ifdef CONFIG_XEN 1274#ifdef CONFIG_XEN
1440ENTRY(xen_hypervisor_callback) 1275zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
1441 zeroentry xen_do_hypervisor_callback
1442END(xen_hypervisor_callback)
1443 1276
1444/* 1277/*
1445# A note on the "critical region" in our callback handler. 1278 * A note on the "critical region" in our callback handler.
1446# We want to avoid stacking callback handlers due to events occurring 1279 * We want to avoid stacking callback handlers due to events occurring
1447# during handling of the last event. To do this, we keep events disabled 1280 * during handling of the last event. To do this, we keep events disabled
1448# until we've done all processing. HOWEVER, we must enable events before 1281 * until we've done all processing. HOWEVER, we must enable events before
1449# popping the stack frame (can't be done atomically) and so it would still 1282 * popping the stack frame (can't be done atomically) and so it would still
1450# be possible to get enough handler activations to overflow the stack. 1283 * be possible to get enough handler activations to overflow the stack.
1451# Although unlikely, bugs of that kind are hard to track down, so we'd 1284 * Although unlikely, bugs of that kind are hard to track down, so we'd
1452# like to avoid the possibility. 1285 * like to avoid the possibility.
1453# So, on entry to the handler we detect whether we interrupted an 1286 * So, on entry to the handler we detect whether we interrupted an
1454# existing activation in its critical region -- if so, we pop the current 1287 * existing activation in its critical region -- if so, we pop the current
1455# activation and restart the handler using the previous one. 1288 * activation and restart the handler using the previous one.
1456*/ 1289 */
1457ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) 1290ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1458 CFI_STARTPROC 1291 CFI_STARTPROC
1459/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will 1292/*
1460 see the correct pointer to the pt_regs */ 1293 * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
1294 * see the correct pointer to the pt_regs
1295 */
1461 movq %rdi, %rsp # we don't return, adjust the stack frame 1296 movq %rdi, %rsp # we don't return, adjust the stack frame
1462 CFI_ENDPROC 1297 CFI_ENDPROC
1463 CFI_DEFAULT_STACK 1298 DEFAULT_FRAME
146411: incl %gs:pda_irqcount 129911: incl %gs:pda_irqcount
1465 movq %rsp,%rbp 1300 movq %rsp,%rbp
1466 CFI_DEF_CFA_REGISTER rbp 1301 CFI_DEF_CFA_REGISTER rbp
@@ -1475,23 +1310,26 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1475END(do_hypervisor_callback) 1310END(do_hypervisor_callback)
1476 1311
1477/* 1312/*
1478# Hypervisor uses this for application faults while it executes. 1313 * Hypervisor uses this for application faults while it executes.
1479# We get here for two reasons: 1314 * We get here for two reasons:
1480# 1. Fault while reloading DS, ES, FS or GS 1315 * 1. Fault while reloading DS, ES, FS or GS
1481# 2. Fault while executing IRET 1316 * 2. Fault while executing IRET
1482# Category 1 we do not need to fix up as Xen has already reloaded all segment 1317 * Category 1 we do not need to fix up as Xen has already reloaded all segment
1483# registers that could be reloaded and zeroed the others. 1318 * registers that could be reloaded and zeroed the others.
1484# Category 2 we fix up by killing the current process. We cannot use the 1319 * Category 2 we fix up by killing the current process. We cannot use the
1485# normal Linux return path in this case because if we use the IRET hypercall 1320 * normal Linux return path in this case because if we use the IRET hypercall
1486# to pop the stack frame we end up in an infinite loop of failsafe callbacks. 1321 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1487# We distinguish between categories by comparing each saved segment register 1322 * We distinguish between categories by comparing each saved segment register
1488# with its current contents: any discrepancy means we in category 1. 1323 * with its current contents: any discrepancy means we in category 1.
1489*/ 1324 */
1490ENTRY(xen_failsafe_callback) 1325ENTRY(xen_failsafe_callback)
1491 framesz = (RIP-0x30) /* workaround buggy gas */ 1326 INTR_FRAME 1 (6*8)
1492 _frame framesz 1327 /*CFI_REL_OFFSET gs,GS*/
1493 CFI_REL_OFFSET rcx, 0 1328 /*CFI_REL_OFFSET fs,FS*/
1494 CFI_REL_OFFSET r11, 8 1329 /*CFI_REL_OFFSET es,ES*/
1330 /*CFI_REL_OFFSET ds,DS*/
1331 CFI_REL_OFFSET r11,8
1332 CFI_REL_OFFSET rcx,0
1495 movw %ds,%cx 1333 movw %ds,%cx
1496 cmpw %cx,0x10(%rsp) 1334 cmpw %cx,0x10(%rsp)
1497 CFI_REMEMBER_STATE 1335 CFI_REMEMBER_STATE
@@ -1512,12 +1350,9 @@ ENTRY(xen_failsafe_callback)
1512 CFI_RESTORE r11 1350 CFI_RESTORE r11
1513 addq $0x30,%rsp 1351 addq $0x30,%rsp
1514 CFI_ADJUST_CFA_OFFSET -0x30 1352 CFI_ADJUST_CFA_OFFSET -0x30
1515 pushq $0 1353 pushq_cfi $0 /* RIP */
1516 CFI_ADJUST_CFA_OFFSET 8 1354 pushq_cfi %r11
1517 pushq %r11 1355 pushq_cfi %rcx
1518 CFI_ADJUST_CFA_OFFSET 8
1519 pushq %rcx
1520 CFI_ADJUST_CFA_OFFSET 8
1521 jmp general_protection 1356 jmp general_protection
1522 CFI_RESTORE_STATE 1357 CFI_RESTORE_STATE
15231: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ 13581: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
@@ -1527,11 +1362,223 @@ ENTRY(xen_failsafe_callback)
1527 CFI_RESTORE r11 1362 CFI_RESTORE r11
1528 addq $0x30,%rsp 1363 addq $0x30,%rsp
1529 CFI_ADJUST_CFA_OFFSET -0x30 1364 CFI_ADJUST_CFA_OFFSET -0x30
1530 pushq $0 1365 pushq_cfi $0
1531 CFI_ADJUST_CFA_OFFSET 8
1532 SAVE_ALL 1366 SAVE_ALL
1533 jmp error_exit 1367 jmp error_exit
1534 CFI_ENDPROC 1368 CFI_ENDPROC
1535END(xen_failsafe_callback) 1369END(xen_failsafe_callback)
1536 1370
1537#endif /* CONFIG_XEN */ 1371#endif /* CONFIG_XEN */
1372
1373/*
1374 * Some functions should be protected against kprobes
1375 */
1376 .pushsection .kprobes.text, "ax"
1377
1378paranoidzeroentry_ist debug do_debug DEBUG_STACK
1379paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1380paranoiderrorentry stack_segment do_stack_segment
1381errorentry general_protection do_general_protection
1382errorentry page_fault do_page_fault
1383#ifdef CONFIG_X86_MCE
1384paranoidzeroentry machine_check do_machine_check
1385#endif
1386
1387 /*
1388 * "Paranoid" exit path from exception stack.
1389 * Paranoid because this is used by NMIs and cannot take
1390 * any kernel state for granted.
1391 * We don't do kernel preemption checks here, because only
1392 * NMI should be common and it does not enable IRQs and
1393 * cannot get reschedule ticks.
1394 *
1395 * "trace" is 0 for the NMI handler only, because irq-tracing
1396 * is fundamentally NMI-unsafe. (we cannot change the soft and
1397 * hard flags at once, atomically)
1398 */
1399
1400 /* ebx: no swapgs flag */
1401ENTRY(paranoid_exit)
1402 INTR_FRAME
1403 DISABLE_INTERRUPTS(CLBR_NONE)
1404 TRACE_IRQS_OFF
1405 testl %ebx,%ebx /* swapgs needed? */
1406 jnz paranoid_restore
1407 testl $3,CS(%rsp)
1408 jnz paranoid_userspace
1409paranoid_swapgs:
1410 TRACE_IRQS_IRETQ 0
1411 SWAPGS_UNSAFE_STACK
1412paranoid_restore:
1413 RESTORE_ALL 8
1414 jmp irq_return
1415paranoid_userspace:
1416 GET_THREAD_INFO(%rcx)
1417 movl TI_flags(%rcx),%ebx
1418 andl $_TIF_WORK_MASK,%ebx
1419 jz paranoid_swapgs
1420 movq %rsp,%rdi /* &pt_regs */
1421 call sync_regs
1422 movq %rax,%rsp /* switch stack for scheduling */
1423 testl $_TIF_NEED_RESCHED,%ebx
1424 jnz paranoid_schedule
1425 movl %ebx,%edx /* arg3: thread flags */
1426 TRACE_IRQS_ON
1427 ENABLE_INTERRUPTS(CLBR_NONE)
1428 xorl %esi,%esi /* arg2: oldset */
1429 movq %rsp,%rdi /* arg1: &pt_regs */
1430 call do_notify_resume
1431 DISABLE_INTERRUPTS(CLBR_NONE)
1432 TRACE_IRQS_OFF
1433 jmp paranoid_userspace
1434paranoid_schedule:
1435 TRACE_IRQS_ON
1436 ENABLE_INTERRUPTS(CLBR_ANY)
1437 call schedule
1438 DISABLE_INTERRUPTS(CLBR_ANY)
1439 TRACE_IRQS_OFF
1440 jmp paranoid_userspace
1441 CFI_ENDPROC
1442END(paranoid_exit)
1443
1444/*
1445 * Exception entry point. This expects an error code/orig_rax on the stack.
1446 * returns in "no swapgs flag" in %ebx.
1447 */
1448ENTRY(error_entry)
1449 XCPT_FRAME
1450 CFI_ADJUST_CFA_OFFSET 15*8
1451 /* oldrax contains error code */
1452 cld
1453 movq_cfi rdi, RDI+8
1454 movq_cfi rsi, RSI+8
1455 movq_cfi rdx, RDX+8
1456 movq_cfi rcx, RCX+8
1457 movq_cfi rax, RAX+8
1458 movq_cfi r8, R8+8
1459 movq_cfi r9, R9+8
1460 movq_cfi r10, R10+8
1461 movq_cfi r11, R11+8
1462 movq_cfi rbx, RBX+8
1463 movq_cfi rbp, RBP+8
1464 movq_cfi r12, R12+8
1465 movq_cfi r13, R13+8
1466 movq_cfi r14, R14+8
1467 movq_cfi r15, R15+8
1468 xorl %ebx,%ebx
1469 testl $3,CS+8(%rsp)
1470 je error_kernelspace
1471error_swapgs:
1472 SWAPGS
1473error_sti:
1474 TRACE_IRQS_OFF
1475 ret
1476 CFI_ENDPROC
1477
1478/*
1479 * There are two places in the kernel that can potentially fault with
1480 * usergs. Handle them here. The exception handlers after iret run with
1481 * kernel gs again, so don't set the user space flag. B stepping K8s
1482 * sometimes report an truncated RIP for IRET exceptions returning to
1483 * compat mode. Check for these here too.
1484 */
1485error_kernelspace:
1486 incl %ebx
1487 leaq irq_return(%rip),%rcx
1488 cmpq %rcx,RIP+8(%rsp)
1489 je error_swapgs
1490 movl %ecx,%ecx /* zero extend */
1491 cmpq %rcx,RIP+8(%rsp)
1492 je error_swapgs
1493 cmpq $gs_change,RIP+8(%rsp)
1494 je error_swapgs
1495 jmp error_sti
1496END(error_entry)
1497
1498
1499/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
1500ENTRY(error_exit)
1501 DEFAULT_FRAME
1502 movl %ebx,%eax
1503 RESTORE_REST
1504 DISABLE_INTERRUPTS(CLBR_NONE)
1505 TRACE_IRQS_OFF
1506 GET_THREAD_INFO(%rcx)
1507 testl %eax,%eax
1508 jne retint_kernel
1509 LOCKDEP_SYS_EXIT_IRQ
1510 movl TI_flags(%rcx),%edx
1511 movl $_TIF_WORK_MASK,%edi
1512 andl %edi,%edx
1513 jnz retint_careful
1514 jmp retint_swapgs
1515 CFI_ENDPROC
1516END(error_exit)
1517
1518
1519 /* runs on exception stack */
1520ENTRY(nmi)
1521 INTR_FRAME
1522 PARAVIRT_ADJUST_EXCEPTION_FRAME
1523 pushq_cfi $-1
1524 subq $15*8, %rsp
1525 CFI_ADJUST_CFA_OFFSET 15*8
1526 call save_paranoid
1527 DEFAULT_FRAME 0
1528 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1529 movq %rsp,%rdi
1530 movq $-1,%rsi
1531 call do_nmi
1532#ifdef CONFIG_TRACE_IRQFLAGS
1533 /* paranoidexit; without TRACE_IRQS_OFF */
1534 /* ebx: no swapgs flag */
1535 DISABLE_INTERRUPTS(CLBR_NONE)
1536 testl %ebx,%ebx /* swapgs needed? */
1537 jnz nmi_restore
1538 testl $3,CS(%rsp)
1539 jnz nmi_userspace
1540nmi_swapgs:
1541 SWAPGS_UNSAFE_STACK
1542nmi_restore:
1543 RESTORE_ALL 8
1544 jmp irq_return
1545nmi_userspace:
1546 GET_THREAD_INFO(%rcx)
1547 movl TI_flags(%rcx),%ebx
1548 andl $_TIF_WORK_MASK,%ebx
1549 jz nmi_swapgs
1550 movq %rsp,%rdi /* &pt_regs */
1551 call sync_regs
1552 movq %rax,%rsp /* switch stack for scheduling */
1553 testl $_TIF_NEED_RESCHED,%ebx
1554 jnz nmi_schedule
1555 movl %ebx,%edx /* arg3: thread flags */
1556 ENABLE_INTERRUPTS(CLBR_NONE)
1557 xorl %esi,%esi /* arg2: oldset */
1558 movq %rsp,%rdi /* arg1: &pt_regs */
1559 call do_notify_resume
1560 DISABLE_INTERRUPTS(CLBR_NONE)
1561 jmp nmi_userspace
1562nmi_schedule:
1563 ENABLE_INTERRUPTS(CLBR_ANY)
1564 call schedule
1565 DISABLE_INTERRUPTS(CLBR_ANY)
1566 jmp nmi_userspace
1567 CFI_ENDPROC
1568#else
1569 jmp paranoid_exit
1570 CFI_ENDPROC
1571#endif
1572END(nmi)
1573
1574ENTRY(ignore_sysret)
1575 CFI_STARTPROC
1576 mov $-ENOSYS,%eax
1577 sysret
1578 CFI_ENDPROC
1579END(ignore_sysret)
1580
1581/*
1582 * End of kprobes section
1583 */
1584 .popsection
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 0e88be11227d..b193e082f6ce 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/threads.h> 12#include <linux/threads.h>
13#include <linux/cpu.h>
13#include <linux/cpumask.h> 14#include <linux/cpumask.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/ctype.h> 16#include <linux/ctype.h>
@@ -17,6 +18,9 @@
17#include <linux/sched.h> 18#include <linux/sched.h>
18#include <linux/module.h> 19#include <linux/module.h>
19#include <linux/hardirq.h> 20#include <linux/hardirq.h>
21#include <linux/timer.h>
22#include <linux/proc_fs.h>
23#include <asm/current.h>
20#include <asm/smp.h> 24#include <asm/smp.h>
21#include <asm/ipi.h> 25#include <asm/ipi.h>
22#include <asm/genapic.h> 26#include <asm/genapic.h>
@@ -383,6 +387,103 @@ static __init void uv_rtc_init(void)
383} 387}
384 388
385/* 389/*
390 * percpu heartbeat timer
391 */
392static void uv_heartbeat(unsigned long ignored)
393{
394 struct timer_list *timer = &uv_hub_info->scir.timer;
395 unsigned char bits = uv_hub_info->scir.state;
396
397 /* flip heartbeat bit */
398 bits ^= SCIR_CPU_HEARTBEAT;
399
400 /* is this cpu idle? */
401 if (idle_cpu(raw_smp_processor_id()))
402 bits &= ~SCIR_CPU_ACTIVITY;
403 else
404 bits |= SCIR_CPU_ACTIVITY;
405
406 /* update system controller interface reg */
407 uv_set_scir_bits(bits);
408
409 /* enable next timer period */
410 mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
411}
412
413static void __cpuinit uv_heartbeat_enable(int cpu)
414{
415 if (!uv_cpu_hub_info(cpu)->scir.enabled) {
416 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
417
418 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
419 setup_timer(timer, uv_heartbeat, cpu);
420 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
421 add_timer_on(timer, cpu);
422 uv_cpu_hub_info(cpu)->scir.enabled = 1;
423 }
424
425 /* check boot cpu */
426 if (!uv_cpu_hub_info(0)->scir.enabled)
427 uv_heartbeat_enable(0);
428}
429
430#ifdef CONFIG_HOTPLUG_CPU
431static void __cpuinit uv_heartbeat_disable(int cpu)
432{
433 if (uv_cpu_hub_info(cpu)->scir.enabled) {
434 uv_cpu_hub_info(cpu)->scir.enabled = 0;
435 del_timer(&uv_cpu_hub_info(cpu)->scir.timer);
436 }
437 uv_set_cpu_scir_bits(cpu, 0xff);
438}
439
440/*
441 * cpu hotplug notifier
442 */
443static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self,
444 unsigned long action, void *hcpu)
445{
446 long cpu = (long)hcpu;
447
448 switch (action) {
449 case CPU_ONLINE:
450 uv_heartbeat_enable(cpu);
451 break;
452 case CPU_DOWN_PREPARE:
453 uv_heartbeat_disable(cpu);
454 break;
455 default:
456 break;
457 }
458 return NOTIFY_OK;
459}
460
461static __init void uv_scir_register_cpu_notifier(void)
462{
463 hotcpu_notifier(uv_scir_cpu_notify, 0);
464}
465
466#else /* !CONFIG_HOTPLUG_CPU */
467
468static __init void uv_scir_register_cpu_notifier(void)
469{
470}
471
472static __init int uv_init_heartbeat(void)
473{
474 int cpu;
475
476 if (is_uv_system())
477 for_each_online_cpu(cpu)
478 uv_heartbeat_enable(cpu);
479 return 0;
480}
481
482late_initcall(uv_init_heartbeat);
483
484#endif /* !CONFIG_HOTPLUG_CPU */
485
486/*
386 * Called on each cpu to initialize the per_cpu UV data area. 487 * Called on each cpu to initialize the per_cpu UV data area.
387 * ZZZ hotplug not supported yet 488 * ZZZ hotplug not supported yet
388 */ 489 */
@@ -455,7 +556,7 @@ void __init uv_system_init(void)
455 556
456 uv_bios_init(); 557 uv_bios_init();
457 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 558 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
458 &uv_coherency_id, &uv_region_size); 559 &sn_coherency_id, &sn_region_size);
459 uv_rtc_init(); 560 uv_rtc_init();
460 561
461 for_each_present_cpu(cpu) { 562 for_each_present_cpu(cpu) {
@@ -466,8 +567,7 @@ void __init uv_system_init(void)
466 uv_blade_info[blade].nr_possible_cpus++; 567 uv_blade_info[blade].nr_possible_cpus++;
467 568
468 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; 569 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
469 uv_cpu_hub_info(cpu)->lowmem_remap_top = 570 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
470 lowmem_redir_base + lowmem_redir_size;
471 uv_cpu_hub_info(cpu)->m_val = m_val; 571 uv_cpu_hub_info(cpu)->m_val = m_val;
472 uv_cpu_hub_info(cpu)->n_val = m_val; 572 uv_cpu_hub_info(cpu)->n_val = m_val;
473 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 573 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
@@ -477,7 +577,8 @@ void __init uv_system_init(void)
477 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 577 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
478 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 578 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
479 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 579 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
480 uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; 580 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
581 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
481 uv_node_to_blade[nid] = blade; 582 uv_node_to_blade[nid] = blade;
482 uv_cpu_to_blade[cpu] = blade; 583 uv_cpu_to_blade[cpu] = blade;
483 max_pnode = max(pnode, max_pnode); 584 max_pnode = max(pnode, max_pnode);
@@ -494,4 +595,6 @@ void __init uv_system_init(void)
494 map_mmioh_high(max_pnode); 595 map_mmioh_high(max_pnode);
495 596
496 uv_cpu_init(); 597 uv_cpu_init();
598 uv_scir_register_cpu_notifier();
599 proc_mkdir("sgi_uv", NULL);
497} 600}
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 1dcb0f13897e..3e66bd364a9d 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -35,7 +35,6 @@ void __init reserve_ebda_region(void)
35 35
36 /* start of EBDA area */ 36 /* start of EBDA area */
37 ebda_addr = get_bios_ebda(); 37 ebda_addr = get_bios_ebda();
38 printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
39 38
40 /* Fixup: bios puts an EBDA in the top 64K segment */ 39 /* Fixup: bios puts an EBDA in the top 64K segment */
41 /* of conventional memory, but does not adjust lowmem. */ 40 /* of conventional memory, but does not adjust lowmem. */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index fa1d25dd83e3..ac108d1fe182 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -12,9 +12,12 @@
12#include <asm/sections.h> 12#include <asm/sections.h>
13#include <asm/e820.h> 13#include <asm/e820.h>
14#include <asm/bios_ebda.h> 14#include <asm/bios_ebda.h>
15#include <asm/trampoline.h>
15 16
16void __init i386_start_kernel(void) 17void __init i386_start_kernel(void)
17{ 18{
19 reserve_trampoline_memory();
20
18 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 21 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
19 22
20#ifdef CONFIG_BLK_DEV_INITRD 23#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index d16084f90649..388e05a5fc17 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -24,6 +24,7 @@
24#include <asm/kdebug.h> 24#include <asm/kdebug.h>
25#include <asm/e820.h> 25#include <asm/e820.h>
26#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
27#include <asm/trampoline.h>
27 28
28/* boot cpu pda */ 29/* boot cpu pda */
29static struct x8664_pda _boot_cpu_pda __read_mostly; 30static struct x8664_pda _boot_cpu_pda __read_mostly;
@@ -120,6 +121,8 @@ void __init x86_64_start_reservations(char *real_mode_data)
120{ 121{
121 copy_bootdata(__va(real_mode_data)); 122 copy_bootdata(__va(real_mode_data));
122 123
124 reserve_trampoline_memory();
125
123 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 126 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
124 127
125#ifdef CONFIG_BLK_DEV_INITRD 128#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index e76d7e272974..cd759ad90690 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -33,7 +33,9 @@
33 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
34 */ 34 */
35unsigned long hpet_address; 35unsigned long hpet_address;
36unsigned long hpet_num_timers; 36#ifdef CONFIG_PCI_MSI
37static unsigned long hpet_num_timers;
38#endif
37static void __iomem *hpet_virt_address; 39static void __iomem *hpet_virt_address;
38 40
39struct hpet_dev { 41struct hpet_dev {
@@ -811,7 +813,7 @@ int __init hpet_enable(void)
811 813
812out_nohpet: 814out_nohpet:
813 hpet_clear_mapping(); 815 hpet_clear_mapping();
814 boot_hpet_disable = 1; 816 hpet_address = 0;
815 return 0; 817 return 0;
816} 818}
817 819
@@ -834,10 +836,11 @@ static __init int hpet_late_init(void)
834 836
835 hpet_address = force_hpet_address; 837 hpet_address = force_hpet_address;
836 hpet_enable(); 838 hpet_enable();
837 if (!hpet_virt_address)
838 return -ENODEV;
839 } 839 }
840 840
841 if (!hpet_virt_address)
842 return -ENODEV;
843
841 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 844 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
842 845
843 for_each_online_cpu(cpu) { 846 for_each_online_cpu(cpu) {
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index a4f93b4120c1..d39918076bb4 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -14,7 +14,6 @@ static struct fs_struct init_fs = INIT_FS;
14static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 14static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
15static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 15static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
16struct mm_struct init_mm = INIT_MM(init_mm); 16struct mm_struct init_mm = INIT_MM(init_mm);
17EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
18 17
19/* 18/*
20 * Initial thread structure. 19 * Initial thread structure.
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 1cbf7c8d46e0..3e070bb961d7 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -2472,10 +2472,9 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq,
2472asmlinkage void smp_irq_move_cleanup_interrupt(void) 2472asmlinkage void smp_irq_move_cleanup_interrupt(void)
2473{ 2473{
2474 unsigned vector, me; 2474 unsigned vector, me;
2475
2475 ack_APIC_irq(); 2476 ack_APIC_irq();
2476#ifdef CONFIG_X86_64
2477 exit_idle(); 2477 exit_idle();
2478#endif
2479 irq_enter(); 2478 irq_enter();
2480 2479
2481 me = smp_processor_id(); 2480 me = smp_processor_id();
@@ -2520,7 +2519,7 @@ static void irq_complete_move(struct irq_desc **descp)
2520 if (likely(!cfg->move_desc_pending)) 2519 if (likely(!cfg->move_desc_pending))
2521 return; 2520 return;
2522 2521
2523 /* domain is not change, but affinity is changed */ 2522 /* domain has not changed, but affinity did */
2524 me = smp_processor_id(); 2523 me = smp_processor_id();
2525 if (cpu_isset(me, desc->affinity)) { 2524 if (cpu_isset(me, desc->affinity)) {
2526 *descp = desc = move_irq_desc(desc, me); 2525 *descp = desc = move_irq_desc(desc, me);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index fca2991443f5..6383d50f82ea 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -13,12 +13,12 @@
13#include <linux/seq_file.h> 13#include <linux/seq_file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/delay.h> 15#include <linux/delay.h>
16#include <linux/ftrace.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/io_apic.h> 18#include <asm/io_apic.h>
18#include <asm/idle.h> 19#include <asm/idle.h>
19#include <asm/smp.h> 20#include <asm/smp.h>
20 21
21#ifdef CONFIG_DEBUG_STACKOVERFLOW
22/* 22/*
23 * Probabilistic stack overflow check: 23 * Probabilistic stack overflow check:
24 * 24 *
@@ -28,26 +28,25 @@
28 */ 28 */
29static inline void stack_overflow_check(struct pt_regs *regs) 29static inline void stack_overflow_check(struct pt_regs *regs)
30{ 30{
31#ifdef CONFIG_DEBUG_STACKOVERFLOW
31 u64 curbase = (u64)task_stack_page(current); 32 u64 curbase = (u64)task_stack_page(current);
32 static unsigned long warned = -60*HZ; 33
33 34 WARN_ONCE(regs->sp >= curbase &&
34 if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && 35 regs->sp <= curbase + THREAD_SIZE &&
35 regs->sp < curbase + sizeof(struct thread_info) + 128 && 36 regs->sp < curbase + sizeof(struct thread_info) +
36 time_after(jiffies, warned + 60*HZ)) { 37 sizeof(struct pt_regs) + 128,
37 printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", 38
38 current->comm, curbase, regs->sp); 39 "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
39 show_stack(NULL,NULL); 40 current->comm, curbase, regs->sp);
40 warned = jiffies;
41 }
42}
43#endif 41#endif
42}
44 43
45/* 44/*
46 * do_IRQ handles all normal device IRQ's (the special 45 * do_IRQ handles all normal device IRQ's (the special
47 * SMP cross-CPU interrupts have their own specific 46 * SMP cross-CPU interrupts have their own specific
48 * handlers). 47 * handlers).
49 */ 48 */
50asmlinkage unsigned int do_IRQ(struct pt_regs *regs) 49asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
51{ 50{
52 struct pt_regs *old_regs = set_irq_regs(regs); 51 struct pt_regs *old_regs = set_irq_regs(regs);
53 struct irq_desc *desc; 52 struct irq_desc *desc;
@@ -60,9 +59,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
60 irq_enter(); 59 irq_enter();
61 irq = __get_cpu_var(vector_irq)[vector]; 60 irq = __get_cpu_var(vector_irq)[vector];
62 61
63#ifdef CONFIG_DEBUG_STACKOVERFLOW
64 stack_overflow_check(regs); 62 stack_overflow_check(regs);
65#endif
66 63
67 desc = irq_to_desc(irq); 64 desc = irq_to_desc(irq);
68 if (likely(desc)) 65 if (likely(desc))
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 61aa2a1004b5..84723295f88a 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -140,7 +140,7 @@ void __init native_init_IRQ(void)
140 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 140 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
141 /* SYSCALL_VECTOR was reserved in trap_init. */ 141 /* SYSCALL_VECTOR was reserved in trap_init. */
142 if (i != SYSCALL_VECTOR) 142 if (i != SYSCALL_VECTOR)
143 set_intr_gate(i, interrupt[i]); 143 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
144 } 144 }
145 145
146 146
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 1020919efe1c..31ebfe38e96c 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -24,41 +24,6 @@
24#include <asm/i8259.h> 24#include <asm/i8259.h>
25 25
26/* 26/*
27 * Common place to define all x86 IRQ vectors
28 *
29 * This builds up the IRQ handler stubs using some ugly macros in irq.h
30 *
31 * These macros create the low-level assembly IRQ routines that save
32 * register context and call do_IRQ(). do_IRQ() then does all the
33 * operations that are needed to keep the AT (or SMP IOAPIC)
34 * interrupt-controller happy.
35 */
36
37#define IRQ_NAME2(nr) nr##_interrupt(void)
38#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
39
40/*
41 * SMP has a few special interrupts for IPI messages
42 */
43
44#define BUILD_IRQ(nr) \
45 asmlinkage void IRQ_NAME(nr); \
46 asm("\n.text\n.p2align\n" \
47 "IRQ" #nr "_interrupt:\n\t" \
48 "push $~(" #nr ") ; " \
49 "jmp common_interrupt\n" \
50 ".previous");
51
52#define BI(x,y) \
53 BUILD_IRQ(x##y)
54
55#define BUILD_16_IRQS(x) \
56 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
57 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
58 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
59 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
60
61/*
62 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: 27 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
63 * (these are usually mapped to vectors 0x30-0x3f) 28 * (these are usually mapped to vectors 0x30-0x3f)
64 */ 29 */
@@ -73,37 +38,6 @@
73 * 38 *
74 * (these are usually mapped into the 0x30-0xff vector range) 39 * (these are usually mapped into the 0x30-0xff vector range)
75 */ 40 */
76 BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
77BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
78BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
79BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
80
81#undef BUILD_16_IRQS
82#undef BI
83
84
85#define IRQ(x,y) \
86 IRQ##x##y##_interrupt
87
88#define IRQLIST_16(x) \
89 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
90 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
91 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
92 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
93
94/* for the irq vectors */
95static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
96 IRQLIST_16(0x2), IRQLIST_16(0x3),
97 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
98 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
99 IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
100};
101
102#undef IRQ
103#undef IRQLIST_16
104
105
106
107 41
108/* 42/*
109 * IRQ2 is cascade interrupt to second interrupt controller 43 * IRQ2 is cascade interrupt to second interrupt controller
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 5f8e5d75a254..c25fdb382292 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -10,7 +10,7 @@
10 * This driver allows to upgrade microcode on AMD 10 * This driver allows to upgrade microcode on AMD
11 * family 0x10 and 0x11 processors. 11 * family 0x10 and 0x11 processors.
12 * 12 *
13 * Licensed unter the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15*/ 15*/
16 16
@@ -32,9 +32,9 @@
32#include <linux/platform_device.h> 32#include <linux/platform_device.h>
33#include <linux/pci.h> 33#include <linux/pci.h>
34#include <linux/pci_ids.h> 34#include <linux/pci_ids.h>
35#include <linux/uaccess.h>
35 36
36#include <asm/msr.h> 37#include <asm/msr.h>
37#include <asm/uaccess.h>
38#include <asm/processor.h> 38#include <asm/processor.h>
39#include <asm/microcode.h> 39#include <asm/microcode.h>
40 40
@@ -47,43 +47,38 @@ MODULE_LICENSE("GPL v2");
47#define UCODE_UCODE_TYPE 0x00000001 47#define UCODE_UCODE_TYPE 0x00000001
48 48
49struct equiv_cpu_entry { 49struct equiv_cpu_entry {
50 unsigned int installed_cpu; 50 u32 installed_cpu;
51 unsigned int fixed_errata_mask; 51 u32 fixed_errata_mask;
52 unsigned int fixed_errata_compare; 52 u32 fixed_errata_compare;
53 unsigned int equiv_cpu; 53 u16 equiv_cpu;
54}; 54 u16 res;
55} __attribute__((packed));
55 56
56struct microcode_header_amd { 57struct microcode_header_amd {
57 unsigned int data_code; 58 u32 data_code;
58 unsigned int patch_id; 59 u32 patch_id;
59 unsigned char mc_patch_data_id[2]; 60 u16 mc_patch_data_id;
60 unsigned char mc_patch_data_len; 61 u8 mc_patch_data_len;
61 unsigned char init_flag; 62 u8 init_flag;
62 unsigned int mc_patch_data_checksum; 63 u32 mc_patch_data_checksum;
63 unsigned int nb_dev_id; 64 u32 nb_dev_id;
64 unsigned int sb_dev_id; 65 u32 sb_dev_id;
65 unsigned char processor_rev_id[2]; 66 u16 processor_rev_id;
66 unsigned char nb_rev_id; 67 u8 nb_rev_id;
67 unsigned char sb_rev_id; 68 u8 sb_rev_id;
68 unsigned char bios_api_rev; 69 u8 bios_api_rev;
69 unsigned char reserved1[3]; 70 u8 reserved1[3];
70 unsigned int match_reg[8]; 71 u32 match_reg[8];
71}; 72} __attribute__((packed));
72 73
73struct microcode_amd { 74struct microcode_amd {
74 struct microcode_header_amd hdr; 75 struct microcode_header_amd hdr;
75 unsigned int mpb[0]; 76 unsigned int mpb[0];
76}; 77};
77 78
78#define UCODE_MAX_SIZE (2048) 79#define UCODE_MAX_SIZE 2048
79#define DEFAULT_UCODE_DATASIZE (896) 80#define UCODE_CONTAINER_SECTION_HDR 8
80#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) 81#define UCODE_CONTAINER_HEADER_SIZE 12
81#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
82#define DWSIZE (sizeof(u32))
83/* For now we support a fixed ucode total size only */
84#define get_totalsize(mc) \
85 ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
86 + MC_HEADER_SIZE)
87 82
88/* serialize access to the physical write */ 83/* serialize access to the physical write */
89static DEFINE_SPINLOCK(microcode_update_lock); 84static DEFINE_SPINLOCK(microcode_update_lock);
@@ -93,31 +88,24 @@ static struct equiv_cpu_entry *equiv_cpu_table;
93static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 88static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
94{ 89{
95 struct cpuinfo_x86 *c = &cpu_data(cpu); 90 struct cpuinfo_x86 *c = &cpu_data(cpu);
91 u32 dummy;
96 92
97 memset(csig, 0, sizeof(*csig)); 93 memset(csig, 0, sizeof(*csig));
98
99 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 94 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
100 printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", 95 printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
101 cpu); 96 "supported\n", cpu, c->x86);
102 return -1; 97 return -1;
103 } 98 }
104 99 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
105 asm volatile("movl %1, %%ecx; rdmsr" 100 printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev);
106 : "=a" (csig->rev)
107 : "i" (0x0000008B) : "ecx");
108
109 printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
110 csig->rev);
111
112 return 0; 101 return 0;
113} 102}
114 103
115static int get_matching_microcode(int cpu, void *mc, int rev) 104static int get_matching_microcode(int cpu, void *mc, int rev)
116{ 105{
117 struct microcode_header_amd *mc_header = mc; 106 struct microcode_header_amd *mc_header = mc;
118 struct pci_dev *nb_pci_dev, *sb_pci_dev;
119 unsigned int current_cpu_id; 107 unsigned int current_cpu_id;
120 unsigned int equiv_cpu_id = 0x00; 108 u16 equiv_cpu_id = 0;
121 unsigned int i = 0; 109 unsigned int i = 0;
122 110
123 BUG_ON(equiv_cpu_table == NULL); 111 BUG_ON(equiv_cpu_table == NULL);
@@ -132,57 +120,25 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
132 } 120 }
133 121
134 if (!equiv_cpu_id) { 122 if (!equiv_cpu_id) {
135 printk(KERN_ERR "microcode: CPU%d cpu_id " 123 printk(KERN_WARNING "microcode: CPU%d: cpu revision "
136 "not found in equivalent cpu table \n", cpu); 124 "not listed in equivalent cpu table\n", cpu);
137 return 0; 125 return 0;
138 } 126 }
139 127
140 if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) { 128 if (mc_header->processor_rev_id != equiv_cpu_id) {
141 printk(KERN_ERR 129 printk(KERN_ERR "microcode: CPU%d: patch mismatch "
142 "microcode: CPU%d patch does not match " 130 "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
143 "(patch is %x, cpu extended is %x) \n", 131 cpu, mc_header->processor_rev_id, equiv_cpu_id);
144 cpu, mc_header->processor_rev_id[0],
145 (equiv_cpu_id & 0xff));
146 return 0; 132 return 0;
147 } 133 }
148 134
149 if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) { 135 /* ucode might be chipset specific -- currently we don't support this */
150 printk(KERN_ERR "microcode: CPU%d patch does not match " 136 if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
151 "(patch is %x, cpu base id is %x) \n", 137 printk(KERN_ERR "microcode: CPU%d: loading of chipset "
152 cpu, mc_header->processor_rev_id[1], 138 "specific code not yet supported\n", cpu);
153 ((equiv_cpu_id >> 16) & 0xff));
154
155 return 0; 139 return 0;
156 } 140 }
157 141
158 /* ucode may be northbridge specific */
159 if (mc_header->nb_dev_id) {
160 nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
161 (mc_header->nb_dev_id & 0xff),
162 NULL);
163 if ((!nb_pci_dev) ||
164 (mc_header->nb_rev_id != nb_pci_dev->revision)) {
165 printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
166 pci_dev_put(nb_pci_dev);
167 return 0;
168 }
169 pci_dev_put(nb_pci_dev);
170 }
171
172 /* ucode may be southbridge specific */
173 if (mc_header->sb_dev_id) {
174 sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
175 (mc_header->sb_dev_id & 0xff),
176 NULL);
177 if ((!sb_pci_dev) ||
178 (mc_header->sb_rev_id != sb_pci_dev->revision)) {
179 printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
180 pci_dev_put(sb_pci_dev);
181 return 0;
182 }
183 pci_dev_put(sb_pci_dev);
184 }
185
186 if (mc_header->patch_id <= rev) 142 if (mc_header->patch_id <= rev)
187 return 0; 143 return 0;
188 144
@@ -192,12 +148,10 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
192static void apply_microcode_amd(int cpu) 148static void apply_microcode_amd(int cpu)
193{ 149{
194 unsigned long flags; 150 unsigned long flags;
195 unsigned int eax, edx; 151 u32 rev, dummy;
196 unsigned int rev;
197 int cpu_num = raw_smp_processor_id(); 152 int cpu_num = raw_smp_processor_id();
198 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; 153 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
199 struct microcode_amd *mc_amd = uci->mc; 154 struct microcode_amd *mc_amd = uci->mc;
200 unsigned long addr;
201 155
202 /* We should bind the task to the CPU */ 156 /* We should bind the task to the CPU */
203 BUG_ON(cpu_num != cpu); 157 BUG_ON(cpu_num != cpu);
@@ -206,42 +160,34 @@ static void apply_microcode_amd(int cpu)
206 return; 160 return;
207 161
208 spin_lock_irqsave(&microcode_update_lock, flags); 162 spin_lock_irqsave(&microcode_update_lock, flags);
209 163 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
210 addr = (unsigned long)&mc_amd->hdr.data_code;
211 edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
212 eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
213
214 asm volatile("movl %0, %%ecx; wrmsr" :
215 : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
216
217 /* get patch id after patching */ 164 /* get patch id after patching */
218 asm volatile("movl %1, %%ecx; rdmsr" 165 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
219 : "=a" (rev)
220 : "i" (0x0000008B) : "ecx");
221
222 spin_unlock_irqrestore(&microcode_update_lock, flags); 166 spin_unlock_irqrestore(&microcode_update_lock, flags);
223 167
224 /* check current patch id and patch's id for match */ 168 /* check current patch id and patch's id for match */
225 if (rev != mc_amd->hdr.patch_id) { 169 if (rev != mc_amd->hdr.patch_id) {
226 printk(KERN_ERR "microcode: CPU%d update from revision " 170 printk(KERN_ERR "microcode: CPU%d: update failed "
227 "0x%x to 0x%x failed\n", cpu_num, 171 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
228 mc_amd->hdr.patch_id, rev);
229 return; 172 return;
230 } 173 }
231 174
232 printk(KERN_INFO "microcode: CPU%d updated from revision " 175 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
233 "0x%x to 0x%x \n", 176 cpu, rev);
234 cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
235 177
236 uci->cpu_sig.rev = rev; 178 uci->cpu_sig.rev = rev;
237} 179}
238 180
239static void * get_next_ucode(u8 *buf, unsigned int size, 181static int get_ucode_data(void *to, const u8 *from, size_t n)
240 int (*get_ucode_data)(void *, const void *, size_t), 182{
241 unsigned int *mc_size) 183 memcpy(to, from, n);
184 return 0;
185}
186
187static void *get_next_ucode(const u8 *buf, unsigned int size,
188 unsigned int *mc_size)
242{ 189{
243 unsigned int total_size; 190 unsigned int total_size;
244#define UCODE_CONTAINER_SECTION_HDR 8
245 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 191 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
246 void *mc; 192 void *mc;
247 193
@@ -249,39 +195,37 @@ static void * get_next_ucode(u8 *buf, unsigned int size,
249 return NULL; 195 return NULL;
250 196
251 if (section_hdr[0] != UCODE_UCODE_TYPE) { 197 if (section_hdr[0] != UCODE_UCODE_TYPE) {
252 printk(KERN_ERR "microcode: error! " 198 printk(KERN_ERR "microcode: error: invalid type field in "
253 "Wrong microcode payload type field\n"); 199 "container file section header\n");
254 return NULL; 200 return NULL;
255 } 201 }
256 202
257 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 203 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
258 204
259 printk(KERN_INFO "microcode: size %u, total_size %u\n", 205 printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
260 size, total_size); 206 size, total_size);
261 207
262 if (total_size > size || total_size > UCODE_MAX_SIZE) { 208 if (total_size > size || total_size > UCODE_MAX_SIZE) {
263 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); 209 printk(KERN_ERR "microcode: error: size mismatch\n");
264 return NULL; 210 return NULL;
265 } 211 }
266 212
267 mc = vmalloc(UCODE_MAX_SIZE); 213 mc = vmalloc(UCODE_MAX_SIZE);
268 if (mc) { 214 if (mc) {
269 memset(mc, 0, UCODE_MAX_SIZE); 215 memset(mc, 0, UCODE_MAX_SIZE);
270 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { 216 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR,
217 total_size)) {
271 vfree(mc); 218 vfree(mc);
272 mc = NULL; 219 mc = NULL;
273 } else 220 } else
274 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; 221 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
275 } 222 }
276#undef UCODE_CONTAINER_SECTION_HDR
277 return mc; 223 return mc;
278} 224}
279 225
280 226
281static int install_equiv_cpu_table(u8 *buf, 227static int install_equiv_cpu_table(const u8 *buf)
282 int (*get_ucode_data)(void *, const void *, size_t))
283{ 228{
284#define UCODE_CONTAINER_HEADER_SIZE 12
285 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; 229 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
286 unsigned int *buf_pos = (unsigned int *)container_hdr; 230 unsigned int *buf_pos = (unsigned int *)container_hdr;
287 unsigned long size; 231 unsigned long size;
@@ -292,14 +236,15 @@ static int install_equiv_cpu_table(u8 *buf,
292 size = buf_pos[2]; 236 size = buf_pos[2];
293 237
294 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { 238 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
295 printk(KERN_ERR "microcode: error! " 239 printk(KERN_ERR "microcode: error: invalid type field in "
296 "Wrong microcode equivalnet cpu table\n"); 240 "container file section header\n");
297 return 0; 241 return 0;
298 } 242 }
299 243
300 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 244 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
301 if (!equiv_cpu_table) { 245 if (!equiv_cpu_table) {
302 printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n"); 246 printk(KERN_ERR "microcode: failed to allocate "
247 "equivalent CPU table\n");
303 return 0; 248 return 0;
304 } 249 }
305 250
@@ -310,7 +255,6 @@ static int install_equiv_cpu_table(u8 *buf,
310 } 255 }
311 256
312 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 257 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
313#undef UCODE_CONTAINER_HEADER_SIZE
314} 258}
315 259
316static void free_equiv_cpu_table(void) 260static void free_equiv_cpu_table(void)
@@ -321,18 +265,20 @@ static void free_equiv_cpu_table(void)
321 } 265 }
322} 266}
323 267
324static int generic_load_microcode(int cpu, void *data, size_t size, 268static int generic_load_microcode(int cpu, const u8 *data, size_t size)
325 int (*get_ucode_data)(void *, const void *, size_t))
326{ 269{
327 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 270 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
328 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 271 const u8 *ucode_ptr = data;
272 void *new_mc = NULL;
273 void *mc;
329 int new_rev = uci->cpu_sig.rev; 274 int new_rev = uci->cpu_sig.rev;
330 unsigned int leftover; 275 unsigned int leftover;
331 unsigned long offset; 276 unsigned long offset;
332 277
333 offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data); 278 offset = install_equiv_cpu_table(ucode_ptr);
334 if (!offset) { 279 if (!offset) {
335 printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); 280 printk(KERN_ERR "microcode: failed to create "
281 "equivalent cpu table\n");
336 return -EINVAL; 282 return -EINVAL;
337 } 283 }
338 284
@@ -343,7 +289,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
343 unsigned int uninitialized_var(mc_size); 289 unsigned int uninitialized_var(mc_size);
344 struct microcode_header_amd *mc_header; 290 struct microcode_header_amd *mc_header;
345 291
346 mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); 292 mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
347 if (!mc) 293 if (!mc)
348 break; 294 break;
349 295
@@ -353,7 +299,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
353 vfree(new_mc); 299 vfree(new_mc);
354 new_rev = mc_header->patch_id; 300 new_rev = mc_header->patch_id;
355 new_mc = mc; 301 new_mc = mc;
356 } else 302 } else
357 vfree(mc); 303 vfree(mc);
358 304
359 ucode_ptr += mc_size; 305 ucode_ptr += mc_size;
@@ -365,9 +311,9 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
365 if (uci->mc) 311 if (uci->mc)
366 vfree(uci->mc); 312 vfree(uci->mc);
367 uci->mc = new_mc; 313 uci->mc = new_mc;
368 pr_debug("microcode: CPU%d found a matching microcode update with" 314 pr_debug("microcode: CPU%d found a matching microcode "
369 " version 0x%x (current=0x%x)\n", 315 "update with version 0x%x (current=0x%x)\n",
370 cpu, new_rev, uci->cpu_sig.rev); 316 cpu, new_rev, uci->cpu_sig.rev);
371 } else 317 } else
372 vfree(new_mc); 318 vfree(new_mc);
373 } 319 }
@@ -377,12 +323,6 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
377 return (int)leftover; 323 return (int)leftover;
378} 324}
379 325
380static int get_ucode_fw(void *to, const void *from, size_t n)
381{
382 memcpy(to, from, n);
383 return 0;
384}
385
386static int request_microcode_fw(int cpu, struct device *device) 326static int request_microcode_fw(int cpu, struct device *device)
387{ 327{
388 const char *fw_name = "amd-ucode/microcode_amd.bin"; 328 const char *fw_name = "amd-ucode/microcode_amd.bin";
@@ -394,12 +334,11 @@ static int request_microcode_fw(int cpu, struct device *device)
394 334
395 ret = request_firmware(&firmware, fw_name, device); 335 ret = request_firmware(&firmware, fw_name, device);
396 if (ret) { 336 if (ret) {
397 printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name); 337 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
398 return ret; 338 return ret;
399 } 339 }
400 340
401 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, 341 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
402 &get_ucode_fw);
403 342
404 release_firmware(firmware); 343 release_firmware(firmware);
405 344
@@ -408,8 +347,8 @@ static int request_microcode_fw(int cpu, struct device *device)
408 347
409static int request_microcode_user(int cpu, const void __user *buf, size_t size) 348static int request_microcode_user(int cpu, const void __user *buf, size_t size)
410{ 349{
411 printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode" 350 printk(KERN_INFO "microcode: AMD microcode update via "
412 "is not supported\n"); 351 "/dev/cpu/microcode not supported\n");
413 return -1; 352 return -1;
414} 353}
415 354
@@ -433,3 +372,4 @@ struct microcode_ops * __init init_amd_microcode(void)
433{ 372{
434 return &microcode_amd_ops; 373 return &microcode_amd_ops;
435} 374}
375
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 82fb2809ce32..c9b721ba968c 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -99,7 +99,7 @@ MODULE_LICENSE("GPL");
99 99
100#define MICROCODE_VERSION "2.00" 100#define MICROCODE_VERSION "2.00"
101 101
102struct microcode_ops *microcode_ops; 102static struct microcode_ops *microcode_ops;
103 103
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ 104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
105static DEFINE_MUTEX(microcode_mutex); 105static DEFINE_MUTEX(microcode_mutex);
@@ -203,7 +203,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
203#endif 203#endif
204 204
205/* fake device for request_firmware */ 205/* fake device for request_firmware */
206struct platform_device *microcode_pdev; 206static struct platform_device *microcode_pdev;
207 207
208static ssize_t reload_store(struct sys_device *dev, 208static ssize_t reload_store(struct sys_device *dev,
209 struct sysdev_attribute *attr, 209 struct sysdev_attribute *attr,
@@ -272,13 +272,18 @@ static struct attribute_group mc_attr_group = {
272 .name = "microcode", 272 .name = "microcode",
273}; 273};
274 274
275static void microcode_fini_cpu(int cpu) 275static void __microcode_fini_cpu(int cpu)
276{ 276{
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278 278
279 mutex_lock(&microcode_mutex);
280 microcode_ops->microcode_fini_cpu(cpu); 279 microcode_ops->microcode_fini_cpu(cpu);
281 uci->valid = 0; 280 uci->valid = 0;
281}
282
283static void microcode_fini_cpu(int cpu)
284{
285 mutex_lock(&microcode_mutex);
286 __microcode_fini_cpu(cpu);
282 mutex_unlock(&microcode_mutex); 287 mutex_unlock(&microcode_mutex);
283} 288}
284 289
@@ -306,12 +311,16 @@ static int microcode_resume_cpu(int cpu)
306 * to this cpu (a bit of paranoia): 311 * to this cpu (a bit of paranoia):
307 */ 312 */
308 if (microcode_ops->collect_cpu_info(cpu, &nsig)) { 313 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
309 microcode_fini_cpu(cpu); 314 __microcode_fini_cpu(cpu);
315 printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
316 cpu);
310 return -1; 317 return -1;
311 } 318 }
312 319
313 if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { 320 if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) {
314 microcode_fini_cpu(cpu); 321 __microcode_fini_cpu(cpu);
322 printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n",
323 cpu);
315 /* Should we look for a new ucode here? */ 324 /* Should we look for a new ucode here? */
316 return 1; 325 return 1;
317 } 326 }
@@ -319,7 +328,7 @@ static int microcode_resume_cpu(int cpu)
319 return 0; 328 return 0;
320} 329}
321 330
322void microcode_update_cpu(int cpu) 331static void microcode_update_cpu(int cpu)
323{ 332{
324 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 333 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
325 int err = 0; 334 int err = 0;
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 622dc4a21784..b7f4c929e615 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -155,6 +155,7 @@ static DEFINE_SPINLOCK(microcode_update_lock);
155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) 155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
156{ 156{
157 struct cpuinfo_x86 *c = &cpu_data(cpu_num); 157 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
158 unsigned long flags;
158 unsigned int val[2]; 159 unsigned int val[2];
159 160
160 memset(csig, 0, sizeof(*csig)); 161 memset(csig, 0, sizeof(*csig));
@@ -174,11 +175,16 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
174 csig->pf = 1 << ((val[1] >> 18) & 7); 175 csig->pf = 1 << ((val[1] >> 18) & 7);
175 } 176 }
176 177
178 /* serialize access to the physical write to MSR 0x79 */
179 spin_lock_irqsave(&microcode_update_lock, flags);
180
177 wrmsr(MSR_IA32_UCODE_REV, 0, 0); 181 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
178 /* see notes above for revision 1.07. Apparent chip bug */ 182 /* see notes above for revision 1.07. Apparent chip bug */
179 sync_core(); 183 sync_core();
180 /* get the current revision from MSR 0x8B */ 184 /* get the current revision from MSR 0x8B */
181 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 185 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
186 spin_unlock_irqrestore(&microcode_update_lock, flags);
187
182 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", 188 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
183 csig->sig, csig->pf, csig->rev); 189 csig->sig, csig->pf, csig->rev);
184 190
@@ -465,7 +471,7 @@ static void microcode_fini_cpu(int cpu)
465 uci->mc = NULL; 471 uci->mc = NULL;
466} 472}
467 473
468struct microcode_ops microcode_intel_ops = { 474static struct microcode_ops microcode_intel_ops = {
469 .request_microcode_user = request_microcode_user, 475 .request_microcode_user = request_microcode_user,
470 .request_microcode_fw = request_microcode_fw, 476 .request_microcode_fw = request_microcode_fw,
471 .collect_cpu_info = collect_cpu_info, 477 .collect_cpu_info = collect_cpu_info,
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 13316cf57cdb..8bd1bf9622a7 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -208,12 +208,17 @@ static int __init setup_nmi_watchdog(char *str)
208 ++str; 208 ++str;
209 } 209 }
210 210
211 get_option(&str, &nmi); 211 if (!strncmp(str, "lapic", 5))
212 212 nmi_watchdog = NMI_LOCAL_APIC;
213 if (nmi >= NMI_INVALID) 213 else if (!strncmp(str, "ioapic", 6))
214 return 0; 214 nmi_watchdog = NMI_IO_APIC;
215 else {
216 get_option(&str, &nmi);
217 if (nmi >= NMI_INVALID)
218 return 0;
219 nmi_watchdog = nmi;
220 }
215 221
216 nmi_watchdog = nmi;
217 return 1; 222 return 1;
218} 223}
219__setup("nmi_watchdog=", setup_nmi_watchdog); 224__setup("nmi_watchdog=", setup_nmi_watchdog);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 192624820217..19a1044a0cd9 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -6,6 +6,7 @@
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/dma.h> 7#include <asm/dma.h>
8#include <asm/iommu.h> 8#include <asm/iommu.h>
9#include <asm/gart.h>
9#include <asm/calgary.h> 10#include <asm/calgary.h>
10#include <asm/amd_iommu.h> 11#include <asm/amd_iommu.h>
11 12
@@ -30,11 +31,6 @@ int no_iommu __read_mostly;
30/* Set this to 1 if there is a HW IOMMU in the system */ 31/* Set this to 1 if there is a HW IOMMU in the system */
31int iommu_detected __read_mostly = 0; 32int iommu_detected __read_mostly = 0;
32 33
33/* This tells the BIO block layer to assume merging. Default to off
34 because we cannot guarantee merging later. */
35int iommu_bio_merge __read_mostly = 0;
36EXPORT_SYMBOL(iommu_bio_merge);
37
38dma_addr_t bad_dma_address __read_mostly = 0; 34dma_addr_t bad_dma_address __read_mostly = 0;
39EXPORT_SYMBOL(bad_dma_address); 35EXPORT_SYMBOL(bad_dma_address);
40 36
@@ -105,11 +101,15 @@ static void __init dma32_free_bootmem(void)
105 dma32_bootmem_ptr = NULL; 101 dma32_bootmem_ptr = NULL;
106 dma32_bootmem_size = 0; 102 dma32_bootmem_size = 0;
107} 103}
104#endif
108 105
109void __init pci_iommu_alloc(void) 106void __init pci_iommu_alloc(void)
110{ 107{
108#ifdef CONFIG_X86_64
111 /* free the range so iommu could get some range less than 4G */ 109 /* free the range so iommu could get some range less than 4G */
112 dma32_free_bootmem(); 110 dma32_free_bootmem();
111#endif
112
113 /* 113 /*
114 * The order of these functions is important for 114 * The order of these functions is important for
115 * fall-back/fail-over reasons 115 * fall-back/fail-over reasons
@@ -125,15 +125,6 @@ void __init pci_iommu_alloc(void)
125 pci_swiotlb_init(); 125 pci_swiotlb_init();
126} 126}
127 127
128unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
129{
130 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
131
132 return size >> PAGE_SHIFT;
133}
134EXPORT_SYMBOL(iommu_nr_pages);
135#endif
136
137void *dma_generic_alloc_coherent(struct device *dev, size_t size, 128void *dma_generic_alloc_coherent(struct device *dev, size_t size,
138 dma_addr_t *dma_addr, gfp_t flag) 129 dma_addr_t *dma_addr, gfp_t flag)
139{ 130{
@@ -188,7 +179,6 @@ static __init int iommu_setup(char *p)
188 } 179 }
189 180
190 if (!strncmp(p, "biomerge", 8)) { 181 if (!strncmp(p, "biomerge", 8)) {
191 iommu_bio_merge = 4096;
192 iommu_merge = 1; 182 iommu_merge = 1;
193 force_iommu = 1; 183 force_iommu = 1;
194 } 184 }
@@ -300,8 +290,8 @@ fs_initcall(pci_iommu_init);
300static __devinit void via_no_dac(struct pci_dev *dev) 290static __devinit void via_no_dac(struct pci_dev *dev)
301{ 291{
302 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { 292 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
303 printk(KERN_INFO "PCI: VIA PCI bridge detected." 293 printk(KERN_INFO
304 "Disabling DAC.\n"); 294 "PCI: VIA PCI bridge detected. Disabling DAC.\n");
305 forbid_dac = 1; 295 forbid_dac = 1;
306 } 296 }
307} 297}
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index ba7ad83e20a8..a35eaa379ff6 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -745,10 +745,8 @@ void __init gart_iommu_init(void)
745 unsigned long scratch; 745 unsigned long scratch;
746 long i; 746 long i;
747 747
748 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { 748 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
749 printk(KERN_INFO "PCI-GART: No AMD GART found.\n");
750 return; 749 return;
751 }
752 750
753#ifndef CONFIG_AGP_AMD64 751#ifndef CONFIG_AGP_AMD64
754 no_agp = 1; 752 no_agp = 1;
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 3c539d111abb..242c3440687f 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -3,6 +3,8 @@
3#include <linux/pci.h> 3#include <linux/pci.h>
4#include <linux/cache.h> 4#include <linux/cache.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/swiotlb.h>
7#include <linux/bootmem.h>
6#include <linux/dma-mapping.h> 8#include <linux/dma-mapping.h>
7 9
8#include <asm/iommu.h> 10#include <asm/iommu.h>
@@ -11,6 +13,31 @@
11 13
12int swiotlb __read_mostly; 14int swiotlb __read_mostly;
13 15
16void *swiotlb_alloc_boot(size_t size, unsigned long nslabs)
17{
18 return alloc_bootmem_low_pages(size);
19}
20
21void *swiotlb_alloc(unsigned order, unsigned long nslabs)
22{
23 return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
24}
25
26dma_addr_t swiotlb_phys_to_bus(phys_addr_t paddr)
27{
28 return paddr;
29}
30
31phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
32{
33 return baddr;
34}
35
36int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
37{
38 return 0;
39}
40
14static dma_addr_t 41static dma_addr_t
15swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, 42swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
16 int direction) 43 int direction)
@@ -50,8 +77,10 @@ struct dma_mapping_ops swiotlb_dma_ops = {
50void __init pci_swiotlb_init(void) 77void __init pci_swiotlb_init(void)
51{ 78{
52 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 79 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
80#ifdef CONFIG_X86_64
53 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) 81 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
54 swiotlb = 1; 82 swiotlb = 1;
83#endif
55 if (swiotlb_force) 84 if (swiotlb_force)
56 swiotlb = 1; 85 swiotlb = 1;
57 if (swiotlb) { 86 if (swiotlb) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 95d811a9594f..e68bb9e30864 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,6 +1,7 @@
1#include <linux/errno.h> 1#include <linux/errno.h>
2#include <linux/kernel.h> 2#include <linux/kernel.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <asm/idle.h>
4#include <linux/smp.h> 5#include <linux/smp.h>
5#include <linux/slab.h> 6#include <linux/slab.h>
6#include <linux/sched.h> 7#include <linux/sched.h>
@@ -302,7 +303,7 @@ static void c1e_idle(void)
302 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 303 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
303 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 304 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
304 c1e_detected = 1; 305 c1e_detected = 1;
305 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 306 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
306 mark_tsc_unstable("TSC halt in AMD C1E"); 307 mark_tsc_unstable("TSC halt in AMD C1E");
307 printk(KERN_INFO "System has AMD C1E enabled\n"); 308 printk(KERN_INFO "System has AMD C1E enabled\n");
308 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); 309 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 24c2276aa453..3ba155d24884 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -60,6 +60,7 @@
60#include <asm/idle.h> 60#include <asm/idle.h>
61#include <asm/syscalls.h> 61#include <asm/syscalls.h>
62#include <asm/smp.h> 62#include <asm/smp.h>
63#include <asm/ds.h>
63 64
64asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 65asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
65 66
@@ -251,14 +252,8 @@ void exit_thread(void)
251 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 252 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
252 put_cpu(); 253 put_cpu();
253 } 254 }
254#ifdef CONFIG_X86_DS 255
255 /* Free any DS contexts that have not been properly released. */ 256 ds_exit_thread(current);
256 if (unlikely(current->thread.ds_ctx)) {
257 /* we clear debugctl to make sure DS is not used. */
258 update_debugctlmsr(0);
259 ds_free(current->thread.ds_ctx);
260 }
261#endif /* CONFIG_X86_DS */
262} 257}
263 258
264void flush_thread(void) 259void flush_thread(void)
@@ -340,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
340 kfree(p->thread.io_bitmap_ptr); 335 kfree(p->thread.io_bitmap_ptr);
341 p->thread.io_bitmap_max = 0; 336 p->thread.io_bitmap_max = 0;
342 } 337 }
338
339 ds_copy_thread(p, current);
340
341 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
342 p->thread.debugctlmsr = 0;
343
343 return err; 344 return err;
344} 345}
345 346
@@ -420,48 +421,19 @@ int set_tsc_mode(unsigned int val)
420 return 0; 421 return 0;
421} 422}
422 423
423#ifdef CONFIG_X86_DS
424static int update_debugctl(struct thread_struct *prev,
425 struct thread_struct *next, unsigned long debugctl)
426{
427 unsigned long ds_prev = 0;
428 unsigned long ds_next = 0;
429
430 if (prev->ds_ctx)
431 ds_prev = (unsigned long)prev->ds_ctx->ds;
432 if (next->ds_ctx)
433 ds_next = (unsigned long)next->ds_ctx->ds;
434
435 if (ds_next != ds_prev) {
436 /* we clear debugctl to make sure DS
437 * is not in use when we change it */
438 debugctl = 0;
439 update_debugctlmsr(0);
440 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
441 }
442 return debugctl;
443}
444#else
445static int update_debugctl(struct thread_struct *prev,
446 struct thread_struct *next, unsigned long debugctl)
447{
448 return debugctl;
449}
450#endif /* CONFIG_X86_DS */
451
452static noinline void 424static noinline void
453__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 425__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
454 struct tss_struct *tss) 426 struct tss_struct *tss)
455{ 427{
456 struct thread_struct *prev, *next; 428 struct thread_struct *prev, *next;
457 unsigned long debugctl;
458 429
459 prev = &prev_p->thread; 430 prev = &prev_p->thread;
460 next = &next_p->thread; 431 next = &next_p->thread;
461 432
462 debugctl = update_debugctl(prev, next, prev->debugctlmsr); 433 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
463 434 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
464 if (next->debugctlmsr != debugctl) 435 ds_switch_to(prev_p, next_p);
436 else if (next->debugctlmsr != prev->debugctlmsr)
465 update_debugctlmsr(next->debugctlmsr); 437 update_debugctlmsr(next->debugctlmsr);
466 438
467 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 439 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -483,15 +455,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
483 hard_enable_TSC(); 455 hard_enable_TSC();
484 } 456 }
485 457
486#ifdef CONFIG_X86_PTRACE_BTS
487 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
488 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
489
490 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
491 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
492#endif /* CONFIG_X86_PTRACE_BTS */
493
494
495 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 458 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
496 /* 459 /*
497 * Disable the bitmap via an invalid offset. We still cache 460 * Disable the bitmap via an invalid offset. We still cache
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index fbb321d53d34..416fb9282f4f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -53,6 +53,7 @@
53#include <asm/ia32.h> 53#include <asm/ia32.h>
54#include <asm/idle.h> 54#include <asm/idle.h>
55#include <asm/syscalls.h> 55#include <asm/syscalls.h>
56#include <asm/ds.h>
56 57
57asmlinkage extern void ret_from_fork(void); 58asmlinkage extern void ret_from_fork(void);
58 59
@@ -236,14 +237,8 @@ void exit_thread(void)
236 t->io_bitmap_max = 0; 237 t->io_bitmap_max = 0;
237 put_cpu(); 238 put_cpu();
238 } 239 }
239#ifdef CONFIG_X86_DS 240
240 /* Free any DS contexts that have not been properly released. */ 241 ds_exit_thread(current);
241 if (unlikely(t->ds_ctx)) {
242 /* we clear debugctl to make sure DS is not used. */
243 update_debugctlmsr(0);
244 ds_free(t->ds_ctx);
245 }
246#endif /* CONFIG_X86_DS */
247} 242}
248 243
249void flush_thread(void) 244void flush_thread(void)
@@ -373,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
373 if (err) 368 if (err)
374 goto out; 369 goto out;
375 } 370 }
371
372 ds_copy_thread(p, me);
373
374 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
375 p->thread.debugctlmsr = 0;
376
376 err = 0; 377 err = 0;
377out: 378out:
378 if (err && p->thread.io_bitmap_ptr) { 379 if (err && p->thread.io_bitmap_ptr) {
@@ -471,35 +472,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
471 struct tss_struct *tss) 472 struct tss_struct *tss)
472{ 473{
473 struct thread_struct *prev, *next; 474 struct thread_struct *prev, *next;
474 unsigned long debugctl;
475 475
476 prev = &prev_p->thread, 476 prev = &prev_p->thread,
477 next = &next_p->thread; 477 next = &next_p->thread;
478 478
479 debugctl = prev->debugctlmsr; 479 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
480 480 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
481#ifdef CONFIG_X86_DS 481 ds_switch_to(prev_p, next_p);
482 { 482 else if (next->debugctlmsr != prev->debugctlmsr)
483 unsigned long ds_prev = 0, ds_next = 0;
484
485 if (prev->ds_ctx)
486 ds_prev = (unsigned long)prev->ds_ctx->ds;
487 if (next->ds_ctx)
488 ds_next = (unsigned long)next->ds_ctx->ds;
489
490 if (ds_next != ds_prev) {
491 /*
492 * We clear debugctl to make sure DS
493 * is not in use when we change it:
494 */
495 debugctl = 0;
496 update_debugctlmsr(0);
497 wrmsrl(MSR_IA32_DS_AREA, ds_next);
498 }
499 }
500#endif /* CONFIG_X86_DS */
501
502 if (next->debugctlmsr != debugctl)
503 update_debugctlmsr(next->debugctlmsr); 483 update_debugctlmsr(next->debugctlmsr);
504 484
505 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 485 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -534,14 +514,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
534 */ 514 */
535 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 515 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
536 } 516 }
537
538#ifdef CONFIG_X86_PTRACE_BTS
539 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
540 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
541
542 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
543 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
544#endif /* CONFIG_X86_PTRACE_BTS */
545} 517}
546 518
547/* 519/*
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 2c8ec1ba75e6..0a5df5f82fb9 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -581,153 +581,91 @@ static int ioperm_get(struct task_struct *target,
581} 581}
582 582
583#ifdef CONFIG_X86_PTRACE_BTS 583#ifdef CONFIG_X86_PTRACE_BTS
584/*
585 * The configuration for a particular BTS hardware implementation.
586 */
587struct bts_configuration {
588 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
589 unsigned char sizeof_bts;
590 /* the size of a field in the BTS record in bytes */
591 unsigned char sizeof_field;
592 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
593 unsigned long debugctl_mask;
594};
595static struct bts_configuration bts_cfg;
596
597#define BTS_MAX_RECORD_SIZE (8 * 3)
598
599
600/*
601 * Branch Trace Store (BTS) uses the following format. Different
602 * architectures vary in the size of those fields.
603 * - source linear address
604 * - destination linear address
605 * - flags
606 *
607 * Later architectures use 64bit pointers throughout, whereas earlier
608 * architectures use 32bit pointers in 32bit mode.
609 *
610 * We compute the base address for the first 8 fields based on:
611 * - the field size stored in the DS configuration
612 * - the relative field position
613 *
614 * In order to store additional information in the BTS buffer, we use
615 * a special source address to indicate that the record requires
616 * special interpretation.
617 *
618 * Netburst indicated via a bit in the flags field whether the branch
619 * was predicted; this is ignored.
620 */
621
622enum bts_field {
623 bts_from = 0,
624 bts_to,
625 bts_flags,
626
627 bts_escape = (unsigned long)-1,
628 bts_qual = bts_to,
629 bts_jiffies = bts_flags
630};
631
632static inline unsigned long bts_get(const char *base, enum bts_field field)
633{
634 base += (bts_cfg.sizeof_field * field);
635 return *(unsigned long *)base;
636}
637
638static inline void bts_set(char *base, enum bts_field field, unsigned long val)
639{
640 base += (bts_cfg.sizeof_field * field);;
641 (*(unsigned long *)base) = val;
642}
643
644/*
645 * Translate a BTS record from the raw format into the bts_struct format
646 *
647 * out (out): bts_struct interpretation
648 * raw: raw BTS record
649 */
650static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
651{
652 memset(out, 0, sizeof(*out));
653 if (bts_get(raw, bts_from) == bts_escape) {
654 out->qualifier = bts_get(raw, bts_qual);
655 out->variant.jiffies = bts_get(raw, bts_jiffies);
656 } else {
657 out->qualifier = BTS_BRANCH;
658 out->variant.lbr.from_ip = bts_get(raw, bts_from);
659 out->variant.lbr.to_ip = bts_get(raw, bts_to);
660 }
661}
662
663static int ptrace_bts_read_record(struct task_struct *child, size_t index, 584static int ptrace_bts_read_record(struct task_struct *child, size_t index,
664 struct bts_struct __user *out) 585 struct bts_struct __user *out)
665{ 586{
666 struct bts_struct ret; 587 const struct bts_trace *trace;
667 const void *bts_record; 588 struct bts_struct bts;
668 size_t bts_index, bts_end; 589 const unsigned char *at;
669 int error; 590 int error;
670 591
671 error = ds_get_bts_end(child->bts, &bts_end); 592 trace = ds_read_bts(child->bts);
672 if (error < 0) 593 if (!trace)
673 return error; 594 return -EPERM;
674 595
675 if (bts_end <= index) 596 at = trace->ds.top - ((index + 1) * trace->ds.size);
676 return -EINVAL; 597 if ((void *)at < trace->ds.begin)
598 at += (trace->ds.n * trace->ds.size);
677 599
678 error = ds_get_bts_index(child->bts, &bts_index); 600 if (!trace->read)
679 if (error < 0) 601 return -EOPNOTSUPP;
680 return error;
681
682 /* translate the ptrace bts index into the ds bts index */
683 bts_index += bts_end - (index + 1);
684 if (bts_end <= bts_index)
685 bts_index -= bts_end;
686 602
687 error = ds_access_bts(child->bts, bts_index, &bts_record); 603 error = trace->read(child->bts, at, &bts);
688 if (error < 0) 604 if (error < 0)
689 return error; 605 return error;
690 606
691 ptrace_bts_translate_record(&ret, bts_record); 607 if (copy_to_user(out, &bts, sizeof(bts)))
692
693 if (copy_to_user(out, &ret, sizeof(ret)))
694 return -EFAULT; 608 return -EFAULT;
695 609
696 return sizeof(ret); 610 return sizeof(bts);
697} 611}
698 612
699static int ptrace_bts_drain(struct task_struct *child, 613static int ptrace_bts_drain(struct task_struct *child,
700 long size, 614 long size,
701 struct bts_struct __user *out) 615 struct bts_struct __user *out)
702{ 616{
703 struct bts_struct ret; 617 const struct bts_trace *trace;
704 const unsigned char *raw; 618 const unsigned char *at;
705 size_t end, i; 619 int error, drained = 0;
706 int error;
707 620
708 error = ds_get_bts_index(child->bts, &end); 621 trace = ds_read_bts(child->bts);
709 if (error < 0) 622 if (!trace)
710 return error; 623 return -EPERM;
711 624
712 if (size < (end * sizeof(struct bts_struct))) 625 if (!trace->read)
626 return -EOPNOTSUPP;
627
628 if (size < (trace->ds.top - trace->ds.begin))
713 return -EIO; 629 return -EIO;
714 630
715 error = ds_access_bts(child->bts, 0, (const void **)&raw); 631 for (at = trace->ds.begin; (void *)at < trace->ds.top;
716 if (error < 0) 632 out++, drained++, at += trace->ds.size) {
717 return error; 633 struct bts_struct bts;
634 int error;
718 635
719 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { 636 error = trace->read(child->bts, at, &bts);
720 ptrace_bts_translate_record(&ret, raw); 637 if (error < 0)
638 return error;
721 639
722 if (copy_to_user(out, &ret, sizeof(ret))) 640 if (copy_to_user(out, &bts, sizeof(bts)))
723 return -EFAULT; 641 return -EFAULT;
724 } 642 }
725 643
726 error = ds_clear_bts(child->bts); 644 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
645
646 error = ds_reset_bts(child->bts);
727 if (error < 0) 647 if (error < 0)
728 return error; 648 return error;
729 649
730 return end; 650 return drained;
651}
652
653static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
654{
655 child->bts_buffer = alloc_locked_buffer(size);
656 if (!child->bts_buffer)
657 return -ENOMEM;
658
659 child->bts_size = size;
660
661 return 0;
662}
663
664static void ptrace_bts_free_buffer(struct task_struct *child)
665{
666 free_locked_buffer(child->bts_buffer, child->bts_size);
667 child->bts_buffer = NULL;
668 child->bts_size = 0;
731} 669}
732 670
733static int ptrace_bts_config(struct task_struct *child, 671static int ptrace_bts_config(struct task_struct *child,
@@ -735,136 +673,86 @@ static int ptrace_bts_config(struct task_struct *child,
735 const struct ptrace_bts_config __user *ucfg) 673 const struct ptrace_bts_config __user *ucfg)
736{ 674{
737 struct ptrace_bts_config cfg; 675 struct ptrace_bts_config cfg;
738 int error = 0; 676 unsigned int flags = 0;
739
740 error = -EOPNOTSUPP;
741 if (!bts_cfg.sizeof_bts)
742 goto errout;
743 677
744 error = -EIO;
745 if (cfg_size < sizeof(cfg)) 678 if (cfg_size < sizeof(cfg))
746 goto errout; 679 return -EIO;
747 680
748 error = -EFAULT;
749 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 681 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
750 goto errout; 682 return -EFAULT;
751
752 error = -EINVAL;
753 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
754 !(cfg.flags & PTRACE_BTS_O_ALLOC))
755 goto errout;
756
757 if (cfg.flags & PTRACE_BTS_O_ALLOC) {
758 bts_ovfl_callback_t ovfl = NULL;
759 unsigned int sig = 0;
760
761 error = -EINVAL;
762 if (cfg.size < (10 * bts_cfg.sizeof_bts))
763 goto errout;
764 683
765 if (cfg.flags & PTRACE_BTS_O_SIGNAL) { 684 if (child->bts) {
766 if (!cfg.signal) 685 ds_release_bts(child->bts);
767 goto errout; 686 child->bts = NULL;
687 }
768 688
769 error = -EOPNOTSUPP; 689 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
770 goto errout; 690 if (!cfg.signal)
691 return -EINVAL;
771 692
772 sig = cfg.signal; 693 return -EOPNOTSUPP;
773 }
774 694
775 if (child->bts) { 695 child->thread.bts_ovfl_signal = cfg.signal;
776 (void)ds_release_bts(child->bts); 696 }
777 kfree(child->bts_buffer);
778 697
779 child->bts = NULL; 698 if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
780 child->bts_buffer = NULL; 699 (cfg.size != child->bts_size)) {
781 } 700 int error;
782 701
783 error = -ENOMEM; 702 ptrace_bts_free_buffer(child);
784 child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL);
785 if (!child->bts_buffer)
786 goto errout;
787
788 child->bts = ds_request_bts(child, child->bts_buffer, cfg.size,
789 ovfl, /* th = */ (size_t)-1);
790 if (IS_ERR(child->bts)) {
791 error = PTR_ERR(child->bts);
792 kfree(child->bts_buffer);
793 child->bts = NULL;
794 child->bts_buffer = NULL;
795 goto errout;
796 }
797 703
798 child->thread.bts_ovfl_signal = sig; 704 error = ptrace_bts_allocate_buffer(child, cfg.size);
705 if (error < 0)
706 return error;
799 } 707 }
800 708
801 error = -EINVAL;
802 if (!child->thread.ds_ctx && cfg.flags)
803 goto errout;
804
805 if (cfg.flags & PTRACE_BTS_O_TRACE) 709 if (cfg.flags & PTRACE_BTS_O_TRACE)
806 child->thread.debugctlmsr |= bts_cfg.debugctl_mask; 710 flags |= BTS_USER;
807 else
808 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
809 711
810 if (cfg.flags & PTRACE_BTS_O_SCHED) 712 if (cfg.flags & PTRACE_BTS_O_SCHED)
811 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 713 flags |= BTS_TIMESTAMPS;
812 else
813 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
814 714
815 error = sizeof(cfg); 715 child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
716 /* ovfl = */ NULL, /* th = */ (size_t)-1,
717 flags);
718 if (IS_ERR(child->bts)) {
719 int error = PTR_ERR(child->bts);
816 720
817out: 721 ptrace_bts_free_buffer(child);
818 if (child->thread.debugctlmsr) 722 child->bts = NULL;
819 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
820 else
821 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
822 723
823 return error; 724 return error;
725 }
824 726
825errout: 727 return sizeof(cfg);
826 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
827 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
828 goto out;
829} 728}
830 729
831static int ptrace_bts_status(struct task_struct *child, 730static int ptrace_bts_status(struct task_struct *child,
832 long cfg_size, 731 long cfg_size,
833 struct ptrace_bts_config __user *ucfg) 732 struct ptrace_bts_config __user *ucfg)
834{ 733{
734 const struct bts_trace *trace;
835 struct ptrace_bts_config cfg; 735 struct ptrace_bts_config cfg;
836 size_t end;
837 const void *base, *max;
838 int error;
839 736
840 if (cfg_size < sizeof(cfg)) 737 if (cfg_size < sizeof(cfg))
841 return -EIO; 738 return -EIO;
842 739
843 error = ds_get_bts_end(child->bts, &end); 740 trace = ds_read_bts(child->bts);
844 if (error < 0) 741 if (!trace)
845 return error; 742 return -EPERM;
846
847 error = ds_access_bts(child->bts, /* index = */ 0, &base);
848 if (error < 0)
849 return error;
850
851 error = ds_access_bts(child->bts, /* index = */ end, &max);
852 if (error < 0)
853 return error;
854 743
855 memset(&cfg, 0, sizeof(cfg)); 744 memset(&cfg, 0, sizeof(cfg));
856 cfg.size = (max - base); 745 cfg.size = trace->ds.end - trace->ds.begin;
857 cfg.signal = child->thread.bts_ovfl_signal; 746 cfg.signal = child->thread.bts_ovfl_signal;
858 cfg.bts_size = sizeof(struct bts_struct); 747 cfg.bts_size = sizeof(struct bts_struct);
859 748
860 if (cfg.signal) 749 if (cfg.signal)
861 cfg.flags |= PTRACE_BTS_O_SIGNAL; 750 cfg.flags |= PTRACE_BTS_O_SIGNAL;
862 751
863 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 752 if (trace->ds.flags & BTS_USER)
864 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
865 cfg.flags |= PTRACE_BTS_O_TRACE; 753 cfg.flags |= PTRACE_BTS_O_TRACE;
866 754
867 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 755 if (trace->ds.flags & BTS_TIMESTAMPS)
868 cfg.flags |= PTRACE_BTS_O_SCHED; 756 cfg.flags |= PTRACE_BTS_O_SCHED;
869 757
870 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 758 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
@@ -873,106 +761,77 @@ static int ptrace_bts_status(struct task_struct *child,
873 return sizeof(cfg); 761 return sizeof(cfg);
874} 762}
875 763
876static int ptrace_bts_write_record(struct task_struct *child, 764static int ptrace_bts_clear(struct task_struct *child)
877 const struct bts_struct *in)
878{ 765{
879 unsigned char bts_record[BTS_MAX_RECORD_SIZE]; 766 const struct bts_trace *trace;
880 767
881 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); 768 trace = ds_read_bts(child->bts);
769 if (!trace)
770 return -EPERM;
882 771
883 memset(bts_record, 0, bts_cfg.sizeof_bts); 772 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
884 switch (in->qualifier) {
885 case BTS_INVALID:
886 break;
887 773
888 case BTS_BRANCH: 774 return ds_reset_bts(child->bts);
889 bts_set(bts_record, bts_from, in->variant.lbr.from_ip); 775}
890 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
891 break;
892 776
893 case BTS_TASK_ARRIVES: 777static int ptrace_bts_size(struct task_struct *child)
894 case BTS_TASK_DEPARTS: 778{
895 bts_set(bts_record, bts_from, bts_escape); 779 const struct bts_trace *trace;
896 bts_set(bts_record, bts_qual, in->qualifier);
897 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
898 break;
899 780
900 default: 781 trace = ds_read_bts(child->bts);
901 return -EINVAL; 782 if (!trace)
902 } 783 return -EPERM;
903 784
904 return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts); 785 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
905} 786}
906 787
907void ptrace_bts_take_timestamp(struct task_struct *tsk, 788static void ptrace_bts_fork(struct task_struct *tsk)
908 enum bts_qualifier qualifier)
909{ 789{
910 struct bts_struct rec = { 790 tsk->bts = NULL;
911 .qualifier = qualifier, 791 tsk->bts_buffer = NULL;
912 .variant.jiffies = jiffies_64 792 tsk->bts_size = 0;
913 }; 793 tsk->thread.bts_ovfl_signal = 0;
914
915 ptrace_bts_write_record(tsk, &rec);
916} 794}
917 795
918static const struct bts_configuration bts_cfg_netburst = { 796static void ptrace_bts_untrace(struct task_struct *child)
919 .sizeof_bts = sizeof(long) * 3, 797{
920 .sizeof_field = sizeof(long), 798 if (unlikely(child->bts)) {
921 .debugctl_mask = (1<<2)|(1<<3)|(1<<5) 799 ds_release_bts(child->bts);
922}; 800 child->bts = NULL;
923 801
924static const struct bts_configuration bts_cfg_pentium_m = { 802 /* We cannot update total_vm and locked_vm since
925 .sizeof_bts = sizeof(long) * 3, 803 child's mm is already gone. But we can reclaim the
926 .sizeof_field = sizeof(long), 804 memory. */
927 .debugctl_mask = (1<<6)|(1<<7) 805 kfree(child->bts_buffer);
928}; 806 child->bts_buffer = NULL;
807 child->bts_size = 0;
808 }
809}
929 810
930static const struct bts_configuration bts_cfg_core2 = { 811static void ptrace_bts_detach(struct task_struct *child)
931 .sizeof_bts = 8 * 3, 812{
932 .sizeof_field = 8, 813 if (unlikely(child->bts)) {
933 .debugctl_mask = (1<<6)|(1<<7)|(1<<9) 814 ds_release_bts(child->bts);
934}; 815 child->bts = NULL;
935 816
936static inline void bts_configure(const struct bts_configuration *cfg) 817 ptrace_bts_free_buffer(child);
818 }
819}
820#else
821static inline void ptrace_bts_fork(struct task_struct *tsk) {}
822static inline void ptrace_bts_detach(struct task_struct *child) {}
823static inline void ptrace_bts_untrace(struct task_struct *child) {}
824#endif /* CONFIG_X86_PTRACE_BTS */
825
826void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
937{ 827{
938 bts_cfg = *cfg; 828 ptrace_bts_fork(child);
939} 829}
940 830
941void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) 831void x86_ptrace_untrace(struct task_struct *child)
942{ 832{
943 switch (c->x86) { 833 ptrace_bts_untrace(child);
944 case 0x6:
945 switch (c->x86_model) {
946 case 0 ... 0xC:
947 /* sorry, don't know about them */
948 break;
949 case 0xD:
950 case 0xE: /* Pentium M */
951 bts_configure(&bts_cfg_pentium_m);
952 break;
953 default: /* Core2, Atom, ... */
954 bts_configure(&bts_cfg_core2);
955 break;
956 }
957 break;
958 case 0xF:
959 switch (c->x86_model) {
960 case 0x0:
961 case 0x1:
962 case 0x2: /* Netburst */
963 bts_configure(&bts_cfg_netburst);
964 break;
965 default:
966 /* sorry, don't know about them */
967 break;
968 }
969 break;
970 default:
971 /* sorry, don't know about them */
972 break;
973 }
974} 834}
975#endif /* CONFIG_X86_PTRACE_BTS */
976 835
977/* 836/*
978 * Called by kernel/ptrace.c when detaching.. 837 * Called by kernel/ptrace.c when detaching..
@@ -985,19 +844,7 @@ void ptrace_disable(struct task_struct *child)
985#ifdef TIF_SYSCALL_EMU 844#ifdef TIF_SYSCALL_EMU
986 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 845 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
987#endif 846#endif
988#ifdef CONFIG_X86_PTRACE_BTS 847 ptrace_bts_detach(child);
989 if (child->bts) {
990 (void)ds_release_bts(child->bts);
991 kfree(child->bts_buffer);
992 child->bts_buffer = NULL;
993
994 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
995 if (!child->thread.debugctlmsr)
996 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
997
998 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
999 }
1000#endif /* CONFIG_X86_PTRACE_BTS */
1001} 848}
1002 849
1003#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 850#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1128,16 +975,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1128 (child, data, (struct ptrace_bts_config __user *)addr); 975 (child, data, (struct ptrace_bts_config __user *)addr);
1129 break; 976 break;
1130 977
1131 case PTRACE_BTS_SIZE: { 978 case PTRACE_BTS_SIZE:
1132 size_t size; 979 ret = ptrace_bts_size(child);
1133
1134 ret = ds_get_bts_index(child->bts, &size);
1135 if (ret == 0) {
1136 BUG_ON(size != (int) size);
1137 ret = (int) size;
1138 }
1139 break; 980 break;
1140 }
1141 981
1142 case PTRACE_BTS_GET: 982 case PTRACE_BTS_GET:
1143 ret = ptrace_bts_read_record 983 ret = ptrace_bts_read_record
@@ -1145,7 +985,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1145 break; 985 break;
1146 986
1147 case PTRACE_BTS_CLEAR: 987 case PTRACE_BTS_CLEAR:
1148 ret = ds_clear_bts(child->bts); 988 ret = ptrace_bts_clear(child);
1149 break; 989 break;
1150 990
1151 case PTRACE_BTS_DRAIN: 991 case PTRACE_BTS_DRAIN:
@@ -1408,6 +1248,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1408 1248
1409 case PTRACE_GET_THREAD_AREA: 1249 case PTRACE_GET_THREAD_AREA:
1410 case PTRACE_SET_THREAD_AREA: 1250 case PTRACE_SET_THREAD_AREA:
1251#ifdef CONFIG_X86_PTRACE_BTS
1252 case PTRACE_BTS_CONFIG:
1253 case PTRACE_BTS_STATUS:
1254 case PTRACE_BTS_SIZE:
1255 case PTRACE_BTS_GET:
1256 case PTRACE_BTS_CLEAR:
1257 case PTRACE_BTS_DRAIN:
1258#endif /* CONFIG_X86_PTRACE_BTS */
1411 return arch_ptrace(child, request, addr, data); 1259 return arch_ptrace(child, request, addr, data);
1412 1260
1413 default: 1261 default:
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 67465ed89310..309949e9e1c1 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -168,6 +168,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
168 ich_force_enable_hpet); 168 ich_force_enable_hpet);
169DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, 169DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
170 ich_force_enable_hpet); 170 ich_force_enable_hpet);
171DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
172 ich_force_enable_hpet);
171DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, 173DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
172 ich_force_enable_hpet); 174 ich_force_enable_hpet);
173 175
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 32fe42f26e79..ae0d8042cf69 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -93,11 +93,13 @@
93#include <asm/desc.h> 93#include <asm/desc.h>
94#include <asm/dma.h> 94#include <asm/dma.h>
95#include <asm/iommu.h> 95#include <asm/iommu.h>
96#include <asm/gart.h>
96#include <asm/mmu_context.h> 97#include <asm/mmu_context.h>
97#include <asm/proto.h> 98#include <asm/proto.h>
98 99
99#include <mach_apic.h> 100#include <mach_apic.h>
100#include <asm/paravirt.h> 101#include <asm/paravirt.h>
102#include <asm/hypervisor.h>
101 103
102#include <asm/percpu.h> 104#include <asm/percpu.h>
103#include <asm/topology.h> 105#include <asm/topology.h>
@@ -448,6 +450,7 @@ static void __init reserve_early_setup_data(void)
448 * @size: Size of the crashkernel memory to reserve. 450 * @size: Size of the crashkernel memory to reserve.
449 * Returns the base address on success, and -1ULL on failure. 451 * Returns the base address on success, and -1ULL on failure.
450 */ 452 */
453static
451unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) 454unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
452{ 455{
453 const unsigned long long alignment = 16<<20; /* 16M */ 456 const unsigned long long alignment = 16<<20; /* 16M */
@@ -600,157 +603,7 @@ static struct x86_quirks default_x86_quirks __initdata = {
600 603
601struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; 604struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
602 605
603/* 606#ifdef CONFIG_X86_RESERVE_LOW_64K
604 * Some BIOSes seem to corrupt the low 64k of memory during events
605 * like suspend/resume and unplugging an HDMI cable. Reserve all
606 * remaining free memory in that area and fill it with a distinct
607 * pattern.
608 */
609#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
610#define MAX_SCAN_AREAS 8
611
612static int __read_mostly memory_corruption_check = -1;
613
614static unsigned __read_mostly corruption_check_size = 64*1024;
615static unsigned __read_mostly corruption_check_period = 60; /* seconds */
616
617static struct e820entry scan_areas[MAX_SCAN_AREAS];
618static int num_scan_areas;
619
620
621static int set_corruption_check(char *arg)
622{
623 char *end;
624
625 memory_corruption_check = simple_strtol(arg, &end, 10);
626
627 return (*end == 0) ? 0 : -EINVAL;
628}
629early_param("memory_corruption_check", set_corruption_check);
630
631static int set_corruption_check_period(char *arg)
632{
633 char *end;
634
635 corruption_check_period = simple_strtoul(arg, &end, 10);
636
637 return (*end == 0) ? 0 : -EINVAL;
638}
639early_param("memory_corruption_check_period", set_corruption_check_period);
640
641static int set_corruption_check_size(char *arg)
642{
643 char *end;
644 unsigned size;
645
646 size = memparse(arg, &end);
647
648 if (*end == '\0')
649 corruption_check_size = size;
650
651 return (size == corruption_check_size) ? 0 : -EINVAL;
652}
653early_param("memory_corruption_check_size", set_corruption_check_size);
654
655
656static void __init setup_bios_corruption_check(void)
657{
658 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
659
660 if (memory_corruption_check == -1) {
661 memory_corruption_check =
662#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
663 1
664#else
665 0
666#endif
667 ;
668 }
669
670 if (corruption_check_size == 0)
671 memory_corruption_check = 0;
672
673 if (!memory_corruption_check)
674 return;
675
676 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
677
678 while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
679 u64 size;
680 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
681
682 if (addr == 0)
683 break;
684
685 if ((addr + size) > corruption_check_size)
686 size = corruption_check_size - addr;
687
688 if (size == 0)
689 break;
690
691 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
692 scan_areas[num_scan_areas].addr = addr;
693 scan_areas[num_scan_areas].size = size;
694 num_scan_areas++;
695
696 /* Assume we've already mapped this early memory */
697 memset(__va(addr), 0, size);
698
699 addr += size;
700 }
701
702 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
703 num_scan_areas);
704 update_e820();
705}
706
707static struct timer_list periodic_check_timer;
708
709void check_for_bios_corruption(void)
710{
711 int i;
712 int corruption = 0;
713
714 if (!memory_corruption_check)
715 return;
716
717 for(i = 0; i < num_scan_areas; i++) {
718 unsigned long *addr = __va(scan_areas[i].addr);
719 unsigned long size = scan_areas[i].size;
720
721 for(; size; addr++, size -= sizeof(unsigned long)) {
722 if (!*addr)
723 continue;
724 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
725 addr, __pa(addr), *addr);
726 corruption = 1;
727 *addr = 0;
728 }
729 }
730
731 WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
732}
733
734static void periodic_check_for_corruption(unsigned long data)
735{
736 check_for_bios_corruption();
737 mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
738}
739
740void start_periodic_check_for_corruption(void)
741{
742 if (!memory_corruption_check || corruption_check_period == 0)
743 return;
744
745 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
746 corruption_check_period);
747
748 init_timer(&periodic_check_timer);
749 periodic_check_timer.function = &periodic_check_for_corruption;
750 periodic_check_for_corruption(0);
751}
752#endif
753
754static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) 607static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
755{ 608{
756 printk(KERN_NOTICE 609 printk(KERN_NOTICE
@@ -762,6 +615,7 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
762 615
763 return 0; 616 return 0;
764} 617}
618#endif
765 619
766/* List of systems that have known low memory corruption BIOS problems */ 620/* List of systems that have known low memory corruption BIOS problems */
767static struct dmi_system_id __initdata bad_bios_dmi_table[] = { 621static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
@@ -920,6 +774,12 @@ void __init setup_arch(char **cmdline_p)
920 774
921 dmi_check_system(bad_bios_dmi_table); 775 dmi_check_system(bad_bios_dmi_table);
922 776
777 /*
778 * VMware detection requires dmi to be available, so this
779 * needs to be done after dmi_scan_machine, for the BP.
780 */
781 init_hypervisor(&boot_cpu_data);
782
923#ifdef CONFIG_X86_32 783#ifdef CONFIG_X86_32
924 probe_roms(); 784 probe_roms();
925#endif 785#endif
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 0b63b08e7530..49f3f709ee1f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -339,25 +339,25 @@ static const cpumask_t cpu_mask_none;
339/* 339/*
340 * Returns a pointer to the bitmask of CPUs on Node 'node'. 340 * Returns a pointer to the bitmask of CPUs on Node 'node'.
341 */ 341 */
342const cpumask_t *_node_to_cpumask_ptr(int node) 342const cpumask_t *cpumask_of_node(int node)
343{ 343{
344 if (node_to_cpumask_map == NULL) { 344 if (node_to_cpumask_map == NULL) {
345 printk(KERN_WARNING 345 printk(KERN_WARNING
346 "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", 346 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
347 node); 347 node);
348 dump_stack(); 348 dump_stack();
349 return (const cpumask_t *)&cpu_online_map; 349 return (const cpumask_t *)&cpu_online_map;
350 } 350 }
351 if (node >= nr_node_ids) { 351 if (node >= nr_node_ids) {
352 printk(KERN_WARNING 352 printk(KERN_WARNING
353 "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n", 353 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
354 node, nr_node_ids); 354 node, nr_node_ids);
355 dump_stack(); 355 dump_stack();
356 return &cpu_mask_none; 356 return &cpu_mask_none;
357 } 357 }
358 return &node_to_cpumask_map[node]; 358 return &node_to_cpumask_map[node];
359} 359}
360EXPORT_SYMBOL(_node_to_cpumask_ptr); 360EXPORT_SYMBOL(cpumask_of_node);
361 361
362/* 362/*
363 * Returns a bitmask of CPUs on Node 'node'. 363 * Returns a bitmask of CPUs on Node 'node'.
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
deleted file mode 100644
index cc673aa55ce4..000000000000
--- a/arch/x86/kernel/sigframe.h
+++ /dev/null
@@ -1,42 +0,0 @@
1#ifdef CONFIG_X86_32
2struct sigframe {
3 char __user *pretcode;
4 int sig;
5 struct sigcontext sc;
6 /*
7 * fpstate is unused. fpstate is moved/allocated after
8 * retcode[] below. This movement allows to have the FP state and the
9 * future state extensions (xsave) stay together.
10 * And at the same time retaining the unused fpstate, prevents changing
11 * the offset of extramask[] in the sigframe and thus prevent any
12 * legacy application accessing/modifying it.
13 */
14 struct _fpstate fpstate_unused;
15 unsigned long extramask[_NSIG_WORDS-1];
16 char retcode[8];
17 /* fp state follows here */
18};
19
20struct rt_sigframe {
21 char __user *pretcode;
22 int sig;
23 struct siginfo __user *pinfo;
24 void __user *puc;
25 struct siginfo info;
26 struct ucontext uc;
27 char retcode[8];
28 /* fp state follows here */
29};
30#else
31struct rt_sigframe {
32 char __user *pretcode;
33 struct ucontext uc;
34 struct siginfo info;
35 /* fp state follows here */
36};
37
38int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
39 sigset_t *set, struct pt_regs *regs);
40int ia32_setup_frame(int sig, struct k_sigaction *ka,
41 sigset_t *set, struct pt_regs *regs);
42#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal.c
index d6dd057d0f22..89bb7668041d 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal.c
@@ -1,36 +1,41 @@
1/* 1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
3 * 4 *
4 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson 5 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
5 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen
6 */ 8 */
7#include <linux/list.h>
8 9
9#include <linux/personality.h> 10#include <linux/sched.h>
10#include <linux/binfmts.h> 11#include <linux/mm.h>
11#include <linux/suspend.h> 12#include <linux/smp.h>
12#include <linux/kernel.h> 13#include <linux/kernel.h>
13#include <linux/ptrace.h>
14#include <linux/signal.h> 14#include <linux/signal.h>
15#include <linux/stddef.h>
16#include <linux/unistd.h>
17#include <linux/errno.h> 15#include <linux/errno.h>
18#include <linux/sched.h>
19#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/ptrace.h>
20#include <linux/tracehook.h> 18#include <linux/tracehook.h>
21#include <linux/elf.h> 19#include <linux/unistd.h>
22#include <linux/smp.h> 20#include <linux/stddef.h>
23#include <linux/mm.h> 21#include <linux/personality.h>
22#include <linux/uaccess.h>
24 23
25#include <asm/processor.h> 24#include <asm/processor.h>
26#include <asm/ucontext.h> 25#include <asm/ucontext.h>
27#include <asm/uaccess.h>
28#include <asm/i387.h> 26#include <asm/i387.h>
29#include <asm/vdso.h> 27#include <asm/vdso.h>
28
29#ifdef CONFIG_X86_64
30#include <asm/proto.h>
31#include <asm/ia32_unistd.h>
32#include <asm/mce.h>
33#endif /* CONFIG_X86_64 */
34
30#include <asm/syscall.h> 35#include <asm/syscall.h>
31#include <asm/syscalls.h> 36#include <asm/syscalls.h>
32 37
33#include "sigframe.h" 38#include <asm/sigframe.h>
34 39
35#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 40#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
36 41
@@ -45,74 +50,6 @@
45# define FIX_EFLAGS __FIX_EFLAGS 50# define FIX_EFLAGS __FIX_EFLAGS
46#endif 51#endif
47 52
48/*
49 * Atomically swap in the new signal mask, and wait for a signal.
50 */
51asmlinkage int
52sys_sigsuspend(int history0, int history1, old_sigset_t mask)
53{
54 mask &= _BLOCKABLE;
55 spin_lock_irq(&current->sighand->siglock);
56 current->saved_sigmask = current->blocked;
57 siginitset(&current->blocked, mask);
58 recalc_sigpending();
59 spin_unlock_irq(&current->sighand->siglock);
60
61 current->state = TASK_INTERRUPTIBLE;
62 schedule();
63 set_restore_sigmask();
64
65 return -ERESTARTNOHAND;
66}
67
68asmlinkage int
69sys_sigaction(int sig, const struct old_sigaction __user *act,
70 struct old_sigaction __user *oact)
71{
72 struct k_sigaction new_ka, old_ka;
73 int ret;
74
75 if (act) {
76 old_sigset_t mask;
77
78 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
79 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
80 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
81 return -EFAULT;
82
83 __get_user(new_ka.sa.sa_flags, &act->sa_flags);
84 __get_user(mask, &act->sa_mask);
85 siginitset(&new_ka.sa.sa_mask, mask);
86 }
87
88 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
89
90 if (!ret && oact) {
91 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
92 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
93 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
94 return -EFAULT;
95
96 __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
97 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
98 }
99
100 return ret;
101}
102
103asmlinkage int sys_sigaltstack(unsigned long bx)
104{
105 /*
106 * This is needed to make gcc realize it doesn't own the
107 * "struct pt_regs"
108 */
109 struct pt_regs *regs = (struct pt_regs *)&bx;
110 const stack_t __user *uss = (const stack_t __user *)bx;
111 stack_t __user *uoss = (stack_t __user *)regs->cx;
112
113 return do_sigaltstack(uss, uoss, regs->sp);
114}
115
116#define COPY(x) { \ 53#define COPY(x) { \
117 err |= __get_user(regs->x, &sc->x); \ 54 err |= __get_user(regs->x, &sc->x); \
118} 55}
@@ -123,7 +60,7 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
123 regs->seg = tmp; \ 60 regs->seg = tmp; \
124} 61}
125 62
126#define COPY_SEG_STRICT(seg) { \ 63#define COPY_SEG_CPL3(seg) { \
127 unsigned short tmp; \ 64 unsigned short tmp; \
128 err |= __get_user(tmp, &sc->seg); \ 65 err |= __get_user(tmp, &sc->seg); \
129 regs->seg = tmp | 3; \ 66 regs->seg = tmp | 3; \
@@ -135,9 +72,6 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
135 loadsegment(seg, tmp); \ 72 loadsegment(seg, tmp); \
136} 73}
137 74
138/*
139 * Do a signal return; undo the signal stack.
140 */
141static int 75static int
142restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 76restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
143 unsigned long *pax) 77 unsigned long *pax)
@@ -149,14 +83,36 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
149 /* Always make any pending restarted system calls return -EINTR */ 83 /* Always make any pending restarted system calls return -EINTR */
150 current_thread_info()->restart_block.fn = do_no_restart_syscall; 84 current_thread_info()->restart_block.fn = do_no_restart_syscall;
151 85
86#ifdef CONFIG_X86_32
152 GET_SEG(gs); 87 GET_SEG(gs);
153 COPY_SEG(fs); 88 COPY_SEG(fs);
154 COPY_SEG(es); 89 COPY_SEG(es);
155 COPY_SEG(ds); 90 COPY_SEG(ds);
91#endif /* CONFIG_X86_32 */
92
156 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 93 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
157 COPY(dx); COPY(cx); COPY(ip); 94 COPY(dx); COPY(cx); COPY(ip);
158 COPY_SEG_STRICT(cs); 95
159 COPY_SEG_STRICT(ss); 96#ifdef CONFIG_X86_64
97 COPY(r8);
98 COPY(r9);
99 COPY(r10);
100 COPY(r11);
101 COPY(r12);
102 COPY(r13);
103 COPY(r14);
104 COPY(r15);
105#endif /* CONFIG_X86_64 */
106
107#ifdef CONFIG_X86_32
108 COPY_SEG_CPL3(cs);
109 COPY_SEG_CPL3(ss);
110#else /* !CONFIG_X86_32 */
111 /* Kernel saves and restores only the CS segment register on signals,
112 * which is the bare minimum needed to allow mixed 32/64-bit code.
113 * App's signal handler can save/restore other segments if needed. */
114 COPY_SEG_CPL3(cs);
115#endif /* CONFIG_X86_32 */
160 116
161 err |= __get_user(tmpflags, &sc->flags); 117 err |= __get_user(tmpflags, &sc->flags);
162 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 118 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -169,102 +125,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
169 return err; 125 return err;
170} 126}
171 127
172asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
173{
174 struct sigframe __user *frame;
175 struct pt_regs *regs;
176 unsigned long ax;
177 sigset_t set;
178
179 regs = (struct pt_regs *) &__unused;
180 frame = (struct sigframe __user *)(regs->sp - 8);
181
182 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
183 goto badframe;
184 if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
185 && __copy_from_user(&set.sig[1], &frame->extramask,
186 sizeof(frame->extramask))))
187 goto badframe;
188
189 sigdelsetmask(&set, ~_BLOCKABLE);
190 spin_lock_irq(&current->sighand->siglock);
191 current->blocked = set;
192 recalc_sigpending();
193 spin_unlock_irq(&current->sighand->siglock);
194
195 if (restore_sigcontext(regs, &frame->sc, &ax))
196 goto badframe;
197 return ax;
198
199badframe:
200 if (show_unhandled_signals && printk_ratelimit()) {
201 printk("%s%s[%d] bad frame in sigreturn frame:"
202 "%p ip:%lx sp:%lx oeax:%lx",
203 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
204 current->comm, task_pid_nr(current), frame, regs->ip,
205 regs->sp, regs->orig_ax);
206 print_vma_addr(" in ", regs->ip);
207 printk(KERN_CONT "\n");
208 }
209
210 force_sig(SIGSEGV, current);
211
212 return 0;
213}
214
215static long do_rt_sigreturn(struct pt_regs *regs)
216{
217 struct rt_sigframe __user *frame;
218 unsigned long ax;
219 sigset_t set;
220
221 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
222 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
223 goto badframe;
224 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
225 goto badframe;
226
227 sigdelsetmask(&set, ~_BLOCKABLE);
228 spin_lock_irq(&current->sighand->siglock);
229 current->blocked = set;
230 recalc_sigpending();
231 spin_unlock_irq(&current->sighand->siglock);
232
233 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
234 goto badframe;
235
236 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
237 goto badframe;
238
239 return ax;
240
241badframe:
242 signal_fault(regs, frame, "rt_sigreturn");
243 return 0;
244}
245
246asmlinkage int sys_rt_sigreturn(unsigned long __unused)
247{
248 struct pt_regs *regs = (struct pt_regs *)&__unused;
249
250 return do_rt_sigreturn(regs);
251}
252
253/*
254 * Set up a signal frame.
255 */
256static int 128static int
257setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, 129setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
258 struct pt_regs *regs, unsigned long mask) 130 struct pt_regs *regs, unsigned long mask)
259{ 131{
260 int tmp, err = 0; 132 int err = 0;
261 133
262 err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); 134#ifdef CONFIG_X86_32
263 savesegment(gs, tmp); 135 {
264 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 136 unsigned int tmp;
265 137
138 savesegment(gs, tmp);
139 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
140 }
141 err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs);
266 err |= __put_user(regs->es, (unsigned int __user *)&sc->es); 142 err |= __put_user(regs->es, (unsigned int __user *)&sc->es);
267 err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); 143 err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds);
144#endif /* CONFIG_X86_32 */
145
268 err |= __put_user(regs->di, &sc->di); 146 err |= __put_user(regs->di, &sc->di);
269 err |= __put_user(regs->si, &sc->si); 147 err |= __put_user(regs->si, &sc->si);
270 err |= __put_user(regs->bp, &sc->bp); 148 err |= __put_user(regs->bp, &sc->bp);
@@ -273,19 +151,33 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
273 err |= __put_user(regs->dx, &sc->dx); 151 err |= __put_user(regs->dx, &sc->dx);
274 err |= __put_user(regs->cx, &sc->cx); 152 err |= __put_user(regs->cx, &sc->cx);
275 err |= __put_user(regs->ax, &sc->ax); 153 err |= __put_user(regs->ax, &sc->ax);
154#ifdef CONFIG_X86_64
155 err |= __put_user(regs->r8, &sc->r8);
156 err |= __put_user(regs->r9, &sc->r9);
157 err |= __put_user(regs->r10, &sc->r10);
158 err |= __put_user(regs->r11, &sc->r11);
159 err |= __put_user(regs->r12, &sc->r12);
160 err |= __put_user(regs->r13, &sc->r13);
161 err |= __put_user(regs->r14, &sc->r14);
162 err |= __put_user(regs->r15, &sc->r15);
163#endif /* CONFIG_X86_64 */
164
276 err |= __put_user(current->thread.trap_no, &sc->trapno); 165 err |= __put_user(current->thread.trap_no, &sc->trapno);
277 err |= __put_user(current->thread.error_code, &sc->err); 166 err |= __put_user(current->thread.error_code, &sc->err);
278 err |= __put_user(regs->ip, &sc->ip); 167 err |= __put_user(regs->ip, &sc->ip);
168#ifdef CONFIG_X86_32
279 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); 169 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
280 err |= __put_user(regs->flags, &sc->flags); 170 err |= __put_user(regs->flags, &sc->flags);
281 err |= __put_user(regs->sp, &sc->sp_at_signal); 171 err |= __put_user(regs->sp, &sc->sp_at_signal);
282 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); 172 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
173#else /* !CONFIG_X86_32 */
174 err |= __put_user(regs->flags, &sc->flags);
175 err |= __put_user(regs->cs, &sc->cs);
176 err |= __put_user(0, &sc->gs);
177 err |= __put_user(0, &sc->fs);
178#endif /* CONFIG_X86_32 */
283 179
284 tmp = save_i387_xstate(fpstate); 180 err |= __put_user(fpstate, &sc->fpstate);
285 if (tmp < 0)
286 err = 1;
287 else
288 err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
289 181
290 /* non-iBCS2 extensions.. */ 182 /* non-iBCS2 extensions.. */
291 err |= __put_user(mask, &sc->oldmask); 183 err |= __put_user(mask, &sc->oldmask);
@@ -295,6 +187,32 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
295} 187}
296 188
297/* 189/*
190 * Set up a signal frame.
191 */
192#ifdef CONFIG_X86_32
193static const struct {
194 u16 poplmovl;
195 u32 val;
196 u16 int80;
197} __attribute__((packed)) retcode = {
198 0xb858, /* popl %eax; movl $..., %eax */
199 __NR_sigreturn,
200 0x80cd, /* int $0x80 */
201};
202
203static const struct {
204 u8 movl;
205 u32 val;
206 u16 int80;
207 u8 pad;
208} __attribute__((packed)) rt_retcode = {
209 0xb8, /* movl $..., %eax */
210 __NR_rt_sigreturn,
211 0x80cd, /* int $0x80 */
212 0
213};
214
215/*
298 * Determine which stack to use.. 216 * Determine which stack to use..
299 */ 217 */
300static inline void __user * 218static inline void __user *
@@ -328,6 +246,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
328 if (used_math()) { 246 if (used_math()) {
329 sp = sp - sig_xstate_size; 247 sp = sp - sig_xstate_size;
330 *fpstate = (struct _fpstate *) sp; 248 *fpstate = (struct _fpstate *) sp;
249 if (save_i387_xstate(*fpstate) < 0)
250 return (void __user *)-1L;
331 } 251 }
332 252
333 sp -= frame_size; 253 sp -= frame_size;
@@ -383,9 +303,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
383 * reasons and because gdb uses it as a signature to notice 303 * reasons and because gdb uses it as a signature to notice
384 * signal handler stack frames. 304 * signal handler stack frames.
385 */ 305 */
386 err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); 306 err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode);
387 err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
388 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
389 307
390 if (err) 308 if (err)
391 return -EFAULT; 309 return -EFAULT;
@@ -454,9 +372,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
454 * reasons and because gdb uses it as a signature to notice 372 * reasons and because gdb uses it as a signature to notice
455 * signal handler stack frames. 373 * signal handler stack frames.
456 */ 374 */
457 err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); 375 err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
458 err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
459 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
460 376
461 if (err) 377 if (err)
462 return -EFAULT; 378 return -EFAULT;
@@ -475,23 +391,293 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
475 391
476 return 0; 392 return 0;
477} 393}
394#else /* !CONFIG_X86_32 */
395/*
396 * Determine which stack to use..
397 */
398static void __user *
399get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
400{
401 /* Default to using normal stack - redzone*/
402 sp -= 128;
403
404 /* This is the X/Open sanctioned signal stack switching. */
405 if (ka->sa.sa_flags & SA_ONSTACK) {
406 if (sas_ss_flags(sp) == 0)
407 sp = current->sas_ss_sp + current->sas_ss_size;
408 }
409
410 return (void __user *)round_down(sp - size, 64);
411}
412
413static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
414 sigset_t *set, struct pt_regs *regs)
415{
416 struct rt_sigframe __user *frame;
417 void __user *fp = NULL;
418 int err = 0;
419 struct task_struct *me = current;
420
421 if (used_math()) {
422 fp = get_stack(ka, regs->sp, sig_xstate_size);
423 frame = (void __user *)round_down(
424 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
425
426 if (save_i387_xstate(fp) < 0)
427 return -EFAULT;
428 } else
429 frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
430
431 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
432 return -EFAULT;
433
434 if (ka->sa.sa_flags & SA_SIGINFO) {
435 if (copy_siginfo_to_user(&frame->info, info))
436 return -EFAULT;
437 }
438
439 /* Create the ucontext. */
440 if (cpu_has_xsave)
441 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
442 else
443 err |= __put_user(0, &frame->uc.uc_flags);
444 err |= __put_user(0, &frame->uc.uc_link);
445 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
446 err |= __put_user(sas_ss_flags(regs->sp),
447 &frame->uc.uc_stack.ss_flags);
448 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
449 err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
450 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
451
452 /* Set up to return from userspace. If provided, use a stub
453 already in userspace. */
454 /* x86-64 should always use SA_RESTORER. */
455 if (ka->sa.sa_flags & SA_RESTORER) {
456 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
457 } else {
458 /* could use a vstub here */
459 return -EFAULT;
460 }
461
462 if (err)
463 return -EFAULT;
464
465 /* Set up registers for signal handler */
466 regs->di = sig;
467 /* In case the signal handler was declared without prototypes */
468 regs->ax = 0;
469
470 /* This also works for non SA_SIGINFO handlers because they expect the
471 next argument after the signal number on the stack. */
472 regs->si = (unsigned long)&frame->info;
473 regs->dx = (unsigned long)&frame->uc;
474 regs->ip = (unsigned long) ka->sa.sa_handler;
475
476 regs->sp = (unsigned long)frame;
477
478 /* Set up the CS register to run signal handlers in 64-bit mode,
479 even if the handler happens to be interrupting 32-bit code. */
480 regs->cs = __USER_CS;
481
482 return 0;
483}
484#endif /* CONFIG_X86_32 */
485
486#ifdef CONFIG_X86_32
487/*
488 * Atomically swap in the new signal mask, and wait for a signal.
489 */
490asmlinkage int
491sys_sigsuspend(int history0, int history1, old_sigset_t mask)
492{
493 mask &= _BLOCKABLE;
494 spin_lock_irq(&current->sighand->siglock);
495 current->saved_sigmask = current->blocked;
496 siginitset(&current->blocked, mask);
497 recalc_sigpending();
498 spin_unlock_irq(&current->sighand->siglock);
499
500 current->state = TASK_INTERRUPTIBLE;
501 schedule();
502 set_restore_sigmask();
503
504 return -ERESTARTNOHAND;
505}
506
507asmlinkage int
508sys_sigaction(int sig, const struct old_sigaction __user *act,
509 struct old_sigaction __user *oact)
510{
511 struct k_sigaction new_ka, old_ka;
512 int ret;
513
514 if (act) {
515 old_sigset_t mask;
516
517 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
518 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
519 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
520 return -EFAULT;
521
522 __get_user(new_ka.sa.sa_flags, &act->sa_flags);
523 __get_user(mask, &act->sa_mask);
524 siginitset(&new_ka.sa.sa_mask, mask);
525 }
526
527 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
528
529 if (!ret && oact) {
530 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
531 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
532 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
533 return -EFAULT;
534
535 __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
536 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
537 }
538
539 return ret;
540}
541#endif /* CONFIG_X86_32 */
542
543#ifdef CONFIG_X86_32
544asmlinkage int sys_sigaltstack(unsigned long bx)
545{
546 /*
547 * This is needed to make gcc realize it doesn't own the
548 * "struct pt_regs"
549 */
550 struct pt_regs *regs = (struct pt_regs *)&bx;
551 const stack_t __user *uss = (const stack_t __user *)bx;
552 stack_t __user *uoss = (stack_t __user *)regs->cx;
553
554 return do_sigaltstack(uss, uoss, regs->sp);
555}
556#else /* !CONFIG_X86_32 */
557asmlinkage long
558sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
559 struct pt_regs *regs)
560{
561 return do_sigaltstack(uss, uoss, regs->sp);
562}
563#endif /* CONFIG_X86_32 */
564
565/*
566 * Do a signal return; undo the signal stack.
567 */
568#ifdef CONFIG_X86_32
569asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
570{
571 struct sigframe __user *frame;
572 struct pt_regs *regs;
573 unsigned long ax;
574 sigset_t set;
575
576 regs = (struct pt_regs *) &__unused;
577 frame = (struct sigframe __user *)(regs->sp - 8);
578
579 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
580 goto badframe;
581 if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
582 && __copy_from_user(&set.sig[1], &frame->extramask,
583 sizeof(frame->extramask))))
584 goto badframe;
585
586 sigdelsetmask(&set, ~_BLOCKABLE);
587 spin_lock_irq(&current->sighand->siglock);
588 current->blocked = set;
589 recalc_sigpending();
590 spin_unlock_irq(&current->sighand->siglock);
591
592 if (restore_sigcontext(regs, &frame->sc, &ax))
593 goto badframe;
594 return ax;
595
596badframe:
597 signal_fault(regs, frame, "sigreturn");
598
599 return 0;
600}
601#endif /* CONFIG_X86_32 */
602
603static long do_rt_sigreturn(struct pt_regs *regs)
604{
605 struct rt_sigframe __user *frame;
606 unsigned long ax;
607 sigset_t set;
608
609 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
610 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
611 goto badframe;
612 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
613 goto badframe;
614
615 sigdelsetmask(&set, ~_BLOCKABLE);
616 spin_lock_irq(&current->sighand->siglock);
617 current->blocked = set;
618 recalc_sigpending();
619 spin_unlock_irq(&current->sighand->siglock);
620
621 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
622 goto badframe;
623
624 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
625 goto badframe;
626
627 return ax;
628
629badframe:
630 signal_fault(regs, frame, "rt_sigreturn");
631 return 0;
632}
633
634#ifdef CONFIG_X86_32
635asmlinkage int sys_rt_sigreturn(struct pt_regs regs)
636{
637 return do_rt_sigreturn(&regs);
638}
639#else /* !CONFIG_X86_32 */
640asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
641{
642 return do_rt_sigreturn(regs);
643}
644#endif /* CONFIG_X86_32 */
478 645
479/* 646/*
480 * OK, we're invoking a handler: 647 * OK, we're invoking a handler:
481 */ 648 */
482static int signr_convert(int sig) 649static int signr_convert(int sig)
483{ 650{
651#ifdef CONFIG_X86_32
484 struct thread_info *info = current_thread_info(); 652 struct thread_info *info = current_thread_info();
485 653
486 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) 654 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
487 return info->exec_domain->signal_invmap[sig]; 655 return info->exec_domain->signal_invmap[sig];
656#endif /* CONFIG_X86_32 */
488 return sig; 657 return sig;
489} 658}
490 659
660#ifdef CONFIG_X86_32
661
491#define is_ia32 1 662#define is_ia32 1
492#define ia32_setup_frame __setup_frame 663#define ia32_setup_frame __setup_frame
493#define ia32_setup_rt_frame __setup_rt_frame 664#define ia32_setup_rt_frame __setup_rt_frame
494 665
666#else /* !CONFIG_X86_32 */
667
668#ifdef CONFIG_IA32_EMULATION
669#define is_ia32 test_thread_flag(TIF_IA32)
670#else /* !CONFIG_IA32_EMULATION */
671#define is_ia32 0
672#endif /* CONFIG_IA32_EMULATION */
673
674int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
675 sigset_t *set, struct pt_regs *regs);
676int ia32_setup_frame(int sig, struct k_sigaction *ka,
677 sigset_t *set, struct pt_regs *regs);
678
679#endif /* CONFIG_X86_32 */
680
495static int 681static int
496setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 682setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
497 sigset_t *set, struct pt_regs *regs) 683 sigset_t *set, struct pt_regs *regs)
@@ -592,7 +778,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
592 return 0; 778 return 0;
593} 779}
594 780
781#ifdef CONFIG_X86_32
595#define NR_restart_syscall __NR_restart_syscall 782#define NR_restart_syscall __NR_restart_syscall
783#else /* !CONFIG_X86_32 */
784#define NR_restart_syscall \
785 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
786#endif /* CONFIG_X86_32 */
787
596/* 788/*
597 * Note that 'init' is a special process: it doesn't get signals it doesn't 789 * Note that 'init' is a special process: it doesn't get signals it doesn't
598 * want to handle. Thus you cannot kill init even with a SIGKILL even by 790 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -704,8 +896,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
704 struct task_struct *me = current; 896 struct task_struct *me = current;
705 897
706 if (show_unhandled_signals && printk_ratelimit()) { 898 if (show_unhandled_signals && printk_ratelimit()) {
707 printk(KERN_INFO 899 printk("%s"
708 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", 900 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
901 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
709 me->comm, me->pid, where, frame, 902 me->comm, me->pid, where, frame,
710 regs->ip, regs->sp, regs->orig_ax); 903 regs->ip, regs->sp, regs->orig_ax);
711 print_vma_addr(" in ", regs->ip); 904 print_vma_addr(" in ", regs->ip);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
deleted file mode 100644
index a5c9627f4db9..000000000000
--- a/arch/x86/kernel/signal_64.c
+++ /dev/null
@@ -1,516 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
4 *
5 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen
8 */
9
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/kernel.h>
14#include <linux/signal.h>
15#include <linux/errno.h>
16#include <linux/wait.h>
17#include <linux/ptrace.h>
18#include <linux/tracehook.h>
19#include <linux/unistd.h>
20#include <linux/stddef.h>
21#include <linux/personality.h>
22#include <linux/compiler.h>
23#include <linux/uaccess.h>
24
25#include <asm/processor.h>
26#include <asm/ucontext.h>
27#include <asm/i387.h>
28#include <asm/proto.h>
29#include <asm/ia32_unistd.h>
30#include <asm/mce.h>
31#include <asm/syscall.h>
32#include <asm/syscalls.h>
33#include "sigframe.h"
34
35#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
36
37#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
38 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
39 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
40 X86_EFLAGS_CF)
41
42#ifdef CONFIG_X86_32
43# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF)
44#else
45# define FIX_EFLAGS __FIX_EFLAGS
46#endif
47
48asmlinkage long
49sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
50 struct pt_regs *regs)
51{
52 return do_sigaltstack(uss, uoss, regs->sp);
53}
54
55#define COPY(x) { \
56 err |= __get_user(regs->x, &sc->x); \
57}
58
59#define COPY_SEG_STRICT(seg) { \
60 unsigned short tmp; \
61 err |= __get_user(tmp, &sc->seg); \
62 regs->seg = tmp | 3; \
63}
64
65/*
66 * Do a signal return; undo the signal stack.
67 */
68static int
69restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
70 unsigned long *pax)
71{
72 void __user *buf;
73 unsigned int tmpflags;
74 unsigned int err = 0;
75
76 /* Always make any pending restarted system calls return -EINTR */
77 current_thread_info()->restart_block.fn = do_no_restart_syscall;
78
79 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
80 COPY(dx); COPY(cx); COPY(ip);
81 COPY(r8);
82 COPY(r9);
83 COPY(r10);
84 COPY(r11);
85 COPY(r12);
86 COPY(r13);
87 COPY(r14);
88 COPY(r15);
89
90 /* Kernel saves and restores only the CS segment register on signals,
91 * which is the bare minimum needed to allow mixed 32/64-bit code.
92 * App's signal handler can save/restore other segments if needed. */
93 COPY_SEG_STRICT(cs);
94
95 err |= __get_user(tmpflags, &sc->flags);
96 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
97 regs->orig_ax = -1; /* disable syscall checks */
98
99 err |= __get_user(buf, &sc->fpstate);
100 err |= restore_i387_xstate(buf);
101
102 err |= __get_user(*pax, &sc->ax);
103 return err;
104}
105
106static long do_rt_sigreturn(struct pt_regs *regs)
107{
108 struct rt_sigframe __user *frame;
109 unsigned long ax;
110 sigset_t set;
111
112 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
113 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
114 goto badframe;
115 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
116 goto badframe;
117
118 sigdelsetmask(&set, ~_BLOCKABLE);
119 spin_lock_irq(&current->sighand->siglock);
120 current->blocked = set;
121 recalc_sigpending();
122 spin_unlock_irq(&current->sighand->siglock);
123
124 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
125 goto badframe;
126
127 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
128 goto badframe;
129
130 return ax;
131
132badframe:
133 signal_fault(regs, frame, "rt_sigreturn");
134 return 0;
135}
136
137asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
138{
139 return do_rt_sigreturn(regs);
140}
141
142/*
143 * Set up a signal frame.
144 */
145
146static inline int
147setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
148 unsigned long mask, struct task_struct *me)
149{
150 int err = 0;
151
152 err |= __put_user(regs->cs, &sc->cs);
153 err |= __put_user(0, &sc->gs);
154 err |= __put_user(0, &sc->fs);
155
156 err |= __put_user(regs->di, &sc->di);
157 err |= __put_user(regs->si, &sc->si);
158 err |= __put_user(regs->bp, &sc->bp);
159 err |= __put_user(regs->sp, &sc->sp);
160 err |= __put_user(regs->bx, &sc->bx);
161 err |= __put_user(regs->dx, &sc->dx);
162 err |= __put_user(regs->cx, &sc->cx);
163 err |= __put_user(regs->ax, &sc->ax);
164 err |= __put_user(regs->r8, &sc->r8);
165 err |= __put_user(regs->r9, &sc->r9);
166 err |= __put_user(regs->r10, &sc->r10);
167 err |= __put_user(regs->r11, &sc->r11);
168 err |= __put_user(regs->r12, &sc->r12);
169 err |= __put_user(regs->r13, &sc->r13);
170 err |= __put_user(regs->r14, &sc->r14);
171 err |= __put_user(regs->r15, &sc->r15);
172 err |= __put_user(me->thread.trap_no, &sc->trapno);
173 err |= __put_user(me->thread.error_code, &sc->err);
174 err |= __put_user(regs->ip, &sc->ip);
175 err |= __put_user(regs->flags, &sc->flags);
176 err |= __put_user(mask, &sc->oldmask);
177 err |= __put_user(me->thread.cr2, &sc->cr2);
178
179 return err;
180}
181
182/*
183 * Determine which stack to use..
184 */
185
186static void __user *
187get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
188{
189 unsigned long sp;
190
191 /* Default to using normal stack - redzone*/
192 sp = regs->sp - 128;
193
194 /* This is the X/Open sanctioned signal stack switching. */
195 if (ka->sa.sa_flags & SA_ONSTACK) {
196 if (sas_ss_flags(sp) == 0)
197 sp = current->sas_ss_sp + current->sas_ss_size;
198 }
199
200 return (void __user *)round_down(sp - size, 64);
201}
202
203static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
204 sigset_t *set, struct pt_regs *regs)
205{
206 struct rt_sigframe __user *frame;
207 void __user *fp = NULL;
208 int err = 0;
209 struct task_struct *me = current;
210
211 if (used_math()) {
212 fp = get_stack(ka, regs, sig_xstate_size);
213 frame = (void __user *)round_down(
214 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
215
216 if (save_i387_xstate(fp) < 0)
217 return -EFAULT;
218 } else
219 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
220
221 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
222 return -EFAULT;
223
224 if (ka->sa.sa_flags & SA_SIGINFO) {
225 if (copy_siginfo_to_user(&frame->info, info))
226 return -EFAULT;
227 }
228
229 /* Create the ucontext. */
230 if (cpu_has_xsave)
231 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
232 else
233 err |= __put_user(0, &frame->uc.uc_flags);
234 err |= __put_user(0, &frame->uc.uc_link);
235 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
236 err |= __put_user(sas_ss_flags(regs->sp),
237 &frame->uc.uc_stack.ss_flags);
238 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
239 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
240 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
241 if (sizeof(*set) == 16) {
242 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
243 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
244 } else
245 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
246
247 /* Set up to return from userspace. If provided, use a stub
248 already in userspace. */
249 /* x86-64 should always use SA_RESTORER. */
250 if (ka->sa.sa_flags & SA_RESTORER) {
251 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
252 } else {
253 /* could use a vstub here */
254 return -EFAULT;
255 }
256
257 if (err)
258 return -EFAULT;
259
260 /* Set up registers for signal handler */
261 regs->di = sig;
262 /* In case the signal handler was declared without prototypes */
263 regs->ax = 0;
264
265 /* This also works for non SA_SIGINFO handlers because they expect the
266 next argument after the signal number on the stack. */
267 regs->si = (unsigned long)&frame->info;
268 regs->dx = (unsigned long)&frame->uc;
269 regs->ip = (unsigned long) ka->sa.sa_handler;
270
271 regs->sp = (unsigned long)frame;
272
273 /* Set up the CS register to run signal handlers in 64-bit mode,
274 even if the handler happens to be interrupting 32-bit code. */
275 regs->cs = __USER_CS;
276
277 return 0;
278}
279
280/*
281 * OK, we're invoking a handler
282 */
283static int signr_convert(int sig)
284{
285 return sig;
286}
287
288#ifdef CONFIG_IA32_EMULATION
289#define is_ia32 test_thread_flag(TIF_IA32)
290#else
291#define is_ia32 0
292#endif
293
294static int
295setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
296 sigset_t *set, struct pt_regs *regs)
297{
298 int usig = signr_convert(sig);
299 int ret;
300
301 /* Set up the stack frame */
302 if (is_ia32) {
303 if (ka->sa.sa_flags & SA_SIGINFO)
304 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
305 else
306 ret = ia32_setup_frame(usig, ka, set, regs);
307 } else
308 ret = __setup_rt_frame(sig, ka, info, set, regs);
309
310 if (ret) {
311 force_sigsegv(sig, current);
312 return -EFAULT;
313 }
314
315 return ret;
316}
317
318static int
319handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
320 sigset_t *oldset, struct pt_regs *regs)
321{
322 int ret;
323
324 /* Are we from a system call? */
325 if (syscall_get_nr(current, regs) >= 0) {
326 /* If so, check system call restarting.. */
327 switch (syscall_get_error(current, regs)) {
328 case -ERESTART_RESTARTBLOCK:
329 case -ERESTARTNOHAND:
330 regs->ax = -EINTR;
331 break;
332
333 case -ERESTARTSYS:
334 if (!(ka->sa.sa_flags & SA_RESTART)) {
335 regs->ax = -EINTR;
336 break;
337 }
338 /* fallthrough */
339 case -ERESTARTNOINTR:
340 regs->ax = regs->orig_ax;
341 regs->ip -= 2;
342 break;
343 }
344 }
345
346 /*
347 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
348 * flag so that register information in the sigcontext is correct.
349 */
350 if (unlikely(regs->flags & X86_EFLAGS_TF) &&
351 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
352 regs->flags &= ~X86_EFLAGS_TF;
353
354 ret = setup_rt_frame(sig, ka, info, oldset, regs);
355
356 if (ret)
357 return ret;
358
359#ifdef CONFIG_X86_64
360 /*
361 * This has nothing to do with segment registers,
362 * despite the name. This magic affects uaccess.h
363 * macros' behavior. Reset it to the normal setting.
364 */
365 set_fs(USER_DS);
366#endif
367
368 /*
369 * Clear the direction flag as per the ABI for function entry.
370 */
371 regs->flags &= ~X86_EFLAGS_DF;
372
373 /*
374 * Clear TF when entering the signal handler, but
375 * notify any tracer that was single-stepping it.
376 * The tracer may want to single-step inside the
377 * handler too.
378 */
379 regs->flags &= ~X86_EFLAGS_TF;
380
381 spin_lock_irq(&current->sighand->siglock);
382 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
383 if (!(ka->sa.sa_flags & SA_NODEFER))
384 sigaddset(&current->blocked, sig);
385 recalc_sigpending();
386 spin_unlock_irq(&current->sighand->siglock);
387
388 tracehook_signal_handler(sig, info, ka, regs,
389 test_thread_flag(TIF_SINGLESTEP));
390
391 return 0;
392}
393
394#define NR_restart_syscall \
395 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
396/*
397 * Note that 'init' is a special process: it doesn't get signals it doesn't
398 * want to handle. Thus you cannot kill init even with a SIGKILL even by
399 * mistake.
400 */
401static void do_signal(struct pt_regs *regs)
402{
403 struct k_sigaction ka;
404 siginfo_t info;
405 int signr;
406 sigset_t *oldset;
407
408 /*
409 * We want the common case to go fast, which is why we may in certain
410 * cases get here from kernel mode. Just return without doing anything
411 * if so.
412 * X86_32: vm86 regs switched out by assembly code before reaching
413 * here, so testing against kernel CS suffices.
414 */
415 if (!user_mode(regs))
416 return;
417
418 if (current_thread_info()->status & TS_RESTORE_SIGMASK)
419 oldset = &current->saved_sigmask;
420 else
421 oldset = &current->blocked;
422
423 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
424 if (signr > 0) {
425 /*
426 * Re-enable any watchpoints before delivering the
427 * signal to user space. The processor register will
428 * have been cleared if the watchpoint triggered
429 * inside the kernel.
430 */
431 if (current->thread.debugreg7)
432 set_debugreg(current->thread.debugreg7, 7);
433
434 /* Whee! Actually deliver the signal. */
435 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
436 /*
437 * A signal was successfully delivered; the saved
438 * sigmask will have been stored in the signal frame,
439 * and will be restored by sigreturn, so we can simply
440 * clear the TS_RESTORE_SIGMASK flag.
441 */
442 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
443 }
444 return;
445 }
446
447 /* Did we come from a system call? */
448 if (syscall_get_nr(current, regs) >= 0) {
449 /* Restart the system call - no handlers present */
450 switch (syscall_get_error(current, regs)) {
451 case -ERESTARTNOHAND:
452 case -ERESTARTSYS:
453 case -ERESTARTNOINTR:
454 regs->ax = regs->orig_ax;
455 regs->ip -= 2;
456 break;
457
458 case -ERESTART_RESTARTBLOCK:
459 regs->ax = NR_restart_syscall;
460 regs->ip -= 2;
461 break;
462 }
463 }
464
465 /*
466 * If there's no signal to deliver, we just put the saved sigmask
467 * back.
468 */
469 if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
470 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
471 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
472 }
473}
474
475/*
476 * notification of userspace execution resumption
477 * - triggered by the TIF_WORK_MASK flags
478 */
479void
480do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
481{
482#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
483 /* notify userspace of pending MCEs */
484 if (thread_info_flags & _TIF_MCE_NOTIFY)
485 mce_notify_user();
486#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
487
488 /* deal with pending signal delivery */
489 if (thread_info_flags & _TIF_SIGPENDING)
490 do_signal(regs);
491
492 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
493 clear_thread_flag(TIF_NOTIFY_RESUME);
494 tracehook_notify_resume(regs);
495 }
496
497#ifdef CONFIG_X86_32
498 clear_thread_flag(TIF_IRET);
499#endif /* CONFIG_X86_32 */
500}
501
502void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
503{
504 struct task_struct *me = current;
505
506 if (show_unhandled_signals && printk_ratelimit()) {
507 printk(KERN_INFO
508 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
509 me->comm, me->pid, where, frame,
510 regs->ip, regs->sp, regs->orig_ax);
511 print_vma_addr(" in ", regs->ip);
512 printk(KERN_CONT "\n");
513 }
514
515 force_sig(SIGSEGV, me);
516}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 49ed667b06f3..beea2649a240 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -165,11 +165,7 @@ static void native_smp_send_stop(void)
165void smp_reschedule_interrupt(struct pt_regs *regs) 165void smp_reschedule_interrupt(struct pt_regs *regs)
166{ 166{
167 ack_APIC_irq(); 167 ack_APIC_irq();
168#ifdef CONFIG_X86_32 168 inc_irq_stat(irq_resched_count);
169 __get_cpu_var(irq_stat).irq_resched_count++;
170#else
171 add_pda(irq_resched_count, 1);
172#endif
173} 169}
174 170
175void smp_call_function_interrupt(struct pt_regs *regs) 171void smp_call_function_interrupt(struct pt_regs *regs)
@@ -177,11 +173,7 @@ void smp_call_function_interrupt(struct pt_regs *regs)
177 ack_APIC_irq(); 173 ack_APIC_irq();
178 irq_enter(); 174 irq_enter();
179 generic_smp_call_function_interrupt(); 175 generic_smp_call_function_interrupt();
180#ifdef CONFIG_X86_32 176 inc_irq_stat(irq_call_count);
181 __get_cpu_var(irq_stat).irq_call_count++;
182#else
183 add_pda(irq_call_count, 1);
184#endif
185 irq_exit(); 177 irq_exit();
186} 178}
187 179
@@ -190,11 +182,7 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
190 ack_APIC_irq(); 182 ack_APIC_irq();
191 irq_enter(); 183 irq_enter();
192 generic_smp_call_function_single_interrupt(); 184 generic_smp_call_function_single_interrupt();
193#ifdef CONFIG_X86_32 185 inc_irq_stat(irq_call_count);
194 __get_cpu_var(irq_stat).irq_call_count++;
195#else
196 add_pda(irq_call_count, 1);
197#endif
198 irq_exit(); 186 irq_exit();
199} 187}
200 188
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 1a9941b11150..9e177a4077ee 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -282,7 +282,7 @@ static int __cpuinitdata unsafe_smp;
282/* 282/*
283 * Activate a secondary processor. 283 * Activate a secondary processor.
284 */ 284 */
285static void __cpuinit start_secondary(void *unused) 285notrace static void __cpuinit start_secondary(void *unused)
286{ 286{
287 /* 287 /*
288 * Don't put *anything* before cpu_init(), SMP booting is too 288 * Don't put *anything* before cpu_init(), SMP booting is too
@@ -496,7 +496,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
496} 496}
497 497
498/* maps the cpu to the sched domain representing multi-core */ 498/* maps the cpu to the sched domain representing multi-core */
499cpumask_t cpu_coregroup_map(int cpu) 499const struct cpumask *cpu_coregroup_mask(int cpu)
500{ 500{
501 struct cpuinfo_x86 *c = &cpu_data(cpu); 501 struct cpuinfo_x86 *c = &cpu_data(cpu);
502 /* 502 /*
@@ -504,9 +504,14 @@ cpumask_t cpu_coregroup_map(int cpu)
504 * And for power savings, we return cpu_core_map 504 * And for power savings, we return cpu_core_map
505 */ 505 */
506 if (sched_mc_power_savings || sched_smt_power_savings) 506 if (sched_mc_power_savings || sched_smt_power_savings)
507 return per_cpu(cpu_core_map, cpu); 507 return &per_cpu(cpu_core_map, cpu);
508 else 508 else
509 return c->llc_shared_map; 509 return &c->llc_shared_map;
510}
511
512cpumask_t cpu_coregroup_map(int cpu)
513{
514 return *cpu_coregroup_mask(cpu);
510} 515}
511 516
512static void impress_friends(void) 517static void impress_friends(void)
@@ -1075,8 +1080,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
1075#endif 1080#endif
1076 1081
1077 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 1082 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1078 printk(KERN_WARNING "weird, boot CPU (#%d) not listed" 1083 printk(KERN_WARNING
1079 "by the BIOS.\n", hard_smp_processor_id()); 1084 "weird, boot CPU (#%d) not listed by the BIOS.\n",
1085 hard_smp_processor_id());
1086
1080 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 1087 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1081 } 1088 }
1082 1089
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 77b400f06ea2..65309e4cb1c0 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(profile_pc);
75irqreturn_t timer_interrupt(int irq, void *dev_id) 75irqreturn_t timer_interrupt(int irq, void *dev_id)
76{ 76{
77 /* Keep nmi watchdog up to date */ 77 /* Keep nmi watchdog up to date */
78 per_cpu(irq_stat, smp_processor_id()).irq0_irqs++; 78 inc_irq_stat(irq0_irqs);
79 79
80#ifdef CONFIG_X86_IO_APIC 80#ifdef CONFIG_X86_IO_APIC
81 if (timer_ack) { 81 if (timer_ack) {
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index cb19d650c216..891e7a7c4334 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -49,9 +49,9 @@ unsigned long profile_pc(struct pt_regs *regs)
49} 49}
50EXPORT_SYMBOL(profile_pc); 50EXPORT_SYMBOL(profile_pc);
51 51
52irqreturn_t timer_interrupt(int irq, void *dev_id) 52static irqreturn_t timer_interrupt(int irq, void *dev_id)
53{ 53{
54 add_pda(irq0_irqs, 1); 54 inc_irq_stat(irq0_irqs);
55 55
56 global_clock_event->event_handler(global_clock_event); 56 global_clock_event->event_handler(global_clock_event);
57 57
@@ -80,6 +80,8 @@ unsigned long __init calibrate_cpu(void)
80 break; 80 break;
81 no_ctr_free = (i == 4); 81 no_ctr_free = (i == 4);
82 if (no_ctr_free) { 82 if (no_ctr_free) {
83 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
84 "cpu_khz value may be incorrect.\n");
83 i = 3; 85 i = 3;
84 rdmsrl(MSR_K7_EVNTSEL3, evntsel3); 86 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
85 wrmsrl(MSR_K7_EVNTSEL3, 0); 87 wrmsrl(MSR_K7_EVNTSEL3, 0);
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index 174ea90d1cbd..ce5054642247 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -34,9 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock);
34 */ 34 */
35void leave_mm(int cpu) 35void leave_mm(int cpu)
36{ 36{
37 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) 37 BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
38 BUG(); 38 cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
39 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
40 load_cr3(swapper_pg_dir); 39 load_cr3(swapper_pg_dir);
41} 40}
42EXPORT_SYMBOL_GPL(leave_mm); 41EXPORT_SYMBOL_GPL(leave_mm);
@@ -104,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
104 * BUG(); 103 * BUG();
105 */ 104 */
106 105
107 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { 106 if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
108 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { 107 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
109 if (flush_va == TLB_FLUSH_ALL) 108 if (flush_va == TLB_FLUSH_ALL)
110 local_flush_tlb(); 109 local_flush_tlb();
111 else 110 else
@@ -119,7 +118,7 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
119 smp_mb__after_clear_bit(); 118 smp_mb__after_clear_bit();
120out: 119out:
121 put_cpu_no_resched(); 120 put_cpu_no_resched();
122 __get_cpu_var(irq_stat).irq_tlb_count++; 121 inc_irq_stat(irq_tlb_count);
123} 122}
124 123
125void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, 124void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
@@ -238,7 +237,7 @@ static void do_flush_tlb_all(void *info)
238 unsigned long cpu = smp_processor_id(); 237 unsigned long cpu = smp_processor_id();
239 238
240 __flush_tlb_all(); 239 __flush_tlb_all();
241 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) 240 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
242 leave_mm(cpu); 241 leave_mm(cpu);
243} 242}
244 243
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index de6f1bda0c50..f8be6f1d2e48 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -154,7 +154,7 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
154out: 154out:
155 ack_APIC_irq(); 155 ack_APIC_irq();
156 cpu_clear(cpu, f->flush_cpumask); 156 cpu_clear(cpu, f->flush_cpumask);
157 add_pda(irq_tlb_count, 1); 157 inc_irq_stat(irq_tlb_count);
158} 158}
159 159
160void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, 160void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 04431f34fd16..6a00e5faaa74 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -566,14 +566,10 @@ static int __init uv_ptc_init(void)
566 if (!is_uv_system()) 566 if (!is_uv_system())
567 return 0; 567 return 0;
568 568
569 if (!proc_mkdir("sgi_uv", NULL))
570 return -EINVAL;
571
572 proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); 569 proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
573 if (!proc_uv_ptc) { 570 if (!proc_uv_ptc) {
574 printk(KERN_ERR "unable to create %s proc entry\n", 571 printk(KERN_ERR "unable to create %s proc entry\n",
575 UV_PTC_BASENAME); 572 UV_PTC_BASENAME);
576 remove_proc_entry("sgi_uv", NULL);
577 return -EINVAL; 573 return -EINVAL;
578 } 574 }
579 proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; 575 proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 1106fac6024d..808031a5ba19 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,10 +1,26 @@
1#include <linux/io.h> 1#include <linux/io.h>
2 2
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4#include <asm/e820.h>
4 5
5/* ready for x86_64 and x86 */ 6/* ready for x86_64 and x86 */
6unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); 7unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
7 8
9void __init reserve_trampoline_memory(void)
10{
11#ifdef CONFIG_X86_32
12 /*
13 * But first pinch a few for the stack/trampoline stuff
14 * FIXME: Don't need the extra page at 4K, but need to fix
15 * trampoline before removing it. (see the GDT stuff)
16 */
17 reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
18#endif
19 /* Has to be in very low memory so we can execute real-mode AP code. */
20 reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE,
21 "TRAMPOLINE");
22}
23
8/* 24/*
9 * Currently trivial. Write the real->protected mode 25 * Currently trivial. Write the real->protected mode
10 * bootstrap into the page concerned. The caller 26 * bootstrap into the page concerned. The caller
@@ -12,7 +28,6 @@ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
12 */ 28 */
13unsigned long setup_trampoline(void) 29unsigned long setup_trampoline(void)
14{ 30{
15 memcpy(trampoline_base, trampoline_data, 31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
16 trampoline_end - trampoline_data);
17 return virt_to_phys(trampoline_base); 32 return virt_to_phys(trampoline_base);
18} 33}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4a6dff39a470..2d1f4c7e4052 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -481,11 +481,7 @@ do_nmi(struct pt_regs *regs, long error_code)
481{ 481{
482 nmi_enter(); 482 nmi_enter();
483 483
484#ifdef CONFIG_X86_32 484 inc_irq_stat(__nmi_count);
485 { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
486#else
487 add_pda(__nmi_count, 1);
488#endif
489 485
490 if (!ignore_nmis) 486 if (!ignore_nmis)
491 default_do_nmi(regs); 487 default_do_nmi(regs);
@@ -664,7 +660,7 @@ void math_error(void __user *ip)
664{ 660{
665 struct task_struct *task; 661 struct task_struct *task;
666 siginfo_t info; 662 siginfo_t info;
667 unsigned short cwd, swd; 663 unsigned short cwd, swd, err;
668 664
669 /* 665 /*
670 * Save the info for the exception handler and clear the error. 666 * Save the info for the exception handler and clear the error.
@@ -675,7 +671,6 @@ void math_error(void __user *ip)
675 task->thread.error_code = 0; 671 task->thread.error_code = 0;
676 info.si_signo = SIGFPE; 672 info.si_signo = SIGFPE;
677 info.si_errno = 0; 673 info.si_errno = 0;
678 info.si_code = __SI_FAULT;
679 info.si_addr = ip; 674 info.si_addr = ip;
680 /* 675 /*
681 * (~cwd & swd) will mask out exceptions that are not set to unmasked 676 * (~cwd & swd) will mask out exceptions that are not set to unmasked
@@ -689,34 +684,31 @@ void math_error(void __user *ip)
689 */ 684 */
690 cwd = get_fpu_cwd(task); 685 cwd = get_fpu_cwd(task);
691 swd = get_fpu_swd(task); 686 swd = get_fpu_swd(task);
692 switch (swd & ~cwd & 0x3f) { 687
693 case 0x000: /* No unmasked exception */ 688 err = swd & ~cwd & 0x3f;
689
694#ifdef CONFIG_X86_32 690#ifdef CONFIG_X86_32
691 if (!err)
695 return; 692 return;
696#endif 693#endif
697 default: /* Multiple exceptions */ 694
698 break; 695 if (err & 0x001) { /* Invalid op */
699 case 0x001: /* Invalid Op */
700 /* 696 /*
701 * swd & 0x240 == 0x040: Stack Underflow 697 * swd & 0x240 == 0x040: Stack Underflow
702 * swd & 0x240 == 0x240: Stack Overflow 698 * swd & 0x240 == 0x240: Stack Overflow
703 * User must clear the SF bit (0x40) if set 699 * User must clear the SF bit (0x40) if set
704 */ 700 */
705 info.si_code = FPE_FLTINV; 701 info.si_code = FPE_FLTINV;
706 break; 702 } else if (err & 0x004) { /* Divide by Zero */
707 case 0x002: /* Denormalize */
708 case 0x010: /* Underflow */
709 info.si_code = FPE_FLTUND;
710 break;
711 case 0x004: /* Zero Divide */
712 info.si_code = FPE_FLTDIV; 703 info.si_code = FPE_FLTDIV;
713 break; 704 } else if (err & 0x008) { /* Overflow */
714 case 0x008: /* Overflow */
715 info.si_code = FPE_FLTOVF; 705 info.si_code = FPE_FLTOVF;
716 break; 706 } else if (err & 0x012) { /* Denormal, Underflow */
717 case 0x020: /* Precision */ 707 info.si_code = FPE_FLTUND;
708 } else if (err & 0x020) { /* Precision */
718 info.si_code = FPE_FLTRES; 709 info.si_code = FPE_FLTRES;
719 break; 710 } else {
711 info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */
720 } 712 }
721 force_sig_info(SIGFPE, &info, task); 713 force_sig_info(SIGFPE, &info, task);
722} 714}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 424093b157d3..599e58168631 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -15,6 +15,7 @@
15#include <asm/vgtod.h> 15#include <asm/vgtod.h>
16#include <asm/time.h> 16#include <asm/time.h>
17#include <asm/delay.h> 17#include <asm/delay.h>
18#include <asm/hypervisor.h>
18 19
19unsigned int cpu_khz; /* TSC clocks / usec, not used here */ 20unsigned int cpu_khz; /* TSC clocks / usec, not used here */
20EXPORT_SYMBOL(cpu_khz); 21EXPORT_SYMBOL(cpu_khz);
@@ -31,6 +32,7 @@ static int tsc_unstable;
31 erroneous rdtsc usage on !cpu_has_tsc processors */ 32 erroneous rdtsc usage on !cpu_has_tsc processors */
32static int tsc_disabled = -1; 33static int tsc_disabled = -1;
33 34
35static int tsc_clocksource_reliable;
34/* 36/*
35 * Scheduler clock - returns current time in nanosec units. 37 * Scheduler clock - returns current time in nanosec units.
36 */ 38 */
@@ -98,6 +100,15 @@ int __init notsc_setup(char *str)
98 100
99__setup("notsc", notsc_setup); 101__setup("notsc", notsc_setup);
100 102
103static int __init tsc_setup(char *str)
104{
105 if (!strcmp(str, "reliable"))
106 tsc_clocksource_reliable = 1;
107 return 1;
108}
109
110__setup("tsc=", tsc_setup);
111
101#define MAX_RETRIES 5 112#define MAX_RETRIES 5
102#define SMI_TRESHOLD 50000 113#define SMI_TRESHOLD 50000
103 114
@@ -352,9 +363,15 @@ unsigned long native_calibrate_tsc(void)
352{ 363{
353 u64 tsc1, tsc2, delta, ref1, ref2; 364 u64 tsc1, tsc2, delta, ref1, ref2;
354 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 365 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
355 unsigned long flags, latch, ms, fast_calibrate; 366 unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
356 int hpet = is_hpet_enabled(), i, loopmin; 367 int hpet = is_hpet_enabled(), i, loopmin;
357 368
369 tsc_khz = get_hypervisor_tsc_freq();
370 if (tsc_khz) {
371 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
372 return tsc_khz;
373 }
374
358 local_irq_save(flags); 375 local_irq_save(flags);
359 fast_calibrate = quick_pit_calibrate(); 376 fast_calibrate = quick_pit_calibrate();
360 local_irq_restore(flags); 377 local_irq_restore(flags);
@@ -731,24 +748,21 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
731 {} 748 {}
732}; 749};
733 750
734/* 751static void __init check_system_tsc_reliable(void)
735 * Geode_LX - the OLPC CPU has a possibly a very reliable TSC 752{
736 */
737#ifdef CONFIG_MGEODE_LX 753#ifdef CONFIG_MGEODE_LX
738/* RTSC counts during suspend */ 754 /* RTSC counts during suspend */
739#define RTSC_SUSP 0x100 755#define RTSC_SUSP 0x100
740
741static void __init check_geode_tsc_reliable(void)
742{
743 unsigned long res_low, res_high; 756 unsigned long res_low, res_high;
744 757
745 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); 758 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
759 /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */
746 if (res_low & RTSC_SUSP) 760 if (res_low & RTSC_SUSP)
747 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 761 tsc_clocksource_reliable = 1;
748}
749#else
750static inline void check_geode_tsc_reliable(void) { }
751#endif 762#endif
763 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
764 tsc_clocksource_reliable = 1;
765}
752 766
753/* 767/*
754 * Make an educated guess if the TSC is trustworthy and synchronized 768 * Make an educated guess if the TSC is trustworthy and synchronized
@@ -783,6 +797,8 @@ static void __init init_tsc_clocksource(void)
783{ 797{
784 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, 798 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
785 clocksource_tsc.shift); 799 clocksource_tsc.shift);
800 if (tsc_clocksource_reliable)
801 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
786 /* lower the rating if we already know its unstable: */ 802 /* lower the rating if we already know its unstable: */
787 if (check_tsc_unstable()) { 803 if (check_tsc_unstable()) {
788 clocksource_tsc.rating = 0; 804 clocksource_tsc.rating = 0;
@@ -843,7 +859,7 @@ void __init tsc_init(void)
843 if (unsynchronized_tsc()) 859 if (unsynchronized_tsc())
844 mark_tsc_unstable("TSCs unsynchronized"); 860 mark_tsc_unstable("TSCs unsynchronized");
845 861
846 check_geode_tsc_reliable(); 862 check_system_tsc_reliable();
847 init_tsc_clocksource(); 863 init_tsc_clocksource();
848} 864}
849 865
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 1c0dfbca87c1..bf36328f6ef9 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -112,6 +112,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
112 if (unsynchronized_tsc()) 112 if (unsynchronized_tsc())
113 return; 113 return;
114 114
115 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
116 printk(KERN_INFO
117 "Skipping synchronization checks as TSC is reliable.\n");
118 return;
119 }
120
115 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", 121 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
116 smp_processor_id(), cpu); 122 smp_processor_id(), cpu);
117 123
@@ -165,7 +171,7 @@ void __cpuinit check_tsc_sync_target(void)
165{ 171{
166 int cpus = 2; 172 int cpus = 2;
167 173
168 if (unsynchronized_tsc()) 174 if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
169 return; 175 return;
170 176
171 /* 177 /*
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 22fd6577156a..23206ba16874 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -266,109 +266,6 @@ static void vmi_nop(void)
266{ 266{
267} 267}
268 268
269#ifdef CONFIG_DEBUG_PAGE_TYPE
270
271#ifdef CONFIG_X86_PAE
272#define MAX_BOOT_PTS (2048+4+1)
273#else
274#define MAX_BOOT_PTS (1024+1)
275#endif
276
277/*
278 * During boot, mem_map is not yet available in paging_init, so stash
279 * all the boot page allocations here.
280 */
281static struct {
282 u32 pfn;
283 int type;
284} boot_page_allocations[MAX_BOOT_PTS];
285static int num_boot_page_allocations;
286static int boot_allocations_applied;
287
288void vmi_apply_boot_page_allocations(void)
289{
290 int i;
291 BUG_ON(!mem_map);
292 for (i = 0; i < num_boot_page_allocations; i++) {
293 struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
294 page->type = boot_page_allocations[i].type;
295 page->type = boot_page_allocations[i].type &
296 ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
297 }
298 boot_allocations_applied = 1;
299}
300
301static void record_page_type(u32 pfn, int type)
302{
303 BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
304 boot_page_allocations[num_boot_page_allocations].pfn = pfn;
305 boot_page_allocations[num_boot_page_allocations].type = type;
306 num_boot_page_allocations++;
307}
308
309static void check_zeroed_page(u32 pfn, int type, struct page *page)
310{
311 u32 *ptr;
312 int i;
313 int limit = PAGE_SIZE / sizeof(int);
314
315 if (page_address(page))
316 ptr = (u32 *)page_address(page);
317 else
318 ptr = (u32 *)__va(pfn << PAGE_SHIFT);
319 /*
320 * When cloning the root in non-PAE mode, only the userspace
321 * pdes need to be zeroed.
322 */
323 if (type & VMI_PAGE_CLONE)
324 limit = KERNEL_PGD_BOUNDARY;
325 for (i = 0; i < limit; i++)
326 BUG_ON(ptr[i]);
327}
328
329/*
330 * We stash the page type into struct page so we can verify the page
331 * types are used properly.
332 */
333static void vmi_set_page_type(u32 pfn, int type)
334{
335 /* PAE can have multiple roots per page - don't track */
336 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
337 return;
338
339 if (boot_allocations_applied) {
340 struct page *page = pfn_to_page(pfn);
341 if (type != VMI_PAGE_NORMAL)
342 BUG_ON(page->type);
343 else
344 BUG_ON(page->type == VMI_PAGE_NORMAL);
345 page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
346 if (type & VMI_PAGE_ZEROED)
347 check_zeroed_page(pfn, type, page);
348 } else {
349 record_page_type(pfn, type);
350 }
351}
352
353static void vmi_check_page_type(u32 pfn, int type)
354{
355 /* PAE can have multiple roots per page - skip checks */
356 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
357 return;
358
359 type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
360 if (boot_allocations_applied) {
361 struct page *page = pfn_to_page(pfn);
362 BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
363 BUG_ON(type == VMI_PAGE_NORMAL && page->type);
364 BUG_ON((type & page->type) == 0);
365 }
366}
367#else
368#define vmi_set_page_type(p,t) do { } while (0)
369#define vmi_check_page_type(p,t) do { } while (0)
370#endif
371
372#ifdef CONFIG_HIGHPTE 269#ifdef CONFIG_HIGHPTE
373static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) 270static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
374{ 271{
@@ -395,7 +292,6 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
395 292
396static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) 293static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
397{ 294{
398 vmi_set_page_type(pfn, VMI_PAGE_L1);
399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 295 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
400} 296}
401 297
@@ -406,27 +302,22 @@ static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
406 * It is called only for swapper_pg_dir, which already has 302 * It is called only for swapper_pg_dir, which already has
407 * data on it. 303 * data on it.
408 */ 304 */
409 vmi_set_page_type(pfn, VMI_PAGE_L2);
410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 305 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
411} 306}
412 307
413static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) 308static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
414{ 309{
415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
416 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 310 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
418} 311}
419 312
420static void vmi_release_pte(unsigned long pfn) 313static void vmi_release_pte(unsigned long pfn)
421{ 314{
422 vmi_ops.release_page(pfn, VMI_PAGE_L1); 315 vmi_ops.release_page(pfn, VMI_PAGE_L1);
423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
424} 316}
425 317
426static void vmi_release_pmd(unsigned long pfn) 318static void vmi_release_pmd(unsigned long pfn)
427{ 319{
428 vmi_ops.release_page(pfn, VMI_PAGE_L2); 320 vmi_ops.release_page(pfn, VMI_PAGE_L2);
429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
430} 321}
431 322
432/* 323/*
@@ -450,26 +341,22 @@ static void vmi_release_pmd(unsigned long pfn)
450 341
451static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 342static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
452{ 343{
453 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
454 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 344 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
455} 345}
456 346
457static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 347static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
458{ 348{
459 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
460 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); 349 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
461} 350}
462 351
463static void vmi_set_pte(pte_t *ptep, pte_t pte) 352static void vmi_set_pte(pte_t *ptep, pte_t pte)
464{ 353{
465 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ 354 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
466 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
467 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); 355 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
468} 356}
469 357
470static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) 358static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
471{ 359{
472 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
473 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 360 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
474} 361}
475 362
@@ -477,10 +364,8 @@ static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
477{ 364{
478#ifdef CONFIG_X86_PAE 365#ifdef CONFIG_X86_PAE
479 const pte_t pte = { .pte = pmdval.pmd }; 366 const pte_t pte = { .pte = pmdval.pmd };
480 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
481#else 367#else
482 const pte_t pte = { pmdval.pud.pgd.pgd }; 368 const pte_t pte = { pmdval.pud.pgd.pgd };
483 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
484#endif 369#endif
485 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); 370 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
486} 371}
@@ -502,7 +387,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
502 387
503static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) 388static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
504{ 389{
505 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
506 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); 390 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
507} 391}
508 392
@@ -510,21 +394,18 @@ static void vmi_set_pud(pud_t *pudp, pud_t pudval)
510{ 394{
511 /* Um, eww */ 395 /* Um, eww */
512 const pte_t pte = { .pte = pudval.pgd.pgd }; 396 const pte_t pte = { .pte = pudval.pgd.pgd };
513 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
514 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); 397 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
515} 398}
516 399
517static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 400static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
518{ 401{
519 const pte_t pte = { .pte = 0 }; 402 const pte_t pte = { .pte = 0 };
520 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
521 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 403 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
522} 404}
523 405
524static void vmi_pmd_clear(pmd_t *pmd) 406static void vmi_pmd_clear(pmd_t *pmd)
525{ 407{
526 const pte_t pte = { .pte = 0 }; 408 const pte_t pte = { .pte = 0 };
527 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
528 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); 409 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
529} 410}
530#endif 411#endif
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index a9b8560adbc2..82c67559dde7 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -44,6 +44,7 @@ SECTIONS
44 SCHED_TEXT 44 SCHED_TEXT
45 LOCK_TEXT 45 LOCK_TEXT
46 KPROBES_TEXT 46 KPROBES_TEXT
47 IRQENTRY_TEXT
47 *(.fixup) 48 *(.fixup)
48 *(.gnu.warning) 49 *(.gnu.warning)
49 _etext = .; /* End of text section */ 50 _etext = .; /* End of text section */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 46e05447405b..1a614c0e6bef 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -35,6 +35,7 @@ SECTIONS
35 SCHED_TEXT 35 SCHED_TEXT
36 LOCK_TEXT 36 LOCK_TEXT
37 KPROBES_TEXT 37 KPROBES_TEXT
38 IRQENTRY_TEXT
38 *(.fixup) 39 *(.fixup)
39 *(.gnu.warning) 40 *(.gnu.warning)
40 _etext = .; /* End of text section */ 41 _etext = .; /* End of text section */
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 6f3d3d4cd973..44153afc9067 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -131,7 +131,16 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
131 gettimeofday(tv,NULL); 131 gettimeofday(tv,NULL);
132 return; 132 return;
133 } 133 }
134
135 /*
136 * Surround the RDTSC by barriers, to make sure it's not
137 * speculated to outside the seqlock critical section and
138 * does not cause time warps:
139 */
140 rdtsc_barrier();
134 now = vread(); 141 now = vread();
142 rdtsc_barrier();
143
135 base = __vsyscall_gtod_data.clock.cycle_last; 144 base = __vsyscall_gtod_data.clock.cycle_last;
136 mask = __vsyscall_gtod_data.clock.mask; 145 mask = __vsyscall_gtod_data.clock.mask;
137 mult = __vsyscall_gtod_data.clock.mult; 146 mult = __vsyscall_gtod_data.clock.mult;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 104c8220a383..a7ed208f81e3 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -590,7 +590,8 @@ static void __init lguest_init_IRQ(void)
590 * a straightforward 1 to 1 mapping, so force that here. */ 590 * a straightforward 1 to 1 mapping, so force that here. */
591 __get_cpu_var(vector_irq)[vector] = i; 591 __get_cpu_var(vector_irq)[vector] = i;
592 if (vector != SYSCALL_VECTOR) { 592 if (vector != SYSCALL_VECTOR) {
593 set_intr_gate(vector, interrupt[vector]); 593 set_intr_gate(vector,
594 interrupt[vector-FIRST_EXTERNAL_VECTOR]);
594 set_irq_chip_and_handler_name(i, &lguest_irq_controller, 595 set_irq_chip_and_handler_name(i, &lguest_irq_controller,
595 handle_level_irq, 596 handle_level_irq,
596 "level"); 597 "level");
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 5c7cef34c9e7..10b9bd35a8ff 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -30,21 +30,6 @@ ENTRY(lguest_entry)
30 movl $lguest_data - __PAGE_OFFSET, %edx 30 movl $lguest_data - __PAGE_OFFSET, %edx
31 int $LGUEST_TRAP_ENTRY 31 int $LGUEST_TRAP_ENTRY
32 32
33 /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
34 * instruction uses %esi implicitly as the source for the copy we're
35 * about to do. */
36 movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
37
38 /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
39 * This means the first 128M of kernel memory will be mapped at
40 * PAGE_OFFSET where the kernel expects to run. This will get it far
41 * enough through boot to switch to its own pagetables. */
42 movl $32, %ecx
43 movl %esi, %edi
44 addl $((__PAGE_OFFSET >> 22) * 4), %edi
45 rep
46 movsl
47
48 /* Set up the initial stack so we can run C code. */ 33 /* Set up the initial stack so we can run C code. */
49 movl $(init_thread_union+THREAD_SIZE),%esp 34 movl $(init_thread_union+THREAD_SIZE),%esp
50 35
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 21e996a70d68..57ec8c86a877 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -393,7 +393,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
393 if (pte && pte_present(*pte) && !pte_exec(*pte)) 393 if (pte && pte_present(*pte) && !pte_exec(*pte))
394 printk(KERN_CRIT "kernel tried to execute " 394 printk(KERN_CRIT "kernel tried to execute "
395 "NX-protected page - exploit attempt? " 395 "NX-protected page - exploit attempt? "
396 "(uid: %d)\n", current->uid); 396 "(uid: %d)\n", current_uid());
397 } 397 }
398#endif 398#endif
399 399
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c483f4242079..8655b5bb0963 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -21,6 +21,7 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/highmem.h> 22#include <linux/highmem.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/pci.h>
24#include <linux/pfn.h> 25#include <linux/pfn.h>
25#include <linux/poison.h> 26#include <linux/poison.h>
26#include <linux/bootmem.h> 27#include <linux/bootmem.h>
@@ -67,7 +68,7 @@ static unsigned long __meminitdata table_top;
67 68
68static int __initdata after_init_bootmem; 69static int __initdata after_init_bootmem;
69 70
70static __init void *alloc_low_page(unsigned long *phys) 71static __init void *alloc_low_page(void)
71{ 72{
72 unsigned long pfn = table_end++; 73 unsigned long pfn = table_end++;
73 void *adr; 74 void *adr;
@@ -77,7 +78,6 @@ static __init void *alloc_low_page(unsigned long *phys)
77 78
78 adr = __va(pfn * PAGE_SIZE); 79 adr = __va(pfn * PAGE_SIZE);
79 memset(adr, 0, PAGE_SIZE); 80 memset(adr, 0, PAGE_SIZE);
80 *phys = pfn * PAGE_SIZE;
81 return adr; 81 return adr;
82} 82}
83 83
@@ -92,16 +92,17 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
92 pmd_t *pmd_table; 92 pmd_t *pmd_table;
93 93
94#ifdef CONFIG_X86_PAE 94#ifdef CONFIG_X86_PAE
95 unsigned long phys;
96 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 95 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
97 if (after_init_bootmem) 96 if (after_init_bootmem)
98 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); 97 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
99 else 98 else
100 pmd_table = (pmd_t *)alloc_low_page(&phys); 99 pmd_table = (pmd_t *)alloc_low_page();
101 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 100 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
102 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 101 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
103 pud = pud_offset(pgd, 0); 102 pud = pud_offset(pgd, 0);
104 BUG_ON(pmd_table != pmd_offset(pud, 0)); 103 BUG_ON(pmd_table != pmd_offset(pud, 0));
104
105 return pmd_table;
105 } 106 }
106#endif 107#endif
107 pud = pud_offset(pgd, 0); 108 pud = pud_offset(pgd, 0);
@@ -126,10 +127,8 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
126 if (!page_table) 127 if (!page_table)
127 page_table = 128 page_table =
128 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); 129 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
129 } else { 130 } else
130 unsigned long phys; 131 page_table = (pte_t *)alloc_low_page();
131 page_table = (pte_t *)alloc_low_page(&phys);
132 }
133 132
134 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); 133 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
135 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); 134 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -969,7 +968,7 @@ void __init mem_init(void)
969 int codesize, reservedpages, datasize, initsize; 968 int codesize, reservedpages, datasize, initsize;
970 int tmp; 969 int tmp;
971 970
972 start_periodic_check_for_corruption(); 971 pci_iommu_alloc();
973 972
974#ifdef CONFIG_FLATMEM 973#ifdef CONFIG_FLATMEM
975 BUG_ON(!mem_map); 974 BUG_ON(!mem_map);
@@ -1040,11 +1039,25 @@ void __init mem_init(void)
1040 (unsigned long)&_text, (unsigned long)&_etext, 1039 (unsigned long)&_text, (unsigned long)&_etext,
1041 ((unsigned long)&_etext - (unsigned long)&_text) >> 10); 1040 ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
1042 1041
1042 /*
1043 * Check boundaries twice: Some fundamental inconsistencies can
1044 * be detected at build time already.
1045 */
1046#define __FIXADDR_TOP (-PAGE_SIZE)
1047#ifdef CONFIG_HIGHMEM
1048 BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
1049 BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
1050#endif
1051#define high_memory (-128UL << 20)
1052 BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
1053#undef high_memory
1054#undef __FIXADDR_TOP
1055
1043#ifdef CONFIG_HIGHMEM 1056#ifdef CONFIG_HIGHMEM
1044 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); 1057 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
1045 BUG_ON(VMALLOC_END > PKMAP_BASE); 1058 BUG_ON(VMALLOC_END > PKMAP_BASE);
1046#endif 1059#endif
1047 BUG_ON(VMALLOC_START > VMALLOC_END); 1060 BUG_ON(VMALLOC_START >= VMALLOC_END);
1048 BUG_ON((unsigned long)high_memory > VMALLOC_START); 1061 BUG_ON((unsigned long)high_memory > VMALLOC_START);
1049 1062
1050 if (boot_cpu_data.wp_works_ok < 0) 1063 if (boot_cpu_data.wp_works_ok < 0)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9db01db6e3cd..9f7a0d24d42a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -902,8 +902,6 @@ void __init mem_init(void)
902 long codesize, reservedpages, datasize, initsize; 902 long codesize, reservedpages, datasize, initsize;
903 unsigned long absent_pages; 903 unsigned long absent_pages;
904 904
905 start_periodic_check_for_corruption();
906
907 pci_iommu_alloc(); 905 pci_iommu_alloc();
908 906
909 /* clear_bss() already clear the empty_zero_page */ 907 /* clear_bss() already clear the empty_zero_page */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index d4c4307ff3e0..bd85d42819e1 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -223,7 +223,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
223 * Check if the request spans more than any BAR in the iomem resource 223 * Check if the request spans more than any BAR in the iomem resource
224 * tree. 224 * tree.
225 */ 225 */
226 WARN_ON(iomem_map_sanity_check(phys_addr, size)); 226 WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
227 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
227 228
228 /* 229 /*
229 * Don't allow anybody to remap normal RAM that we're using.. 230 * Don't allow anybody to remap normal RAM that we're using..
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index eb1bf000d12e..85cbd3cd3723 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -596,6 +596,242 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
596 free_memtype(addr, addr + size); 596 free_memtype(addr, addr + size);
597} 597}
598 598
599/*
600 * Internal interface to reserve a range of physical memory with prot.
601 * Reserved non RAM regions only and after successful reserve_memtype,
602 * this func also keeps identity mapping (if any) in sync with this new prot.
603 */
604static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t vma_prot)
605{
606 int is_ram = 0;
607 int id_sz, ret;
608 unsigned long flags;
609 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
610
611 is_ram = pagerange_is_ram(paddr, paddr + size);
612
613 if (is_ram != 0) {
614 /*
615 * For mapping RAM pages, drivers need to call
616 * set_memory_[uc|wc|wb] directly, for reserve and free, before
617 * setting up the PTE.
618 */
619 WARN_ON_ONCE(1);
620 return 0;
621 }
622
623 ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
624 if (ret)
625 return ret;
626
627 if (flags != want_flags) {
628 free_memtype(paddr, paddr + size);
629 printk(KERN_ERR
630 "%s:%d map pfn expected mapping type %s for %Lx-%Lx, got %s\n",
631 current->comm, current->pid,
632 cattr_name(want_flags),
633 (unsigned long long)paddr,
634 (unsigned long long)(paddr + size),
635 cattr_name(flags));
636 return -EINVAL;
637 }
638
639 /* Need to keep identity mapping in sync */
640 if (paddr >= __pa(high_memory))
641 return 0;
642
643 id_sz = (__pa(high_memory) < paddr + size) ?
644 __pa(high_memory) - paddr :
645 size;
646
647 if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) {
648 free_memtype(paddr, paddr + size);
649 printk(KERN_ERR
650 "%s:%d reserve_pfn_range ioremap_change_attr failed %s "
651 "for %Lx-%Lx\n",
652 current->comm, current->pid,
653 cattr_name(flags),
654 (unsigned long long)paddr,
655 (unsigned long long)(paddr + size));
656 return -EINVAL;
657 }
658 return 0;
659}
660
661/*
662 * Internal interface to free a range of physical memory.
663 * Frees non RAM regions only.
664 */
665static void free_pfn_range(u64 paddr, unsigned long size)
666{
667 int is_ram;
668
669 is_ram = pagerange_is_ram(paddr, paddr + size);
670 if (is_ram == 0)
671 free_memtype(paddr, paddr + size);
672}
673
674/*
675 * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
676 * copied through copy_page_range().
677 *
678 * If the vma has a linear pfn mapping for the entire range, we get the prot
679 * from pte and reserve the entire vma range with single reserve_pfn_range call.
680 * Otherwise, we reserve the entire vma range, my ging through the PTEs page
681 * by page to get physical address and protection.
682 */
683int track_pfn_vma_copy(struct vm_area_struct *vma)
684{
685 int retval = 0;
686 unsigned long i, j;
687 resource_size_t paddr;
688 unsigned long prot;
689 unsigned long vma_start = vma->vm_start;
690 unsigned long vma_end = vma->vm_end;
691 unsigned long vma_size = vma_end - vma_start;
692
693 if (!pat_enabled)
694 return 0;
695
696 if (is_linear_pfn_mapping(vma)) {
697 /*
698 * reserve the whole chunk covered by vma. We need the
699 * starting address and protection from pte.
700 */
701 if (follow_phys(vma, vma_start, 0, &prot, &paddr)) {
702 WARN_ON_ONCE(1);
703 return -EINVAL;
704 }
705 return reserve_pfn_range(paddr, vma_size, __pgprot(prot));
706 }
707
708 /* reserve entire vma page by page, using pfn and prot from pte */
709 for (i = 0; i < vma_size; i += PAGE_SIZE) {
710 if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
711 continue;
712
713 retval = reserve_pfn_range(paddr, PAGE_SIZE, __pgprot(prot));
714 if (retval)
715 goto cleanup_ret;
716 }
717 return 0;
718
719cleanup_ret:
720 /* Reserve error: Cleanup partial reservation and return error */
721 for (j = 0; j < i; j += PAGE_SIZE) {
722 if (follow_phys(vma, vma_start + j, 0, &prot, &paddr))
723 continue;
724
725 free_pfn_range(paddr, PAGE_SIZE);
726 }
727
728 return retval;
729}
730
731/*
732 * track_pfn_vma_new is called when a _new_ pfn mapping is being established
733 * for physical range indicated by pfn and size.
734 *
735 * prot is passed in as a parameter for the new mapping. If the vma has a
736 * linear pfn mapping for the entire range reserve the entire vma range with
737 * single reserve_pfn_range call.
738 * Otherwise, we look t the pfn and size and reserve only the specified range
739 * page by page.
740 *
741 * Note that this function can be called with caller trying to map only a
742 * subrange/page inside the vma.
743 */
744int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot,
745 unsigned long pfn, unsigned long size)
746{
747 int retval = 0;
748 unsigned long i, j;
749 resource_size_t base_paddr;
750 resource_size_t paddr;
751 unsigned long vma_start = vma->vm_start;
752 unsigned long vma_end = vma->vm_end;
753 unsigned long vma_size = vma_end - vma_start;
754
755 if (!pat_enabled)
756 return 0;
757
758 if (is_linear_pfn_mapping(vma)) {
759 /* reserve the whole chunk starting from vm_pgoff */
760 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
761 return reserve_pfn_range(paddr, vma_size, prot);
762 }
763
764 /* reserve page by page using pfn and size */
765 base_paddr = (resource_size_t)pfn << PAGE_SHIFT;
766 for (i = 0; i < size; i += PAGE_SIZE) {
767 paddr = base_paddr + i;
768 retval = reserve_pfn_range(paddr, PAGE_SIZE, prot);
769 if (retval)
770 goto cleanup_ret;
771 }
772 return 0;
773
774cleanup_ret:
775 /* Reserve error: Cleanup partial reservation and return error */
776 for (j = 0; j < i; j += PAGE_SIZE) {
777 paddr = base_paddr + j;
778 free_pfn_range(paddr, PAGE_SIZE);
779 }
780
781 return retval;
782}
783
784/*
785 * untrack_pfn_vma is called while unmapping a pfnmap for a region.
786 * untrack can be called for a specific region indicated by pfn and size or
787 * can be for the entire vma (in which case size can be zero).
788 */
789void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
790 unsigned long size)
791{
792 unsigned long i;
793 resource_size_t paddr;
794 unsigned long prot;
795 unsigned long vma_start = vma->vm_start;
796 unsigned long vma_end = vma->vm_end;
797 unsigned long vma_size = vma_end - vma_start;
798
799 if (!pat_enabled)
800 return;
801
802 if (is_linear_pfn_mapping(vma)) {
803 /* free the whole chunk starting from vm_pgoff */
804 paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
805 free_pfn_range(paddr, vma_size);
806 return;
807 }
808
809 if (size != 0 && size != vma_size) {
810 /* free page by page, using pfn and size */
811 paddr = (resource_size_t)pfn << PAGE_SHIFT;
812 for (i = 0; i < size; i += PAGE_SIZE) {
813 paddr = paddr + i;
814 free_pfn_range(paddr, PAGE_SIZE);
815 }
816 } else {
817 /* free entire vma, page by page, using the pfn from pte */
818 for (i = 0; i < vma_size; i += PAGE_SIZE) {
819 if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
820 continue;
821
822 free_pfn_range(paddr, PAGE_SIZE);
823 }
824 }
825}
826
827pgprot_t pgprot_writecombine(pgprot_t prot)
828{
829 if (pat_enabled)
830 return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
831 else
832 return pgprot_noncached(prot);
833}
834
599#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) 835#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
600 836
601/* get Nth element of the linked list */ 837/* get Nth element of the linked list */
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 509513760a6e..98658f25f542 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -65,11 +65,13 @@ static unsigned long reset_value[NUM_COUNTERS];
65#define IBS_FETCH_BEGIN 3 65#define IBS_FETCH_BEGIN 3
66#define IBS_OP_BEGIN 4 66#define IBS_OP_BEGIN 4
67 67
68/* The function interface needs to be fixed, something like add 68/*
69 data. Should then be added to linux/oprofile.h. */ 69 * The function interface needs to be fixed, something like add
70 * data. Should then be added to linux/oprofile.h.
71 */
70extern void 72extern void
71oprofile_add_ibs_sample(struct pt_regs *const regs, 73oprofile_add_ibs_sample(struct pt_regs * const regs,
72 unsigned int *const ibs_sample, int ibs_code); 74 unsigned int * const ibs_sample, int ibs_code);
73 75
74struct ibs_fetch_sample { 76struct ibs_fetch_sample {
75 /* MSRC001_1031 IBS Fetch Linear Address Register */ 77 /* MSRC001_1031 IBS Fetch Linear Address Register */
@@ -104,11 +106,6 @@ struct ibs_op_sample {
104 unsigned int ibs_dc_phys_high; 106 unsigned int ibs_dc_phys_high;
105}; 107};
106 108
107/*
108 * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+
109*/
110static void clear_ibs_nmi(void);
111
112static int ibs_allowed; /* AMD Family10h and later */ 109static int ibs_allowed; /* AMD Family10h and later */
113 110
114struct op_ibs_config { 111struct op_ibs_config {
@@ -223,7 +220,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
223 (unsigned int *)&ibs_fetch, 220 (unsigned int *)&ibs_fetch,
224 IBS_FETCH_BEGIN); 221 IBS_FETCH_BEGIN);
225 222
226 /*reenable the IRQ */ 223 /* reenable the IRQ */
227 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); 224 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
228 high &= ~IBS_FETCH_HIGH_VALID_BIT; 225 high &= ~IBS_FETCH_HIGH_VALID_BIT;
229 high |= IBS_FETCH_HIGH_ENABLE; 226 high |= IBS_FETCH_HIGH_ENABLE;
@@ -331,8 +328,10 @@ static void op_amd_stop(struct op_msrs const * const msrs)
331 unsigned int low, high; 328 unsigned int low, high;
332 int i; 329 int i;
333 330
334 /* Subtle: stop on all counters to avoid race with 331 /*
335 * setting our pm callback */ 332 * Subtle: stop on all counters to avoid race with setting our
333 * pm callback
334 */
336 for (i = 0 ; i < NUM_COUNTERS ; ++i) { 335 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
337 if (!reset_value[i]) 336 if (!reset_value[i])
338 continue; 337 continue;
@@ -343,13 +342,15 @@ static void op_amd_stop(struct op_msrs const * const msrs)
343 342
344#ifdef CONFIG_OPROFILE_IBS 343#ifdef CONFIG_OPROFILE_IBS
345 if (ibs_allowed && ibs_config.fetch_enabled) { 344 if (ibs_allowed && ibs_config.fetch_enabled) {
346 low = 0; /* clear max count and enable */ 345 /* clear max count and enable */
346 low = 0;
347 high = 0; 347 high = 0;
348 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); 348 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
349 } 349 }
350 350
351 if (ibs_allowed && ibs_config.op_enabled) { 351 if (ibs_allowed && ibs_config.op_enabled) {
352 low = 0; /* clear max count and enable */ 352 /* clear max count and enable */
353 low = 0;
353 high = 0; 354 high = 0;
354 wrmsr(MSR_AMD64_IBSOPCTL, low, high); 355 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
355 } 356 }
@@ -370,18 +371,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
370 } 371 }
371} 372}
372 373
373#ifndef CONFIG_OPROFILE_IBS 374#ifdef CONFIG_OPROFILE_IBS
374
375/* no IBS support */
376
377static int op_amd_init(struct oprofile_operations *ops)
378{
379 return 0;
380}
381
382static void op_amd_exit(void) {}
383
384#else
385 375
386static u8 ibs_eilvt_off; 376static u8 ibs_eilvt_off;
387 377
@@ -395,7 +385,7 @@ static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
395 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); 385 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
396} 386}
397 387
398static int pfm_amd64_setup_eilvt(void) 388static int init_ibs_nmi(void)
399{ 389{
400#define IBSCTL_LVTOFFSETVAL (1 << 8) 390#define IBSCTL_LVTOFFSETVAL (1 << 8)
401#define IBSCTL 0x1cc 391#define IBSCTL 0x1cc
@@ -443,18 +433,22 @@ static int pfm_amd64_setup_eilvt(void)
443 return 0; 433 return 0;
444} 434}
445 435
446/* 436/* uninitialize the APIC for the IBS interrupts if needed */
447 * initialize the APIC for the IBS interrupts 437static void clear_ibs_nmi(void)
448 * if available (AMD Family10h rev B0 and later) 438{
449 */ 439 if (ibs_allowed)
450static void setup_ibs(void) 440 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
441}
442
443/* initialize the APIC for the IBS interrupts if available */
444static void ibs_init(void)
451{ 445{
452 ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); 446 ibs_allowed = boot_cpu_has(X86_FEATURE_IBS);
453 447
454 if (!ibs_allowed) 448 if (!ibs_allowed)
455 return; 449 return;
456 450
457 if (pfm_amd64_setup_eilvt()) { 451 if (init_ibs_nmi()) {
458 ibs_allowed = 0; 452 ibs_allowed = 0;
459 return; 453 return;
460 } 454 }
@@ -462,14 +456,12 @@ static void setup_ibs(void)
462 printk(KERN_INFO "oprofile: AMD IBS detected\n"); 456 printk(KERN_INFO "oprofile: AMD IBS detected\n");
463} 457}
464 458
465 459static void ibs_exit(void)
466/*
467 * unitialize the APIC for the IBS interrupts if needed on AMD Family10h
468 * rev B0 and later */
469static void clear_ibs_nmi(void)
470{ 460{
471 if (ibs_allowed) 461 if (!ibs_allowed)
472 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); 462 return;
463
464 clear_ibs_nmi();
473} 465}
474 466
475static int (*create_arch_files)(struct super_block *sb, struct dentry *root); 467static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
@@ -519,7 +511,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
519 511
520static int op_amd_init(struct oprofile_operations *ops) 512static int op_amd_init(struct oprofile_operations *ops)
521{ 513{
522 setup_ibs(); 514 ibs_init();
523 create_arch_files = ops->create_files; 515 create_arch_files = ops->create_files;
524 ops->create_files = setup_ibs_files; 516 ops->create_files = setup_ibs_files;
525 return 0; 517 return 0;
@@ -527,10 +519,21 @@ static int op_amd_init(struct oprofile_operations *ops)
527 519
528static void op_amd_exit(void) 520static void op_amd_exit(void)
529{ 521{
530 clear_ibs_nmi(); 522 ibs_exit();
531} 523}
532 524
533#endif 525#else
526
527/* no IBS support */
528
529static int op_amd_init(struct oprofile_operations *ops)
530{
531 return 0;
532}
533
534static void op_amd_exit(void) {}
535
536#endif /* CONFIG_OPROFILE_IBS */
534 537
535struct op_x86_model_spec const op_amd_spec = { 538struct op_x86_model_spec const op_amd_spec = {
536 .init = op_amd_init, 539 .init = op_amd_init,
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index b67732bbb85a..bb1a01f089e2 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -23,6 +23,12 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
23unsigned int pci_early_dump_regs; 23unsigned int pci_early_dump_regs;
24static int pci_bf_sort; 24static int pci_bf_sort;
25int pci_routeirq; 25int pci_routeirq;
26int noioapicquirk;
27#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
28int noioapicreroute = 0;
29#else
30int noioapicreroute = 1;
31#endif
26int pcibios_last_bus = -1; 32int pcibios_last_bus = -1;
27unsigned long pirq_table_addr; 33unsigned long pirq_table_addr;
28struct pci_bus *pci_root_bus; 34struct pci_bus *pci_root_bus;
@@ -519,6 +525,17 @@ char * __devinit pcibios_setup(char *str)
519 } else if (!strcmp(str, "skip_isa_align")) { 525 } else if (!strcmp(str, "skip_isa_align")) {
520 pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; 526 pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
521 return NULL; 527 return NULL;
528 } else if (!strcmp(str, "noioapicquirk")) {
529 noioapicquirk = 1;
530 return NULL;
531 } else if (!strcmp(str, "ioapicreroute")) {
532 if (noioapicreroute != -1)
533 noioapicreroute = 0;
534 return NULL;
535 } else if (!strcmp(str, "noioapicreroute")) {
536 if (noioapicreroute != -1)
537 noioapicreroute = 1;
538 return NULL;
522 } 539 }
523 return str; 540 return str;
524} 541}
diff --git a/arch/x86/scripts/strip-symbols b/arch/x86/scripts/strip-symbols
new file mode 100644
index 000000000000..a2f1ccb827c7
--- /dev/null
+++ b/arch/x86/scripts/strip-symbols
@@ -0,0 +1 @@
__cpu_vendor_dev_X86_VENDOR_*
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 513f330c5832..1241f118ab56 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -310,7 +310,7 @@ int __init sysenter_setup(void)
310} 310}
311 311
312/* Setup a VMA at program startup for the vsyscall page */ 312/* Setup a VMA at program startup for the vsyscall page */
313int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) 313int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
314{ 314{
315 struct mm_struct *mm = current->mm; 315 struct mm_struct *mm = current->mm;
316 unsigned long addr; 316 unsigned long addr;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 257ba4a10abf..9c98cc6ba978 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -98,7 +98,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
98 98
99/* Setup a VMA at program startup for the vsyscall page. 99/* Setup a VMA at program startup for the vsyscall page.
100 Not called for compat tasks */ 100 Not called for compat tasks */
101int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) 101int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
102{ 102{
103 struct mm_struct *mm = current->mm; 103 struct mm_struct *mm = current->mm;
104 unsigned long addr; 104 unsigned long addr;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5e4686d70f62..bea215230b20 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -28,6 +28,7 @@
28#include <linux/console.h> 28#include <linux/console.h>
29 29
30#include <xen/interface/xen.h> 30#include <xen/interface/xen.h>
31#include <xen/interface/version.h>
31#include <xen/interface/physdev.h> 32#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h> 33#include <xen/interface/vcpu.h>
33#include <xen/features.h> 34#include <xen/features.h>
@@ -793,7 +794,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
793 794
794 ret = 0; 795 ret = 0;
795 796
796 switch(msr) { 797 switch (msr) {
797#ifdef CONFIG_X86_64 798#ifdef CONFIG_X86_64
798 unsigned which; 799 unsigned which;
799 u64 base; 800 u64 base;
@@ -1453,7 +1454,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1453 1454
1454 ident_pte = 0; 1455 ident_pte = 0;
1455 pfn = 0; 1456 pfn = 0;
1456 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { 1457 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1457 pte_t *pte_page; 1458 pte_t *pte_page;
1458 1459
1459 /* Reuse or allocate a page of ptes */ 1460 /* Reuse or allocate a page of ptes */
@@ -1471,7 +1472,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1471 } 1472 }
1472 1473
1473 /* Install mappings */ 1474 /* Install mappings */
1474 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { 1475 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1475 pte_t pte; 1476 pte_t pte;
1476 1477
1477 if (pfn > max_pfn_mapped) 1478 if (pfn > max_pfn_mapped)
@@ -1485,7 +1486,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1485 } 1486 }
1486 } 1487 }
1487 1488
1488 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) 1489 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1489 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); 1490 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1490 1491
1491 set_page_prot(pmd, PAGE_KERNEL_RO); 1492 set_page_prot(pmd, PAGE_KERNEL_RO);
@@ -1499,7 +1500,7 @@ static void convert_pfn_mfn(void *v)
1499 1500
1500 /* All levels are converted the same way, so just treat them 1501 /* All levels are converted the same way, so just treat them
1501 as ptes. */ 1502 as ptes. */
1502 for(i = 0; i < PTRS_PER_PTE; i++) 1503 for (i = 0; i < PTRS_PER_PTE; i++)
1503 pte[i] = xen_make_pte(pte[i].pte); 1504 pte[i] = xen_make_pte(pte[i].pte);
1504} 1505}
1505 1506
@@ -1514,7 +1515,8 @@ static void convert_pfn_mfn(void *v)
1514 * of the physical mapping once some sort of allocator has been set 1515 * of the physical mapping once some sort of allocator has been set
1515 * up. 1516 * up.
1516 */ 1517 */
1517static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1518static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1519 unsigned long max_pfn)
1518{ 1520{
1519 pud_t *l3; 1521 pud_t *l3;
1520 pmd_t *l2; 1522 pmd_t *l2;
@@ -1577,7 +1579,8 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf
1577#else /* !CONFIG_X86_64 */ 1579#else /* !CONFIG_X86_64 */
1578static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; 1580static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1579 1581
1580static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1582static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1583 unsigned long max_pfn)
1581{ 1584{
1582 pmd_t *kernel_pmd; 1585 pmd_t *kernel_pmd;
1583 1586
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index e59e53b11e2b..503c240e26c7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -154,13 +154,13 @@ void xen_setup_mfn_list_list(void)
154{ 154{
155 unsigned pfn, idx; 155 unsigned pfn, idx;
156 156
157 for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { 157 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
158 unsigned topidx = p2m_top_index(pfn); 158 unsigned topidx = p2m_top_index(pfn);
159 159
160 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); 160 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
161 } 161 }
162 162
163 for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { 163 for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
164 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; 164 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
165 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); 165 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
166 } 166 }
@@ -179,7 +179,7 @@ void __init xen_build_dynamic_phys_to_machine(void)
179 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); 179 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
180 unsigned pfn; 180 unsigned pfn;
181 181
182 for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { 182 for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
183 unsigned topidx = p2m_top_index(pfn); 183 unsigned topidx = p2m_top_index(pfn);
184 184
185 p2m_top[topidx] = &mfn_list[pfn]; 185 p2m_top[topidx] = &mfn_list[pfn];
@@ -207,7 +207,7 @@ static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
207 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); 207 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
208 BUG_ON(p == NULL); 208 BUG_ON(p == NULL);
209 209
210 for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++) 210 for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
211 p[i] = INVALID_P2M_ENTRY; 211 p[i] = INVALID_P2M_ENTRY;
212 212
213 if (cmpxchg(pp, p2m_missing, p) != p2m_missing) 213 if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
@@ -407,7 +407,8 @@ out:
407 preempt_enable(); 407 preempt_enable();
408} 408}
409 409
410pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 410pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
411 unsigned long addr, pte_t *ptep)
411{ 412{
412 /* Just return the pte as-is. We preserve the bits on commit */ 413 /* Just return the pte as-is. We preserve the bits on commit */
413 return *ptep; 414 return *ptep;
@@ -878,7 +879,8 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
878 879
879 if (user_pgd) { 880 if (user_pgd) {
880 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); 881 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
881 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); 882 xen_do_pin(MMUEXT_PIN_L4_TABLE,
883 PFN_DOWN(__pa(user_pgd)));
882 } 884 }
883 } 885 }
884#else /* CONFIG_X86_32 */ 886#else /* CONFIG_X86_32 */
@@ -993,7 +995,8 @@ static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
993 pgd_t *user_pgd = xen_get_user_pgd(pgd); 995 pgd_t *user_pgd = xen_get_user_pgd(pgd);
994 996
995 if (user_pgd) { 997 if (user_pgd) {
996 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); 998 xen_do_pin(MMUEXT_UNPIN_TABLE,
999 PFN_DOWN(__pa(user_pgd)));
997 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); 1000 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
998 } 1001 }
999 } 1002 }
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 8ea8a0d0b0de..c738644b5435 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -154,7 +154,7 @@ void xen_mc_flush(void)
154 ret, smp_processor_id()); 154 ret, smp_processor_id());
155 dump_stack(); 155 dump_stack();
156 for (i = 0; i < b->mcidx; i++) { 156 for (i = 0; i < b->mcidx; i++) {
157 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 157 printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
158 i+1, b->mcidx, 158 i+1, b->mcidx,
159 b->debug[i].op, 159 b->debug[i].op,
160 b->debug[i].args[0], 160 b->debug[i].args[0],
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index d67901083888..15c6c68db6a2 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -28,6 +28,9 @@
28/* These are code, but not functions. Defined in entry.S */ 28/* These are code, but not functions. Defined in entry.S */
29extern const char xen_hypervisor_callback[]; 29extern const char xen_hypervisor_callback[];
30extern const char xen_failsafe_callback[]; 30extern const char xen_failsafe_callback[];
31extern void xen_sysenter_target(void);
32extern void xen_syscall_target(void);
33extern void xen_syscall32_target(void);
31 34
32 35
33/** 36/**
@@ -110,7 +113,6 @@ static __cpuinit int register_callback(unsigned type, const void *func)
110 113
111void __cpuinit xen_enable_sysenter(void) 114void __cpuinit xen_enable_sysenter(void)
112{ 115{
113 extern void xen_sysenter_target(void);
114 int ret; 116 int ret;
115 unsigned sysenter_feature; 117 unsigned sysenter_feature;
116 118
@@ -132,8 +134,6 @@ void __cpuinit xen_enable_syscall(void)
132{ 134{
133#ifdef CONFIG_X86_64 135#ifdef CONFIG_X86_64
134 int ret; 136 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137 137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); 138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) { 139 if (ret != 0) {
@@ -160,7 +160,8 @@ void __init xen_arch_setup(void)
160 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); 160 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
161 161
162 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable,
164 VMASST_TYPE_pae_extended_cr3);
164 165
165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || 166 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) 167 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))