aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@woody.osdl.org>2006-12-07 11:59:11 -0500
committerLinus Torvalds <torvalds@woody.osdl.org>2006-12-07 11:59:11 -0500
commit4522d58275f124105819723e24e912c8e5bf3cdd (patch)
treeb92c29014fadffe049c1925676037f0092b8d112 /arch
parent6cf24f031bc97cb5a7c9df3b6e73c45b628b2b28 (diff)
parent64a26a731235b59c9d73bbe82c1f896d57400d37 (diff)
Merge branch 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6
* 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6: (156 commits) [PATCH] x86-64: Export smp_call_function_single [PATCH] i386: Clean up smp_tune_scheduling() [PATCH] unwinder: move .eh_frame to RODATA [PATCH] unwinder: fully support linker generated .eh_frame_hdr section [PATCH] x86-64: don't use set_irq_regs() [PATCH] x86-64: check vector in setup_ioapic_dest to verify if need setup_IO_APIC_irq [PATCH] x86-64: Make ix86 default to HIGHMEM4G instead of NOHIGHMEM [PATCH] i386: replace kmalloc+memset with kzalloc [PATCH] x86-64: remove remaining pc98 code [PATCH] x86-64: remove unused variable [PATCH] x86-64: Fix constraints in atomic_add_return() [PATCH] x86-64: fix asm constraints in i386 atomic_add_return [PATCH] x86-64: Correct documentation for bzImage protocol v2.05 [PATCH] x86-64: replace kmalloc+memset with kzalloc in MTRR code [PATCH] x86-64: Fix numaq build error [PATCH] x86-64: include/asm-x86_64/cpufeature.h isn't a userspace header [PATCH] unwinder: Add debugging output to the Dwarf2 unwinder [PATCH] x86-64: Clarify error message in GART code [PATCH] x86-64: Fix interrupt race in idle callback (3rd try) [PATCH] x86-64: Remove unwind stack pointer alignment forcing again ... Fixed conflict in include/linux/uaccess.h manually Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/h8300/kernel/vmlinux.lds.S1
-rw-r--r--arch/i386/Kconfig72
-rw-r--r--arch/i386/Kconfig.cpu19
-rw-r--r--arch/i386/Kconfig.debug10
-rw-r--r--arch/i386/Makefile8
-rw-r--r--arch/i386/Makefile.cpu1
-rw-r--r--arch/i386/boot/compressed/Makefile28
-rw-r--r--arch/i386/boot/compressed/head.S185
-rw-r--r--arch/i386/boot/compressed/misc.c264
-rw-r--r--arch/i386/boot/compressed/relocs.c625
-rw-r--r--arch/i386/boot/compressed/vmlinux.lds43
-rw-r--r--arch/i386/boot/compressed/vmlinux.scr3
-rw-r--r--arch/i386/boot/setup.S42
-rw-r--r--arch/i386/defconfig22
-rw-r--r--arch/i386/kernel/Makefile5
-rw-r--r--arch/i386/kernel/acpi/earlyquirk.c21
-rw-r--r--arch/i386/kernel/alternative.c63
-rw-r--r--arch/i386/kernel/apic.c22
-rw-r--r--arch/i386/kernel/apm.c3
-rw-r--r--arch/i386/kernel/asm-offsets.c39
-rw-r--r--arch/i386/kernel/cpu/amd.c5
-rw-r--r--arch/i386/kernel/cpu/common.c249
-rw-r--r--arch/i386/kernel/cpu/intel.c12
-rw-r--r--arch/i386/kernel/cpu/intel_cacheinfo.c11
-rw-r--r--arch/i386/kernel/cpu/mtrr/Makefile4
-rw-r--r--arch/i386/kernel/cpu/mtrr/amd.c2
-rw-r--r--arch/i386/kernel/cpu/mtrr/centaur.c9
-rw-r--r--arch/i386/kernel/cpu/mtrr/cyrix.c25
-rw-r--r--arch/i386/kernel/cpu/mtrr/generic.c78
-rw-r--r--arch/i386/kernel/cpu/mtrr/if.c31
-rw-r--r--arch/i386/kernel/cpu/mtrr/main.c71
-rw-r--r--arch/i386/kernel/cpu/mtrr/mtrr.h25
-rw-r--r--arch/i386/kernel/cpu/proc.c3
-rw-r--r--arch/i386/kernel/cpuid.c1
-rw-r--r--arch/i386/kernel/e820.c894
-rw-r--r--arch/i386/kernel/efi.c17
-rw-r--r--arch/i386/kernel/entry.S331
-rw-r--r--arch/i386/kernel/head.S66
-rw-r--r--arch/i386/kernel/hpet.c7
-rw-r--r--arch/i386/kernel/i8259.c5
-rw-r--r--arch/i386/kernel/io_apic.c65
-rw-r--r--arch/i386/kernel/ldt.c4
-rw-r--r--arch/i386/kernel/mca.c13
-rw-r--r--arch/i386/kernel/module.c11
-rw-r--r--arch/i386/kernel/mpparse.c2
-rw-r--r--arch/i386/kernel/msr.c3
-rw-r--r--arch/i386/kernel/nmi.c42
-rw-r--r--arch/i386/kernel/paravirt.c569
-rw-r--r--arch/i386/kernel/pci-dma.c6
-rw-r--r--arch/i386/kernel/process.c81
-rw-r--r--arch/i386/kernel/ptrace.c18
-rw-r--r--arch/i386/kernel/quirks.c46
-rw-r--r--arch/i386/kernel/setup.c852
-rw-r--r--arch/i386/kernel/signal.c6
-rw-r--r--arch/i386/kernel/smp.c6
-rw-r--r--arch/i386/kernel/smpboot.c69
-rw-r--r--arch/i386/kernel/sysenter.c4
-rw-r--r--arch/i386/kernel/time.c15
-rw-r--r--arch/i386/kernel/time_hpet.c15
-rw-r--r--arch/i386/kernel/topology.c8
-rw-r--r--arch/i386/kernel/traps.c118
-rw-r--r--arch/i386/kernel/tsc.c1
-rw-r--r--arch/i386/kernel/vm86.c121
-rw-r--r--arch/i386/kernel/vmlinux.lds.S147
-rw-r--r--arch/i386/mach-generic/probe.c4
-rw-r--r--arch/i386/mach-voyager/voyager_cat.c6
-rw-r--r--arch/i386/mach-voyager/voyager_smp.c14
-rw-r--r--arch/i386/math-emu/fpu_emu.h1
-rw-r--r--arch/i386/math-emu/fpu_entry.c3
-rw-r--r--arch/i386/math-emu/fpu_system.h1
-rw-r--r--arch/i386/math-emu/load_store.c2
-rw-r--r--arch/i386/math-emu/reg_ld_str.c15
-rw-r--r--arch/i386/mm/boot_ioremap.c1
-rw-r--r--arch/i386/mm/discontig.c2
-rw-r--r--arch/i386/mm/fault.c12
-rw-r--r--arch/i386/mm/init.c2
-rw-r--r--arch/i386/mm/pageattr.c24
-rw-r--r--arch/i386/mm/pgtable.c7
-rw-r--r--arch/i386/pci/early.c7
-rw-r--r--arch/i386/pci/irq.c4
-rw-r--r--arch/i386/pci/pcbios.c11
-rw-r--r--arch/i386/power/cpu.c8
-rw-r--r--arch/ia64/kernel/topology.c8
-rw-r--r--arch/m68knommu/kernel/vmlinux.lds.S1
-rw-r--r--arch/powerpc/kernel/sysfs.c8
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S1
-rw-r--r--arch/ppc/kernel/vmlinux.lds.S1
-rw-r--r--arch/sparc/kernel/vmlinux.lds.S1
-rw-r--r--arch/sparc64/kernel/vmlinux.lds.S1
-rw-r--r--arch/v850/kernel/vmlinux.lds.S1
-rw-r--r--arch/x86_64/Kconfig43
-rw-r--r--arch/x86_64/Makefile4
-rw-r--r--arch/x86_64/defconfig27
-rw-r--r--arch/x86_64/ia32/ia32_signal.c5
-rw-r--r--arch/x86_64/kernel/apic.c104
-rw-r--r--arch/x86_64/kernel/early-quirks.c18
-rw-r--r--arch/x86_64/kernel/entry.S36
-rw-r--r--arch/x86_64/kernel/genapic.c9
-rw-r--r--arch/x86_64/kernel/head64.c6
-rw-r--r--arch/x86_64/kernel/i387.c7
-rw-r--r--arch/x86_64/kernel/i8259.c3
-rw-r--r--arch/x86_64/kernel/io_apic.c258
-rw-r--r--arch/x86_64/kernel/irq.c2
-rw-r--r--arch/x86_64/kernel/mce.c1
-rw-r--r--arch/x86_64/kernel/mpparse.c2
-rw-r--r--arch/x86_64/kernel/nmi.c29
-rw-r--r--arch/x86_64/kernel/pci-calgary.c218
-rw-r--r--arch/x86_64/kernel/pci-dma.c5
-rw-r--r--arch/x86_64/kernel/pci-gart.c3
-rw-r--r--arch/x86_64/kernel/process.c45
-rw-r--r--arch/x86_64/kernel/setup.c21
-rw-r--r--arch/x86_64/kernel/smp.c1
-rw-r--r--arch/x86_64/kernel/smpboot.c8
-rw-r--r--arch/x86_64/kernel/traps.c55
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S9
-rw-r--r--arch/x86_64/kernel/vsyscall.c1
-rw-r--r--arch/x86_64/lib/csum-partial.c4
-rw-r--r--arch/x86_64/lib/delay.c4
-rw-r--r--arch/x86_64/mm/fault.c10
-rw-r--r--arch/x86_64/mm/init.c7
-rw-r--r--arch/x86_64/mm/pageattr.c58
121 files changed, 4507 insertions, 2186 deletions
diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S
index 756325dd480e..f05288be8878 100644
--- a/arch/h8300/kernel/vmlinux.lds.S
+++ b/arch/h8300/kernel/vmlinux.lds.S
@@ -70,6 +70,7 @@ SECTIONS
70#endif 70#endif
71 .text : 71 .text :
72 { 72 {
73 _text = .;
73#if defined(CONFIG_ROMKERNEL) 74#if defined(CONFIG_ROMKERNEL)
74 *(.int_redirect) 75 *(.int_redirect)
75#endif 76#endif
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 8ff1c6fb5aa1..ea70359b02d0 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -182,6 +182,17 @@ config X86_ES7000
182 182
183endchoice 183endchoice
184 184
185config PARAVIRT
186 bool "Paravirtualization support (EXPERIMENTAL)"
187 depends on EXPERIMENTAL
188 help
189 Paravirtualization is a way of running multiple instances of
190 Linux on the same machine, under a hypervisor. This option
191 changes the kernel so it can modify itself when it is run
192 under a hypervisor, improving performance significantly.
193 However, when run without a hypervisor the kernel is
194 theoretically slower. If in doubt, say N.
195
185config ACPI_SRAT 196config ACPI_SRAT
186 bool 197 bool
187 default y 198 default y
@@ -443,7 +454,8 @@ source "drivers/firmware/Kconfig"
443 454
444choice 455choice
445 prompt "High Memory Support" 456 prompt "High Memory Support"
446 default NOHIGHMEM 457 default HIGHMEM4G if !X86_NUMAQ
458 default HIGHMEM64G if X86_NUMAQ
447 459
448config NOHIGHMEM 460config NOHIGHMEM
449 bool "off" 461 bool "off"
@@ -710,20 +722,6 @@ config BOOT_IOREMAP
710 depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) 722 depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI))
711 default y 723 default y
712 724
713config REGPARM
714 bool "Use register arguments"
715 default y
716 help
717 Compile the kernel with -mregparm=3. This instructs gcc to use
718 a more efficient function call ABI which passes the first three
719 arguments of a function call via registers, which results in denser
720 and faster code.
721
722 If this option is disabled, then the default ABI of passing
723 arguments via the stack is used.
724
725 If unsure, say Y.
726
727config SECCOMP 725config SECCOMP
728 bool "Enable seccomp to safely compute untrusted bytecode" 726 bool "Enable seccomp to safely compute untrusted bytecode"
729 depends on PROC_FS 727 depends on PROC_FS
@@ -773,23 +771,39 @@ config CRASH_DUMP
773 PHYSICAL_START. 771 PHYSICAL_START.
774 For more details see Documentation/kdump/kdump.txt 772 For more details see Documentation/kdump/kdump.txt
775 773
776config PHYSICAL_START 774config RELOCATABLE
777 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 775 bool "Build a relocatable kernel(EXPERIMENTAL)"
776 depends on EXPERIMENTAL
777 help
778 This build a kernel image that retains relocation information
779 so it can be loaded someplace besides the default 1MB.
780 The relocations tend to the kernel binary about 10% larger,
781 but are discarded at runtime.
782
783 One use is for the kexec on panic case where the recovery kernel
784 must live at a different physical address than the primary
785 kernel.
778 786
779 default "0x1000000" if CRASH_DUMP 787config PHYSICAL_ALIGN
788 hex "Alignment value to which kernel should be aligned"
780 default "0x100000" 789 default "0x100000"
790 range 0x2000 0x400000
781 help 791 help
782 This gives the physical address where the kernel is loaded. Normally 792 This value puts the alignment restrictions on physical address
783 for regular kernels this value is 0x100000 (1MB). But in the case 793 where kernel is loaded and run from. Kernel is compiled for an
784 of kexec on panic the fail safe kernel needs to run at a different 794 address which meets above alignment restriction.
785 address than the panic-ed kernel. This option is used to set the load 795
786 address for kernels used to capture crash dump on being kexec'ed 796 If bootloader loads the kernel at a non-aligned address and
787 after panic. The default value for crash dump kernels is 797 CONFIG_RELOCATABLE is set, kernel will move itself to nearest
788 0x1000000 (16MB). This can also be set based on the "X" value as 798 address aligned to above value and run from there.
789 specified in the "crashkernel=YM@XM" command line boot parameter 799
790 passed to the panic-ed kernel. Typically this parameter is set as 800 If bootloader loads the kernel at a non-aligned address and
791 crashkernel=64M@16M. Please take a look at 801 CONFIG_RELOCATABLE is not set, kernel will ignore the run time
792 Documentation/kdump/kdump.txt for more details about crash dumps. 802 load address and decompress itself to the address it has been
803 compiled for and run from there. The address for which kernel is
804 compiled already meets above alignment restrictions. Hence the
805 end result is that kernel runs from a physical address meeting
806 above alignment restrictions.
793 807
794 Don't change this unless you know what you are doing. 808 Don't change this unless you know what you are doing.
795 809
diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu
index fc4f2abccf06..821fd269ca58 100644
--- a/arch/i386/Kconfig.cpu
+++ b/arch/i386/Kconfig.cpu
@@ -103,8 +103,15 @@ config MPENTIUMM
103 Select this for Intel Pentium M (not Pentium-4 M) 103 Select this for Intel Pentium M (not Pentium-4 M)
104 notebook chips. 104 notebook chips.
105 105
106config MCORE2
107 bool "Core 2/newer Xeon"
108 help
109 Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and 53xx)
110 CPUs. You can distingush newer from older Xeons by the CPU family
111 in /proc/cpuinfo. Newer ones have 6.
112
106config MPENTIUM4 113config MPENTIUM4
107 bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/Xeon" 114 bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon"
108 help 115 help
109 Select this for Intel Pentium 4 chips. This includes the 116 Select this for Intel Pentium 4 chips. This includes the
110 Pentium 4, P4-based Celeron and Xeon, and Pentium-4 M 117 Pentium 4, P4-based Celeron and Xeon, and Pentium-4 M
@@ -229,7 +236,7 @@ config X86_L1_CACHE_SHIFT
229 default "7" if MPENTIUM4 || X86_GENERIC 236 default "7" if MPENTIUM4 || X86_GENERIC
230 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 237 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
231 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 238 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
232 default "6" if MK7 || MK8 || MPENTIUMM 239 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2
233 240
234config RWSEM_GENERIC_SPINLOCK 241config RWSEM_GENERIC_SPINLOCK
235 bool 242 bool
@@ -287,17 +294,17 @@ config X86_ALIGNMENT_16
287 294
288config X86_GOOD_APIC 295config X86_GOOD_APIC
289 bool 296 bool
290 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON 297 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2
291 default y 298 default y
292 299
293config X86_INTEL_USERCOPY 300config X86_INTEL_USERCOPY
294 bool 301 bool
295 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON 302 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
296 default y 303 default y
297 304
298config X86_USE_PPRO_CHECKSUM 305config X86_USE_PPRO_CHECKSUM
299 bool 306 bool
300 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX 307 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
301 default y 308 default y
302 309
303config X86_USE_3DNOW 310config X86_USE_3DNOW
@@ -312,5 +319,5 @@ config X86_OOSTORE
312 319
313config X86_TSC 320config X86_TSC
314 bool 321 bool
315 depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX) && !X86_NUMAQ 322 depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ
316 default y 323 default y
diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug
index b31c0802e1cc..f68cc6f215f8 100644
--- a/arch/i386/Kconfig.debug
+++ b/arch/i386/Kconfig.debug
@@ -85,4 +85,14 @@ config DOUBLEFAULT
85 option saves about 4k and might cause you much additional grey 85 option saves about 4k and might cause you much additional grey
86 hair. 86 hair.
87 87
88config DEBUG_PARAVIRT
89 bool "Enable some paravirtualization debugging"
90 default y
91 depends on PARAVIRT && DEBUG_KERNEL
92 help
93 Currently deliberately clobbers regs which are allowed to be
94 clobbered in inlined paravirt hooks, even in native mode.
95 If turning this off solves a problem, then DISABLE_INTERRUPTS() or
96 ENABLE_INTERRUPTS() is lying about what registers can be clobbered.
97
88endmenu 98endmenu
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 0677908dfa06..f7ac1aea1d8a 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -26,10 +26,12 @@ endif
26 26
27LDFLAGS := -m elf_i386 27LDFLAGS := -m elf_i386
28OBJCOPYFLAGS := -O binary -R .note -R .comment -S 28OBJCOPYFLAGS := -O binary -R .note -R .comment -S
29LDFLAGS_vmlinux := 29ifdef CONFIG_RELOCATABLE
30LDFLAGS_vmlinux := --emit-relocs
31endif
30CHECKFLAGS += -D__i386__ 32CHECKFLAGS += -D__i386__
31 33
32CFLAGS += -pipe -msoft-float 34CFLAGS += -pipe -msoft-float -mregparm=3
33 35
34# prevent gcc from keeping the stack 16 byte aligned 36# prevent gcc from keeping the stack 16 byte aligned
35CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) 37CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
@@ -37,8 +39,6 @@ CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
37# CPU-specific tuning. Anything which can be shared with UML should go here. 39# CPU-specific tuning. Anything which can be shared with UML should go here.
38include $(srctree)/arch/i386/Makefile.cpu 40include $(srctree)/arch/i386/Makefile.cpu
39 41
40cflags-$(CONFIG_REGPARM) += -mregparm=3
41
42# temporary until string.h is fixed 42# temporary until string.h is fixed
43cflags-y += -ffreestanding 43cflags-y += -ffreestanding
44 44
diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu
index a11befba26d5..a32c031c90d7 100644
--- a/arch/i386/Makefile.cpu
+++ b/arch/i386/Makefile.cpu
@@ -32,6 +32,7 @@ cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586)
32cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) 32cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
33cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 33cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
34cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) 34cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
35cflags-$(CONFIG_MCORE2) += -march=i686 $(call cc-option,-mtune=core2,$(call cc-option,-mtune=generic,-mtune=i686))
35 36
36# AMD Elan support 37# AMD Elan support
37cflags-$(CONFIG_X86_ELAN) += -march=i486 38cflags-$(CONFIG_X86_ELAN) += -march=i486
diff --git a/arch/i386/boot/compressed/Makefile b/arch/i386/boot/compressed/Makefile
index 258ea95224f6..a661217f33ec 100644
--- a/arch/i386/boot/compressed/Makefile
+++ b/arch/i386/boot/compressed/Makefile
@@ -4,22 +4,42 @@
4# create a compressed vmlinux image from the original vmlinux 4# create a compressed vmlinux image from the original vmlinux
5# 5#
6 6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o 7targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o \
8 vmlinux.bin.all vmlinux.relocs
8EXTRA_AFLAGS := -traditional 9EXTRA_AFLAGS := -traditional
9 10
10LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 11LDFLAGS_vmlinux := -T
12CFLAGS_misc.o += -fPIC
13hostprogs-y := relocs
11 14
12$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE 15$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
13 $(call if_changed,ld) 16 $(call if_changed,ld)
14 @: 17 @:
15 18
16$(obj)/vmlinux.bin: vmlinux FORCE 19$(obj)/vmlinux.bin: vmlinux FORCE
17 $(call if_changed,objcopy) 20 $(call if_changed,objcopy)
18 21
22quiet_cmd_relocs = RELOCS $@
23 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
24$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
25 $(call if_changed,relocs)
26
27vmlinux.bin.all-y := $(obj)/vmlinux.bin
28vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
29quiet_cmd_relocbin = BUILD $@
30 cmd_relocbin = cat $(filter-out FORCE,$^) > $@
31$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
32 $(call if_changed,relocbin)
33
34ifdef CONFIG_RELOCATABLE
35$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
36 $(call if_changed,gzip)
37else
19$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE 38$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
20 $(call if_changed,gzip) 39 $(call if_changed,gzip)
40endif
21 41
22LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T 42LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
23 43
24$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE 44$(obj)/piggy.o: $(src)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
25 $(call if_changed,ld) 45 $(call if_changed,ld)
diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S
index b5893e4ecd37..f395a4bb38bb 100644
--- a/arch/i386/boot/compressed/head.S
+++ b/arch/i386/boot/compressed/head.S
@@ -26,9 +26,11 @@
26#include <linux/linkage.h> 26#include <linux/linkage.h>
27#include <asm/segment.h> 27#include <asm/segment.h>
28#include <asm/page.h> 28#include <asm/page.h>
29#include <asm/boot.h>
29 30
31.section ".text.head"
30 .globl startup_32 32 .globl startup_32
31 33
32startup_32: 34startup_32:
33 cld 35 cld
34 cli 36 cli
@@ -37,93 +39,142 @@ startup_32:
37 movl %eax,%es 39 movl %eax,%es
38 movl %eax,%fs 40 movl %eax,%fs
39 movl %eax,%gs 41 movl %eax,%gs
42 movl %eax,%ss
40 43
41 lss stack_start,%esp 44/* Calculate the delta between where we were compiled to run
42 xorl %eax,%eax 45 * at and where we were actually loaded at. This can only be done
431: incl %eax # check that A20 really IS enabled 46 * with a short local call on x86. Nothing else will tell us what
44 movl %eax,0x000000 # loop forever if it isn't 47 * address we are running at. The reserved chunk of the real-mode
45 cmpl %eax,0x100000 48 * data at 0x34-0x3f are used as the stack for this calculation.
46 je 1b 49 * Only 4 bytes are needed.
50 */
51 leal 0x40(%esi), %esp
52 call 1f
531: popl %ebp
54 subl $1b, %ebp
55
56/* %ebp contains the address we are loaded at by the boot loader and %ebx
57 * contains the address where we should move the kernel image temporarily
58 * for safe in-place decompression.
59 */
60
61#ifdef CONFIG_RELOCATABLE
62 movl %ebp, %ebx
63 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
64 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
65#else
66 movl $LOAD_PHYSICAL_ADDR, %ebx
67#endif
68
69 /* Replace the compressed data size with the uncompressed size */
70 subl input_len(%ebp), %ebx
71 movl output_len(%ebp), %eax
72 addl %eax, %ebx
73 /* Add 8 bytes for every 32K input block */
74 shrl $12, %eax
75 addl %eax, %ebx
76 /* Add 32K + 18 bytes of extra slack */
77 addl $(32768 + 18), %ebx
78 /* Align on a 4K boundary */
79 addl $4095, %ebx
80 andl $~4095, %ebx
81
82/* Copy the compressed kernel to the end of our buffer
83 * where decompression in place becomes safe.
84 */
85 pushl %esi
86 leal _end(%ebp), %esi
87 leal _end(%ebx), %edi
88 movl $(_end - startup_32), %ecx
89 std
90 rep
91 movsb
92 cld
93 popl %esi
94
95/* Compute the kernel start address.
96 */
97#ifdef CONFIG_RELOCATABLE
98 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
99 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
100#else
101 movl $LOAD_PHYSICAL_ADDR, %ebp
102#endif
47 103
48/* 104/*
49 * Initialize eflags. Some BIOS's leave bits like NT set. This would 105 * Jump to the relocated address.
50 * confuse the debugger if this code is traced.
51 * XXX - best to initialize before switching to protected mode.
52 */ 106 */
53 pushl $0 107 leal relocated(%ebx), %eax
54 popfl 108 jmp *%eax
109.section ".text"
110relocated:
111
55/* 112/*
56 * Clear BSS 113 * Clear BSS
57 */ 114 */
58 xorl %eax,%eax 115 xorl %eax,%eax
59 movl $_edata,%edi 116 leal _edata(%ebx),%edi
60 movl $_end,%ecx 117 leal _end(%ebx), %ecx
61 subl %edi,%ecx 118 subl %edi,%ecx
62 cld 119 cld
63 rep 120 rep
64 stosb 121 stosb
122
123/*
124 * Setup the stack for the decompressor
125 */
126 leal stack_end(%ebx), %esp
127
65/* 128/*
66 * Do the decompression, and jump to the new kernel.. 129 * Do the decompression, and jump to the new kernel..
67 */ 130 */
68 subl $16,%esp # place for structure on the stack 131 movl output_len(%ebx), %eax
69 movl %esp,%eax 132 pushl %eax
133 pushl %ebp # output address
134 movl input_len(%ebx), %eax
135 pushl %eax # input_len
136 leal input_data(%ebx), %eax
137 pushl %eax # input_data
138 leal _end(%ebx), %eax
139 pushl %eax # end of the image as third argument
70 pushl %esi # real mode pointer as second arg 140 pushl %esi # real mode pointer as second arg
71 pushl %eax # address of structure as first arg
72 call decompress_kernel 141 call decompress_kernel
73 orl %eax,%eax 142 addl $20, %esp
74 jnz 3f 143 popl %ecx
75 popl %esi # discard address
76 popl %esi # real mode pointer
77 xorl %ebx,%ebx
78 ljmp $(__BOOT_CS), $__PHYSICAL_START
79 144
145#if CONFIG_RELOCATABLE
146/* Find the address of the relocations.
147 */
148 movl %ebp, %edi
149 addl %ecx, %edi
150
151/* Calculate the delta between where vmlinux was compiled to run
152 * and where it was actually loaded.
153 */
154 movl %ebp, %ebx
155 subl $LOAD_PHYSICAL_ADDR, %ebx
156 jz 2f /* Nothing to be done if loaded at compiled addr. */
80/* 157/*
81 * We come here, if we were loaded high. 158 * Process relocations.
82 * We need to move the move-in-place routine down to 0x1000
83 * and then start it with the buffer addresses in registers,
84 * which we got from the stack.
85 */ 159 */
863: 160
87 movl $move_routine_start,%esi 1611: subl $4, %edi
88 movl $0x1000,%edi 162 movl 0(%edi), %ecx
89 movl $move_routine_end,%ecx 163 testl %ecx, %ecx
90 subl %esi,%ecx 164 jz 2f
91 addl $3,%ecx 165 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
92 shrl $2,%ecx 166 jmp 1b
93 cld 1672:
94 rep 168#endif
95 movsl
96
97 popl %esi # discard the address
98 popl %ebx # real mode pointer
99 popl %esi # low_buffer_start
100 popl %ecx # lcount
101 popl %edx # high_buffer_start
102 popl %eax # hcount
103 movl $__PHYSICAL_START,%edi
104 cli # make sure we don't get interrupted
105 ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine
106 169
107/* 170/*
108 * Routine (template) for moving the decompressed kernel in place, 171 * Jump to the decompressed kernel.
109 * if we were high loaded. This _must_ PIC-code !
110 */ 172 */
111move_routine_start:
112 movl %ecx,%ebp
113 shrl $2,%ecx
114 rep
115 movsl
116 movl %ebp,%ecx
117 andl $3,%ecx
118 rep
119 movsb
120 movl %edx,%esi
121 movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0
122 addl $3,%ecx
123 shrl $2,%ecx
124 rep
125 movsl
126 movl %ebx,%esi # Restore setup pointer
127 xorl %ebx,%ebx 173 xorl %ebx,%ebx
128 ljmp $(__BOOT_CS), $__PHYSICAL_START 174 jmp *%ebp
129move_routine_end: 175
176.bss
177.balign 4
178stack:
179 .fill 4096, 1, 0
180stack_end:
diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c
index b2ccd543410d..1ce7017fd627 100644
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -9,11 +9,94 @@
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */ 10 */
11 11
12#undef CONFIG_PARAVIRT
12#include <linux/linkage.h> 13#include <linux/linkage.h>
13#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
14#include <linux/screen_info.h> 15#include <linux/screen_info.h>
15#include <asm/io.h> 16#include <asm/io.h>
16#include <asm/page.h> 17#include <asm/page.h>
18#include <asm/boot.h>
19
20/* WARNING!!
21 * This code is compiled with -fPIC and it is relocated dynamically
22 * at run time, but no relocation processing is performed.
23 * This means that it is not safe to place pointers in static structures.
24 */
25
26/*
27 * Getting to provable safe in place decompression is hard.
28 * Worst case behaviours need to be analized.
29 * Background information:
30 *
31 * The file layout is:
32 * magic[2]
33 * method[1]
34 * flags[1]
35 * timestamp[4]
36 * extraflags[1]
37 * os[1]
38 * compressed data blocks[N]
39 * crc[4] orig_len[4]
40 *
41 * resulting in 18 bytes of non compressed data overhead.
42 *
43 * Files divided into blocks
44 * 1 bit (last block flag)
45 * 2 bits (block type)
46 *
47 * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved.
48 * The smallest block type encoding is always used.
49 *
50 * stored:
51 * 32 bits length in bytes.
52 *
53 * fixed:
54 * magic fixed tree.
55 * symbols.
56 *
57 * dynamic:
58 * dynamic tree encoding.
59 * symbols.
60 *
61 *
62 * The buffer for decompression in place is the length of the
63 * uncompressed data, plus a small amount extra to keep the algorithm safe.
64 * The compressed data is placed at the end of the buffer. The output
65 * pointer is placed at the start of the buffer and the input pointer
66 * is placed where the compressed data starts. Problems will occur
67 * when the output pointer overruns the input pointer.
68 *
69 * The output pointer can only overrun the input pointer if the input
70 * pointer is moving faster than the output pointer. A condition only
71 * triggered by data whose compressed form is larger than the uncompressed
72 * form.
73 *
74 * The worst case at the block level is a growth of the compressed data
75 * of 5 bytes per 32767 bytes.
76 *
77 * The worst case internal to a compressed block is very hard to figure.
78 * The worst case can at least be boundined by having one bit that represents
79 * 32764 bytes and then all of the rest of the bytes representing the very
80 * very last byte.
81 *
82 * All of which is enough to compute an amount of extra data that is required
83 * to be safe. To avoid problems at the block level allocating 5 extra bytes
84 * per 32767 bytes of data is sufficient. To avoind problems internal to a block
85 * adding an extra 32767 bytes (the worst case uncompressed block size) is
86 * sufficient, to ensure that in the worst case the decompressed data for
87 * block will stop the byte before the compressed data for a block begins.
88 * To avoid problems with the compressed data's meta information an extra 18
89 * bytes are needed. Leading to the formula:
90 *
91 * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
92 *
93 * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
94 * Adding 32768 instead of 32767 just makes for round numbers.
95 * Adding the decompressor_size is necessary as it musht live after all
96 * of the data as well. Last I measured the decompressor is about 14K.
97 * 10K of actuall data and 4K of bss.
98 *
99 */
17 100
18/* 101/*
19 * gzip declarations 102 * gzip declarations
@@ -30,15 +113,20 @@ typedef unsigned char uch;
30typedef unsigned short ush; 113typedef unsigned short ush;
31typedef unsigned long ulg; 114typedef unsigned long ulg;
32 115
33#define WSIZE 0x8000 /* Window size must be at least 32k, */ 116#define WSIZE 0x80000000 /* Window size must be at least 32k,
34 /* and a power of two */ 117 * and a power of two
118 * We don't actually have a window just
119 * a huge output buffer so I report
120 * a 2G windows size, as that should
121 * always be larger than our output buffer.
122 */
35 123
36static uch *inbuf; /* input buffer */ 124static uch *inbuf; /* input buffer */
37static uch window[WSIZE]; /* Sliding window buffer */ 125static uch *window; /* Sliding window buffer, (and final output buffer) */
38 126
39static unsigned insize = 0; /* valid bytes in inbuf */ 127static unsigned insize; /* valid bytes in inbuf */
40static unsigned inptr = 0; /* index of next byte to be processed in inbuf */ 128static unsigned inptr; /* index of next byte to be processed in inbuf */
41static unsigned outcnt = 0; /* bytes in output buffer */ 129static unsigned outcnt; /* bytes in output buffer */
42 130
43/* gzip flag byte */ 131/* gzip flag byte */
44#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ 132#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
@@ -89,8 +177,6 @@ extern unsigned char input_data[];
89extern int input_len; 177extern int input_len;
90 178
91static long bytes_out = 0; 179static long bytes_out = 0;
92static uch *output_data;
93static unsigned long output_ptr = 0;
94 180
95static void *malloc(int size); 181static void *malloc(int size);
96static void free(void *where); 182static void free(void *where);
@@ -100,24 +186,17 @@ static void *memcpy(void *dest, const void *src, unsigned n);
100 186
101static void putstr(const char *); 187static void putstr(const char *);
102 188
103extern int end; 189static unsigned long free_mem_ptr;
104static long free_mem_ptr = (long)&end; 190static unsigned long free_mem_end_ptr;
105static long free_mem_end_ptr;
106 191
107#define INPLACE_MOVE_ROUTINE 0x1000
108#define LOW_BUFFER_START 0x2000
109#define LOW_BUFFER_MAX 0x90000
110#define HEAP_SIZE 0x3000 192#define HEAP_SIZE 0x3000
111static unsigned int low_buffer_end, low_buffer_size;
112static int high_loaded =0;
113static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
114 193
115static char *vidmem = (char *)0xb8000; 194static char *vidmem = (char *)0xb8000;
116static int vidport; 195static int vidport;
117static int lines, cols; 196static int lines, cols;
118 197
119#ifdef CONFIG_X86_NUMAQ 198#ifdef CONFIG_X86_NUMAQ
120static void * xquad_portio = NULL; 199void *xquad_portio;
121#endif 200#endif
122 201
123#include "../../../../lib/inflate.c" 202#include "../../../../lib/inflate.c"
@@ -151,7 +230,7 @@ static void gzip_mark(void **ptr)
151 230
152static void gzip_release(void **ptr) 231static void gzip_release(void **ptr)
153{ 232{
154 free_mem_ptr = (long) *ptr; 233 free_mem_ptr = (unsigned long) *ptr;
155} 234}
156 235
157static void scroll(void) 236static void scroll(void)
@@ -179,7 +258,7 @@ static void putstr(const char *s)
179 y--; 258 y--;
180 } 259 }
181 } else { 260 } else {
182 vidmem [ ( x + cols * y ) * 2 ] = c; 261 vidmem [ ( x + cols * y ) * 2 ] = c;
183 if ( ++x >= cols ) { 262 if ( ++x >= cols ) {
184 x = 0; 263 x = 0;
185 if ( ++y >= lines ) { 264 if ( ++y >= lines ) {
@@ -224,58 +303,31 @@ static void* memcpy(void* dest, const void* src, unsigned n)
224 */ 303 */
225static int fill_inbuf(void) 304static int fill_inbuf(void)
226{ 305{
227 if (insize != 0) { 306 error("ran out of input data");
228 error("ran out of input data"); 307 return 0;
229 }
230
231 inbuf = input_data;
232 insize = input_len;
233 inptr = 1;
234 return inbuf[0];
235} 308}
236 309
237/* =========================================================================== 310/* ===========================================================================
238 * Write the output window window[0..outcnt-1] and update crc and bytes_out. 311 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
239 * (Used for the decompressed data only.) 312 * (Used for the decompressed data only.)
240 */ 313 */
241static void flush_window_low(void)
242{
243 ulg c = crc; /* temporary variable */
244 unsigned n;
245 uch *in, *out, ch;
246
247 in = window;
248 out = &output_data[output_ptr];
249 for (n = 0; n < outcnt; n++) {
250 ch = *out++ = *in++;
251 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
252 }
253 crc = c;
254 bytes_out += (ulg)outcnt;
255 output_ptr += (ulg)outcnt;
256 outcnt = 0;
257}
258
259static void flush_window_high(void)
260{
261 ulg c = crc; /* temporary variable */
262 unsigned n;
263 uch *in, ch;
264 in = window;
265 for (n = 0; n < outcnt; n++) {
266 ch = *output_data++ = *in++;
267 if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start;
268 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
269 }
270 crc = c;
271 bytes_out += (ulg)outcnt;
272 outcnt = 0;
273}
274
275static void flush_window(void) 314static void flush_window(void)
276{ 315{
277 if (high_loaded) flush_window_high(); 316 /* With my window equal to my output buffer
278 else flush_window_low(); 317 * I only need to compute the crc here.
318 */
319 ulg c = crc; /* temporary variable */
320 unsigned n;
321 uch *in, ch;
322
323 in = window;
324 for (n = 0; n < outcnt; n++) {
325 ch = *in++;
326 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
327 }
328 crc = c;
329 bytes_out += (ulg)outcnt;
330 outcnt = 0;
279} 331}
280 332
281static void error(char *x) 333static void error(char *x)
@@ -287,66 +339,8 @@ static void error(char *x)
287 while(1); /* Halt */ 339 while(1); /* Halt */
288} 340}
289 341
290#define STACK_SIZE (4096) 342asmlinkage void decompress_kernel(void *rmode, unsigned long end,
291 343 uch *input_data, unsigned long input_len, uch *output)
292long user_stack [STACK_SIZE];
293
294struct {
295 long * a;
296 short b;
297 } stack_start = { & user_stack [STACK_SIZE] , __BOOT_DS };
298
299static void setup_normal_output_buffer(void)
300{
301#ifdef STANDARD_MEMORY_BIOS_CALL
302 if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory");
303#else
304 if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");
305#endif
306 output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */
307 free_mem_end_ptr = (long)real_mode;
308}
309
310struct moveparams {
311 uch *low_buffer_start; int lcount;
312 uch *high_buffer_start; int hcount;
313};
314
315static void setup_output_buffer_if_we_run_high(struct moveparams *mv)
316{
317 high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);
318#ifdef STANDARD_MEMORY_BIOS_CALL
319 if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
320#else
321 if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
322#endif
323 mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START;
324 low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX
325 ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff;
326 low_buffer_size = low_buffer_end - LOW_BUFFER_START;
327 high_loaded = 1;
328 free_mem_end_ptr = (long)high_buffer_start;
329 if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
330 high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size);
331 mv->hcount = 0; /* say: we need not to move high_buffer */
332 }
333 else mv->hcount = -1;
334 mv->high_buffer_start = high_buffer_start;
335}
336
337static void close_output_buffer_if_we_run_high(struct moveparams *mv)
338{
339 if (bytes_out > low_buffer_size) {
340 mv->lcount = low_buffer_size;
341 if (mv->hcount)
342 mv->hcount = bytes_out - low_buffer_size;
343 } else {
344 mv->lcount = bytes_out;
345 mv->hcount = 0;
346 }
347}
348
349asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode)
350{ 344{
351 real_mode = rmode; 345 real_mode = rmode;
352 346
@@ -361,13 +355,25 @@ asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode)
361 lines = RM_SCREEN_INFO.orig_video_lines; 355 lines = RM_SCREEN_INFO.orig_video_lines;
362 cols = RM_SCREEN_INFO.orig_video_cols; 356 cols = RM_SCREEN_INFO.orig_video_cols;
363 357
364 if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); 358 window = output; /* Output buffer (Normally at 1M) */
365 else setup_output_buffer_if_we_run_high(mv); 359 free_mem_ptr = end; /* Heap */
360 free_mem_end_ptr = end + HEAP_SIZE;
361 inbuf = input_data; /* Input buffer */
362 insize = input_len;
363 inptr = 0;
364
365 if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1))
366 error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
367 if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff))
368 error("Destination address too large");
369#ifndef CONFIG_RELOCATABLE
370 if ((u32)output != LOAD_PHYSICAL_ADDR)
371 error("Wrong destination address");
372#endif
366 373
367 makecrc(); 374 makecrc();
368 putstr("Uncompressing Linux... "); 375 putstr("Uncompressing Linux... ");
369 gunzip(); 376 gunzip();
370 putstr("Ok, booting the kernel.\n"); 377 putstr("Ok, booting the kernel.\n");
371 if (high_loaded) close_output_buffer_if_we_run_high(mv); 378 return;
372 return high_loaded;
373} 379}
diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c
new file mode 100644
index 000000000000..468da89153c4
--- /dev/null
+++ b/arch/i386/boot/compressed/relocs.c
@@ -0,0 +1,625 @@
1#include <stdio.h>
2#include <stdarg.h>
3#include <stdlib.h>
4#include <stdint.h>
5#include <string.h>
6#include <errno.h>
7#include <unistd.h>
8#include <elf.h>
9#include <byteswap.h>
10#define USE_BSD
11#include <endian.h>
12
13#define MAX_SHDRS 100
14static Elf32_Ehdr ehdr;
15static Elf32_Shdr shdr[MAX_SHDRS];
16static Elf32_Sym *symtab[MAX_SHDRS];
17static Elf32_Rel *reltab[MAX_SHDRS];
18static char *strtab[MAX_SHDRS];
19static unsigned long reloc_count, reloc_idx;
20static unsigned long *relocs;
21
22/*
23 * Following symbols have been audited. There values are constant and do
24 * not change if bzImage is loaded at a different physical address than
25 * the address for which it has been compiled. Don't warn user about
26 * absolute relocations present w.r.t these symbols.
27 */
28static const char* safe_abs_relocs[] = {
29 "__kernel_vsyscall",
30 "__kernel_rt_sigreturn",
31 "__kernel_sigreturn",
32 "SYSENTER_RETURN",
33};
34
35static int is_safe_abs_reloc(const char* sym_name)
36{
37 int i, array_size;
38
39 array_size = sizeof(safe_abs_relocs)/sizeof(char*);
40
41 for(i = 0; i < array_size; i++) {
42 if (!strcmp(sym_name, safe_abs_relocs[i]))
43 /* Match found */
44 return 1;
45 }
46 return 0;
47}
48
49static void die(char *fmt, ...)
50{
51 va_list ap;
52 va_start(ap, fmt);
53 vfprintf(stderr, fmt, ap);
54 va_end(ap);
55 exit(1);
56}
57
58static const char *sym_type(unsigned type)
59{
60 static const char *type_name[] = {
61#define SYM_TYPE(X) [X] = #X
62 SYM_TYPE(STT_NOTYPE),
63 SYM_TYPE(STT_OBJECT),
64 SYM_TYPE(STT_FUNC),
65 SYM_TYPE(STT_SECTION),
66 SYM_TYPE(STT_FILE),
67 SYM_TYPE(STT_COMMON),
68 SYM_TYPE(STT_TLS),
69#undef SYM_TYPE
70 };
71 const char *name = "unknown sym type name";
72 if (type < sizeof(type_name)/sizeof(type_name[0])) {
73 name = type_name[type];
74 }
75 return name;
76}
77
78static const char *sym_bind(unsigned bind)
79{
80 static const char *bind_name[] = {
81#define SYM_BIND(X) [X] = #X
82 SYM_BIND(STB_LOCAL),
83 SYM_BIND(STB_GLOBAL),
84 SYM_BIND(STB_WEAK),
85#undef SYM_BIND
86 };
87 const char *name = "unknown sym bind name";
88 if (bind < sizeof(bind_name)/sizeof(bind_name[0])) {
89 name = bind_name[bind];
90 }
91 return name;
92}
93
94static const char *sym_visibility(unsigned visibility)
95{
96 static const char *visibility_name[] = {
97#define SYM_VISIBILITY(X) [X] = #X
98 SYM_VISIBILITY(STV_DEFAULT),
99 SYM_VISIBILITY(STV_INTERNAL),
100 SYM_VISIBILITY(STV_HIDDEN),
101 SYM_VISIBILITY(STV_PROTECTED),
102#undef SYM_VISIBILITY
103 };
104 const char *name = "unknown sym visibility name";
105 if (visibility < sizeof(visibility_name)/sizeof(visibility_name[0])) {
106 name = visibility_name[visibility];
107 }
108 return name;
109}
110
111static const char *rel_type(unsigned type)
112{
113 static const char *type_name[] = {
114#define REL_TYPE(X) [X] = #X
115 REL_TYPE(R_386_NONE),
116 REL_TYPE(R_386_32),
117 REL_TYPE(R_386_PC32),
118 REL_TYPE(R_386_GOT32),
119 REL_TYPE(R_386_PLT32),
120 REL_TYPE(R_386_COPY),
121 REL_TYPE(R_386_GLOB_DAT),
122 REL_TYPE(R_386_JMP_SLOT),
123 REL_TYPE(R_386_RELATIVE),
124 REL_TYPE(R_386_GOTOFF),
125 REL_TYPE(R_386_GOTPC),
126#undef REL_TYPE
127 };
128 const char *name = "unknown type rel type name";
129 if (type < sizeof(type_name)/sizeof(type_name[0])) {
130 name = type_name[type];
131 }
132 return name;
133}
134
135static const char *sec_name(unsigned shndx)
136{
137 const char *sec_strtab;
138 const char *name;
139 sec_strtab = strtab[ehdr.e_shstrndx];
140 name = "<noname>";
141 if (shndx < ehdr.e_shnum) {
142 name = sec_strtab + shdr[shndx].sh_name;
143 }
144 else if (shndx == SHN_ABS) {
145 name = "ABSOLUTE";
146 }
147 else if (shndx == SHN_COMMON) {
148 name = "COMMON";
149 }
150 return name;
151}
152
153static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
154{
155 const char *name;
156 name = "<noname>";
157 if (sym->st_name) {
158 name = sym_strtab + sym->st_name;
159 }
160 else {
161 name = sec_name(shdr[sym->st_shndx].sh_name);
162 }
163 return name;
164}
165
166
167
168#if BYTE_ORDER == LITTLE_ENDIAN
169#define le16_to_cpu(val) (val)
170#define le32_to_cpu(val) (val)
171#endif
172#if BYTE_ORDER == BIG_ENDIAN
173#define le16_to_cpu(val) bswap_16(val)
174#define le32_to_cpu(val) bswap_32(val)
175#endif
176
177static uint16_t elf16_to_cpu(uint16_t val)
178{
179 return le16_to_cpu(val);
180}
181
182static uint32_t elf32_to_cpu(uint32_t val)
183{
184 return le32_to_cpu(val);
185}
186
187static void read_ehdr(FILE *fp)
188{
189 if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
190 die("Cannot read ELF header: %s\n",
191 strerror(errno));
192 }
193 if (memcmp(ehdr.e_ident, ELFMAG, 4) != 0) {
194 die("No ELF magic\n");
195 }
196 if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) {
197 die("Not a 32 bit executable\n");
198 }
199 if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
200 die("Not a LSB ELF executable\n");
201 }
202 if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
203 die("Unknown ELF version\n");
204 }
205 /* Convert the fields to native endian */
206 ehdr.e_type = elf16_to_cpu(ehdr.e_type);
207 ehdr.e_machine = elf16_to_cpu(ehdr.e_machine);
208 ehdr.e_version = elf32_to_cpu(ehdr.e_version);
209 ehdr.e_entry = elf32_to_cpu(ehdr.e_entry);
210 ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff);
211 ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff);
212 ehdr.e_flags = elf32_to_cpu(ehdr.e_flags);
213 ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize);
214 ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize);
215 ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum);
216 ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize);
217 ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum);
218 ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx);
219
220 if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) {
221 die("Unsupported ELF header type\n");
222 }
223 if (ehdr.e_machine != EM_386) {
224 die("Not for x86\n");
225 }
226 if (ehdr.e_version != EV_CURRENT) {
227 die("Unknown ELF version\n");
228 }
229 if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) {
230 die("Bad Elf header size\n");
231 }
232 if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) {
233 die("Bad program header entry\n");
234 }
235 if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) {
236 die("Bad section header entry\n");
237 }
238 if (ehdr.e_shstrndx >= ehdr.e_shnum) {
239 die("String table index out of bounds\n");
240 }
241}
242
243static void read_shdrs(FILE *fp)
244{
245 int i;
246 if (ehdr.e_shnum > MAX_SHDRS) {
247 die("%d section headers supported: %d\n",
248 ehdr.e_shnum, MAX_SHDRS);
249 }
250 if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
251 die("Seek to %d failed: %s\n",
252 ehdr.e_shoff, strerror(errno));
253 }
254 if (fread(&shdr, sizeof(shdr[0]), ehdr.e_shnum, fp) != ehdr.e_shnum) {
255 die("Cannot read ELF section headers: %s\n",
256 strerror(errno));
257 }
258 for(i = 0; i < ehdr.e_shnum; i++) {
259 shdr[i].sh_name = elf32_to_cpu(shdr[i].sh_name);
260 shdr[i].sh_type = elf32_to_cpu(shdr[i].sh_type);
261 shdr[i].sh_flags = elf32_to_cpu(shdr[i].sh_flags);
262 shdr[i].sh_addr = elf32_to_cpu(shdr[i].sh_addr);
263 shdr[i].sh_offset = elf32_to_cpu(shdr[i].sh_offset);
264 shdr[i].sh_size = elf32_to_cpu(shdr[i].sh_size);
265 shdr[i].sh_link = elf32_to_cpu(shdr[i].sh_link);
266 shdr[i].sh_info = elf32_to_cpu(shdr[i].sh_info);
267 shdr[i].sh_addralign = elf32_to_cpu(shdr[i].sh_addralign);
268 shdr[i].sh_entsize = elf32_to_cpu(shdr[i].sh_entsize);
269 }
270
271}
272
273static void read_strtabs(FILE *fp)
274{
275 int i;
276 for(i = 0; i < ehdr.e_shnum; i++) {
277 if (shdr[i].sh_type != SHT_STRTAB) {
278 continue;
279 }
280 strtab[i] = malloc(shdr[i].sh_size);
281 if (!strtab[i]) {
282 die("malloc of %d bytes for strtab failed\n",
283 shdr[i].sh_size);
284 }
285 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
286 die("Seek to %d failed: %s\n",
287 shdr[i].sh_offset, strerror(errno));
288 }
289 if (fread(strtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
290 die("Cannot read symbol table: %s\n",
291 strerror(errno));
292 }
293 }
294}
295
296static void read_symtabs(FILE *fp)
297{
298 int i,j;
299 for(i = 0; i < ehdr.e_shnum; i++) {
300 if (shdr[i].sh_type != SHT_SYMTAB) {
301 continue;
302 }
303 symtab[i] = malloc(shdr[i].sh_size);
304 if (!symtab[i]) {
305 die("malloc of %d bytes for symtab failed\n",
306 shdr[i].sh_size);
307 }
308 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
309 die("Seek to %d failed: %s\n",
310 shdr[i].sh_offset, strerror(errno));
311 }
312 if (fread(symtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
313 die("Cannot read symbol table: %s\n",
314 strerror(errno));
315 }
316 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[i][0]); j++) {
317 symtab[i][j].st_name = elf32_to_cpu(symtab[i][j].st_name);
318 symtab[i][j].st_value = elf32_to_cpu(symtab[i][j].st_value);
319 symtab[i][j].st_size = elf32_to_cpu(symtab[i][j].st_size);
320 symtab[i][j].st_shndx = elf16_to_cpu(symtab[i][j].st_shndx);
321 }
322 }
323}
324
325
326static void read_relocs(FILE *fp)
327{
328 int i,j;
329 for(i = 0; i < ehdr.e_shnum; i++) {
330 if (shdr[i].sh_type != SHT_REL) {
331 continue;
332 }
333 reltab[i] = malloc(shdr[i].sh_size);
334 if (!reltab[i]) {
335 die("malloc of %d bytes for relocs failed\n",
336 shdr[i].sh_size);
337 }
338 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
339 die("Seek to %d failed: %s\n",
340 shdr[i].sh_offset, strerror(errno));
341 }
342 if (fread(reltab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
343 die("Cannot read symbol table: %s\n",
344 strerror(errno));
345 }
346 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
347 reltab[i][j].r_offset = elf32_to_cpu(reltab[i][j].r_offset);
348 reltab[i][j].r_info = elf32_to_cpu(reltab[i][j].r_info);
349 }
350 }
351}
352
353
354static void print_absolute_symbols(void)
355{
356 int i;
357 printf("Absolute symbols\n");
358 printf(" Num: Value Size Type Bind Visibility Name\n");
359 for(i = 0; i < ehdr.e_shnum; i++) {
360 char *sym_strtab;
361 Elf32_Sym *sh_symtab;
362 int j;
363 if (shdr[i].sh_type != SHT_SYMTAB) {
364 continue;
365 }
366 sh_symtab = symtab[i];
367 sym_strtab = strtab[shdr[i].sh_link];
368 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[0][0]); j++) {
369 Elf32_Sym *sym;
370 const char *name;
371 sym = &symtab[i][j];
372 name = sym_name(sym_strtab, sym);
373 if (sym->st_shndx != SHN_ABS) {
374 continue;
375 }
376 printf("%5d %08x %5d %10s %10s %12s %s\n",
377 j, sym->st_value, sym->st_size,
378 sym_type(ELF32_ST_TYPE(sym->st_info)),
379 sym_bind(ELF32_ST_BIND(sym->st_info)),
380 sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)),
381 name);
382 }
383 }
384 printf("\n");
385}
386
387static void print_absolute_relocs(void)
388{
389 int i, printed = 0;
390
391 for(i = 0; i < ehdr.e_shnum; i++) {
392 char *sym_strtab;
393 Elf32_Sym *sh_symtab;
394 unsigned sec_applies, sec_symtab;
395 int j;
396 if (shdr[i].sh_type != SHT_REL) {
397 continue;
398 }
399 sec_symtab = shdr[i].sh_link;
400 sec_applies = shdr[i].sh_info;
401 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) {
402 continue;
403 }
404 sh_symtab = symtab[sec_symtab];
405 sym_strtab = strtab[shdr[sec_symtab].sh_link];
406 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
407 Elf32_Rel *rel;
408 Elf32_Sym *sym;
409 const char *name;
410 rel = &reltab[i][j];
411 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
412 name = sym_name(sym_strtab, sym);
413 if (sym->st_shndx != SHN_ABS) {
414 continue;
415 }
416
417 /* Absolute symbols are not relocated if bzImage is
418 * loaded at a non-compiled address. Display a warning
419 * to user at compile time about the absolute
420 * relocations present.
421 *
422 * User need to audit the code to make sure
423 * some symbols which should have been section
424 * relative have not become absolute because of some
425 * linker optimization or wrong programming usage.
426 *
427 * Before warning check if this absolute symbol
428 * relocation is harmless.
429 */
430 if (is_safe_abs_reloc(name))
431 continue;
432
433 if (!printed) {
434 printf("WARNING: Absolute relocations"
435 " present\n");
436 printf("Offset Info Type Sym.Value "
437 "Sym.Name\n");
438 printed = 1;
439 }
440
441 printf("%08x %08x %10s %08x %s\n",
442 rel->r_offset,
443 rel->r_info,
444 rel_type(ELF32_R_TYPE(rel->r_info)),
445 sym->st_value,
446 name);
447 }
448 }
449
450 if (printed)
451 printf("\n");
452}
453
454static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
455{
456 int i;
457 /* Walk through the relocations */
458 for(i = 0; i < ehdr.e_shnum; i++) {
459 char *sym_strtab;
460 Elf32_Sym *sh_symtab;
461 unsigned sec_applies, sec_symtab;
462 int j;
463 if (shdr[i].sh_type != SHT_REL) {
464 continue;
465 }
466 sec_symtab = shdr[i].sh_link;
467 sec_applies = shdr[i].sh_info;
468 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) {
469 continue;
470 }
471 sh_symtab = symtab[sec_symtab];
472 sym_strtab = strtab[shdr[sec_symtab].sh_link];
473 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
474 Elf32_Rel *rel;
475 Elf32_Sym *sym;
476 unsigned r_type;
477 rel = &reltab[i][j];
478 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
479 r_type = ELF32_R_TYPE(rel->r_info);
480 /* Don't visit relocations to absolute symbols */
481 if (sym->st_shndx == SHN_ABS) {
482 continue;
483 }
484 if (r_type == R_386_PC32) {
485 /* PC relative relocations don't need to be adjusted */
486 }
487 else if (r_type == R_386_32) {
488 /* Visit relocations that need to be adjusted */
489 visit(rel, sym);
490 }
491 else {
492 die("Unsupported relocation type: %d\n", r_type);
493 }
494 }
495 }
496}
497
498static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
499{
500 reloc_count += 1;
501}
502
503static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
504{
505 /* Remember the address that needs to be adjusted. */
506 relocs[reloc_idx++] = rel->r_offset;
507}
508
509static int cmp_relocs(const void *va, const void *vb)
510{
511 const unsigned long *a, *b;
512 a = va; b = vb;
513 return (*a == *b)? 0 : (*a > *b)? 1 : -1;
514}
515
516static void emit_relocs(int as_text)
517{
518 int i;
519 /* Count how many relocations I have and allocate space for them. */
520 reloc_count = 0;
521 walk_relocs(count_reloc);
522 relocs = malloc(reloc_count * sizeof(relocs[0]));
523 if (!relocs) {
524 die("malloc of %d entries for relocs failed\n",
525 reloc_count);
526 }
527 /* Collect up the relocations */
528 reloc_idx = 0;
529 walk_relocs(collect_reloc);
530
531 /* Order the relocations for more efficient processing */
532 qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs);
533
534 /* Print the relocations */
535 if (as_text) {
536 /* Print the relocations in a form suitable that
537 * gas will like.
538 */
539 printf(".section \".data.reloc\",\"a\"\n");
540 printf(".balign 4\n");
541 for(i = 0; i < reloc_count; i++) {
542 printf("\t .long 0x%08lx\n", relocs[i]);
543 }
544 printf("\n");
545 }
546 else {
547 unsigned char buf[4];
548 buf[0] = buf[1] = buf[2] = buf[3] = 0;
549 /* Print a stop */
550 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
551 /* Now print each relocation */
552 for(i = 0; i < reloc_count; i++) {
553 buf[0] = (relocs[i] >> 0) & 0xff;
554 buf[1] = (relocs[i] >> 8) & 0xff;
555 buf[2] = (relocs[i] >> 16) & 0xff;
556 buf[3] = (relocs[i] >> 24) & 0xff;
557 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
558 }
559 }
560}
561
562static void usage(void)
563{
564 die("relocs [--abs-syms |--abs-relocs | --text] vmlinux\n");
565}
566
567int main(int argc, char **argv)
568{
569 int show_absolute_syms, show_absolute_relocs;
570 int as_text;
571 const char *fname;
572 FILE *fp;
573 int i;
574
575 show_absolute_syms = 0;
576 show_absolute_relocs = 0;
577 as_text = 0;
578 fname = NULL;
579 for(i = 1; i < argc; i++) {
580 char *arg = argv[i];
581 if (*arg == '-') {
582 if (strcmp(argv[1], "--abs-syms") == 0) {
583 show_absolute_syms = 1;
584 continue;
585 }
586
587 if (strcmp(argv[1], "--abs-relocs") == 0) {
588 show_absolute_relocs = 1;
589 continue;
590 }
591 else if (strcmp(argv[1], "--text") == 0) {
592 as_text = 1;
593 continue;
594 }
595 }
596 else if (!fname) {
597 fname = arg;
598 continue;
599 }
600 usage();
601 }
602 if (!fname) {
603 usage();
604 }
605 fp = fopen(fname, "r");
606 if (!fp) {
607 die("Cannot open %s: %s\n",
608 fname, strerror(errno));
609 }
610 read_ehdr(fp);
611 read_shdrs(fp);
612 read_strtabs(fp);
613 read_symtabs(fp);
614 read_relocs(fp);
615 if (show_absolute_syms) {
616 print_absolute_symbols();
617 return 0;
618 }
619 if (show_absolute_relocs) {
620 print_absolute_relocs();
621 return 0;
622 }
623 emit_relocs(as_text);
624 return 0;
625}
diff --git a/arch/i386/boot/compressed/vmlinux.lds b/arch/i386/boot/compressed/vmlinux.lds
new file mode 100644
index 000000000000..cc4854f6c6c1
--- /dev/null
+++ b/arch/i386/boot/compressed/vmlinux.lds
@@ -0,0 +1,43 @@
1OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
2OUTPUT_ARCH(i386)
3ENTRY(startup_32)
4SECTIONS
5{
6 /* Be careful parts of head.S assume startup_32 is at
7 * address 0.
8 */
9 . = 0 ;
10 .text.head : {
11 _head = . ;
12 *(.text.head)
13 _ehead = . ;
14 }
15 .data.compressed : {
16 *(.data.compressed)
17 }
18 .text : {
19 _text = .; /* Text */
20 *(.text)
21 *(.text.*)
22 _etext = . ;
23 }
24 .rodata : {
25 _rodata = . ;
26 *(.rodata) /* read-only data */
27 *(.rodata.*)
28 _erodata = . ;
29 }
30 .data : {
31 _data = . ;
32 *(.data)
33 *(.data.*)
34 _edata = . ;
35 }
36 .bss : {
37 _bss = . ;
38 *(.bss)
39 *(.bss.*)
40 *(COMMON)
41 _end = . ;
42 }
43}
diff --git a/arch/i386/boot/compressed/vmlinux.scr b/arch/i386/boot/compressed/vmlinux.scr
index 1ed9d791f863..707a88f7f29e 100644
--- a/arch/i386/boot/compressed/vmlinux.scr
+++ b/arch/i386/boot/compressed/vmlinux.scr
@@ -1,9 +1,10 @@
1SECTIONS 1SECTIONS
2{ 2{
3 .data : { 3 .data.compressed : {
4 input_len = .; 4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .; 5 LONG(input_data_end - input_data) input_data = .;
6 *(.data) 6 *(.data)
7 output_len = . - 4;
7 input_data_end = .; 8 input_data_end = .;
8 } 9 }
9} 10}
diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S
index 3aec4538a113..06edf1c66242 100644
--- a/arch/i386/boot/setup.S
+++ b/arch/i386/boot/setup.S
@@ -81,7 +81,7 @@ start:
81# This is the setup header, and it must start at %cs:2 (old 0x9020:2) 81# This is the setup header, and it must start at %cs:2 (old 0x9020:2)
82 82
83 .ascii "HdrS" # header signature 83 .ascii "HdrS" # header signature
84 .word 0x0204 # header version number (>= 0x0105) 84 .word 0x0205 # header version number (>= 0x0105)
85 # or else old loadlin-1.5 will fail) 85 # or else old loadlin-1.5 will fail)
86realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 86realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
87start_sys_seg: .word SYSSEG 87start_sys_seg: .word SYSSEG
@@ -160,6 +160,17 @@ ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff
160 # The highest safe address for 160 # The highest safe address for
161 # the contents of an initrd 161 # the contents of an initrd
162 162
163kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment
164 #required for protected mode
165 #kernel
166#ifdef CONFIG_RELOCATABLE
167relocatable_kernel: .byte 1
168#else
169relocatable_kernel: .byte 0
170#endif
171pad2: .byte 0
172pad3: .word 0
173
163trampoline: call start_of_setup 174trampoline: call start_of_setup
164 .align 16 175 .align 16
165 # The offset at this point is 0x240 176 # The offset at this point is 0x240
@@ -588,11 +599,6 @@ rmodeswtch_normal:
588 call default_switch 599 call default_switch
589 600
590rmodeswtch_end: 601rmodeswtch_end:
591# we get the code32 start address and modify the below 'jmpi'
592# (loader may have changed it)
593 movl %cs:code32_start, %eax
594 movl %eax, %cs:code32
595
596# Now we move the system to its rightful place ... but we check if we have a 602# Now we move the system to its rightful place ... but we check if we have a
597# big-kernel. In that case we *must* not move it ... 603# big-kernel. In that case we *must* not move it ...
598 testb $LOADED_HIGH, %cs:loadflags 604 testb $LOADED_HIGH, %cs:loadflags
@@ -788,11 +794,12 @@ a20_err_msg:
788a20_done: 794a20_done:
789 795
790#endif /* CONFIG_X86_VOYAGER */ 796#endif /* CONFIG_X86_VOYAGER */
791# set up gdt and idt 797# set up gdt and idt and 32bit start address
792 lidt idt_48 # load idt with 0,0 798 lidt idt_48 # load idt with 0,0
793 xorl %eax, %eax # Compute gdt_base 799 xorl %eax, %eax # Compute gdt_base
794 movw %ds, %ax # (Convert %ds:gdt to a linear ptr) 800 movw %ds, %ax # (Convert %ds:gdt to a linear ptr)
795 shll $4, %eax 801 shll $4, %eax
802 addl %eax, code32
796 addl $gdt, %eax 803 addl $gdt, %eax
797 movl %eax, (gdt_48+2) 804 movl %eax, (gdt_48+2)
798 lgdt gdt_48 # load gdt with whatever is 805 lgdt gdt_48 # load gdt with whatever is
@@ -851,9 +858,26 @@ flush_instr:
851# Manual, Mixing 16-bit and 32-bit code, page 16-6) 858# Manual, Mixing 16-bit and 32-bit code, page 16-6)
852 859
853 .byte 0x66, 0xea # prefix + jmpi-opcode 860 .byte 0x66, 0xea # prefix + jmpi-opcode
854code32: .long 0x1000 # will be set to 0x100000 861code32: .long startup_32 # will be set to %cs+startup_32
855 # for big kernels
856 .word __BOOT_CS 862 .word __BOOT_CS
863.code32
864startup_32:
865 movl $(__BOOT_DS), %eax
866 movl %eax, %ds
867 movl %eax, %es
868 movl %eax, %fs
869 movl %eax, %gs
870 movl %eax, %ss
871
872 xorl %eax, %eax
8731: incl %eax # check that A20 really IS enabled
874 movl %eax, 0x00000000 # loop forever if it isn't
875 cmpl %eax, 0x00100000
876 je 1b
877
878 # Jump to the 32bit entry point
879 jmpl *(code32_start - start + (DELTA_INITSEG << 4))(%esi)
880.code16
857 881
858# Here's a bunch of information about your current kernel.. 882# Here's a bunch of information about your current kernel..
859kernel_version: .ascii UTS_RELEASE 883kernel_version: .ascii UTS_RELEASE
diff --git a/arch/i386/defconfig b/arch/i386/defconfig
index 97aacd6bd7d8..65891f11aced 100644
--- a/arch/i386/defconfig
+++ b/arch/i386/defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.19-rc2-git4 3# Linux kernel version: 2.6.19-git7
4# Sat Oct 21 03:38:56 2006 4# Wed Dec 6 23:50:49 2006
5# 5#
6CONFIG_X86_32=y 6CONFIG_X86_32=y
7CONFIG_GENERIC_TIME=y 7CONFIG_GENERIC_TIME=y
@@ -40,13 +40,14 @@ CONFIG_POSIX_MQUEUE=y
40CONFIG_IKCONFIG=y 40CONFIG_IKCONFIG=y
41CONFIG_IKCONFIG_PROC=y 41CONFIG_IKCONFIG_PROC=y
42# CONFIG_CPUSETS is not set 42# CONFIG_CPUSETS is not set
43CONFIG_SYSFS_DEPRECATED=y
43# CONFIG_RELAY is not set 44# CONFIG_RELAY is not set
44CONFIG_INITRAMFS_SOURCE="" 45CONFIG_INITRAMFS_SOURCE=""
45CONFIG_CC_OPTIMIZE_FOR_SIZE=y 46CONFIG_CC_OPTIMIZE_FOR_SIZE=y
46CONFIG_SYSCTL=y 47CONFIG_SYSCTL=y
47# CONFIG_EMBEDDED is not set 48# CONFIG_EMBEDDED is not set
48CONFIG_UID16=y 49CONFIG_UID16=y
49# CONFIG_SYSCTL_SYSCALL is not set 50CONFIG_SYSCTL_SYSCALL=y
50CONFIG_KALLSYMS=y 51CONFIG_KALLSYMS=y
51CONFIG_KALLSYMS_ALL=y 52CONFIG_KALLSYMS_ALL=y
52# CONFIG_KALLSYMS_EXTRA_PASS is not set 53# CONFIG_KALLSYMS_EXTRA_PASS is not set
@@ -110,6 +111,7 @@ CONFIG_SMP=y
110# CONFIG_X86_VISWS is not set 111# CONFIG_X86_VISWS is not set
111CONFIG_X86_GENERICARCH=y 112CONFIG_X86_GENERICARCH=y
112# CONFIG_X86_ES7000 is not set 113# CONFIG_X86_ES7000 is not set
114# CONFIG_PARAVIRT is not set
113CONFIG_X86_CYCLONE_TIMER=y 115CONFIG_X86_CYCLONE_TIMER=y
114# CONFIG_M386 is not set 116# CONFIG_M386 is not set
115# CONFIG_M486 is not set 117# CONFIG_M486 is not set
@@ -120,6 +122,7 @@ CONFIG_X86_CYCLONE_TIMER=y
120# CONFIG_MPENTIUMII is not set 122# CONFIG_MPENTIUMII is not set
121CONFIG_MPENTIUMIII=y 123CONFIG_MPENTIUMIII=y
122# CONFIG_MPENTIUMM is not set 124# CONFIG_MPENTIUMM is not set
125# CONFIG_MCORE2 is not set
123# CONFIG_MPENTIUM4 is not set 126# CONFIG_MPENTIUM4 is not set
124# CONFIG_MK6 is not set 127# CONFIG_MK6 is not set
125# CONFIG_MK7 is not set 128# CONFIG_MK7 is not set
@@ -197,7 +200,6 @@ CONFIG_RESOURCES_64BIT=y
197CONFIG_MTRR=y 200CONFIG_MTRR=y
198# CONFIG_EFI is not set 201# CONFIG_EFI is not set
199# CONFIG_IRQBALANCE is not set 202# CONFIG_IRQBALANCE is not set
200CONFIG_REGPARM=y
201CONFIG_SECCOMP=y 203CONFIG_SECCOMP=y
202# CONFIG_HZ_100 is not set 204# CONFIG_HZ_100 is not set
203CONFIG_HZ_250=y 205CONFIG_HZ_250=y
@@ -205,7 +207,8 @@ CONFIG_HZ_250=y
205CONFIG_HZ=250 207CONFIG_HZ=250
206# CONFIG_KEXEC is not set 208# CONFIG_KEXEC is not set
207# CONFIG_CRASH_DUMP is not set 209# CONFIG_CRASH_DUMP is not set
208CONFIG_PHYSICAL_START=0x100000 210# CONFIG_RELOCATABLE is not set
211CONFIG_PHYSICAL_ALIGN=0x100000
209# CONFIG_HOTPLUG_CPU is not set 212# CONFIG_HOTPLUG_CPU is not set
210CONFIG_COMPAT_VDSO=y 213CONFIG_COMPAT_VDSO=y
211CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y 214CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
@@ -367,6 +370,7 @@ CONFIG_INET_TCP_DIAG=y
367# CONFIG_TCP_CONG_ADVANCED is not set 370# CONFIG_TCP_CONG_ADVANCED is not set
368CONFIG_TCP_CONG_CUBIC=y 371CONFIG_TCP_CONG_CUBIC=y
369CONFIG_DEFAULT_TCP_CONG="cubic" 372CONFIG_DEFAULT_TCP_CONG="cubic"
373# CONFIG_TCP_MD5SIG is not set
370CONFIG_IPV6=y 374CONFIG_IPV6=y
371# CONFIG_IPV6_PRIVACY is not set 375# CONFIG_IPV6_PRIVACY is not set
372# CONFIG_IPV6_ROUTER_PREF is not set 376# CONFIG_IPV6_ROUTER_PREF is not set
@@ -677,6 +681,7 @@ CONFIG_SATA_INTEL_COMBINED=y
677# CONFIG_PATA_IT821X is not set 681# CONFIG_PATA_IT821X is not set
678# CONFIG_PATA_JMICRON is not set 682# CONFIG_PATA_JMICRON is not set
679# CONFIG_PATA_TRIFLEX is not set 683# CONFIG_PATA_TRIFLEX is not set
684# CONFIG_PATA_MARVELL is not set
680# CONFIG_PATA_MPIIX is not set 685# CONFIG_PATA_MPIIX is not set
681# CONFIG_PATA_OLDPIIX is not set 686# CONFIG_PATA_OLDPIIX is not set
682# CONFIG_PATA_NETCELL is not set 687# CONFIG_PATA_NETCELL is not set
@@ -850,6 +855,7 @@ CONFIG_BNX2=y
850# CONFIG_IXGB is not set 855# CONFIG_IXGB is not set
851# CONFIG_S2IO is not set 856# CONFIG_S2IO is not set
852# CONFIG_MYRI10GE is not set 857# CONFIG_MYRI10GE is not set
858# CONFIG_NETXEN_NIC is not set
853 859
854# 860#
855# Token Ring devices 861# Token Ring devices
@@ -984,10 +990,6 @@ CONFIG_RTC=y
984# CONFIG_R3964 is not set 990# CONFIG_R3964 is not set
985# CONFIG_APPLICOM is not set 991# CONFIG_APPLICOM is not set
986# CONFIG_SONYPI is not set 992# CONFIG_SONYPI is not set
987
988#
989# Ftape, the floppy tape device driver
990#
991CONFIG_AGP=y 993CONFIG_AGP=y
992# CONFIG_AGP_ALI is not set 994# CONFIG_AGP_ALI is not set
993# CONFIG_AGP_ATI is not set 995# CONFIG_AGP_ATI is not set
@@ -1108,6 +1110,7 @@ CONFIG_USB_DEVICEFS=y
1108# CONFIG_USB_BANDWIDTH is not set 1110# CONFIG_USB_BANDWIDTH is not set
1109# CONFIG_USB_DYNAMIC_MINORS is not set 1111# CONFIG_USB_DYNAMIC_MINORS is not set
1110# CONFIG_USB_SUSPEND is not set 1112# CONFIG_USB_SUSPEND is not set
1113# CONFIG_USB_MULTITHREAD_PROBE is not set
1111# CONFIG_USB_OTG is not set 1114# CONFIG_USB_OTG is not set
1112 1115
1113# 1116#
@@ -1185,6 +1188,7 @@ CONFIG_USB_HIDINPUT=y
1185# CONFIG_USB_KAWETH is not set 1188# CONFIG_USB_KAWETH is not set
1186# CONFIG_USB_PEGASUS is not set 1189# CONFIG_USB_PEGASUS is not set
1187# CONFIG_USB_RTL8150 is not set 1190# CONFIG_USB_RTL8150 is not set
1191# CONFIG_USB_USBNET_MII is not set
1188# CONFIG_USB_USBNET is not set 1192# CONFIG_USB_USBNET is not set
1189CONFIG_USB_MON=y 1193CONFIG_USB_MON=y
1190 1194
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 1a884b6e6e5c..1e8988e558c5 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -6,7 +6,7 @@ extra-y := head.o init_task.o vmlinux.lds
6 6
7obj-y := process.o signal.o entry.o traps.o irq.o \ 7obj-y := process.o signal.o entry.o traps.o irq.o \
8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ 8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
9 pci-dma.o i386_ksyms.o i387.o bootflag.o \ 9 pci-dma.o i386_ksyms.o i387.o bootflag.o e820.o\
10 quirks.o i8237.o topology.o alternative.o i8253.o tsc.o 10 quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
11 11
12obj-$(CONFIG_STACKTRACE) += stacktrace.o 12obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -40,6 +40,9 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
40obj-$(CONFIG_HPET_TIMER) += hpet.o 40obj-$(CONFIG_HPET_TIMER) += hpet.o
41obj-$(CONFIG_K8_NB) += k8.o 41obj-$(CONFIG_K8_NB) += k8.o
42 42
43# Make sure this is linked after any other paravirt_ops structs: see head.S
44obj-$(CONFIG_PARAVIRT) += paravirt.o
45
43EXTRA_AFLAGS := -traditional 46EXTRA_AFLAGS := -traditional
44 47
45obj-$(CONFIG_SCx200) += scx200.o 48obj-$(CONFIG_SCx200) += scx200.o
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index c9841692bb7c..4b60af7f91dd 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,6 +10,7 @@
10#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
11#include <asm/acpi.h> 11#include <asm/acpi.h>
12#include <asm/apic.h> 12#include <asm/apic.h>
13#include <asm/irq.h>
13 14
14#ifdef CONFIG_ACPI 15#ifdef CONFIG_ACPI
15 16
@@ -49,6 +50,24 @@ static int __init check_bridge(int vendor, int device)
49 return 0; 50 return 0;
50} 51}
51 52
53static void check_intel(void)
54{
55 u16 vendor, device;
56
57 vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
58
59 if (vendor != PCI_VENDOR_ID_INTEL)
60 return;
61
62 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
63#ifdef CONFIG_SMP
64 if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
65 device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
66 device == PCI_DEVICE_ID_INTEL_E7525_MCH)
67 quirk_intel_irqbalance();
68#endif
69}
70
52void __init check_acpi_pci(void) 71void __init check_acpi_pci(void)
53{ 72{
54 int num, slot, func; 73 int num, slot, func;
@@ -60,6 +79,8 @@ void __init check_acpi_pci(void)
60 if (!early_pci_allowed()) 79 if (!early_pci_allowed())
61 return; 80 return;
62 81
82 check_intel();
83
63 /* Poor man's PCI discovery */ 84 /* Poor man's PCI discovery */
64 for (num = 0; num < 32; num++) { 85 for (num = 0; num < 32; num++) {
65 for (slot = 0; slot < 32; slot++) { 86 for (slot = 0; slot < 32; slot++) {
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 535f9794fba1..9eca21b49f6b 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -124,6 +124,20 @@ static unsigned char** find_nop_table(void)
124 124
125#endif /* CONFIG_X86_64 */ 125#endif /* CONFIG_X86_64 */
126 126
127static void nop_out(void *insns, unsigned int len)
128{
129 unsigned char **noptable = find_nop_table();
130
131 while (len > 0) {
132 unsigned int noplen = len;
133 if (noplen > ASM_NOP_MAX)
134 noplen = ASM_NOP_MAX;
135 memcpy(insns, noptable[noplen], noplen);
136 insns += noplen;
137 len -= noplen;
138 }
139}
140
127extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 141extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
128extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; 142extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
129extern u8 *__smp_locks[], *__smp_locks_end[]; 143extern u8 *__smp_locks[], *__smp_locks_end[];
@@ -138,10 +152,9 @@ extern u8 __smp_alt_begin[], __smp_alt_end[];
138 152
139void apply_alternatives(struct alt_instr *start, struct alt_instr *end) 153void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
140{ 154{
141 unsigned char **noptable = find_nop_table();
142 struct alt_instr *a; 155 struct alt_instr *a;
143 u8 *instr; 156 u8 *instr;
144 int diff, i, k; 157 int diff;
145 158
146 DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); 159 DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
147 for (a = start; a < end; a++) { 160 for (a = start; a < end; a++) {
@@ -159,13 +172,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
159#endif 172#endif
160 memcpy(instr, a->replacement, a->replacementlen); 173 memcpy(instr, a->replacement, a->replacementlen);
161 diff = a->instrlen - a->replacementlen; 174 diff = a->instrlen - a->replacementlen;
162 /* Pad the rest with nops */ 175 nop_out(instr + a->replacementlen, diff);
163 for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
164 k = diff;
165 if (k > ASM_NOP_MAX)
166 k = ASM_NOP_MAX;
167 memcpy(a->instr + i, noptable[k], k);
168 }
169 } 176 }
170} 177}
171 178
@@ -209,7 +216,6 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
209 216
210static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 217static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
211{ 218{
212 unsigned char **noptable = find_nop_table();
213 u8 **ptr; 219 u8 **ptr;
214 220
215 for (ptr = start; ptr < end; ptr++) { 221 for (ptr = start; ptr < end; ptr++) {
@@ -217,7 +223,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
217 continue; 223 continue;
218 if (*ptr > text_end) 224 if (*ptr > text_end)
219 continue; 225 continue;
220 **ptr = noptable[1][0]; 226 nop_out(*ptr, 1);
221 }; 227 };
222} 228}
223 229
@@ -343,6 +349,40 @@ void alternatives_smp_switch(int smp)
343 349
344#endif 350#endif
345 351
352#ifdef CONFIG_PARAVIRT
353void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
354{
355 struct paravirt_patch *p;
356
357 for (p = start; p < end; p++) {
358 unsigned int used;
359
360 used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
361 p->len);
362#ifdef CONFIG_DEBUG_PARAVIRT
363 {
364 int i;
365 /* Deliberately clobber regs using "not %reg" to find bugs. */
366 for (i = 0; i < 3; i++) {
367 if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
368 memcpy(p->instr + used, "\xf7\xd0", 2);
369 p->instr[used+1] |= i;
370 used += 2;
371 }
372 }
373 }
374#endif
375 /* Pad the rest with nops */
376 nop_out(p->instr + used, p->len - used);
377 }
378
379 /* Sync to be conservative, in case we patched following instructions */
380 sync_core();
381}
382extern struct paravirt_patch __start_parainstructions[],
383 __stop_parainstructions[];
384#endif /* CONFIG_PARAVIRT */
385
346void __init alternative_instructions(void) 386void __init alternative_instructions(void)
347{ 387{
348 unsigned long flags; 388 unsigned long flags;
@@ -390,5 +430,6 @@ void __init alternative_instructions(void)
390 alternatives_smp_switch(0); 430 alternatives_smp_switch(0);
391 } 431 }
392#endif 432#endif
433 apply_paravirt(__start_parainstructions, __stop_parainstructions);
393 local_irq_restore(flags); 434 local_irq_restore(flags);
394} 435}
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 2fd4b7d927c2..776d9be26af9 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -647,23 +647,30 @@ static struct {
647static int lapic_suspend(struct sys_device *dev, pm_message_t state) 647static int lapic_suspend(struct sys_device *dev, pm_message_t state)
648{ 648{
649 unsigned long flags; 649 unsigned long flags;
650 int maxlvt;
650 651
651 if (!apic_pm_state.active) 652 if (!apic_pm_state.active)
652 return 0; 653 return 0;
653 654
655 maxlvt = get_maxlvt();
656
654 apic_pm_state.apic_id = apic_read(APIC_ID); 657 apic_pm_state.apic_id = apic_read(APIC_ID);
655 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); 658 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
656 apic_pm_state.apic_ldr = apic_read(APIC_LDR); 659 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
657 apic_pm_state.apic_dfr = apic_read(APIC_DFR); 660 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
658 apic_pm_state.apic_spiv = apic_read(APIC_SPIV); 661 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
659 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); 662 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
660 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); 663 if (maxlvt >= 4)
664 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
661 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); 665 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
662 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); 666 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
663 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 667 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
664 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 668 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
665 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 669 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
666 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 670#ifdef CONFIG_X86_MCE_P4THERMAL
671 if (maxlvt >= 5)
672 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
673#endif
667 674
668 local_irq_save(flags); 675 local_irq_save(flags);
669 disable_local_APIC(); 676 disable_local_APIC();
@@ -675,10 +682,13 @@ static int lapic_resume(struct sys_device *dev)
675{ 682{
676 unsigned int l, h; 683 unsigned int l, h;
677 unsigned long flags; 684 unsigned long flags;
685 int maxlvt;
678 686
679 if (!apic_pm_state.active) 687 if (!apic_pm_state.active)
680 return 0; 688 return 0;
681 689
690 maxlvt = get_maxlvt();
691
682 local_irq_save(flags); 692 local_irq_save(flags);
683 693
684 /* 694 /*
@@ -700,8 +710,12 @@ static int lapic_resume(struct sys_device *dev)
700 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 710 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
701 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 711 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
702 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 712 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
703 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 713#ifdef CONFIG_X86_MCE_P4THERMAL
704 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); 714 if (maxlvt >= 5)
715 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
716#endif
717 if (maxlvt >= 4)
718 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
705 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); 719 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
706 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); 720 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
707 apic_write(APIC_TMICT, apic_pm_state.apic_tmict); 721 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index a60358fe9a49..a97847da9ed5 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -231,6 +231,7 @@
231#include <asm/uaccess.h> 231#include <asm/uaccess.h>
232#include <asm/desc.h> 232#include <asm/desc.h>
233#include <asm/i8253.h> 233#include <asm/i8253.h>
234#include <asm/paravirt.h>
234 235
235#include "io_ports.h" 236#include "io_ports.h"
236 237
@@ -2235,7 +2236,7 @@ static int __init apm_init(void)
2235 2236
2236 dmi_check_system(apm_dmi_table); 2237 dmi_check_system(apm_dmi_table);
2237 2238
2238 if (apm_info.bios.version == 0) { 2239 if (apm_info.bios.version == 0 || paravirt_enabled()) {
2239 printk(KERN_INFO "apm: BIOS not found.\n"); 2240 printk(KERN_INFO "apm: BIOS not found.\n");
2240 return -ENODEV; 2241 return -ENODEV;
2241 } 2242 }
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index c80271f8f084..1b2f3cd33270 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -15,6 +15,7 @@
15#include <asm/processor.h> 15#include <asm/processor.h>
16#include <asm/thread_info.h> 16#include <asm/thread_info.h>
17#include <asm/elf.h> 17#include <asm/elf.h>
18#include <asm/pda.h>
18 19
19#define DEFINE(sym, val) \ 20#define DEFINE(sym, val) \
20 asm volatile("\n->" #sym " %0 " #val : : "i" (val)) 21 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -51,13 +52,35 @@ void foo(void)
51 OFFSET(TI_exec_domain, thread_info, exec_domain); 52 OFFSET(TI_exec_domain, thread_info, exec_domain);
52 OFFSET(TI_flags, thread_info, flags); 53 OFFSET(TI_flags, thread_info, flags);
53 OFFSET(TI_status, thread_info, status); 54 OFFSET(TI_status, thread_info, status);
54 OFFSET(TI_cpu, thread_info, cpu);
55 OFFSET(TI_preempt_count, thread_info, preempt_count); 55 OFFSET(TI_preempt_count, thread_info, preempt_count);
56 OFFSET(TI_addr_limit, thread_info, addr_limit); 56 OFFSET(TI_addr_limit, thread_info, addr_limit);
57 OFFSET(TI_restart_block, thread_info, restart_block); 57 OFFSET(TI_restart_block, thread_info, restart_block);
58 OFFSET(TI_sysenter_return, thread_info, sysenter_return); 58 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
59 BLANK(); 59 BLANK();
60 60
61 OFFSET(GDS_size, Xgt_desc_struct, size);
62 OFFSET(GDS_address, Xgt_desc_struct, address);
63 OFFSET(GDS_pad, Xgt_desc_struct, pad);
64 BLANK();
65
66 OFFSET(PT_EBX, pt_regs, ebx);
67 OFFSET(PT_ECX, pt_regs, ecx);
68 OFFSET(PT_EDX, pt_regs, edx);
69 OFFSET(PT_ESI, pt_regs, esi);
70 OFFSET(PT_EDI, pt_regs, edi);
71 OFFSET(PT_EBP, pt_regs, ebp);
72 OFFSET(PT_EAX, pt_regs, eax);
73 OFFSET(PT_DS, pt_regs, xds);
74 OFFSET(PT_ES, pt_regs, xes);
75 OFFSET(PT_GS, pt_regs, xgs);
76 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
77 OFFSET(PT_EIP, pt_regs, eip);
78 OFFSET(PT_CS, pt_regs, xcs);
79 OFFSET(PT_EFLAGS, pt_regs, eflags);
80 OFFSET(PT_OLDESP, pt_regs, esp);
81 OFFSET(PT_OLDSS, pt_regs, xss);
82 BLANK();
83
61 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); 84 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
62 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 85 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
63 BLANK(); 86 BLANK();
@@ -74,4 +97,18 @@ void foo(void)
74 DEFINE(VDSO_PRELINK, VDSO_PRELINK); 97 DEFINE(VDSO_PRELINK, VDSO_PRELINK);
75 98
76 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); 99 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
100
101 BLANK();
102 OFFSET(PDA_cpu, i386_pda, cpu_number);
103 OFFSET(PDA_pcurrent, i386_pda, pcurrent);
104
105#ifdef CONFIG_PARAVIRT
106 BLANK();
107 OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
108 OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
109 OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
110 OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
111 OFFSET(PARAVIRT_iret, paravirt_ops, iret);
112 OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
113#endif
77} 114}
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index e4758095d87a..41cfea57232b 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -104,10 +104,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
104 f_vide(); 104 f_vide();
105 rdtscl(d2); 105 rdtscl(d2);
106 d = d2-d; 106 d = d2-d;
107 107
108 /* Knock these two lines out if it debugs out ok */
109 printk(KERN_INFO "AMD K6 stepping B detected - ");
110 /* -- cut here -- */
111 if (d > 20*K6_BUG_LOOP) 108 if (d > 20*K6_BUG_LOOP)
112 printk("system stability may be impaired when more than 32 MB are used.\n"); 109 printk("system stability may be impaired when more than 32 MB are used.\n");
113 else 110 else
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index d9f3e3c31f05..1b34c56f8123 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,14 +18,15 @@
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <mach_apic.h> 19#include <mach_apic.h>
20#endif 20#endif
21#include <asm/pda.h>
21 22
22#include "cpu.h" 23#include "cpu.h"
23 24
24DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); 25DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
25EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); 26EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
26 27
27DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); 28struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
28EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); 29EXPORT_SYMBOL(_cpu_pda);
29 30
30static int cachesize_override __cpuinitdata = -1; 31static int cachesize_override __cpuinitdata = -1;
31static int disable_x86_fxsr __cpuinitdata; 32static int disable_x86_fxsr __cpuinitdata;
@@ -235,29 +236,14 @@ static int __cpuinit have_cpuid_p(void)
235 return flag_is_changeable_p(X86_EFLAGS_ID); 236 return flag_is_changeable_p(X86_EFLAGS_ID);
236} 237}
237 238
238/* Do minimum CPU detection early. 239void __init cpu_detect(struct cpuinfo_x86 *c)
239 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
240 The others are not touched to avoid unwanted side effects.
241
242 WARNING: this function is only called on the BP. Don't add code here
243 that is supposed to run on all CPUs. */
244static void __init early_cpu_detect(void)
245{ 240{
246 struct cpuinfo_x86 *c = &boot_cpu_data;
247
248 c->x86_cache_alignment = 32;
249
250 if (!have_cpuid_p())
251 return;
252
253 /* Get vendor name */ 241 /* Get vendor name */
254 cpuid(0x00000000, &c->cpuid_level, 242 cpuid(0x00000000, &c->cpuid_level,
255 (int *)&c->x86_vendor_id[0], 243 (int *)&c->x86_vendor_id[0],
256 (int *)&c->x86_vendor_id[8], 244 (int *)&c->x86_vendor_id[8],
257 (int *)&c->x86_vendor_id[4]); 245 (int *)&c->x86_vendor_id[4]);
258 246
259 get_cpu_vendor(c, 1);
260
261 c->x86 = 4; 247 c->x86 = 4;
262 if (c->cpuid_level >= 0x00000001) { 248 if (c->cpuid_level >= 0x00000001) {
263 u32 junk, tfms, cap0, misc; 249 u32 junk, tfms, cap0, misc;
@@ -274,6 +260,26 @@ static void __init early_cpu_detect(void)
274 } 260 }
275} 261}
276 262
263/* Do minimum CPU detection early.
264 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
265 The others are not touched to avoid unwanted side effects.
266
267 WARNING: this function is only called on the BP. Don't add code here
268 that is supposed to run on all CPUs. */
269static void __init early_cpu_detect(void)
270{
271 struct cpuinfo_x86 *c = &boot_cpu_data;
272
273 c->x86_cache_alignment = 32;
274
275 if (!have_cpuid_p())
276 return;
277
278 cpu_detect(c);
279
280 get_cpu_vendor(c, 1);
281}
282
277static void __cpuinit generic_identify(struct cpuinfo_x86 * c) 283static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
278{ 284{
279 u32 tfms, xlvl; 285 u32 tfms, xlvl;
@@ -308,6 +314,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
308#else 314#else
309 c->apicid = (ebx >> 24) & 0xFF; 315 c->apicid = (ebx >> 24) & 0xFF;
310#endif 316#endif
317 if (c->x86_capability[0] & (1<<19))
318 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
311 } else { 319 } else {
312 /* Have CPUID level 0 only - unheard of */ 320 /* Have CPUID level 0 only - unheard of */
313 c->x86 = 4; 321 c->x86 = 4;
@@ -372,6 +380,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
372 c->x86_vendor_id[0] = '\0'; /* Unset */ 380 c->x86_vendor_id[0] = '\0'; /* Unset */
373 c->x86_model_id[0] = '\0'; /* Unset */ 381 c->x86_model_id[0] = '\0'; /* Unset */
374 c->x86_max_cores = 1; 382 c->x86_max_cores = 1;
383 c->x86_clflush_size = 32;
375 memset(&c->x86_capability, 0, sizeof c->x86_capability); 384 memset(&c->x86_capability, 0, sizeof c->x86_capability);
376 385
377 if (!have_cpuid_p()) { 386 if (!have_cpuid_p()) {
@@ -591,42 +600,24 @@ void __init early_cpu_init(void)
591 disable_pse = 1; 600 disable_pse = 1;
592#endif 601#endif
593} 602}
594/* 603
595 * cpu_init() initializes state that is per-CPU. Some data is already 604/* Make sure %gs is initialized properly in idle threads */
596 * initialized (naturally) in the bootstrap process, such as the GDT 605struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
597 * and IDT. We reload them nevertheless, this function acts as a
598 * 'CPU state barrier', nothing should get across.
599 */
600void __cpuinit cpu_init(void)
601{ 606{
602 int cpu = smp_processor_id(); 607 memset(regs, 0, sizeof(struct pt_regs));
603 struct tss_struct * t = &per_cpu(init_tss, cpu); 608 regs->xgs = __KERNEL_PDA;
604 struct thread_struct *thread = &current->thread; 609 return regs;
605 struct desc_struct *gdt; 610}
606 __u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
607 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
608 611
609 if (cpu_test_and_set(cpu, cpu_initialized)) { 612static __cpuinit int alloc_gdt(int cpu)
610 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); 613{
611 for (;;) local_irq_enable(); 614 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
612 } 615 struct desc_struct *gdt;
613 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 616 struct i386_pda *pda;
614 617
615 if (cpu_has_vme || cpu_has_tsc || cpu_has_de) 618 gdt = (struct desc_struct *)cpu_gdt_descr->address;
616 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 619 pda = cpu_pda(cpu);
617 if (tsc_disable && cpu_has_tsc) {
618 printk(KERN_NOTICE "Disabling TSC...\n");
619 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
620 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
621 set_in_cr4(X86_CR4_TSD);
622 }
623 620
624 /* The CPU hotplug case */
625 if (cpu_gdt_descr->address) {
626 gdt = (struct desc_struct *)cpu_gdt_descr->address;
627 memset(gdt, 0, PAGE_SIZE);
628 goto old_gdt;
629 }
630 /* 621 /*
631 * This is a horrible hack to allocate the GDT. The problem 622 * This is a horrible hack to allocate the GDT. The problem
632 * is that cpu_init() is called really early for the boot CPU 623 * is that cpu_init() is called really early for the boot CPU
@@ -634,43 +625,130 @@ void __cpuinit cpu_init(void)
634 * CPUs, when bootmem will have gone away 625 * CPUs, when bootmem will have gone away
635 */ 626 */
636 if (NODE_DATA(0)->bdata->node_bootmem_map) { 627 if (NODE_DATA(0)->bdata->node_bootmem_map) {
637 gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); 628 BUG_ON(gdt != NULL || pda != NULL);
638 /* alloc_bootmem_pages panics on failure, so no check */ 629
630 gdt = alloc_bootmem_pages(PAGE_SIZE);
631 pda = alloc_bootmem(sizeof(*pda));
632 /* alloc_bootmem(_pages) panics on failure, so no check */
633
639 memset(gdt, 0, PAGE_SIZE); 634 memset(gdt, 0, PAGE_SIZE);
635 memset(pda, 0, sizeof(*pda));
640 } else { 636 } else {
641 gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); 637 /* GDT and PDA might already have been allocated if
642 if (unlikely(!gdt)) { 638 this is a CPU hotplug re-insertion. */
643 printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); 639 if (gdt == NULL)
644 for (;;) 640 gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
645 local_irq_enable(); 641
642 if (pda == NULL)
643 pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
644
645 if (unlikely(!gdt || !pda)) {
646 free_pages((unsigned long)gdt, 0);
647 kfree(pda);
648 return 0;
646 } 649 }
647 } 650 }
648old_gdt: 651
652 cpu_gdt_descr->address = (unsigned long)gdt;
653 cpu_pda(cpu) = pda;
654
655 return 1;
656}
657
658/* Initial PDA used by boot CPU */
659struct i386_pda boot_pda = {
660 ._pda = &boot_pda,
661 .cpu_number = 0,
662 .pcurrent = &init_task,
663};
664
665static inline void set_kernel_gs(void)
666{
667 /* Set %gs for this CPU's PDA. Memory clobber is to create a
668 barrier with respect to any PDA operations, so the compiler
669 doesn't move any before here. */
670 asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
671}
672
673/* Initialize the CPU's GDT and PDA. The boot CPU does this for
674 itself, but secondaries find this done for them. */
675__cpuinit int init_gdt(int cpu, struct task_struct *idle)
676{
677 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
678 struct desc_struct *gdt;
679 struct i386_pda *pda;
680
681 /* For non-boot CPUs, the GDT and PDA should already have been
682 allocated. */
683 if (!alloc_gdt(cpu)) {
684 printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
685 return 0;
686 }
687
688 gdt = (struct desc_struct *)cpu_gdt_descr->address;
689 pda = cpu_pda(cpu);
690
691 BUG_ON(gdt == NULL || pda == NULL);
692
649 /* 693 /*
650 * Initialize the per-CPU GDT with the boot GDT, 694 * Initialize the per-CPU GDT with the boot GDT,
651 * and set up the GDT descriptor: 695 * and set up the GDT descriptor:
652 */ 696 */
653 memcpy(gdt, cpu_gdt_table, GDT_SIZE); 697 memcpy(gdt, cpu_gdt_table, GDT_SIZE);
698 cpu_gdt_descr->size = GDT_SIZE - 1;
654 699
655 /* Set up GDT entry for 16bit stack */ 700 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
656 *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= 701 (u32 *)&gdt[GDT_ENTRY_PDA].b,
657 ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | 702 (unsigned long)pda, sizeof(*pda) - 1,
658 ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | 703 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
659 (CPU_16BIT_STACK_SIZE - 1);
660 704
661 cpu_gdt_descr->size = GDT_SIZE - 1; 705 memset(pda, 0, sizeof(*pda));
662 cpu_gdt_descr->address = (unsigned long)gdt; 706 pda->_pda = pda;
707 pda->cpu_number = cpu;
708 pda->pcurrent = idle;
709
710 return 1;
711}
712
713/* Common CPU init for both boot and secondary CPUs */
714static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
715{
716 struct tss_struct * t = &per_cpu(init_tss, cpu);
717 struct thread_struct *thread = &curr->thread;
718 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
663 719
720 /* Reinit these anyway, even if they've already been done (on
721 the boot CPU, this will transition from the boot gdt+pda to
722 the real ones). */
664 load_gdt(cpu_gdt_descr); 723 load_gdt(cpu_gdt_descr);
724 set_kernel_gs();
725
726 if (cpu_test_and_set(cpu, cpu_initialized)) {
727 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
728 for (;;) local_irq_enable();
729 }
730
731 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
732
733 if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
734 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
735 if (tsc_disable && cpu_has_tsc) {
736 printk(KERN_NOTICE "Disabling TSC...\n");
737 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
738 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
739 set_in_cr4(X86_CR4_TSD);
740 }
741
665 load_idt(&idt_descr); 742 load_idt(&idt_descr);
666 743
667 /* 744 /*
668 * Set up and load the per-CPU TSS and LDT 745 * Set up and load the per-CPU TSS and LDT
669 */ 746 */
670 atomic_inc(&init_mm.mm_count); 747 atomic_inc(&init_mm.mm_count);
671 current->active_mm = &init_mm; 748 curr->active_mm = &init_mm;
672 BUG_ON(current->mm); 749 if (curr->mm)
673 enter_lazy_tlb(&init_mm, current); 750 BUG();
751 enter_lazy_tlb(&init_mm, curr);
674 752
675 load_esp0(t, thread); 753 load_esp0(t, thread);
676 set_tss_desc(cpu,t); 754 set_tss_desc(cpu,t);
@@ -682,8 +760,8 @@ old_gdt:
682 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); 760 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
683#endif 761#endif
684 762
685 /* Clear %fs and %gs. */ 763 /* Clear %fs. */
686 asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0)); 764 asm volatile ("mov %0, %%fs" : : "r" (0));
687 765
688 /* Clear all 6 debug registers: */ 766 /* Clear all 6 debug registers: */
689 set_debugreg(0, 0); 767 set_debugreg(0, 0);
@@ -701,6 +779,37 @@ old_gdt:
701 mxcsr_feature_mask_init(); 779 mxcsr_feature_mask_init();
702} 780}
703 781
782/* Entrypoint to initialize secondary CPU */
783void __cpuinit secondary_cpu_init(void)
784{
785 int cpu = smp_processor_id();
786 struct task_struct *curr = current;
787
788 _cpu_init(cpu, curr);
789}
790
791/*
792 * cpu_init() initializes state that is per-CPU. Some data is already
793 * initialized (naturally) in the bootstrap process, such as the GDT
794 * and IDT. We reload them nevertheless, this function acts as a
795 * 'CPU state barrier', nothing should get across.
796 */
797void __cpuinit cpu_init(void)
798{
799 int cpu = smp_processor_id();
800 struct task_struct *curr = current;
801
802 /* Set up the real GDT and PDA, so we can transition from the
803 boot versions. */
804 if (!init_gdt(cpu, curr)) {
805 /* failed to allocate something; not much we can do... */
806 for (;;)
807 local_irq_enable();
808 }
809
810 _cpu_init(cpu, curr);
811}
812
704#ifdef CONFIG_HOTPLUG_CPU 813#ifdef CONFIG_HOTPLUG_CPU
705void __cpuinit cpu_uninit(void) 814void __cpuinit cpu_uninit(void)
706{ 815{
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 94a95aa5227e..56fe26584957 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
107 * Note that the workaround only should be initialized once... 107 * Note that the workaround only should be initialized once...
108 */ 108 */
109 c->f00f_bug = 0; 109 c->f00f_bug = 0;
110 if ( c->x86 == 5 ) { 110 if (!paravirt_enabled() && c->x86 == 5) {
111 static int f00f_workaround_enabled = 0; 111 static int f00f_workaround_enabled = 0;
112 112
113 c->f00f_bug = 1; 113 c->f00f_bug = 1;
@@ -195,8 +195,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
195 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 195 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
196 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 196 (c->x86 == 0x6 && c->x86_model >= 0x0e))
197 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); 197 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
198}
199 198
199 if (cpu_has_ds) {
200 unsigned int l1;
201 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
202 if (!(l1 & (1<<11)))
203 set_bit(X86_FEATURE_BTS, c->x86_capability);
204 if (!(l1 & (1<<12)))
205 set_bit(X86_FEATURE_PEBS, c->x86_capability);
206 }
207}
200 208
201static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) 209static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
202{ 210{
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 5c43be47587f..80b4c5d421b1 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -480,12 +480,10 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
480 if (num_cache_leaves == 0) 480 if (num_cache_leaves == 0)
481 return -ENOENT; 481 return -ENOENT;
482 482
483 cpuid4_info[cpu] = kmalloc( 483 cpuid4_info[cpu] = kzalloc(
484 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); 484 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
485 if (unlikely(cpuid4_info[cpu] == NULL)) 485 if (unlikely(cpuid4_info[cpu] == NULL))
486 return -ENOMEM; 486 return -ENOMEM;
487 memset(cpuid4_info[cpu], 0,
488 sizeof(struct _cpuid4_info) * num_cache_leaves);
489 487
490 oldmask = current->cpus_allowed; 488 oldmask = current->cpus_allowed;
491 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); 489 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu));
@@ -658,17 +656,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
658 return -ENOENT; 656 return -ENOENT;
659 657
660 /* Allocate all required memory */ 658 /* Allocate all required memory */
661 cache_kobject[cpu] = kmalloc(sizeof(struct kobject), GFP_KERNEL); 659 cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL);
662 if (unlikely(cache_kobject[cpu] == NULL)) 660 if (unlikely(cache_kobject[cpu] == NULL))
663 goto err_out; 661 goto err_out;
664 memset(cache_kobject[cpu], 0, sizeof(struct kobject));
665 662
666 index_kobject[cpu] = kmalloc( 663 index_kobject[cpu] = kzalloc(
667 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); 664 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
668 if (unlikely(index_kobject[cpu] == NULL)) 665 if (unlikely(index_kobject[cpu] == NULL))
669 goto err_out; 666 goto err_out;
670 memset(index_kobject[cpu], 0,
671 sizeof(struct _index_kobject) * num_cache_leaves);
672 667
673 return 0; 668 return 0;
674 669
diff --git a/arch/i386/kernel/cpu/mtrr/Makefile b/arch/i386/kernel/cpu/mtrr/Makefile
index a25b701ab84e..191fc0533649 100644
--- a/arch/i386/kernel/cpu/mtrr/Makefile
+++ b/arch/i386/kernel/cpu/mtrr/Makefile
@@ -1,5 +1,3 @@
1obj-y := main.o if.o generic.o state.o 1obj-y := main.o if.o generic.o state.o
2obj-y += amd.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3obj-y += cyrix.o
4obj-y += centaur.o
5 3
diff --git a/arch/i386/kernel/cpu/mtrr/amd.c b/arch/i386/kernel/cpu/mtrr/amd.c
index 1a1e04b6fd00..0949cdbf848a 100644
--- a/arch/i386/kernel/cpu/mtrr/amd.c
+++ b/arch/i386/kernel/cpu/mtrr/amd.c
@@ -7,7 +7,7 @@
7 7
8static void 8static void
9amd_get_mtrr(unsigned int reg, unsigned long *base, 9amd_get_mtrr(unsigned int reg, unsigned long *base,
10 unsigned int *size, mtrr_type * type) 10 unsigned long *size, mtrr_type * type)
11{ 11{
12 unsigned long low, high; 12 unsigned long low, high;
13 13
diff --git a/arch/i386/kernel/cpu/mtrr/centaur.c b/arch/i386/kernel/cpu/mtrr/centaur.c
index 33f00ac314ef..cb9aa3a7a7ab 100644
--- a/arch/i386/kernel/cpu/mtrr/centaur.c
+++ b/arch/i386/kernel/cpu/mtrr/centaur.c
@@ -17,7 +17,7 @@ static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
17 */ 17 */
18 18
19static int 19static int
20centaur_get_free_region(unsigned long base, unsigned long size) 20centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
21/* [SUMMARY] Get a free MTRR. 21/* [SUMMARY] Get a free MTRR.
22 <base> The starting (base) address of the region. 22 <base> The starting (base) address of the region.
23 <size> The size (in bytes) of the region. 23 <size> The size (in bytes) of the region.
@@ -26,10 +26,11 @@ centaur_get_free_region(unsigned long base, unsigned long size)
26{ 26{
27 int i, max; 27 int i, max;
28 mtrr_type ltype; 28 mtrr_type ltype;
29 unsigned long lbase; 29 unsigned long lbase, lsize;
30 unsigned int lsize;
31 30
32 max = num_var_ranges; 31 max = num_var_ranges;
32 if (replace_reg >= 0 && replace_reg < max)
33 return replace_reg;
33 for (i = 0; i < max; ++i) { 34 for (i = 0; i < max; ++i) {
34 if (centaur_mcr_reserved & (1 << i)) 35 if (centaur_mcr_reserved & (1 << i))
35 continue; 36 continue;
@@ -49,7 +50,7 @@ mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
49 50
50static void 51static void
51centaur_get_mcr(unsigned int reg, unsigned long *base, 52centaur_get_mcr(unsigned int reg, unsigned long *base,
52 unsigned int *size, mtrr_type * type) 53 unsigned long *size, mtrr_type * type)
53{ 54{
54 *base = centaur_mcr[reg].high >> PAGE_SHIFT; 55 *base = centaur_mcr[reg].high >> PAGE_SHIFT;
55 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; 56 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c
index 9027a987006b..0737a596db43 100644
--- a/arch/i386/kernel/cpu/mtrr/cyrix.c
+++ b/arch/i386/kernel/cpu/mtrr/cyrix.c
@@ -9,7 +9,7 @@ int arr3_protected;
9 9
10static void 10static void
11cyrix_get_arr(unsigned int reg, unsigned long *base, 11cyrix_get_arr(unsigned int reg, unsigned long *base,
12 unsigned int *size, mtrr_type * type) 12 unsigned long *size, mtrr_type * type)
13{ 13{
14 unsigned long flags; 14 unsigned long flags;
15 unsigned char arr, ccr3, rcr, shift; 15 unsigned char arr, ccr3, rcr, shift;
@@ -77,7 +77,7 @@ cyrix_get_arr(unsigned int reg, unsigned long *base,
77} 77}
78 78
79static int 79static int
80cyrix_get_free_region(unsigned long base, unsigned long size) 80cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
81/* [SUMMARY] Get a free ARR. 81/* [SUMMARY] Get a free ARR.
82 <base> The starting (base) address of the region. 82 <base> The starting (base) address of the region.
83 <size> The size (in bytes) of the region. 83 <size> The size (in bytes) of the region.
@@ -86,9 +86,24 @@ cyrix_get_free_region(unsigned long base, unsigned long size)
86{ 86{
87 int i; 87 int i;
88 mtrr_type ltype; 88 mtrr_type ltype;
89 unsigned long lbase; 89 unsigned long lbase, lsize;
90 unsigned int lsize;
91 90
91 switch (replace_reg) {
92 case 7:
93 if (size < 0x40)
94 break;
95 case 6:
96 case 5:
97 case 4:
98 return replace_reg;
99 case 3:
100 if (arr3_protected)
101 break;
102 case 2:
103 case 1:
104 case 0:
105 return replace_reg;
106 }
92 /* If we are to set up a region >32M then look at ARR7 immediately */ 107 /* If we are to set up a region >32M then look at ARR7 immediately */
93 if (size > 0x2000) { 108 if (size > 0x2000) {
94 cyrix_get_arr(7, &lbase, &lsize, &ltype); 109 cyrix_get_arr(7, &lbase, &lsize, &ltype);
@@ -214,7 +229,7 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
214 229
215typedef struct { 230typedef struct {
216 unsigned long base; 231 unsigned long base;
217 unsigned int size; 232 unsigned long size;
218 mtrr_type type; 233 mtrr_type type;
219} arr_state_t; 234} arr_state_t;
220 235
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index 0b61eed8bbd8..f77fc53db654 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -3,6 +3,7 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/module.h>
6#include <asm/io.h> 7#include <asm/io.h>
7#include <asm/mtrr.h> 8#include <asm/mtrr.h>
8#include <asm/msr.h> 9#include <asm/msr.h>
@@ -15,12 +16,19 @@ struct mtrr_state {
15 struct mtrr_var_range *var_ranges; 16 struct mtrr_var_range *var_ranges;
16 mtrr_type fixed_ranges[NUM_FIXED_RANGES]; 17 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
17 unsigned char enabled; 18 unsigned char enabled;
19 unsigned char have_fixed;
18 mtrr_type def_type; 20 mtrr_type def_type;
19}; 21};
20 22
21static unsigned long smp_changes_mask; 23static unsigned long smp_changes_mask;
22static struct mtrr_state mtrr_state = {}; 24static struct mtrr_state mtrr_state = {};
23 25
26#undef MODULE_PARAM_PREFIX
27#define MODULE_PARAM_PREFIX "mtrr."
28
29static __initdata int mtrr_show;
30module_param_named(show, mtrr_show, bool, 0);
31
24/* Get the MSR pair relating to a var range */ 32/* Get the MSR pair relating to a var range */
25static void __init 33static void __init
26get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) 34get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
@@ -43,6 +51,14 @@ get_fixed_ranges(mtrr_type * frs)
43 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); 51 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
44} 52}
45 53
54static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types)
55{
56 unsigned i;
57
58 for (i = 0; i < 8; ++i, ++types, base += step)
59 printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types));
60}
61
46/* Grab all of the MTRR state for this CPU into *state */ 62/* Grab all of the MTRR state for this CPU into *state */
47void __init get_mtrr_state(void) 63void __init get_mtrr_state(void)
48{ 64{
@@ -58,13 +74,49 @@ void __init get_mtrr_state(void)
58 } 74 }
59 vrs = mtrr_state.var_ranges; 75 vrs = mtrr_state.var_ranges;
60 76
77 rdmsr(MTRRcap_MSR, lo, dummy);
78 mtrr_state.have_fixed = (lo >> 8) & 1;
79
61 for (i = 0; i < num_var_ranges; i++) 80 for (i = 0; i < num_var_ranges; i++)
62 get_mtrr_var_range(i, &vrs[i]); 81 get_mtrr_var_range(i, &vrs[i]);
63 get_fixed_ranges(mtrr_state.fixed_ranges); 82 if (mtrr_state.have_fixed)
83 get_fixed_ranges(mtrr_state.fixed_ranges);
64 84
65 rdmsr(MTRRdefType_MSR, lo, dummy); 85 rdmsr(MTRRdefType_MSR, lo, dummy);
66 mtrr_state.def_type = (lo & 0xff); 86 mtrr_state.def_type = (lo & 0xff);
67 mtrr_state.enabled = (lo & 0xc00) >> 10; 87 mtrr_state.enabled = (lo & 0xc00) >> 10;
88
89 if (mtrr_show) {
90 int high_width;
91
92 printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
93 if (mtrr_state.have_fixed) {
94 printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
95 mtrr_state.enabled & 1 ? "en" : "dis");
96 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
97 for (i = 0; i < 2; ++i)
98 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
99 for (i = 0; i < 8; ++i)
100 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
101 }
102 printk(KERN_INFO "MTRR variable ranges %sabled:\n",
103 mtrr_state.enabled & 2 ? "en" : "dis");
104 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
105 for (i = 0; i < num_var_ranges; ++i) {
106 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
107 printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
108 i,
109 high_width,
110 mtrr_state.var_ranges[i].base_hi,
111 mtrr_state.var_ranges[i].base_lo >> 12,
112 high_width,
113 mtrr_state.var_ranges[i].mask_hi,
114 mtrr_state.var_ranges[i].mask_lo >> 12,
115 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
116 else
117 printk(KERN_INFO "MTRR %u disabled\n", i);
118 }
119 }
68} 120}
69 121
70/* Some BIOS's are fucked and don't set all MTRRs the same! */ 122/* Some BIOS's are fucked and don't set all MTRRs the same! */
@@ -95,7 +147,7 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
95 smp_processor_id(), msr, a, b); 147 smp_processor_id(), msr, a, b);
96} 148}
97 149
98int generic_get_free_region(unsigned long base, unsigned long size) 150int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
99/* [SUMMARY] Get a free MTRR. 151/* [SUMMARY] Get a free MTRR.
100 <base> The starting (base) address of the region. 152 <base> The starting (base) address of the region.
101 <size> The size (in bytes) of the region. 153 <size> The size (in bytes) of the region.
@@ -104,10 +156,11 @@ int generic_get_free_region(unsigned long base, unsigned long size)
104{ 156{
105 int i, max; 157 int i, max;
106 mtrr_type ltype; 158 mtrr_type ltype;
107 unsigned long lbase; 159 unsigned long lbase, lsize;
108 unsigned lsize;
109 160
110 max = num_var_ranges; 161 max = num_var_ranges;
162 if (replace_reg >= 0 && replace_reg < max)
163 return replace_reg;
111 for (i = 0; i < max; ++i) { 164 for (i = 0; i < max; ++i) {
112 mtrr_if->get(i, &lbase, &lsize, &ltype); 165 mtrr_if->get(i, &lbase, &lsize, &ltype);
113 if (lsize == 0) 166 if (lsize == 0)
@@ -117,7 +170,7 @@ int generic_get_free_region(unsigned long base, unsigned long size)
117} 170}
118 171
119static void generic_get_mtrr(unsigned int reg, unsigned long *base, 172static void generic_get_mtrr(unsigned int reg, unsigned long *base,
120 unsigned int *size, mtrr_type * type) 173 unsigned long *size, mtrr_type *type)
121{ 174{
122 unsigned int mask_lo, mask_hi, base_lo, base_hi; 175 unsigned int mask_lo, mask_hi, base_lo, base_hi;
123 176
@@ -202,7 +255,9 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
202 return changed; 255 return changed;
203} 256}
204 257
205static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) 258static u32 deftype_lo, deftype_hi;
259
260static unsigned long set_mtrr_state(void)
206/* [SUMMARY] Set the MTRR state for this CPU. 261/* [SUMMARY] Set the MTRR state for this CPU.
207 <state> The MTRR state information to read. 262 <state> The MTRR state information to read.
208 <ctxt> Some relevant CPU context. 263 <ctxt> Some relevant CPU context.
@@ -217,14 +272,14 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
217 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) 272 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
218 change_mask |= MTRR_CHANGE_MASK_VARIABLE; 273 change_mask |= MTRR_CHANGE_MASK_VARIABLE;
219 274
220 if (set_fixed_ranges(mtrr_state.fixed_ranges)) 275 if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
221 change_mask |= MTRR_CHANGE_MASK_FIXED; 276 change_mask |= MTRR_CHANGE_MASK_FIXED;
222 277
223 /* Set_mtrr_restore restores the old value of MTRRdefType, 278 /* Set_mtrr_restore restores the old value of MTRRdefType,
224 so to set it we fiddle with the saved value */ 279 so to set it we fiddle with the saved value */
225 if ((deftype_lo & 0xff) != mtrr_state.def_type 280 if ((deftype_lo & 0xff) != mtrr_state.def_type
226 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { 281 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
227 deftype_lo |= (mtrr_state.def_type | mtrr_state.enabled << 10); 282 deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10);
228 change_mask |= MTRR_CHANGE_MASK_DEFTYPE; 283 change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
229 } 284 }
230 285
@@ -233,7 +288,6 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
233 288
234 289
235static unsigned long cr4 = 0; 290static unsigned long cr4 = 0;
236static u32 deftype_lo, deftype_hi;
237static DEFINE_SPINLOCK(set_atomicity_lock); 291static DEFINE_SPINLOCK(set_atomicity_lock);
238 292
239/* 293/*
@@ -271,7 +325,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
271 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 325 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
272 326
273 /* Disable MTRRs, and set the default type to uncached */ 327 /* Disable MTRRs, and set the default type to uncached */
274 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi); 328 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
275} 329}
276 330
277static void post_set(void) __releases(set_atomicity_lock) 331static void post_set(void) __releases(set_atomicity_lock)
@@ -300,7 +354,7 @@ static void generic_set_all(void)
300 prepare_set(); 354 prepare_set();
301 355
302 /* Actually set the state */ 356 /* Actually set the state */
303 mask = set_mtrr_state(deftype_lo,deftype_hi); 357 mask = set_mtrr_state();
304 358
305 post_set(); 359 post_set();
306 local_irq_restore(flags); 360 local_irq_restore(flags);
@@ -366,7 +420,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
366 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); 420 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
367 return -EINVAL; 421 return -EINVAL;
368 } 422 }
369 if (!(base + size < 0x70000000 || base > 0x7003FFFF) && 423 if (!(base + size < 0x70000 || base > 0x7003F) &&
370 (type == MTRR_TYPE_WRCOMB 424 (type == MTRR_TYPE_WRCOMB
371 || type == MTRR_TYPE_WRBACK)) { 425 || type == MTRR_TYPE_WRBACK)) {
372 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); 426 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index 5ac051bb9d55..5ae1705eafa6 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -17,7 +17,7 @@ extern unsigned int *usage_table;
17 17
18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) 18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
19 19
20static char *mtrr_strings[MTRR_NUM_TYPES] = 20static const char *const mtrr_strings[MTRR_NUM_TYPES] =
21{ 21{
22 "uncachable", /* 0 */ 22 "uncachable", /* 0 */
23 "write-combining", /* 1 */ 23 "write-combining", /* 1 */
@@ -28,7 +28,7 @@ static char *mtrr_strings[MTRR_NUM_TYPES] =
28 "write-back", /* 6 */ 28 "write-back", /* 6 */
29}; 29};
30 30
31char *mtrr_attrib_to_str(int x) 31const char *mtrr_attrib_to_str(int x)
32{ 32{
33 return (x <= 6) ? mtrr_strings[x] : "?"; 33 return (x <= 6) ? mtrr_strings[x] : "?";
34} 34}
@@ -44,10 +44,9 @@ mtrr_file_add(unsigned long base, unsigned long size,
44 44
45 max = num_var_ranges; 45 max = num_var_ranges;
46 if (fcount == NULL) { 46 if (fcount == NULL) {
47 fcount = kmalloc(max * sizeof *fcount, GFP_KERNEL); 47 fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
48 if (!fcount) 48 if (!fcount)
49 return -ENOMEM; 49 return -ENOMEM;
50 memset(fcount, 0, max * sizeof *fcount);
51 FILE_FCOUNT(file) = fcount; 50 FILE_FCOUNT(file) = fcount;
52 } 51 }
53 if (!page) { 52 if (!page) {
@@ -155,6 +154,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
155{ 154{
156 int err = 0; 155 int err = 0;
157 mtrr_type type; 156 mtrr_type type;
157 unsigned long size;
158 struct mtrr_sentry sentry; 158 struct mtrr_sentry sentry;
159 struct mtrr_gentry gentry; 159 struct mtrr_gentry gentry;
160 void __user *arg = (void __user *) __arg; 160 void __user *arg = (void __user *) __arg;
@@ -235,15 +235,15 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
235 case MTRRIOC_GET_ENTRY: 235 case MTRRIOC_GET_ENTRY:
236 if (gentry.regnum >= num_var_ranges) 236 if (gentry.regnum >= num_var_ranges)
237 return -EINVAL; 237 return -EINVAL;
238 mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); 238 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
239 239
240 /* Hide entries that go above 4GB */ 240 /* Hide entries that go above 4GB */
241 if (gentry.base + gentry.size > 0x100000 241 if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
242 || gentry.size == 0x100000) 242 || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
243 gentry.base = gentry.size = gentry.type = 0; 243 gentry.base = gentry.size = gentry.type = 0;
244 else { 244 else {
245 gentry.base <<= PAGE_SHIFT; 245 gentry.base <<= PAGE_SHIFT;
246 gentry.size <<= PAGE_SHIFT; 246 gentry.size = size << PAGE_SHIFT;
247 gentry.type = type; 247 gentry.type = type;
248 } 248 }
249 249
@@ -273,8 +273,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
273 case MTRRIOC_GET_PAGE_ENTRY: 273 case MTRRIOC_GET_PAGE_ENTRY:
274 if (gentry.regnum >= num_var_ranges) 274 if (gentry.regnum >= num_var_ranges)
275 return -EINVAL; 275 return -EINVAL;
276 mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); 276 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
277 gentry.type = type; 277 /* Hide entries that would overflow */
278 if (size != (__typeof__(gentry.size))size)
279 gentry.base = gentry.size = gentry.type = 0;
280 else {
281 gentry.size = size;
282 gentry.type = type;
283 }
278 break; 284 break;
279 } 285 }
280 286
@@ -353,8 +359,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
353 char factor; 359 char factor;
354 int i, max, len; 360 int i, max, len;
355 mtrr_type type; 361 mtrr_type type;
356 unsigned long base; 362 unsigned long base, size;
357 unsigned int size;
358 363
359 len = 0; 364 len = 0;
360 max = num_var_ranges; 365 max = num_var_ranges;
@@ -373,7 +378,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
373 } 378 }
374 /* RED-PEN: base can be > 32bit */ 379 /* RED-PEN: base can be > 32bit */
375 len += seq_printf(seq, 380 len += seq_printf(seq,
376 "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n", 381 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
377 i, base, base >> (20 - PAGE_SHIFT), size, factor, 382 i, base, base >> (20 - PAGE_SHIFT), size, factor,
378 mtrr_attrib_to_str(type), usage_table[i]); 383 mtrr_attrib_to_str(type), usage_table[i]);
379 } 384 }
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index fff90bda4733..16bb7ea87145 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -59,7 +59,11 @@ struct mtrr_ops * mtrr_if = NULL;
59static void set_mtrr(unsigned int reg, unsigned long base, 59static void set_mtrr(unsigned int reg, unsigned long base,
60 unsigned long size, mtrr_type type); 60 unsigned long size, mtrr_type type);
61 61
62#ifndef CONFIG_X86_64
62extern int arr3_protected; 63extern int arr3_protected;
64#else
65#define arr3_protected 0
66#endif
63 67
64void set_mtrr_ops(struct mtrr_ops * ops) 68void set_mtrr_ops(struct mtrr_ops * ops)
65{ 69{
@@ -168,6 +172,13 @@ static void ipi_handler(void *info)
168 172
169#endif 173#endif
170 174
175static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
176 return type1 == MTRR_TYPE_UNCACHABLE ||
177 type2 == MTRR_TYPE_UNCACHABLE ||
178 (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
179 (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
180}
181
171/** 182/**
172 * set_mtrr - update mtrrs on all processors 183 * set_mtrr - update mtrrs on all processors
173 * @reg: mtrr in question 184 * @reg: mtrr in question
@@ -263,8 +274,8 @@ static void set_mtrr(unsigned int reg, unsigned long base,
263 274
264/** 275/**
265 * mtrr_add_page - Add a memory type region 276 * mtrr_add_page - Add a memory type region
266 * @base: Physical base address of region in pages (4 KB) 277 * @base: Physical base address of region in pages (in units of 4 kB!)
267 * @size: Physical size of region in pages (4 KB) 278 * @size: Physical size of region in pages (4 kB)
268 * @type: Type of MTRR desired 279 * @type: Type of MTRR desired
269 * @increment: If this is true do usage counting on the region 280 * @increment: If this is true do usage counting on the region
270 * 281 *
@@ -300,11 +311,9 @@ static void set_mtrr(unsigned int reg, unsigned long base,
300int mtrr_add_page(unsigned long base, unsigned long size, 311int mtrr_add_page(unsigned long base, unsigned long size,
301 unsigned int type, char increment) 312 unsigned int type, char increment)
302{ 313{
303 int i; 314 int i, replace, error;
304 mtrr_type ltype; 315 mtrr_type ltype;
305 unsigned long lbase; 316 unsigned long lbase, lsize;
306 unsigned int lsize;
307 int error;
308 317
309 if (!mtrr_if) 318 if (!mtrr_if)
310 return -ENXIO; 319 return -ENXIO;
@@ -324,12 +333,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
324 return -ENOSYS; 333 return -ENOSYS;
325 } 334 }
326 335
336 if (!size) {
337 printk(KERN_WARNING "mtrr: zero sized request\n");
338 return -EINVAL;
339 }
340
327 if (base & size_or_mask || size & size_or_mask) { 341 if (base & size_or_mask || size & size_or_mask) {
328 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); 342 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n");
329 return -EINVAL; 343 return -EINVAL;
330 } 344 }
331 345
332 error = -EINVAL; 346 error = -EINVAL;
347 replace = -1;
333 348
334 /* No CPU hotplug when we change MTRR entries */ 349 /* No CPU hotplug when we change MTRR entries */
335 lock_cpu_hotplug(); 350 lock_cpu_hotplug();
@@ -337,21 +352,28 @@ int mtrr_add_page(unsigned long base, unsigned long size,
337 mutex_lock(&mtrr_mutex); 352 mutex_lock(&mtrr_mutex);
338 for (i = 0; i < num_var_ranges; ++i) { 353 for (i = 0; i < num_var_ranges; ++i) {
339 mtrr_if->get(i, &lbase, &lsize, &ltype); 354 mtrr_if->get(i, &lbase, &lsize, &ltype);
340 if (base >= lbase + lsize) 355 if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase)
341 continue;
342 if ((base < lbase) && (base + size <= lbase))
343 continue; 356 continue;
344 /* At this point we know there is some kind of overlap/enclosure */ 357 /* At this point we know there is some kind of overlap/enclosure */
345 if ((base < lbase) || (base + size > lbase + lsize)) { 358 if (base < lbase || base + size - 1 > lbase + lsize - 1) {
359 if (base <= lbase && base + size - 1 >= lbase + lsize - 1) {
360 /* New region encloses an existing region */
361 if (type == ltype) {
362 replace = replace == -1 ? i : -2;
363 continue;
364 }
365 else if (types_compatible(type, ltype))
366 continue;
367 }
346 printk(KERN_WARNING 368 printk(KERN_WARNING
347 "mtrr: 0x%lx000,0x%lx000 overlaps existing" 369 "mtrr: 0x%lx000,0x%lx000 overlaps existing"
348 " 0x%lx000,0x%x000\n", base, size, lbase, 370 " 0x%lx000,0x%lx000\n", base, size, lbase,
349 lsize); 371 lsize);
350 goto out; 372 goto out;
351 } 373 }
352 /* New region is enclosed by an existing region */ 374 /* New region is enclosed by an existing region */
353 if (ltype != type) { 375 if (ltype != type) {
354 if (type == MTRR_TYPE_UNCACHABLE) 376 if (types_compatible(type, ltype))
355 continue; 377 continue;
356 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", 378 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
357 base, size, mtrr_attrib_to_str(ltype), 379 base, size, mtrr_attrib_to_str(ltype),
@@ -364,10 +386,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
364 goto out; 386 goto out;
365 } 387 }
366 /* Search for an empty MTRR */ 388 /* Search for an empty MTRR */
367 i = mtrr_if->get_free_region(base, size); 389 i = mtrr_if->get_free_region(base, size, replace);
368 if (i >= 0) { 390 if (i >= 0) {
369 set_mtrr(i, base, size, type); 391 set_mtrr(i, base, size, type);
370 usage_table[i] = 1; 392 if (likely(replace < 0))
393 usage_table[i] = 1;
394 else {
395 usage_table[i] = usage_table[replace] + !!increment;
396 if (unlikely(replace != i)) {
397 set_mtrr(replace, 0, 0, 0);
398 usage_table[replace] = 0;
399 }
400 }
371 } else 401 } else
372 printk(KERN_INFO "mtrr: no more MTRRs available\n"); 402 printk(KERN_INFO "mtrr: no more MTRRs available\n");
373 error = i; 403 error = i;
@@ -455,8 +485,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
455{ 485{
456 int i, max; 486 int i, max;
457 mtrr_type ltype; 487 mtrr_type ltype;
458 unsigned long lbase; 488 unsigned long lbase, lsize;
459 unsigned int lsize;
460 int error = -EINVAL; 489 int error = -EINVAL;
461 490
462 if (!mtrr_if) 491 if (!mtrr_if)
@@ -544,9 +573,11 @@ extern void centaur_init_mtrr(void);
544 573
545static void __init init_ifs(void) 574static void __init init_ifs(void)
546{ 575{
576#ifndef CONFIG_X86_64
547 amd_init_mtrr(); 577 amd_init_mtrr();
548 cyrix_init_mtrr(); 578 cyrix_init_mtrr();
549 centaur_init_mtrr(); 579 centaur_init_mtrr();
580#endif
550} 581}
551 582
552/* The suspend/resume methods are only for CPU without MTRR. CPU using generic 583/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
@@ -555,7 +586,7 @@ static void __init init_ifs(void)
555struct mtrr_value { 586struct mtrr_value {
556 mtrr_type ltype; 587 mtrr_type ltype;
557 unsigned long lbase; 588 unsigned long lbase;
558 unsigned int lsize; 589 unsigned long lsize;
559}; 590};
560 591
561static struct mtrr_value * mtrr_state; 592static struct mtrr_value * mtrr_state;
@@ -565,10 +596,8 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
565 int i; 596 int i;
566 int size = num_var_ranges * sizeof(struct mtrr_value); 597 int size = num_var_ranges * sizeof(struct mtrr_value);
567 598
568 mtrr_state = kmalloc(size,GFP_ATOMIC); 599 mtrr_state = kzalloc(size,GFP_ATOMIC);
569 if (mtrr_state) 600 if (!mtrr_state)
570 memset(mtrr_state,0,size);
571 else
572 return -ENOMEM; 601 return -ENOMEM;
573 602
574 for (i = 0; i < num_var_ranges; i++) { 603 for (i = 0; i < num_var_ranges; i++) {
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index 99c9f2682041..d61ea9db6cfe 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -43,15 +43,16 @@ struct mtrr_ops {
43 void (*set_all)(void); 43 void (*set_all)(void);
44 44
45 void (*get)(unsigned int reg, unsigned long *base, 45 void (*get)(unsigned int reg, unsigned long *base,
46 unsigned int *size, mtrr_type * type); 46 unsigned long *size, mtrr_type * type);
47 int (*get_free_region) (unsigned long base, unsigned long size); 47 int (*get_free_region)(unsigned long base, unsigned long size,
48 48 int replace_reg);
49 int (*validate_add_page)(unsigned long base, unsigned long size, 49 int (*validate_add_page)(unsigned long base, unsigned long size,
50 unsigned int type); 50 unsigned int type);
51 int (*have_wrcomb)(void); 51 int (*have_wrcomb)(void);
52}; 52};
53 53
54extern int generic_get_free_region(unsigned long base, unsigned long size); 54extern int generic_get_free_region(unsigned long base, unsigned long size,
55 int replace_reg);
55extern int generic_validate_add_page(unsigned long base, unsigned long size, 56extern int generic_validate_add_page(unsigned long base, unsigned long size,
56 unsigned int type); 57 unsigned int type);
57 58
@@ -62,17 +63,17 @@ extern int positive_have_wrcomb(void);
62/* library functions for processor-specific routines */ 63/* library functions for processor-specific routines */
63struct set_mtrr_context { 64struct set_mtrr_context {
64 unsigned long flags; 65 unsigned long flags;
65 unsigned long deftype_lo;
66 unsigned long deftype_hi;
67 unsigned long cr4val; 66 unsigned long cr4val;
68 unsigned long ccr3; 67 u32 deftype_lo;
68 u32 deftype_hi;
69 u32 ccr3;
69}; 70};
70 71
71struct mtrr_var_range { 72struct mtrr_var_range {
72 unsigned long base_lo; 73 u32 base_lo;
73 unsigned long base_hi; 74 u32 base_hi;
74 unsigned long mask_lo; 75 u32 mask_lo;
75 unsigned long mask_hi; 76 u32 mask_hi;
76}; 77};
77 78
78void set_mtrr_done(struct set_mtrr_context *ctxt); 79void set_mtrr_done(struct set_mtrr_context *ctxt);
@@ -92,6 +93,6 @@ extern struct mtrr_ops * mtrr_if;
92extern unsigned int num_var_ranges; 93extern unsigned int num_var_ranges;
93 94
94void mtrr_state_warn(void); 95void mtrr_state_warn(void);
95char *mtrr_attrib_to_str(int x); 96const char *mtrr_attrib_to_str(int x);
96void mtrr_wrmsr(unsigned, unsigned, unsigned); 97void mtrr_wrmsr(unsigned, unsigned, unsigned);
97 98
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 76aac088a323..6624d8583c42 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -152,9 +152,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
152 seq_printf(m, " [%d]", i); 152 seq_printf(m, " [%d]", i);
153 } 153 }
154 154
155 seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n", 155 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
156 c->loops_per_jiffy/(500000/HZ), 156 c->loops_per_jiffy/(500000/HZ),
157 (c->loops_per_jiffy/(5000/HZ)) % 100); 157 (c->loops_per_jiffy/(5000/HZ)) % 100);
158 seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size);
158 159
159 return 0; 160 return 0;
160} 161}
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index 23b2cc748d4e..db6dd20c3589 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -34,7 +34,6 @@
34#include <linux/major.h> 34#include <linux/major.h>
35#include <linux/fs.h> 35#include <linux/fs.h>
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/fs.h>
38#include <linux/device.h> 37#include <linux/device.h>
39#include <linux/cpu.h> 38#include <linux/cpu.h>
40#include <linux/notifier.h> 39#include <linux/notifier.h>
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
new file mode 100644
index 000000000000..2f7d0a92fd7c
--- /dev/null
+++ b/arch/i386/kernel/e820.c
@@ -0,0 +1,894 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bootmem.h>
5#include <linux/ioport.h>
6#include <linux/string.h>
7#include <linux/kexec.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/efi.h>
11#include <linux/pfn.h>
12#include <linux/uaccess.h>
13
14#include <asm/pgtable.h>
15#include <asm/page.h>
16#include <asm/e820.h>
17
18#ifdef CONFIG_EFI
19int efi_enabled = 0;
20EXPORT_SYMBOL(efi_enabled);
21#endif
22
23struct e820map e820;
24struct change_member {
25 struct e820entry *pbios; /* pointer to original bios entry */
26 unsigned long long addr; /* address for this change point */
27};
28static struct change_member change_point_list[2*E820MAX] __initdata;
29static struct change_member *change_point[2*E820MAX] __initdata;
30static struct e820entry *overlap_list[E820MAX] __initdata;
31static struct e820entry new_bios[E820MAX] __initdata;
32/* For PCI or other memory-mapped resources */
33unsigned long pci_mem_start = 0x10000000;
34#ifdef CONFIG_PCI
35EXPORT_SYMBOL(pci_mem_start);
36#endif
37extern int user_defined_memmap;
38struct resource data_resource = {
39 .name = "Kernel data",
40 .start = 0,
41 .end = 0,
42 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
43};
44
45struct resource code_resource = {
46 .name = "Kernel code",
47 .start = 0,
48 .end = 0,
49 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
50};
51
52static struct resource system_rom_resource = {
53 .name = "System ROM",
54 .start = 0xf0000,
55 .end = 0xfffff,
56 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
57};
58
59static struct resource extension_rom_resource = {
60 .name = "Extension ROM",
61 .start = 0xe0000,
62 .end = 0xeffff,
63 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
64};
65
66static struct resource adapter_rom_resources[] = { {
67 .name = "Adapter ROM",
68 .start = 0xc8000,
69 .end = 0,
70 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
71}, {
72 .name = "Adapter ROM",
73 .start = 0,
74 .end = 0,
75 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
76}, {
77 .name = "Adapter ROM",
78 .start = 0,
79 .end = 0,
80 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
81}, {
82 .name = "Adapter ROM",
83 .start = 0,
84 .end = 0,
85 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
86}, {
87 .name = "Adapter ROM",
88 .start = 0,
89 .end = 0,
90 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
91}, {
92 .name = "Adapter ROM",
93 .start = 0,
94 .end = 0,
95 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
96} };
97
98static struct resource video_rom_resource = {
99 .name = "Video ROM",
100 .start = 0xc0000,
101 .end = 0xc7fff,
102 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
103};
104
105static struct resource video_ram_resource = {
106 .name = "Video RAM area",
107 .start = 0xa0000,
108 .end = 0xbffff,
109 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
110};
111
112static struct resource standard_io_resources[] = { {
113 .name = "dma1",
114 .start = 0x0000,
115 .end = 0x001f,
116 .flags = IORESOURCE_BUSY | IORESOURCE_IO
117}, {
118 .name = "pic1",
119 .start = 0x0020,
120 .end = 0x0021,
121 .flags = IORESOURCE_BUSY | IORESOURCE_IO
122}, {
123 .name = "timer0",
124 .start = 0x0040,
125 .end = 0x0043,
126 .flags = IORESOURCE_BUSY | IORESOURCE_IO
127}, {
128 .name = "timer1",
129 .start = 0x0050,
130 .end = 0x0053,
131 .flags = IORESOURCE_BUSY | IORESOURCE_IO
132}, {
133 .name = "keyboard",
134 .start = 0x0060,
135 .end = 0x006f,
136 .flags = IORESOURCE_BUSY | IORESOURCE_IO
137}, {
138 .name = "dma page reg",
139 .start = 0x0080,
140 .end = 0x008f,
141 .flags = IORESOURCE_BUSY | IORESOURCE_IO
142}, {
143 .name = "pic2",
144 .start = 0x00a0,
145 .end = 0x00a1,
146 .flags = IORESOURCE_BUSY | IORESOURCE_IO
147}, {
148 .name = "dma2",
149 .start = 0x00c0,
150 .end = 0x00df,
151 .flags = IORESOURCE_BUSY | IORESOURCE_IO
152}, {
153 .name = "fpu",
154 .start = 0x00f0,
155 .end = 0x00ff,
156 .flags = IORESOURCE_BUSY | IORESOURCE_IO
157} };
158
159static int romsignature(const unsigned char *x)
160{
161 unsigned short sig;
162 int ret = 0;
163 if (probe_kernel_address((const unsigned short *)x, sig) == 0)
164 ret = (sig == 0xaa55);
165 return ret;
166}
167
168static int __init romchecksum(unsigned char *rom, unsigned long length)
169{
170 unsigned char *p, sum = 0;
171
172 for (p = rom; p < rom + length; p++)
173 sum += *p;
174 return sum == 0;
175}
176
177static void __init probe_roms(void)
178{
179 unsigned long start, length, upper;
180 unsigned char *rom;
181 int i;
182
183 /* video rom */
184 upper = adapter_rom_resources[0].start;
185 for (start = video_rom_resource.start; start < upper; start += 2048) {
186 rom = isa_bus_to_virt(start);
187 if (!romsignature(rom))
188 continue;
189
190 video_rom_resource.start = start;
191
192 /* 0 < length <= 0x7f * 512, historically */
193 length = rom[2] * 512;
194
195 /* if checksum okay, trust length byte */
196 if (length && romchecksum(rom, length))
197 video_rom_resource.end = start + length - 1;
198
199 request_resource(&iomem_resource, &video_rom_resource);
200 break;
201 }
202
203 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
204 if (start < upper)
205 start = upper;
206
207 /* system rom */
208 request_resource(&iomem_resource, &system_rom_resource);
209 upper = system_rom_resource.start;
210
211 /* check for extension rom (ignore length byte!) */
212 rom = isa_bus_to_virt(extension_rom_resource.start);
213 if (romsignature(rom)) {
214 length = extension_rom_resource.end - extension_rom_resource.start + 1;
215 if (romchecksum(rom, length)) {
216 request_resource(&iomem_resource, &extension_rom_resource);
217 upper = extension_rom_resource.start;
218 }
219 }
220
221 /* check for adapter roms on 2k boundaries */
222 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
223 rom = isa_bus_to_virt(start);
224 if (!romsignature(rom))
225 continue;
226
227 /* 0 < length <= 0x7f * 512, historically */
228 length = rom[2] * 512;
229
230 /* but accept any length that fits if checksum okay */
231 if (!length || start + length > upper || !romchecksum(rom, length))
232 continue;
233
234 adapter_rom_resources[i].start = start;
235 adapter_rom_resources[i].end = start + length - 1;
236 request_resource(&iomem_resource, &adapter_rom_resources[i]);
237
238 start = adapter_rom_resources[i++].end & ~2047UL;
239 }
240}
241
242/*
243 * Request address space for all standard RAM and ROM resources
244 * and also for regions reported as reserved by the e820.
245 */
246static void __init
247legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
248{
249 int i;
250
251 probe_roms();
252 for (i = 0; i < e820.nr_map; i++) {
253 struct resource *res;
254#ifndef CONFIG_RESOURCES_64BIT
255 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
256 continue;
257#endif
258 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
259 switch (e820.map[i].type) {
260 case E820_RAM: res->name = "System RAM"; break;
261 case E820_ACPI: res->name = "ACPI Tables"; break;
262 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
263 default: res->name = "reserved";
264 }
265 res->start = e820.map[i].addr;
266 res->end = res->start + e820.map[i].size - 1;
267 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
268 if (request_resource(&iomem_resource, res)) {
269 kfree(res);
270 continue;
271 }
272 if (e820.map[i].type == E820_RAM) {
273 /*
274 * We don't know which RAM region contains kernel data,
275 * so we try it repeatedly and let the resource manager
276 * test it.
277 */
278 request_resource(res, code_resource);
279 request_resource(res, data_resource);
280#ifdef CONFIG_KEXEC
281 request_resource(res, &crashk_res);
282#endif
283 }
284 }
285}
286
287/*
288 * Request address space for all standard resources
289 *
290 * This is called just before pcibios_init(), which is also a
291 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
292 */
293static int __init request_standard_resources(void)
294{
295 int i;
296
297 printk("Setting up standard PCI resources\n");
298 if (efi_enabled)
299 efi_initialize_iomem_resources(&code_resource, &data_resource);
300 else
301 legacy_init_iomem_resources(&code_resource, &data_resource);
302
303 /* EFI systems may still have VGA */
304 request_resource(&iomem_resource, &video_ram_resource);
305
306 /* request I/O space for devices used on all i[345]86 PCs */
307 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
308 request_resource(&ioport_resource, &standard_io_resources[i]);
309 return 0;
310}
311
312subsys_initcall(request_standard_resources);
313
314void __init add_memory_region(unsigned long long start,
315 unsigned long long size, int type)
316{
317 int x;
318
319 if (!efi_enabled) {
320 x = e820.nr_map;
321
322 if (x == E820MAX) {
323 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
324 return;
325 }
326
327 e820.map[x].addr = start;
328 e820.map[x].size = size;
329 e820.map[x].type = type;
330 e820.nr_map++;
331 }
332} /* add_memory_region */
333
334/*
335 * Sanitize the BIOS e820 map.
336 *
337 * Some e820 responses include overlapping entries. The following
338 * replaces the original e820 map with a new one, removing overlaps.
339 *
340 */
341int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
342{
343 struct change_member *change_tmp;
344 unsigned long current_type, last_type;
345 unsigned long long last_addr;
346 int chgidx, still_changing;
347 int overlap_entries;
348 int new_bios_entry;
349 int old_nr, new_nr, chg_nr;
350 int i;
351
352 /*
353 Visually we're performing the following (1,2,3,4 = memory types)...
354
355 Sample memory map (w/overlaps):
356 ____22__________________
357 ______________________4_
358 ____1111________________
359 _44_____________________
360 11111111________________
361 ____________________33__
362 ___________44___________
363 __________33333_________
364 ______________22________
365 ___________________2222_
366 _________111111111______
367 _____________________11_
368 _________________4______
369
370 Sanitized equivalent (no overlap):
371 1_______________________
372 _44_____________________
373 ___1____________________
374 ____22__________________
375 ______11________________
376 _________1______________
377 __________3_____________
378 ___________44___________
379 _____________33_________
380 _______________2________
381 ________________1_______
382 _________________4______
383 ___________________2____
384 ____________________33__
385 ______________________4_
386 */
387 printk("sanitize start\n");
388 /* if there's only one memory region, don't bother */
389 if (*pnr_map < 2) {
390 printk("sanitize bail 0\n");
391 return -1;
392 }
393
394 old_nr = *pnr_map;
395
396 /* bail out if we find any unreasonable addresses in bios map */
397 for (i=0; i<old_nr; i++)
398 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
399 printk("sanitize bail 1\n");
400 return -1;
401 }
402
403 /* create pointers for initial change-point information (for sorting) */
404 for (i=0; i < 2*old_nr; i++)
405 change_point[i] = &change_point_list[i];
406
407 /* record all known change-points (starting and ending addresses),
408 omitting those that are for empty memory regions */
409 chgidx = 0;
410 for (i=0; i < old_nr; i++) {
411 if (biosmap[i].size != 0) {
412 change_point[chgidx]->addr = biosmap[i].addr;
413 change_point[chgidx++]->pbios = &biosmap[i];
414 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
415 change_point[chgidx++]->pbios = &biosmap[i];
416 }
417 }
418 chg_nr = chgidx; /* true number of change-points */
419
420 /* sort change-point list by memory addresses (low -> high) */
421 still_changing = 1;
422 while (still_changing) {
423 still_changing = 0;
424 for (i=1; i < chg_nr; i++) {
425 /* if <current_addr> > <last_addr>, swap */
426 /* or, if current=<start_addr> & last=<end_addr>, swap */
427 if ((change_point[i]->addr < change_point[i-1]->addr) ||
428 ((change_point[i]->addr == change_point[i-1]->addr) &&
429 (change_point[i]->addr == change_point[i]->pbios->addr) &&
430 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
431 )
432 {
433 change_tmp = change_point[i];
434 change_point[i] = change_point[i-1];
435 change_point[i-1] = change_tmp;
436 still_changing=1;
437 }
438 }
439 }
440
441 /* create a new bios memory map, removing overlaps */
442 overlap_entries=0; /* number of entries in the overlap table */
443 new_bios_entry=0; /* index for creating new bios map entries */
444 last_type = 0; /* start with undefined memory type */
445 last_addr = 0; /* start with 0 as last starting address */
446 /* loop through change-points, determining affect on the new bios map */
447 for (chgidx=0; chgidx < chg_nr; chgidx++)
448 {
449 /* keep track of all overlapping bios entries */
450 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
451 {
452 /* add map entry to overlap list (> 1 entry implies an overlap) */
453 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
454 }
455 else
456 {
457 /* remove entry from list (order independent, so swap with last) */
458 for (i=0; i<overlap_entries; i++)
459 {
460 if (overlap_list[i] == change_point[chgidx]->pbios)
461 overlap_list[i] = overlap_list[overlap_entries-1];
462 }
463 overlap_entries--;
464 }
465 /* if there are overlapping entries, decide which "type" to use */
466 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
467 current_type = 0;
468 for (i=0; i<overlap_entries; i++)
469 if (overlap_list[i]->type > current_type)
470 current_type = overlap_list[i]->type;
471 /* continue building up new bios map based on this information */
472 if (current_type != last_type) {
473 if (last_type != 0) {
474 new_bios[new_bios_entry].size =
475 change_point[chgidx]->addr - last_addr;
476 /* move forward only if the new size was non-zero */
477 if (new_bios[new_bios_entry].size != 0)
478 if (++new_bios_entry >= E820MAX)
479 break; /* no more space left for new bios entries */
480 }
481 if (current_type != 0) {
482 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
483 new_bios[new_bios_entry].type = current_type;
484 last_addr=change_point[chgidx]->addr;
485 }
486 last_type = current_type;
487 }
488 }
489 new_nr = new_bios_entry; /* retain count for new bios entries */
490
491 /* copy new bios mapping into original location */
492 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
493 *pnr_map = new_nr;
494
495 printk("sanitize end\n");
496 return 0;
497}
498
499/*
500 * Copy the BIOS e820 map into a safe place.
501 *
502 * Sanity-check it while we're at it..
503 *
504 * If we're lucky and live on a modern system, the setup code
505 * will have given us a memory map that we can use to properly
506 * set up memory. If we aren't, we'll fake a memory map.
507 *
508 * We check to see that the memory map contains at least 2 elements
509 * before we'll use it, because the detection code in setup.S may
510 * not be perfect and most every PC known to man has two memory
511 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
512 * thinkpad 560x, for example, does not cooperate with the memory
513 * detection code.)
514 */
515int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
516{
517 /* Only one memory region (or negative)? Ignore it */
518 if (nr_map < 2)
519 return -1;
520
521 do {
522 unsigned long long start = biosmap->addr;
523 unsigned long long size = biosmap->size;
524 unsigned long long end = start + size;
525 unsigned long type = biosmap->type;
526 printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
527
528 /* Overflow in 64 bits? Ignore the memory map. */
529 if (start > end)
530 return -1;
531
532 /*
533 * Some BIOSes claim RAM in the 640k - 1M region.
534 * Not right. Fix it up.
535 */
536 if (type == E820_RAM) {
537 printk("copy_e820_map() type is E820_RAM\n");
538 if (start < 0x100000ULL && end > 0xA0000ULL) {
539 printk("copy_e820_map() lies in range...\n");
540 if (start < 0xA0000ULL) {
541 printk("copy_e820_map() start < 0xA0000ULL\n");
542 add_memory_region(start, 0xA0000ULL-start, type);
543 }
544 if (end <= 0x100000ULL) {
545 printk("copy_e820_map() end <= 0x100000ULL\n");
546 continue;
547 }
548 start = 0x100000ULL;
549 size = end - start;
550 }
551 }
552 add_memory_region(start, size, type);
553 } while (biosmap++,--nr_map);
554 return 0;
555}
556
557/*
558 * Callback for efi_memory_walk.
559 */
560static int __init
561efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
562{
563 unsigned long *max_pfn = arg, pfn;
564
565 if (start < end) {
566 pfn = PFN_UP(end -1);
567 if (pfn > *max_pfn)
568 *max_pfn = pfn;
569 }
570 return 0;
571}
572
573static int __init
574efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
575{
576 memory_present(0, PFN_UP(start), PFN_DOWN(end));
577 return 0;
578}
579
580/*
581 * Find the highest page frame number we have available
582 */
583void __init find_max_pfn(void)
584{
585 int i;
586
587 max_pfn = 0;
588 if (efi_enabled) {
589 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
590 efi_memmap_walk(efi_memory_present_wrapper, NULL);
591 return;
592 }
593
594 for (i = 0; i < e820.nr_map; i++) {
595 unsigned long start, end;
596 /* RAM? */
597 if (e820.map[i].type != E820_RAM)
598 continue;
599 start = PFN_UP(e820.map[i].addr);
600 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
601 if (start >= end)
602 continue;
603 if (end > max_pfn)
604 max_pfn = end;
605 memory_present(0, start, end);
606 }
607}
608
609/*
610 * Free all available memory for boot time allocation. Used
611 * as a callback function by efi_memory_walk()
612 */
613
614static int __init
615free_available_memory(unsigned long start, unsigned long end, void *arg)
616{
617 /* check max_low_pfn */
618 if (start >= (max_low_pfn << PAGE_SHIFT))
619 return 0;
620 if (end >= (max_low_pfn << PAGE_SHIFT))
621 end = max_low_pfn << PAGE_SHIFT;
622 if (start < end)
623 free_bootmem(start, end - start);
624
625 return 0;
626}
627/*
628 * Register fully available low RAM pages with the bootmem allocator.
629 */
630void __init register_bootmem_low_pages(unsigned long max_low_pfn)
631{
632 int i;
633
634 if (efi_enabled) {
635 efi_memmap_walk(free_available_memory, NULL);
636 return;
637 }
638 for (i = 0; i < e820.nr_map; i++) {
639 unsigned long curr_pfn, last_pfn, size;
640 /*
641 * Reserve usable low memory
642 */
643 if (e820.map[i].type != E820_RAM)
644 continue;
645 /*
646 * We are rounding up the start address of usable memory:
647 */
648 curr_pfn = PFN_UP(e820.map[i].addr);
649 if (curr_pfn >= max_low_pfn)
650 continue;
651 /*
652 * ... and at the end of the usable range downwards:
653 */
654 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
655
656 if (last_pfn > max_low_pfn)
657 last_pfn = max_low_pfn;
658
659 /*
660 * .. finally, did all the rounding and playing
661 * around just make the area go away?
662 */
663 if (last_pfn <= curr_pfn)
664 continue;
665
666 size = last_pfn - curr_pfn;
667 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
668 }
669}
670
671void __init register_memory(void)
672{
673 unsigned long gapstart, gapsize, round;
674 unsigned long long last;
675 int i;
676
677 /*
678 * Search for the bigest gap in the low 32 bits of the e820
679 * memory space.
680 */
681 last = 0x100000000ull;
682 gapstart = 0x10000000;
683 gapsize = 0x400000;
684 i = e820.nr_map;
685 while (--i >= 0) {
686 unsigned long long start = e820.map[i].addr;
687 unsigned long long end = start + e820.map[i].size;
688
689 /*
690 * Since "last" is at most 4GB, we know we'll
691 * fit in 32 bits if this condition is true
692 */
693 if (last > end) {
694 unsigned long gap = last - end;
695
696 if (gap > gapsize) {
697 gapsize = gap;
698 gapstart = end;
699 }
700 }
701 if (start < last)
702 last = start;
703 }
704
705 /*
706 * See how much we want to round up: start off with
707 * rounding to the next 1MB area.
708 */
709 round = 0x100000;
710 while ((gapsize >> 4) > round)
711 round += round;
712 /* Fun with two's complement */
713 pci_mem_start = (gapstart + round) & -round;
714
715 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
716 pci_mem_start, gapstart, gapsize);
717}
718
719void __init print_memory_map(char *who)
720{
721 int i;
722
723 for (i = 0; i < e820.nr_map; i++) {
724 printk(" %s: %016Lx - %016Lx ", who,
725 e820.map[i].addr,
726 e820.map[i].addr + e820.map[i].size);
727 switch (e820.map[i].type) {
728 case E820_RAM: printk("(usable)\n");
729 break;
730 case E820_RESERVED:
731 printk("(reserved)\n");
732 break;
733 case E820_ACPI:
734 printk("(ACPI data)\n");
735 break;
736 case E820_NVS:
737 printk("(ACPI NVS)\n");
738 break;
739 default: printk("type %lu\n", e820.map[i].type);
740 break;
741 }
742 }
743}
744
745static __init __always_inline void efi_limit_regions(unsigned long long size)
746{
747 unsigned long long current_addr = 0;
748 efi_memory_desc_t *md, *next_md;
749 void *p, *p1;
750 int i, j;
751
752 j = 0;
753 p1 = memmap.map;
754 for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
755 md = p;
756 next_md = p1;
757 current_addr = md->phys_addr +
758 PFN_PHYS(md->num_pages);
759 if (is_available_memory(md)) {
760 if (md->phys_addr >= size) continue;
761 memcpy(next_md, md, memmap.desc_size);
762 if (current_addr >= size) {
763 next_md->num_pages -=
764 PFN_UP(current_addr-size);
765 }
766 p1 += memmap.desc_size;
767 next_md = p1;
768 j++;
769 } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
770 EFI_MEMORY_RUNTIME) {
771 /* In order to make runtime services
772 * available we have to include runtime
773 * memory regions in memory map */
774 memcpy(next_md, md, memmap.desc_size);
775 p1 += memmap.desc_size;
776 next_md = p1;
777 j++;
778 }
779 }
780 memmap.nr_map = j;
781 memmap.map_end = memmap.map +
782 (memmap.nr_map * memmap.desc_size);
783}
784
785void __init limit_regions(unsigned long long size)
786{
787 unsigned long long current_addr;
788 int i;
789
790 print_memory_map("limit_regions start");
791 if (efi_enabled) {
792 efi_limit_regions(size);
793 return;
794 }
795 for (i = 0; i < e820.nr_map; i++) {
796 current_addr = e820.map[i].addr + e820.map[i].size;
797 if (current_addr < size)
798 continue;
799
800 if (e820.map[i].type != E820_RAM)
801 continue;
802
803 if (e820.map[i].addr >= size) {
804 /*
805 * This region starts past the end of the
806 * requested size, skip it completely.
807 */
808 e820.nr_map = i;
809 } else {
810 e820.nr_map = i + 1;
811 e820.map[i].size -= current_addr - size;
812 }
813 print_memory_map("limit_regions endfor");
814 return;
815 }
816 print_memory_map("limit_regions endfunc");
817}
818
819 /*
820 * This function checks if the entire range <start,end> is mapped with type.
821 *
822 * Note: this function only works correct if the e820 table is sorted and
823 * not-overlapping, which is the case
824 */
825int __init
826e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
827{
828 u64 start = s;
829 u64 end = e;
830 int i;
831 for (i = 0; i < e820.nr_map; i++) {
832 struct e820entry *ei = &e820.map[i];
833 if (type && ei->type != type)
834 continue;
835 /* is the region (part) in overlap with the current region ?*/
836 if (ei->addr >= end || ei->addr + ei->size <= start)
837 continue;
838 /* if the region is at the beginning of <start,end> we move
839 * start to the end of the region since it's ok until there
840 */
841 if (ei->addr <= start)
842 start = ei->addr + ei->size;
843 /* if start is now at or beyond end, we're done, full
844 * coverage */
845 if (start >= end)
846 return 1; /* we're done */
847 }
848 return 0;
849}
850
851static int __init parse_memmap(char *arg)
852{
853 if (!arg)
854 return -EINVAL;
855
856 if (strcmp(arg, "exactmap") == 0) {
857#ifdef CONFIG_CRASH_DUMP
858 /* If we are doing a crash dump, we
859 * still need to know the real mem
860 * size before original memory map is
861 * reset.
862 */
863 find_max_pfn();
864 saved_max_pfn = max_pfn;
865#endif
866 e820.nr_map = 0;
867 user_defined_memmap = 1;
868 } else {
869 /* If the user specifies memory size, we
870 * limit the BIOS-provided memory map to
871 * that size. exactmap can be used to specify
872 * the exact map. mem=number can be used to
873 * trim the existing memory map.
874 */
875 unsigned long long start_at, mem_size;
876
877 mem_size = memparse(arg, &arg);
878 if (*arg == '@') {
879 start_at = memparse(arg+1, &arg);
880 add_memory_region(start_at, mem_size, E820_RAM);
881 } else if (*arg == '#') {
882 start_at = memparse(arg+1, &arg);
883 add_memory_region(start_at, mem_size, E820_ACPI);
884 } else if (*arg == '$') {
885 start_at = memparse(arg+1, &arg);
886 add_memory_region(start_at, mem_size, E820_RESERVED);
887 } else {
888 limit_regions(mem_size);
889 user_defined_memmap = 1;
890 }
891 }
892 return 0;
893}
894early_param("memmap", parse_memmap);
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 8b40648d0ef0..b92c7f0a358a 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -194,17 +194,24 @@ inline int efi_set_rtc_mmss(unsigned long nowtime)
194 return 0; 194 return 0;
195} 195}
196/* 196/*
197 * This should only be used during kernel init and before runtime 197 * This is used during kernel init before runtime
198 * services have been remapped, therefore, we'll need to call in physical 198 * services have been remapped and also during suspend, therefore,
199 * mode. Note, this call isn't used later, so mark it __init. 199 * we'll need to call both in physical and virtual modes.
200 */ 200 */
201inline unsigned long __init efi_get_time(void) 201inline unsigned long efi_get_time(void)
202{ 202{
203 efi_status_t status; 203 efi_status_t status;
204 efi_time_t eft; 204 efi_time_t eft;
205 efi_time_cap_t cap; 205 efi_time_cap_t cap;
206 206
207 status = phys_efi_get_time(&eft, &cap); 207 if (efi.get_time) {
208 /* if we are in virtual mode use remapped function */
209 status = efi.get_time(&eft, &cap);
210 } else {
211 /* we are in physical mode */
212 status = phys_efi_get_time(&eft, &cap);
213 }
214
208 if (status != EFI_SUCCESS) 215 if (status != EFI_SUCCESS)
209 printk("Oops: efitime: can't read time status: 0x%lx\n",status); 216 printk("Oops: efitime: can't read time status: 0x%lx\n",status);
210 217
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5a63d6fdb70e..de34b7fed3c1 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -30,12 +30,13 @@
30 * 18(%esp) - %eax 30 * 18(%esp) - %eax
31 * 1C(%esp) - %ds 31 * 1C(%esp) - %ds
32 * 20(%esp) - %es 32 * 20(%esp) - %es
33 * 24(%esp) - orig_eax 33 * 24(%esp) - %gs
34 * 28(%esp) - %eip 34 * 28(%esp) - orig_eax
35 * 2C(%esp) - %cs 35 * 2C(%esp) - %eip
36 * 30(%esp) - %eflags 36 * 30(%esp) - %cs
37 * 34(%esp) - %oldesp 37 * 34(%esp) - %eflags
38 * 38(%esp) - %oldss 38 * 38(%esp) - %oldesp
39 * 3C(%esp) - %oldss
39 * 40 *
40 * "current" is in register %ebx during any slow entries. 41 * "current" is in register %ebx during any slow entries.
41 */ 42 */
@@ -48,26 +49,24 @@
48#include <asm/smp.h> 49#include <asm/smp.h>
49#include <asm/page.h> 50#include <asm/page.h>
50#include <asm/desc.h> 51#include <asm/desc.h>
52#include <asm/percpu.h>
51#include <asm/dwarf2.h> 53#include <asm/dwarf2.h>
52#include "irq_vectors.h" 54#include "irq_vectors.h"
53 55
54#define nr_syscalls ((syscall_table_size)/4) 56/*
57 * We use macros for low-level operations which need to be overridden
58 * for paravirtualization. The following will never clobber any registers:
59 * INTERRUPT_RETURN (aka. "iret")
60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
61 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
62 *
63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
65 * Allowing a register to be clobbered can shrink the paravirt replacement
66 * enough to patch inline, increasing performance.
67 */
55 68
56EBX = 0x00 69#define nr_syscalls ((syscall_table_size)/4)
57ECX = 0x04
58EDX = 0x08
59ESI = 0x0C
60EDI = 0x10
61EBP = 0x14
62EAX = 0x18
63DS = 0x1C
64ES = 0x20
65ORIG_EAX = 0x24
66EIP = 0x28
67CS = 0x2C
68EFLAGS = 0x30
69OLDESP = 0x34
70OLDSS = 0x38
71 70
72CF_MASK = 0x00000001 71CF_MASK = 0x00000001
73TF_MASK = 0x00000100 72TF_MASK = 0x00000100
@@ -76,23 +75,16 @@ DF_MASK = 0x00000400
76NT_MASK = 0x00004000 75NT_MASK = 0x00004000
77VM_MASK = 0x00020000 76VM_MASK = 0x00020000
78 77
79/* These are replaces for paravirtualization */
80#define DISABLE_INTERRUPTS cli
81#define ENABLE_INTERRUPTS sti
82#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
83#define INTERRUPT_RETURN iret
84#define GET_CR0_INTO_EAX movl %cr0, %eax
85
86#ifdef CONFIG_PREEMPT 78#ifdef CONFIG_PREEMPT
87#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF 79#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
88#else 80#else
89#define preempt_stop 81#define preempt_stop(clobbers)
90#define resume_kernel restore_nocheck 82#define resume_kernel restore_nocheck
91#endif 83#endif
92 84
93.macro TRACE_IRQS_IRET 85.macro TRACE_IRQS_IRET
94#ifdef CONFIG_TRACE_IRQFLAGS 86#ifdef CONFIG_TRACE_IRQFLAGS
95 testl $IF_MASK,EFLAGS(%esp) # interrupts off? 87 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
96 jz 1f 88 jz 1f
97 TRACE_IRQS_ON 89 TRACE_IRQS_ON
981: 901:
@@ -107,6 +99,9 @@ VM_MASK = 0x00020000
107 99
108#define SAVE_ALL \ 100#define SAVE_ALL \
109 cld; \ 101 cld; \
102 pushl %gs; \
103 CFI_ADJUST_CFA_OFFSET 4;\
104 /*CFI_REL_OFFSET gs, 0;*/\
110 pushl %es; \ 105 pushl %es; \
111 CFI_ADJUST_CFA_OFFSET 4;\ 106 CFI_ADJUST_CFA_OFFSET 4;\
112 /*CFI_REL_OFFSET es, 0;*/\ 107 /*CFI_REL_OFFSET es, 0;*/\
@@ -136,7 +131,9 @@ VM_MASK = 0x00020000
136 CFI_REL_OFFSET ebx, 0;\ 131 CFI_REL_OFFSET ebx, 0;\
137 movl $(__USER_DS), %edx; \ 132 movl $(__USER_DS), %edx; \
138 movl %edx, %ds; \ 133 movl %edx, %ds; \
139 movl %edx, %es; 134 movl %edx, %es; \
135 movl $(__KERNEL_PDA), %edx; \
136 movl %edx, %gs
140 137
141#define RESTORE_INT_REGS \ 138#define RESTORE_INT_REGS \
142 popl %ebx; \ 139 popl %ebx; \
@@ -169,17 +166,22 @@ VM_MASK = 0x00020000
1692: popl %es; \ 1662: popl %es; \
170 CFI_ADJUST_CFA_OFFSET -4;\ 167 CFI_ADJUST_CFA_OFFSET -4;\
171 /*CFI_RESTORE es;*/\ 168 /*CFI_RESTORE es;*/\
172.section .fixup,"ax"; \ 1693: popl %gs; \
1733: movl $0,(%esp); \ 170 CFI_ADJUST_CFA_OFFSET -4;\
174 jmp 1b; \ 171 /*CFI_RESTORE gs;*/\
172.pushsection .fixup,"ax"; \
1754: movl $0,(%esp); \ 1734: movl $0,(%esp); \
174 jmp 1b; \
1755: movl $0,(%esp); \
176 jmp 2b; \ 176 jmp 2b; \
177.previous; \ 1776: movl $0,(%esp); \
178 jmp 3b; \
178.section __ex_table,"a";\ 179.section __ex_table,"a";\
179 .align 4; \ 180 .align 4; \
180 .long 1b,3b; \ 181 .long 1b,4b; \
181 .long 2b,4b; \ 182 .long 2b,5b; \
182.previous 183 .long 3b,6b; \
184.popsection
183 185
184#define RING0_INT_FRAME \ 186#define RING0_INT_FRAME \
185 CFI_STARTPROC simple;\ 187 CFI_STARTPROC simple;\
@@ -198,18 +200,18 @@ VM_MASK = 0x00020000
198#define RING0_PTREGS_FRAME \ 200#define RING0_PTREGS_FRAME \
199 CFI_STARTPROC simple;\ 201 CFI_STARTPROC simple;\
200 CFI_SIGNAL_FRAME;\ 202 CFI_SIGNAL_FRAME;\
201 CFI_DEF_CFA esp, OLDESP-EBX;\ 203 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
202 /*CFI_OFFSET cs, CS-OLDESP;*/\ 204 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
203 CFI_OFFSET eip, EIP-OLDESP;\ 205 CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
204 /*CFI_OFFSET es, ES-OLDESP;*/\ 206 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
205 /*CFI_OFFSET ds, DS-OLDESP;*/\ 207 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
206 CFI_OFFSET eax, EAX-OLDESP;\ 208 CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
207 CFI_OFFSET ebp, EBP-OLDESP;\ 209 CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
208 CFI_OFFSET edi, EDI-OLDESP;\ 210 CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
209 CFI_OFFSET esi, ESI-OLDESP;\ 211 CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
210 CFI_OFFSET edx, EDX-OLDESP;\ 212 CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
211 CFI_OFFSET ecx, ECX-OLDESP;\ 213 CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
212 CFI_OFFSET ebx, EBX-OLDESP 214 CFI_OFFSET ebx, PT_EBX-PT_OLDESP
213 215
214ENTRY(ret_from_fork) 216ENTRY(ret_from_fork)
215 CFI_STARTPROC 217 CFI_STARTPROC
@@ -237,17 +239,18 @@ ENTRY(ret_from_fork)
237 ALIGN 239 ALIGN
238 RING0_PTREGS_FRAME 240 RING0_PTREGS_FRAME
239ret_from_exception: 241ret_from_exception:
240 preempt_stop 242 preempt_stop(CLBR_ANY)
241ret_from_intr: 243ret_from_intr:
242 GET_THREAD_INFO(%ebp) 244 GET_THREAD_INFO(%ebp)
243check_userspace: 245check_userspace:
244 movl EFLAGS(%esp), %eax # mix EFLAGS and CS 246 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
245 movb CS(%esp), %al 247 movb PT_CS(%esp), %al
246 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax 248 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
247 cmpl $USER_RPL, %eax 249 cmpl $USER_RPL, %eax
248 jb resume_kernel # not returning to v8086 or userspace 250 jb resume_kernel # not returning to v8086 or userspace
251
249ENTRY(resume_userspace) 252ENTRY(resume_userspace)
250 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 253 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
251 # setting need_resched or sigpending 254 # setting need_resched or sigpending
252 # between sampling and the iret 255 # between sampling and the iret
253 movl TI_flags(%ebp), %ecx 256 movl TI_flags(%ebp), %ecx
@@ -258,14 +261,14 @@ ENTRY(resume_userspace)
258 261
259#ifdef CONFIG_PREEMPT 262#ifdef CONFIG_PREEMPT
260ENTRY(resume_kernel) 263ENTRY(resume_kernel)
261 DISABLE_INTERRUPTS 264 DISABLE_INTERRUPTS(CLBR_ANY)
262 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 265 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
263 jnz restore_nocheck 266 jnz restore_nocheck
264need_resched: 267need_resched:
265 movl TI_flags(%ebp), %ecx # need_resched set ? 268 movl TI_flags(%ebp), %ecx # need_resched set ?
266 testb $_TIF_NEED_RESCHED, %cl 269 testb $_TIF_NEED_RESCHED, %cl
267 jz restore_all 270 jz restore_all
268 testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? 271 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
269 jz restore_all 272 jz restore_all
270 call preempt_schedule_irq 273 call preempt_schedule_irq
271 jmp need_resched 274 jmp need_resched
@@ -287,7 +290,7 @@ sysenter_past_esp:
287 * No need to follow this irqs on/off section: the syscall 290 * No need to follow this irqs on/off section: the syscall
288 * disabled irqs and here we enable it straight after entry: 291 * disabled irqs and here we enable it straight after entry:
289 */ 292 */
290 ENABLE_INTERRUPTS 293 ENABLE_INTERRUPTS(CLBR_NONE)
291 pushl $(__USER_DS) 294 pushl $(__USER_DS)
292 CFI_ADJUST_CFA_OFFSET 4 295 CFI_ADJUST_CFA_OFFSET 4
293 /*CFI_REL_OFFSET ss, 0*/ 296 /*CFI_REL_OFFSET ss, 0*/
@@ -331,20 +334,27 @@ sysenter_past_esp:
331 cmpl $(nr_syscalls), %eax 334 cmpl $(nr_syscalls), %eax
332 jae syscall_badsys 335 jae syscall_badsys
333 call *sys_call_table(,%eax,4) 336 call *sys_call_table(,%eax,4)
334 movl %eax,EAX(%esp) 337 movl %eax,PT_EAX(%esp)
335 DISABLE_INTERRUPTS 338 DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
336 TRACE_IRQS_OFF 339 TRACE_IRQS_OFF
337 movl TI_flags(%ebp), %ecx 340 movl TI_flags(%ebp), %ecx
338 testw $_TIF_ALLWORK_MASK, %cx 341 testw $_TIF_ALLWORK_MASK, %cx
339 jne syscall_exit_work 342 jne syscall_exit_work
340/* if something modifies registers it must also disable sysexit */ 343/* if something modifies registers it must also disable sysexit */
341 movl EIP(%esp), %edx 344 movl PT_EIP(%esp), %edx
342 movl OLDESP(%esp), %ecx 345 movl PT_OLDESP(%esp), %ecx
343 xorl %ebp,%ebp 346 xorl %ebp,%ebp
344 TRACE_IRQS_ON 347 TRACE_IRQS_ON
3481: mov PT_GS(%esp), %gs
345 ENABLE_INTERRUPTS_SYSEXIT 349 ENABLE_INTERRUPTS_SYSEXIT
346 CFI_ENDPROC 350 CFI_ENDPROC
347 351.pushsection .fixup,"ax"
3522: movl $0,PT_GS(%esp)
353 jmp 1b
354.section __ex_table,"a"
355 .align 4
356 .long 1b,2b
357.popsection
348 358
349 # system call handler stub 359 # system call handler stub
350ENTRY(system_call) 360ENTRY(system_call)
@@ -353,7 +363,7 @@ ENTRY(system_call)
353 CFI_ADJUST_CFA_OFFSET 4 363 CFI_ADJUST_CFA_OFFSET 4
354 SAVE_ALL 364 SAVE_ALL
355 GET_THREAD_INFO(%ebp) 365 GET_THREAD_INFO(%ebp)
356 testl $TF_MASK,EFLAGS(%esp) 366 testl $TF_MASK,PT_EFLAGS(%esp)
357 jz no_singlestep 367 jz no_singlestep
358 orl $_TIF_SINGLESTEP,TI_flags(%ebp) 368 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
359no_singlestep: 369no_singlestep:
@@ -365,9 +375,9 @@ no_singlestep:
365 jae syscall_badsys 375 jae syscall_badsys
366syscall_call: 376syscall_call:
367 call *sys_call_table(,%eax,4) 377 call *sys_call_table(,%eax,4)
368 movl %eax,EAX(%esp) # store the return value 378 movl %eax,PT_EAX(%esp) # store the return value
369syscall_exit: 379syscall_exit:
370 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 380 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
371 # setting need_resched or sigpending 381 # setting need_resched or sigpending
372 # between sampling and the iret 382 # between sampling and the iret
373 TRACE_IRQS_OFF 383 TRACE_IRQS_OFF
@@ -376,12 +386,12 @@ syscall_exit:
376 jne syscall_exit_work 386 jne syscall_exit_work
377 387
378restore_all: 388restore_all:
379 movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS 389 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
380 # Warning: OLDSS(%esp) contains the wrong/random values if we 390 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
381 # are returning to the kernel. 391 # are returning to the kernel.
382 # See comments in process.c:copy_thread() for details. 392 # See comments in process.c:copy_thread() for details.
383 movb OLDSS(%esp), %ah 393 movb PT_OLDSS(%esp), %ah
384 movb CS(%esp), %al 394 movb PT_CS(%esp), %al
385 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax 395 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
386 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax 396 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
387 CFI_REMEMBER_STATE 397 CFI_REMEMBER_STATE
@@ -390,13 +400,13 @@ restore_nocheck:
390 TRACE_IRQS_IRET 400 TRACE_IRQS_IRET
391restore_nocheck_notrace: 401restore_nocheck_notrace:
392 RESTORE_REGS 402 RESTORE_REGS
393 addl $4, %esp 403 addl $4, %esp # skip orig_eax/error_code
394 CFI_ADJUST_CFA_OFFSET -4 404 CFI_ADJUST_CFA_OFFSET -4
3951: INTERRUPT_RETURN 4051: INTERRUPT_RETURN
396.section .fixup,"ax" 406.section .fixup,"ax"
397iret_exc: 407iret_exc:
398 TRACE_IRQS_ON 408 TRACE_IRQS_ON
399 ENABLE_INTERRUPTS 409 ENABLE_INTERRUPTS(CLBR_NONE)
400 pushl $0 # no error code 410 pushl $0 # no error code
401 pushl $do_iret_error 411 pushl $do_iret_error
402 jmp error_code 412 jmp error_code
@@ -408,33 +418,42 @@ iret_exc:
408 418
409 CFI_RESTORE_STATE 419 CFI_RESTORE_STATE
410ldt_ss: 420ldt_ss:
411 larl OLDSS(%esp), %eax 421 larl PT_OLDSS(%esp), %eax
412 jnz restore_nocheck 422 jnz restore_nocheck
413 testl $0x00400000, %eax # returning to 32bit stack? 423 testl $0x00400000, %eax # returning to 32bit stack?
414 jnz restore_nocheck # allright, normal return 424 jnz restore_nocheck # allright, normal return
425
426#ifdef CONFIG_PARAVIRT
427 /*
428 * The kernel can't run on a non-flat stack if paravirt mode
429 * is active. Rather than try to fixup the high bits of
430 * ESP, bypass this code entirely. This may break DOSemu
431 * and/or Wine support in a paravirt VM, although the option
432 * is still available to implement the setting of the high
433 * 16-bits in the INTERRUPT_RETURN paravirt-op.
434 */
435 cmpl $0, paravirt_ops+PARAVIRT_enabled
436 jne restore_nocheck
437#endif
438
415 /* If returning to userspace with 16bit stack, 439 /* If returning to userspace with 16bit stack,
416 * try to fix the higher word of ESP, as the CPU 440 * try to fix the higher word of ESP, as the CPU
417 * won't restore it. 441 * won't restore it.
418 * This is an "official" bug of all the x86-compatible 442 * This is an "official" bug of all the x86-compatible
419 * CPUs, which we can try to work around to make 443 * CPUs, which we can try to work around to make
420 * dosemu and wine happy. */ 444 * dosemu and wine happy. */
421 subl $8, %esp # reserve space for switch16 pointer 445 movl PT_OLDESP(%esp), %eax
422 CFI_ADJUST_CFA_OFFSET 8 446 movl %esp, %edx
423 DISABLE_INTERRUPTS 447 call patch_espfix_desc
448 pushl $__ESPFIX_SS
449 CFI_ADJUST_CFA_OFFSET 4
450 pushl %eax
451 CFI_ADJUST_CFA_OFFSET 4
452 DISABLE_INTERRUPTS(CLBR_EAX)
424 TRACE_IRQS_OFF 453 TRACE_IRQS_OFF
425 movl %esp, %eax 454 lss (%esp), %esp
426 /* Set up the 16bit stack frame with switch32 pointer on top, 455 CFI_ADJUST_CFA_OFFSET -8
427 * and a switch16 pointer on top of the current frame. */ 456 jmp restore_nocheck
428 call setup_x86_bogus_stack
429 CFI_ADJUST_CFA_OFFSET -8 # frame has moved
430 TRACE_IRQS_IRET
431 RESTORE_REGS
432 lss 20+4(%esp), %esp # switch to 16bit stack
4331: INTERRUPT_RETURN
434.section __ex_table,"a"
435 .align 4
436 .long 1b,iret_exc
437.previous
438 CFI_ENDPROC 457 CFI_ENDPROC
439 458
440 # perform work that needs to be done immediately before resumption 459 # perform work that needs to be done immediately before resumption
@@ -445,7 +464,7 @@ work_pending:
445 jz work_notifysig 464 jz work_notifysig
446work_resched: 465work_resched:
447 call schedule 466 call schedule
448 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 467 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
449 # setting need_resched or sigpending 468 # setting need_resched or sigpending
450 # between sampling and the iret 469 # between sampling and the iret
451 TRACE_IRQS_OFF 470 TRACE_IRQS_OFF
@@ -458,7 +477,8 @@ work_resched:
458 477
459work_notifysig: # deal with pending signals and 478work_notifysig: # deal with pending signals and
460 # notify-resume requests 479 # notify-resume requests
461 testl $VM_MASK, EFLAGS(%esp) 480#ifdef CONFIG_VM86
481 testl $VM_MASK, PT_EFLAGS(%esp)
462 movl %esp, %eax 482 movl %esp, %eax
463 jne work_notifysig_v86 # returning to kernel-space or 483 jne work_notifysig_v86 # returning to kernel-space or
464 # vm86-space 484 # vm86-space
@@ -468,29 +488,30 @@ work_notifysig: # deal with pending signals and
468 488
469 ALIGN 489 ALIGN
470work_notifysig_v86: 490work_notifysig_v86:
471#ifdef CONFIG_VM86
472 pushl %ecx # save ti_flags for do_notify_resume 491 pushl %ecx # save ti_flags for do_notify_resume
473 CFI_ADJUST_CFA_OFFSET 4 492 CFI_ADJUST_CFA_OFFSET 4
474 call save_v86_state # %eax contains pt_regs pointer 493 call save_v86_state # %eax contains pt_regs pointer
475 popl %ecx 494 popl %ecx
476 CFI_ADJUST_CFA_OFFSET -4 495 CFI_ADJUST_CFA_OFFSET -4
477 movl %eax, %esp 496 movl %eax, %esp
497#else
498 movl %esp, %eax
499#endif
478 xorl %edx, %edx 500 xorl %edx, %edx
479 call do_notify_resume 501 call do_notify_resume
480 jmp resume_userspace_sig 502 jmp resume_userspace_sig
481#endif
482 503
483 # perform syscall exit tracing 504 # perform syscall exit tracing
484 ALIGN 505 ALIGN
485syscall_trace_entry: 506syscall_trace_entry:
486 movl $-ENOSYS,EAX(%esp) 507 movl $-ENOSYS,PT_EAX(%esp)
487 movl %esp, %eax 508 movl %esp, %eax
488 xorl %edx,%edx 509 xorl %edx,%edx
489 call do_syscall_trace 510 call do_syscall_trace
490 cmpl $0, %eax 511 cmpl $0, %eax
491 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, 512 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
492 # so must skip actual syscall 513 # so must skip actual syscall
493 movl ORIG_EAX(%esp), %eax 514 movl PT_ORIG_EAX(%esp), %eax
494 cmpl $(nr_syscalls), %eax 515 cmpl $(nr_syscalls), %eax
495 jnae syscall_call 516 jnae syscall_call
496 jmp syscall_exit 517 jmp syscall_exit
@@ -501,7 +522,7 @@ syscall_exit_work:
501 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl 522 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
502 jz work_pending 523 jz work_pending
503 TRACE_IRQS_ON 524 TRACE_IRQS_ON
504 ENABLE_INTERRUPTS # could let do_syscall_trace() call 525 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
505 # schedule() instead 526 # schedule() instead
506 movl %esp, %eax 527 movl %esp, %eax
507 movl $1, %edx 528 movl $1, %edx
@@ -515,39 +536,38 @@ syscall_fault:
515 CFI_ADJUST_CFA_OFFSET 4 536 CFI_ADJUST_CFA_OFFSET 4
516 SAVE_ALL 537 SAVE_ALL
517 GET_THREAD_INFO(%ebp) 538 GET_THREAD_INFO(%ebp)
518 movl $-EFAULT,EAX(%esp) 539 movl $-EFAULT,PT_EAX(%esp)
519 jmp resume_userspace 540 jmp resume_userspace
520 541
521syscall_badsys: 542syscall_badsys:
522 movl $-ENOSYS,EAX(%esp) 543 movl $-ENOSYS,PT_EAX(%esp)
523 jmp resume_userspace 544 jmp resume_userspace
524 CFI_ENDPROC 545 CFI_ENDPROC
525 546
526#define FIXUP_ESPFIX_STACK \ 547#define FIXUP_ESPFIX_STACK \
527 movl %esp, %eax; \ 548 /* since we are on a wrong stack, we cant make it a C code :( */ \
528 /* switch to 32bit stack using the pointer on top of 16bit stack */ \ 549 movl %gs:PDA_cpu, %ebx; \
529 lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ 550 PER_CPU(cpu_gdt_descr, %ebx); \
530 /* copy data from 16bit stack to 32bit stack */ \ 551 movl GDS_address(%ebx), %ebx; \
531 call fixup_x86_bogus_stack; \ 552 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
532 /* put ESP to the proper location */ \ 553 addl %esp, %eax; \
533 movl %eax, %esp; 554 pushl $__KERNEL_DS; \
534#define UNWIND_ESPFIX_STACK \ 555 CFI_ADJUST_CFA_OFFSET 4; \
535 pushl %eax; \ 556 pushl %eax; \
536 CFI_ADJUST_CFA_OFFSET 4; \ 557 CFI_ADJUST_CFA_OFFSET 4; \
558 lss (%esp), %esp; \
559 CFI_ADJUST_CFA_OFFSET -8;
560#define UNWIND_ESPFIX_STACK \
537 movl %ss, %eax; \ 561 movl %ss, %eax; \
538 /* see if on 16bit stack */ \ 562 /* see if on espfix stack */ \
539 cmpw $__ESPFIX_SS, %ax; \ 563 cmpw $__ESPFIX_SS, %ax; \
540 je 28f; \ 564 jne 27f; \
54127: popl %eax; \ 565 movl $__KERNEL_DS, %eax; \
542 CFI_ADJUST_CFA_OFFSET -4; \
543.section .fixup,"ax"; \
54428: movl $__KERNEL_DS, %eax; \
545 movl %eax, %ds; \ 566 movl %eax, %ds; \
546 movl %eax, %es; \ 567 movl %eax, %es; \
547 /* switch to 32bit stack */ \ 568 /* switch to normal stack */ \
548 FIXUP_ESPFIX_STACK; \ 569 FIXUP_ESPFIX_STACK; \
549 jmp 27b; \ 57027:;
550.previous
551 571
552/* 572/*
553 * Build the entry stubs and pointer table with 573 * Build the entry stubs and pointer table with
@@ -608,13 +628,16 @@ KPROBE_ENTRY(page_fault)
608 CFI_ADJUST_CFA_OFFSET 4 628 CFI_ADJUST_CFA_OFFSET 4
609 ALIGN 629 ALIGN
610error_code: 630error_code:
631 /* the function address is in %gs's slot on the stack */
632 pushl %es
633 CFI_ADJUST_CFA_OFFSET 4
634 /*CFI_REL_OFFSET es, 0*/
611 pushl %ds 635 pushl %ds
612 CFI_ADJUST_CFA_OFFSET 4 636 CFI_ADJUST_CFA_OFFSET 4
613 /*CFI_REL_OFFSET ds, 0*/ 637 /*CFI_REL_OFFSET ds, 0*/
614 pushl %eax 638 pushl %eax
615 CFI_ADJUST_CFA_OFFSET 4 639 CFI_ADJUST_CFA_OFFSET 4
616 CFI_REL_OFFSET eax, 0 640 CFI_REL_OFFSET eax, 0
617 xorl %eax, %eax
618 pushl %ebp 641 pushl %ebp
619 CFI_ADJUST_CFA_OFFSET 4 642 CFI_ADJUST_CFA_OFFSET 4
620 CFI_REL_OFFSET ebp, 0 643 CFI_REL_OFFSET ebp, 0
@@ -627,7 +650,6 @@ error_code:
627 pushl %edx 650 pushl %edx
628 CFI_ADJUST_CFA_OFFSET 4 651 CFI_ADJUST_CFA_OFFSET 4
629 CFI_REL_OFFSET edx, 0 652 CFI_REL_OFFSET edx, 0
630 decl %eax # eax = -1
631 pushl %ecx 653 pushl %ecx
632 CFI_ADJUST_CFA_OFFSET 4 654 CFI_ADJUST_CFA_OFFSET 4
633 CFI_REL_OFFSET ecx, 0 655 CFI_REL_OFFSET ecx, 0
@@ -635,18 +657,20 @@ error_code:
635 CFI_ADJUST_CFA_OFFSET 4 657 CFI_ADJUST_CFA_OFFSET 4
636 CFI_REL_OFFSET ebx, 0 658 CFI_REL_OFFSET ebx, 0
637 cld 659 cld
638 pushl %es 660 pushl %gs
639 CFI_ADJUST_CFA_OFFSET 4 661 CFI_ADJUST_CFA_OFFSET 4
640 /*CFI_REL_OFFSET es, 0*/ 662 /*CFI_REL_OFFSET gs, 0*/
663 movl $(__KERNEL_PDA), %ecx
664 movl %ecx, %gs
641 UNWIND_ESPFIX_STACK 665 UNWIND_ESPFIX_STACK
642 popl %ecx 666 popl %ecx
643 CFI_ADJUST_CFA_OFFSET -4 667 CFI_ADJUST_CFA_OFFSET -4
644 /*CFI_REGISTER es, ecx*/ 668 /*CFI_REGISTER es, ecx*/
645 movl ES(%esp), %edi # get the function address 669 movl PT_GS(%esp), %edi # get the function address
646 movl ORIG_EAX(%esp), %edx # get the error code 670 movl PT_ORIG_EAX(%esp), %edx # get the error code
647 movl %eax, ORIG_EAX(%esp) 671 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
648 movl %ecx, ES(%esp) 672 mov %ecx, PT_GS(%esp)
649 /*CFI_REL_OFFSET es, ES*/ 673 /*CFI_REL_OFFSET gs, ES*/
650 movl $(__USER_DS), %ecx 674 movl $(__USER_DS), %ecx
651 movl %ecx, %ds 675 movl %ecx, %ds
652 movl %ecx, %es 676 movl %ecx, %es
@@ -682,7 +706,7 @@ ENTRY(device_not_available)
682 GET_CR0_INTO_EAX 706 GET_CR0_INTO_EAX
683 testl $0x4, %eax # EM (math emulation bit) 707 testl $0x4, %eax # EM (math emulation bit)
684 jne device_not_available_emulate 708 jne device_not_available_emulate
685 preempt_stop 709 preempt_stop(CLBR_ANY)
686 call math_state_restore 710 call math_state_restore
687 jmp ret_from_exception 711 jmp ret_from_exception
688device_not_available_emulate: 712device_not_available_emulate:
@@ -754,7 +778,7 @@ KPROBE_ENTRY(nmi)
754 cmpw $__ESPFIX_SS, %ax 778 cmpw $__ESPFIX_SS, %ax
755 popl %eax 779 popl %eax
756 CFI_ADJUST_CFA_OFFSET -4 780 CFI_ADJUST_CFA_OFFSET -4
757 je nmi_16bit_stack 781 je nmi_espfix_stack
758 cmpl $sysenter_entry,(%esp) 782 cmpl $sysenter_entry,(%esp)
759 je nmi_stack_fixup 783 je nmi_stack_fixup
760 pushl %eax 784 pushl %eax
@@ -797,7 +821,7 @@ nmi_debug_stack_check:
797 FIX_STACK(24,nmi_stack_correct, 1) 821 FIX_STACK(24,nmi_stack_correct, 1)
798 jmp nmi_stack_correct 822 jmp nmi_stack_correct
799 823
800nmi_16bit_stack: 824nmi_espfix_stack:
801 /* We have a RING0_INT_FRAME here. 825 /* We have a RING0_INT_FRAME here.
802 * 826 *
803 * create the pointer to lss back 827 * create the pointer to lss back
@@ -806,7 +830,6 @@ nmi_16bit_stack:
806 CFI_ADJUST_CFA_OFFSET 4 830 CFI_ADJUST_CFA_OFFSET 4
807 pushl %esp 831 pushl %esp
808 CFI_ADJUST_CFA_OFFSET 4 832 CFI_ADJUST_CFA_OFFSET 4
809 movzwl %sp, %esp
810 addw $4, (%esp) 833 addw $4, (%esp)
811 /* copy the iret frame of 12 bytes */ 834 /* copy the iret frame of 12 bytes */
812 .rept 3 835 .rept 3
@@ -817,11 +840,11 @@ nmi_16bit_stack:
817 CFI_ADJUST_CFA_OFFSET 4 840 CFI_ADJUST_CFA_OFFSET 4
818 SAVE_ALL 841 SAVE_ALL
819 FIXUP_ESPFIX_STACK # %eax == %esp 842 FIXUP_ESPFIX_STACK # %eax == %esp
820 CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
821 xorl %edx,%edx # zero error code 843 xorl %edx,%edx # zero error code
822 call do_nmi 844 call do_nmi
823 RESTORE_REGS 845 RESTORE_REGS
824 lss 12+4(%esp), %esp # back to 16bit stack 846 lss 12+4(%esp), %esp # back to espfix stack
847 CFI_ADJUST_CFA_OFFSET -24
8251: INTERRUPT_RETURN 8481: INTERRUPT_RETURN
826 CFI_ENDPROC 849 CFI_ENDPROC
827.section __ex_table,"a" 850.section __ex_table,"a"
@@ -830,6 +853,19 @@ nmi_16bit_stack:
830.previous 853.previous
831KPROBE_END(nmi) 854KPROBE_END(nmi)
832 855
856#ifdef CONFIG_PARAVIRT
857ENTRY(native_iret)
8581: iret
859.section __ex_table,"a"
860 .align 4
861 .long 1b,iret_exc
862.previous
863
864ENTRY(native_irq_enable_sysexit)
865 sti
866 sysexit
867#endif
868
833KPROBE_ENTRY(int3) 869KPROBE_ENTRY(int3)
834 RING0_INT_FRAME 870 RING0_INT_FRAME
835 pushl $-1 # mark this as an int 871 pushl $-1 # mark this as an int
@@ -949,26 +985,27 @@ ENTRY(arch_unwind_init_running)
949 movl 4(%esp), %edx 985 movl 4(%esp), %edx
950 movl (%esp), %ecx 986 movl (%esp), %ecx
951 leal 4(%esp), %eax 987 leal 4(%esp), %eax
952 movl %ebx, EBX(%edx) 988 movl %ebx, PT_EBX(%edx)
953 xorl %ebx, %ebx 989 xorl %ebx, %ebx
954 movl %ebx, ECX(%edx) 990 movl %ebx, PT_ECX(%edx)
955 movl %ebx, EDX(%edx) 991 movl %ebx, PT_EDX(%edx)
956 movl %esi, ESI(%edx) 992 movl %esi, PT_ESI(%edx)
957 movl %edi, EDI(%edx) 993 movl %edi, PT_EDI(%edx)
958 movl %ebp, EBP(%edx) 994 movl %ebp, PT_EBP(%edx)
959 movl %ebx, EAX(%edx) 995 movl %ebx, PT_EAX(%edx)
960 movl $__USER_DS, DS(%edx) 996 movl $__USER_DS, PT_DS(%edx)
961 movl $__USER_DS, ES(%edx) 997 movl $__USER_DS, PT_ES(%edx)
962 movl %ebx, ORIG_EAX(%edx) 998 movl $0, PT_GS(%edx)
963 movl %ecx, EIP(%edx) 999 movl %ebx, PT_ORIG_EAX(%edx)
1000 movl %ecx, PT_EIP(%edx)
964 movl 12(%esp), %ecx 1001 movl 12(%esp), %ecx
965 movl $__KERNEL_CS, CS(%edx) 1002 movl $__KERNEL_CS, PT_CS(%edx)
966 movl %ebx, EFLAGS(%edx) 1003 movl %ebx, PT_EFLAGS(%edx)
967 movl %eax, OLDESP(%edx) 1004 movl %eax, PT_OLDESP(%edx)
968 movl 8(%esp), %eax 1005 movl 8(%esp), %eax
969 movl %ecx, 8(%esp) 1006 movl %ecx, 8(%esp)
970 movl EBX(%edx), %ebx 1007 movl PT_EBX(%edx), %ebx
971 movl $__KERNEL_DS, OLDSS(%edx) 1008 movl $__KERNEL_DS, PT_OLDSS(%edx)
972 jmpl *%eax 1009 jmpl *%eax
973 CFI_ENDPROC 1010 CFI_ENDPROC
974ENDPROC(arch_unwind_init_running) 1011ENDPROC(arch_unwind_init_running)
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index ca31f18d277c..edef5084ce17 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -55,6 +55,12 @@
55 */ 55 */
56ENTRY(startup_32) 56ENTRY(startup_32)
57 57
58#ifdef CONFIG_PARAVIRT
59 movl %cs, %eax
60 testl $0x3, %eax
61 jnz startup_paravirt
62#endif
63
58/* 64/*
59 * Set segments to known values. 65 * Set segments to known values.
60 */ 66 */
@@ -302,6 +308,7 @@ is386: movl $2,%ecx # set MP
302 movl %eax,%cr0 308 movl %eax,%cr0
303 309
304 call check_x87 310 call check_x87
311 call setup_pda
305 lgdt cpu_gdt_descr 312 lgdt cpu_gdt_descr
306 lidt idt_descr 313 lidt idt_descr
307 ljmp $(__KERNEL_CS),$1f 314 ljmp $(__KERNEL_CS),$1f
@@ -312,10 +319,13 @@ is386: movl $2,%ecx # set MP
312 movl %eax,%ds 319 movl %eax,%ds
313 movl %eax,%es 320 movl %eax,%es
314 321
315 xorl %eax,%eax # Clear FS/GS and LDT 322 xorl %eax,%eax # Clear FS and LDT
316 movl %eax,%fs 323 movl %eax,%fs
317 movl %eax,%gs
318 lldt %ax 324 lldt %ax
325
326 movl $(__KERNEL_PDA),%eax
327 mov %eax,%gs
328
319 cld # gcc2 wants the direction flag cleared at all times 329 cld # gcc2 wants the direction flag cleared at all times
320 pushl $0 # fake return address for unwinder 330 pushl $0 # fake return address for unwinder
321#ifdef CONFIG_SMP 331#ifdef CONFIG_SMP
@@ -346,6 +356,23 @@ check_x87:
346 ret 356 ret
347 357
348/* 358/*
359 * Point the GDT at this CPU's PDA. On boot this will be
360 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
361 * that CPU's GDT and PDA.
362 */
363setup_pda:
364 /* get the PDA pointer */
365 movl start_pda, %eax
366
367 /* slot the PDA address into the GDT */
368 mov cpu_gdt_descr+2, %ecx
369 mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
370 shr $16, %eax
371 mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
372 mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
373 ret
374
375/*
349 * setup_idt 376 * setup_idt
350 * 377 *
351 * sets up a idt with 256 entries pointing to 378 * sets up a idt with 256 entries pointing to
@@ -465,6 +492,33 @@ ignore_int:
465#endif 492#endif
466 iret 493 iret
467 494
495#ifdef CONFIG_PARAVIRT
496startup_paravirt:
497 cld
498 movl $(init_thread_union+THREAD_SIZE),%esp
499
500 /* We take pains to preserve all the regs. */
501 pushl %edx
502 pushl %ecx
503 pushl %eax
504
505 /* paravirt.o is last in link, and that probe fn never returns */
506 pushl $__start_paravirtprobe
5071:
508 movl 0(%esp), %eax
509 pushl (%eax)
510 movl 8(%esp), %eax
511 call *(%esp)
512 popl %eax
513
514 movl 4(%esp), %eax
515 movl 8(%esp), %ecx
516 movl 12(%esp), %edx
517
518 addl $4, (%esp)
519 jmp 1b
520#endif
521
468/* 522/*
469 * Real beginning of normal "text" segment 523 * Real beginning of normal "text" segment
470 */ 524 */
@@ -484,6 +538,8 @@ ENTRY(empty_zero_page)
484 * This starts the data section. 538 * This starts the data section.
485 */ 539 */
486.data 540.data
541ENTRY(start_pda)
542 .long boot_pda
487 543
488ENTRY(stack_start) 544ENTRY(stack_start)
489 .long init_thread_union+THREAD_SIZE 545 .long init_thread_union+THREAD_SIZE
@@ -525,7 +581,7 @@ idt_descr:
525 581
526# boot GDT descriptor (later on used by CPU#0): 582# boot GDT descriptor (later on used by CPU#0):
527 .word 0 # 32 bit align gdt_desc.address 583 .word 0 # 32 bit align gdt_desc.address
528cpu_gdt_descr: 584ENTRY(cpu_gdt_descr)
529 .word GDT_ENTRIES*8-1 585 .word GDT_ENTRIES*8-1
530 .long cpu_gdt_table 586 .long cpu_gdt_table
531 587
@@ -584,8 +640,8 @@ ENTRY(cpu_gdt_table)
584 .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */ 640 .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */
585 .quad 0x004092000000ffff /* 0xc8 APM DS data */ 641 .quad 0x004092000000ffff /* 0xc8 APM DS data */
586 642
587 .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */ 643 .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */
588 .quad 0x0000000000000000 /* 0xd8 - unused */ 644 .quad 0x00cf92000000ffff /* 0xd8 - PDA */
589 .quad 0x0000000000000000 /* 0xe0 - unused */ 645 .quad 0x0000000000000000 /* 0xe0 - unused */
590 .quad 0x0000000000000000 /* 0xe8 - unused */ 646 .quad 0x0000000000000000 /* 0xe8 - unused */
591 .quad 0x0000000000000000 /* 0xf0 - unused */ 647 .quad 0x0000000000000000 /* 0xf0 - unused */
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 17647a530b2f..45a8685bb60b 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -34,6 +34,7 @@ static int __init init_hpet_clocksource(void)
34 unsigned long hpet_period; 34 unsigned long hpet_period;
35 void __iomem* hpet_base; 35 void __iomem* hpet_base;
36 u64 tmp; 36 u64 tmp;
37 int err;
37 38
38 if (!is_hpet_enabled()) 39 if (!is_hpet_enabled())
39 return -ENODEV; 40 return -ENODEV;
@@ -61,7 +62,11 @@ static int __init init_hpet_clocksource(void)
61 do_div(tmp, FSEC_PER_NSEC); 62 do_div(tmp, FSEC_PER_NSEC);
62 clocksource_hpet.mult = (u32)tmp; 63 clocksource_hpet.mult = (u32)tmp;
63 64
64 return clocksource_register(&clocksource_hpet); 65 err = clocksource_register(&clocksource_hpet);
66 if (err)
67 iounmap(hpet_base);
68
69 return err;
65} 70}
66 71
67module_init(init_hpet_clocksource); 72module_init(init_hpet_clocksource);
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index 62996cd17084..c8d45821c788 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -381,7 +381,10 @@ void __init init_ISA_irqs (void)
381 } 381 }
382} 382}
383 383
384void __init init_IRQ(void) 384/* Overridden in paravirt.c */
385void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
386
387void __init native_init_IRQ(void)
385{ 388{
386 int i; 389 int i;
387 390
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 44c5a3206b2a..e21dcde0790e 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -154,14 +154,20 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
154 * the interrupt, and we need to make sure the entry is fully populated 154 * the interrupt, and we need to make sure the entry is fully populated
155 * before that happens. 155 * before that happens.
156 */ 156 */
157static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 157static void
158__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
158{ 159{
159 unsigned long flags;
160 union entry_union eu; 160 union entry_union eu;
161 eu.entry = e; 161 eu.entry = e;
162 spin_lock_irqsave(&ioapic_lock, flags);
163 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 162 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
164 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 163 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
164}
165
166static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
167{
168 unsigned long flags;
169 spin_lock_irqsave(&ioapic_lock, flags);
170 __ioapic_write_entry(apic, pin, e);
165 spin_unlock_irqrestore(&ioapic_lock, flags); 171 spin_unlock_irqrestore(&ioapic_lock, flags);
166} 172}
167 173
@@ -837,8 +843,7 @@ static int __init find_isa_irq_pin(int irq, int type)
837 843
838 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 844 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
839 mp_bus_id_to_type[lbus] == MP_BUS_EISA || 845 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
840 mp_bus_id_to_type[lbus] == MP_BUS_MCA || 846 mp_bus_id_to_type[lbus] == MP_BUS_MCA
841 mp_bus_id_to_type[lbus] == MP_BUS_NEC98
842 ) && 847 ) &&
843 (mp_irqs[i].mpc_irqtype == type) && 848 (mp_irqs[i].mpc_irqtype == type) &&
844 (mp_irqs[i].mpc_srcbusirq == irq)) 849 (mp_irqs[i].mpc_srcbusirq == irq))
@@ -857,8 +862,7 @@ static int __init find_isa_irq_apic(int irq, int type)
857 862
858 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 863 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
859 mp_bus_id_to_type[lbus] == MP_BUS_EISA || 864 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
860 mp_bus_id_to_type[lbus] == MP_BUS_MCA || 865 mp_bus_id_to_type[lbus] == MP_BUS_MCA
861 mp_bus_id_to_type[lbus] == MP_BUS_NEC98
862 ) && 866 ) &&
863 (mp_irqs[i].mpc_irqtype == type) && 867 (mp_irqs[i].mpc_irqtype == type) &&
864 (mp_irqs[i].mpc_srcbusirq == irq)) 868 (mp_irqs[i].mpc_srcbusirq == irq))
@@ -988,12 +992,6 @@ static int EISA_ELCR(unsigned int irq)
988#define default_MCA_trigger(idx) (1) 992#define default_MCA_trigger(idx) (1)
989#define default_MCA_polarity(idx) (0) 993#define default_MCA_polarity(idx) (0)
990 994
991/* NEC98 interrupts are always polarity zero edge triggered,
992 * when listed as conforming in the MP table. */
993
994#define default_NEC98_trigger(idx) (0)
995#define default_NEC98_polarity(idx) (0)
996
997static int __init MPBIOS_polarity(int idx) 995static int __init MPBIOS_polarity(int idx)
998{ 996{
999 int bus = mp_irqs[idx].mpc_srcbus; 997 int bus = mp_irqs[idx].mpc_srcbus;
@@ -1028,11 +1026,6 @@ static int __init MPBIOS_polarity(int idx)
1028 polarity = default_MCA_polarity(idx); 1026 polarity = default_MCA_polarity(idx);
1029 break; 1027 break;
1030 } 1028 }
1031 case MP_BUS_NEC98: /* NEC 98 pin */
1032 {
1033 polarity = default_NEC98_polarity(idx);
1034 break;
1035 }
1036 default: 1029 default:
1037 { 1030 {
1038 printk(KERN_WARNING "broken BIOS!!\n"); 1031 printk(KERN_WARNING "broken BIOS!!\n");
@@ -1102,11 +1095,6 @@ static int MPBIOS_trigger(int idx)
1102 trigger = default_MCA_trigger(idx); 1095 trigger = default_MCA_trigger(idx);
1103 break; 1096 break;
1104 } 1097 }
1105 case MP_BUS_NEC98: /* NEC 98 pin */
1106 {
1107 trigger = default_NEC98_trigger(idx);
1108 break;
1109 }
1110 default: 1098 default:
1111 { 1099 {
1112 printk(KERN_WARNING "broken BIOS!!\n"); 1100 printk(KERN_WARNING "broken BIOS!!\n");
@@ -1168,7 +1156,6 @@ static int pin_2_irq(int idx, int apic, int pin)
1168 case MP_BUS_ISA: /* ISA pin */ 1156 case MP_BUS_ISA: /* ISA pin */
1169 case MP_BUS_EISA: 1157 case MP_BUS_EISA:
1170 case MP_BUS_MCA: 1158 case MP_BUS_MCA:
1171 case MP_BUS_NEC98:
1172 { 1159 {
1173 irq = mp_irqs[idx].mpc_srcbusirq; 1160 irq = mp_irqs[idx].mpc_srcbusirq;
1174 break; 1161 break;
@@ -1236,7 +1223,7 @@ static inline int IO_APIC_irq_trigger(int irq)
1236} 1223}
1237 1224
1238/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ 1225/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
1239u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; 1226static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
1240 1227
1241static int __assign_irq_vector(int irq) 1228static int __assign_irq_vector(int irq)
1242{ 1229{
@@ -1361,8 +1348,8 @@ static void __init setup_IO_APIC_irqs(void)
1361 if (!apic && (irq < 16)) 1348 if (!apic && (irq < 16))
1362 disable_8259A_irq(irq); 1349 disable_8259A_irq(irq);
1363 } 1350 }
1364 ioapic_write_entry(apic, pin, entry);
1365 spin_lock_irqsave(&ioapic_lock, flags); 1351 spin_lock_irqsave(&ioapic_lock, flags);
1352 __ioapic_write_entry(apic, pin, entry);
1366 set_native_irq_info(irq, TARGET_CPUS); 1353 set_native_irq_info(irq, TARGET_CPUS);
1367 spin_unlock_irqrestore(&ioapic_lock, flags); 1354 spin_unlock_irqrestore(&ioapic_lock, flags);
1368 } 1355 }
@@ -1927,6 +1914,15 @@ static void __init setup_ioapic_ids_from_mpc(void)
1927static void __init setup_ioapic_ids_from_mpc(void) { } 1914static void __init setup_ioapic_ids_from_mpc(void) { }
1928#endif 1915#endif
1929 1916
1917static int no_timer_check __initdata;
1918
1919static int __init notimercheck(char *s)
1920{
1921 no_timer_check = 1;
1922 return 1;
1923}
1924__setup("no_timer_check", notimercheck);
1925
1930/* 1926/*
1931 * There is a nasty bug in some older SMP boards, their mptable lies 1927 * There is a nasty bug in some older SMP boards, their mptable lies
1932 * about the timer IRQ. We do the following to work around the situation: 1928 * about the timer IRQ. We do the following to work around the situation:
@@ -1935,10 +1931,13 @@ static void __init setup_ioapic_ids_from_mpc(void) { }
1935 * - if this function detects that timer IRQs are defunct, then we fall 1931 * - if this function detects that timer IRQs are defunct, then we fall
1936 * back to ISA timer IRQs 1932 * back to ISA timer IRQs
1937 */ 1933 */
1938static int __init timer_irq_works(void) 1934int __init timer_irq_works(void)
1939{ 1935{
1940 unsigned long t1 = jiffies; 1936 unsigned long t1 = jiffies;
1941 1937
1938 if (no_timer_check)
1939 return 1;
1940
1942 local_irq_enable(); 1941 local_irq_enable();
1943 /* Let ten ticks pass... */ 1942 /* Let ten ticks pass... */
1944 mdelay((10 * 1000) / HZ); 1943 mdelay((10 * 1000) / HZ);
@@ -2162,9 +2161,15 @@ static inline void unlock_ExtINT_logic(void)
2162 unsigned char save_control, save_freq_select; 2161 unsigned char save_control, save_freq_select;
2163 2162
2164 pin = find_isa_irq_pin(8, mp_INT); 2163 pin = find_isa_irq_pin(8, mp_INT);
2164 if (pin == -1) {
2165 WARN_ON_ONCE(1);
2166 return;
2167 }
2165 apic = find_isa_irq_apic(8, mp_INT); 2168 apic = find_isa_irq_apic(8, mp_INT);
2166 if (pin == -1) 2169 if (apic == -1) {
2170 WARN_ON_ONCE(1);
2167 return; 2171 return;
2172 }
2168 2173
2169 entry0 = ioapic_read_entry(apic, pin); 2174 entry0 = ioapic_read_entry(apic, pin);
2170 clear_IO_APIC_pin(apic, pin); 2175 clear_IO_APIC_pin(apic, pin);
@@ -2209,7 +2214,7 @@ int timer_uses_ioapic_pin_0;
2209 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast 2214 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2210 * fanatically on his truly buggy board. 2215 * fanatically on his truly buggy board.
2211 */ 2216 */
2212static inline void check_timer(void) 2217static inline void __init check_timer(void)
2213{ 2218{
2214 int apic1, pin1, apic2, pin2; 2219 int apic1, pin1, apic2, pin2;
2215 int vector; 2220 int vector;
@@ -2857,8 +2862,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
2857 if (!ioapic && (irq < 16)) 2862 if (!ioapic && (irq < 16))
2858 disable_8259A_irq(irq); 2863 disable_8259A_irq(irq);
2859 2864
2860 ioapic_write_entry(ioapic, pin, entry);
2861 spin_lock_irqsave(&ioapic_lock, flags); 2865 spin_lock_irqsave(&ioapic_lock, flags);
2866 __ioapic_write_entry(ioapic, pin, entry);
2862 set_native_irq_info(irq, TARGET_CPUS); 2867 set_native_irq_info(irq, TARGET_CPUS);
2863 spin_unlock_irqrestore(&ioapic_lock, flags); 2868 spin_unlock_irqrestore(&ioapic_lock, flags);
2864 2869
diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c
index 445211eb2d57..b410e5fb034f 100644
--- a/arch/i386/kernel/ldt.c
+++ b/arch/i386/kernel/ldt.c
@@ -160,16 +160,14 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount)
160{ 160{
161 int err; 161 int err;
162 unsigned long size; 162 unsigned long size;
163 void *address;
164 163
165 err = 0; 164 err = 0;
166 address = &default_ldt[0];
167 size = 5*sizeof(struct desc_struct); 165 size = 5*sizeof(struct desc_struct);
168 if (size > bytecount) 166 if (size > bytecount)
169 size = bytecount; 167 size = bytecount;
170 168
171 err = size; 169 err = size;
172 if (copy_to_user(ptr, address, size)) 170 if (clear_user(ptr, size))
173 err = -EFAULT; 171 err = -EFAULT;
174 172
175 return err; 173 return err;
diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c
index eb57a851789d..b83672b89527 100644
--- a/arch/i386/kernel/mca.c
+++ b/arch/i386/kernel/mca.c
@@ -283,10 +283,9 @@ static int __init mca_init(void)
283 bus->f.mca_transform_memory = mca_dummy_transform_memory; 283 bus->f.mca_transform_memory = mca_dummy_transform_memory;
284 284
285 /* get the motherboard device */ 285 /* get the motherboard device */
286 mca_dev = kmalloc(sizeof(struct mca_device), GFP_KERNEL); 286 mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
287 if(unlikely(!mca_dev)) 287 if(unlikely(!mca_dev))
288 goto out_nomem; 288 goto out_nomem;
289 memset(mca_dev, 0, sizeof(struct mca_device));
290 289
291 /* 290 /*
292 * We do not expect many MCA interrupts during initialization, 291 * We do not expect many MCA interrupts during initialization,
@@ -310,11 +309,9 @@ static int __init mca_init(void)
310 mca_dev->slot = MCA_MOTHERBOARD; 309 mca_dev->slot = MCA_MOTHERBOARD;
311 mca_register_device(MCA_PRIMARY_BUS, mca_dev); 310 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
312 311
313 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 312 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
314 if(unlikely(!mca_dev)) 313 if(unlikely(!mca_dev))
315 goto out_unlock_nomem; 314 goto out_unlock_nomem;
316 memset(mca_dev, 0, sizeof(struct mca_device));
317
318 315
319 /* Put motherboard into video setup mode, read integrated video 316 /* Put motherboard into video setup mode, read integrated video
320 * POS registers, and turn motherboard setup off. 317 * POS registers, and turn motherboard setup off.
@@ -349,10 +346,9 @@ static int __init mca_init(void)
349 } 346 }
350 if(which_scsi) { 347 if(which_scsi) {
351 /* found a scsi card */ 348 /* found a scsi card */
352 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 349 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
353 if(unlikely(!mca_dev)) 350 if(unlikely(!mca_dev))
354 goto out_unlock_nomem; 351 goto out_unlock_nomem;
355 memset(mca_dev, 0, sizeof(struct mca_device));
356 352
357 for(j = 0; j < 8; j++) 353 for(j = 0; j < 8; j++)
358 mca_dev->pos[j] = pos[j]; 354 mca_dev->pos[j] = pos[j];
@@ -378,10 +374,9 @@ static int __init mca_init(void)
378 if(!mca_read_and_store_pos(pos)) 374 if(!mca_read_and_store_pos(pos))
379 continue; 375 continue;
380 376
381 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 377 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
382 if(unlikely(!mca_dev)) 378 if(unlikely(!mca_dev))
383 goto out_unlock_nomem; 379 goto out_unlock_nomem;
384 memset(mca_dev, 0, sizeof(struct mca_device));
385 380
386 for(j=0; j<8; j++) 381 for(j=0; j<8; j++)
387 mca_dev->pos[j]=pos[j]; 382 mca_dev->pos[j]=pos[j];
diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c
index 470cf97e7cd3..d7d9c8b23f72 100644
--- a/arch/i386/kernel/module.c
+++ b/arch/i386/kernel/module.c
@@ -108,7 +108,8 @@ int module_finalize(const Elf_Ehdr *hdr,
108 const Elf_Shdr *sechdrs, 108 const Elf_Shdr *sechdrs,
109 struct module *me) 109 struct module *me)
110{ 110{
111 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 111 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
112 *para = NULL;
112 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 113 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
113 114
114 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 115 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -118,6 +119,8 @@ int module_finalize(const Elf_Ehdr *hdr,
118 alt = s; 119 alt = s;
119 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 120 if (!strcmp(".smp_locks", secstrings + s->sh_name))
120 locks= s; 121 locks= s;
122 if (!strcmp(".parainstructions", secstrings + s->sh_name))
123 para = s;
121 } 124 }
122 125
123 if (alt) { 126 if (alt) {
@@ -132,6 +135,12 @@ int module_finalize(const Elf_Ehdr *hdr,
132 lseg, lseg + locks->sh_size, 135 lseg, lseg + locks->sh_size,
133 tseg, tseg + text->sh_size); 136 tseg, tseg + text->sh_size);
134 } 137 }
138
139 if (para) {
140 void *pseg = (void *)para->sh_addr;
141 apply_paravirt(pseg, pseg + para->sh_size);
142 }
143
135 return 0; 144 return 0;
136} 145}
137 146
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 442aaf8c77eb..2ce67228dff8 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -249,8 +249,6 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
249 mp_current_pci_id++; 249 mp_current_pci_id++;
250 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { 250 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
251 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; 251 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
252 } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
253 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
254 } else { 252 } else {
255 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 253 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
256 } 254 }
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 7763c67ca282..1d1a56cae340 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -195,7 +195,6 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
195{ 195{
196 const u32 __user *tmp = (const u32 __user *)buf; 196 const u32 __user *tmp = (const u32 __user *)buf;
197 u32 data[2]; 197 u32 data[2];
198 size_t rv;
199 u32 reg = *ppos; 198 u32 reg = *ppos;
200 int cpu = iminor(file->f_dentry->d_inode); 199 int cpu = iminor(file->f_dentry->d_inode);
201 int err; 200 int err;
@@ -203,7 +202,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
203 if (count % 8) 202 if (count % 8)
204 return -EINVAL; /* Invalid chunk size */ 203 return -EINVAL; /* Invalid chunk size */
205 204
206 for (rv = 0; count; count -= 8) { 205 for (; count; count -= 8) {
207 if (copy_from_user(&data, tmp, 8)) 206 if (copy_from_user(&data, tmp, 8))
208 return -EFAULT; 207 return -EFAULT;
209 err = do_wrmsr(cpu, reg, data[0], data[1]); 208 err = do_wrmsr(cpu, reg, data[0], data[1]);
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index eaafe233a5da..f5bc7e1be801 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -22,6 +22,7 @@
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/dmi.h> 23#include <linux/dmi.h>
24#include <linux/kprobes.h> 24#include <linux/kprobes.h>
25#include <linux/cpumask.h>
25 26
26#include <asm/smp.h> 27#include <asm/smp.h>
27#include <asm/nmi.h> 28#include <asm/nmi.h>
@@ -42,6 +43,8 @@ int nmi_watchdog_enabled;
42static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner); 43static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
43static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]); 44static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
44 45
46static cpumask_t backtrace_mask = CPU_MASK_NONE;
47
45/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's 48/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
46 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) 49 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
47 */ 50 */
@@ -867,14 +870,16 @@ static unsigned int
867 870
868void touch_nmi_watchdog (void) 871void touch_nmi_watchdog (void)
869{ 872{
870 int i; 873 if (nmi_watchdog > 0) {
874 unsigned cpu;
871 875
872 /* 876 /*
873 * Just reset the alert counters, (other CPUs might be 877 * Just reset the alert counters, (other CPUs might be
874 * spinning on locks we hold): 878 * spinning on locks we hold):
875 */ 879 */
876 for_each_possible_cpu(i) 880 for_each_present_cpu (cpu)
877 alert_counter[i] = 0; 881 alert_counter[cpu] = 0;
882 }
878 883
879 /* 884 /*
880 * Tickle the softlockup detector too: 885 * Tickle the softlockup detector too:
@@ -907,6 +912,16 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
907 touched = 1; 912 touched = 1;
908 } 913 }
909 914
915 if (cpu_isset(cpu, backtrace_mask)) {
916 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
917
918 spin_lock(&lock);
919 printk("NMI backtrace for cpu %d\n", cpu);
920 dump_stack();
921 spin_unlock(&lock);
922 cpu_clear(cpu, backtrace_mask);
923 }
924
910 sum = per_cpu(irq_stat, cpu).apic_timer_irqs; 925 sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
911 926
912 /* if the apic timer isn't firing, this cpu isn't doing much */ 927 /* if the apic timer isn't firing, this cpu isn't doing much */
@@ -1033,6 +1048,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
1033 1048
1034#endif 1049#endif
1035 1050
1051void __trigger_all_cpu_backtrace(void)
1052{
1053 int i;
1054
1055 backtrace_mask = cpu_online_map;
1056 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
1057 for (i = 0; i < 10 * 1000; i++) {
1058 if (cpus_empty(backtrace_mask))
1059 break;
1060 mdelay(1);
1061 }
1062}
1063
1036EXPORT_SYMBOL(nmi_active); 1064EXPORT_SYMBOL(nmi_active);
1037EXPORT_SYMBOL(nmi_watchdog); 1065EXPORT_SYMBOL(nmi_watchdog);
1038EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); 1066EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
new file mode 100644
index 000000000000..3dceab5828f1
--- /dev/null
+++ b/arch/i386/kernel/paravirt.c
@@ -0,0 +1,569 @@
1/* Paravirtualization interfaces
2 Copyright (C) 2006 Rusty Russell IBM Corporation
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18#include <linux/errno.h>
19#include <linux/module.h>
20#include <linux/efi.h>
21#include <linux/bcd.h>
22#include <linux/start_kernel.h>
23
24#include <asm/bug.h>
25#include <asm/paravirt.h>
26#include <asm/desc.h>
27#include <asm/setup.h>
28#include <asm/arch_hooks.h>
29#include <asm/time.h>
30#include <asm/irq.h>
31#include <asm/delay.h>
32#include <asm/fixmap.h>
33#include <asm/apic.h>
34#include <asm/tlbflush.h>
35
36/* nop stub */
37static void native_nop(void)
38{
39}
40
41static void __init default_banner(void)
42{
43 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
44 paravirt_ops.name);
45}
46
47char *memory_setup(void)
48{
49 return paravirt_ops.memory_setup();
50}
51
52/* Simple instruction patching code. */
53#define DEF_NATIVE(name, code) \
54 extern const char start_##name[], end_##name[]; \
55 asm("start_" #name ": " code "; end_" #name ":")
56DEF_NATIVE(cli, "cli");
57DEF_NATIVE(sti, "sti");
58DEF_NATIVE(popf, "push %eax; popf");
59DEF_NATIVE(pushf, "pushf; pop %eax");
60DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli");
61DEF_NATIVE(iret, "iret");
62DEF_NATIVE(sti_sysexit, "sti; sysexit");
63
64static const struct native_insns
65{
66 const char *start, *end;
67} native_insns[] = {
68 [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
69 [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
70 [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
71 [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
72 [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
73 [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
74 [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
75};
76
77static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
78{
79 unsigned int insn_len;
80
81 /* Don't touch it if we don't have a replacement */
82 if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
83 return len;
84
85 insn_len = native_insns[type].end - native_insns[type].start;
86
87 /* Similarly if we can't fit replacement. */
88 if (len < insn_len)
89 return len;
90
91 memcpy(insns, native_insns[type].start, insn_len);
92 return insn_len;
93}
94
95static fastcall unsigned long native_get_debugreg(int regno)
96{
97 unsigned long val = 0; /* Damn you, gcc! */
98
99 switch (regno) {
100 case 0:
101 asm("movl %%db0, %0" :"=r" (val)); break;
102 case 1:
103 asm("movl %%db1, %0" :"=r" (val)); break;
104 case 2:
105 asm("movl %%db2, %0" :"=r" (val)); break;
106 case 3:
107 asm("movl %%db3, %0" :"=r" (val)); break;
108 case 6:
109 asm("movl %%db6, %0" :"=r" (val)); break;
110 case 7:
111 asm("movl %%db7, %0" :"=r" (val)); break;
112 default:
113 BUG();
114 }
115 return val;
116}
117
118static fastcall void native_set_debugreg(int regno, unsigned long value)
119{
120 switch (regno) {
121 case 0:
122 asm("movl %0,%%db0" : /* no output */ :"r" (value));
123 break;
124 case 1:
125 asm("movl %0,%%db1" : /* no output */ :"r" (value));
126 break;
127 case 2:
128 asm("movl %0,%%db2" : /* no output */ :"r" (value));
129 break;
130 case 3:
131 asm("movl %0,%%db3" : /* no output */ :"r" (value));
132 break;
133 case 6:
134 asm("movl %0,%%db6" : /* no output */ :"r" (value));
135 break;
136 case 7:
137 asm("movl %0,%%db7" : /* no output */ :"r" (value));
138 break;
139 default:
140 BUG();
141 }
142}
143
144void init_IRQ(void)
145{
146 paravirt_ops.init_IRQ();
147}
148
149static fastcall void native_clts(void)
150{
151 asm volatile ("clts");
152}
153
154static fastcall unsigned long native_read_cr0(void)
155{
156 unsigned long val;
157 asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
158 return val;
159}
160
161static fastcall void native_write_cr0(unsigned long val)
162{
163 asm volatile("movl %0,%%cr0": :"r" (val));
164}
165
166static fastcall unsigned long native_read_cr2(void)
167{
168 unsigned long val;
169 asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
170 return val;
171}
172
173static fastcall void native_write_cr2(unsigned long val)
174{
175 asm volatile("movl %0,%%cr2": :"r" (val));
176}
177
178static fastcall unsigned long native_read_cr3(void)
179{
180 unsigned long val;
181 asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
182 return val;
183}
184
185static fastcall void native_write_cr3(unsigned long val)
186{
187 asm volatile("movl %0,%%cr3": :"r" (val));
188}
189
190static fastcall unsigned long native_read_cr4(void)
191{
192 unsigned long val;
193 asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
194 return val;
195}
196
197static fastcall unsigned long native_read_cr4_safe(void)
198{
199 unsigned long val;
200 /* This could fault if %cr4 does not exist */
201 asm("1: movl %%cr4, %0 \n"
202 "2: \n"
203 ".section __ex_table,\"a\" \n"
204 ".long 1b,2b \n"
205 ".previous \n"
206 : "=r" (val): "0" (0));
207 return val;
208}
209
210static fastcall void native_write_cr4(unsigned long val)
211{
212 asm volatile("movl %0,%%cr4": :"r" (val));
213}
214
215static fastcall unsigned long native_save_fl(void)
216{
217 unsigned long f;
218 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
219 return f;
220}
221
222static fastcall void native_restore_fl(unsigned long f)
223{
224 asm volatile("pushl %0 ; popfl": /* no output */
225 :"g" (f)
226 :"memory", "cc");
227}
228
229static fastcall void native_irq_disable(void)
230{
231 asm volatile("cli": : :"memory");
232}
233
234static fastcall void native_irq_enable(void)
235{
236 asm volatile("sti": : :"memory");
237}
238
239static fastcall void native_safe_halt(void)
240{
241 asm volatile("sti; hlt": : :"memory");
242}
243
244static fastcall void native_halt(void)
245{
246 asm volatile("hlt": : :"memory");
247}
248
249static fastcall void native_wbinvd(void)
250{
251 asm volatile("wbinvd": : :"memory");
252}
253
254static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
255{
256 unsigned long long val;
257
258 asm volatile("2: rdmsr ; xorl %0,%0\n"
259 "1:\n\t"
260 ".section .fixup,\"ax\"\n\t"
261 "3: movl %3,%0 ; jmp 1b\n\t"
262 ".previous\n\t"
263 ".section __ex_table,\"a\"\n"
264 " .align 4\n\t"
265 " .long 2b,3b\n\t"
266 ".previous"
267 : "=r" (*err), "=A" (val)
268 : "c" (msr), "i" (-EFAULT));
269
270 return val;
271}
272
273static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
274{
275 int err;
276 asm volatile("2: wrmsr ; xorl %0,%0\n"
277 "1:\n\t"
278 ".section .fixup,\"ax\"\n\t"
279 "3: movl %4,%0 ; jmp 1b\n\t"
280 ".previous\n\t"
281 ".section __ex_table,\"a\"\n"
282 " .align 4\n\t"
283 " .long 2b,3b\n\t"
284 ".previous"
285 : "=a" (err)
286 : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
287 "i" (-EFAULT));
288 return err;
289}
290
291static fastcall unsigned long long native_read_tsc(void)
292{
293 unsigned long long val;
294 asm volatile("rdtsc" : "=A" (val));
295 return val;
296}
297
298static fastcall unsigned long long native_read_pmc(void)
299{
300 unsigned long long val;
301 asm volatile("rdpmc" : "=A" (val));
302 return val;
303}
304
305static fastcall void native_load_tr_desc(void)
306{
307 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
308}
309
310static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
311{
312 asm volatile("lgdt %0"::"m" (*dtr));
313}
314
315static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr)
316{
317 asm volatile("lidt %0"::"m" (*dtr));
318}
319
320static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
321{
322 asm ("sgdt %0":"=m" (*dtr));
323}
324
325static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
326{
327 asm ("sidt %0":"=m" (*dtr));
328}
329
330static fastcall unsigned long native_store_tr(void)
331{
332 unsigned long tr;
333 asm ("str %0":"=r" (tr));
334 return tr;
335}
336
337static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
338{
339#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
340 C(0); C(1); C(2);
341#undef C
342}
343
344static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
345{
346 u32 *lp = (u32 *)((char *)dt + entry*8);
347 lp[0] = entry_low;
348 lp[1] = entry_high;
349}
350
351static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
352{
353 native_write_dt_entry(dt, entrynum, low, high);
354}
355
356static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
357{
358 native_write_dt_entry(dt, entrynum, low, high);
359}
360
361static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
362{
363 native_write_dt_entry(dt, entrynum, low, high);
364}
365
366static fastcall void native_load_esp0(struct tss_struct *tss,
367 struct thread_struct *thread)
368{
369 tss->esp0 = thread->esp0;
370
371 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
372 if (unlikely(tss->ss1 != thread->sysenter_cs)) {
373 tss->ss1 = thread->sysenter_cs;
374 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
375 }
376}
377
378static fastcall void native_io_delay(void)
379{
380 asm volatile("outb %al,$0x80");
381}
382
383static fastcall void native_flush_tlb(void)
384{
385 __native_flush_tlb();
386}
387
388/*
389 * Global pages have to be flushed a bit differently. Not a real
390 * performance problem because this does not happen often.
391 */
392static fastcall void native_flush_tlb_global(void)
393{
394 __native_flush_tlb_global();
395}
396
397static fastcall void native_flush_tlb_single(u32 addr)
398{
399 __native_flush_tlb_single(addr);
400}
401
402#ifndef CONFIG_X86_PAE
403static fastcall void native_set_pte(pte_t *ptep, pte_t pteval)
404{
405 *ptep = pteval;
406}
407
408static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
409{
410 *ptep = pteval;
411}
412
413static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
414{
415 *pmdp = pmdval;
416}
417
418#else /* CONFIG_X86_PAE */
419
420static fastcall void native_set_pte(pte_t *ptep, pte_t pte)
421{
422 ptep->pte_high = pte.pte_high;
423 smp_wmb();
424 ptep->pte_low = pte.pte_low;
425}
426
427static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
428{
429 ptep->pte_high = pte.pte_high;
430 smp_wmb();
431 ptep->pte_low = pte.pte_low;
432}
433
434static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
435{
436 ptep->pte_low = 0;
437 smp_wmb();
438 ptep->pte_high = pte.pte_high;
439 smp_wmb();
440 ptep->pte_low = pte.pte_low;
441}
442
443static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
444{
445 set_64bit((unsigned long long *)ptep,pte_val(pteval));
446}
447
448static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
449{
450 set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
451}
452
453static fastcall void native_set_pud(pud_t *pudp, pud_t pudval)
454{
455 *pudp = pudval;
456}
457
458static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
459{
460 ptep->pte_low = 0;
461 smp_wmb();
462 ptep->pte_high = 0;
463}
464
465static fastcall void native_pmd_clear(pmd_t *pmd)
466{
467 u32 *tmp = (u32 *)pmd;
468 *tmp = 0;
469 smp_wmb();
470 *(tmp + 1) = 0;
471}
472#endif /* CONFIG_X86_PAE */
473
474/* These are in entry.S */
475extern fastcall void native_iret(void);
476extern fastcall void native_irq_enable_sysexit(void);
477
478static int __init print_banner(void)
479{
480 paravirt_ops.banner();
481 return 0;
482}
483core_initcall(print_banner);
484
485/* We simply declare start_kernel to be the paravirt probe of last resort. */
486paravirt_probe(start_kernel);
487
488struct paravirt_ops paravirt_ops = {
489 .name = "bare hardware",
490 .paravirt_enabled = 0,
491 .kernel_rpl = 0,
492
493 .patch = native_patch,
494 .banner = default_banner,
495 .arch_setup = native_nop,
496 .memory_setup = machine_specific_memory_setup,
497 .get_wallclock = native_get_wallclock,
498 .set_wallclock = native_set_wallclock,
499 .time_init = time_init_hook,
500 .init_IRQ = native_init_IRQ,
501
502 .cpuid = native_cpuid,
503 .get_debugreg = native_get_debugreg,
504 .set_debugreg = native_set_debugreg,
505 .clts = native_clts,
506 .read_cr0 = native_read_cr0,
507 .write_cr0 = native_write_cr0,
508 .read_cr2 = native_read_cr2,
509 .write_cr2 = native_write_cr2,
510 .read_cr3 = native_read_cr3,
511 .write_cr3 = native_write_cr3,
512 .read_cr4 = native_read_cr4,
513 .read_cr4_safe = native_read_cr4_safe,
514 .write_cr4 = native_write_cr4,
515 .save_fl = native_save_fl,
516 .restore_fl = native_restore_fl,
517 .irq_disable = native_irq_disable,
518 .irq_enable = native_irq_enable,
519 .safe_halt = native_safe_halt,
520 .halt = native_halt,
521 .wbinvd = native_wbinvd,
522 .read_msr = native_read_msr,
523 .write_msr = native_write_msr,
524 .read_tsc = native_read_tsc,
525 .read_pmc = native_read_pmc,
526 .load_tr_desc = native_load_tr_desc,
527 .set_ldt = native_set_ldt,
528 .load_gdt = native_load_gdt,
529 .load_idt = native_load_idt,
530 .store_gdt = native_store_gdt,
531 .store_idt = native_store_idt,
532 .store_tr = native_store_tr,
533 .load_tls = native_load_tls,
534 .write_ldt_entry = native_write_ldt_entry,
535 .write_gdt_entry = native_write_gdt_entry,
536 .write_idt_entry = native_write_idt_entry,
537 .load_esp0 = native_load_esp0,
538
539 .set_iopl_mask = native_set_iopl_mask,
540 .io_delay = native_io_delay,
541 .const_udelay = __const_udelay,
542
543#ifdef CONFIG_X86_LOCAL_APIC
544 .apic_write = native_apic_write,
545 .apic_write_atomic = native_apic_write_atomic,
546 .apic_read = native_apic_read,
547#endif
548
549 .flush_tlb_user = native_flush_tlb,
550 .flush_tlb_kernel = native_flush_tlb_global,
551 .flush_tlb_single = native_flush_tlb_single,
552
553 .set_pte = native_set_pte,
554 .set_pte_at = native_set_pte_at,
555 .set_pmd = native_set_pmd,
556 .pte_update = (void *)native_nop,
557 .pte_update_defer = (void *)native_nop,
558#ifdef CONFIG_X86_PAE
559 .set_pte_atomic = native_set_pte_atomic,
560 .set_pte_present = native_set_pte_present,
561 .set_pud = native_set_pud,
562 .pte_clear = native_pte_clear,
563 .pmd_clear = native_pmd_clear,
564#endif
565
566 .irq_enable_sysexit = native_irq_enable_sysexit,
567 .iret = native_iret,
568};
569EXPORT_SYMBOL(paravirt_ops);
diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c
index 5c8c6ef1fc5e..41af692c1584 100644
--- a/arch/i386/kernel/pci-dma.c
+++ b/arch/i386/kernel/pci-dma.c
@@ -92,14 +92,12 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
92 if (!mem_base) 92 if (!mem_base)
93 goto out; 93 goto out;
94 94
95 dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); 95 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
96 if (!dev->dma_mem) 96 if (!dev->dma_mem)
97 goto out; 97 goto out;
98 memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); 98 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
99 dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
100 if (!dev->dma_mem->bitmap) 99 if (!dev->dma_mem->bitmap)
101 goto free1_out; 100 goto free1_out;
102 memset(dev->dma_mem->bitmap, 0, bitmap_size);
103 101
104 dev->dma_mem->virt_base = mem_base; 102 dev->dma_mem->virt_base = mem_base;
105 dev->dma_mem->device_base = device_addr; 103 dev->dma_mem->device_base = device_addr;
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index dd53c58f64f1..99308510a17c 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -56,6 +56,7 @@
56 56
57#include <asm/tlbflush.h> 57#include <asm/tlbflush.h>
58#include <asm/cpu.h> 58#include <asm/cpu.h>
59#include <asm/pda.h>
59 60
60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
61 62
@@ -99,22 +100,18 @@ EXPORT_SYMBOL(enable_hlt);
99 */ 100 */
100void default_idle(void) 101void default_idle(void)
101{ 102{
102 local_irq_enable();
103
104 if (!hlt_counter && boot_cpu_data.hlt_works_ok) { 103 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
105 current_thread_info()->status &= ~TS_POLLING; 104 current_thread_info()->status &= ~TS_POLLING;
106 smp_mb__after_clear_bit(); 105 smp_mb__after_clear_bit();
107 while (!need_resched()) { 106 local_irq_disable();
108 local_irq_disable(); 107 if (!need_resched())
109 if (!need_resched()) 108 safe_halt(); /* enables interrupts racelessly */
110 safe_halt(); 109 else
111 else 110 local_irq_enable();
112 local_irq_enable();
113 }
114 current_thread_info()->status |= TS_POLLING; 111 current_thread_info()->status |= TS_POLLING;
115 } else { 112 } else {
116 while (!need_resched()) 113 /* loop is done by the caller */
117 cpu_relax(); 114 cpu_relax();
118 } 115 }
119} 116}
120#ifdef CONFIG_APM_MODULE 117#ifdef CONFIG_APM_MODULE
@@ -128,14 +125,7 @@ EXPORT_SYMBOL(default_idle);
128 */ 125 */
129static void poll_idle (void) 126static void poll_idle (void)
130{ 127{
131 local_irq_enable(); 128 cpu_relax();
132
133 asm volatile(
134 "2:"
135 "testl %0, %1;"
136 "rep; nop;"
137 "je 2b;"
138 : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
139} 129}
140 130
141#ifdef CONFIG_HOTPLUG_CPU 131#ifdef CONFIG_HOTPLUG_CPU
@@ -256,8 +246,7 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
256static void mwait_idle(void) 246static void mwait_idle(void)
257{ 247{
258 local_irq_enable(); 248 local_irq_enable();
259 while (!need_resched()) 249 mwait_idle_with_hints(0, 0);
260 mwait_idle_with_hints(0, 0);
261} 250}
262 251
263void __devinit select_idle_routine(const struct cpuinfo_x86 *c) 252void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
@@ -314,8 +303,8 @@ void show_regs(struct pt_regs * regs)
314 regs->eax,regs->ebx,regs->ecx,regs->edx); 303 regs->eax,regs->ebx,regs->ecx,regs->edx);
315 printk("ESI: %08lx EDI: %08lx EBP: %08lx", 304 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
316 regs->esi, regs->edi, regs->ebp); 305 regs->esi, regs->edi, regs->ebp);
317 printk(" DS: %04x ES: %04x\n", 306 printk(" DS: %04x ES: %04x GS: %04x\n",
318 0xffff & regs->xds,0xffff & regs->xes); 307 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
319 308
320 cr0 = read_cr0(); 309 cr0 = read_cr0();
321 cr2 = read_cr2(); 310 cr2 = read_cr2();
@@ -346,6 +335,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
346 335
347 regs.xds = __USER_DS; 336 regs.xds = __USER_DS;
348 regs.xes = __USER_DS; 337 regs.xes = __USER_DS;
338 regs.xgs = __KERNEL_PDA;
349 regs.orig_eax = -1; 339 regs.orig_eax = -1;
350 regs.eip = (unsigned long) kernel_thread_helper; 340 regs.eip = (unsigned long) kernel_thread_helper;
351 regs.xcs = __KERNEL_CS | get_kernel_rpl(); 341 regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -431,7 +421,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
431 p->thread.eip = (unsigned long) ret_from_fork; 421 p->thread.eip = (unsigned long) ret_from_fork;
432 422
433 savesegment(fs,p->thread.fs); 423 savesegment(fs,p->thread.fs);
434 savesegment(gs,p->thread.gs);
435 424
436 tsk = current; 425 tsk = current;
437 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 426 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -508,7 +497,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
508 dump->regs.ds = regs->xds; 497 dump->regs.ds = regs->xds;
509 dump->regs.es = regs->xes; 498 dump->regs.es = regs->xes;
510 savesegment(fs,dump->regs.fs); 499 savesegment(fs,dump->regs.fs);
511 savesegment(gs,dump->regs.gs); 500 dump->regs.gs = regs->xgs;
512 dump->regs.orig_eax = regs->orig_eax; 501 dump->regs.orig_eax = regs->orig_eax;
513 dump->regs.eip = regs->eip; 502 dump->regs.eip = regs->eip;
514 dump->regs.cs = regs->xcs; 503 dump->regs.cs = regs->xcs;
@@ -648,22 +637,27 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
648 637
649 __unlazy_fpu(prev_p); 638 __unlazy_fpu(prev_p);
650 639
640
641 /* we're going to use this soon, after a few expensive things */
642 if (next_p->fpu_counter > 5)
643 prefetch(&next->i387.fxsave);
644
651 /* 645 /*
652 * Reload esp0. 646 * Reload esp0.
653 */ 647 */
654 load_esp0(tss, next); 648 load_esp0(tss, next);
655 649
656 /* 650 /*
657 * Save away %fs and %gs. No need to save %es and %ds, as 651 * Save away %fs. No need to save %gs, as it was saved on the
658 * those are always kernel segments while inside the kernel. 652 * stack on entry. No need to save %es and %ds, as those are
659 * Doing this before setting the new TLS descriptors avoids 653 * always kernel segments while inside the kernel. Doing this
660 * the situation where we temporarily have non-reloadable 654 * before setting the new TLS descriptors avoids the situation
661 * segments in %fs and %gs. This could be an issue if the 655 * where we temporarily have non-reloadable segments in %fs
662 * NMI handler ever used %fs or %gs (it does not today), or 656 * and %gs. This could be an issue if the NMI handler ever
663 * if the kernel is running inside of a hypervisor layer. 657 * used %fs or %gs (it does not today), or if the kernel is
658 * running inside of a hypervisor layer.
664 */ 659 */
665 savesegment(fs, prev->fs); 660 savesegment(fs, prev->fs);
666 savesegment(gs, prev->gs);
667 661
668 /* 662 /*
669 * Load the per-thread Thread-Local Storage descriptor. 663 * Load the per-thread Thread-Local Storage descriptor.
@@ -671,22 +665,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
671 load_TLS(next, cpu); 665 load_TLS(next, cpu);
672 666
673 /* 667 /*
674 * Restore %fs and %gs if needed. 668 * Restore %fs if needed.
675 * 669 *
676 * Glibc normally makes %fs be zero, and %gs is one of 670 * Glibc normally makes %fs be zero.
677 * the TLS segments.
678 */ 671 */
679 if (unlikely(prev->fs | next->fs)) 672 if (unlikely(prev->fs | next->fs))
680 loadsegment(fs, next->fs); 673 loadsegment(fs, next->fs);
681 674
682 if (prev->gs | next->gs) 675 write_pda(pcurrent, next_p);
683 loadsegment(gs, next->gs);
684
685 /*
686 * Restore IOPL if needed.
687 */
688 if (unlikely(prev->iopl != next->iopl))
689 set_iopl_mask(next->iopl);
690 676
691 /* 677 /*
692 * Now maybe handle debug registers and/or IO bitmaps 678 * Now maybe handle debug registers and/or IO bitmaps
@@ -697,6 +683,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
697 683
698 disable_tsc(prev_p, next_p); 684 disable_tsc(prev_p, next_p);
699 685
686 /* If the task has used fpu the last 5 timeslices, just do a full
687 * restore of the math state immediately to avoid the trap; the
688 * chances of needing FPU soon are obviously high now
689 */
690 if (next_p->fpu_counter > 5)
691 math_state_restore();
692
700 return prev_p; 693 return prev_p;
701} 694}
702 695
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 775f50e9395b..f3f94ac5736a 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -94,13 +94,9 @@ static int putreg(struct task_struct *child,
94 return -EIO; 94 return -EIO;
95 child->thread.fs = value; 95 child->thread.fs = value;
96 return 0; 96 return 0;
97 case GS:
98 if (value && (value & 3) != 3)
99 return -EIO;
100 child->thread.gs = value;
101 return 0;
102 case DS: 97 case DS:
103 case ES: 98 case ES:
99 case GS:
104 if (value && (value & 3) != 3) 100 if (value && (value & 3) != 3)
105 return -EIO; 101 return -EIO;
106 value &= 0xffff; 102 value &= 0xffff;
@@ -116,8 +112,8 @@ static int putreg(struct task_struct *child,
116 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; 112 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
117 break; 113 break;
118 } 114 }
119 if (regno > GS*4) 115 if (regno > ES*4)
120 regno -= 2*4; 116 regno -= 1*4;
121 put_stack_long(child, regno - sizeof(struct pt_regs), value); 117 put_stack_long(child, regno - sizeof(struct pt_regs), value);
122 return 0; 118 return 0;
123} 119}
@@ -131,18 +127,16 @@ static unsigned long getreg(struct task_struct *child,
131 case FS: 127 case FS:
132 retval = child->thread.fs; 128 retval = child->thread.fs;
133 break; 129 break;
134 case GS:
135 retval = child->thread.gs;
136 break;
137 case DS: 130 case DS:
138 case ES: 131 case ES:
132 case GS:
139 case SS: 133 case SS:
140 case CS: 134 case CS:
141 retval = 0xffff; 135 retval = 0xffff;
142 /* fall through */ 136 /* fall through */
143 default: 137 default:
144 if (regno > GS*4) 138 if (regno > ES*4)
145 regno -= 2*4; 139 regno -= 1*4;
146 regno = regno - sizeof(struct pt_regs); 140 regno = regno - sizeof(struct pt_regs);
147 retval &= get_stack_long(child, regno); 141 retval &= get_stack_long(child, regno);
148 } 142 }
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index 9f6ab1789bb0..a01320a7b636 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -3,10 +3,23 @@
3 */ 3 */
4#include <linux/pci.h> 4#include <linux/pci.h>
5#include <linux/irq.h> 5#include <linux/irq.h>
6#include <asm/pci-direct.h>
7#include <asm/genapic.h>
8#include <asm/cpu.h>
6 9
7#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) 10#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
11static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
12{
13#ifdef CONFIG_X86_64
14 if (genapic != &apic_flat)
15 panic("APIC mode must be flat on this system\n");
16#elif defined(CONFIG_X86_GENERICARCH)
17 if (genapic != &apic_default)
18 panic("APIC mode must be default(flat) on this system. Use apic=default\n");
19#endif
20}
8 21
9static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) 22void __init quirk_intel_irqbalance(void)
10{ 23{
11 u8 config, rev; 24 u8 config, rev;
12 u32 word; 25 u32 word;
@@ -16,18 +29,18 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
16 * based platforms. 29 * based platforms.
17 * Disable SW irqbalance/affinity on those platforms. 30 * Disable SW irqbalance/affinity on those platforms.
18 */ 31 */
19 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 32 rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
20 if (rev > 0x9) 33 if (rev > 0x9)
21 return; 34 return;
22 35
23 printk(KERN_INFO "Intel E7520/7320/7525 detected."); 36 printk(KERN_INFO "Intel E7520/7320/7525 detected.");
24 37
25 /* enable access to config space*/ 38 /* enable access to config space */
26 pci_read_config_byte(dev, 0xf4, &config); 39 config = read_pci_config_byte(0, 0, 0, 0xf4);
27 pci_write_config_byte(dev, 0xf4, config|0x2); 40 write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
28 41
29 /* read xTPR register */ 42 /* read xTPR register */
30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); 43 word = read_pci_config_16(0, 0, 0x40, 0x4c);
31 44
32 if (!(word & (1 << 13))) { 45 if (!(word & (1 << 13))) {
33 printk(KERN_INFO "Disabling irq balancing and affinity\n"); 46 printk(KERN_INFO "Disabling irq balancing and affinity\n");
@@ -38,13 +51,24 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
38#ifdef CONFIG_PROC_FS 51#ifdef CONFIG_PROC_FS
39 no_irq_affinity = 1; 52 no_irq_affinity = 1;
40#endif 53#endif
54#ifdef CONFIG_HOTPLUG_CPU
55 printk(KERN_INFO "Disabling cpu hotplug control\n");
56 enable_cpu_hotplug = 0;
57#endif
58#ifdef CONFIG_X86_64
59 /* force the genapic selection to flat mode so that
60 * interrupts can be redirected to more than one CPU.
61 */
62 genapic_force = &apic_flat;
63#endif
41 } 64 }
42 65
43 /* put back the original value for config space*/ 66 /* put back the original value for config space */
44 if (!(config & 0x2)) 67 if (!(config & 0x2))
45 pci_write_config_byte(dev, 0xf4, config); 68 write_pci_config_byte(0, 0, 0, 0xf4, config);
46} 69}
47DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); 70DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
48DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); 71DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
49DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); 72DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
73
50#endif 74#endif
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 97bb869307bc..79df6e612dbd 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -63,9 +63,6 @@
63#include <setup_arch.h> 63#include <setup_arch.h>
64#include <bios_ebda.h> 64#include <bios_ebda.h>
65 65
66/* Forward Declaration. */
67void __init find_max_pfn(void);
68
69/* This value is set up by the early boot code to point to the value 66/* This value is set up by the early boot code to point to the value
70 immediately after the boot time page tables. It contains a *physical* 67 immediately after the boot time page tables. It contains a *physical*
71 address, and must not be in the .bss segment! */ 68 address, and must not be in the .bss segment! */
@@ -76,11 +73,8 @@ int disable_pse __devinitdata = 0;
76/* 73/*
77 * Machine setup.. 74 * Machine setup..
78 */ 75 */
79 76extern struct resource code_resource;
80#ifdef CONFIG_EFI 77extern struct resource data_resource;
81int efi_enabled = 0;
82EXPORT_SYMBOL(efi_enabled);
83#endif
84 78
85/* cpu data as detected by the assembly code in head.S */ 79/* cpu data as detected by the assembly code in head.S */
86struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; 80struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -99,12 +93,6 @@ unsigned int machine_submodel_id;
99unsigned int BIOS_revision; 93unsigned int BIOS_revision;
100unsigned int mca_pentium_flag; 94unsigned int mca_pentium_flag;
101 95
102/* For PCI or other memory-mapped resources */
103unsigned long pci_mem_start = 0x10000000;
104#ifdef CONFIG_PCI
105EXPORT_SYMBOL(pci_mem_start);
106#endif
107
108/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 96/* Boot loader ID as an integer, for the benefit of proc_dointvec */
109int bootloader_type; 97int bootloader_type;
110 98
@@ -134,7 +122,6 @@ struct ist_info ist_info;
134 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) 122 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
135EXPORT_SYMBOL(ist_info); 123EXPORT_SYMBOL(ist_info);
136#endif 124#endif
137struct e820map e820;
138 125
139extern void early_cpu_init(void); 126extern void early_cpu_init(void);
140extern int root_mountflags; 127extern int root_mountflags;
@@ -149,516 +136,6 @@ static char command_line[COMMAND_LINE_SIZE];
149 136
150unsigned char __initdata boot_params[PARAM_SIZE]; 137unsigned char __initdata boot_params[PARAM_SIZE];
151 138
152static struct resource data_resource = {
153 .name = "Kernel data",
154 .start = 0,
155 .end = 0,
156 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
157};
158
159static struct resource code_resource = {
160 .name = "Kernel code",
161 .start = 0,
162 .end = 0,
163 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
164};
165
166static struct resource system_rom_resource = {
167 .name = "System ROM",
168 .start = 0xf0000,
169 .end = 0xfffff,
170 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
171};
172
173static struct resource extension_rom_resource = {
174 .name = "Extension ROM",
175 .start = 0xe0000,
176 .end = 0xeffff,
177 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
178};
179
180static struct resource adapter_rom_resources[] = { {
181 .name = "Adapter ROM",
182 .start = 0xc8000,
183 .end = 0,
184 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
185}, {
186 .name = "Adapter ROM",
187 .start = 0,
188 .end = 0,
189 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
190}, {
191 .name = "Adapter ROM",
192 .start = 0,
193 .end = 0,
194 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
195}, {
196 .name = "Adapter ROM",
197 .start = 0,
198 .end = 0,
199 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
200}, {
201 .name = "Adapter ROM",
202 .start = 0,
203 .end = 0,
204 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
205}, {
206 .name = "Adapter ROM",
207 .start = 0,
208 .end = 0,
209 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
210} };
211
212static struct resource video_rom_resource = {
213 .name = "Video ROM",
214 .start = 0xc0000,
215 .end = 0xc7fff,
216 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
217};
218
219static struct resource video_ram_resource = {
220 .name = "Video RAM area",
221 .start = 0xa0000,
222 .end = 0xbffff,
223 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
224};
225
226static struct resource standard_io_resources[] = { {
227 .name = "dma1",
228 .start = 0x0000,
229 .end = 0x001f,
230 .flags = IORESOURCE_BUSY | IORESOURCE_IO
231}, {
232 .name = "pic1",
233 .start = 0x0020,
234 .end = 0x0021,
235 .flags = IORESOURCE_BUSY | IORESOURCE_IO
236}, {
237 .name = "timer0",
238 .start = 0x0040,
239 .end = 0x0043,
240 .flags = IORESOURCE_BUSY | IORESOURCE_IO
241}, {
242 .name = "timer1",
243 .start = 0x0050,
244 .end = 0x0053,
245 .flags = IORESOURCE_BUSY | IORESOURCE_IO
246}, {
247 .name = "keyboard",
248 .start = 0x0060,
249 .end = 0x006f,
250 .flags = IORESOURCE_BUSY | IORESOURCE_IO
251}, {
252 .name = "dma page reg",
253 .start = 0x0080,
254 .end = 0x008f,
255 .flags = IORESOURCE_BUSY | IORESOURCE_IO
256}, {
257 .name = "pic2",
258 .start = 0x00a0,
259 .end = 0x00a1,
260 .flags = IORESOURCE_BUSY | IORESOURCE_IO
261}, {
262 .name = "dma2",
263 .start = 0x00c0,
264 .end = 0x00df,
265 .flags = IORESOURCE_BUSY | IORESOURCE_IO
266}, {
267 .name = "fpu",
268 .start = 0x00f0,
269 .end = 0x00ff,
270 .flags = IORESOURCE_BUSY | IORESOURCE_IO
271} };
272
273#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
274
275static int __init romchecksum(unsigned char *rom, unsigned long length)
276{
277 unsigned char *p, sum = 0;
278
279 for (p = rom; p < rom + length; p++)
280 sum += *p;
281 return sum == 0;
282}
283
284static void __init probe_roms(void)
285{
286 unsigned long start, length, upper;
287 unsigned char *rom;
288 int i;
289
290 /* video rom */
291 upper = adapter_rom_resources[0].start;
292 for (start = video_rom_resource.start; start < upper; start += 2048) {
293 rom = isa_bus_to_virt(start);
294 if (!romsignature(rom))
295 continue;
296
297 video_rom_resource.start = start;
298
299 /* 0 < length <= 0x7f * 512, historically */
300 length = rom[2] * 512;
301
302 /* if checksum okay, trust length byte */
303 if (length && romchecksum(rom, length))
304 video_rom_resource.end = start + length - 1;
305
306 request_resource(&iomem_resource, &video_rom_resource);
307 break;
308 }
309
310 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
311 if (start < upper)
312 start = upper;
313
314 /* system rom */
315 request_resource(&iomem_resource, &system_rom_resource);
316 upper = system_rom_resource.start;
317
318 /* check for extension rom (ignore length byte!) */
319 rom = isa_bus_to_virt(extension_rom_resource.start);
320 if (romsignature(rom)) {
321 length = extension_rom_resource.end - extension_rom_resource.start + 1;
322 if (romchecksum(rom, length)) {
323 request_resource(&iomem_resource, &extension_rom_resource);
324 upper = extension_rom_resource.start;
325 }
326 }
327
328 /* check for adapter roms on 2k boundaries */
329 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
330 rom = isa_bus_to_virt(start);
331 if (!romsignature(rom))
332 continue;
333
334 /* 0 < length <= 0x7f * 512, historically */
335 length = rom[2] * 512;
336
337 /* but accept any length that fits if checksum okay */
338 if (!length || start + length > upper || !romchecksum(rom, length))
339 continue;
340
341 adapter_rom_resources[i].start = start;
342 adapter_rom_resources[i].end = start + length - 1;
343 request_resource(&iomem_resource, &adapter_rom_resources[i]);
344
345 start = adapter_rom_resources[i++].end & ~2047UL;
346 }
347}
348
349static void __init limit_regions(unsigned long long size)
350{
351 unsigned long long current_addr = 0;
352 int i;
353
354 if (efi_enabled) {
355 efi_memory_desc_t *md;
356 void *p;
357
358 for (p = memmap.map, i = 0; p < memmap.map_end;
359 p += memmap.desc_size, i++) {
360 md = p;
361 current_addr = md->phys_addr + (md->num_pages << 12);
362 if (md->type == EFI_CONVENTIONAL_MEMORY) {
363 if (current_addr >= size) {
364 md->num_pages -=
365 (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
366 memmap.nr_map = i + 1;
367 return;
368 }
369 }
370 }
371 }
372 for (i = 0; i < e820.nr_map; i++) {
373 current_addr = e820.map[i].addr + e820.map[i].size;
374 if (current_addr < size)
375 continue;
376
377 if (e820.map[i].type != E820_RAM)
378 continue;
379
380 if (e820.map[i].addr >= size) {
381 /*
382 * This region starts past the end of the
383 * requested size, skip it completely.
384 */
385 e820.nr_map = i;
386 } else {
387 e820.nr_map = i + 1;
388 e820.map[i].size -= current_addr - size;
389 }
390 return;
391 }
392}
393
394void __init add_memory_region(unsigned long long start,
395 unsigned long long size, int type)
396{
397 int x;
398
399 if (!efi_enabled) {
400 x = e820.nr_map;
401
402 if (x == E820MAX) {
403 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
404 return;
405 }
406
407 e820.map[x].addr = start;
408 e820.map[x].size = size;
409 e820.map[x].type = type;
410 e820.nr_map++;
411 }
412} /* add_memory_region */
413
414#define E820_DEBUG 1
415
416static void __init print_memory_map(char *who)
417{
418 int i;
419
420 for (i = 0; i < e820.nr_map; i++) {
421 printk(" %s: %016Lx - %016Lx ", who,
422 e820.map[i].addr,
423 e820.map[i].addr + e820.map[i].size);
424 switch (e820.map[i].type) {
425 case E820_RAM: printk("(usable)\n");
426 break;
427 case E820_RESERVED:
428 printk("(reserved)\n");
429 break;
430 case E820_ACPI:
431 printk("(ACPI data)\n");
432 break;
433 case E820_NVS:
434 printk("(ACPI NVS)\n");
435 break;
436 default: printk("type %lu\n", e820.map[i].type);
437 break;
438 }
439 }
440}
441
442/*
443 * Sanitize the BIOS e820 map.
444 *
445 * Some e820 responses include overlapping entries. The following
446 * replaces the original e820 map with a new one, removing overlaps.
447 *
448 */
449struct change_member {
450 struct e820entry *pbios; /* pointer to original bios entry */
451 unsigned long long addr; /* address for this change point */
452};
453static struct change_member change_point_list[2*E820MAX] __initdata;
454static struct change_member *change_point[2*E820MAX] __initdata;
455static struct e820entry *overlap_list[E820MAX] __initdata;
456static struct e820entry new_bios[E820MAX] __initdata;
457
458int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
459{
460 struct change_member *change_tmp;
461 unsigned long current_type, last_type;
462 unsigned long long last_addr;
463 int chgidx, still_changing;
464 int overlap_entries;
465 int new_bios_entry;
466 int old_nr, new_nr, chg_nr;
467 int i;
468
469 /*
470 Visually we're performing the following (1,2,3,4 = memory types)...
471
472 Sample memory map (w/overlaps):
473 ____22__________________
474 ______________________4_
475 ____1111________________
476 _44_____________________
477 11111111________________
478 ____________________33__
479 ___________44___________
480 __________33333_________
481 ______________22________
482 ___________________2222_
483 _________111111111______
484 _____________________11_
485 _________________4______
486
487 Sanitized equivalent (no overlap):
488 1_______________________
489 _44_____________________
490 ___1____________________
491 ____22__________________
492 ______11________________
493 _________1______________
494 __________3_____________
495 ___________44___________
496 _____________33_________
497 _______________2________
498 ________________1_______
499 _________________4______
500 ___________________2____
501 ____________________33__
502 ______________________4_
503 */
504
505 /* if there's only one memory region, don't bother */
506 if (*pnr_map < 2)
507 return -1;
508
509 old_nr = *pnr_map;
510
511 /* bail out if we find any unreasonable addresses in bios map */
512 for (i=0; i<old_nr; i++)
513 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
514 return -1;
515
516 /* create pointers for initial change-point information (for sorting) */
517 for (i=0; i < 2*old_nr; i++)
518 change_point[i] = &change_point_list[i];
519
520 /* record all known change-points (starting and ending addresses),
521 omitting those that are for empty memory regions */
522 chgidx = 0;
523 for (i=0; i < old_nr; i++) {
524 if (biosmap[i].size != 0) {
525 change_point[chgidx]->addr = biosmap[i].addr;
526 change_point[chgidx++]->pbios = &biosmap[i];
527 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
528 change_point[chgidx++]->pbios = &biosmap[i];
529 }
530 }
531 chg_nr = chgidx; /* true number of change-points */
532
533 /* sort change-point list by memory addresses (low -> high) */
534 still_changing = 1;
535 while (still_changing) {
536 still_changing = 0;
537 for (i=1; i < chg_nr; i++) {
538 /* if <current_addr> > <last_addr>, swap */
539 /* or, if current=<start_addr> & last=<end_addr>, swap */
540 if ((change_point[i]->addr < change_point[i-1]->addr) ||
541 ((change_point[i]->addr == change_point[i-1]->addr) &&
542 (change_point[i]->addr == change_point[i]->pbios->addr) &&
543 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
544 )
545 {
546 change_tmp = change_point[i];
547 change_point[i] = change_point[i-1];
548 change_point[i-1] = change_tmp;
549 still_changing=1;
550 }
551 }
552 }
553
554 /* create a new bios memory map, removing overlaps */
555 overlap_entries=0; /* number of entries in the overlap table */
556 new_bios_entry=0; /* index for creating new bios map entries */
557 last_type = 0; /* start with undefined memory type */
558 last_addr = 0; /* start with 0 as last starting address */
559 /* loop through change-points, determining affect on the new bios map */
560 for (chgidx=0; chgidx < chg_nr; chgidx++)
561 {
562 /* keep track of all overlapping bios entries */
563 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
564 {
565 /* add map entry to overlap list (> 1 entry implies an overlap) */
566 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
567 }
568 else
569 {
570 /* remove entry from list (order independent, so swap with last) */
571 for (i=0; i<overlap_entries; i++)
572 {
573 if (overlap_list[i] == change_point[chgidx]->pbios)
574 overlap_list[i] = overlap_list[overlap_entries-1];
575 }
576 overlap_entries--;
577 }
578 /* if there are overlapping entries, decide which "type" to use */
579 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
580 current_type = 0;
581 for (i=0; i<overlap_entries; i++)
582 if (overlap_list[i]->type > current_type)
583 current_type = overlap_list[i]->type;
584 /* continue building up new bios map based on this information */
585 if (current_type != last_type) {
586 if (last_type != 0) {
587 new_bios[new_bios_entry].size =
588 change_point[chgidx]->addr - last_addr;
589 /* move forward only if the new size was non-zero */
590 if (new_bios[new_bios_entry].size != 0)
591 if (++new_bios_entry >= E820MAX)
592 break; /* no more space left for new bios entries */
593 }
594 if (current_type != 0) {
595 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
596 new_bios[new_bios_entry].type = current_type;
597 last_addr=change_point[chgidx]->addr;
598 }
599 last_type = current_type;
600 }
601 }
602 new_nr = new_bios_entry; /* retain count for new bios entries */
603
604 /* copy new bios mapping into original location */
605 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
606 *pnr_map = new_nr;
607
608 return 0;
609}
610
611/*
612 * Copy the BIOS e820 map into a safe place.
613 *
614 * Sanity-check it while we're at it..
615 *
616 * If we're lucky and live on a modern system, the setup code
617 * will have given us a memory map that we can use to properly
618 * set up memory. If we aren't, we'll fake a memory map.
619 *
620 * We check to see that the memory map contains at least 2 elements
621 * before we'll use it, because the detection code in setup.S may
622 * not be perfect and most every PC known to man has two memory
623 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
624 * thinkpad 560x, for example, does not cooperate with the memory
625 * detection code.)
626 */
627int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
628{
629 /* Only one memory region (or negative)? Ignore it */
630 if (nr_map < 2)
631 return -1;
632
633 do {
634 unsigned long long start = biosmap->addr;
635 unsigned long long size = biosmap->size;
636 unsigned long long end = start + size;
637 unsigned long type = biosmap->type;
638
639 /* Overflow in 64 bits? Ignore the memory map. */
640 if (start > end)
641 return -1;
642
643 /*
644 * Some BIOSes claim RAM in the 640k - 1M region.
645 * Not right. Fix it up.
646 */
647 if (type == E820_RAM) {
648 if (start < 0x100000ULL && end > 0xA0000ULL) {
649 if (start < 0xA0000ULL)
650 add_memory_region(start, 0xA0000ULL-start, type);
651 if (end <= 0x100000ULL)
652 continue;
653 start = 0x100000ULL;
654 size = end - start;
655 }
656 }
657 add_memory_region(start, size, type);
658 } while (biosmap++,--nr_map);
659 return 0;
660}
661
662#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 139#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
663struct edd edd; 140struct edd edd;
664#ifdef CONFIG_EDD_MODULE 141#ifdef CONFIG_EDD_MODULE
@@ -682,7 +159,7 @@ static inline void copy_edd(void)
682} 159}
683#endif 160#endif
684 161
685static int __initdata user_defined_memmap = 0; 162int __initdata user_defined_memmap = 0;
686 163
687/* 164/*
688 * "mem=nopentium" disables the 4MB page tables. 165 * "mem=nopentium" disables the 4MB page tables.
@@ -719,51 +196,6 @@ static int __init parse_mem(char *arg)
719} 196}
720early_param("mem", parse_mem); 197early_param("mem", parse_mem);
721 198
722static int __init parse_memmap(char *arg)
723{
724 if (!arg)
725 return -EINVAL;
726
727 if (strcmp(arg, "exactmap") == 0) {
728#ifdef CONFIG_CRASH_DUMP
729 /* If we are doing a crash dump, we
730 * still need to know the real mem
731 * size before original memory map is
732 * reset.
733 */
734 find_max_pfn();
735 saved_max_pfn = max_pfn;
736#endif
737 e820.nr_map = 0;
738 user_defined_memmap = 1;
739 } else {
740 /* If the user specifies memory size, we
741 * limit the BIOS-provided memory map to
742 * that size. exactmap can be used to specify
743 * the exact map. mem=number can be used to
744 * trim the existing memory map.
745 */
746 unsigned long long start_at, mem_size;
747
748 mem_size = memparse(arg, &arg);
749 if (*arg == '@') {
750 start_at = memparse(arg+1, &arg);
751 add_memory_region(start_at, mem_size, E820_RAM);
752 } else if (*arg == '#') {
753 start_at = memparse(arg+1, &arg);
754 add_memory_region(start_at, mem_size, E820_ACPI);
755 } else if (*arg == '$') {
756 start_at = memparse(arg+1, &arg);
757 add_memory_region(start_at, mem_size, E820_RESERVED);
758 } else {
759 limit_regions(mem_size);
760 user_defined_memmap = 1;
761 }
762 }
763 return 0;
764}
765early_param("memmap", parse_memmap);
766
767#ifdef CONFIG_PROC_VMCORE 199#ifdef CONFIG_PROC_VMCORE
768/* elfcorehdr= specifies the location of elf core header 200/* elfcorehdr= specifies the location of elf core header
769 * stored by the crashed kernel. 201 * stored by the crashed kernel.
@@ -828,90 +260,6 @@ static int __init parse_reservetop(char *arg)
828early_param("reservetop", parse_reservetop); 260early_param("reservetop", parse_reservetop);
829 261
830/* 262/*
831 * Callback for efi_memory_walk.
832 */
833static int __init
834efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
835{
836 unsigned long *max_pfn = arg, pfn;
837
838 if (start < end) {
839 pfn = PFN_UP(end -1);
840 if (pfn > *max_pfn)
841 *max_pfn = pfn;
842 }
843 return 0;
844}
845
846static int __init
847efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
848{
849 memory_present(0, PFN_UP(start), PFN_DOWN(end));
850 return 0;
851}
852
853 /*
854 * This function checks if the entire range <start,end> is mapped with type.
855 *
856 * Note: this function only works correct if the e820 table is sorted and
857 * not-overlapping, which is the case
858 */
859int __init
860e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
861{
862 u64 start = s;
863 u64 end = e;
864 int i;
865 for (i = 0; i < e820.nr_map; i++) {
866 struct e820entry *ei = &e820.map[i];
867 if (type && ei->type != type)
868 continue;
869 /* is the region (part) in overlap with the current region ?*/
870 if (ei->addr >= end || ei->addr + ei->size <= start)
871 continue;
872 /* if the region is at the beginning of <start,end> we move
873 * start to the end of the region since it's ok until there
874 */
875 if (ei->addr <= start)
876 start = ei->addr + ei->size;
877 /* if start is now at or beyond end, we're done, full
878 * coverage */
879 if (start >= end)
880 return 1; /* we're done */
881 }
882 return 0;
883}
884
885/*
886 * Find the highest page frame number we have available
887 */
888void __init find_max_pfn(void)
889{
890 int i;
891
892 max_pfn = 0;
893 if (efi_enabled) {
894 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
895 efi_memmap_walk(efi_memory_present_wrapper, NULL);
896 return;
897 }
898
899 for (i = 0; i < e820.nr_map; i++) {
900 unsigned long start, end;
901 /* RAM? */
902 if (e820.map[i].type != E820_RAM)
903 continue;
904 start = PFN_UP(e820.map[i].addr);
905 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
906 if (start >= end)
907 continue;
908 if (end > max_pfn)
909 max_pfn = end;
910 memory_present(0, start, end);
911 }
912}
913
914/*
915 * Determine low and high memory ranges: 263 * Determine low and high memory ranges:
916 */ 264 */
917unsigned long __init find_max_low_pfn(void) 265unsigned long __init find_max_low_pfn(void)
@@ -971,68 +319,6 @@ unsigned long __init find_max_low_pfn(void)
971} 319}
972 320
973/* 321/*
974 * Free all available memory for boot time allocation. Used
975 * as a callback function by efi_memory_walk()
976 */
977
978static int __init
979free_available_memory(unsigned long start, unsigned long end, void *arg)
980{
981 /* check max_low_pfn */
982 if (start >= (max_low_pfn << PAGE_SHIFT))
983 return 0;
984 if (end >= (max_low_pfn << PAGE_SHIFT))
985 end = max_low_pfn << PAGE_SHIFT;
986 if (start < end)
987 free_bootmem(start, end - start);
988
989 return 0;
990}
991/*
992 * Register fully available low RAM pages with the bootmem allocator.
993 */
994static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
995{
996 int i;
997
998 if (efi_enabled) {
999 efi_memmap_walk(free_available_memory, NULL);
1000 return;
1001 }
1002 for (i = 0; i < e820.nr_map; i++) {
1003 unsigned long curr_pfn, last_pfn, size;
1004 /*
1005 * Reserve usable low memory
1006 */
1007 if (e820.map[i].type != E820_RAM)
1008 continue;
1009 /*
1010 * We are rounding up the start address of usable memory:
1011 */
1012 curr_pfn = PFN_UP(e820.map[i].addr);
1013 if (curr_pfn >= max_low_pfn)
1014 continue;
1015 /*
1016 * ... and at the end of the usable range downwards:
1017 */
1018 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
1019
1020 if (last_pfn > max_low_pfn)
1021 last_pfn = max_low_pfn;
1022
1023 /*
1024 * .. finally, did all the rounding and playing
1025 * around just make the area go away?
1026 */
1027 if (last_pfn <= curr_pfn)
1028 continue;
1029
1030 size = last_pfn - curr_pfn;
1031 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
1032 }
1033}
1034
1035/*
1036 * workaround for Dell systems that neglect to reserve EBDA 322 * workaround for Dell systems that neglect to reserve EBDA
1037 */ 323 */
1038static void __init reserve_ebda_region(void) 324static void __init reserve_ebda_region(void)
@@ -1118,8 +404,8 @@ void __init setup_bootmem_allocator(void)
1118 * the (very unlikely) case of us accidentally initializing the 404 * the (very unlikely) case of us accidentally initializing the
1119 * bootmem allocator with an invalid RAM area. 405 * bootmem allocator with an invalid RAM area.
1120 */ 406 */
1121 reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) + 407 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
1122 bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START)); 408 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
1123 409
1124 /* 410 /*
1125 * reserve physical page 0 - it's a special BIOS page on many boxes, 411 * reserve physical page 0 - it's a special BIOS page on many boxes,
@@ -1199,126 +485,6 @@ void __init remapped_pgdat_init(void)
1199 } 485 }
1200} 486}
1201 487
1202/*
1203 * Request address space for all standard RAM and ROM resources
1204 * and also for regions reported as reserved by the e820.
1205 */
1206static void __init
1207legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
1208{
1209 int i;
1210
1211 probe_roms();
1212 for (i = 0; i < e820.nr_map; i++) {
1213 struct resource *res;
1214#ifndef CONFIG_RESOURCES_64BIT
1215 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
1216 continue;
1217#endif
1218 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
1219 switch (e820.map[i].type) {
1220 case E820_RAM: res->name = "System RAM"; break;
1221 case E820_ACPI: res->name = "ACPI Tables"; break;
1222 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
1223 default: res->name = "reserved";
1224 }
1225 res->start = e820.map[i].addr;
1226 res->end = res->start + e820.map[i].size - 1;
1227 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1228 if (request_resource(&iomem_resource, res)) {
1229 kfree(res);
1230 continue;
1231 }
1232 if (e820.map[i].type == E820_RAM) {
1233 /*
1234 * We don't know which RAM region contains kernel data,
1235 * so we try it repeatedly and let the resource manager
1236 * test it.
1237 */
1238 request_resource(res, code_resource);
1239 request_resource(res, data_resource);
1240#ifdef CONFIG_KEXEC
1241 request_resource(res, &crashk_res);
1242#endif
1243 }
1244 }
1245}
1246
1247/*
1248 * Request address space for all standard resources
1249 *
1250 * This is called just before pcibios_init(), which is also a
1251 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
1252 */
1253static int __init request_standard_resources(void)
1254{
1255 int i;
1256
1257 printk("Setting up standard PCI resources\n");
1258 if (efi_enabled)
1259 efi_initialize_iomem_resources(&code_resource, &data_resource);
1260 else
1261 legacy_init_iomem_resources(&code_resource, &data_resource);
1262
1263 /* EFI systems may still have VGA */
1264 request_resource(&iomem_resource, &video_ram_resource);
1265
1266 /* request I/O space for devices used on all i[345]86 PCs */
1267 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
1268 request_resource(&ioport_resource, &standard_io_resources[i]);
1269 return 0;
1270}
1271
1272subsys_initcall(request_standard_resources);
1273
1274static void __init register_memory(void)
1275{
1276 unsigned long gapstart, gapsize, round;
1277 unsigned long long last;
1278 int i;
1279
1280 /*
1281 * Search for the bigest gap in the low 32 bits of the e820
1282 * memory space.
1283 */
1284 last = 0x100000000ull;
1285 gapstart = 0x10000000;
1286 gapsize = 0x400000;
1287 i = e820.nr_map;
1288 while (--i >= 0) {
1289 unsigned long long start = e820.map[i].addr;
1290 unsigned long long end = start + e820.map[i].size;
1291
1292 /*
1293 * Since "last" is at most 4GB, we know we'll
1294 * fit in 32 bits if this condition is true
1295 */
1296 if (last > end) {
1297 unsigned long gap = last - end;
1298
1299 if (gap > gapsize) {
1300 gapsize = gap;
1301 gapstart = end;
1302 }
1303 }
1304 if (start < last)
1305 last = start;
1306 }
1307
1308 /*
1309 * See how much we want to round up: start off with
1310 * rounding to the next 1MB area.
1311 */
1312 round = 0x100000;
1313 while ((gapsize >> 4) > round)
1314 round += round;
1315 /* Fun with two's complement */
1316 pci_mem_start = (gapstart + round) & -round;
1317
1318 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
1319 pci_mem_start, gapstart, gapsize);
1320}
1321
1322#ifdef CONFIG_MCA 488#ifdef CONFIG_MCA
1323static void set_mca_bus(int x) 489static void set_mca_bus(int x)
1324{ 490{
@@ -1328,6 +494,12 @@ static void set_mca_bus(int x)
1328static void set_mca_bus(int x) { } 494static void set_mca_bus(int x) { }
1329#endif 495#endif
1330 496
497/* Overridden in paravirt.c if CONFIG_PARAVIRT */
498char * __attribute__((weak)) memory_setup(void)
499{
500 return machine_specific_memory_setup();
501}
502
1331/* 503/*
1332 * Determine if we were loaded by an EFI loader. If so, then we have also been 504 * Determine if we were loaded by an EFI loader. If so, then we have also been
1333 * passed the efi memmap, systab, etc., so we should use these data structures 505 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -1380,7 +552,7 @@ void __init setup_arch(char **cmdline_p)
1380 efi_init(); 552 efi_init();
1381 else { 553 else {
1382 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 554 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1383 print_memory_map(machine_specific_memory_setup()); 555 print_memory_map(memory_setup());
1384 } 556 }
1385 557
1386 copy_edd(); 558 copy_edd();
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 43002cfb40c4..65d7620eaa09 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -128,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ 128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) 129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
130 130
131 GET_SEG(gs); 131 COPY_SEG(gs);
132 GET_SEG(fs); 132 GET_SEG(fs);
133 COPY_SEG(es); 133 COPY_SEG(es);
134 COPY_SEG(ds); 134 COPY_SEG(ds);
@@ -244,9 +244,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
244{ 244{
245 int tmp, err = 0; 245 int tmp, err = 0;
246 246
247 tmp = 0; 247 err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);
248 savesegment(gs, tmp);
249 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
250 savesegment(fs, tmp); 248 savesegment(fs, tmp);
251 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 249 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
252 250
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 9827cf927ecb..5285aff8367f 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -321,7 +321,6 @@ static inline void leave_mm (unsigned long cpu)
321 321
322fastcall void smp_invalidate_interrupt(struct pt_regs *regs) 322fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
323{ 323{
324 struct pt_regs *old_regs = set_irq_regs(regs);
325 unsigned long cpu; 324 unsigned long cpu;
326 325
327 cpu = get_cpu(); 326 cpu = get_cpu();
@@ -352,7 +351,6 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
352 smp_mb__after_clear_bit(); 351 smp_mb__after_clear_bit();
353out: 352out:
354 put_cpu_no_resched(); 353 put_cpu_no_resched();
355 set_irq_regs(old_regs);
356} 354}
357 355
358static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 356static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
@@ -607,14 +605,11 @@ void smp_send_stop(void)
607 */ 605 */
608fastcall void smp_reschedule_interrupt(struct pt_regs *regs) 606fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
609{ 607{
610 struct pt_regs *old_regs = set_irq_regs(regs);
611 ack_APIC_irq(); 608 ack_APIC_irq();
612 set_irq_regs(old_regs);
613} 609}
614 610
615fastcall void smp_call_function_interrupt(struct pt_regs *regs) 611fastcall void smp_call_function_interrupt(struct pt_regs *regs)
616{ 612{
617 struct pt_regs *old_regs = set_irq_regs(regs);
618 void (*func) (void *info) = call_data->func; 613 void (*func) (void *info) = call_data->func;
619 void *info = call_data->info; 614 void *info = call_data->info;
620 int wait = call_data->wait; 615 int wait = call_data->wait;
@@ -637,7 +632,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
637 mb(); 632 mb();
638 atomic_inc(&call_data->finished); 633 atomic_inc(&call_data->finished);
639 } 634 }
640 set_irq_regs(old_regs);
641} 635}
642 636
643/* 637/*
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 02a9b66b6ac3..4bf0e3c83b8b 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -33,6 +33,11 @@
33 * Dave Jones : Report invalid combinations of Athlon CPUs. 33 * Dave Jones : Report invalid combinations of Athlon CPUs.
34* Rusty Russell : Hacked into shape for new "hotplug" boot process. */ 34* Rusty Russell : Hacked into shape for new "hotplug" boot process. */
35 35
36
37/* SMP boot always wants to use real time delay to allow sufficient time for
38 * the APs to come online */
39#define USE_REAL_TIME_DELAY
40
36#include <linux/module.h> 41#include <linux/module.h>
37#include <linux/init.h> 42#include <linux/init.h>
38#include <linux/kernel.h> 43#include <linux/kernel.h>
@@ -52,6 +57,8 @@
52#include <asm/desc.h> 57#include <asm/desc.h>
53#include <asm/arch_hooks.h> 58#include <asm/arch_hooks.h>
54#include <asm/nmi.h> 59#include <asm/nmi.h>
60#include <asm/pda.h>
61#include <asm/genapic.h>
55 62
56#include <mach_apic.h> 63#include <mach_apic.h>
57#include <mach_wakecpu.h> 64#include <mach_wakecpu.h>
@@ -536,11 +543,11 @@ set_cpu_sibling_map(int cpu)
536static void __devinit start_secondary(void *unused) 543static void __devinit start_secondary(void *unused)
537{ 544{
538 /* 545 /*
539 * Dont put anything before smp_callin(), SMP 546 * Don't put *anything* before secondary_cpu_init(), SMP
540 * booting is too fragile that we want to limit the 547 * booting is too fragile that we want to limit the
541 * things done here to the most necessary things. 548 * things done here to the most necessary things.
542 */ 549 */
543 cpu_init(); 550 secondary_cpu_init();
544 preempt_disable(); 551 preempt_disable();
545 smp_callin(); 552 smp_callin();
546 while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) 553 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
@@ -599,13 +606,16 @@ void __devinit initialize_secondary(void)
599 "movl %0,%%esp\n\t" 606 "movl %0,%%esp\n\t"
600 "jmp *%1" 607 "jmp *%1"
601 : 608 :
602 :"r" (current->thread.esp),"r" (current->thread.eip)); 609 :"m" (current->thread.esp),"m" (current->thread.eip));
603} 610}
604 611
612/* Static state in head.S used to set up a CPU */
605extern struct { 613extern struct {
606 void * esp; 614 void * esp;
607 unsigned short ss; 615 unsigned short ss;
608} stack_start; 616} stack_start;
617extern struct i386_pda *start_pda;
618extern struct Xgt_desc_struct cpu_gdt_descr;
609 619
610#ifdef CONFIG_NUMA 620#ifdef CONFIG_NUMA
611 621
@@ -936,9 +946,6 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
936 unsigned long start_eip; 946 unsigned long start_eip;
937 unsigned short nmi_high = 0, nmi_low = 0; 947 unsigned short nmi_high = 0, nmi_low = 0;
938 948
939 ++cpucount;
940 alternatives_smp_switch(1);
941
942 /* 949 /*
943 * We can't use kernel_thread since we must avoid to 950 * We can't use kernel_thread since we must avoid to
944 * reschedule the child. 951 * reschedule the child.
@@ -946,15 +953,30 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
946 idle = alloc_idle_task(cpu); 953 idle = alloc_idle_task(cpu);
947 if (IS_ERR(idle)) 954 if (IS_ERR(idle))
948 panic("failed fork for CPU %d", cpu); 955 panic("failed fork for CPU %d", cpu);
956
957 /* Pre-allocate and initialize the CPU's GDT and PDA so it
958 doesn't have to do any memory allocation during the
959 delicate CPU-bringup phase. */
960 if (!init_gdt(cpu, idle)) {
961 printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
962 return -1; /* ? */
963 }
964
949 idle->thread.eip = (unsigned long) start_secondary; 965 idle->thread.eip = (unsigned long) start_secondary;
950 /* start_eip had better be page-aligned! */ 966 /* start_eip had better be page-aligned! */
951 start_eip = setup_trampoline(); 967 start_eip = setup_trampoline();
952 968
969 ++cpucount;
970 alternatives_smp_switch(1);
971
953 /* So we see what's up */ 972 /* So we see what's up */
954 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); 973 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
955 /* Stack for startup_32 can be just as for start_secondary onwards */ 974 /* Stack for startup_32 can be just as for start_secondary onwards */
956 stack_start.esp = (void *) idle->thread.esp; 975 stack_start.esp = (void *) idle->thread.esp;
957 976
977 start_pda = cpu_pda(cpu);
978 cpu_gdt_descr = per_cpu(cpu_gdt_descr, cpu);
979
958 irq_ctx_init(cpu); 980 irq_ctx_init(cpu);
959 981
960 x86_cpu_to_apicid[cpu] = apicid; 982 x86_cpu_to_apicid[cpu] = apicid;
@@ -1109,34 +1131,15 @@ exit:
1109} 1131}
1110#endif 1132#endif
1111 1133
1112static void smp_tune_scheduling (void) 1134static void smp_tune_scheduling(void)
1113{ 1135{
1114 unsigned long cachesize; /* kB */ 1136 unsigned long cachesize; /* kB */
1115 unsigned long bandwidth = 350; /* MB/s */
1116 /*
1117 * Rough estimation for SMP scheduling, this is the number of
1118 * cycles it takes for a fully memory-limited process to flush
1119 * the SMP-local cache.
1120 *
1121 * (For a P5 this pretty much means we will choose another idle
1122 * CPU almost always at wakeup time (this is due to the small
1123 * L1 cache), on PIIs it's around 50-100 usecs, depending on
1124 * the cache size)
1125 */
1126 1137
1127 if (!cpu_khz) { 1138 if (cpu_khz) {
1128 /*
1129 * this basically disables processor-affinity
1130 * scheduling on SMP without a TSC.
1131 */
1132 return;
1133 } else {
1134 cachesize = boot_cpu_data.x86_cache_size; 1139 cachesize = boot_cpu_data.x86_cache_size;
1135 if (cachesize == -1) { 1140
1136 cachesize = 16; /* Pentiums, 2x8kB cache */ 1141 if (cachesize > 0)
1137 bandwidth = 100; 1142 max_cache_size = cachesize * 1024;
1138 }
1139 max_cache_size = cachesize * 1024;
1140 } 1143 }
1141} 1144}
1142 1145
@@ -1462,6 +1465,12 @@ int __devinit __cpu_up(unsigned int cpu)
1462 cpu_set(cpu, smp_commenced_mask); 1465 cpu_set(cpu, smp_commenced_mask);
1463 while (!cpu_isset(cpu, cpu_online_map)) 1466 while (!cpu_isset(cpu, cpu_online_map))
1464 cpu_relax(); 1467 cpu_relax();
1468
1469#ifdef CONFIG_X86_GENERICARCH
1470 if (num_online_cpus() > 8 && genapic == &apic_default)
1471 panic("Default flat APIC routing can't be used with > 8 cpus\n");
1472#endif
1473
1465 return 0; 1474 return 0;
1466} 1475}
1467 1476
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 0bbacd0ec175..7de9117b5a3a 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -27,7 +27,11 @@
27 * Should the kernel map a VDSO page into processes and pass its 27 * Should the kernel map a VDSO page into processes and pass its
28 * address down to glibc upon exec()? 28 * address down to glibc upon exec()?
29 */ 29 */
30#ifdef CONFIG_PARAVIRT
31unsigned int __read_mostly vdso_enabled = 0;
32#else
30unsigned int __read_mostly vdso_enabled = 1; 33unsigned int __read_mostly vdso_enabled = 1;
34#endif
31 35
32EXPORT_SYMBOL_GPL(vdso_enabled); 36EXPORT_SYMBOL_GPL(vdso_enabled);
33 37
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 78af572fd17c..c505b16c0990 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -56,6 +56,7 @@
56#include <asm/uaccess.h> 56#include <asm/uaccess.h>
57#include <asm/processor.h> 57#include <asm/processor.h>
58#include <asm/timer.h> 58#include <asm/timer.h>
59#include <asm/time.h>
59 60
60#include "mach_time.h" 61#include "mach_time.h"
61 62
@@ -116,10 +117,7 @@ static int set_rtc_mmss(unsigned long nowtime)
116 /* gets recalled with irq locally disabled */ 117 /* gets recalled with irq locally disabled */
117 /* XXX - does irqsave resolve this? -johnstul */ 118 /* XXX - does irqsave resolve this? -johnstul */
118 spin_lock_irqsave(&rtc_lock, flags); 119 spin_lock_irqsave(&rtc_lock, flags);
119 if (efi_enabled) 120 retval = set_wallclock(nowtime);
120 retval = efi_set_rtc_mmss(nowtime);
121 else
122 retval = mach_set_rtc_mmss(nowtime);
123 spin_unlock_irqrestore(&rtc_lock, flags); 121 spin_unlock_irqrestore(&rtc_lock, flags);
124 122
125 return retval; 123 return retval;
@@ -223,10 +221,7 @@ unsigned long get_cmos_time(void)
223 221
224 spin_lock_irqsave(&rtc_lock, flags); 222 spin_lock_irqsave(&rtc_lock, flags);
225 223
226 if (efi_enabled) 224 retval = get_wallclock();
227 retval = efi_get_time();
228 else
229 retval = mach_get_cmos_time();
230 225
231 spin_unlock_irqrestore(&rtc_lock, flags); 226 spin_unlock_irqrestore(&rtc_lock, flags);
232 227
@@ -370,7 +365,7 @@ static void __init hpet_time_init(void)
370 printk("Using HPET for base-timer\n"); 365 printk("Using HPET for base-timer\n");
371 } 366 }
372 367
373 time_init_hook(); 368 do_time_init();
374} 369}
375#endif 370#endif
376 371
@@ -392,5 +387,5 @@ void __init time_init(void)
392 387
393 do_settimeofday(&ts); 388 do_settimeofday(&ts);
394 389
395 time_init_hook(); 390 do_time_init();
396} 391}
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
index 1a2a979cf6a3..1e4702dfcd01 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/i386/kernel/time_hpet.c
@@ -132,14 +132,20 @@ int __init hpet_enable(void)
132 * the single HPET timer for system time. 132 * the single HPET timer for system time.
133 */ 133 */
134#ifdef CONFIG_HPET_EMULATE_RTC 134#ifdef CONFIG_HPET_EMULATE_RTC
135 if (!(id & HPET_ID_NUMBER)) 135 if (!(id & HPET_ID_NUMBER)) {
136 iounmap(hpet_virt_address);
137 hpet_virt_address = NULL;
136 return -1; 138 return -1;
139 }
137#endif 140#endif
138 141
139 142
140 hpet_period = hpet_readl(HPET_PERIOD); 143 hpet_period = hpet_readl(HPET_PERIOD);
141 if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) 144 if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) {
145 iounmap(hpet_virt_address);
146 hpet_virt_address = NULL;
142 return -1; 147 return -1;
148 }
143 149
144 /* 150 /*
145 * 64 bit math 151 * 64 bit math
@@ -156,8 +162,11 @@ int __init hpet_enable(void)
156 162
157 hpet_use_timer = id & HPET_ID_LEGSUP; 163 hpet_use_timer = id & HPET_ID_LEGSUP;
158 164
159 if (hpet_timer_stop_set_go(hpet_tick)) 165 if (hpet_timer_stop_set_go(hpet_tick)) {
166 iounmap(hpet_virt_address);
167 hpet_virt_address = NULL;
160 return -1; 168 return -1;
169 }
161 170
162 use_hpet = 1; 171 use_hpet = 1;
163 172
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 07d6da36a825..79cf608e14ca 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -40,14 +40,18 @@ int arch_register_cpu(int num)
40 * restrictions and assumptions in kernel. This basically 40 * restrictions and assumptions in kernel. This basically
41 * doesnt add a control file, one cannot attempt to offline 41 * doesnt add a control file, one cannot attempt to offline
42 * BSP. 42 * BSP.
43 *
44 * Also certain PCI quirks require not to enable hotplug control
45 * for all CPU's.
43 */ 46 */
44 if (!num) 47 if (num && enable_cpu_hotplug)
45 cpu_devices[num].cpu.no_control = 1; 48 cpu_devices[num].cpu.hotpluggable = 1;
46 49
47 return register_cpu(&cpu_devices[num].cpu, num); 50 return register_cpu(&cpu_devices[num].cpu, num);
48} 51}
49 52
50#ifdef CONFIG_HOTPLUG_CPU 53#ifdef CONFIG_HOTPLUG_CPU
54int enable_cpu_hotplug = 1;
51 55
52void arch_unregister_cpu(int num) { 56void arch_unregister_cpu(int num) {
53 return unregister_cpu(&cpu_devices[num].cpu); 57 return unregister_cpu(&cpu_devices[num].cpu);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 3124f1b04d67..68de48e498ca 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -29,6 +29,7 @@
29#include <linux/kexec.h> 29#include <linux/kexec.h>
30#include <linux/unwind.h> 30#include <linux/unwind.h>
31#include <linux/uaccess.h> 31#include <linux/uaccess.h>
32#include <linux/nmi.h>
32 33
33#ifdef CONFIG_EISA 34#ifdef CONFIG_EISA
34#include <linux/ioport.h> 35#include <linux/ioport.h>
@@ -61,9 +62,6 @@ int panic_on_unrecovered_nmi;
61 62
62asmlinkage int system_call(void); 63asmlinkage int system_call(void);
63 64
64struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
65 { 0, 0 }, { 0, 0 } };
66
67/* Do we ignore FPU interrupts ? */ 65/* Do we ignore FPU interrupts ? */
68char ignore_fpu_irq = 0; 66char ignore_fpu_irq = 0;
69 67
@@ -94,7 +92,7 @@ asmlinkage void alignment_check(void);
94asmlinkage void spurious_interrupt_bug(void); 92asmlinkage void spurious_interrupt_bug(void);
95asmlinkage void machine_check(void); 93asmlinkage void machine_check(void);
96 94
97static int kstack_depth_to_print = 24; 95int kstack_depth_to_print = 24;
98#ifdef CONFIG_STACK_UNWIND 96#ifdef CONFIG_STACK_UNWIND
99static int call_trace = 1; 97static int call_trace = 1;
100#else 98#else
@@ -163,16 +161,25 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data)
163{ 161{
164 struct ops_and_data *oad = (struct ops_and_data *)data; 162 struct ops_and_data *oad = (struct ops_and_data *)data;
165 int n = 0; 163 int n = 0;
164 unsigned long sp = UNW_SP(info);
166 165
166 if (arch_unw_user_mode(info))
167 return -1;
167 while (unwind(info) == 0 && UNW_PC(info)) { 168 while (unwind(info) == 0 && UNW_PC(info)) {
168 n++; 169 n++;
169 oad->ops->address(oad->data, UNW_PC(info)); 170 oad->ops->address(oad->data, UNW_PC(info));
170 if (arch_unw_user_mode(info)) 171 if (arch_unw_user_mode(info))
171 break; 172 break;
173 if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1))
174 && sp > UNW_SP(info))
175 break;
176 sp = UNW_SP(info);
172 } 177 }
173 return n; 178 return n;
174} 179}
175 180
181#define MSG(msg) ops->warning(data, msg)
182
176void dump_trace(struct task_struct *task, struct pt_regs *regs, 183void dump_trace(struct task_struct *task, struct pt_regs *regs,
177 unsigned long *stack, 184 unsigned long *stack,
178 struct stacktrace_ops *ops, void *data) 185 struct stacktrace_ops *ops, void *data)
@@ -191,29 +198,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
191 if (unwind_init_frame_info(&info, task, regs) == 0) 198 if (unwind_init_frame_info(&info, task, regs) == 0)
192 unw_ret = dump_trace_unwind(&info, &oad); 199 unw_ret = dump_trace_unwind(&info, &oad);
193 } else if (task == current) 200 } else if (task == current)
194 unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); 201 unw_ret = unwind_init_running(&info, dump_trace_unwind,
202 &oad);
195 else { 203 else {
196 if (unwind_init_blocked(&info, task) == 0) 204 if (unwind_init_blocked(&info, task) == 0)
197 unw_ret = dump_trace_unwind(&info, &oad); 205 unw_ret = dump_trace_unwind(&info, &oad);
198 } 206 }
199 if (unw_ret > 0) { 207 if (unw_ret > 0) {
200 if (call_trace == 1 && !arch_unw_user_mode(&info)) { 208 if (call_trace == 1 && !arch_unw_user_mode(&info)) {
201 ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", 209 ops->warning_symbol(data,
210 "DWARF2 unwinder stuck at %s",
202 UNW_PC(&info)); 211 UNW_PC(&info));
203 if (UNW_SP(&info) >= PAGE_OFFSET) { 212 if (UNW_SP(&info) >= PAGE_OFFSET) {
204 ops->warning(data, "Leftover inexact backtrace:\n"); 213 MSG("Leftover inexact backtrace:");
205 stack = (void *)UNW_SP(&info); 214 stack = (void *)UNW_SP(&info);
206 if (!stack) 215 if (!stack)
207 return; 216 return;
208 ebp = UNW_FP(&info); 217 ebp = UNW_FP(&info);
209 } else 218 } else
210 ops->warning(data, "Full inexact backtrace again:\n"); 219 MSG("Full inexact backtrace again:");
211 } else if (call_trace >= 1) 220 } else if (call_trace >= 1)
212 return; 221 return;
213 else 222 else
214 ops->warning(data, "Full inexact backtrace again:\n"); 223 MSG("Full inexact backtrace again:");
215 } else 224 } else
216 ops->warning(data, "Inexact backtrace:\n"); 225 MSG("Inexact backtrace:");
217 } 226 }
218 if (!stack) { 227 if (!stack) {
219 unsigned long dummy; 228 unsigned long dummy;
@@ -247,6 +256,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
247 stack = (unsigned long*)context->previous_esp; 256 stack = (unsigned long*)context->previous_esp;
248 if (!stack) 257 if (!stack)
249 break; 258 break;
259 touch_nmi_watchdog();
250 } 260 }
251} 261}
252EXPORT_SYMBOL(dump_trace); 262EXPORT_SYMBOL(dump_trace);
@@ -379,7 +389,7 @@ void show_registers(struct pt_regs *regs)
379 * time of the fault.. 389 * time of the fault..
380 */ 390 */
381 if (in_kernel) { 391 if (in_kernel) {
382 u8 __user *eip; 392 u8 *eip;
383 int code_bytes = 64; 393 int code_bytes = 64;
384 unsigned char c; 394 unsigned char c;
385 395
@@ -388,18 +398,20 @@ void show_registers(struct pt_regs *regs)
388 398
389 printk(KERN_EMERG "Code: "); 399 printk(KERN_EMERG "Code: ");
390 400
391 eip = (u8 __user *)regs->eip - 43; 401 eip = (u8 *)regs->eip - 43;
392 if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { 402 if (eip < (u8 *)PAGE_OFFSET ||
403 probe_kernel_address(eip, c)) {
393 /* try starting at EIP */ 404 /* try starting at EIP */
394 eip = (u8 __user *)regs->eip; 405 eip = (u8 *)regs->eip;
395 code_bytes = 32; 406 code_bytes = 32;
396 } 407 }
397 for (i = 0; i < code_bytes; i++, eip++) { 408 for (i = 0; i < code_bytes; i++, eip++) {
398 if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { 409 if (eip < (u8 *)PAGE_OFFSET ||
410 probe_kernel_address(eip, c)) {
399 printk(" Bad EIP value."); 411 printk(" Bad EIP value.");
400 break; 412 break;
401 } 413 }
402 if (eip == (u8 __user *)regs->eip) 414 if (eip == (u8 *)regs->eip)
403 printk("<%02x> ", c); 415 printk("<%02x> ", c);
404 else 416 else
405 printk("%02x ", c); 417 printk("%02x ", c);
@@ -415,7 +427,7 @@ static void handle_BUG(struct pt_regs *regs)
415 427
416 if (eip < PAGE_OFFSET) 428 if (eip < PAGE_OFFSET)
417 return; 429 return;
418 if (probe_kernel_address((unsigned short __user *)eip, ud2)) 430 if (probe_kernel_address((unsigned short *)eip, ud2))
419 return; 431 return;
420 if (ud2 != 0x0b0f) 432 if (ud2 != 0x0b0f)
421 return; 433 return;
@@ -428,11 +440,11 @@ static void handle_BUG(struct pt_regs *regs)
428 char *file; 440 char *file;
429 char c; 441 char c;
430 442
431 if (probe_kernel_address((unsigned short __user *)(eip + 2), 443 if (probe_kernel_address((unsigned short *)(eip + 2), line))
432 line))
433 break; 444 break;
434 if (__get_user(file, (char * __user *)(eip + 4)) || 445 if (probe_kernel_address((char **)(eip + 4), file) ||
435 (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) 446 (unsigned long)file < PAGE_OFFSET ||
447 probe_kernel_address(file, c))
436 file = "<bad filename>"; 448 file = "<bad filename>";
437 449
438 printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); 450 printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
@@ -707,8 +719,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
707{ 719{
708 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " 720 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
709 "CPU %d.\n", reason, smp_processor_id()); 721 "CPU %d.\n", reason, smp_processor_id());
710 printk(KERN_EMERG "You probably have a hardware problem with your RAM " 722 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
711 "chips\n");
712 if (panic_on_unrecovered_nmi) 723 if (panic_on_unrecovered_nmi)
713 panic("NMI: Not continuing"); 724 panic("NMI: Not continuing");
714 725
@@ -773,7 +784,6 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
773 printk(" on CPU%d, eip %08lx, registers:\n", 784 printk(" on CPU%d, eip %08lx, registers:\n",
774 smp_processor_id(), regs->eip); 785 smp_processor_id(), regs->eip);
775 show_registers(regs); 786 show_registers(regs);
776 printk(KERN_EMERG "console shuts up ...\n");
777 console_silent(); 787 console_silent();
778 spin_unlock(&nmi_print_lock); 788 spin_unlock(&nmi_print_lock);
779 bust_spinlocks(0); 789 bust_spinlocks(0);
@@ -1088,49 +1098,24 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
1088#endif 1098#endif
1089} 1099}
1090 1100
1091fastcall void setup_x86_bogus_stack(unsigned char * stk) 1101fastcall unsigned long patch_espfix_desc(unsigned long uesp,
1092{ 1102 unsigned long kesp)
1093 unsigned long *switch16_ptr, *switch32_ptr;
1094 struct pt_regs *regs;
1095 unsigned long stack_top, stack_bot;
1096 unsigned short iret_frame16_off;
1097 int cpu = smp_processor_id();
1098 /* reserve the space on 32bit stack for the magic switch16 pointer */
1099 memmove(stk, stk + 8, sizeof(struct pt_regs));
1100 switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
1101 regs = (struct pt_regs *)stk;
1102 /* now the switch32 on 16bit stack */
1103 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
1104 stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
1105 switch32_ptr = (unsigned long *)(stack_top - 8);
1106 iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
1107 /* copy iret frame on 16bit stack */
1108 memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
1109 /* fill in the switch pointers */
1110 switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
1111 switch16_ptr[1] = __ESPFIX_SS;
1112 switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
1113 8 - CPU_16BIT_STACK_SIZE;
1114 switch32_ptr[1] = __KERNEL_DS;
1115}
1116
1117fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
1118{ 1103{
1119 unsigned long *switch32_ptr;
1120 unsigned char *stack16, *stack32;
1121 unsigned long stack_top, stack_bot;
1122 int len;
1123 int cpu = smp_processor_id(); 1104 int cpu = smp_processor_id();
1124 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); 1105 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1125 stack_top = stack_bot + CPU_16BIT_STACK_SIZE; 1106 struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
1126 switch32_ptr = (unsigned long *)(stack_top - 8); 1107 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
1127 /* copy the data from 16bit stack to 32bit stack */ 1108 unsigned long new_kesp = kesp - base;
1128 len = CPU_16BIT_STACK_SIZE - 8 - sp; 1109 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
1129 stack16 = (unsigned char *)(stack_bot + sp); 1110 __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
1130 stack32 = (unsigned char *) 1111 /* Set up base for espfix segment */
1131 (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len); 1112 desc &= 0x00f0ff0000000000ULL;
1132 memcpy(stack32, stack16, len); 1113 desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
1133 return stack32; 1114 ((((__u64)base) << 32) & 0xff00000000000000ULL) |
1115 ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
1116 (lim_pages & 0xffff);
1117 *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
1118 return new_kesp;
1134} 1119}
1135 1120
1136/* 1121/*
@@ -1143,7 +1128,7 @@ fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
1143 * Must be called with kernel preemption disabled (in this case, 1128 * Must be called with kernel preemption disabled (in this case,
1144 * local interrupts are disabled at the call-site in entry.S). 1129 * local interrupts are disabled at the call-site in entry.S).
1145 */ 1130 */
1146asmlinkage void math_state_restore(struct pt_regs regs) 1131asmlinkage void math_state_restore(void)
1147{ 1132{
1148 struct thread_info *thread = current_thread_info(); 1133 struct thread_info *thread = current_thread_info();
1149 struct task_struct *tsk = thread->task; 1134 struct task_struct *tsk = thread->task;
@@ -1153,6 +1138,7 @@ asmlinkage void math_state_restore(struct pt_regs regs)
1153 init_fpu(tsk); 1138 init_fpu(tsk);
1154 restore_fpu(tsk); 1139 restore_fpu(tsk);
1155 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 1140 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1141 tsk->fpu_counter++;
1156} 1142}
1157 1143
1158#ifndef CONFIG_MATH_EMULATION 1144#ifndef CONFIG_MATH_EMULATION
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 9810c8c90750..1bbe45dca7a0 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -13,7 +13,6 @@
13 13
14#include <asm/delay.h> 14#include <asm/delay.h>
15#include <asm/tsc.h> 15#include <asm/tsc.h>
16#include <asm/delay.h>
17#include <asm/io.h> 16#include <asm/io.h>
18 17
19#include "mach_timer.h" 18#include "mach_timer.h"
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index cbcd61d6120b..be2f96e67f78 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -43,6 +43,7 @@
43#include <linux/highmem.h> 43#include <linux/highmem.h>
44#include <linux/ptrace.h> 44#include <linux/ptrace.h>
45#include <linux/audit.h> 45#include <linux/audit.h>
46#include <linux/stddef.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48#include <asm/io.h> 49#include <asm/io.h>
@@ -72,10 +73,10 @@
72/* 73/*
73 * 8- and 16-bit register defines.. 74 * 8- and 16-bit register defines..
74 */ 75 */
75#define AL(regs) (((unsigned char *)&((regs)->eax))[0]) 76#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0])
76#define AH(regs) (((unsigned char *)&((regs)->eax))[1]) 77#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1])
77#define IP(regs) (*(unsigned short *)&((regs)->eip)) 78#define IP(regs) (*(unsigned short *)&((regs)->pt.eip))
78#define SP(regs) (*(unsigned short *)&((regs)->esp)) 79#define SP(regs) (*(unsigned short *)&((regs)->pt.esp))
79 80
80/* 81/*
81 * virtual flags (16 and 32-bit versions) 82 * virtual flags (16 and 32-bit versions)
@@ -89,10 +90,37 @@
89#define SAFE_MASK (0xDD5) 90#define SAFE_MASK (0xDD5)
90#define RETURN_MASK (0xDFF) 91#define RETURN_MASK (0xDFF)
91 92
92#define VM86_REGS_PART2 orig_eax 93/* convert kernel_vm86_regs to vm86_regs */
93#define VM86_REGS_SIZE1 \ 94static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
94 ( (unsigned)( & (((struct kernel_vm86_regs *)0)->VM86_REGS_PART2) ) ) 95 const struct kernel_vm86_regs *regs)
95#define VM86_REGS_SIZE2 (sizeof(struct kernel_vm86_regs) - VM86_REGS_SIZE1) 96{
97 int ret = 0;
98
99 /* kernel_vm86_regs is missing xfs, so copy everything up to
100 (but not including) xgs, and then rest after xgs. */
101 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs));
102 ret += copy_to_user(&user->__null_gs, &regs->pt.xgs,
103 sizeof(struct kernel_vm86_regs) -
104 offsetof(struct kernel_vm86_regs, pt.xgs));
105
106 return ret;
107}
108
109/* convert vm86_regs to kernel_vm86_regs */
110static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
111 const struct vm86_regs __user *user,
112 unsigned extra)
113{
114 int ret = 0;
115
116 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs));
117 ret += copy_from_user(&regs->pt.xgs, &user->__null_gs,
118 sizeof(struct kernel_vm86_regs) -
119 offsetof(struct kernel_vm86_regs, pt.xgs) +
120 extra);
121
122 return ret;
123}
96 124
97struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); 125struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
98struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) 126struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
@@ -112,10 +140,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
112 printk("no vm86_info: BAD\n"); 140 printk("no vm86_info: BAD\n");
113 do_exit(SIGSEGV); 141 do_exit(SIGSEGV);
114 } 142 }
115 set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); 143 set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
116 tmp = copy_to_user(&current->thread.vm86_info->regs,regs, VM86_REGS_SIZE1); 144 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
117 tmp += copy_to_user(&current->thread.vm86_info->regs.VM86_REGS_PART2,
118 &regs->VM86_REGS_PART2, VM86_REGS_SIZE2);
119 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap); 145 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
120 if (tmp) { 146 if (tmp) {
121 printk("vm86: could not access userspace vm86_info\n"); 147 printk("vm86: could not access userspace vm86_info\n");
@@ -129,9 +155,11 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
129 current->thread.saved_esp0 = 0; 155 current->thread.saved_esp0 = 0;
130 put_cpu(); 156 put_cpu();
131 157
132 loadsegment(fs, current->thread.saved_fs);
133 loadsegment(gs, current->thread.saved_gs);
134 ret = KVM86->regs32; 158 ret = KVM86->regs32;
159
160 loadsegment(fs, current->thread.saved_fs);
161 ret->xgs = current->thread.saved_gs;
162
135 return ret; 163 return ret;
136} 164}
137 165
@@ -183,9 +211,9 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
183 tsk = current; 211 tsk = current;
184 if (tsk->thread.saved_esp0) 212 if (tsk->thread.saved_esp0)
185 goto out; 213 goto out;
186 tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); 214 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
187 tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, 215 offsetof(struct kernel_vm86_struct, vm86plus) -
188 (long)&info.vm86plus - (long)&info.regs.VM86_REGS_PART2); 216 sizeof(info.regs));
189 ret = -EFAULT; 217 ret = -EFAULT;
190 if (tmp) 218 if (tmp)
191 goto out; 219 goto out;
@@ -233,9 +261,9 @@ asmlinkage int sys_vm86(struct pt_regs regs)
233 if (tsk->thread.saved_esp0) 261 if (tsk->thread.saved_esp0)
234 goto out; 262 goto out;
235 v86 = (struct vm86plus_struct __user *)regs.ecx; 263 v86 = (struct vm86plus_struct __user *)regs.ecx;
236 tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); 264 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
237 tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, 265 offsetof(struct kernel_vm86_struct, regs32) -
238 (long)&info.regs32 - (long)&info.regs.VM86_REGS_PART2); 266 sizeof(info.regs));
239 ret = -EFAULT; 267 ret = -EFAULT;
240 if (tmp) 268 if (tmp)
241 goto out; 269 goto out;
@@ -252,15 +280,15 @@ out:
252static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) 280static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
253{ 281{
254 struct tss_struct *tss; 282 struct tss_struct *tss;
255 long eax;
256/* 283/*
257 * make sure the vm86() system call doesn't try to do anything silly 284 * make sure the vm86() system call doesn't try to do anything silly
258 */ 285 */
259 info->regs.__null_ds = 0; 286 info->regs.pt.xds = 0;
260 info->regs.__null_es = 0; 287 info->regs.pt.xes = 0;
288 info->regs.pt.xgs = 0;
261 289
262/* we are clearing fs,gs later just before "jmp resume_userspace", 290/* we are clearing fs later just before "jmp resume_userspace",
263 * because starting with Linux 2.1.x they aren't no longer saved/restored 291 * because it is not saved/restored.
264 */ 292 */
265 293
266/* 294/*
@@ -268,10 +296,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
268 * has set it up safely, so this makes sure interrupt etc flags are 296 * has set it up safely, so this makes sure interrupt etc flags are
269 * inherited from protected mode. 297 * inherited from protected mode.
270 */ 298 */
271 VEFLAGS = info->regs.eflags; 299 VEFLAGS = info->regs.pt.eflags;
272 info->regs.eflags &= SAFE_MASK; 300 info->regs.pt.eflags &= SAFE_MASK;
273 info->regs.eflags |= info->regs32->eflags & ~SAFE_MASK; 301 info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
274 info->regs.eflags |= VM_MASK; 302 info->regs.pt.eflags |= VM_MASK;
275 303
276 switch (info->cpu_type) { 304 switch (info->cpu_type) {
277 case CPU_286: 305 case CPU_286:
@@ -294,7 +322,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
294 info->regs32->eax = 0; 322 info->regs32->eax = 0;
295 tsk->thread.saved_esp0 = tsk->thread.esp0; 323 tsk->thread.saved_esp0 = tsk->thread.esp0;
296 savesegment(fs, tsk->thread.saved_fs); 324 savesegment(fs, tsk->thread.saved_fs);
297 savesegment(gs, tsk->thread.saved_gs); 325 tsk->thread.saved_gs = info->regs32->xgs;
298 326
299 tss = &per_cpu(init_tss, get_cpu()); 327 tss = &per_cpu(init_tss, get_cpu());
300 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; 328 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -306,19 +334,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
306 tsk->thread.screen_bitmap = info->screen_bitmap; 334 tsk->thread.screen_bitmap = info->screen_bitmap;
307 if (info->flags & VM86_SCREEN_BITMAP) 335 if (info->flags & VM86_SCREEN_BITMAP)
308 mark_screen_rdonly(tsk->mm); 336 mark_screen_rdonly(tsk->mm);
309 __asm__ __volatile__("xorl %eax,%eax; movl %eax,%fs; movl %eax,%gs\n\t");
310 __asm__ __volatile__("movl %%eax, %0\n" :"=r"(eax));
311 337
312 /*call audit_syscall_exit since we do not exit via the normal paths */ 338 /*call audit_syscall_exit since we do not exit via the normal paths */
313 if (unlikely(current->audit_context)) 339 if (unlikely(current->audit_context))
314 audit_syscall_exit(AUDITSC_RESULT(eax), eax); 340 audit_syscall_exit(AUDITSC_RESULT(0), 0);
315 341
316 __asm__ __volatile__( 342 __asm__ __volatile__(
317 "movl %0,%%esp\n\t" 343 "movl %0,%%esp\n\t"
318 "movl %1,%%ebp\n\t" 344 "movl %1,%%ebp\n\t"
345 "mov %2, %%fs\n\t"
319 "jmp resume_userspace" 346 "jmp resume_userspace"
320 : /* no outputs */ 347 : /* no outputs */
321 :"r" (&info->regs), "r" (task_thread_info(tsk))); 348 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
322 /* we never return here */ 349 /* we never return here */
323} 350}
324 351
@@ -348,12 +375,12 @@ static inline void clear_IF(struct kernel_vm86_regs * regs)
348 375
349static inline void clear_TF(struct kernel_vm86_regs * regs) 376static inline void clear_TF(struct kernel_vm86_regs * regs)
350{ 377{
351 regs->eflags &= ~TF_MASK; 378 regs->pt.eflags &= ~TF_MASK;
352} 379}
353 380
354static inline void clear_AC(struct kernel_vm86_regs * regs) 381static inline void clear_AC(struct kernel_vm86_regs * regs)
355{ 382{
356 regs->eflags &= ~AC_MASK; 383 regs->pt.eflags &= ~AC_MASK;
357} 384}
358 385
359/* It is correct to call set_IF(regs) from the set_vflags_* 386/* It is correct to call set_IF(regs) from the set_vflags_*
@@ -370,7 +397,7 @@ static inline void clear_AC(struct kernel_vm86_regs * regs)
370static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) 397static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
371{ 398{
372 set_flags(VEFLAGS, eflags, current->thread.v86mask); 399 set_flags(VEFLAGS, eflags, current->thread.v86mask);
373 set_flags(regs->eflags, eflags, SAFE_MASK); 400 set_flags(regs->pt.eflags, eflags, SAFE_MASK);
374 if (eflags & IF_MASK) 401 if (eflags & IF_MASK)
375 set_IF(regs); 402 set_IF(regs);
376 else 403 else
@@ -380,7 +407,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs
380static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) 407static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
381{ 408{
382 set_flags(VFLAGS, flags, current->thread.v86mask); 409 set_flags(VFLAGS, flags, current->thread.v86mask);
383 set_flags(regs->eflags, flags, SAFE_MASK); 410 set_flags(regs->pt.eflags, flags, SAFE_MASK);
384 if (flags & IF_MASK) 411 if (flags & IF_MASK)
385 set_IF(regs); 412 set_IF(regs);
386 else 413 else
@@ -389,7 +416,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg
389 416
390static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) 417static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
391{ 418{
392 unsigned long flags = regs->eflags & RETURN_MASK; 419 unsigned long flags = regs->pt.eflags & RETURN_MASK;
393 420
394 if (VEFLAGS & VIF_MASK) 421 if (VEFLAGS & VIF_MASK)
395 flags |= IF_MASK; 422 flags |= IF_MASK;
@@ -493,7 +520,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
493 unsigned long __user *intr_ptr; 520 unsigned long __user *intr_ptr;
494 unsigned long segoffs; 521 unsigned long segoffs;
495 522
496 if (regs->cs == BIOSSEG) 523 if (regs->pt.xcs == BIOSSEG)
497 goto cannot_handle; 524 goto cannot_handle;
498 if (is_revectored(i, &KVM86->int_revectored)) 525 if (is_revectored(i, &KVM86->int_revectored))
499 goto cannot_handle; 526 goto cannot_handle;
@@ -505,9 +532,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
505 if ((segoffs >> 16) == BIOSSEG) 532 if ((segoffs >> 16) == BIOSSEG)
506 goto cannot_handle; 533 goto cannot_handle;
507 pushw(ssp, sp, get_vflags(regs), cannot_handle); 534 pushw(ssp, sp, get_vflags(regs), cannot_handle);
508 pushw(ssp, sp, regs->cs, cannot_handle); 535 pushw(ssp, sp, regs->pt.xcs, cannot_handle);
509 pushw(ssp, sp, IP(regs), cannot_handle); 536 pushw(ssp, sp, IP(regs), cannot_handle);
510 regs->cs = segoffs >> 16; 537 regs->pt.xcs = segoffs >> 16;
511 SP(regs) -= 6; 538 SP(regs) -= 6;
512 IP(regs) = segoffs & 0xffff; 539 IP(regs) = segoffs & 0xffff;
513 clear_TF(regs); 540 clear_TF(regs);
@@ -524,7 +551,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno
524 if (VMPI.is_vm86pus) { 551 if (VMPI.is_vm86pus) {
525 if ( (trapno==3) || (trapno==1) ) 552 if ( (trapno==3) || (trapno==1) )
526 return_to_32bit(regs, VM86_TRAP + (trapno << 8)); 553 return_to_32bit(regs, VM86_TRAP + (trapno << 8));
527 do_int(regs, trapno, (unsigned char __user *) (regs->ss << 4), SP(regs)); 554 do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
528 return 0; 555 return 0;
529 } 556 }
530 if (trapno !=1) 557 if (trapno !=1)
@@ -560,10 +587,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
560 handle_vm86_trap(regs, 0, 1); \ 587 handle_vm86_trap(regs, 0, 1); \
561 return; } while (0) 588 return; } while (0)
562 589
563 orig_flags = *(unsigned short *)&regs->eflags; 590 orig_flags = *(unsigned short *)&regs->pt.eflags;
564 591
565 csp = (unsigned char __user *) (regs->cs << 4); 592 csp = (unsigned char __user *) (regs->pt.xcs << 4);
566 ssp = (unsigned char __user *) (regs->ss << 4); 593 ssp = (unsigned char __user *) (regs->pt.xss << 4);
567 sp = SP(regs); 594 sp = SP(regs);
568 ip = IP(regs); 595 ip = IP(regs);
569 596
@@ -650,7 +677,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
650 SP(regs) += 6; 677 SP(regs) += 6;
651 } 678 }
652 IP(regs) = newip; 679 IP(regs) = newip;
653 regs->cs = newcs; 680 regs->pt.xcs = newcs;
654 CHECK_IF_IN_TRAP; 681 CHECK_IF_IN_TRAP;
655 if (data32) { 682 if (data32) {
656 set_vflags_long(newflags, regs); 683 set_vflags_long(newflags, regs);
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index c6f84a0322ba..56e6ad5cb045 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -1,13 +1,26 @@
1/* ld script to make i386 Linux kernel 1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; 2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 *
4 * Don't define absolute symbols until and unless you know that symbol
5 * value is should remain constant even if kernel image is relocated
6 * at run time. Absolute symbols are not relocated. If symbol value should
7 * change if kernel is relocated, make the symbol section relative and
8 * put it inside the section definition.
3 */ 9 */
4 10
11/* Don't define absolute symbols until and unless you know that symbol
12 * value is should remain constant even if kernel image is relocated
13 * at run time. Absolute symbols are not relocated. If symbol value should
14 * change if kernel is relocated, make the symbol section relative and
15 * put it inside the section definition.
16 */
5#define LOAD_OFFSET __PAGE_OFFSET 17#define LOAD_OFFSET __PAGE_OFFSET
6 18
7#include <asm-generic/vmlinux.lds.h> 19#include <asm-generic/vmlinux.lds.h>
8#include <asm/thread_info.h> 20#include <asm/thread_info.h>
9#include <asm/page.h> 21#include <asm/page.h>
10#include <asm/cache.h> 22#include <asm/cache.h>
23#include <asm/boot.h>
11 24
12OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") 25OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
13OUTPUT_ARCH(i386) 26OUTPUT_ARCH(i386)
@@ -21,34 +34,35 @@ PHDRS {
21} 34}
22SECTIONS 35SECTIONS
23{ 36{
24 . = __KERNEL_START; 37 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
25 phys_startup_32 = startup_32 - LOAD_OFFSET; 38 phys_startup_32 = startup_32 - LOAD_OFFSET;
26 /* read-only */ 39 /* read-only */
27 _text = .; /* Text and read-only data */
28 .text : AT(ADDR(.text) - LOAD_OFFSET) { 40 .text : AT(ADDR(.text) - LOAD_OFFSET) {
41 _text = .; /* Text and read-only data */
29 *(.text) 42 *(.text)
30 SCHED_TEXT 43 SCHED_TEXT
31 LOCK_TEXT 44 LOCK_TEXT
32 KPROBES_TEXT 45 KPROBES_TEXT
33 *(.fixup) 46 *(.fixup)
34 *(.gnu.warning) 47 *(.gnu.warning)
35 } :text = 0x9090 48 _etext = .; /* End of text section */
36 49 } :text = 0x9090
37 _etext = .; /* End of text section */
38 50
39 . = ALIGN(16); /* Exception table */ 51 . = ALIGN(16); /* Exception table */
40 __start___ex_table = .; 52 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
41 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } 53 __start___ex_table = .;
42 __stop___ex_table = .; 54 *(__ex_table)
55 __stop___ex_table = .;
56 }
43 57
44 RODATA 58 RODATA
45 59
46 . = ALIGN(4); 60 . = ALIGN(4);
47 __tracedata_start = .;
48 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { 61 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
62 __tracedata_start = .;
49 *(.tracedata) 63 *(.tracedata)
64 __tracedata_end = .;
50 } 65 }
51 __tracedata_end = .;
52 66
53 /* writeable */ 67 /* writeable */
54 . = ALIGN(4096); 68 . = ALIGN(4096);
@@ -57,11 +71,19 @@ SECTIONS
57 CONSTRUCTORS 71 CONSTRUCTORS
58 } :data 72 } :data
59 73
74 .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) {
75 __start_paravirtprobe = .;
76 *(.paravirtprobe)
77 __stop_paravirtprobe = .;
78 }
79
60 . = ALIGN(4096); 80 . = ALIGN(4096);
61 __nosave_begin = .; 81 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
62 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } 82 __nosave_begin = .;
63 . = ALIGN(4096); 83 *(.data.nosave)
64 __nosave_end = .; 84 . = ALIGN(4096);
85 __nosave_end = .;
86 }
65 87
66 . = ALIGN(4096); 88 . = ALIGN(4096);
67 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { 89 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
@@ -75,17 +97,10 @@ SECTIONS
75 97
76 /* rarely changed data like cpu maps */ 98 /* rarely changed data like cpu maps */
77 . = ALIGN(32); 99 . = ALIGN(32);
78 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } 100 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
79 _edata = .; /* End of data section */ 101 *(.data.read_mostly)
80 102 _edata = .; /* End of data section */
81#ifdef CONFIG_STACK_UNWIND
82 . = ALIGN(4);
83 .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
84 __start_unwind = .;
85 *(.eh_frame)
86 __end_unwind = .;
87 } 103 }
88#endif
89 104
90 . = ALIGN(THREAD_SIZE); /* init_task */ 105 . = ALIGN(THREAD_SIZE); /* init_task */
91 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { 106 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
@@ -94,88 +109,102 @@ SECTIONS
94 109
95 /* might get freed after init */ 110 /* might get freed after init */
96 . = ALIGN(4096); 111 . = ALIGN(4096);
97 __smp_alt_begin = .;
98 __smp_alt_instructions = .;
99 .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { 112 .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
113 __smp_alt_begin = .;
114 __smp_alt_instructions = .;
100 *(.smp_altinstructions) 115 *(.smp_altinstructions)
116 __smp_alt_instructions_end = .;
101 } 117 }
102 __smp_alt_instructions_end = .;
103 . = ALIGN(4); 118 . = ALIGN(4);
104 __smp_locks = .;
105 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 119 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
120 __smp_locks = .;
106 *(.smp_locks) 121 *(.smp_locks)
122 __smp_locks_end = .;
107 } 123 }
108 __smp_locks_end = .;
109 .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { 124 .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
110 *(.smp_altinstr_replacement) 125 *(.smp_altinstr_replacement)
126 __smp_alt_end = .;
111 } 127 }
128 /* will be freed after init
129 * Following ALIGN() is required to make sure no other data falls on the
130 * same page where __smp_alt_end is pointing as that page might be freed
131 * after boot. Always make sure that ALIGN() directive is present after
132 * the section which contains __smp_alt_end.
133 */
112 . = ALIGN(4096); 134 . = ALIGN(4096);
113 __smp_alt_end = .;
114 135
115 /* will be freed after init */ 136 /* will be freed after init */
116 . = ALIGN(4096); /* Init code and data */ 137 . = ALIGN(4096); /* Init code and data */
117 __init_begin = .;
118 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 138 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
139 __init_begin = .;
119 _sinittext = .; 140 _sinittext = .;
120 *(.init.text) 141 *(.init.text)
121 _einittext = .; 142 _einittext = .;
122 } 143 }
123 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } 144 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
124 . = ALIGN(16); 145 . = ALIGN(16);
125 __setup_start = .; 146 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
126 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } 147 __setup_start = .;
127 __setup_end = .; 148 *(.init.setup)
128 __initcall_start = .; 149 __setup_end = .;
150 }
129 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { 151 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
152 __initcall_start = .;
130 INITCALLS 153 INITCALLS
154 __initcall_end = .;
131 } 155 }
132 __initcall_end = .;
133 __con_initcall_start = .;
134 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { 156 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
157 __con_initcall_start = .;
135 *(.con_initcall.init) 158 *(.con_initcall.init)
159 __con_initcall_end = .;
136 } 160 }
137 __con_initcall_end = .;
138 SECURITY_INIT 161 SECURITY_INIT
139 . = ALIGN(4); 162 . = ALIGN(4);
140 __alt_instructions = .;
141 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 163 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
164 __alt_instructions = .;
142 *(.altinstructions) 165 *(.altinstructions)
166 __alt_instructions_end = .;
143 } 167 }
144 __alt_instructions_end = .;
145 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { 168 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
146 *(.altinstr_replacement) 169 *(.altinstr_replacement)
147 } 170 }
171 . = ALIGN(4);
172 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
173 __start_parainstructions = .;
174 *(.parainstructions)
175 __stop_parainstructions = .;
176 }
148 /* .exit.text is discard at runtime, not link time, to deal with references 177 /* .exit.text is discard at runtime, not link time, to deal with references
149 from .altinstructions and .eh_frame */ 178 from .altinstructions and .eh_frame */
150 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } 179 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
151 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } 180 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
152 . = ALIGN(4096); 181 . = ALIGN(4096);
153 __initramfs_start = .; 182 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
154 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } 183 __initramfs_start = .;
155 __initramfs_end = .; 184 *(.init.ramfs)
185 __initramfs_end = .;
186 }
156 . = ALIGN(L1_CACHE_BYTES); 187 . = ALIGN(L1_CACHE_BYTES);
157 __per_cpu_start = .; 188 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
158 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } 189 __per_cpu_start = .;
159 __per_cpu_end = .; 190 *(.data.percpu)
191 __per_cpu_end = .;
192 }
160 . = ALIGN(4096); 193 . = ALIGN(4096);
161 __init_end = .;
162 /* freed after init ends here */ 194 /* freed after init ends here */
163 195
164 __bss_start = .; /* BSS */
165 .bss.page_aligned : AT(ADDR(.bss.page_aligned) - LOAD_OFFSET) {
166 *(.bss.page_aligned)
167 }
168 .bss : AT(ADDR(.bss) - LOAD_OFFSET) { 196 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
197 __init_end = .;
198 __bss_start = .; /* BSS */
199 *(.bss.page_aligned)
169 *(.bss) 200 *(.bss)
201 . = ALIGN(4);
202 __bss_stop = .;
203 _end = . ;
204 /* This is where the kernel creates the early boot page tables */
205 . = ALIGN(4096);
206 pg0 = . ;
170 } 207 }
171 . = ALIGN(4);
172 __bss_stop = .;
173
174 _end = . ;
175
176 /* This is where the kernel creates the early boot page tables */
177 . = ALIGN(4096);
178 pg0 = .;
179 208
180 /* Sections to be discarded */ 209 /* Sections to be discarded */
181 /DISCARD/ : { 210 /DISCARD/ : {
diff --git a/arch/i386/mach-generic/probe.c b/arch/i386/mach-generic/probe.c
index 94b1fd9cbe3c..a7b3999bb37a 100644
--- a/arch/i386/mach-generic/probe.c
+++ b/arch/i386/mach-generic/probe.c
@@ -45,7 +45,9 @@ static int __init parse_apic(char *arg)
45 return 0; 45 return 0;
46 } 46 }
47 } 47 }
48 return -ENOENT; 48
49 /* Parsed again by __setup for debug/verbose */
50 return 0;
49} 51}
50early_param("apic", parse_apic); 52early_param("apic", parse_apic);
51 53
diff --git a/arch/i386/mach-voyager/voyager_cat.c b/arch/i386/mach-voyager/voyager_cat.c
index f50c6c6ad680..943a9473b138 100644
--- a/arch/i386/mach-voyager/voyager_cat.c
+++ b/arch/i386/mach-voyager/voyager_cat.c
@@ -776,7 +776,7 @@ voyager_cat_init(void)
776 for(asic=0; asic < (*modpp)->num_asics; asic++) { 776 for(asic=0; asic < (*modpp)->num_asics; asic++) {
777 int j; 777 int j;
778 voyager_asic_t *asicp = *asicpp 778 voyager_asic_t *asicp = *asicpp
779 = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/ 779 = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/
780 voyager_sp_table_t *sp_table; 780 voyager_sp_table_t *sp_table;
781 voyager_at_t *asic_table; 781 voyager_at_t *asic_table;
782 voyager_jtt_t *jtag_table; 782 voyager_jtt_t *jtag_table;
@@ -785,7 +785,6 @@ voyager_cat_init(void)
785 printk("**WARNING** kmalloc failure in cat_init\n"); 785 printk("**WARNING** kmalloc failure in cat_init\n");
786 continue; 786 continue;
787 } 787 }
788 memset(asicp, 0, sizeof(voyager_asic_t));
789 asicpp = &(asicp->next); 788 asicpp = &(asicp->next);
790 asicp->asic_location = asic; 789 asicp->asic_location = asic;
791 sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset); 790 sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset);
@@ -851,8 +850,7 @@ voyager_cat_init(void)
851#endif 850#endif
852 851
853 { 852 {
854 struct resource *res = kmalloc(sizeof(struct resource),GFP_KERNEL); 853 struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL);
855 memset(res, 0, sizeof(struct resource));
856 res->name = kmalloc(128, GFP_KERNEL); 854 res->name = kmalloc(128, GFP_KERNEL);
857 sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i)); 855 sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i));
858 res->start = qic_addr; 856 res->start = qic_addr;
diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c
index f3fea2ad50fe..55428e656a3f 100644
--- a/arch/i386/mach-voyager/voyager_smp.c
+++ b/arch/i386/mach-voyager/voyager_smp.c
@@ -28,6 +28,7 @@
28#include <asm/pgalloc.h> 28#include <asm/pgalloc.h>
29#include <asm/tlbflush.h> 29#include <asm/tlbflush.h>
30#include <asm/arch_hooks.h> 30#include <asm/arch_hooks.h>
31#include <asm/pda.h>
31 32
32/* TLB state -- visible externally, indexed physically */ 33/* TLB state -- visible externally, indexed physically */
33DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 }; 34DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 };
@@ -422,6 +423,7 @@ find_smp_config(void)
422 VOYAGER_SUS_IN_CONTROL_PORT); 423 VOYAGER_SUS_IN_CONTROL_PORT);
423 424
424 current_thread_info()->cpu = boot_cpu_id; 425 current_thread_info()->cpu = boot_cpu_id;
426 write_pda(cpu_number, boot_cpu_id);
425} 427}
426 428
427/* 429/*
@@ -458,7 +460,7 @@ start_secondary(void *unused)
458 /* external functions not defined in the headers */ 460 /* external functions not defined in the headers */
459 extern void calibrate_delay(void); 461 extern void calibrate_delay(void);
460 462
461 cpu_init(); 463 secondary_cpu_init();
462 464
463 /* OK, we're in the routine */ 465 /* OK, we're in the routine */
464 ack_CPI(VIC_CPU_BOOT_CPI); 466 ack_CPI(VIC_CPU_BOOT_CPI);
@@ -578,6 +580,15 @@ do_boot_cpu(__u8 cpu)
578 /* init_tasks (in sched.c) is indexed logically */ 580 /* init_tasks (in sched.c) is indexed logically */
579 stack_start.esp = (void *) idle->thread.esp; 581 stack_start.esp = (void *) idle->thread.esp;
580 582
583 /* Pre-allocate and initialize the CPU's GDT and PDA so it
584 doesn't have to do any memory allocation during the
585 delicate CPU-bringup phase. */
586 if (!init_gdt(cpu, idle)) {
587 printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
588 cpucount--;
589 return;
590 }
591
581 irq_ctx_init(cpu); 592 irq_ctx_init(cpu);
582 593
583 /* Note: Don't modify initial ss override */ 594 /* Note: Don't modify initial ss override */
@@ -1963,4 +1974,5 @@ void __init
1963smp_setup_processor_id(void) 1974smp_setup_processor_id(void)
1964{ 1975{
1965 current_thread_info()->cpu = hard_smp_processor_id(); 1976 current_thread_info()->cpu = hard_smp_processor_id();
1977 write_pda(cpu_number, hard_smp_processor_id());
1966} 1978}
diff --git a/arch/i386/math-emu/fpu_emu.h b/arch/i386/math-emu/fpu_emu.h
index d62b20a3e660..65120f523853 100644
--- a/arch/i386/math-emu/fpu_emu.h
+++ b/arch/i386/math-emu/fpu_emu.h
@@ -57,6 +57,7 @@
57#define TAG_Special Const(2) /* De-normal, + or - infinity, 57#define TAG_Special Const(2) /* De-normal, + or - infinity,
58 or Not a Number */ 58 or Not a Number */
59#define TAG_Empty Const(3) /* empty */ 59#define TAG_Empty Const(3) /* empty */
60#define TAG_Error Const(0x80) /* probably need to abort */
60 61
61#define LOADED_DATA Const(10101) /* Special st() number to identify 62#define LOADED_DATA Const(10101) /* Special st() number to identify
62 loaded data (not on stack). */ 63 loaded data (not on stack). */
diff --git a/arch/i386/math-emu/fpu_entry.c b/arch/i386/math-emu/fpu_entry.c
index d93f16ef828f..ddf8fa3bbd01 100644
--- a/arch/i386/math-emu/fpu_entry.c
+++ b/arch/i386/math-emu/fpu_entry.c
@@ -742,7 +742,8 @@ int save_i387_soft(void *s387, struct _fpstate __user * buf)
742 S387->fcs &= ~0xf8000000; 742 S387->fcs &= ~0xf8000000;
743 S387->fos |= 0xffff0000; 743 S387->fos |= 0xffff0000;
744#endif /* PECULIAR_486 */ 744#endif /* PECULIAR_486 */
745 __copy_to_user(d, &S387->cwd, 7*4); 745 if (__copy_to_user(d, &S387->cwd, 7*4))
746 return -1;
746 RE_ENTRANT_CHECK_ON; 747 RE_ENTRANT_CHECK_ON;
747 748
748 d += 7*4; 749 d += 7*4;
diff --git a/arch/i386/math-emu/fpu_system.h b/arch/i386/math-emu/fpu_system.h
index bf26341c8bde..a3ae28c49ddd 100644
--- a/arch/i386/math-emu/fpu_system.h
+++ b/arch/i386/math-emu/fpu_system.h
@@ -68,6 +68,7 @@
68 68
69#define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \ 69#define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \
70 math_abort(FPU_info,SIGSEGV) 70 math_abort(FPU_info,SIGSEGV)
71#define FPU_abort math_abort(FPU_info, SIGSEGV)
71 72
72#undef FPU_IGNORE_CODE_SEGV 73#undef FPU_IGNORE_CODE_SEGV
73#ifdef FPU_IGNORE_CODE_SEGV 74#ifdef FPU_IGNORE_CODE_SEGV
diff --git a/arch/i386/math-emu/load_store.c b/arch/i386/math-emu/load_store.c
index 85314be2fef8..eebd6fb1c8a8 100644
--- a/arch/i386/math-emu/load_store.c
+++ b/arch/i386/math-emu/load_store.c
@@ -227,6 +227,8 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
227 case 027: /* fild m64int */ 227 case 027: /* fild m64int */
228 clear_C1(); 228 clear_C1();
229 loaded_tag = FPU_load_int64((long long __user *)data_address); 229 loaded_tag = FPU_load_int64((long long __user *)data_address);
230 if (loaded_tag == TAG_Error)
231 return 0;
230 FPU_settag0(loaded_tag); 232 FPU_settag0(loaded_tag);
231 break; 233 break;
232 case 030: /* fstenv m14/28byte */ 234 case 030: /* fstenv m14/28byte */
diff --git a/arch/i386/math-emu/reg_ld_str.c b/arch/i386/math-emu/reg_ld_str.c
index f06ed41d191d..e976caef6498 100644
--- a/arch/i386/math-emu/reg_ld_str.c
+++ b/arch/i386/math-emu/reg_ld_str.c
@@ -244,7 +244,8 @@ int FPU_load_int64(long long __user *_s)
244 244
245 RE_ENTRANT_CHECK_OFF; 245 RE_ENTRANT_CHECK_OFF;
246 FPU_access_ok(VERIFY_READ, _s, 8); 246 FPU_access_ok(VERIFY_READ, _s, 8);
247 copy_from_user(&s,_s,8); 247 if (copy_from_user(&s,_s,8))
248 FPU_abort;
248 RE_ENTRANT_CHECK_ON; 249 RE_ENTRANT_CHECK_ON;
249 250
250 if (s == 0) 251 if (s == 0)
@@ -907,7 +908,8 @@ int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d)
907 908
908 RE_ENTRANT_CHECK_OFF; 909 RE_ENTRANT_CHECK_OFF;
909 FPU_access_ok(VERIFY_WRITE,d,8); 910 FPU_access_ok(VERIFY_WRITE,d,8);
910 copy_to_user(d, &tll, 8); 911 if (copy_to_user(d, &tll, 8))
912 FPU_abort;
911 RE_ENTRANT_CHECK_ON; 913 RE_ENTRANT_CHECK_ON;
912 914
913 return 1; 915 return 1;
@@ -1336,7 +1338,8 @@ u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d)
1336 I387.soft.fcs &= ~0xf8000000; 1338 I387.soft.fcs &= ~0xf8000000;
1337 I387.soft.fos |= 0xffff0000; 1339 I387.soft.fos |= 0xffff0000;
1338#endif /* PECULIAR_486 */ 1340#endif /* PECULIAR_486 */
1339 __copy_to_user(d, &control_word, 7*4); 1341 if (__copy_to_user(d, &control_word, 7*4))
1342 FPU_abort;
1340 RE_ENTRANT_CHECK_ON; 1343 RE_ENTRANT_CHECK_ON;
1341 d += 0x1c; 1344 d += 0x1c;
1342 } 1345 }
@@ -1359,9 +1362,11 @@ void fsave(fpu_addr_modes addr_modes, u_char __user *data_address)
1359 FPU_access_ok(VERIFY_WRITE,d,80); 1362 FPU_access_ok(VERIFY_WRITE,d,80);
1360 1363
1361 /* Copy all registers in stack order. */ 1364 /* Copy all registers in stack order. */
1362 __copy_to_user(d, register_base+offset, other); 1365 if (__copy_to_user(d, register_base+offset, other))
1366 FPU_abort;
1363 if ( offset ) 1367 if ( offset )
1364 __copy_to_user(d+other, register_base, offset); 1368 if (__copy_to_user(d+other, register_base, offset))
1369 FPU_abort;
1365 RE_ENTRANT_CHECK_ON; 1370 RE_ENTRANT_CHECK_ON;
1366 1371
1367 finit(); 1372 finit();
diff --git a/arch/i386/mm/boot_ioremap.c b/arch/i386/mm/boot_ioremap.c
index 4de11f508c3a..4de95a17a7d4 100644
--- a/arch/i386/mm/boot_ioremap.c
+++ b/arch/i386/mm/boot_ioremap.c
@@ -16,6 +16,7 @@
16 */ 16 */
17 17
18#undef CONFIG_X86_PAE 18#undef CONFIG_X86_PAE
19#undef CONFIG_PARAVIRT
19#include <asm/page.h> 20#include <asm/page.h>
20#include <asm/pgtable.h> 21#include <asm/pgtable.h>
21#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index ddbdb0336f28..103b76e56a94 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -168,7 +168,7 @@ static void __init allocate_pgdat(int nid)
168 if (nid && node_has_online_mem(nid)) 168 if (nid && node_has_online_mem(nid))
169 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; 169 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
170 else { 170 else {
171 NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); 171 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
172 min_low_pfn += PFN_UP(sizeof(pg_data_t)); 172 min_low_pfn += PFN_UP(sizeof(pg_data_t));
173 } 173 }
174} 174}
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index 2581575786c1..aaaa4d225f7e 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -22,9 +22,9 @@
22#include <linux/highmem.h> 22#include <linux/highmem.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/kprobes.h> 24#include <linux/kprobes.h>
25#include <linux/uaccess.h>
25 26
26#include <asm/system.h> 27#include <asm/system.h>
27#include <asm/uaccess.h>
28#include <asm/desc.h> 28#include <asm/desc.h>
29#include <asm/kdebug.h> 29#include <asm/kdebug.h>
30#include <asm/segment.h> 30#include <asm/segment.h>
@@ -167,7 +167,7 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs,
167static int __is_prefetch(struct pt_regs *regs, unsigned long addr) 167static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
168{ 168{
169 unsigned long limit; 169 unsigned long limit;
170 unsigned long instr = get_segment_eip (regs, &limit); 170 unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
171 int scan_more = 1; 171 int scan_more = 1;
172 int prefetch = 0; 172 int prefetch = 0;
173 int i; 173 int i;
@@ -177,9 +177,9 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
177 unsigned char instr_hi; 177 unsigned char instr_hi;
178 unsigned char instr_lo; 178 unsigned char instr_lo;
179 179
180 if (instr > limit) 180 if (instr > (unsigned char *)limit)
181 break; 181 break;
182 if (__get_user(opcode, (unsigned char __user *) instr)) 182 if (probe_kernel_address(instr, opcode))
183 break; 183 break;
184 184
185 instr_hi = opcode & 0xf0; 185 instr_hi = opcode & 0xf0;
@@ -204,9 +204,9 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
204 case 0x00: 204 case 0x00:
205 /* Prefetch instruction is 0x0F0D or 0x0F18 */ 205 /* Prefetch instruction is 0x0F0D or 0x0F18 */
206 scan_more = 0; 206 scan_more = 0;
207 if (instr > limit) 207 if (instr > (unsigned char *)limit)
208 break; 208 break;
209 if (__get_user(opcode, (unsigned char __user *) instr)) 209 if (probe_kernel_address(instr, opcode))
210 break; 210 break;
211 prefetch = (instr_lo == 0xF) && 211 prefetch = (instr_lo == 0xF) &&
212 (opcode == 0x0D || opcode == 0x18); 212 (opcode == 0x0D || opcode == 0x18);
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index a6a8e397dd88..84697dfc7348 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -192,8 +192,6 @@ static inline int page_kills_ppro(unsigned long pagenr)
192 return 0; 192 return 0;
193} 193}
194 194
195extern int is_available_memory(efi_memory_desc_t *);
196
197int page_is_ram(unsigned long pagenr) 195int page_is_ram(unsigned long pagenr)
198{ 196{
199 int i; 197 int i;
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 8564b6ae17e3..ad91528bdc14 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -67,11 +67,17 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
67 return base; 67 return base;
68} 68}
69 69
70static void flush_kernel_map(void *dummy) 70static void flush_kernel_map(void *arg)
71{ 71{
72 /* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */ 72 unsigned long adr = (unsigned long)arg;
73 if (boot_cpu_data.x86_model >= 4) 73
74 if (adr && cpu_has_clflush) {
75 int i;
76 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
77 asm volatile("clflush (%0)" :: "r" (adr + i));
78 } else if (boot_cpu_data.x86_model >= 4)
74 wbinvd(); 79 wbinvd();
80
75 /* Flush all to work around Errata in early athlons regarding 81 /* Flush all to work around Errata in early athlons regarding
76 * large page flushing. 82 * large page flushing.
77 */ 83 */
@@ -173,9 +179,9 @@ __change_page_attr(struct page *page, pgprot_t prot)
173 return 0; 179 return 0;
174} 180}
175 181
176static inline void flush_map(void) 182static inline void flush_map(void *adr)
177{ 183{
178 on_each_cpu(flush_kernel_map, NULL, 1, 1); 184 on_each_cpu(flush_kernel_map, adr, 1, 1);
179} 185}
180 186
181/* 187/*
@@ -217,9 +223,13 @@ void global_flush_tlb(void)
217 spin_lock_irq(&cpa_lock); 223 spin_lock_irq(&cpa_lock);
218 list_replace_init(&df_list, &l); 224 list_replace_init(&df_list, &l);
219 spin_unlock_irq(&cpa_lock); 225 spin_unlock_irq(&cpa_lock);
220 flush_map(); 226 if (!cpu_has_clflush)
221 list_for_each_entry_safe(pg, next, &l, lru) 227 flush_map(0);
228 list_for_each_entry_safe(pg, next, &l, lru) {
229 if (cpu_has_clflush)
230 flush_map(page_address(pg));
222 __free_page(pg); 231 __free_page(pg);
232 }
223} 233}
224 234
225#ifdef CONFIG_DEBUG_PAGEALLOC 235#ifdef CONFIG_DEBUG_PAGEALLOC
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 33be236fc6af..f349eaf450b0 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -95,8 +95,11 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
95 return; 95 return;
96 } 96 }
97 pte = pte_offset_kernel(pmd, vaddr); 97 pte = pte_offset_kernel(pmd, vaddr);
98 /* <pfn,flags> stored as-is, to permit clearing entries */ 98 if (pgprot_val(flags))
99 set_pte(pte, pfn_pte(pfn, flags)); 99 /* <pfn,flags> stored as-is, to permit clearing entries */
100 set_pte(pte, pfn_pte(pfn, flags));
101 else
102 pte_clear(&init_mm, vaddr, pte);
100 103
101 /* 104 /*
102 * It's enough to flush this one mapping. 105 * It's enough to flush this one mapping.
diff --git a/arch/i386/pci/early.c b/arch/i386/pci/early.c
index 713d6c866cae..42df4b6606df 100644
--- a/arch/i386/pci/early.c
+++ b/arch/i386/pci/early.c
@@ -45,6 +45,13 @@ void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
45 outl(val, 0xcfc); 45 outl(val, 0xcfc);
46} 46}
47 47
48void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
49{
50 PDprintk("%x writing to %x: %x\n", slot, offset, val);
51 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
52 outb(val, 0xcfc);
53}
54
48int early_pci_allowed(void) 55int early_pci_allowed(void)
49{ 56{
50 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) == 57 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) ==
diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c
index e65551cd8216..f2cb942f8281 100644
--- a/arch/i386/pci/irq.c
+++ b/arch/i386/pci/irq.c
@@ -764,7 +764,7 @@ static void __init pirq_find_router(struct irq_router *r)
764 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", 764 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
765 rt->rtr_vendor, rt->rtr_device); 765 rt->rtr_vendor, rt->rtr_device);
766 766
767 pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn); 767 pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
768 if (!pirq_router_dev) { 768 if (!pirq_router_dev) {
769 DBG(KERN_DEBUG "PCI: Interrupt router not found at " 769 DBG(KERN_DEBUG "PCI: Interrupt router not found at "
770 "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); 770 "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
@@ -784,6 +784,8 @@ static void __init pirq_find_router(struct irq_router *r)
784 pirq_router_dev->vendor, 784 pirq_router_dev->vendor,
785 pirq_router_dev->device, 785 pirq_router_dev->device,
786 pci_name(pirq_router_dev)); 786 pci_name(pirq_router_dev));
787
788 /* The device remains referenced for the kernel lifetime */
787} 789}
788 790
789static struct irq_info *pirq_get_info(struct pci_dev *dev) 791static struct irq_info *pirq_get_info(struct pci_dev *dev)
diff --git a/arch/i386/pci/pcbios.c b/arch/i386/pci/pcbios.c
index ed1512a175ab..5f5193401bea 100644
--- a/arch/i386/pci/pcbios.c
+++ b/arch/i386/pci/pcbios.c
@@ -5,6 +5,7 @@
5#include <linux/pci.h> 5#include <linux/pci.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/uaccess.h>
8#include "pci.h" 9#include "pci.h"
9#include "pci-functions.h" 10#include "pci-functions.h"
10 11
@@ -314,6 +315,10 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
314 for (check = (union bios32 *) __va(0xe0000); 315 for (check = (union bios32 *) __va(0xe0000);
315 check <= (union bios32 *) __va(0xffff0); 316 check <= (union bios32 *) __va(0xffff0);
316 ++check) { 317 ++check) {
318 long sig;
319 if (probe_kernel_address(&check->fields.signature, sig))
320 continue;
321
317 if (check->fields.signature != BIOS32_SIGNATURE) 322 if (check->fields.signature != BIOS32_SIGNATURE)
318 continue; 323 continue;
319 length = check->fields.length * 16; 324 length = check->fields.length * 16;
@@ -331,11 +336,13 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
331 } 336 }
332 DBG("PCI: BIOS32 Service Directory structure at 0x%p\n", check); 337 DBG("PCI: BIOS32 Service Directory structure at 0x%p\n", check);
333 if (check->fields.entry >= 0x100000) { 338 if (check->fields.entry >= 0x100000) {
334 printk("PCI: BIOS32 entry (0x%p) in high memory, cannot use.\n", check); 339 printk("PCI: BIOS32 entry (0x%p) in high memory, "
340 "cannot use.\n", check);
335 return NULL; 341 return NULL;
336 } else { 342 } else {
337 unsigned long bios32_entry = check->fields.entry; 343 unsigned long bios32_entry = check->fields.entry;
338 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", bios32_entry); 344 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
345 bios32_entry);
339 bios32_indirect.address = bios32_entry + PAGE_OFFSET; 346 bios32_indirect.address = bios32_entry + PAGE_OFFSET;
340 if (check_pcibios()) 347 if (check_pcibios())
341 return &pci_bios_access; 348 return &pci_bios_access;
diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c
index 5a1abeff033b..2c15500f8713 100644
--- a/arch/i386/power/cpu.c
+++ b/arch/i386/power/cpu.c
@@ -26,8 +26,8 @@ void __save_processor_state(struct saved_context *ctxt)
26 /* 26 /*
27 * descriptor tables 27 * descriptor tables
28 */ 28 */
29 store_gdt(&ctxt->gdt_limit); 29 store_gdt(&ctxt->gdt);
30 store_idt(&ctxt->idt_limit); 30 store_idt(&ctxt->idt);
31 store_tr(ctxt->tr); 31 store_tr(ctxt->tr);
32 32
33 /* 33 /*
@@ -99,8 +99,8 @@ void __restore_processor_state(struct saved_context *ctxt)
99 * now restore the descriptor tables to their proper values 99 * now restore the descriptor tables to their proper values
100 * ltr is done i fix_processor_context(). 100 * ltr is done i fix_processor_context().
101 */ 101 */
102 load_gdt(&ctxt->gdt_limit); 102 load_gdt(&ctxt->gdt);
103 load_idt(&ctxt->idt_limit); 103 load_idt(&ctxt->idt);
104 104
105 /* 105 /*
106 * segment registers 106 * segment registers
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 5629b45e89c6..687500ddb4b8 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -31,11 +31,11 @@ int arch_register_cpu(int num)
31{ 31{
32#if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU) 32#if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU)
33 /* 33 /*
34 * If CPEI cannot be re-targetted, and this is 34 * If CPEI can be re-targetted or if this is not
35 * CPEI target, then dont create the control file 35 * CPEI target, then it is hotpluggable
36 */ 36 */
37 if (!can_cpei_retarget() && is_cpu_cpei_target(num)) 37 if (can_cpei_retarget() || !is_cpu_cpei_target(num))
38 sysfs_cpus[num].cpu.no_control = 1; 38 sysfs_cpus[num].cpu.hotpluggable = 1;
39 map_cpu_to_node(num, node_cpuid[num].nid); 39 map_cpu_to_node(num, node_cpuid[num].nid);
40#endif 40#endif
41 41
diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S
index 58afa8be604e..2b2a10da64a4 100644
--- a/arch/m68knommu/kernel/vmlinux.lds.S
+++ b/arch/m68knommu/kernel/vmlinux.lds.S
@@ -60,6 +60,7 @@ SECTIONS {
60#endif 60#endif
61 61
62 .text : { 62 .text : {
63 _text = .;
63 _stext = . ; 64 _stext = . ;
64 *(.text) 65 *(.text)
65 SCHED_TEXT 66 SCHED_TEXT
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 22123a0d5416..63ed265b7f09 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -239,7 +239,7 @@ static void unregister_cpu_online(unsigned int cpu)
239 struct cpu *c = &per_cpu(cpu_devices, cpu); 239 struct cpu *c = &per_cpu(cpu_devices, cpu);
240 struct sys_device *s = &c->sysdev; 240 struct sys_device *s = &c->sysdev;
241 241
242 BUG_ON(c->no_control); 242 BUG_ON(!c->hotpluggable);
243 243
244 if (!firmware_has_feature(FW_FEATURE_ISERIES) && 244 if (!firmware_has_feature(FW_FEATURE_ISERIES) &&
245 cpu_has_feature(CPU_FTR_SMT)) 245 cpu_has_feature(CPU_FTR_SMT))
@@ -424,10 +424,10 @@ static int __init topology_init(void)
424 * CPU. For instance, the boot cpu might never be valid 424 * CPU. For instance, the boot cpu might never be valid
425 * for hotplugging. 425 * for hotplugging.
426 */ 426 */
427 if (!ppc_md.cpu_die) 427 if (ppc_md.cpu_die)
428 c->no_control = 1; 428 c->hotpluggable = 1;
429 429
430 if (cpu_online(cpu) || (c->no_control == 0)) { 430 if (cpu_online(cpu) || c->hotpluggable) {
431 register_cpu(c, cpu); 431 register_cpu(c, cpu);
432 432
433 sysdev_create_file(&c->sysdev, &attr_physical_id); 433 sysdev_create_file(&c->sysdev, &attr_physical_id);
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index e8342d867536..04b98671a060 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -33,6 +33,7 @@ SECTIONS
33 33
34 /* Text and gots */ 34 /* Text and gots */
35 .text : { 35 .text : {
36 _text = .;
36 *(.text .text.*) 37 *(.text .text.*)
37 SCHED_TEXT 38 SCHED_TEXT
38 LOCK_TEXT 39 LOCK_TEXT
diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S
index 16e8661e1fec..61921268a0d0 100644
--- a/arch/ppc/kernel/vmlinux.lds.S
+++ b/arch/ppc/kernel/vmlinux.lds.S
@@ -31,6 +31,7 @@ SECTIONS
31 .plt : { *(.plt) } 31 .plt : { *(.plt) }
32 .text : 32 .text :
33 { 33 {
34 _text = .;
34 *(.text) 35 *(.text)
35 SCHED_TEXT 36 SCHED_TEXT
36 LOCK_TEXT 37 LOCK_TEXT
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index 5cc5ff7f8824..b73e6b9067ed 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -11,6 +11,7 @@ SECTIONS
11 . = 0x10000 + SIZEOF_HEADERS; 11 . = 0x10000 + SIZEOF_HEADERS;
12 .text 0xf0004000 : 12 .text 0xf0004000 :
13 { 13 {
14 _text = .;
14 *(.text) 15 *(.text)
15 SCHED_TEXT 16 SCHED_TEXT
16 LOCK_TEXT 17 LOCK_TEXT
diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S
index bd9de8c2a2aa..4a6063f33e7a 100644
--- a/arch/sparc64/kernel/vmlinux.lds.S
+++ b/arch/sparc64/kernel/vmlinux.lds.S
@@ -13,6 +13,7 @@ SECTIONS
13 . = 0x4000; 13 . = 0x4000;
14 .text 0x0000000000404000 : 14 .text 0x0000000000404000 :
15 { 15 {
16 _text = .;
16 *(.text) 17 *(.text)
17 SCHED_TEXT 18 SCHED_TEXT
18 LOCK_TEXT 19 LOCK_TEXT
diff --git a/arch/v850/kernel/vmlinux.lds.S b/arch/v850/kernel/vmlinux.lds.S
index 88d087f527c9..3a5fd07fe064 100644
--- a/arch/v850/kernel/vmlinux.lds.S
+++ b/arch/v850/kernel/vmlinux.lds.S
@@ -90,6 +90,7 @@
90 90
91/* Kernel text segment, and some constant data areas. */ 91/* Kernel text segment, and some constant data areas. */
92#define TEXT_CONTENTS \ 92#define TEXT_CONTENTS \
93 _text = .; \
93 __stext = . ; \ 94 __stext = . ; \
94 *(.text) \ 95 *(.text) \
95 SCHED_TEXT \ 96 SCHED_TEXT \
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 010d2265f1cf..bfbb9bcae123 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -122,7 +122,7 @@ endchoice
122 122
123choice 123choice
124 prompt "Processor family" 124 prompt "Processor family"
125 default MK8 125 default GENERIC_CPU
126 126
127config MK8 127config MK8
128 bool "AMD-Opteron/Athlon64" 128 bool "AMD-Opteron/Athlon64"
@@ -130,16 +130,31 @@ config MK8
130 Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs. 130 Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs.
131 131
132config MPSC 132config MPSC
133 bool "Intel EM64T" 133 bool "Intel P4 / older Netburst based Xeon"
134 help 134 help
135 Optimize for Intel Pentium 4 and Xeon CPUs with Intel 135 Optimize for Intel Pentium 4 and older Nocona/Dempsey Xeon CPUs
136 Extended Memory 64 Technology(EM64T). For details see 136 with Intel Extended Memory 64 Technology(EM64T). For details see
137 <http://www.intel.com/technology/64bitextensions/>. 137 <http://www.intel.com/technology/64bitextensions/>.
138 Note the the latest Xeons (Xeon 51xx and 53xx) are not based on the
139 Netburst core and shouldn't use this option. You can distingush them
140 using the cpu family field
141 in /proc/cpuinfo. Family 15 is a older Xeon, Family 6 a newer one
142 (this rule only applies to system that support EM64T)
143
144config MCORE2
145 bool "Intel Core2 / newer Xeon"
146 help
147 Optimize for Intel Core2 and newer Xeons (51xx)
148 You can distingush the newer Xeons from the older ones using
149 the cpu family field in /proc/cpuinfo. 15 is a older Xeon
150 (use CONFIG_MPSC then), 6 is a newer one. This rule only
151 applies to CPUs that support EM64T.
138 152
139config GENERIC_CPU 153config GENERIC_CPU
140 bool "Generic-x86-64" 154 bool "Generic-x86-64"
141 help 155 help
142 Generic x86-64 CPU. 156 Generic x86-64 CPU.
157 Run equally well on all x86-64 CPUs.
143 158
144endchoice 159endchoice
145 160
@@ -149,12 +164,12 @@ endchoice
149config X86_L1_CACHE_BYTES 164config X86_L1_CACHE_BYTES
150 int 165 int
151 default "128" if GENERIC_CPU || MPSC 166 default "128" if GENERIC_CPU || MPSC
152 default "64" if MK8 167 default "64" if MK8 || MCORE2
153 168
154config X86_L1_CACHE_SHIFT 169config X86_L1_CACHE_SHIFT
155 int 170 int
156 default "7" if GENERIC_CPU || MPSC 171 default "7" if GENERIC_CPU || MPSC
157 default "6" if MK8 172 default "6" if MK8 || MCORE2
158 173
159config X86_INTERNODE_CACHE_BYTES 174config X86_INTERNODE_CACHE_BYTES
160 int 175 int
@@ -344,11 +359,6 @@ config ARCH_DISCONTIGMEM_ENABLE
344 depends on NUMA 359 depends on NUMA
345 default y 360 default y
346 361
347
348config ARCH_DISCONTIGMEM_ENABLE
349 def_bool y
350 depends on NUMA
351
352config ARCH_DISCONTIGMEM_DEFAULT 362config ARCH_DISCONTIGMEM_DEFAULT
353 def_bool y 363 def_bool y
354 depends on NUMA 364 depends on NUMA
@@ -455,6 +465,17 @@ config CALGARY_IOMMU
455 Normally the kernel will make the right choice by itself. 465 Normally the kernel will make the right choice by itself.
456 If unsure, say Y. 466 If unsure, say Y.
457 467
468config CALGARY_IOMMU_ENABLED_BY_DEFAULT
469 bool "Should Calgary be enabled by default?"
470 default y
471 depends on CALGARY_IOMMU
472 help
473 Should Calgary be enabled by default? if you choose 'y', Calgary
474 will be used (if it exists). If you choose 'n', Calgary will not be
475 used even if it exists. If you choose 'n' and would like to use
476 Calgary anyway, pass 'iommu=calgary' on the kernel command line.
477 If unsure, say Y.
478
458# need this always selected by IOMMU for the VIA workaround 479# need this always selected by IOMMU for the VIA workaround
459config SWIOTLB 480config SWIOTLB
460 bool 481 bool
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 6e38d4daeed7..b471b8550d03 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -30,6 +30,10 @@ cflags-y :=
30cflags-kernel-y := 30cflags-kernel-y :=
31cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) 31cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
32cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) 32cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
33# gcc doesn't support -march=core2 yet as of gcc 4.3, but I hope it
34# will eventually. Use -mtune=generic as fallback
35cflags-$(CONFIG_MCORE2) += \
36 $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
33cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) 37cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
34 38
35cflags-y += -m64 39cflags-y += -m64
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 0f5d44e86be5..96f226cfb339 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.19-rc2-git4 3# Linux kernel version: 2.6.19-git7
4# Sat Oct 21 03:38:52 2006 4# Wed Dec 6 23:50:47 2006
5# 5#
6CONFIG_X86_64=y 6CONFIG_X86_64=y
7CONFIG_64BIT=y 7CONFIG_64BIT=y
@@ -47,13 +47,14 @@ CONFIG_POSIX_MQUEUE=y
47CONFIG_IKCONFIG=y 47CONFIG_IKCONFIG=y
48CONFIG_IKCONFIG_PROC=y 48CONFIG_IKCONFIG_PROC=y
49# CONFIG_CPUSETS is not set 49# CONFIG_CPUSETS is not set
50CONFIG_SYSFS_DEPRECATED=y
50# CONFIG_RELAY is not set 51# CONFIG_RELAY is not set
51CONFIG_INITRAMFS_SOURCE="" 52CONFIG_INITRAMFS_SOURCE=""
52CONFIG_CC_OPTIMIZE_FOR_SIZE=y 53CONFIG_CC_OPTIMIZE_FOR_SIZE=y
53CONFIG_SYSCTL=y 54CONFIG_SYSCTL=y
54# CONFIG_EMBEDDED is not set 55# CONFIG_EMBEDDED is not set
55CONFIG_UID16=y 56CONFIG_UID16=y
56# CONFIG_SYSCTL_SYSCALL is not set 57CONFIG_SYSCTL_SYSCALL=y
57CONFIG_KALLSYMS=y 58CONFIG_KALLSYMS=y
58CONFIG_KALLSYMS_ALL=y 59CONFIG_KALLSYMS_ALL=y
59# CONFIG_KALLSYMS_EXTRA_PASS is not set 60# CONFIG_KALLSYMS_EXTRA_PASS is not set
@@ -87,9 +88,7 @@ CONFIG_STOP_MACHINE=y
87# Block layer 88# Block layer
88# 89#
89CONFIG_BLOCK=y 90CONFIG_BLOCK=y
90CONFIG_LBD=y
91# CONFIG_BLK_DEV_IO_TRACE is not set 91# CONFIG_BLK_DEV_IO_TRACE is not set
92# CONFIG_LSF is not set
93 92
94# 93#
95# IO Schedulers 94# IO Schedulers
@@ -111,10 +110,11 @@ CONFIG_X86_PC=y
111# CONFIG_X86_VSMP is not set 110# CONFIG_X86_VSMP is not set
112# CONFIG_MK8 is not set 111# CONFIG_MK8 is not set
113# CONFIG_MPSC is not set 112# CONFIG_MPSC is not set
114CONFIG_GENERIC_CPU=y 113CONFIG_MCORE2=y
115CONFIG_X86_L1_CACHE_BYTES=128 114# CONFIG_GENERIC_CPU is not set
116CONFIG_X86_L1_CACHE_SHIFT=7 115CONFIG_X86_L1_CACHE_BYTES=64
117CONFIG_X86_INTERNODE_CACHE_BYTES=128 116CONFIG_X86_L1_CACHE_SHIFT=6
117CONFIG_X86_INTERNODE_CACHE_BYTES=64
118CONFIG_X86_TSC=y 118CONFIG_X86_TSC=y
119CONFIG_X86_GOOD_APIC=y 119CONFIG_X86_GOOD_APIC=y
120# CONFIG_MICROCODE is not set 120# CONFIG_MICROCODE is not set
@@ -322,6 +322,7 @@ CONFIG_INET_TCP_DIAG=y
322# CONFIG_TCP_CONG_ADVANCED is not set 322# CONFIG_TCP_CONG_ADVANCED is not set
323CONFIG_TCP_CONG_CUBIC=y 323CONFIG_TCP_CONG_CUBIC=y
324CONFIG_DEFAULT_TCP_CONG="cubic" 324CONFIG_DEFAULT_TCP_CONG="cubic"
325# CONFIG_TCP_MD5SIG is not set
325CONFIG_IPV6=y 326CONFIG_IPV6=y
326# CONFIG_IPV6_PRIVACY is not set 327# CONFIG_IPV6_PRIVACY is not set
327# CONFIG_IPV6_ROUTER_PREF is not set 328# CONFIG_IPV6_ROUTER_PREF is not set
@@ -624,6 +625,7 @@ CONFIG_SATA_INTEL_COMBINED=y
624# CONFIG_PATA_IT821X is not set 625# CONFIG_PATA_IT821X is not set
625# CONFIG_PATA_JMICRON is not set 626# CONFIG_PATA_JMICRON is not set
626# CONFIG_PATA_TRIFLEX is not set 627# CONFIG_PATA_TRIFLEX is not set
628# CONFIG_PATA_MARVELL is not set
627# CONFIG_PATA_MPIIX is not set 629# CONFIG_PATA_MPIIX is not set
628# CONFIG_PATA_OLDPIIX is not set 630# CONFIG_PATA_OLDPIIX is not set
629# CONFIG_PATA_NETCELL is not set 631# CONFIG_PATA_NETCELL is not set
@@ -795,6 +797,7 @@ CONFIG_BNX2=y
795CONFIG_S2IO=m 797CONFIG_S2IO=m
796# CONFIG_S2IO_NAPI is not set 798# CONFIG_S2IO_NAPI is not set
797# CONFIG_MYRI10GE is not set 799# CONFIG_MYRI10GE is not set
800# CONFIG_NETXEN_NIC is not set
798 801
799# 802#
800# Token Ring devices 803# Token Ring devices
@@ -927,10 +930,6 @@ CONFIG_RTC=y
927# CONFIG_DTLK is not set 930# CONFIG_DTLK is not set
928# CONFIG_R3964 is not set 931# CONFIG_R3964 is not set
929# CONFIG_APPLICOM is not set 932# CONFIG_APPLICOM is not set
930
931#
932# Ftape, the floppy tape device driver
933#
934CONFIG_AGP=y 933CONFIG_AGP=y
935CONFIG_AGP_AMD64=y 934CONFIG_AGP_AMD64=y
936CONFIG_AGP_INTEL=y 935CONFIG_AGP_INTEL=y
@@ -1135,6 +1134,7 @@ CONFIG_USB_DEVICEFS=y
1135# CONFIG_USB_BANDWIDTH is not set 1134# CONFIG_USB_BANDWIDTH is not set
1136# CONFIG_USB_DYNAMIC_MINORS is not set 1135# CONFIG_USB_DYNAMIC_MINORS is not set
1137# CONFIG_USB_SUSPEND is not set 1136# CONFIG_USB_SUSPEND is not set
1137# CONFIG_USB_MULTITHREAD_PROBE is not set
1138# CONFIG_USB_OTG is not set 1138# CONFIG_USB_OTG is not set
1139 1139
1140# 1140#
@@ -1212,6 +1212,7 @@ CONFIG_USB_HIDINPUT=y
1212# CONFIG_USB_KAWETH is not set 1212# CONFIG_USB_KAWETH is not set
1213# CONFIG_USB_PEGASUS is not set 1213# CONFIG_USB_PEGASUS is not set
1214# CONFIG_USB_RTL8150 is not set 1214# CONFIG_USB_RTL8150 is not set
1215# CONFIG_USB_USBNET_MII is not set
1215# CONFIG_USB_USBNET is not set 1216# CONFIG_USB_USBNET is not set
1216CONFIG_USB_MON=y 1217CONFIG_USB_MON=y
1217 1218
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index 0e0a266d976f..ff499ef2a1ba 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -584,6 +584,11 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
584 regs->rdx = (unsigned long) &frame->info; 584 regs->rdx = (unsigned long) &frame->info;
585 regs->rcx = (unsigned long) &frame->uc; 585 regs->rcx = (unsigned long) &frame->uc;
586 586
587 /* Make -mregparm=3 work */
588 regs->rax = sig;
589 regs->rdx = (unsigned long) &frame->info;
590 regs->rcx = (unsigned long) &frame->uc;
591
587 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 592 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
588 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 593 asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
589 594
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 4d9d5ed942b2..124b2d27b4ac 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -25,6 +25,7 @@
25#include <linux/kernel_stat.h> 25#include <linux/kernel_stat.h>
26#include <linux/sysdev.h> 26#include <linux/sysdev.h>
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/ioport.h>
28 29
29#include <asm/atomic.h> 30#include <asm/atomic.h>
30#include <asm/smp.h> 31#include <asm/smp.h>
@@ -45,6 +46,12 @@ int apic_calibrate_pmtmr __initdata;
45 46
46int disable_apic_timer __initdata; 47int disable_apic_timer __initdata;
47 48
49static struct resource *ioapic_resources;
50static struct resource lapic_resource = {
51 .name = "Local APIC",
52 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
53};
54
48/* 55/*
49 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as 56 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
50 * IPIs in place of local APIC timers 57 * IPIs in place of local APIC timers
@@ -133,7 +140,6 @@ void clear_local_APIC(void)
133 apic_write(APIC_LVTERR, APIC_LVT_MASKED); 140 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
134 if (maxlvt >= 4) 141 if (maxlvt >= 4)
135 apic_write(APIC_LVTPC, APIC_LVT_MASKED); 142 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
136 v = GET_APIC_VERSION(apic_read(APIC_LVR));
137 apic_write(APIC_ESR, 0); 143 apic_write(APIC_ESR, 0);
138 apic_read(APIC_ESR); 144 apic_read(APIC_ESR);
139} 145}
@@ -452,23 +458,30 @@ static struct {
452static int lapic_suspend(struct sys_device *dev, pm_message_t state) 458static int lapic_suspend(struct sys_device *dev, pm_message_t state)
453{ 459{
454 unsigned long flags; 460 unsigned long flags;
461 int maxlvt;
455 462
456 if (!apic_pm_state.active) 463 if (!apic_pm_state.active)
457 return 0; 464 return 0;
458 465
466 maxlvt = get_maxlvt();
467
459 apic_pm_state.apic_id = apic_read(APIC_ID); 468 apic_pm_state.apic_id = apic_read(APIC_ID);
460 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); 469 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
461 apic_pm_state.apic_ldr = apic_read(APIC_LDR); 470 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
462 apic_pm_state.apic_dfr = apic_read(APIC_DFR); 471 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
463 apic_pm_state.apic_spiv = apic_read(APIC_SPIV); 472 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
464 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); 473 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
465 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); 474 if (maxlvt >= 4)
475 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
466 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); 476 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
467 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); 477 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
468 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 478 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
469 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 479 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
470 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 480 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
471 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 481#ifdef CONFIG_X86_MCE_INTEL
482 if (maxlvt >= 5)
483 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
484#endif
472 local_irq_save(flags); 485 local_irq_save(flags);
473 disable_local_APIC(); 486 disable_local_APIC();
474 local_irq_restore(flags); 487 local_irq_restore(flags);
@@ -479,10 +492,13 @@ static int lapic_resume(struct sys_device *dev)
479{ 492{
480 unsigned int l, h; 493 unsigned int l, h;
481 unsigned long flags; 494 unsigned long flags;
495 int maxlvt;
482 496
483 if (!apic_pm_state.active) 497 if (!apic_pm_state.active)
484 return 0; 498 return 0;
485 499
500 maxlvt = get_maxlvt();
501
486 local_irq_save(flags); 502 local_irq_save(flags);
487 rdmsr(MSR_IA32_APICBASE, l, h); 503 rdmsr(MSR_IA32_APICBASE, l, h);
488 l &= ~MSR_IA32_APICBASE_BASE; 504 l &= ~MSR_IA32_APICBASE_BASE;
@@ -496,8 +512,12 @@ static int lapic_resume(struct sys_device *dev)
496 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 512 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
497 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 513 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
498 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 514 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
499 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 515#ifdef CONFIG_X86_MCE_INTEL
500 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); 516 if (maxlvt >= 5)
517 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
518#endif
519 if (maxlvt >= 4)
520 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
501 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); 521 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
502 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); 522 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
503 apic_write(APIC_TMICT, apic_pm_state.apic_tmict); 523 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
@@ -585,6 +605,64 @@ static int __init detect_init_APIC (void)
585 return 0; 605 return 0;
586} 606}
587 607
608#ifdef CONFIG_X86_IO_APIC
609static struct resource * __init ioapic_setup_resources(void)
610{
611#define IOAPIC_RESOURCE_NAME_SIZE 11
612 unsigned long n;
613 struct resource *res;
614 char *mem;
615 int i;
616
617 if (nr_ioapics <= 0)
618 return NULL;
619
620 n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
621 n *= nr_ioapics;
622
623 mem = alloc_bootmem(n);
624 res = (void *)mem;
625
626 if (mem != NULL) {
627 memset(mem, 0, n);
628 mem += sizeof(struct resource) * nr_ioapics;
629
630 for (i = 0; i < nr_ioapics; i++) {
631 res[i].name = mem;
632 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
633 sprintf(mem, "IOAPIC %u", i);
634 mem += IOAPIC_RESOURCE_NAME_SIZE;
635 }
636 }
637
638 ioapic_resources = res;
639
640 return res;
641}
642
643static int __init ioapic_insert_resources(void)
644{
645 int i;
646 struct resource *r = ioapic_resources;
647
648 if (!r) {
649 printk("IO APIC resources could be not be allocated.\n");
650 return -1;
651 }
652
653 for (i = 0; i < nr_ioapics; i++) {
654 insert_resource(&iomem_resource, r);
655 r++;
656 }
657
658 return 0;
659}
660
661/* Insert the IO APIC resources after PCI initialization has occured to handle
662 * IO APICS that are mapped in on a BAR in PCI space. */
663late_initcall(ioapic_insert_resources);
664#endif
665
588void __init init_apic_mappings(void) 666void __init init_apic_mappings(void)
589{ 667{
590 unsigned long apic_phys; 668 unsigned long apic_phys;
@@ -604,6 +682,11 @@ void __init init_apic_mappings(void)
604 apic_mapped = 1; 682 apic_mapped = 1;
605 apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); 683 apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
606 684
685 /* Put local APIC into the resource map. */
686 lapic_resource.start = apic_phys;
687 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
688 insert_resource(&iomem_resource, &lapic_resource);
689
607 /* 690 /*
608 * Fetch the APIC ID of the BSP in case we have a 691 * Fetch the APIC ID of the BSP in case we have a
609 * default configuration (or the MP table is broken). 692 * default configuration (or the MP table is broken).
@@ -613,7 +696,9 @@ void __init init_apic_mappings(void)
613 { 696 {
614 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; 697 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
615 int i; 698 int i;
699 struct resource *ioapic_res;
616 700
701 ioapic_res = ioapic_setup_resources();
617 for (i = 0; i < nr_ioapics; i++) { 702 for (i = 0; i < nr_ioapics; i++) {
618 if (smp_found_config) { 703 if (smp_found_config) {
619 ioapic_phys = mp_ioapics[i].mpc_apicaddr; 704 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
@@ -625,6 +710,12 @@ void __init init_apic_mappings(void)
625 apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", 710 apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
626 __fix_to_virt(idx), ioapic_phys); 711 __fix_to_virt(idx), ioapic_phys);
627 idx++; 712 idx++;
713
714 if (ioapic_res != NULL) {
715 ioapic_res->start = ioapic_phys;
716 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
717 ioapic_res++;
718 }
628 } 719 }
629 } 720 }
630} 721}
@@ -644,10 +735,9 @@ void __init init_apic_mappings(void)
644 735
645static void __setup_APIC_LVTT(unsigned int clocks) 736static void __setup_APIC_LVTT(unsigned int clocks)
646{ 737{
647 unsigned int lvtt_value, tmp_value, ver; 738 unsigned int lvtt_value, tmp_value;
648 int cpu = smp_processor_id(); 739 int cpu = smp_processor_id();
649 740
650 ver = GET_APIC_VERSION(apic_read(APIC_LVR));
651 lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; 741 lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
652 742
653 if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) 743 if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask))
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
index 68273bff58cc..829698f6d049 100644
--- a/arch/x86_64/kernel/early-quirks.c
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -69,11 +69,18 @@ static void nvidia_bugs(void)
69 69
70static void ati_bugs(void) 70static void ati_bugs(void)
71{ 71{
72 if (timer_over_8254 == 1) { 72}
73 timer_over_8254 = 0; 73
74 printk(KERN_INFO 74static void intel_bugs(void)
75 "ATI board detected. Disabling timer routing over 8254.\n"); 75{
76 } 76 u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
77
78#ifdef CONFIG_SMP
79 if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
80 device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
81 device == PCI_DEVICE_ID_INTEL_E7525_MCH)
82 quirk_intel_irqbalance();
83#endif
77} 84}
78 85
79struct chipset { 86struct chipset {
@@ -85,6 +92,7 @@ static struct chipset early_qrk[] = {
85 { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, 92 { PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
86 { PCI_VENDOR_ID_VIA, via_bugs }, 93 { PCI_VENDOR_ID_VIA, via_bugs },
87 { PCI_VENDOR_ID_ATI, ati_bugs }, 94 { PCI_VENDOR_ID_ATI, ati_bugs },
95 { PCI_VENDOR_ID_INTEL, intel_bugs},
88 {} 96 {}
89}; 97};
90 98
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 7d401b00d822..601d332c4b79 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -230,7 +230,6 @@ ENTRY(system_call)
230 CFI_REL_OFFSET rip,RIP-ARGOFFSET 230 CFI_REL_OFFSET rip,RIP-ARGOFFSET
231 GET_THREAD_INFO(%rcx) 231 GET_THREAD_INFO(%rcx)
232 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) 232 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
233 CFI_REMEMBER_STATE
234 jnz tracesys 233 jnz tracesys
235 cmpq $__NR_syscall_max,%rax 234 cmpq $__NR_syscall_max,%rax
236 ja badsys 235 ja badsys
@@ -241,7 +240,6 @@ ENTRY(system_call)
241 * Syscall return path ending with SYSRET (fast path) 240 * Syscall return path ending with SYSRET (fast path)
242 * Has incomplete stack frame and undefined top of stack. 241 * Has incomplete stack frame and undefined top of stack.
243 */ 242 */
244 .globl ret_from_sys_call
245ret_from_sys_call: 243ret_from_sys_call:
246 movl $_TIF_ALLWORK_MASK,%edi 244 movl $_TIF_ALLWORK_MASK,%edi
247 /* edi: flagmask */ 245 /* edi: flagmask */
@@ -251,8 +249,8 @@ sysret_check:
251 TRACE_IRQS_OFF 249 TRACE_IRQS_OFF
252 movl threadinfo_flags(%rcx),%edx 250 movl threadinfo_flags(%rcx),%edx
253 andl %edi,%edx 251 andl %edi,%edx
254 CFI_REMEMBER_STATE
255 jnz sysret_careful 252 jnz sysret_careful
253 CFI_REMEMBER_STATE
256 /* 254 /*
257 * sysretq will re-enable interrupts: 255 * sysretq will re-enable interrupts:
258 */ 256 */
@@ -265,10 +263,10 @@ sysret_check:
265 swapgs 263 swapgs
266 sysretq 264 sysretq
267 265
266 CFI_RESTORE_STATE
268 /* Handle reschedules */ 267 /* Handle reschedules */
269 /* edx: work, edi: workmask */ 268 /* edx: work, edi: workmask */
270sysret_careful: 269sysret_careful:
271 CFI_RESTORE_STATE
272 bt $TIF_NEED_RESCHED,%edx 270 bt $TIF_NEED_RESCHED,%edx
273 jnc sysret_signal 271 jnc sysret_signal
274 TRACE_IRQS_ON 272 TRACE_IRQS_ON
@@ -306,7 +304,6 @@ badsys:
306 304
307 /* Do syscall tracing */ 305 /* Do syscall tracing */
308tracesys: 306tracesys:
309 CFI_RESTORE_STATE
310 SAVE_REST 307 SAVE_REST
311 movq $-ENOSYS,RAX(%rsp) 308 movq $-ENOSYS,RAX(%rsp)
312 FIXUP_TOP_OF_STACK %rdi 309 FIXUP_TOP_OF_STACK %rdi
@@ -322,32 +319,13 @@ tracesys:
322 call *sys_call_table(,%rax,8) 319 call *sys_call_table(,%rax,8)
3231: movq %rax,RAX-ARGOFFSET(%rsp) 3201: movq %rax,RAX-ARGOFFSET(%rsp)
324 /* Use IRET because user could have changed frame */ 321 /* Use IRET because user could have changed frame */
325 jmp int_ret_from_sys_call
326 CFI_ENDPROC
327END(system_call)
328 322
329/* 323/*
330 * Syscall return path ending with IRET. 324 * Syscall return path ending with IRET.
331 * Has correct top of stack, but partial stack frame. 325 * Has correct top of stack, but partial stack frame.
332 */ 326 */
333ENTRY(int_ret_from_sys_call) 327 .globl int_ret_from_sys_call
334 CFI_STARTPROC simple 328int_ret_from_sys_call:
335 CFI_SIGNAL_FRAME
336 CFI_DEF_CFA rsp,SS+8-ARGOFFSET
337 /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
338 CFI_REL_OFFSET rsp,RSP-ARGOFFSET
339 /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
340 /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/
341 CFI_REL_OFFSET rip,RIP-ARGOFFSET
342 CFI_REL_OFFSET rdx,RDX-ARGOFFSET
343 CFI_REL_OFFSET rcx,RCX-ARGOFFSET
344 CFI_REL_OFFSET rax,RAX-ARGOFFSET
345 CFI_REL_OFFSET rdi,RDI-ARGOFFSET
346 CFI_REL_OFFSET rsi,RSI-ARGOFFSET
347 CFI_REL_OFFSET r8,R8-ARGOFFSET
348 CFI_REL_OFFSET r9,R9-ARGOFFSET
349 CFI_REL_OFFSET r10,R10-ARGOFFSET
350 CFI_REL_OFFSET r11,R11-ARGOFFSET
351 cli 329 cli
352 TRACE_IRQS_OFF 330 TRACE_IRQS_OFF
353 testl $3,CS-ARGOFFSET(%rsp) 331 testl $3,CS-ARGOFFSET(%rsp)
@@ -394,8 +372,6 @@ int_very_careful:
394 popq %rdi 372 popq %rdi
395 CFI_ADJUST_CFA_OFFSET -8 373 CFI_ADJUST_CFA_OFFSET -8
396 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi 374 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
397 cli
398 TRACE_IRQS_OFF
399 jmp int_restore_rest 375 jmp int_restore_rest
400 376
401int_signal: 377int_signal:
@@ -411,7 +387,7 @@ int_restore_rest:
411 TRACE_IRQS_OFF 387 TRACE_IRQS_OFF
412 jmp int_with_check 388 jmp int_with_check
413 CFI_ENDPROC 389 CFI_ENDPROC
414END(int_ret_from_sys_call) 390END(system_call)
415 391
416/* 392/*
417 * Certain special system calls that need to save a complete full stack frame. 393 * Certain special system calls that need to save a complete full stack frame.
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index 8e78a75d1866..b007433f96bb 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -33,7 +33,7 @@ extern struct genapic apic_flat;
33extern struct genapic apic_physflat; 33extern struct genapic apic_physflat;
34 34
35struct genapic *genapic = &apic_flat; 35struct genapic *genapic = &apic_flat;
36 36struct genapic *genapic_force;
37 37
38/* 38/*
39 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 39 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
@@ -46,6 +46,13 @@ void __init clustered_apic_check(void)
46 u8 cluster_cnt[NUM_APIC_CLUSTERS]; 46 u8 cluster_cnt[NUM_APIC_CLUSTERS];
47 int max_apic = 0; 47 int max_apic = 0;
48 48
49 /* genapic selection can be forced because of certain quirks.
50 */
51 if (genapic_force) {
52 genapic = genapic_force;
53 goto print;
54 }
55
49#if defined(CONFIG_ACPI) 56#if defined(CONFIG_ACPI)
50 /* 57 /*
51 * Some x86_64 machines use physical APIC mode regardless of how many 58 * Some x86_64 machines use physical APIC mode regardless of how many
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 9561eb3c5b5c..cc230b93cd1c 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -57,10 +57,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
57{ 57{
58 int i; 58 int i;
59 59
60 for (i = 0; i < 256; i++) 60 /* clear bss before set_intr_gate with early_idt_handler */
61 clear_bss();
62
63 for (i = 0; i < IDT_ENTRIES; i++)
61 set_intr_gate(i, early_idt_handler); 64 set_intr_gate(i, early_idt_handler);
62 asm volatile("lidt %0" :: "m" (idt_descr)); 65 asm volatile("lidt %0" :: "m" (idt_descr));
63 clear_bss();
64 66
65 early_printk("Kernel alive\n"); 67 early_printk("Kernel alive\n");
66 68
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
index 3aa1e9bb781d..1d58c13bc6bc 100644
--- a/arch/x86_64/kernel/i387.c
+++ b/arch/x86_64/kernel/i387.c
@@ -82,11 +82,8 @@ int save_i387(struct _fpstate __user *buf)
82 struct task_struct *tsk = current; 82 struct task_struct *tsk = current;
83 int err = 0; 83 int err = 0;
84 84
85 { 85 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
86 extern void bad_user_i387_struct(void); 86 sizeof(tsk->thread.i387.fxsave));
87 if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave))
88 bad_user_i387_struct();
89 }
90 87
91 if ((unsigned long)buf % 16) 88 if ((unsigned long)buf % 16)
92 printk("save_i387: bad fpstate %p\n",buf); 89 printk("save_i387: bad fpstate %p\n",buf);
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index c4ef801b765b..d73c79e821f1 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -76,7 +76,8 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
76 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ 76 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
77 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) 77 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
78 78
79void (*interrupt[NR_IRQS])(void) = { 79/* for the irq vectors */
80static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
80 IRQLIST_16(0x2), IRQLIST_16(0x3), 81 IRQLIST_16(0x2), IRQLIST_16(0x3),
81 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), 82 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
82 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), 83 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index c80081a6ba41..2a1dcd5f69c2 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -55,10 +55,6 @@ int sis_apic_bug; /* not actually supported, dummy for compile */
55 55
56static int no_timer_check; 56static int no_timer_check;
57 57
58static int disable_timer_pin_1 __initdata;
59
60int timer_over_8254 __initdata = 1;
61
62/* Where if anywhere is the i8259 connect in external int mode */ 58/* Where if anywhere is the i8259 connect in external int mode */
63static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; 59static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
64 60
@@ -178,14 +174,20 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
178 * the interrupt, and we need to make sure the entry is fully populated 174 * the interrupt, and we need to make sure the entry is fully populated
179 * before that happens. 175 * before that happens.
180 */ 176 */
181static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 177static void
178__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
182{ 179{
183 unsigned long flags;
184 union entry_union eu; 180 union entry_union eu;
185 eu.entry = e; 181 eu.entry = e;
186 spin_lock_irqsave(&ioapic_lock, flags);
187 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 182 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
188 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 183 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
184}
185
186static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
187{
188 unsigned long flags;
189 spin_lock_irqsave(&ioapic_lock, flags);
190 __ioapic_write_entry(apic, pin, e);
189 spin_unlock_irqrestore(&ioapic_lock, flags); 191 spin_unlock_irqrestore(&ioapic_lock, flags);
190} 192}
191 193
@@ -348,29 +350,6 @@ static int __init disable_ioapic_setup(char *str)
348} 350}
349early_param("noapic", disable_ioapic_setup); 351early_param("noapic", disable_ioapic_setup);
350 352
351/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
352static int __init disable_timer_pin_setup(char *arg)
353{
354 disable_timer_pin_1 = 1;
355 return 1;
356}
357__setup("disable_timer_pin_1", disable_timer_pin_setup);
358
359static int __init setup_disable_8254_timer(char *s)
360{
361 timer_over_8254 = -1;
362 return 1;
363}
364static int __init setup_enable_8254_timer(char *s)
365{
366 timer_over_8254 = 2;
367 return 1;
368}
369
370__setup("disable_8254_timer", setup_disable_8254_timer);
371__setup("enable_8254_timer", setup_enable_8254_timer);
372
373
374/* 353/*
375 * Find the IRQ entry number of a certain pin. 354 * Find the IRQ entry number of a certain pin.
376 */ 355 */
@@ -750,6 +729,22 @@ static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
750 return vector; 729 return vector;
751} 730}
752 731
732static void __clear_irq_vector(int irq)
733{
734 cpumask_t mask;
735 int cpu, vector;
736
737 BUG_ON(!irq_vector[irq]);
738
739 vector = irq_vector[irq];
740 cpus_and(mask, irq_domain[irq], cpu_online_map);
741 for_each_cpu_mask(cpu, mask)
742 per_cpu(vector_irq, cpu)[vector] = -1;
743
744 irq_vector[irq] = 0;
745 irq_domain[irq] = CPU_MASK_NONE;
746}
747
753void __setup_vector_irq(int cpu) 748void __setup_vector_irq(int cpu)
754{ 749{
755 /* Initialize vector_irq on a new cpu */ 750 /* Initialize vector_irq on a new cpu */
@@ -794,27 +789,65 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
794 handle_edge_irq, "edge"); 789 handle_edge_irq, "edge");
795 } 790 }
796} 791}
797 792static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
798static void __init setup_IO_APIC_irqs(void)
799{ 793{
800 struct IO_APIC_route_entry entry; 794 struct IO_APIC_route_entry entry;
801 int apic, pin, idx, irq, first_notcon = 1, vector; 795 int vector;
802 unsigned long flags; 796 unsigned long flags;
803 797
804 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
805 798
806 for (apic = 0; apic < nr_ioapics; apic++) { 799 /*
807 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 800 * add it to the IO-APIC irq-routing table:
801 */
802 memset(&entry,0,sizeof(entry));
808 803
809 /* 804 entry.delivery_mode = INT_DELIVERY_MODE;
810 * add it to the IO-APIC irq-routing table: 805 entry.dest_mode = INT_DEST_MODE;
811 */ 806 entry.mask = 0; /* enable IRQ */
812 memset(&entry,0,sizeof(entry)); 807 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
808
809 entry.trigger = irq_trigger(idx);
810 entry.polarity = irq_polarity(idx);
813 811
814 entry.delivery_mode = INT_DELIVERY_MODE; 812 if (irq_trigger(idx)) {
815 entry.dest_mode = INT_DEST_MODE; 813 entry.trigger = 1;
816 entry.mask = 0; /* enable IRQ */ 814 entry.mask = 1;
817 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); 815 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
816 }
817
818 if (!apic && !IO_APIC_IRQ(irq))
819 return;
820
821 if (IO_APIC_IRQ(irq)) {
822 cpumask_t mask;
823 vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
824 if (vector < 0)
825 return;
826
827 entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
828 entry.vector = vector;
829
830 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
831 if (!apic && (irq < 16))
832 disable_8259A_irq(irq);
833 }
834
835 ioapic_write_entry(apic, pin, entry);
836
837 spin_lock_irqsave(&ioapic_lock, flags);
838 set_native_irq_info(irq, TARGET_CPUS);
839 spin_unlock_irqrestore(&ioapic_lock, flags);
840
841}
842
843static void __init setup_IO_APIC_irqs(void)
844{
845 int apic, pin, idx, irq, first_notcon = 1;
846
847 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
848
849 for (apic = 0; apic < nr_ioapics; apic++) {
850 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
818 851
819 idx = find_irq_entry(apic,pin,mp_INT); 852 idx = find_irq_entry(apic,pin,mp_INT);
820 if (idx == -1) { 853 if (idx == -1) {
@@ -826,39 +859,11 @@ static void __init setup_IO_APIC_irqs(void)
826 continue; 859 continue;
827 } 860 }
828 861
829 entry.trigger = irq_trigger(idx);
830 entry.polarity = irq_polarity(idx);
831
832 if (irq_trigger(idx)) {
833 entry.trigger = 1;
834 entry.mask = 1;
835 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
836 }
837
838 irq = pin_2_irq(idx, apic, pin); 862 irq = pin_2_irq(idx, apic, pin);
839 add_pin_to_irq(irq, apic, pin); 863 add_pin_to_irq(irq, apic, pin);
840 864
841 if (!apic && !IO_APIC_IRQ(irq)) 865 setup_IO_APIC_irq(apic, pin, idx, irq);
842 continue;
843
844 if (IO_APIC_IRQ(irq)) {
845 cpumask_t mask;
846 vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
847 if (vector < 0)
848 continue;
849
850 entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
851 entry.vector = vector;
852
853 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
854 if (!apic && (irq < 16))
855 disable_8259A_irq(irq);
856 }
857 ioapic_write_entry(apic, pin, entry);
858 866
859 spin_lock_irqsave(&ioapic_lock, flags);
860 set_native_irq_info(irq, TARGET_CPUS);
861 spin_unlock_irqrestore(&ioapic_lock, flags);
862 } 867 }
863 } 868 }
864 869
@@ -1563,10 +1568,33 @@ static inline void unlock_ExtINT_logic(void)
1563 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ 1568 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
1564 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast 1569 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
1565 * fanatically on his truly buggy board. 1570 * fanatically on his truly buggy board.
1566 *
1567 * FIXME: really need to revamp this for modern platforms only.
1568 */ 1571 */
1569static inline void check_timer(void) 1572
1573static int try_apic_pin(int apic, int pin, char *msg)
1574{
1575 apic_printk(APIC_VERBOSE, KERN_INFO
1576 "..TIMER: trying IO-APIC=%d PIN=%d %s",
1577 apic, pin, msg);
1578
1579 /*
1580 * Ok, does IRQ0 through the IOAPIC work?
1581 */
1582 if (!no_timer_check && timer_irq_works()) {
1583 nmi_watchdog_default();
1584 if (nmi_watchdog == NMI_IO_APIC) {
1585 disable_8259A_irq(0);
1586 setup_nmi();
1587 enable_8259A_irq(0);
1588 }
1589 return 1;
1590 }
1591 clear_IO_APIC_pin(apic, pin);
1592 apic_printk(APIC_QUIET, KERN_ERR " .. failed\n");
1593 return 0;
1594}
1595
1596/* The function from hell */
1597static void check_timer(void)
1570{ 1598{
1571 int apic1, pin1, apic2, pin2; 1599 int apic1, pin1, apic2, pin2;
1572 int vector; 1600 int vector;
@@ -1587,61 +1615,43 @@ static inline void check_timer(void)
1587 */ 1615 */
1588 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 1616 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1589 init_8259A(1); 1617 init_8259A(1);
1590 if (timer_over_8254 > 0)
1591 enable_8259A_irq(0);
1592 1618
1593 pin1 = find_isa_irq_pin(0, mp_INT); 1619 pin1 = find_isa_irq_pin(0, mp_INT);
1594 apic1 = find_isa_irq_apic(0, mp_INT); 1620 apic1 = find_isa_irq_apic(0, mp_INT);
1595 pin2 = ioapic_i8259.pin; 1621 pin2 = ioapic_i8259.pin;
1596 apic2 = ioapic_i8259.apic; 1622 apic2 = ioapic_i8259.apic;
1597 1623
1598 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 1624 /* Do this first, otherwise we get double interrupts on ATI boards */
1599 vector, apic1, pin1, apic2, pin2); 1625 if ((pin1 != -1) && try_apic_pin(apic1, pin1,"with 8259 IRQ0 disabled"))
1626 return;
1600 1627
1601 if (pin1 != -1) { 1628 /* Now try again with IRQ0 8259A enabled.
1602 /* 1629 Assumes timer is on IO-APIC 0 ?!? */
1603 * Ok, does IRQ0 through the IOAPIC work? 1630 enable_8259A_irq(0);
1604 */ 1631 unmask_IO_APIC_irq(0);
1605 unmask_IO_APIC_irq(0); 1632 if (try_apic_pin(apic1, pin1, "with 8259 IRQ0 enabled"))
1606 if (!no_timer_check && timer_irq_works()) { 1633 return;
1607 nmi_watchdog_default(); 1634 disable_8259A_irq(0);
1608 if (nmi_watchdog == NMI_IO_APIC) { 1635
1609 disable_8259A_irq(0); 1636 /* Always try pin0 and pin2 on APIC 0 to handle buggy timer overrides
1610 setup_nmi(); 1637 on Nvidia boards */
1611 enable_8259A_irq(0); 1638 if (!(apic1 == 0 && pin1 == 0) &&
1612 } 1639 try_apic_pin(0, 0, "fallback with 8259 IRQ0 disabled"))
1613 if (disable_timer_pin_1 > 0) 1640 return;
1614 clear_IO_APIC_pin(0, pin1); 1641 if (!(apic1 == 0 && pin1 == 2) &&
1615 return; 1642 try_apic_pin(0, 2, "fallback with 8259 IRQ0 disabled"))
1616 } 1643 return;
1617 clear_IO_APIC_pin(apic1, pin1);
1618 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
1619 "connected to IO-APIC\n");
1620 }
1621 1644
1622 apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) " 1645 /* Then try pure 8259A routing on the 8259 as reported by BIOS*/
1623 "through the 8259A ... "); 1646 enable_8259A_irq(0);
1624 if (pin2 != -1) { 1647 if (pin2 != -1) {
1625 apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
1626 apic2, pin2);
1627 /*
1628 * legacy devices should be connected to IO APIC #0
1629 */
1630 setup_ExtINT_IRQ0_pin(apic2, pin2, vector); 1648 setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
1631 if (timer_irq_works()) { 1649 if (try_apic_pin(apic2,pin2,"8259A broadcast ExtINT from BIOS"))
1632 apic_printk(APIC_VERBOSE," works.\n");
1633 nmi_watchdog_default();
1634 if (nmi_watchdog == NMI_IO_APIC) {
1635 setup_nmi();
1636 }
1637 return; 1650 return;
1638 }
1639 /*
1640 * Cleanup, just in case ...
1641 */
1642 clear_IO_APIC_pin(apic2, pin2);
1643 } 1651 }
1644 apic_printk(APIC_VERBOSE," failed.\n"); 1652
1653 /* Tried all possibilities to go through the IO-APIC. Now come the
1654 really cheesy fallbacks. */
1645 1655
1646 if (nmi_watchdog == NMI_IO_APIC) { 1656 if (nmi_watchdog == NMI_IO_APIC) {
1647 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 1657 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
@@ -1837,7 +1847,7 @@ void destroy_irq(unsigned int irq)
1837 dynamic_irq_cleanup(irq); 1847 dynamic_irq_cleanup(irq);
1838 1848
1839 spin_lock_irqsave(&vector_lock, flags); 1849 spin_lock_irqsave(&vector_lock, flags);
1840 irq_vector[irq] = 0; 1850 __clear_irq_vector(irq);
1841 spin_unlock_irqrestore(&vector_lock, flags); 1851 spin_unlock_irqrestore(&vector_lock, flags);
1842} 1852}
1843 1853
@@ -2139,7 +2149,15 @@ void __init setup_ioapic_dest(void)
2139 if (irq_entry == -1) 2149 if (irq_entry == -1)
2140 continue; 2150 continue;
2141 irq = pin_2_irq(irq_entry, ioapic, pin); 2151 irq = pin_2_irq(irq_entry, ioapic, pin);
2142 set_ioapic_affinity_irq(irq, TARGET_CPUS); 2152
2153 /* setup_IO_APIC_irqs could fail to get vector for some device
2154 * when you have too many devices, because at that time only boot
2155 * cpu is online.
2156 */
2157 if(!irq_vector[irq])
2158 setup_IO_APIC_irq(ioapic, pin, irq_entry, irq);
2159 else
2160 set_ioapic_affinity_irq(irq, TARGET_CPUS);
2143 } 2161 }
2144 2162
2145 } 2163 }
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index e46c55856d40..0c06af6c13bc 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -120,7 +120,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
120 120
121 if (likely(irq < NR_IRQS)) 121 if (likely(irq < NR_IRQS))
122 generic_handle_irq(irq); 122 generic_handle_irq(irq);
123 else 123 else if (printk_ratelimit())
124 printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", 124 printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
125 __func__, smp_processor_id(), vector); 125 __func__, smp_processor_id(), vector);
126 126
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index bc863c464a1f..ac085038af29 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -651,6 +651,7 @@ static void mce_remove_device(unsigned int cpu)
651 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); 651 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
652 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); 652 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
653 sysdev_unregister(&per_cpu(device_mce,cpu)); 653 sysdev_unregister(&per_cpu(device_mce,cpu));
654 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
654} 655}
655 656
656/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 657/* Get notified when a cpu comes on/off. Be hotplug friendly. */
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index b147ab19fbd4..08072568847d 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -35,8 +35,6 @@
35int smp_found_config; 35int smp_found_config;
36unsigned int __initdata maxcpus = NR_CPUS; 36unsigned int __initdata maxcpus = NR_CPUS;
37 37
38int acpi_found_madt;
39
40/* 38/*
41 * Various Linux-internal data structures created from the 39 * Various Linux-internal data structures created from the
42 * MP-table. 40 * MP-table.
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 7af9cb3e2d99..27e95e7922c1 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -12,14 +12,15 @@
12 * Mikael Pettersson : PM converted to driver model. Disable/enable API. 12 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
13 */ 13 */
14 14
15#include <linux/nmi.h>
15#include <linux/mm.h> 16#include <linux/mm.h>
16#include <linux/delay.h> 17#include <linux/delay.h>
17#include <linux/interrupt.h> 18#include <linux/interrupt.h>
18#include <linux/module.h> 19#include <linux/module.h>
19#include <linux/sysdev.h> 20#include <linux/sysdev.h>
20#include <linux/nmi.h>
21#include <linux/sysctl.h> 21#include <linux/sysctl.h>
22#include <linux/kprobes.h> 22#include <linux/kprobes.h>
23#include <linux/cpumask.h>
23 24
24#include <asm/smp.h> 25#include <asm/smp.h>
25#include <asm/nmi.h> 26#include <asm/nmi.h>
@@ -41,6 +42,8 @@ int panic_on_unrecovered_nmi;
41static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner); 42static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner);
42static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]); 43static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]);
43 44
45static cpumask_t backtrace_mask = CPU_MASK_NONE;
46
44/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's 47/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
45 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) 48 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
46 */ 49 */
@@ -782,6 +785,7 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
782{ 785{
783 int sum; 786 int sum;
784 int touched = 0; 787 int touched = 0;
788 int cpu = smp_processor_id();
785 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 789 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
786 u64 dummy; 790 u64 dummy;
787 int rc=0; 791 int rc=0;
@@ -799,6 +803,16 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
799 touched = 1; 803 touched = 1;
800 } 804 }
801 805
806 if (cpu_isset(cpu, backtrace_mask)) {
807 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
808
809 spin_lock(&lock);
810 printk("NMI backtrace for cpu %d\n", cpu);
811 dump_stack();
812 spin_unlock(&lock);
813 cpu_clear(cpu, backtrace_mask);
814 }
815
802#ifdef CONFIG_X86_MCE 816#ifdef CONFIG_X86_MCE
803 /* Could check oops_in_progress here too, but it's safer 817 /* Could check oops_in_progress here too, but it's safer
804 not too */ 818 not too */
@@ -931,6 +945,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
931 945
932#endif 946#endif
933 947
948void __trigger_all_cpu_backtrace(void)
949{
950 int i;
951
952 backtrace_mask = cpu_online_map;
953 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
954 for (i = 0; i < 10 * 1000; i++) {
955 if (cpus_empty(backtrace_mask))
956 break;
957 mdelay(1);
958 }
959}
960
934EXPORT_SYMBOL(nmi_active); 961EXPORT_SYMBOL(nmi_active);
935EXPORT_SYMBOL(nmi_watchdog); 962EXPORT_SYMBOL(nmi_watchdog);
936EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); 963EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 37a770859e71..3215675ab128 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -41,6 +41,13 @@
41#include <asm/pci-direct.h> 41#include <asm/pci-direct.h>
42#include <asm/system.h> 42#include <asm/system.h>
43#include <asm/dma.h> 43#include <asm/dma.h>
44#include <asm/rio.h>
45
46#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
47int use_calgary __read_mostly = 1;
48#else
49int use_calgary __read_mostly = 0;
50#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */
44 51
45#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 52#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
46#define PCI_VENDOR_DEVICE_ID_CALGARY \ 53#define PCI_VENDOR_DEVICE_ID_CALGARY \
@@ -115,14 +122,35 @@ static const unsigned long phb_offsets[] = {
115 0xB000 /* PHB3 */ 122 0xB000 /* PHB3 */
116}; 123};
117 124
125/* PHB debug registers */
126
127static const unsigned long phb_debug_offsets[] = {
128 0x4000 /* PHB 0 DEBUG */,
129 0x5000 /* PHB 1 DEBUG */,
130 0x6000 /* PHB 2 DEBUG */,
131 0x7000 /* PHB 3 DEBUG */
132};
133
134/*
135 * STUFF register for each debug PHB,
136 * byte 1 = start bus number, byte 2 = end bus number
137 */
138
139#define PHB_DEBUG_STUFF_OFFSET 0x0020
140
118unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; 141unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
119static int translate_empty_slots __read_mostly = 0; 142static int translate_empty_slots __read_mostly = 0;
120static int calgary_detected __read_mostly = 0; 143static int calgary_detected __read_mostly = 0;
121 144
145static struct rio_table_hdr *rio_table_hdr __initdata;
146static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
147static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata;
148
122struct calgary_bus_info { 149struct calgary_bus_info {
123 void *tce_space; 150 void *tce_space;
124 unsigned char translation_disabled; 151 unsigned char translation_disabled;
125 signed char phbid; 152 signed char phbid;
153 void __iomem *bbar;
126}; 154};
127 155
128static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; 156static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
@@ -475,6 +503,11 @@ static struct dma_mapping_ops calgary_dma_ops = {
475 .unmap_sg = calgary_unmap_sg, 503 .unmap_sg = calgary_unmap_sg,
476}; 504};
477 505
506static inline void __iomem * busno_to_bbar(unsigned char num)
507{
508 return bus_info[num].bbar;
509}
510
478static inline int busno_to_phbid(unsigned char num) 511static inline int busno_to_phbid(unsigned char num)
479{ 512{
480 return bus_info[num].phbid; 513 return bus_info[num].phbid;
@@ -620,14 +653,9 @@ static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
620static void __init calgary_reserve_regions(struct pci_dev *dev) 653static void __init calgary_reserve_regions(struct pci_dev *dev)
621{ 654{
622 unsigned int npages; 655 unsigned int npages;
623 void __iomem *bbar;
624 unsigned char busnum;
625 u64 start; 656 u64 start;
626 struct iommu_table *tbl = dev->sysdata; 657 struct iommu_table *tbl = dev->sysdata;
627 658
628 bbar = tbl->bbar;
629 busnum = dev->bus->number;
630
631 /* reserve bad_dma_address in case it's a legal address */ 659 /* reserve bad_dma_address in case it's a legal address */
632 iommu_range_reserve(tbl, bad_dma_address, 1); 660 iommu_range_reserve(tbl, bad_dma_address, 1);
633 661
@@ -740,7 +768,7 @@ static void __init calgary_increase_split_completion_timeout(void __iomem *bbar,
740{ 768{
741 u64 val64; 769 u64 val64;
742 void __iomem *target; 770 void __iomem *target;
743 unsigned long phb_shift = -1; 771 unsigned int phb_shift = ~0; /* silence gcc */
744 u64 mask; 772 u64 mask;
745 773
746 switch (busno_to_phbid(busnum)) { 774 switch (busno_to_phbid(busnum)) {
@@ -828,33 +856,6 @@ static void __init calgary_disable_translation(struct pci_dev *dev)
828 del_timer_sync(&tbl->watchdog_timer); 856 del_timer_sync(&tbl->watchdog_timer);
829} 857}
830 858
831static inline unsigned int __init locate_register_space(struct pci_dev *dev)
832{
833 int rionodeid;
834 u32 address;
835
836 /*
837 * Each Calgary has four busses. The first four busses (first Calgary)
838 * have RIO node ID 2, then the next four (second Calgary) have RIO
839 * node ID 3, the next four (third Calgary) have node ID 2 again, etc.
840 * We use a gross hack - relying on the dev->bus->number ordering,
841 * modulo 14 - to decide which Calgary a given bus is on. Busses 0, 1,
842 * 2 and 4 are on the first Calgary (id 2), 6, 8, a and c are on the
843 * second (id 3), and then it repeats modulo 14.
844 */
845 rionodeid = (dev->bus->number % 14 > 4) ? 3 : 2;
846 /*
847 * register space address calculation as follows:
848 * FE0MB-8MB*OneBasedChassisNumber+1MB*(RioNodeId-ChassisBase)
849 * ChassisBase is always zero for x366/x260/x460
850 * RioNodeId is 2 for first Calgary, 3 for second Calgary
851 */
852 address = START_ADDRESS -
853 (0x800000 * (ONE_BASED_CHASSIS_NUM + dev->bus->number / 14)) +
854 (0x100000) * (rionodeid - CHASSIS_BASE);
855 return address;
856}
857
858static void __init calgary_init_one_nontraslated(struct pci_dev *dev) 859static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
859{ 860{
860 pci_dev_get(dev); 861 pci_dev_get(dev);
@@ -864,23 +865,15 @@ static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
864 865
865static int __init calgary_init_one(struct pci_dev *dev) 866static int __init calgary_init_one(struct pci_dev *dev)
866{ 867{
867 u32 address;
868 void __iomem *bbar; 868 void __iomem *bbar;
869 int ret; 869 int ret;
870 870
871 BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM); 871 BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
872 872
873 address = locate_register_space(dev); 873 bbar = busno_to_bbar(dev->bus->number);
874 /* map entire 1MB of Calgary config space */
875 bbar = ioremap_nocache(address, 1024 * 1024);
876 if (!bbar) {
877 ret = -ENODATA;
878 goto done;
879 }
880
881 ret = calgary_setup_tar(dev, bbar); 874 ret = calgary_setup_tar(dev, bbar);
882 if (ret) 875 if (ret)
883 goto iounmap; 876 goto done;
884 877
885 pci_dev_get(dev); 878 pci_dev_get(dev);
886 dev->bus->self = dev; 879 dev->bus->self = dev;
@@ -888,17 +881,66 @@ static int __init calgary_init_one(struct pci_dev *dev)
888 881
889 return 0; 882 return 0;
890 883
891iounmap:
892 iounmap(bbar);
893done: 884done:
894 return ret; 885 return ret;
895} 886}
896 887
888static int __init calgary_locate_bbars(void)
889{
890 int ret;
891 int rioidx, phb, bus;
892 void __iomem *bbar;
893 void __iomem *target;
894 unsigned long offset;
895 u8 start_bus, end_bus;
896 u32 val;
897
898 ret = -ENODATA;
899 for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
900 struct rio_detail *rio = rio_devs[rioidx];
901
902 if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
903 continue;
904
905 /* map entire 1MB of Calgary config space */
906 bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
907 if (!bbar)
908 goto error;
909
910 for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
911 offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
912 target = calgary_reg(bbar, offset);
913
914 val = be32_to_cpu(readl(target));
915 start_bus = (u8)((val & 0x00FF0000) >> 16);
916 end_bus = (u8)((val & 0x0000FF00) >> 8);
917 for (bus = start_bus; bus <= end_bus; bus++) {
918 bus_info[bus].bbar = bbar;
919 bus_info[bus].phbid = phb;
920 }
921 }
922 }
923
924 return 0;
925
926error:
927 /* scan bus_info and iounmap any bbars we previously ioremap'd */
928 for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++)
929 if (bus_info[bus].bbar)
930 iounmap(bus_info[bus].bbar);
931
932 return ret;
933}
934
897static int __init calgary_init(void) 935static int __init calgary_init(void)
898{ 936{
899 int ret = -ENODEV; 937 int ret;
900 struct pci_dev *dev = NULL; 938 struct pci_dev *dev = NULL;
901 939
940 ret = calgary_locate_bbars();
941 if (ret)
942 return ret;
943
902 do { 944 do {
903 dev = pci_get_device(PCI_VENDOR_ID_IBM, 945 dev = pci_get_device(PCI_VENDOR_ID_IBM,
904 PCI_DEVICE_ID_IBM_CALGARY, 946 PCI_DEVICE_ID_IBM_CALGARY,
@@ -921,7 +963,7 @@ static int __init calgary_init(void)
921 963
922error: 964error:
923 do { 965 do {
924 dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM, 966 dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM,
925 PCI_DEVICE_ID_IBM_CALGARY, 967 PCI_DEVICE_ID_IBM_CALGARY,
926 dev); 968 dev);
927 if (!dev) 969 if (!dev)
@@ -962,13 +1004,56 @@ static inline int __init determine_tce_table_size(u64 ram)
962 return ret; 1004 return ret;
963} 1005}
964 1006
1007static int __init build_detail_arrays(void)
1008{
1009 unsigned long ptr;
1010 int i, scal_detail_size, rio_detail_size;
1011
1012 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
1013 printk(KERN_WARNING
1014 "Calgary: MAX_NUMNODES too low! Defined as %d, "
1015 "but system has %d nodes.\n",
1016 MAX_NUMNODES, rio_table_hdr->num_scal_dev);
1017 return -ENODEV;
1018 }
1019
1020 switch (rio_table_hdr->version){
1021 case 2:
1022 scal_detail_size = 11;
1023 rio_detail_size = 13;
1024 break;
1025 case 3:
1026 scal_detail_size = 12;
1027 rio_detail_size = 15;
1028 break;
1029 default:
1030 printk(KERN_WARNING
1031 "Calgary: Invalid Rio Grande Table Version: %d\n",
1032 rio_table_hdr->version);
1033 return -EPROTO;
1034 }
1035
1036 ptr = ((unsigned long)rio_table_hdr) + 3;
1037 for (i = 0; i < rio_table_hdr->num_scal_dev;
1038 i++, ptr += scal_detail_size)
1039 scal_devs[i] = (struct scal_detail *)ptr;
1040
1041 for (i = 0; i < rio_table_hdr->num_rio_dev;
1042 i++, ptr += rio_detail_size)
1043 rio_devs[i] = (struct rio_detail *)ptr;
1044
1045 return 0;
1046}
1047
965void __init detect_calgary(void) 1048void __init detect_calgary(void)
966{ 1049{
967 u32 val; 1050 u32 val;
968 int bus; 1051 int bus;
969 void *tbl; 1052 void *tbl;
970 int calgary_found = 0; 1053 int calgary_found = 0;
971 int phb = -1; 1054 unsigned long ptr;
1055 int offset;
1056 int ret;
972 1057
973 /* 1058 /*
974 * if the user specified iommu=off or iommu=soft or we found 1059 * if the user specified iommu=off or iommu=soft or we found
@@ -977,25 +1062,47 @@ void __init detect_calgary(void)
977 if (swiotlb || no_iommu || iommu_detected) 1062 if (swiotlb || no_iommu || iommu_detected)
978 return; 1063 return;
979 1064
1065 if (!use_calgary)
1066 return;
1067
980 if (!early_pci_allowed()) 1068 if (!early_pci_allowed())
981 return; 1069 return;
982 1070
1071 ptr = (unsigned long)phys_to_virt(get_bios_ebda());
1072
1073 rio_table_hdr = NULL;
1074 offset = 0x180;
1075 while (offset) {
1076 /* The block id is stored in the 2nd word */
1077 if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
1078 /* set the pointer past the offset & block id */
1079 rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
1080 break;
1081 }
1082 /* The next offset is stored in the 1st word. 0 means no more */
1083 offset = *((unsigned short *)(ptr + offset));
1084 }
1085 if (!rio_table_hdr) {
1086 printk(KERN_ERR "Calgary: Unable to locate "
1087 "Rio Grande Table in EBDA - bailing!\n");
1088 return;
1089 }
1090
1091 ret = build_detail_arrays();
1092 if (ret) {
1093 printk(KERN_ERR "Calgary: build_detail_arrays ret %d\n", ret);
1094 return;
1095 }
1096
983 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); 1097 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
984 1098
985 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 1099 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
986 int dev; 1100 int dev;
987 struct calgary_bus_info *info = &bus_info[bus]; 1101 struct calgary_bus_info *info = &bus_info[bus];
988 info->phbid = -1;
989 1102
990 if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) 1103 if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
991 continue; 1104 continue;
992 1105
993 /*
994 * There are 4 PHBs per Calgary chip. Set phb to which phb (0-3)
995 * it is connected to releative to the clagary chip.
996 */
997 phb = (phb + 1) % PHBS_PER_CALGARY;
998
999 if (info->translation_disabled) 1106 if (info->translation_disabled)
1000 continue; 1107 continue;
1001 1108
@@ -1010,7 +1117,6 @@ void __init detect_calgary(void)
1010 if (!tbl) 1117 if (!tbl)
1011 goto cleanup; 1118 goto cleanup;
1012 info->tce_space = tbl; 1119 info->tce_space = tbl;
1013 info->phbid = phb;
1014 calgary_found = 1; 1120 calgary_found = 1;
1015 break; 1121 break;
1016 } 1122 }
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index f8d857453f8a..683b7a5c1ab3 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -296,6 +296,11 @@ __init int iommu_setup(char *p)
296 gart_parse_options(p); 296 gart_parse_options(p);
297#endif 297#endif
298 298
299#ifdef CONFIG_CALGARY_IOMMU
300 if (!strncmp(p, "calgary", 7))
301 use_calgary = 1;
302#endif /* CONFIG_CALGARY_IOMMU */
303
299 p += strcspn(p, ","); 304 p += strcspn(p, ",");
300 if (*p == ',') 305 if (*p == ',')
301 ++p; 306 ++p;
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 16261a8a3303..fc1960f1f243 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -601,10 +601,9 @@ void __init gart_iommu_init(void)
601 (!force_iommu && end_pfn <= MAX_DMA32_PFN) || 601 (!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
602 !iommu_aperture || 602 !iommu_aperture ||
603 (no_agp && init_k8_gatt(&info) < 0)) { 603 (no_agp && init_k8_gatt(&info) < 0)) {
604 printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
605 if (end_pfn > MAX_DMA32_PFN) { 604 if (end_pfn > MAX_DMA32_PFN) {
606 printk(KERN_ERR "WARNING more than 4GB of memory " 605 printk(KERN_ERR "WARNING more than 4GB of memory "
607 "but IOMMU not available.\n" 606 "but GART IOMMU not available.\n"
608 KERN_ERR "WARNING 32bit PCI may malfunction.\n"); 607 KERN_ERR "WARNING 32bit PCI may malfunction.\n");
609 } 608 }
610 return; 609 return;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 7451a4c43c16..a418ee4c8c62 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -108,17 +108,15 @@ void exit_idle(void)
108 */ 108 */
109static void default_idle(void) 109static void default_idle(void)
110{ 110{
111 local_irq_enable();
112
113 current_thread_info()->status &= ~TS_POLLING; 111 current_thread_info()->status &= ~TS_POLLING;
114 smp_mb__after_clear_bit(); 112 smp_mb__after_clear_bit();
115 while (!need_resched()) { 113 local_irq_disable();
116 local_irq_disable(); 114 if (!need_resched()) {
117 if (!need_resched()) 115 /* Enables interrupts one instruction before HLT.
118 safe_halt(); 116 x86 special cases this so there is no race. */
119 else 117 safe_halt();
120 local_irq_enable(); 118 } else
121 } 119 local_irq_enable();
122 current_thread_info()->status |= TS_POLLING; 120 current_thread_info()->status |= TS_POLLING;
123} 121}
124 122
@@ -130,15 +128,7 @@ static void default_idle(void)
130static void poll_idle (void) 128static void poll_idle (void)
131{ 129{
132 local_irq_enable(); 130 local_irq_enable();
133 131 cpu_relax();
134 asm volatile(
135 "2:"
136 "testl %0,%1;"
137 "rep; nop;"
138 "je 2b;"
139 : :
140 "i" (_TIF_NEED_RESCHED),
141 "m" (current_thread_info()->flags));
142} 132}
143 133
144void cpu_idle_wait(void) 134void cpu_idle_wait(void)
@@ -219,6 +209,12 @@ void cpu_idle (void)
219 idle = default_idle; 209 idle = default_idle;
220 if (cpu_is_offline(smp_processor_id())) 210 if (cpu_is_offline(smp_processor_id()))
221 play_dead(); 211 play_dead();
212 /*
213 * Idle routines should keep interrupts disabled
214 * from here on, until they go to idle.
215 * Otherwise, idle callbacks can misfire.
216 */
217 local_irq_disable();
222 enter_idle(); 218 enter_idle();
223 idle(); 219 idle();
224 /* In many cases the interrupt that ended idle 220 /* In many cases the interrupt that ended idle
@@ -256,9 +252,16 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
256/* Default MONITOR/MWAIT with no hints, used for default C1 state */ 252/* Default MONITOR/MWAIT with no hints, used for default C1 state */
257static void mwait_idle(void) 253static void mwait_idle(void)
258{ 254{
259 local_irq_enable(); 255 if (!need_resched()) {
260 while (!need_resched()) 256 __monitor((void *)&current_thread_info()->flags, 0, 0);
261 mwait_idle_with_hints(0,0); 257 smp_mb();
258 if (!need_resched())
259 __sti_mwait(0, 0);
260 else
261 local_irq_enable();
262 } else {
263 local_irq_enable();
264 }
262} 265}
263 266
264void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 267void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index f12f266f3e98..af425a8049fb 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -731,11 +731,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
731 /* Fix cpuid4 emulation for more */ 731 /* Fix cpuid4 emulation for more */
732 num_cache_leaves = 3; 732 num_cache_leaves = 3;
733 733
734 /* When there is only one core no need to synchronize RDTSC */ 734 /* RDTSC can be speculated around */
735 if (num_possible_cpus() == 1) 735 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
736 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
737 else
738 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
739} 736}
740 737
741static void __cpuinit detect_ht(struct cpuinfo_x86 *c) 738static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -834,6 +831,15 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
834 set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); 831 set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
835 } 832 }
836 833
834 if (cpu_has_ds) {
835 unsigned int l1, l2;
836 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
837 if (!(l1 & (1<<11)))
838 set_bit(X86_FEATURE_BTS, c->x86_capability);
839 if (!(l1 & (1<<12)))
840 set_bit(X86_FEATURE_PEBS, c->x86_capability);
841 }
842
837 n = c->extended_cpuid_level; 843 n = c->extended_cpuid_level;
838 if (n >= 0x80000008) { 844 if (n >= 0x80000008) {
839 unsigned eax = cpuid_eax(0x80000008); 845 unsigned eax = cpuid_eax(0x80000008);
@@ -853,7 +859,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
853 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); 859 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
854 if (c->x86 == 6) 860 if (c->x86 == 6)
855 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); 861 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
856 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); 862 if (c->x86 == 15)
863 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
864 else
865 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
857 c->x86_max_cores = intel_num_cpu_cores(c); 866 c->x86_max_cores = intel_num_cpu_cores(c);
858 867
859 srat_detect_node(); 868 srat_detect_node();
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 32f4d7e2a060..af1ec4d23cf8 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -389,6 +389,7 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
389 put_cpu(); 389 put_cpu();
390 return 0; 390 return 0;
391} 391}
392EXPORT_SYMBOL(smp_call_function_single);
392 393
393/* 394/*
394 * this function sends a 'generic call function' IPI to all other CPUs 395 * this function sends a 'generic call function' IPI to all other CPUs
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 9800147c4c68..daf19332f0dd 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -60,6 +60,7 @@
60#include <asm/irq.h> 60#include <asm/irq.h>
61#include <asm/hw_irq.h> 61#include <asm/hw_irq.h>
62#include <asm/numa.h> 62#include <asm/numa.h>
63#include <asm/genapic.h>
63 64
64/* Number of siblings per CPU package */ 65/* Number of siblings per CPU package */
65int smp_num_siblings = 1; 66int smp_num_siblings = 1;
@@ -1169,6 +1170,13 @@ int __cpuinit __cpu_up(unsigned int cpu)
1169 1170
1170 while (!cpu_isset(cpu, cpu_online_map)) 1171 while (!cpu_isset(cpu, cpu_online_map))
1171 cpu_relax(); 1172 cpu_relax();
1173
1174 if (num_online_cpus() > 8 && genapic == &apic_flat) {
1175 printk(KERN_WARNING
1176 "flat APIC routing can't be used with > 8 cpus\n");
1177 BUG();
1178 }
1179
1172 err = 0; 1180 err = 0;
1173 1181
1174 return err; 1182 return err;
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 0d65b22f229c..a1641ffdffcf 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -30,9 +30,9 @@
30#include <linux/kprobes.h> 30#include <linux/kprobes.h>
31#include <linux/kexec.h> 31#include <linux/kexec.h>
32#include <linux/unwind.h> 32#include <linux/unwind.h>
33#include <linux/uaccess.h>
33 34
34#include <asm/system.h> 35#include <asm/system.h>
35#include <asm/uaccess.h>
36#include <asm/io.h> 36#include <asm/io.h>
37#include <asm/atomic.h> 37#include <asm/atomic.h>
38#include <asm/debugreg.h> 38#include <asm/debugreg.h>
@@ -108,7 +108,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
108 preempt_enable_no_resched(); 108 preempt_enable_no_resched();
109} 109}
110 110
111static int kstack_depth_to_print = 12; 111int kstack_depth_to_print = 12;
112#ifdef CONFIG_STACK_UNWIND 112#ifdef CONFIG_STACK_UNWIND
113static int call_trace = 1; 113static int call_trace = 1;
114#else 114#else
@@ -225,16 +225,25 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
225{ 225{
226 struct ops_and_data *oad = (struct ops_and_data *)context; 226 struct ops_and_data *oad = (struct ops_and_data *)context;
227 int n = 0; 227 int n = 0;
228 unsigned long sp = UNW_SP(info);
228 229
230 if (arch_unw_user_mode(info))
231 return -1;
229 while (unwind(info) == 0 && UNW_PC(info)) { 232 while (unwind(info) == 0 && UNW_PC(info)) {
230 n++; 233 n++;
231 oad->ops->address(oad->data, UNW_PC(info)); 234 oad->ops->address(oad->data, UNW_PC(info));
232 if (arch_unw_user_mode(info)) 235 if (arch_unw_user_mode(info))
233 break; 236 break;
237 if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1))
238 && sp > UNW_SP(info))
239 break;
240 sp = UNW_SP(info);
234 } 241 }
235 return n; 242 return n;
236} 243}
237 244
245#define MSG(txt) ops->warning(data, txt)
246
238/* 247/*
239 * x86-64 can have upto three kernel stacks: 248 * x86-64 can have upto three kernel stacks:
240 * process stack 249 * process stack
@@ -248,11 +257,12 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
248 return p > t && p < t + THREAD_SIZE - 3; 257 return p > t && p < t + THREAD_SIZE - 3;
249} 258}
250 259
251void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack, 260void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
261 unsigned long *stack,
252 struct stacktrace_ops *ops, void *data) 262 struct stacktrace_ops *ops, void *data)
253{ 263{
254 const unsigned cpu = smp_processor_id(); 264 const unsigned cpu = get_cpu();
255 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; 265 unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
256 unsigned used = 0; 266 unsigned used = 0;
257 struct thread_info *tinfo; 267 struct thread_info *tinfo;
258 268
@@ -268,28 +278,30 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
268 if (unwind_init_frame_info(&info, tsk, regs) == 0) 278 if (unwind_init_frame_info(&info, tsk, regs) == 0)
269 unw_ret = dump_trace_unwind(&info, &oad); 279 unw_ret = dump_trace_unwind(&info, &oad);
270 } else if (tsk == current) 280 } else if (tsk == current)
271 unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); 281 unw_ret = unwind_init_running(&info, dump_trace_unwind,
282 &oad);
272 else { 283 else {
273 if (unwind_init_blocked(&info, tsk) == 0) 284 if (unwind_init_blocked(&info, tsk) == 0)
274 unw_ret = dump_trace_unwind(&info, &oad); 285 unw_ret = dump_trace_unwind(&info, &oad);
275 } 286 }
276 if (unw_ret > 0) { 287 if (unw_ret > 0) {
277 if (call_trace == 1 && !arch_unw_user_mode(&info)) { 288 if (call_trace == 1 && !arch_unw_user_mode(&info)) {
278 ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", 289 ops->warning_symbol(data,
290 "DWARF2 unwinder stuck at %s",
279 UNW_PC(&info)); 291 UNW_PC(&info));
280 if ((long)UNW_SP(&info) < 0) { 292 if ((long)UNW_SP(&info) < 0) {
281 ops->warning(data, "Leftover inexact backtrace:\n"); 293 MSG("Leftover inexact backtrace:");
282 stack = (unsigned long *)UNW_SP(&info); 294 stack = (unsigned long *)UNW_SP(&info);
283 if (!stack) 295 if (!stack)
284 return; 296 goto out;
285 } else 297 } else
286 ops->warning(data, "Full inexact backtrace again:\n"); 298 MSG("Full inexact backtrace again:");
287 } else if (call_trace >= 1) 299 } else if (call_trace >= 1)
288 return; 300 goto out;
289 else 301 else
290 ops->warning(data, "Full inexact backtrace again:\n"); 302 MSG("Full inexact backtrace again:");
291 } else 303 } else
292 ops->warning(data, "Inexact backtrace:\n"); 304 MSG("Inexact backtrace:");
293 } 305 }
294 if (!stack) { 306 if (!stack) {
295 unsigned long dummy; 307 unsigned long dummy;
@@ -297,12 +309,6 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
297 if (tsk && tsk != current) 309 if (tsk && tsk != current)
298 stack = (unsigned long *)tsk->thread.rsp; 310 stack = (unsigned long *)tsk->thread.rsp;
299 } 311 }
300 /*
301 * Align the stack pointer on word boundary, later loops
302 * rely on that (and corruption / debug info bugs can cause
303 * unaligned values here):
304 */
305 stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1));
306 312
307 /* 313 /*
308 * Print function call entries within a stack. 'cond' is the 314 * Print function call entries within a stack. 'cond' is the
@@ -312,9 +318,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
312#define HANDLE_STACK(cond) \ 318#define HANDLE_STACK(cond) \
313 do while (cond) { \ 319 do while (cond) { \
314 unsigned long addr = *stack++; \ 320 unsigned long addr = *stack++; \
315 if (oops_in_progress ? \ 321 /* Use unlocked access here because except for NMIs \
316 __kernel_text_address(addr) : \ 322 we should be already protected against module unloads */ \
317 kernel_text_address(addr)) { \ 323 if (__kernel_text_address(addr)) { \
318 /* \ 324 /* \
319 * If the address is either in the text segment of the \ 325 * If the address is either in the text segment of the \
320 * kernel, or in the region which contains vmalloc'ed \ 326 * kernel, or in the region which contains vmalloc'ed \
@@ -380,6 +386,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s
380 tinfo = current_thread_info(); 386 tinfo = current_thread_info();
381 HANDLE_STACK (valid_stack_ptr(tinfo, stack)); 387 HANDLE_STACK (valid_stack_ptr(tinfo, stack));
382#undef HANDLE_STACK 388#undef HANDLE_STACK
389out:
390 put_cpu();
383} 391}
384EXPORT_SYMBOL(dump_trace); 392EXPORT_SYMBOL(dump_trace);
385 393
@@ -786,8 +794,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
786{ 794{
787 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", 795 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
788 reason); 796 reason);
789 printk(KERN_EMERG "You probably have a hardware problem with your " 797 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
790 "RAM chips\n");
791 798
792 if (panic_on_unrecovered_nmi) 799 if (panic_on_unrecovered_nmi)
793 panic("NMI: Not continuing"); 800 panic("NMI: Not continuing");
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index d9534e750d4f..6a1f8f491e5d 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -51,15 +51,6 @@ SECTIONS
51 51
52 RODATA 52 RODATA
53 53
54#ifdef CONFIG_STACK_UNWIND
55 . = ALIGN(8);
56 .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
57 __start_unwind = .;
58 *(.eh_frame)
59 __end_unwind = .;
60 }
61#endif
62
63 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ 54 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
64 /* Data */ 55 /* Data */
65 .data : AT(ADDR(.data) - LOAD_OFFSET) { 56 .data : AT(ADDR(.data) - LOAD_OFFSET) {
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 3785e4954734..4a673f5397a0 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -289,6 +289,7 @@ static void __init map_vsyscall(void)
289 extern char __vsyscall_0; 289 extern char __vsyscall_0;
290 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); 290 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
291 291
292 /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
292 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); 293 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
293} 294}
294 295
diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c
index 06ae630de82b..bc503f506903 100644
--- a/arch/x86_64/lib/csum-partial.c
+++ b/arch/x86_64/lib/csum-partial.c
@@ -9,8 +9,6 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <asm/checksum.h> 10#include <asm/checksum.h>
11 11
12#define __force_inline inline __attribute__((always_inline))
13
14static inline unsigned short from32to16(unsigned a) 12static inline unsigned short from32to16(unsigned a)
15{ 13{
16 unsigned short b = a >> 16; 14 unsigned short b = a >> 16;
@@ -33,7 +31,7 @@ static inline unsigned short from32to16(unsigned a)
33 * Unrolling to an 128 bytes inner loop. 31 * Unrolling to an 128 bytes inner loop.
34 * Using interleaving with more registers to break the carry chains. 32 * Using interleaving with more registers to break the carry chains.
35 */ 33 */
36static __force_inline unsigned do_csum(const unsigned char *buff, unsigned len) 34static unsigned do_csum(const unsigned char *buff, unsigned len)
37{ 35{
38 unsigned odd, count; 36 unsigned odd, count;
39 unsigned long result = 0; 37 unsigned long result = 0;
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
index 50be90975d04..2dbebd308347 100644
--- a/arch/x86_64/lib/delay.c
+++ b/arch/x86_64/lib/delay.c
@@ -40,13 +40,13 @@ EXPORT_SYMBOL(__delay);
40 40
41inline void __const_udelay(unsigned long xloops) 41inline void __const_udelay(unsigned long xloops)
42{ 42{
43 __delay((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32); 43 __delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1);
44} 44}
45EXPORT_SYMBOL(__const_udelay); 45EXPORT_SYMBOL(__const_udelay);
46 46
47void __udelay(unsigned long usecs) 47void __udelay(unsigned long usecs)
48{ 48{
49 __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */ 49 __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
50} 50}
51EXPORT_SYMBOL(__udelay); 51EXPORT_SYMBOL(__udelay);
52 52
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 3751b4788e28..a65fc6f1dcaf 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -23,9 +23,9 @@
23#include <linux/compiler.h> 23#include <linux/compiler.h>
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/kprobes.h> 25#include <linux/kprobes.h>
26#include <linux/uaccess.h>
26 27
27#include <asm/system.h> 28#include <asm/system.h>
28#include <asm/uaccess.h>
29#include <asm/pgalloc.h> 29#include <asm/pgalloc.h>
30#include <asm/smp.h> 30#include <asm/smp.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
@@ -96,7 +96,7 @@ void bust_spinlocks(int yes)
96static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, 96static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
97 unsigned long error_code) 97 unsigned long error_code)
98{ 98{
99 unsigned char __user *instr; 99 unsigned char *instr;
100 int scan_more = 1; 100 int scan_more = 1;
101 int prefetch = 0; 101 int prefetch = 0;
102 unsigned char *max_instr; 102 unsigned char *max_instr;
@@ -116,7 +116,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
116 unsigned char instr_hi; 116 unsigned char instr_hi;
117 unsigned char instr_lo; 117 unsigned char instr_lo;
118 118
119 if (__get_user(opcode, (char __user *)instr)) 119 if (probe_kernel_address(instr, opcode))
120 break; 120 break;
121 121
122 instr_hi = opcode & 0xf0; 122 instr_hi = opcode & 0xf0;
@@ -154,7 +154,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
154 case 0x00: 154 case 0x00:
155 /* Prefetch instruction is 0x0F0D or 0x0F18 */ 155 /* Prefetch instruction is 0x0F0D or 0x0F18 */
156 scan_more = 0; 156 scan_more = 0;
157 if (__get_user(opcode, (char __user *)instr)) 157 if (probe_kernel_address(instr, opcode))
158 break; 158 break;
159 prefetch = (instr_lo == 0xF) && 159 prefetch = (instr_lo == 0xF) &&
160 (opcode == 0x0D || opcode == 0x18); 160 (opcode == 0x0D || opcode == 0x18);
@@ -170,7 +170,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
170static int bad_address(void *p) 170static int bad_address(void *p)
171{ 171{
172 unsigned long dummy; 172 unsigned long dummy;
173 return __get_user(dummy, (unsigned long __user *)p); 173 return probe_kernel_address((unsigned long *)p, dummy);
174} 174}
175 175
176void dump_pagetable(unsigned long address) 176void dump_pagetable(unsigned long address)
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 4c0c00ef3ca7..2968b90ef8ad 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -730,14 +730,15 @@ static __init int x8664_sysctl_init(void)
730__initcall(x8664_sysctl_init); 730__initcall(x8664_sysctl_init);
731#endif 731#endif
732 732
733/* A pseudo VMAs to allow ptrace access for the vsyscall page. This only 733/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
734 covers the 64bit vsyscall page now. 32bit has a real VMA now and does 734 covers the 64bit vsyscall page now. 32bit has a real VMA now and does
735 not need special handling anymore. */ 735 not need special handling anymore. */
736 736
737static struct vm_area_struct gate_vma = { 737static struct vm_area_struct gate_vma = {
738 .vm_start = VSYSCALL_START, 738 .vm_start = VSYSCALL_START,
739 .vm_end = VSYSCALL_END, 739 .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
740 .vm_page_prot = PAGE_READONLY 740 .vm_page_prot = PAGE_READONLY_EXEC,
741 .vm_flags = VM_READ | VM_EXEC
741}; 742};
742 743
743struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 744struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 3e231d762aaa..ccb91dd996a9 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -61,34 +61,40 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
61 return base; 61 return base;
62} 62}
63 63
64 64static void cache_flush_page(void *adr)
65static void flush_kernel_map(void *address)
66{ 65{
67 if (0 && address && cpu_has_clflush) { 66 int i;
68 /* is this worth it? */ 67 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
69 int i; 68 asm volatile("clflush (%0)" :: "r" (adr + i));
70 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
71 asm volatile("clflush (%0)" :: "r" (address + i));
72 } else
73 asm volatile("wbinvd":::"memory");
74 if (address)
75 __flush_tlb_one(address);
76 else
77 __flush_tlb_all();
78} 69}
79 70
71static void flush_kernel_map(void *arg)
72{
73 struct list_head *l = (struct list_head *)arg;
74 struct page *pg;
75
76 /* When clflush is available always use it because it is
77 much cheaper than WBINVD */
78 if (!cpu_has_clflush)
79 asm volatile("wbinvd" ::: "memory");
80 list_for_each_entry(pg, l, lru) {
81 void *adr = page_address(pg);
82 if (cpu_has_clflush)
83 cache_flush_page(adr);
84 __flush_tlb_one(adr);
85 }
86}
80 87
81static inline void flush_map(unsigned long address) 88static inline void flush_map(struct list_head *l)
82{ 89{
83 on_each_cpu(flush_kernel_map, (void *)address, 1, 1); 90 on_each_cpu(flush_kernel_map, l, 1, 1);
84} 91}
85 92
86static struct page *deferred_pages; /* protected by init_mm.mmap_sem */ 93static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
87 94
88static inline void save_page(struct page *fpage) 95static inline void save_page(struct page *fpage)
89{ 96{
90 fpage->lru.next = (struct list_head *)deferred_pages; 97 list_add(&fpage->lru, &deferred_pages);
91 deferred_pages = fpage;
92} 98}
93 99
94/* 100/*
@@ -207,18 +213,18 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot)
207 213
208void global_flush_tlb(void) 214void global_flush_tlb(void)
209{ 215{
210 struct page *dpage; 216 struct page *pg, *next;
217 struct list_head l;
211 218
212 down_read(&init_mm.mmap_sem); 219 down_read(&init_mm.mmap_sem);
213 dpage = xchg(&deferred_pages, NULL); 220 list_replace_init(&deferred_pages, &l);
214 up_read(&init_mm.mmap_sem); 221 up_read(&init_mm.mmap_sem);
215 222
216 flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0); 223 flush_map(&l);
217 while (dpage) { 224
218 struct page *tmp = dpage; 225 list_for_each_entry_safe(pg, next, &l, lru) {
219 dpage = (struct page *)dpage->lru.next; 226 ClearPagePrivate(pg);
220 ClearPagePrivate(tmp); 227 __free_page(pg);
221 __free_page(tmp);
222 } 228 }
223} 229}
224 230