aboutsummaryrefslogtreecommitdiffstats
path: root/arch/i386
diff options
context:
space:
mode:
authorTrond Myklebust <Trond.Myklebust@netapp.com>2006-12-07 16:35:17 -0500
committerTrond Myklebust <Trond.Myklebust@netapp.com>2006-12-07 16:35:17 -0500
commit21b4e736922f546e0f1aa7b9d6c442f309a2444a (patch)
treee1be8645297f8ebe87445251743ebcc52081a20d /arch/i386
parent34161db6b14d984fb9b06c735b7b42f8803f6851 (diff)
parent68380b581383c028830f79ec2670f4a193854aa6 (diff)
Merge branch 'master' of /home/trondmy/kernel/linux-2.6/ into merge_linus
Diffstat (limited to 'arch/i386')
-rw-r--r--arch/i386/Kconfig72
-rw-r--r--arch/i386/Kconfig.cpu19
-rw-r--r--arch/i386/Kconfig.debug10
-rw-r--r--arch/i386/Makefile8
-rw-r--r--arch/i386/Makefile.cpu1
-rw-r--r--arch/i386/boot/compressed/Makefile28
-rw-r--r--arch/i386/boot/compressed/head.S185
-rw-r--r--arch/i386/boot/compressed/misc.c264
-rw-r--r--arch/i386/boot/compressed/relocs.c625
-rw-r--r--arch/i386/boot/compressed/vmlinux.lds43
-rw-r--r--arch/i386/boot/compressed/vmlinux.scr3
-rw-r--r--arch/i386/boot/setup.S42
-rw-r--r--arch/i386/defconfig22
-rw-r--r--arch/i386/kernel/Makefile5
-rw-r--r--arch/i386/kernel/acpi/cstate.c6
-rw-r--r--arch/i386/kernel/acpi/earlyquirk.c21
-rw-r--r--arch/i386/kernel/alternative.c63
-rw-r--r--arch/i386/kernel/apic.c22
-rw-r--r--arch/i386/kernel/apm.c3
-rw-r--r--arch/i386/kernel/asm-offsets.c39
-rw-r--r--arch/i386/kernel/cpu/amd.c5
-rw-r--r--arch/i386/kernel/cpu/common.c249
-rw-r--r--arch/i386/kernel/cpu/intel.c12
-rw-r--r--arch/i386/kernel/cpu/intel_cacheinfo.c11
-rw-r--r--arch/i386/kernel/cpu/mcheck/therm_throt.c2
-rw-r--r--arch/i386/kernel/cpu/mtrr/Makefile4
-rw-r--r--arch/i386/kernel/cpu/mtrr/amd.c2
-rw-r--r--arch/i386/kernel/cpu/mtrr/centaur.c9
-rw-r--r--arch/i386/kernel/cpu/mtrr/cyrix.c25
-rw-r--r--arch/i386/kernel/cpu/mtrr/generic.c78
-rw-r--r--arch/i386/kernel/cpu/mtrr/if.c31
-rw-r--r--arch/i386/kernel/cpu/mtrr/main.c71
-rw-r--r--arch/i386/kernel/cpu/mtrr/mtrr.h25
-rw-r--r--arch/i386/kernel/cpu/proc.c3
-rw-r--r--arch/i386/kernel/cpuid.c3
-rw-r--r--arch/i386/kernel/crash.c66
-rw-r--r--arch/i386/kernel/e820.c894
-rw-r--r--arch/i386/kernel/efi.c17
-rw-r--r--arch/i386/kernel/entry.S331
-rw-r--r--arch/i386/kernel/head.S66
-rw-r--r--arch/i386/kernel/hpet.c7
-rw-r--r--arch/i386/kernel/i8259.c5
-rw-r--r--arch/i386/kernel/io_apic.c66
-rw-r--r--arch/i386/kernel/kprobes.c4
-rw-r--r--arch/i386/kernel/ldt.c4
-rw-r--r--arch/i386/kernel/mca.c13
-rw-r--r--arch/i386/kernel/microcode.c2
-rw-r--r--arch/i386/kernel/module.c11
-rw-r--r--arch/i386/kernel/mpparse.c2
-rw-r--r--arch/i386/kernel/msr.c5
-rw-r--r--arch/i386/kernel/nmi.c42
-rw-r--r--arch/i386/kernel/paravirt.c569
-rw-r--r--arch/i386/kernel/pci-dma.c6
-rw-r--r--arch/i386/kernel/process.c81
-rw-r--r--arch/i386/kernel/ptrace.c18
-rw-r--r--arch/i386/kernel/quirks.c46
-rw-r--r--arch/i386/kernel/reboot.c1
-rw-r--r--arch/i386/kernel/setup.c855
-rw-r--r--arch/i386/kernel/signal.c6
-rw-r--r--arch/i386/kernel/smp.c10
-rw-r--r--arch/i386/kernel/smpboot.c69
-rw-r--r--arch/i386/kernel/sysenter.c6
-rw-r--r--arch/i386/kernel/time.c15
-rw-r--r--arch/i386/kernel/time_hpet.c15
-rw-r--r--arch/i386/kernel/topology.c8
-rw-r--r--arch/i386/kernel/traps.c120
-rw-r--r--arch/i386/kernel/tsc.c1
-rw-r--r--arch/i386/kernel/vm86.c121
-rw-r--r--arch/i386/kernel/vmlinux.lds.S147
-rw-r--r--arch/i386/mach-generic/probe.c4
-rw-r--r--arch/i386/mach-voyager/voyager_cat.c6
-rw-r--r--arch/i386/mach-voyager/voyager_smp.c14
-rw-r--r--arch/i386/math-emu/fpu_emu.h1
-rw-r--r--arch/i386/math-emu/fpu_entry.c3
-rw-r--r--arch/i386/math-emu/fpu_system.h1
-rw-r--r--arch/i386/math-emu/load_store.c2
-rw-r--r--arch/i386/math-emu/reg_ld_str.c15
-rw-r--r--arch/i386/mm/boot_ioremap.c1
-rw-r--r--arch/i386/mm/discontig.c2
-rw-r--r--arch/i386/mm/fault.c12
-rw-r--r--arch/i386/mm/highmem.c26
-rw-r--r--arch/i386/mm/hugetlbpage.c112
-rw-r--r--arch/i386/mm/init.c6
-rw-r--r--arch/i386/mm/pageattr.c24
-rw-r--r--arch/i386/mm/pgtable.c13
-rw-r--r--arch/i386/pci/early.c7
-rw-r--r--arch/i386/pci/irq.c4
-rw-r--r--arch/i386/pci/pcbios.c11
-rw-r--r--arch/i386/power/Makefile2
-rw-r--r--arch/i386/power/cpu.c8
-rw-r--r--arch/i386/power/suspend.c158
-rw-r--r--arch/i386/power/swsusp.S9
92 files changed, 4153 insertions, 1928 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 8ff1c6fb5aa1..ea70359b02d0 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -182,6 +182,17 @@ config X86_ES7000
182 182
183endchoice 183endchoice
184 184
185config PARAVIRT
186 bool "Paravirtualization support (EXPERIMENTAL)"
187 depends on EXPERIMENTAL
188 help
189 Paravirtualization is a way of running multiple instances of
190 Linux on the same machine, under a hypervisor. This option
191 changes the kernel so it can modify itself when it is run
192 under a hypervisor, improving performance significantly.
193 However, when run without a hypervisor the kernel is
194 theoretically slower. If in doubt, say N.
195
185config ACPI_SRAT 196config ACPI_SRAT
186 bool 197 bool
187 default y 198 default y
@@ -443,7 +454,8 @@ source "drivers/firmware/Kconfig"
443 454
444choice 455choice
445 prompt "High Memory Support" 456 prompt "High Memory Support"
446 default NOHIGHMEM 457 default HIGHMEM4G if !X86_NUMAQ
458 default HIGHMEM64G if X86_NUMAQ
447 459
448config NOHIGHMEM 460config NOHIGHMEM
449 bool "off" 461 bool "off"
@@ -710,20 +722,6 @@ config BOOT_IOREMAP
710 depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) 722 depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI))
711 default y 723 default y
712 724
713config REGPARM
714 bool "Use register arguments"
715 default y
716 help
717 Compile the kernel with -mregparm=3. This instructs gcc to use
718 a more efficient function call ABI which passes the first three
719 arguments of a function call via registers, which results in denser
720 and faster code.
721
722 If this option is disabled, then the default ABI of passing
723 arguments via the stack is used.
724
725 If unsure, say Y.
726
727config SECCOMP 725config SECCOMP
728 bool "Enable seccomp to safely compute untrusted bytecode" 726 bool "Enable seccomp to safely compute untrusted bytecode"
729 depends on PROC_FS 727 depends on PROC_FS
@@ -773,23 +771,39 @@ config CRASH_DUMP
773 PHYSICAL_START. 771 PHYSICAL_START.
774 For more details see Documentation/kdump/kdump.txt 772 For more details see Documentation/kdump/kdump.txt
775 773
776config PHYSICAL_START 774config RELOCATABLE
777 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 775 bool "Build a relocatable kernel(EXPERIMENTAL)"
776 depends on EXPERIMENTAL
777 help
778 This build a kernel image that retains relocation information
779 so it can be loaded someplace besides the default 1MB.
780 The relocations tend to the kernel binary about 10% larger,
781 but are discarded at runtime.
782
783 One use is for the kexec on panic case where the recovery kernel
784 must live at a different physical address than the primary
785 kernel.
778 786
779 default "0x1000000" if CRASH_DUMP 787config PHYSICAL_ALIGN
788 hex "Alignment value to which kernel should be aligned"
780 default "0x100000" 789 default "0x100000"
790 range 0x2000 0x400000
781 help 791 help
782 This gives the physical address where the kernel is loaded. Normally 792 This value puts the alignment restrictions on physical address
783 for regular kernels this value is 0x100000 (1MB). But in the case 793 where kernel is loaded and run from. Kernel is compiled for an
784 of kexec on panic the fail safe kernel needs to run at a different 794 address which meets above alignment restriction.
785 address than the panic-ed kernel. This option is used to set the load 795
786 address for kernels used to capture crash dump on being kexec'ed 796 If bootloader loads the kernel at a non-aligned address and
787 after panic. The default value for crash dump kernels is 797 CONFIG_RELOCATABLE is set, kernel will move itself to nearest
788 0x1000000 (16MB). This can also be set based on the "X" value as 798 address aligned to above value and run from there.
789 specified in the "crashkernel=YM@XM" command line boot parameter 799
790 passed to the panic-ed kernel. Typically this parameter is set as 800 If bootloader loads the kernel at a non-aligned address and
791 crashkernel=64M@16M. Please take a look at 801 CONFIG_RELOCATABLE is not set, kernel will ignore the run time
792 Documentation/kdump/kdump.txt for more details about crash dumps. 802 load address and decompress itself to the address it has been
803 compiled for and run from there. The address for which kernel is
804 compiled already meets above alignment restrictions. Hence the
805 end result is that kernel runs from a physical address meeting
806 above alignment restrictions.
793 807
794 Don't change this unless you know what you are doing. 808 Don't change this unless you know what you are doing.
795 809
diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu
index fc4f2abccf06..821fd269ca58 100644
--- a/arch/i386/Kconfig.cpu
+++ b/arch/i386/Kconfig.cpu
@@ -103,8 +103,15 @@ config MPENTIUMM
103 Select this for Intel Pentium M (not Pentium-4 M) 103 Select this for Intel Pentium M (not Pentium-4 M)
104 notebook chips. 104 notebook chips.
105 105
106config MCORE2
107 bool "Core 2/newer Xeon"
108 help
109 Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and 53xx)
110 CPUs. You can distingush newer from older Xeons by the CPU family
111 in /proc/cpuinfo. Newer ones have 6.
112
106config MPENTIUM4 113config MPENTIUM4
107 bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/Xeon" 114 bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon"
108 help 115 help
109 Select this for Intel Pentium 4 chips. This includes the 116 Select this for Intel Pentium 4 chips. This includes the
110 Pentium 4, P4-based Celeron and Xeon, and Pentium-4 M 117 Pentium 4, P4-based Celeron and Xeon, and Pentium-4 M
@@ -229,7 +236,7 @@ config X86_L1_CACHE_SHIFT
229 default "7" if MPENTIUM4 || X86_GENERIC 236 default "7" if MPENTIUM4 || X86_GENERIC
230 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 237 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
231 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 238 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
232 default "6" if MK7 || MK8 || MPENTIUMM 239 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2
233 240
234config RWSEM_GENERIC_SPINLOCK 241config RWSEM_GENERIC_SPINLOCK
235 bool 242 bool
@@ -287,17 +294,17 @@ config X86_ALIGNMENT_16
287 294
288config X86_GOOD_APIC 295config X86_GOOD_APIC
289 bool 296 bool
290 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON 297 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2
291 default y 298 default y
292 299
293config X86_INTEL_USERCOPY 300config X86_INTEL_USERCOPY
294 bool 301 bool
295 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON 302 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
296 default y 303 default y
297 304
298config X86_USE_PPRO_CHECKSUM 305config X86_USE_PPRO_CHECKSUM
299 bool 306 bool
300 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX 307 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
301 default y 308 default y
302 309
303config X86_USE_3DNOW 310config X86_USE_3DNOW
@@ -312,5 +319,5 @@ config X86_OOSTORE
312 319
313config X86_TSC 320config X86_TSC
314 bool 321 bool
315 depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX) && !X86_NUMAQ 322 depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ
316 default y 323 default y
diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug
index b31c0802e1cc..f68cc6f215f8 100644
--- a/arch/i386/Kconfig.debug
+++ b/arch/i386/Kconfig.debug
@@ -85,4 +85,14 @@ config DOUBLEFAULT
85 option saves about 4k and might cause you much additional grey 85 option saves about 4k and might cause you much additional grey
86 hair. 86 hair.
87 87
88config DEBUG_PARAVIRT
89 bool "Enable some paravirtualization debugging"
90 default y
91 depends on PARAVIRT && DEBUG_KERNEL
92 help
93 Currently deliberately clobbers regs which are allowed to be
94 clobbered in inlined paravirt hooks, even in native mode.
95 If turning this off solves a problem, then DISABLE_INTERRUPTS() or
96 ENABLE_INTERRUPTS() is lying about what registers can be clobbered.
97
88endmenu 98endmenu
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 0677908dfa06..f7ac1aea1d8a 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -26,10 +26,12 @@ endif
26 26
27LDFLAGS := -m elf_i386 27LDFLAGS := -m elf_i386
28OBJCOPYFLAGS := -O binary -R .note -R .comment -S 28OBJCOPYFLAGS := -O binary -R .note -R .comment -S
29LDFLAGS_vmlinux := 29ifdef CONFIG_RELOCATABLE
30LDFLAGS_vmlinux := --emit-relocs
31endif
30CHECKFLAGS += -D__i386__ 32CHECKFLAGS += -D__i386__
31 33
32CFLAGS += -pipe -msoft-float 34CFLAGS += -pipe -msoft-float -mregparm=3
33 35
34# prevent gcc from keeping the stack 16 byte aligned 36# prevent gcc from keeping the stack 16 byte aligned
35CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) 37CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
@@ -37,8 +39,6 @@ CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
37# CPU-specific tuning. Anything which can be shared with UML should go here. 39# CPU-specific tuning. Anything which can be shared with UML should go here.
38include $(srctree)/arch/i386/Makefile.cpu 40include $(srctree)/arch/i386/Makefile.cpu
39 41
40cflags-$(CONFIG_REGPARM) += -mregparm=3
41
42# temporary until string.h is fixed 42# temporary until string.h is fixed
43cflags-y += -ffreestanding 43cflags-y += -ffreestanding
44 44
diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu
index a11befba26d5..a32c031c90d7 100644
--- a/arch/i386/Makefile.cpu
+++ b/arch/i386/Makefile.cpu
@@ -32,6 +32,7 @@ cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586)
32cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) 32cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
33cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 33cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
34cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) 34cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
35cflags-$(CONFIG_MCORE2) += -march=i686 $(call cc-option,-mtune=core2,$(call cc-option,-mtune=generic,-mtune=i686))
35 36
36# AMD Elan support 37# AMD Elan support
37cflags-$(CONFIG_X86_ELAN) += -march=i486 38cflags-$(CONFIG_X86_ELAN) += -march=i486
diff --git a/arch/i386/boot/compressed/Makefile b/arch/i386/boot/compressed/Makefile
index 258ea95224f6..a661217f33ec 100644
--- a/arch/i386/boot/compressed/Makefile
+++ b/arch/i386/boot/compressed/Makefile
@@ -4,22 +4,42 @@
4# create a compressed vmlinux image from the original vmlinux 4# create a compressed vmlinux image from the original vmlinux
5# 5#
6 6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o 7targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o \
8 vmlinux.bin.all vmlinux.relocs
8EXTRA_AFLAGS := -traditional 9EXTRA_AFLAGS := -traditional
9 10
10LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 11LDFLAGS_vmlinux := -T
12CFLAGS_misc.o += -fPIC
13hostprogs-y := relocs
11 14
12$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE 15$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
13 $(call if_changed,ld) 16 $(call if_changed,ld)
14 @: 17 @:
15 18
16$(obj)/vmlinux.bin: vmlinux FORCE 19$(obj)/vmlinux.bin: vmlinux FORCE
17 $(call if_changed,objcopy) 20 $(call if_changed,objcopy)
18 21
22quiet_cmd_relocs = RELOCS $@
23 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
24$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
25 $(call if_changed,relocs)
26
27vmlinux.bin.all-y := $(obj)/vmlinux.bin
28vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
29quiet_cmd_relocbin = BUILD $@
30 cmd_relocbin = cat $(filter-out FORCE,$^) > $@
31$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
32 $(call if_changed,relocbin)
33
34ifdef CONFIG_RELOCATABLE
35$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
36 $(call if_changed,gzip)
37else
19$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE 38$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
20 $(call if_changed,gzip) 39 $(call if_changed,gzip)
40endif
21 41
22LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T 42LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
23 43
24$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE 44$(obj)/piggy.o: $(src)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
25 $(call if_changed,ld) 45 $(call if_changed,ld)
diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S
index b5893e4ecd37..f395a4bb38bb 100644
--- a/arch/i386/boot/compressed/head.S
+++ b/arch/i386/boot/compressed/head.S
@@ -26,9 +26,11 @@
26#include <linux/linkage.h> 26#include <linux/linkage.h>
27#include <asm/segment.h> 27#include <asm/segment.h>
28#include <asm/page.h> 28#include <asm/page.h>
29#include <asm/boot.h>
29 30
31.section ".text.head"
30 .globl startup_32 32 .globl startup_32
31 33
32startup_32: 34startup_32:
33 cld 35 cld
34 cli 36 cli
@@ -37,93 +39,142 @@ startup_32:
37 movl %eax,%es 39 movl %eax,%es
38 movl %eax,%fs 40 movl %eax,%fs
39 movl %eax,%gs 41 movl %eax,%gs
42 movl %eax,%ss
40 43
41 lss stack_start,%esp 44/* Calculate the delta between where we were compiled to run
42 xorl %eax,%eax 45 * at and where we were actually loaded at. This can only be done
431: incl %eax # check that A20 really IS enabled 46 * with a short local call on x86. Nothing else will tell us what
44 movl %eax,0x000000 # loop forever if it isn't 47 * address we are running at. The reserved chunk of the real-mode
45 cmpl %eax,0x100000 48 * data at 0x34-0x3f are used as the stack for this calculation.
46 je 1b 49 * Only 4 bytes are needed.
50 */
51 leal 0x40(%esi), %esp
52 call 1f
531: popl %ebp
54 subl $1b, %ebp
55
56/* %ebp contains the address we are loaded at by the boot loader and %ebx
57 * contains the address where we should move the kernel image temporarily
58 * for safe in-place decompression.
59 */
60
61#ifdef CONFIG_RELOCATABLE
62 movl %ebp, %ebx
63 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
64 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
65#else
66 movl $LOAD_PHYSICAL_ADDR, %ebx
67#endif
68
69 /* Replace the compressed data size with the uncompressed size */
70 subl input_len(%ebp), %ebx
71 movl output_len(%ebp), %eax
72 addl %eax, %ebx
73 /* Add 8 bytes for every 32K input block */
74 shrl $12, %eax
75 addl %eax, %ebx
76 /* Add 32K + 18 bytes of extra slack */
77 addl $(32768 + 18), %ebx
78 /* Align on a 4K boundary */
79 addl $4095, %ebx
80 andl $~4095, %ebx
81
82/* Copy the compressed kernel to the end of our buffer
83 * where decompression in place becomes safe.
84 */
85 pushl %esi
86 leal _end(%ebp), %esi
87 leal _end(%ebx), %edi
88 movl $(_end - startup_32), %ecx
89 std
90 rep
91 movsb
92 cld
93 popl %esi
94
95/* Compute the kernel start address.
96 */
97#ifdef CONFIG_RELOCATABLE
98 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
99 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
100#else
101 movl $LOAD_PHYSICAL_ADDR, %ebp
102#endif
47 103
48/* 104/*
49 * Initialize eflags. Some BIOS's leave bits like NT set. This would 105 * Jump to the relocated address.
50 * confuse the debugger if this code is traced.
51 * XXX - best to initialize before switching to protected mode.
52 */ 106 */
53 pushl $0 107 leal relocated(%ebx), %eax
54 popfl 108 jmp *%eax
109.section ".text"
110relocated:
111
55/* 112/*
56 * Clear BSS 113 * Clear BSS
57 */ 114 */
58 xorl %eax,%eax 115 xorl %eax,%eax
59 movl $_edata,%edi 116 leal _edata(%ebx),%edi
60 movl $_end,%ecx 117 leal _end(%ebx), %ecx
61 subl %edi,%ecx 118 subl %edi,%ecx
62 cld 119 cld
63 rep 120 rep
64 stosb 121 stosb
122
123/*
124 * Setup the stack for the decompressor
125 */
126 leal stack_end(%ebx), %esp
127
65/* 128/*
66 * Do the decompression, and jump to the new kernel.. 129 * Do the decompression, and jump to the new kernel..
67 */ 130 */
68 subl $16,%esp # place for structure on the stack 131 movl output_len(%ebx), %eax
69 movl %esp,%eax 132 pushl %eax
133 pushl %ebp # output address
134 movl input_len(%ebx), %eax
135 pushl %eax # input_len
136 leal input_data(%ebx), %eax
137 pushl %eax # input_data
138 leal _end(%ebx), %eax
139 pushl %eax # end of the image as third argument
70 pushl %esi # real mode pointer as second arg 140 pushl %esi # real mode pointer as second arg
71 pushl %eax # address of structure as first arg
72 call decompress_kernel 141 call decompress_kernel
73 orl %eax,%eax 142 addl $20, %esp
74 jnz 3f 143 popl %ecx
75 popl %esi # discard address
76 popl %esi # real mode pointer
77 xorl %ebx,%ebx
78 ljmp $(__BOOT_CS), $__PHYSICAL_START
79 144
145#if CONFIG_RELOCATABLE
146/* Find the address of the relocations.
147 */
148 movl %ebp, %edi
149 addl %ecx, %edi
150
151/* Calculate the delta between where vmlinux was compiled to run
152 * and where it was actually loaded.
153 */
154 movl %ebp, %ebx
155 subl $LOAD_PHYSICAL_ADDR, %ebx
156 jz 2f /* Nothing to be done if loaded at compiled addr. */
80/* 157/*
81 * We come here, if we were loaded high. 158 * Process relocations.
82 * We need to move the move-in-place routine down to 0x1000
83 * and then start it with the buffer addresses in registers,
84 * which we got from the stack.
85 */ 159 */
863: 160
87 movl $move_routine_start,%esi 1611: subl $4, %edi
88 movl $0x1000,%edi 162 movl 0(%edi), %ecx
89 movl $move_routine_end,%ecx 163 testl %ecx, %ecx
90 subl %esi,%ecx 164 jz 2f
91 addl $3,%ecx 165 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
92 shrl $2,%ecx 166 jmp 1b
93 cld 1672:
94 rep 168#endif
95 movsl
96
97 popl %esi # discard the address
98 popl %ebx # real mode pointer
99 popl %esi # low_buffer_start
100 popl %ecx # lcount
101 popl %edx # high_buffer_start
102 popl %eax # hcount
103 movl $__PHYSICAL_START,%edi
104 cli # make sure we don't get interrupted
105 ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine
106 169
107/* 170/*
108 * Routine (template) for moving the decompressed kernel in place, 171 * Jump to the decompressed kernel.
109 * if we were high loaded. This _must_ PIC-code !
110 */ 172 */
111move_routine_start:
112 movl %ecx,%ebp
113 shrl $2,%ecx
114 rep
115 movsl
116 movl %ebp,%ecx
117 andl $3,%ecx
118 rep
119 movsb
120 movl %edx,%esi
121 movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0
122 addl $3,%ecx
123 shrl $2,%ecx
124 rep
125 movsl
126 movl %ebx,%esi # Restore setup pointer
127 xorl %ebx,%ebx 173 xorl %ebx,%ebx
128 ljmp $(__BOOT_CS), $__PHYSICAL_START 174 jmp *%ebp
129move_routine_end: 175
176.bss
177.balign 4
178stack:
179 .fill 4096, 1, 0
180stack_end:
diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c
index b2ccd543410d..1ce7017fd627 100644
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -9,11 +9,94 @@
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */ 10 */
11 11
12#undef CONFIG_PARAVIRT
12#include <linux/linkage.h> 13#include <linux/linkage.h>
13#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
14#include <linux/screen_info.h> 15#include <linux/screen_info.h>
15#include <asm/io.h> 16#include <asm/io.h>
16#include <asm/page.h> 17#include <asm/page.h>
18#include <asm/boot.h>
19
20/* WARNING!!
21 * This code is compiled with -fPIC and it is relocated dynamically
22 * at run time, but no relocation processing is performed.
23 * This means that it is not safe to place pointers in static structures.
24 */
25
26/*
27 * Getting to provable safe in place decompression is hard.
28 * Worst case behaviours need to be analized.
29 * Background information:
30 *
31 * The file layout is:
32 * magic[2]
33 * method[1]
34 * flags[1]
35 * timestamp[4]
36 * extraflags[1]
37 * os[1]
38 * compressed data blocks[N]
39 * crc[4] orig_len[4]
40 *
41 * resulting in 18 bytes of non compressed data overhead.
42 *
43 * Files divided into blocks
44 * 1 bit (last block flag)
45 * 2 bits (block type)
46 *
47 * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved.
48 * The smallest block type encoding is always used.
49 *
50 * stored:
51 * 32 bits length in bytes.
52 *
53 * fixed:
54 * magic fixed tree.
55 * symbols.
56 *
57 * dynamic:
58 * dynamic tree encoding.
59 * symbols.
60 *
61 *
62 * The buffer for decompression in place is the length of the
63 * uncompressed data, plus a small amount extra to keep the algorithm safe.
64 * The compressed data is placed at the end of the buffer. The output
65 * pointer is placed at the start of the buffer and the input pointer
66 * is placed where the compressed data starts. Problems will occur
67 * when the output pointer overruns the input pointer.
68 *
69 * The output pointer can only overrun the input pointer if the input
70 * pointer is moving faster than the output pointer. A condition only
71 * triggered by data whose compressed form is larger than the uncompressed
72 * form.
73 *
74 * The worst case at the block level is a growth of the compressed data
75 * of 5 bytes per 32767 bytes.
76 *
77 * The worst case internal to a compressed block is very hard to figure.
78 * The worst case can at least be boundined by having one bit that represents
79 * 32764 bytes and then all of the rest of the bytes representing the very
80 * very last byte.
81 *
82 * All of which is enough to compute an amount of extra data that is required
83 * to be safe. To avoid problems at the block level allocating 5 extra bytes
84 * per 32767 bytes of data is sufficient. To avoind problems internal to a block
85 * adding an extra 32767 bytes (the worst case uncompressed block size) is
86 * sufficient, to ensure that in the worst case the decompressed data for
87 * block will stop the byte before the compressed data for a block begins.
88 * To avoid problems with the compressed data's meta information an extra 18
89 * bytes are needed. Leading to the formula:
90 *
91 * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
92 *
93 * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
94 * Adding 32768 instead of 32767 just makes for round numbers.
95 * Adding the decompressor_size is necessary as it musht live after all
96 * of the data as well. Last I measured the decompressor is about 14K.
97 * 10K of actuall data and 4K of bss.
98 *
99 */
17 100
18/* 101/*
19 * gzip declarations 102 * gzip declarations
@@ -30,15 +113,20 @@ typedef unsigned char uch;
30typedef unsigned short ush; 113typedef unsigned short ush;
31typedef unsigned long ulg; 114typedef unsigned long ulg;
32 115
33#define WSIZE 0x8000 /* Window size must be at least 32k, */ 116#define WSIZE 0x80000000 /* Window size must be at least 32k,
34 /* and a power of two */ 117 * and a power of two
118 * We don't actually have a window just
119 * a huge output buffer so I report
120 * a 2G windows size, as that should
121 * always be larger than our output buffer.
122 */
35 123
36static uch *inbuf; /* input buffer */ 124static uch *inbuf; /* input buffer */
37static uch window[WSIZE]; /* Sliding window buffer */ 125static uch *window; /* Sliding window buffer, (and final output buffer) */
38 126
39static unsigned insize = 0; /* valid bytes in inbuf */ 127static unsigned insize; /* valid bytes in inbuf */
40static unsigned inptr = 0; /* index of next byte to be processed in inbuf */ 128static unsigned inptr; /* index of next byte to be processed in inbuf */
41static unsigned outcnt = 0; /* bytes in output buffer */ 129static unsigned outcnt; /* bytes in output buffer */
42 130
43/* gzip flag byte */ 131/* gzip flag byte */
44#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ 132#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
@@ -89,8 +177,6 @@ extern unsigned char input_data[];
89extern int input_len; 177extern int input_len;
90 178
91static long bytes_out = 0; 179static long bytes_out = 0;
92static uch *output_data;
93static unsigned long output_ptr = 0;
94 180
95static void *malloc(int size); 181static void *malloc(int size);
96static void free(void *where); 182static void free(void *where);
@@ -100,24 +186,17 @@ static void *memcpy(void *dest, const void *src, unsigned n);
100 186
101static void putstr(const char *); 187static void putstr(const char *);
102 188
103extern int end; 189static unsigned long free_mem_ptr;
104static long free_mem_ptr = (long)&end; 190static unsigned long free_mem_end_ptr;
105static long free_mem_end_ptr;
106 191
107#define INPLACE_MOVE_ROUTINE 0x1000
108#define LOW_BUFFER_START 0x2000
109#define LOW_BUFFER_MAX 0x90000
110#define HEAP_SIZE 0x3000 192#define HEAP_SIZE 0x3000
111static unsigned int low_buffer_end, low_buffer_size;
112static int high_loaded =0;
113static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
114 193
115static char *vidmem = (char *)0xb8000; 194static char *vidmem = (char *)0xb8000;
116static int vidport; 195static int vidport;
117static int lines, cols; 196static int lines, cols;
118 197
119#ifdef CONFIG_X86_NUMAQ 198#ifdef CONFIG_X86_NUMAQ
120static void * xquad_portio = NULL; 199void *xquad_portio;
121#endif 200#endif
122 201
123#include "../../../../lib/inflate.c" 202#include "../../../../lib/inflate.c"
@@ -151,7 +230,7 @@ static void gzip_mark(void **ptr)
151 230
152static void gzip_release(void **ptr) 231static void gzip_release(void **ptr)
153{ 232{
154 free_mem_ptr = (long) *ptr; 233 free_mem_ptr = (unsigned long) *ptr;
155} 234}
156 235
157static void scroll(void) 236static void scroll(void)
@@ -179,7 +258,7 @@ static void putstr(const char *s)
179 y--; 258 y--;
180 } 259 }
181 } else { 260 } else {
182 vidmem [ ( x + cols * y ) * 2 ] = c; 261 vidmem [ ( x + cols * y ) * 2 ] = c;
183 if ( ++x >= cols ) { 262 if ( ++x >= cols ) {
184 x = 0; 263 x = 0;
185 if ( ++y >= lines ) { 264 if ( ++y >= lines ) {
@@ -224,58 +303,31 @@ static void* memcpy(void* dest, const void* src, unsigned n)
224 */ 303 */
225static int fill_inbuf(void) 304static int fill_inbuf(void)
226{ 305{
227 if (insize != 0) { 306 error("ran out of input data");
228 error("ran out of input data"); 307 return 0;
229 }
230
231 inbuf = input_data;
232 insize = input_len;
233 inptr = 1;
234 return inbuf[0];
235} 308}
236 309
237/* =========================================================================== 310/* ===========================================================================
238 * Write the output window window[0..outcnt-1] and update crc and bytes_out. 311 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
239 * (Used for the decompressed data only.) 312 * (Used for the decompressed data only.)
240 */ 313 */
241static void flush_window_low(void)
242{
243 ulg c = crc; /* temporary variable */
244 unsigned n;
245 uch *in, *out, ch;
246
247 in = window;
248 out = &output_data[output_ptr];
249 for (n = 0; n < outcnt; n++) {
250 ch = *out++ = *in++;
251 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
252 }
253 crc = c;
254 bytes_out += (ulg)outcnt;
255 output_ptr += (ulg)outcnt;
256 outcnt = 0;
257}
258
259static void flush_window_high(void)
260{
261 ulg c = crc; /* temporary variable */
262 unsigned n;
263 uch *in, ch;
264 in = window;
265 for (n = 0; n < outcnt; n++) {
266 ch = *output_data++ = *in++;
267 if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start;
268 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
269 }
270 crc = c;
271 bytes_out += (ulg)outcnt;
272 outcnt = 0;
273}
274
275static void flush_window(void) 314static void flush_window(void)
276{ 315{
277 if (high_loaded) flush_window_high(); 316 /* With my window equal to my output buffer
278 else flush_window_low(); 317 * I only need to compute the crc here.
318 */
319 ulg c = crc; /* temporary variable */
320 unsigned n;
321 uch *in, ch;
322
323 in = window;
324 for (n = 0; n < outcnt; n++) {
325 ch = *in++;
326 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
327 }
328 crc = c;
329 bytes_out += (ulg)outcnt;
330 outcnt = 0;
279} 331}
280 332
281static void error(char *x) 333static void error(char *x)
@@ -287,66 +339,8 @@ static void error(char *x)
287 while(1); /* Halt */ 339 while(1); /* Halt */
288} 340}
289 341
290#define STACK_SIZE (4096) 342asmlinkage void decompress_kernel(void *rmode, unsigned long end,
291 343 uch *input_data, unsigned long input_len, uch *output)
292long user_stack [STACK_SIZE];
293
294struct {
295 long * a;
296 short b;
297 } stack_start = { & user_stack [STACK_SIZE] , __BOOT_DS };
298
299static void setup_normal_output_buffer(void)
300{
301#ifdef STANDARD_MEMORY_BIOS_CALL
302 if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory");
303#else
304 if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");
305#endif
306 output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */
307 free_mem_end_ptr = (long)real_mode;
308}
309
310struct moveparams {
311 uch *low_buffer_start; int lcount;
312 uch *high_buffer_start; int hcount;
313};
314
315static void setup_output_buffer_if_we_run_high(struct moveparams *mv)
316{
317 high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);
318#ifdef STANDARD_MEMORY_BIOS_CALL
319 if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
320#else
321 if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
322#endif
323 mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START;
324 low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX
325 ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff;
326 low_buffer_size = low_buffer_end - LOW_BUFFER_START;
327 high_loaded = 1;
328 free_mem_end_ptr = (long)high_buffer_start;
329 if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
330 high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size);
331 mv->hcount = 0; /* say: we need not to move high_buffer */
332 }
333 else mv->hcount = -1;
334 mv->high_buffer_start = high_buffer_start;
335}
336
337static void close_output_buffer_if_we_run_high(struct moveparams *mv)
338{
339 if (bytes_out > low_buffer_size) {
340 mv->lcount = low_buffer_size;
341 if (mv->hcount)
342 mv->hcount = bytes_out - low_buffer_size;
343 } else {
344 mv->lcount = bytes_out;
345 mv->hcount = 0;
346 }
347}
348
349asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode)
350{ 344{
351 real_mode = rmode; 345 real_mode = rmode;
352 346
@@ -361,13 +355,25 @@ asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode)
361 lines = RM_SCREEN_INFO.orig_video_lines; 355 lines = RM_SCREEN_INFO.orig_video_lines;
362 cols = RM_SCREEN_INFO.orig_video_cols; 356 cols = RM_SCREEN_INFO.orig_video_cols;
363 357
364 if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); 358 window = output; /* Output buffer (Normally at 1M) */
365 else setup_output_buffer_if_we_run_high(mv); 359 free_mem_ptr = end; /* Heap */
360 free_mem_end_ptr = end + HEAP_SIZE;
361 inbuf = input_data; /* Input buffer */
362 insize = input_len;
363 inptr = 0;
364
365 if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1))
366 error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
367 if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff))
368 error("Destination address too large");
369#ifndef CONFIG_RELOCATABLE
370 if ((u32)output != LOAD_PHYSICAL_ADDR)
371 error("Wrong destination address");
372#endif
366 373
367 makecrc(); 374 makecrc();
368 putstr("Uncompressing Linux... "); 375 putstr("Uncompressing Linux... ");
369 gunzip(); 376 gunzip();
370 putstr("Ok, booting the kernel.\n"); 377 putstr("Ok, booting the kernel.\n");
371 if (high_loaded) close_output_buffer_if_we_run_high(mv); 378 return;
372 return high_loaded;
373} 379}
diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c
new file mode 100644
index 000000000000..468da89153c4
--- /dev/null
+++ b/arch/i386/boot/compressed/relocs.c
@@ -0,0 +1,625 @@
1#include <stdio.h>
2#include <stdarg.h>
3#include <stdlib.h>
4#include <stdint.h>
5#include <string.h>
6#include <errno.h>
7#include <unistd.h>
8#include <elf.h>
9#include <byteswap.h>
10#define USE_BSD
11#include <endian.h>
12
13#define MAX_SHDRS 100
14static Elf32_Ehdr ehdr;
15static Elf32_Shdr shdr[MAX_SHDRS];
16static Elf32_Sym *symtab[MAX_SHDRS];
17static Elf32_Rel *reltab[MAX_SHDRS];
18static char *strtab[MAX_SHDRS];
19static unsigned long reloc_count, reloc_idx;
20static unsigned long *relocs;
21
22/*
23 * Following symbols have been audited. There values are constant and do
24 * not change if bzImage is loaded at a different physical address than
25 * the address for which it has been compiled. Don't warn user about
26 * absolute relocations present w.r.t these symbols.
27 */
28static const char* safe_abs_relocs[] = {
29 "__kernel_vsyscall",
30 "__kernel_rt_sigreturn",
31 "__kernel_sigreturn",
32 "SYSENTER_RETURN",
33};
34
35static int is_safe_abs_reloc(const char* sym_name)
36{
37 int i, array_size;
38
39 array_size = sizeof(safe_abs_relocs)/sizeof(char*);
40
41 for(i = 0; i < array_size; i++) {
42 if (!strcmp(sym_name, safe_abs_relocs[i]))
43 /* Match found */
44 return 1;
45 }
46 return 0;
47}
48
49static void die(char *fmt, ...)
50{
51 va_list ap;
52 va_start(ap, fmt);
53 vfprintf(stderr, fmt, ap);
54 va_end(ap);
55 exit(1);
56}
57
58static const char *sym_type(unsigned type)
59{
60 static const char *type_name[] = {
61#define SYM_TYPE(X) [X] = #X
62 SYM_TYPE(STT_NOTYPE),
63 SYM_TYPE(STT_OBJECT),
64 SYM_TYPE(STT_FUNC),
65 SYM_TYPE(STT_SECTION),
66 SYM_TYPE(STT_FILE),
67 SYM_TYPE(STT_COMMON),
68 SYM_TYPE(STT_TLS),
69#undef SYM_TYPE
70 };
71 const char *name = "unknown sym type name";
72 if (type < sizeof(type_name)/sizeof(type_name[0])) {
73 name = type_name[type];
74 }
75 return name;
76}
77
78static const char *sym_bind(unsigned bind)
79{
80 static const char *bind_name[] = {
81#define SYM_BIND(X) [X] = #X
82 SYM_BIND(STB_LOCAL),
83 SYM_BIND(STB_GLOBAL),
84 SYM_BIND(STB_WEAK),
85#undef SYM_BIND
86 };
87 const char *name = "unknown sym bind name";
88 if (bind < sizeof(bind_name)/sizeof(bind_name[0])) {
89 name = bind_name[bind];
90 }
91 return name;
92}
93
94static const char *sym_visibility(unsigned visibility)
95{
96 static const char *visibility_name[] = {
97#define SYM_VISIBILITY(X) [X] = #X
98 SYM_VISIBILITY(STV_DEFAULT),
99 SYM_VISIBILITY(STV_INTERNAL),
100 SYM_VISIBILITY(STV_HIDDEN),
101 SYM_VISIBILITY(STV_PROTECTED),
102#undef SYM_VISIBILITY
103 };
104 const char *name = "unknown sym visibility name";
105 if (visibility < sizeof(visibility_name)/sizeof(visibility_name[0])) {
106 name = visibility_name[visibility];
107 }
108 return name;
109}
110
111static const char *rel_type(unsigned type)
112{
113 static const char *type_name[] = {
114#define REL_TYPE(X) [X] = #X
115 REL_TYPE(R_386_NONE),
116 REL_TYPE(R_386_32),
117 REL_TYPE(R_386_PC32),
118 REL_TYPE(R_386_GOT32),
119 REL_TYPE(R_386_PLT32),
120 REL_TYPE(R_386_COPY),
121 REL_TYPE(R_386_GLOB_DAT),
122 REL_TYPE(R_386_JMP_SLOT),
123 REL_TYPE(R_386_RELATIVE),
124 REL_TYPE(R_386_GOTOFF),
125 REL_TYPE(R_386_GOTPC),
126#undef REL_TYPE
127 };
128 const char *name = "unknown type rel type name";
129 if (type < sizeof(type_name)/sizeof(type_name[0])) {
130 name = type_name[type];
131 }
132 return name;
133}
134
135static const char *sec_name(unsigned shndx)
136{
137 const char *sec_strtab;
138 const char *name;
139 sec_strtab = strtab[ehdr.e_shstrndx];
140 name = "<noname>";
141 if (shndx < ehdr.e_shnum) {
142 name = sec_strtab + shdr[shndx].sh_name;
143 }
144 else if (shndx == SHN_ABS) {
145 name = "ABSOLUTE";
146 }
147 else if (shndx == SHN_COMMON) {
148 name = "COMMON";
149 }
150 return name;
151}
152
153static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
154{
155 const char *name;
156 name = "<noname>";
157 if (sym->st_name) {
158 name = sym_strtab + sym->st_name;
159 }
160 else {
161 name = sec_name(shdr[sym->st_shndx].sh_name);
162 }
163 return name;
164}
165
166
167
168#if BYTE_ORDER == LITTLE_ENDIAN
169#define le16_to_cpu(val) (val)
170#define le32_to_cpu(val) (val)
171#endif
172#if BYTE_ORDER == BIG_ENDIAN
173#define le16_to_cpu(val) bswap_16(val)
174#define le32_to_cpu(val) bswap_32(val)
175#endif
176
177static uint16_t elf16_to_cpu(uint16_t val)
178{
179 return le16_to_cpu(val);
180}
181
182static uint32_t elf32_to_cpu(uint32_t val)
183{
184 return le32_to_cpu(val);
185}
186
187static void read_ehdr(FILE *fp)
188{
189 if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
190 die("Cannot read ELF header: %s\n",
191 strerror(errno));
192 }
193 if (memcmp(ehdr.e_ident, ELFMAG, 4) != 0) {
194 die("No ELF magic\n");
195 }
196 if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) {
197 die("Not a 32 bit executable\n");
198 }
199 if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
200 die("Not a LSB ELF executable\n");
201 }
202 if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
203 die("Unknown ELF version\n");
204 }
205 /* Convert the fields to native endian */
206 ehdr.e_type = elf16_to_cpu(ehdr.e_type);
207 ehdr.e_machine = elf16_to_cpu(ehdr.e_machine);
208 ehdr.e_version = elf32_to_cpu(ehdr.e_version);
209 ehdr.e_entry = elf32_to_cpu(ehdr.e_entry);
210 ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff);
211 ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff);
212 ehdr.e_flags = elf32_to_cpu(ehdr.e_flags);
213 ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize);
214 ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize);
215 ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum);
216 ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize);
217 ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum);
218 ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx);
219
220 if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) {
221 die("Unsupported ELF header type\n");
222 }
223 if (ehdr.e_machine != EM_386) {
224 die("Not for x86\n");
225 }
226 if (ehdr.e_version != EV_CURRENT) {
227 die("Unknown ELF version\n");
228 }
229 if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) {
230 die("Bad Elf header size\n");
231 }
232 if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) {
233 die("Bad program header entry\n");
234 }
235 if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) {
236 die("Bad section header entry\n");
237 }
238 if (ehdr.e_shstrndx >= ehdr.e_shnum) {
239 die("String table index out of bounds\n");
240 }
241}
242
243static void read_shdrs(FILE *fp)
244{
245 int i;
246 if (ehdr.e_shnum > MAX_SHDRS) {
247 die("%d section headers supported: %d\n",
248 ehdr.e_shnum, MAX_SHDRS);
249 }
250 if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
251 die("Seek to %d failed: %s\n",
252 ehdr.e_shoff, strerror(errno));
253 }
254 if (fread(&shdr, sizeof(shdr[0]), ehdr.e_shnum, fp) != ehdr.e_shnum) {
255 die("Cannot read ELF section headers: %s\n",
256 strerror(errno));
257 }
258 for(i = 0; i < ehdr.e_shnum; i++) {
259 shdr[i].sh_name = elf32_to_cpu(shdr[i].sh_name);
260 shdr[i].sh_type = elf32_to_cpu(shdr[i].sh_type);
261 shdr[i].sh_flags = elf32_to_cpu(shdr[i].sh_flags);
262 shdr[i].sh_addr = elf32_to_cpu(shdr[i].sh_addr);
263 shdr[i].sh_offset = elf32_to_cpu(shdr[i].sh_offset);
264 shdr[i].sh_size = elf32_to_cpu(shdr[i].sh_size);
265 shdr[i].sh_link = elf32_to_cpu(shdr[i].sh_link);
266 shdr[i].sh_info = elf32_to_cpu(shdr[i].sh_info);
267 shdr[i].sh_addralign = elf32_to_cpu(shdr[i].sh_addralign);
268 shdr[i].sh_entsize = elf32_to_cpu(shdr[i].sh_entsize);
269 }
270
271}
272
273static void read_strtabs(FILE *fp)
274{
275 int i;
276 for(i = 0; i < ehdr.e_shnum; i++) {
277 if (shdr[i].sh_type != SHT_STRTAB) {
278 continue;
279 }
280 strtab[i] = malloc(shdr[i].sh_size);
281 if (!strtab[i]) {
282 die("malloc of %d bytes for strtab failed\n",
283 shdr[i].sh_size);
284 }
285 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
286 die("Seek to %d failed: %s\n",
287 shdr[i].sh_offset, strerror(errno));
288 }
289 if (fread(strtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
290 die("Cannot read symbol table: %s\n",
291 strerror(errno));
292 }
293 }
294}
295
296static void read_symtabs(FILE *fp)
297{
298 int i,j;
299 for(i = 0; i < ehdr.e_shnum; i++) {
300 if (shdr[i].sh_type != SHT_SYMTAB) {
301 continue;
302 }
303 symtab[i] = malloc(shdr[i].sh_size);
304 if (!symtab[i]) {
305 die("malloc of %d bytes for symtab failed\n",
306 shdr[i].sh_size);
307 }
308 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
309 die("Seek to %d failed: %s\n",
310 shdr[i].sh_offset, strerror(errno));
311 }
312 if (fread(symtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
313 die("Cannot read symbol table: %s\n",
314 strerror(errno));
315 }
316 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[i][0]); j++) {
317 symtab[i][j].st_name = elf32_to_cpu(symtab[i][j].st_name);
318 symtab[i][j].st_value = elf32_to_cpu(symtab[i][j].st_value);
319 symtab[i][j].st_size = elf32_to_cpu(symtab[i][j].st_size);
320 symtab[i][j].st_shndx = elf16_to_cpu(symtab[i][j].st_shndx);
321 }
322 }
323}
324
325
326static void read_relocs(FILE *fp)
327{
328 int i,j;
329 for(i = 0; i < ehdr.e_shnum; i++) {
330 if (shdr[i].sh_type != SHT_REL) {
331 continue;
332 }
333 reltab[i] = malloc(shdr[i].sh_size);
334 if (!reltab[i]) {
335 die("malloc of %d bytes for relocs failed\n",
336 shdr[i].sh_size);
337 }
338 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
339 die("Seek to %d failed: %s\n",
340 shdr[i].sh_offset, strerror(errno));
341 }
342 if (fread(reltab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
343 die("Cannot read symbol table: %s\n",
344 strerror(errno));
345 }
346 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
347 reltab[i][j].r_offset = elf32_to_cpu(reltab[i][j].r_offset);
348 reltab[i][j].r_info = elf32_to_cpu(reltab[i][j].r_info);
349 }
350 }
351}
352
353
354static void print_absolute_symbols(void)
355{
356 int i;
357 printf("Absolute symbols\n");
358 printf(" Num: Value Size Type Bind Visibility Name\n");
359 for(i = 0; i < ehdr.e_shnum; i++) {
360 char *sym_strtab;
361 Elf32_Sym *sh_symtab;
362 int j;
363 if (shdr[i].sh_type != SHT_SYMTAB) {
364 continue;
365 }
366 sh_symtab = symtab[i];
367 sym_strtab = strtab[shdr[i].sh_link];
368 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[0][0]); j++) {
369 Elf32_Sym *sym;
370 const char *name;
371 sym = &symtab[i][j];
372 name = sym_name(sym_strtab, sym);
373 if (sym->st_shndx != SHN_ABS) {
374 continue;
375 }
376 printf("%5d %08x %5d %10s %10s %12s %s\n",
377 j, sym->st_value, sym->st_size,
378 sym_type(ELF32_ST_TYPE(sym->st_info)),
379 sym_bind(ELF32_ST_BIND(sym->st_info)),
380 sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)),
381 name);
382 }
383 }
384 printf("\n");
385}
386
387static void print_absolute_relocs(void)
388{
389 int i, printed = 0;
390
391 for(i = 0; i < ehdr.e_shnum; i++) {
392 char *sym_strtab;
393 Elf32_Sym *sh_symtab;
394 unsigned sec_applies, sec_symtab;
395 int j;
396 if (shdr[i].sh_type != SHT_REL) {
397 continue;
398 }
399 sec_symtab = shdr[i].sh_link;
400 sec_applies = shdr[i].sh_info;
401 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) {
402 continue;
403 }
404 sh_symtab = symtab[sec_symtab];
405 sym_strtab = strtab[shdr[sec_symtab].sh_link];
406 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
407 Elf32_Rel *rel;
408 Elf32_Sym *sym;
409 const char *name;
410 rel = &reltab[i][j];
411 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
412 name = sym_name(sym_strtab, sym);
413 if (sym->st_shndx != SHN_ABS) {
414 continue;
415 }
416
417 /* Absolute symbols are not relocated if bzImage is
418 * loaded at a non-compiled address. Display a warning
419 * to user at compile time about the absolute
420 * relocations present.
421 *
422 * User need to audit the code to make sure
423 * some symbols which should have been section
424 * relative have not become absolute because of some
425 * linker optimization or wrong programming usage.
426 *
427 * Before warning check if this absolute symbol
428 * relocation is harmless.
429 */
430 if (is_safe_abs_reloc(name))
431 continue;
432
433 if (!printed) {
434 printf("WARNING: Absolute relocations"
435 " present\n");
436 printf("Offset Info Type Sym.Value "
437 "Sym.Name\n");
438 printed = 1;
439 }
440
441 printf("%08x %08x %10s %08x %s\n",
442 rel->r_offset,
443 rel->r_info,
444 rel_type(ELF32_R_TYPE(rel->r_info)),
445 sym->st_value,
446 name);
447 }
448 }
449
450 if (printed)
451 printf("\n");
452}
453
454static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
455{
456 int i;
457 /* Walk through the relocations */
458 for(i = 0; i < ehdr.e_shnum; i++) {
459 char *sym_strtab;
460 Elf32_Sym *sh_symtab;
461 unsigned sec_applies, sec_symtab;
462 int j;
463 if (shdr[i].sh_type != SHT_REL) {
464 continue;
465 }
466 sec_symtab = shdr[i].sh_link;
467 sec_applies = shdr[i].sh_info;
468 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) {
469 continue;
470 }
471 sh_symtab = symtab[sec_symtab];
472 sym_strtab = strtab[shdr[sec_symtab].sh_link];
473 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
474 Elf32_Rel *rel;
475 Elf32_Sym *sym;
476 unsigned r_type;
477 rel = &reltab[i][j];
478 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
479 r_type = ELF32_R_TYPE(rel->r_info);
480 /* Don't visit relocations to absolute symbols */
481 if (sym->st_shndx == SHN_ABS) {
482 continue;
483 }
484 if (r_type == R_386_PC32) {
485 /* PC relative relocations don't need to be adjusted */
486 }
487 else if (r_type == R_386_32) {
488 /* Visit relocations that need to be adjusted */
489 visit(rel, sym);
490 }
491 else {
492 die("Unsupported relocation type: %d\n", r_type);
493 }
494 }
495 }
496}
497
498static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
499{
500 reloc_count += 1;
501}
502
503static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
504{
505 /* Remember the address that needs to be adjusted. */
506 relocs[reloc_idx++] = rel->r_offset;
507}
508
509static int cmp_relocs(const void *va, const void *vb)
510{
511 const unsigned long *a, *b;
512 a = va; b = vb;
513 return (*a == *b)? 0 : (*a > *b)? 1 : -1;
514}
515
516static void emit_relocs(int as_text)
517{
518 int i;
519 /* Count how many relocations I have and allocate space for them. */
520 reloc_count = 0;
521 walk_relocs(count_reloc);
522 relocs = malloc(reloc_count * sizeof(relocs[0]));
523 if (!relocs) {
524 die("malloc of %d entries for relocs failed\n",
525 reloc_count);
526 }
527 /* Collect up the relocations */
528 reloc_idx = 0;
529 walk_relocs(collect_reloc);
530
531 /* Order the relocations for more efficient processing */
532 qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs);
533
534 /* Print the relocations */
535 if (as_text) {
536 /* Print the relocations in a form suitable that
537 * gas will like.
538 */
539 printf(".section \".data.reloc\",\"a\"\n");
540 printf(".balign 4\n");
541 for(i = 0; i < reloc_count; i++) {
542 printf("\t .long 0x%08lx\n", relocs[i]);
543 }
544 printf("\n");
545 }
546 else {
547 unsigned char buf[4];
548 buf[0] = buf[1] = buf[2] = buf[3] = 0;
549 /* Print a stop */
550 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
551 /* Now print each relocation */
552 for(i = 0; i < reloc_count; i++) {
553 buf[0] = (relocs[i] >> 0) & 0xff;
554 buf[1] = (relocs[i] >> 8) & 0xff;
555 buf[2] = (relocs[i] >> 16) & 0xff;
556 buf[3] = (relocs[i] >> 24) & 0xff;
557 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
558 }
559 }
560}
561
562static void usage(void)
563{
564 die("relocs [--abs-syms |--abs-relocs | --text] vmlinux\n");
565}
566
567int main(int argc, char **argv)
568{
569 int show_absolute_syms, show_absolute_relocs;
570 int as_text;
571 const char *fname;
572 FILE *fp;
573 int i;
574
575 show_absolute_syms = 0;
576 show_absolute_relocs = 0;
577 as_text = 0;
578 fname = NULL;
579 for(i = 1; i < argc; i++) {
580 char *arg = argv[i];
581 if (*arg == '-') {
582 if (strcmp(argv[1], "--abs-syms") == 0) {
583 show_absolute_syms = 1;
584 continue;
585 }
586
587 if (strcmp(argv[1], "--abs-relocs") == 0) {
588 show_absolute_relocs = 1;
589 continue;
590 }
591 else if (strcmp(argv[1], "--text") == 0) {
592 as_text = 1;
593 continue;
594 }
595 }
596 else if (!fname) {
597 fname = arg;
598 continue;
599 }
600 usage();
601 }
602 if (!fname) {
603 usage();
604 }
605 fp = fopen(fname, "r");
606 if (!fp) {
607 die("Cannot open %s: %s\n",
608 fname, strerror(errno));
609 }
610 read_ehdr(fp);
611 read_shdrs(fp);
612 read_strtabs(fp);
613 read_symtabs(fp);
614 read_relocs(fp);
615 if (show_absolute_syms) {
616 print_absolute_symbols();
617 return 0;
618 }
619 if (show_absolute_relocs) {
620 print_absolute_relocs();
621 return 0;
622 }
623 emit_relocs(as_text);
624 return 0;
625}
diff --git a/arch/i386/boot/compressed/vmlinux.lds b/arch/i386/boot/compressed/vmlinux.lds
new file mode 100644
index 000000000000..cc4854f6c6c1
--- /dev/null
+++ b/arch/i386/boot/compressed/vmlinux.lds
@@ -0,0 +1,43 @@
1OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
2OUTPUT_ARCH(i386)
3ENTRY(startup_32)
4SECTIONS
5{
6 /* Be careful parts of head.S assume startup_32 is at
7 * address 0.
8 */
9 . = 0 ;
10 .text.head : {
11 _head = . ;
12 *(.text.head)
13 _ehead = . ;
14 }
15 .data.compressed : {
16 *(.data.compressed)
17 }
18 .text : {
19 _text = .; /* Text */
20 *(.text)
21 *(.text.*)
22 _etext = . ;
23 }
24 .rodata : {
25 _rodata = . ;
26 *(.rodata) /* read-only data */
27 *(.rodata.*)
28 _erodata = . ;
29 }
30 .data : {
31 _data = . ;
32 *(.data)
33 *(.data.*)
34 _edata = . ;
35 }
36 .bss : {
37 _bss = . ;
38 *(.bss)
39 *(.bss.*)
40 *(COMMON)
41 _end = . ;
42 }
43}
diff --git a/arch/i386/boot/compressed/vmlinux.scr b/arch/i386/boot/compressed/vmlinux.scr
index 1ed9d791f863..707a88f7f29e 100644
--- a/arch/i386/boot/compressed/vmlinux.scr
+++ b/arch/i386/boot/compressed/vmlinux.scr
@@ -1,9 +1,10 @@
1SECTIONS 1SECTIONS
2{ 2{
3 .data : { 3 .data.compressed : {
4 input_len = .; 4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .; 5 LONG(input_data_end - input_data) input_data = .;
6 *(.data) 6 *(.data)
7 output_len = . - 4;
7 input_data_end = .; 8 input_data_end = .;
8 } 9 }
9} 10}
diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S
index 3aec4538a113..06edf1c66242 100644
--- a/arch/i386/boot/setup.S
+++ b/arch/i386/boot/setup.S
@@ -81,7 +81,7 @@ start:
81# This is the setup header, and it must start at %cs:2 (old 0x9020:2) 81# This is the setup header, and it must start at %cs:2 (old 0x9020:2)
82 82
83 .ascii "HdrS" # header signature 83 .ascii "HdrS" # header signature
84 .word 0x0204 # header version number (>= 0x0105) 84 .word 0x0205 # header version number (>= 0x0105)
85 # or else old loadlin-1.5 will fail) 85 # or else old loadlin-1.5 will fail)
86realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 86realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
87start_sys_seg: .word SYSSEG 87start_sys_seg: .word SYSSEG
@@ -160,6 +160,17 @@ ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff
160 # The highest safe address for 160 # The highest safe address for
161 # the contents of an initrd 161 # the contents of an initrd
162 162
163kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment
164 #required for protected mode
165 #kernel
166#ifdef CONFIG_RELOCATABLE
167relocatable_kernel: .byte 1
168#else
169relocatable_kernel: .byte 0
170#endif
171pad2: .byte 0
172pad3: .word 0
173
163trampoline: call start_of_setup 174trampoline: call start_of_setup
164 .align 16 175 .align 16
165 # The offset at this point is 0x240 176 # The offset at this point is 0x240
@@ -588,11 +599,6 @@ rmodeswtch_normal:
588 call default_switch 599 call default_switch
589 600
590rmodeswtch_end: 601rmodeswtch_end:
591# we get the code32 start address and modify the below 'jmpi'
592# (loader may have changed it)
593 movl %cs:code32_start, %eax
594 movl %eax, %cs:code32
595
596# Now we move the system to its rightful place ... but we check if we have a 602# Now we move the system to its rightful place ... but we check if we have a
597# big-kernel. In that case we *must* not move it ... 603# big-kernel. In that case we *must* not move it ...
598 testb $LOADED_HIGH, %cs:loadflags 604 testb $LOADED_HIGH, %cs:loadflags
@@ -788,11 +794,12 @@ a20_err_msg:
788a20_done: 794a20_done:
789 795
790#endif /* CONFIG_X86_VOYAGER */ 796#endif /* CONFIG_X86_VOYAGER */
791# set up gdt and idt 797# set up gdt and idt and 32bit start address
792 lidt idt_48 # load idt with 0,0 798 lidt idt_48 # load idt with 0,0
793 xorl %eax, %eax # Compute gdt_base 799 xorl %eax, %eax # Compute gdt_base
794 movw %ds, %ax # (Convert %ds:gdt to a linear ptr) 800 movw %ds, %ax # (Convert %ds:gdt to a linear ptr)
795 shll $4, %eax 801 shll $4, %eax
802 addl %eax, code32
796 addl $gdt, %eax 803 addl $gdt, %eax
797 movl %eax, (gdt_48+2) 804 movl %eax, (gdt_48+2)
798 lgdt gdt_48 # load gdt with whatever is 805 lgdt gdt_48 # load gdt with whatever is
@@ -851,9 +858,26 @@ flush_instr:
851# Manual, Mixing 16-bit and 32-bit code, page 16-6) 858# Manual, Mixing 16-bit and 32-bit code, page 16-6)
852 859
853 .byte 0x66, 0xea # prefix + jmpi-opcode 860 .byte 0x66, 0xea # prefix + jmpi-opcode
854code32: .long 0x1000 # will be set to 0x100000 861code32: .long startup_32 # will be set to %cs+startup_32
855 # for big kernels
856 .word __BOOT_CS 862 .word __BOOT_CS
863.code32
864startup_32:
865 movl $(__BOOT_DS), %eax
866 movl %eax, %ds
867 movl %eax, %es
868 movl %eax, %fs
869 movl %eax, %gs
870 movl %eax, %ss
871
872 xorl %eax, %eax
8731: incl %eax # check that A20 really IS enabled
874 movl %eax, 0x00000000 # loop forever if it isn't
875 cmpl %eax, 0x00100000
876 je 1b
877
878 # Jump to the 32bit entry point
879 jmpl *(code32_start - start + (DELTA_INITSEG << 4))(%esi)
880.code16
857 881
858# Here's a bunch of information about your current kernel.. 882# Here's a bunch of information about your current kernel..
859kernel_version: .ascii UTS_RELEASE 883kernel_version: .ascii UTS_RELEASE
diff --git a/arch/i386/defconfig b/arch/i386/defconfig
index 97aacd6bd7d8..65891f11aced 100644
--- a/arch/i386/defconfig
+++ b/arch/i386/defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.19-rc2-git4 3# Linux kernel version: 2.6.19-git7
4# Sat Oct 21 03:38:56 2006 4# Wed Dec 6 23:50:49 2006
5# 5#
6CONFIG_X86_32=y 6CONFIG_X86_32=y
7CONFIG_GENERIC_TIME=y 7CONFIG_GENERIC_TIME=y
@@ -40,13 +40,14 @@ CONFIG_POSIX_MQUEUE=y
40CONFIG_IKCONFIG=y 40CONFIG_IKCONFIG=y
41CONFIG_IKCONFIG_PROC=y 41CONFIG_IKCONFIG_PROC=y
42# CONFIG_CPUSETS is not set 42# CONFIG_CPUSETS is not set
43CONFIG_SYSFS_DEPRECATED=y
43# CONFIG_RELAY is not set 44# CONFIG_RELAY is not set
44CONFIG_INITRAMFS_SOURCE="" 45CONFIG_INITRAMFS_SOURCE=""
45CONFIG_CC_OPTIMIZE_FOR_SIZE=y 46CONFIG_CC_OPTIMIZE_FOR_SIZE=y
46CONFIG_SYSCTL=y 47CONFIG_SYSCTL=y
47# CONFIG_EMBEDDED is not set 48# CONFIG_EMBEDDED is not set
48CONFIG_UID16=y 49CONFIG_UID16=y
49# CONFIG_SYSCTL_SYSCALL is not set 50CONFIG_SYSCTL_SYSCALL=y
50CONFIG_KALLSYMS=y 51CONFIG_KALLSYMS=y
51CONFIG_KALLSYMS_ALL=y 52CONFIG_KALLSYMS_ALL=y
52# CONFIG_KALLSYMS_EXTRA_PASS is not set 53# CONFIG_KALLSYMS_EXTRA_PASS is not set
@@ -110,6 +111,7 @@ CONFIG_SMP=y
110# CONFIG_X86_VISWS is not set 111# CONFIG_X86_VISWS is not set
111CONFIG_X86_GENERICARCH=y 112CONFIG_X86_GENERICARCH=y
112# CONFIG_X86_ES7000 is not set 113# CONFIG_X86_ES7000 is not set
114# CONFIG_PARAVIRT is not set
113CONFIG_X86_CYCLONE_TIMER=y 115CONFIG_X86_CYCLONE_TIMER=y
114# CONFIG_M386 is not set 116# CONFIG_M386 is not set
115# CONFIG_M486 is not set 117# CONFIG_M486 is not set
@@ -120,6 +122,7 @@ CONFIG_X86_CYCLONE_TIMER=y
120# CONFIG_MPENTIUMII is not set 122# CONFIG_MPENTIUMII is not set
121CONFIG_MPENTIUMIII=y 123CONFIG_MPENTIUMIII=y
122# CONFIG_MPENTIUMM is not set 124# CONFIG_MPENTIUMM is not set
125# CONFIG_MCORE2 is not set
123# CONFIG_MPENTIUM4 is not set 126# CONFIG_MPENTIUM4 is not set
124# CONFIG_MK6 is not set 127# CONFIG_MK6 is not set
125# CONFIG_MK7 is not set 128# CONFIG_MK7 is not set
@@ -197,7 +200,6 @@ CONFIG_RESOURCES_64BIT=y
197CONFIG_MTRR=y 200CONFIG_MTRR=y
198# CONFIG_EFI is not set 201# CONFIG_EFI is not set
199# CONFIG_IRQBALANCE is not set 202# CONFIG_IRQBALANCE is not set
200CONFIG_REGPARM=y
201CONFIG_SECCOMP=y 203CONFIG_SECCOMP=y
202# CONFIG_HZ_100 is not set 204# CONFIG_HZ_100 is not set
203CONFIG_HZ_250=y 205CONFIG_HZ_250=y
@@ -205,7 +207,8 @@ CONFIG_HZ_250=y
205CONFIG_HZ=250 207CONFIG_HZ=250
206# CONFIG_KEXEC is not set 208# CONFIG_KEXEC is not set
207# CONFIG_CRASH_DUMP is not set 209# CONFIG_CRASH_DUMP is not set
208CONFIG_PHYSICAL_START=0x100000 210# CONFIG_RELOCATABLE is not set
211CONFIG_PHYSICAL_ALIGN=0x100000
209# CONFIG_HOTPLUG_CPU is not set 212# CONFIG_HOTPLUG_CPU is not set
210CONFIG_COMPAT_VDSO=y 213CONFIG_COMPAT_VDSO=y
211CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y 214CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
@@ -367,6 +370,7 @@ CONFIG_INET_TCP_DIAG=y
367# CONFIG_TCP_CONG_ADVANCED is not set 370# CONFIG_TCP_CONG_ADVANCED is not set
368CONFIG_TCP_CONG_CUBIC=y 371CONFIG_TCP_CONG_CUBIC=y
369CONFIG_DEFAULT_TCP_CONG="cubic" 372CONFIG_DEFAULT_TCP_CONG="cubic"
373# CONFIG_TCP_MD5SIG is not set
370CONFIG_IPV6=y 374CONFIG_IPV6=y
371# CONFIG_IPV6_PRIVACY is not set 375# CONFIG_IPV6_PRIVACY is not set
372# CONFIG_IPV6_ROUTER_PREF is not set 376# CONFIG_IPV6_ROUTER_PREF is not set
@@ -677,6 +681,7 @@ CONFIG_SATA_INTEL_COMBINED=y
677# CONFIG_PATA_IT821X is not set 681# CONFIG_PATA_IT821X is not set
678# CONFIG_PATA_JMICRON is not set 682# CONFIG_PATA_JMICRON is not set
679# CONFIG_PATA_TRIFLEX is not set 683# CONFIG_PATA_TRIFLEX is not set
684# CONFIG_PATA_MARVELL is not set
680# CONFIG_PATA_MPIIX is not set 685# CONFIG_PATA_MPIIX is not set
681# CONFIG_PATA_OLDPIIX is not set 686# CONFIG_PATA_OLDPIIX is not set
682# CONFIG_PATA_NETCELL is not set 687# CONFIG_PATA_NETCELL is not set
@@ -850,6 +855,7 @@ CONFIG_BNX2=y
850# CONFIG_IXGB is not set 855# CONFIG_IXGB is not set
851# CONFIG_S2IO is not set 856# CONFIG_S2IO is not set
852# CONFIG_MYRI10GE is not set 857# CONFIG_MYRI10GE is not set
858# CONFIG_NETXEN_NIC is not set
853 859
854# 860#
855# Token Ring devices 861# Token Ring devices
@@ -984,10 +990,6 @@ CONFIG_RTC=y
984# CONFIG_R3964 is not set 990# CONFIG_R3964 is not set
985# CONFIG_APPLICOM is not set 991# CONFIG_APPLICOM is not set
986# CONFIG_SONYPI is not set 992# CONFIG_SONYPI is not set
987
988#
989# Ftape, the floppy tape device driver
990#
991CONFIG_AGP=y 993CONFIG_AGP=y
992# CONFIG_AGP_ALI is not set 994# CONFIG_AGP_ALI is not set
993# CONFIG_AGP_ATI is not set 995# CONFIG_AGP_ATI is not set
@@ -1108,6 +1110,7 @@ CONFIG_USB_DEVICEFS=y
1108# CONFIG_USB_BANDWIDTH is not set 1110# CONFIG_USB_BANDWIDTH is not set
1109# CONFIG_USB_DYNAMIC_MINORS is not set 1111# CONFIG_USB_DYNAMIC_MINORS is not set
1110# CONFIG_USB_SUSPEND is not set 1112# CONFIG_USB_SUSPEND is not set
1113# CONFIG_USB_MULTITHREAD_PROBE is not set
1111# CONFIG_USB_OTG is not set 1114# CONFIG_USB_OTG is not set
1112 1115
1113# 1116#
@@ -1185,6 +1188,7 @@ CONFIG_USB_HIDINPUT=y
1185# CONFIG_USB_KAWETH is not set 1188# CONFIG_USB_KAWETH is not set
1186# CONFIG_USB_PEGASUS is not set 1189# CONFIG_USB_PEGASUS is not set
1187# CONFIG_USB_RTL8150 is not set 1190# CONFIG_USB_RTL8150 is not set
1191# CONFIG_USB_USBNET_MII is not set
1188# CONFIG_USB_USBNET is not set 1192# CONFIG_USB_USBNET is not set
1189CONFIG_USB_MON=y 1193CONFIG_USB_MON=y
1190 1194
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 1a884b6e6e5c..1e8988e558c5 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -6,7 +6,7 @@ extra-y := head.o init_task.o vmlinux.lds
6 6
7obj-y := process.o signal.o entry.o traps.o irq.o \ 7obj-y := process.o signal.o entry.o traps.o irq.o \
8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ 8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
9 pci-dma.o i386_ksyms.o i387.o bootflag.o \ 9 pci-dma.o i386_ksyms.o i387.o bootflag.o e820.o\
10 quirks.o i8237.o topology.o alternative.o i8253.o tsc.o 10 quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
11 11
12obj-$(CONFIG_STACKTRACE) += stacktrace.o 12obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -40,6 +40,9 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
40obj-$(CONFIG_HPET_TIMER) += hpet.o 40obj-$(CONFIG_HPET_TIMER) += hpet.o
41obj-$(CONFIG_K8_NB) += k8.o 41obj-$(CONFIG_K8_NB) += k8.o
42 42
43# Make sure this is linked after any other paravirt_ops structs: see head.S
44obj-$(CONFIG_PARAVIRT) += paravirt.o
45
43EXTRA_AFLAGS := -traditional 46EXTRA_AFLAGS := -traditional
44 47
45obj-$(CONFIG_SCx200) += scx200.o 48obj-$(CONFIG_SCx200) += scx200.o
diff --git a/arch/i386/kernel/acpi/cstate.c b/arch/i386/kernel/acpi/cstate.c
index 4664b55f623e..12e937c1ce4b 100644
--- a/arch/i386/kernel/acpi/cstate.c
+++ b/arch/i386/kernel/acpi/cstate.c
@@ -156,10 +156,8 @@ static int __init ffh_cstate_init(void)
156 156
157static void __exit ffh_cstate_exit(void) 157static void __exit ffh_cstate_exit(void)
158{ 158{
159 if (cpu_cstate_entry) { 159 free_percpu(cpu_cstate_entry);
160 free_percpu(cpu_cstate_entry); 160 cpu_cstate_entry = NULL;
161 cpu_cstate_entry = NULL;
162 }
163} 161}
164 162
165arch_initcall(ffh_cstate_init); 163arch_initcall(ffh_cstate_init);
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index c9841692bb7c..4b60af7f91dd 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,6 +10,7 @@
10#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
11#include <asm/acpi.h> 11#include <asm/acpi.h>
12#include <asm/apic.h> 12#include <asm/apic.h>
13#include <asm/irq.h>
13 14
14#ifdef CONFIG_ACPI 15#ifdef CONFIG_ACPI
15 16
@@ -49,6 +50,24 @@ static int __init check_bridge(int vendor, int device)
49 return 0; 50 return 0;
50} 51}
51 52
53static void check_intel(void)
54{
55 u16 vendor, device;
56
57 vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
58
59 if (vendor != PCI_VENDOR_ID_INTEL)
60 return;
61
62 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
63#ifdef CONFIG_SMP
64 if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
65 device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
66 device == PCI_DEVICE_ID_INTEL_E7525_MCH)
67 quirk_intel_irqbalance();
68#endif
69}
70
52void __init check_acpi_pci(void) 71void __init check_acpi_pci(void)
53{ 72{
54 int num, slot, func; 73 int num, slot, func;
@@ -60,6 +79,8 @@ void __init check_acpi_pci(void)
60 if (!early_pci_allowed()) 79 if (!early_pci_allowed())
61 return; 80 return;
62 81
82 check_intel();
83
63 /* Poor man's PCI discovery */ 84 /* Poor man's PCI discovery */
64 for (num = 0; num < 32; num++) { 85 for (num = 0; num < 32; num++) {
65 for (slot = 0; slot < 32; slot++) { 86 for (slot = 0; slot < 32; slot++) {
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 535f9794fba1..9eca21b49f6b 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -124,6 +124,20 @@ static unsigned char** find_nop_table(void)
124 124
125#endif /* CONFIG_X86_64 */ 125#endif /* CONFIG_X86_64 */
126 126
127static void nop_out(void *insns, unsigned int len)
128{
129 unsigned char **noptable = find_nop_table();
130
131 while (len > 0) {
132 unsigned int noplen = len;
133 if (noplen > ASM_NOP_MAX)
134 noplen = ASM_NOP_MAX;
135 memcpy(insns, noptable[noplen], noplen);
136 insns += noplen;
137 len -= noplen;
138 }
139}
140
127extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 141extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
128extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; 142extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
129extern u8 *__smp_locks[], *__smp_locks_end[]; 143extern u8 *__smp_locks[], *__smp_locks_end[];
@@ -138,10 +152,9 @@ extern u8 __smp_alt_begin[], __smp_alt_end[];
138 152
139void apply_alternatives(struct alt_instr *start, struct alt_instr *end) 153void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
140{ 154{
141 unsigned char **noptable = find_nop_table();
142 struct alt_instr *a; 155 struct alt_instr *a;
143 u8 *instr; 156 u8 *instr;
144 int diff, i, k; 157 int diff;
145 158
146 DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); 159 DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
147 for (a = start; a < end; a++) { 160 for (a = start; a < end; a++) {
@@ -159,13 +172,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
159#endif 172#endif
160 memcpy(instr, a->replacement, a->replacementlen); 173 memcpy(instr, a->replacement, a->replacementlen);
161 diff = a->instrlen - a->replacementlen; 174 diff = a->instrlen - a->replacementlen;
162 /* Pad the rest with nops */ 175 nop_out(instr + a->replacementlen, diff);
163 for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
164 k = diff;
165 if (k > ASM_NOP_MAX)
166 k = ASM_NOP_MAX;
167 memcpy(a->instr + i, noptable[k], k);
168 }
169 } 176 }
170} 177}
171 178
@@ -209,7 +216,6 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
209 216
210static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 217static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
211{ 218{
212 unsigned char **noptable = find_nop_table();
213 u8 **ptr; 219 u8 **ptr;
214 220
215 for (ptr = start; ptr < end; ptr++) { 221 for (ptr = start; ptr < end; ptr++) {
@@ -217,7 +223,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
217 continue; 223 continue;
218 if (*ptr > text_end) 224 if (*ptr > text_end)
219 continue; 225 continue;
220 **ptr = noptable[1][0]; 226 nop_out(*ptr, 1);
221 }; 227 };
222} 228}
223 229
@@ -343,6 +349,40 @@ void alternatives_smp_switch(int smp)
343 349
344#endif 350#endif
345 351
352#ifdef CONFIG_PARAVIRT
353void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
354{
355 struct paravirt_patch *p;
356
357 for (p = start; p < end; p++) {
358 unsigned int used;
359
360 used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
361 p->len);
362#ifdef CONFIG_DEBUG_PARAVIRT
363 {
364 int i;
365 /* Deliberately clobber regs using "not %reg" to find bugs. */
366 for (i = 0; i < 3; i++) {
367 if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
368 memcpy(p->instr + used, "\xf7\xd0", 2);
369 p->instr[used+1] |= i;
370 used += 2;
371 }
372 }
373 }
374#endif
375 /* Pad the rest with nops */
376 nop_out(p->instr + used, p->len - used);
377 }
378
379 /* Sync to be conservative, in case we patched following instructions */
380 sync_core();
381}
382extern struct paravirt_patch __start_parainstructions[],
383 __stop_parainstructions[];
384#endif /* CONFIG_PARAVIRT */
385
346void __init alternative_instructions(void) 386void __init alternative_instructions(void)
347{ 387{
348 unsigned long flags; 388 unsigned long flags;
@@ -390,5 +430,6 @@ void __init alternative_instructions(void)
390 alternatives_smp_switch(0); 430 alternatives_smp_switch(0);
391 } 431 }
392#endif 432#endif
433 apply_paravirt(__start_parainstructions, __stop_parainstructions);
393 local_irq_restore(flags); 434 local_irq_restore(flags);
394} 435}
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 2fd4b7d927c2..776d9be26af9 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -647,23 +647,30 @@ static struct {
647static int lapic_suspend(struct sys_device *dev, pm_message_t state) 647static int lapic_suspend(struct sys_device *dev, pm_message_t state)
648{ 648{
649 unsigned long flags; 649 unsigned long flags;
650 int maxlvt;
650 651
651 if (!apic_pm_state.active) 652 if (!apic_pm_state.active)
652 return 0; 653 return 0;
653 654
655 maxlvt = get_maxlvt();
656
654 apic_pm_state.apic_id = apic_read(APIC_ID); 657 apic_pm_state.apic_id = apic_read(APIC_ID);
655 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); 658 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
656 apic_pm_state.apic_ldr = apic_read(APIC_LDR); 659 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
657 apic_pm_state.apic_dfr = apic_read(APIC_DFR); 660 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
658 apic_pm_state.apic_spiv = apic_read(APIC_SPIV); 661 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
659 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); 662 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
660 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); 663 if (maxlvt >= 4)
664 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
661 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); 665 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
662 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); 666 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
663 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 667 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
664 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 668 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
665 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 669 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
666 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 670#ifdef CONFIG_X86_MCE_P4THERMAL
671 if (maxlvt >= 5)
672 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
673#endif
667 674
668 local_irq_save(flags); 675 local_irq_save(flags);
669 disable_local_APIC(); 676 disable_local_APIC();
@@ -675,10 +682,13 @@ static int lapic_resume(struct sys_device *dev)
675{ 682{
676 unsigned int l, h; 683 unsigned int l, h;
677 unsigned long flags; 684 unsigned long flags;
685 int maxlvt;
678 686
679 if (!apic_pm_state.active) 687 if (!apic_pm_state.active)
680 return 0; 688 return 0;
681 689
690 maxlvt = get_maxlvt();
691
682 local_irq_save(flags); 692 local_irq_save(flags);
683 693
684 /* 694 /*
@@ -700,8 +710,12 @@ static int lapic_resume(struct sys_device *dev)
700 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 710 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
701 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 711 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
702 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 712 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
703 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 713#ifdef CONFIG_X86_MCE_P4THERMAL
704 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); 714 if (maxlvt >= 5)
715 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
716#endif
717 if (maxlvt >= 4)
718 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
705 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); 719 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
706 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); 720 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
707 apic_write(APIC_TMICT, apic_pm_state.apic_tmict); 721 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index a60358fe9a49..a97847da9ed5 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -231,6 +231,7 @@
231#include <asm/uaccess.h> 231#include <asm/uaccess.h>
232#include <asm/desc.h> 232#include <asm/desc.h>
233#include <asm/i8253.h> 233#include <asm/i8253.h>
234#include <asm/paravirt.h>
234 235
235#include "io_ports.h" 236#include "io_ports.h"
236 237
@@ -2235,7 +2236,7 @@ static int __init apm_init(void)
2235 2236
2236 dmi_check_system(apm_dmi_table); 2237 dmi_check_system(apm_dmi_table);
2237 2238
2238 if (apm_info.bios.version == 0) { 2239 if (apm_info.bios.version == 0 || paravirt_enabled()) {
2239 printk(KERN_INFO "apm: BIOS not found.\n"); 2240 printk(KERN_INFO "apm: BIOS not found.\n");
2240 return -ENODEV; 2241 return -ENODEV;
2241 } 2242 }
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index c80271f8f084..1b2f3cd33270 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -15,6 +15,7 @@
15#include <asm/processor.h> 15#include <asm/processor.h>
16#include <asm/thread_info.h> 16#include <asm/thread_info.h>
17#include <asm/elf.h> 17#include <asm/elf.h>
18#include <asm/pda.h>
18 19
19#define DEFINE(sym, val) \ 20#define DEFINE(sym, val) \
20 asm volatile("\n->" #sym " %0 " #val : : "i" (val)) 21 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -51,13 +52,35 @@ void foo(void)
51 OFFSET(TI_exec_domain, thread_info, exec_domain); 52 OFFSET(TI_exec_domain, thread_info, exec_domain);
52 OFFSET(TI_flags, thread_info, flags); 53 OFFSET(TI_flags, thread_info, flags);
53 OFFSET(TI_status, thread_info, status); 54 OFFSET(TI_status, thread_info, status);
54 OFFSET(TI_cpu, thread_info, cpu);
55 OFFSET(TI_preempt_count, thread_info, preempt_count); 55 OFFSET(TI_preempt_count, thread_info, preempt_count);
56 OFFSET(TI_addr_limit, thread_info, addr_limit); 56 OFFSET(TI_addr_limit, thread_info, addr_limit);
57 OFFSET(TI_restart_block, thread_info, restart_block); 57 OFFSET(TI_restart_block, thread_info, restart_block);
58 OFFSET(TI_sysenter_return, thread_info, sysenter_return); 58 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
59 BLANK(); 59 BLANK();
60 60
61 OFFSET(GDS_size, Xgt_desc_struct, size);
62 OFFSET(GDS_address, Xgt_desc_struct, address);
63 OFFSET(GDS_pad, Xgt_desc_struct, pad);
64 BLANK();
65
66 OFFSET(PT_EBX, pt_regs, ebx);
67 OFFSET(PT_ECX, pt_regs, ecx);
68 OFFSET(PT_EDX, pt_regs, edx);
69 OFFSET(PT_ESI, pt_regs, esi);
70 OFFSET(PT_EDI, pt_regs, edi);
71 OFFSET(PT_EBP, pt_regs, ebp);
72 OFFSET(PT_EAX, pt_regs, eax);
73 OFFSET(PT_DS, pt_regs, xds);
74 OFFSET(PT_ES, pt_regs, xes);
75 OFFSET(PT_GS, pt_regs, xgs);
76 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
77 OFFSET(PT_EIP, pt_regs, eip);
78 OFFSET(PT_CS, pt_regs, xcs);
79 OFFSET(PT_EFLAGS, pt_regs, eflags);
80 OFFSET(PT_OLDESP, pt_regs, esp);
81 OFFSET(PT_OLDSS, pt_regs, xss);
82 BLANK();
83
61 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); 84 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
62 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 85 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
63 BLANK(); 86 BLANK();
@@ -74,4 +97,18 @@ void foo(void)
74 DEFINE(VDSO_PRELINK, VDSO_PRELINK); 97 DEFINE(VDSO_PRELINK, VDSO_PRELINK);
75 98
76 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); 99 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
100
101 BLANK();
102 OFFSET(PDA_cpu, i386_pda, cpu_number);
103 OFFSET(PDA_pcurrent, i386_pda, pcurrent);
104
105#ifdef CONFIG_PARAVIRT
106 BLANK();
107 OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
108 OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
109 OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
110 OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
111 OFFSET(PARAVIRT_iret, paravirt_ops, iret);
112 OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
113#endif
77} 114}
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index e4758095d87a..41cfea57232b 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -104,10 +104,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
104 f_vide(); 104 f_vide();
105 rdtscl(d2); 105 rdtscl(d2);
106 d = d2-d; 106 d = d2-d;
107 107
108 /* Knock these two lines out if it debugs out ok */
109 printk(KERN_INFO "AMD K6 stepping B detected - ");
110 /* -- cut here -- */
111 if (d > 20*K6_BUG_LOOP) 108 if (d > 20*K6_BUG_LOOP)
112 printk("system stability may be impaired when more than 32 MB are used.\n"); 109 printk("system stability may be impaired when more than 32 MB are used.\n");
113 else 110 else
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index d9f3e3c31f05..1b34c56f8123 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,14 +18,15 @@
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <mach_apic.h> 19#include <mach_apic.h>
20#endif 20#endif
21#include <asm/pda.h>
21 22
22#include "cpu.h" 23#include "cpu.h"
23 24
24DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); 25DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
25EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); 26EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
26 27
27DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); 28struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
28EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); 29EXPORT_SYMBOL(_cpu_pda);
29 30
30static int cachesize_override __cpuinitdata = -1; 31static int cachesize_override __cpuinitdata = -1;
31static int disable_x86_fxsr __cpuinitdata; 32static int disable_x86_fxsr __cpuinitdata;
@@ -235,29 +236,14 @@ static int __cpuinit have_cpuid_p(void)
235 return flag_is_changeable_p(X86_EFLAGS_ID); 236 return flag_is_changeable_p(X86_EFLAGS_ID);
236} 237}
237 238
238/* Do minimum CPU detection early. 239void __init cpu_detect(struct cpuinfo_x86 *c)
239 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
240 The others are not touched to avoid unwanted side effects.
241
242 WARNING: this function is only called on the BP. Don't add code here
243 that is supposed to run on all CPUs. */
244static void __init early_cpu_detect(void)
245{ 240{
246 struct cpuinfo_x86 *c = &boot_cpu_data;
247
248 c->x86_cache_alignment = 32;
249
250 if (!have_cpuid_p())
251 return;
252
253 /* Get vendor name */ 241 /* Get vendor name */
254 cpuid(0x00000000, &c->cpuid_level, 242 cpuid(0x00000000, &c->cpuid_level,
255 (int *)&c->x86_vendor_id[0], 243 (int *)&c->x86_vendor_id[0],
256 (int *)&c->x86_vendor_id[8], 244 (int *)&c->x86_vendor_id[8],
257 (int *)&c->x86_vendor_id[4]); 245 (int *)&c->x86_vendor_id[4]);
258 246
259 get_cpu_vendor(c, 1);
260
261 c->x86 = 4; 247 c->x86 = 4;
262 if (c->cpuid_level >= 0x00000001) { 248 if (c->cpuid_level >= 0x00000001) {
263 u32 junk, tfms, cap0, misc; 249 u32 junk, tfms, cap0, misc;
@@ -274,6 +260,26 @@ static void __init early_cpu_detect(void)
274 } 260 }
275} 261}
276 262
263/* Do minimum CPU detection early.
264 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
265 The others are not touched to avoid unwanted side effects.
266
267 WARNING: this function is only called on the BP. Don't add code here
268 that is supposed to run on all CPUs. */
269static void __init early_cpu_detect(void)
270{
271 struct cpuinfo_x86 *c = &boot_cpu_data;
272
273 c->x86_cache_alignment = 32;
274
275 if (!have_cpuid_p())
276 return;
277
278 cpu_detect(c);
279
280 get_cpu_vendor(c, 1);
281}
282
277static void __cpuinit generic_identify(struct cpuinfo_x86 * c) 283static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
278{ 284{
279 u32 tfms, xlvl; 285 u32 tfms, xlvl;
@@ -308,6 +314,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
308#else 314#else
309 c->apicid = (ebx >> 24) & 0xFF; 315 c->apicid = (ebx >> 24) & 0xFF;
310#endif 316#endif
317 if (c->x86_capability[0] & (1<<19))
318 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
311 } else { 319 } else {
312 /* Have CPUID level 0 only - unheard of */ 320 /* Have CPUID level 0 only - unheard of */
313 c->x86 = 4; 321 c->x86 = 4;
@@ -372,6 +380,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
372 c->x86_vendor_id[0] = '\0'; /* Unset */ 380 c->x86_vendor_id[0] = '\0'; /* Unset */
373 c->x86_model_id[0] = '\0'; /* Unset */ 381 c->x86_model_id[0] = '\0'; /* Unset */
374 c->x86_max_cores = 1; 382 c->x86_max_cores = 1;
383 c->x86_clflush_size = 32;
375 memset(&c->x86_capability, 0, sizeof c->x86_capability); 384 memset(&c->x86_capability, 0, sizeof c->x86_capability);
376 385
377 if (!have_cpuid_p()) { 386 if (!have_cpuid_p()) {
@@ -591,42 +600,24 @@ void __init early_cpu_init(void)
591 disable_pse = 1; 600 disable_pse = 1;
592#endif 601#endif
593} 602}
594/* 603
595 * cpu_init() initializes state that is per-CPU. Some data is already 604/* Make sure %gs is initialized properly in idle threads */
596 * initialized (naturally) in the bootstrap process, such as the GDT 605struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
597 * and IDT. We reload them nevertheless, this function acts as a
598 * 'CPU state barrier', nothing should get across.
599 */
600void __cpuinit cpu_init(void)
601{ 606{
602 int cpu = smp_processor_id(); 607 memset(regs, 0, sizeof(struct pt_regs));
603 struct tss_struct * t = &per_cpu(init_tss, cpu); 608 regs->xgs = __KERNEL_PDA;
604 struct thread_struct *thread = &current->thread; 609 return regs;
605 struct desc_struct *gdt; 610}
606 __u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
607 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
608 611
609 if (cpu_test_and_set(cpu, cpu_initialized)) { 612static __cpuinit int alloc_gdt(int cpu)
610 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); 613{
611 for (;;) local_irq_enable(); 614 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
612 } 615 struct desc_struct *gdt;
613 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 616 struct i386_pda *pda;
614 617
615 if (cpu_has_vme || cpu_has_tsc || cpu_has_de) 618 gdt = (struct desc_struct *)cpu_gdt_descr->address;
616 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 619 pda = cpu_pda(cpu);
617 if (tsc_disable && cpu_has_tsc) {
618 printk(KERN_NOTICE "Disabling TSC...\n");
619 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
620 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
621 set_in_cr4(X86_CR4_TSD);
622 }
623 620
624 /* The CPU hotplug case */
625 if (cpu_gdt_descr->address) {
626 gdt = (struct desc_struct *)cpu_gdt_descr->address;
627 memset(gdt, 0, PAGE_SIZE);
628 goto old_gdt;
629 }
630 /* 621 /*
631 * This is a horrible hack to allocate the GDT. The problem 622 * This is a horrible hack to allocate the GDT. The problem
632 * is that cpu_init() is called really early for the boot CPU 623 * is that cpu_init() is called really early for the boot CPU
@@ -634,43 +625,130 @@ void __cpuinit cpu_init(void)
634 * CPUs, when bootmem will have gone away 625 * CPUs, when bootmem will have gone away
635 */ 626 */
636 if (NODE_DATA(0)->bdata->node_bootmem_map) { 627 if (NODE_DATA(0)->bdata->node_bootmem_map) {
637 gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); 628 BUG_ON(gdt != NULL || pda != NULL);
638 /* alloc_bootmem_pages panics on failure, so no check */ 629
630 gdt = alloc_bootmem_pages(PAGE_SIZE);
631 pda = alloc_bootmem(sizeof(*pda));
632 /* alloc_bootmem(_pages) panics on failure, so no check */
633
639 memset(gdt, 0, PAGE_SIZE); 634 memset(gdt, 0, PAGE_SIZE);
635 memset(pda, 0, sizeof(*pda));
640 } else { 636 } else {
641 gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); 637 /* GDT and PDA might already have been allocated if
642 if (unlikely(!gdt)) { 638 this is a CPU hotplug re-insertion. */
643 printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); 639 if (gdt == NULL)
644 for (;;) 640 gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
645 local_irq_enable(); 641
642 if (pda == NULL)
643 pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
644
645 if (unlikely(!gdt || !pda)) {
646 free_pages((unsigned long)gdt, 0);
647 kfree(pda);
648 return 0;
646 } 649 }
647 } 650 }
648old_gdt: 651
652 cpu_gdt_descr->address = (unsigned long)gdt;
653 cpu_pda(cpu) = pda;
654
655 return 1;
656}
657
658/* Initial PDA used by boot CPU */
659struct i386_pda boot_pda = {
660 ._pda = &boot_pda,
661 .cpu_number = 0,
662 .pcurrent = &init_task,
663};
664
665static inline void set_kernel_gs(void)
666{
667 /* Set %gs for this CPU's PDA. Memory clobber is to create a
668 barrier with respect to any PDA operations, so the compiler
669 doesn't move any before here. */
670 asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
671}
672
673/* Initialize the CPU's GDT and PDA. The boot CPU does this for
674 itself, but secondaries find this done for them. */
675__cpuinit int init_gdt(int cpu, struct task_struct *idle)
676{
677 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
678 struct desc_struct *gdt;
679 struct i386_pda *pda;
680
681 /* For non-boot CPUs, the GDT and PDA should already have been
682 allocated. */
683 if (!alloc_gdt(cpu)) {
684 printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
685 return 0;
686 }
687
688 gdt = (struct desc_struct *)cpu_gdt_descr->address;
689 pda = cpu_pda(cpu);
690
691 BUG_ON(gdt == NULL || pda == NULL);
692
649 /* 693 /*
650 * Initialize the per-CPU GDT with the boot GDT, 694 * Initialize the per-CPU GDT with the boot GDT,
651 * and set up the GDT descriptor: 695 * and set up the GDT descriptor:
652 */ 696 */
653 memcpy(gdt, cpu_gdt_table, GDT_SIZE); 697 memcpy(gdt, cpu_gdt_table, GDT_SIZE);
698 cpu_gdt_descr->size = GDT_SIZE - 1;
654 699
655 /* Set up GDT entry for 16bit stack */ 700 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
656 *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= 701 (u32 *)&gdt[GDT_ENTRY_PDA].b,
657 ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | 702 (unsigned long)pda, sizeof(*pda) - 1,
658 ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | 703 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
659 (CPU_16BIT_STACK_SIZE - 1);
660 704
661 cpu_gdt_descr->size = GDT_SIZE - 1; 705 memset(pda, 0, sizeof(*pda));
662 cpu_gdt_descr->address = (unsigned long)gdt; 706 pda->_pda = pda;
707 pda->cpu_number = cpu;
708 pda->pcurrent = idle;
709
710 return 1;
711}
712
713/* Common CPU init for both boot and secondary CPUs */
714static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
715{
716 struct tss_struct * t = &per_cpu(init_tss, cpu);
717 struct thread_struct *thread = &curr->thread;
718 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
663 719
720 /* Reinit these anyway, even if they've already been done (on
721 the boot CPU, this will transition from the boot gdt+pda to
722 the real ones). */
664 load_gdt(cpu_gdt_descr); 723 load_gdt(cpu_gdt_descr);
724 set_kernel_gs();
725
726 if (cpu_test_and_set(cpu, cpu_initialized)) {
727 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
728 for (;;) local_irq_enable();
729 }
730
731 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
732
733 if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
734 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
735 if (tsc_disable && cpu_has_tsc) {
736 printk(KERN_NOTICE "Disabling TSC...\n");
737 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
738 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
739 set_in_cr4(X86_CR4_TSD);
740 }
741
665 load_idt(&idt_descr); 742 load_idt(&idt_descr);
666 743
667 /* 744 /*
668 * Set up and load the per-CPU TSS and LDT 745 * Set up and load the per-CPU TSS and LDT
669 */ 746 */
670 atomic_inc(&init_mm.mm_count); 747 atomic_inc(&init_mm.mm_count);
671 current->active_mm = &init_mm; 748 curr->active_mm = &init_mm;
672 BUG_ON(current->mm); 749 if (curr->mm)
673 enter_lazy_tlb(&init_mm, current); 750 BUG();
751 enter_lazy_tlb(&init_mm, curr);
674 752
675 load_esp0(t, thread); 753 load_esp0(t, thread);
676 set_tss_desc(cpu,t); 754 set_tss_desc(cpu,t);
@@ -682,8 +760,8 @@ old_gdt:
682 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); 760 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
683#endif 761#endif
684 762
685 /* Clear %fs and %gs. */ 763 /* Clear %fs. */
686 asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0)); 764 asm volatile ("mov %0, %%fs" : : "r" (0));
687 765
688 /* Clear all 6 debug registers: */ 766 /* Clear all 6 debug registers: */
689 set_debugreg(0, 0); 767 set_debugreg(0, 0);
@@ -701,6 +779,37 @@ old_gdt:
701 mxcsr_feature_mask_init(); 779 mxcsr_feature_mask_init();
702} 780}
703 781
782/* Entrypoint to initialize secondary CPU */
783void __cpuinit secondary_cpu_init(void)
784{
785 int cpu = smp_processor_id();
786 struct task_struct *curr = current;
787
788 _cpu_init(cpu, curr);
789}
790
791/*
792 * cpu_init() initializes state that is per-CPU. Some data is already
793 * initialized (naturally) in the bootstrap process, such as the GDT
794 * and IDT. We reload them nevertheless, this function acts as a
795 * 'CPU state barrier', nothing should get across.
796 */
797void __cpuinit cpu_init(void)
798{
799 int cpu = smp_processor_id();
800 struct task_struct *curr = current;
801
802 /* Set up the real GDT and PDA, so we can transition from the
803 boot versions. */
804 if (!init_gdt(cpu, curr)) {
805 /* failed to allocate something; not much we can do... */
806 for (;;)
807 local_irq_enable();
808 }
809
810 _cpu_init(cpu, curr);
811}
812
704#ifdef CONFIG_HOTPLUG_CPU 813#ifdef CONFIG_HOTPLUG_CPU
705void __cpuinit cpu_uninit(void) 814void __cpuinit cpu_uninit(void)
706{ 815{
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 94a95aa5227e..56fe26584957 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
107 * Note that the workaround only should be initialized once... 107 * Note that the workaround only should be initialized once...
108 */ 108 */
109 c->f00f_bug = 0; 109 c->f00f_bug = 0;
110 if ( c->x86 == 5 ) { 110 if (!paravirt_enabled() && c->x86 == 5) {
111 static int f00f_workaround_enabled = 0; 111 static int f00f_workaround_enabled = 0;
112 112
113 c->f00f_bug = 1; 113 c->f00f_bug = 1;
@@ -195,8 +195,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
195 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 195 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
196 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 196 (c->x86 == 0x6 && c->x86_model >= 0x0e))
197 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); 197 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
198}
199 198
199 if (cpu_has_ds) {
200 unsigned int l1;
201 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
202 if (!(l1 & (1<<11)))
203 set_bit(X86_FEATURE_BTS, c->x86_capability);
204 if (!(l1 & (1<<12)))
205 set_bit(X86_FEATURE_PEBS, c->x86_capability);
206 }
207}
200 208
201static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) 209static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
202{ 210{
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 5c43be47587f..80b4c5d421b1 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -480,12 +480,10 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
480 if (num_cache_leaves == 0) 480 if (num_cache_leaves == 0)
481 return -ENOENT; 481 return -ENOENT;
482 482
483 cpuid4_info[cpu] = kmalloc( 483 cpuid4_info[cpu] = kzalloc(
484 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); 484 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
485 if (unlikely(cpuid4_info[cpu] == NULL)) 485 if (unlikely(cpuid4_info[cpu] == NULL))
486 return -ENOMEM; 486 return -ENOMEM;
487 memset(cpuid4_info[cpu], 0,
488 sizeof(struct _cpuid4_info) * num_cache_leaves);
489 487
490 oldmask = current->cpus_allowed; 488 oldmask = current->cpus_allowed;
491 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); 489 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu));
@@ -658,17 +656,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
658 return -ENOENT; 656 return -ENOENT;
659 657
660 /* Allocate all required memory */ 658 /* Allocate all required memory */
661 cache_kobject[cpu] = kmalloc(sizeof(struct kobject), GFP_KERNEL); 659 cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL);
662 if (unlikely(cache_kobject[cpu] == NULL)) 660 if (unlikely(cache_kobject[cpu] == NULL))
663 goto err_out; 661 goto err_out;
664 memset(cache_kobject[cpu], 0, sizeof(struct kobject));
665 662
666 index_kobject[cpu] = kmalloc( 663 index_kobject[cpu] = kzalloc(
667 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); 664 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
668 if (unlikely(index_kobject[cpu] == NULL)) 665 if (unlikely(index_kobject[cpu] == NULL))
669 goto err_out; 666 goto err_out;
670 memset(index_kobject[cpu], 0,
671 sizeof(struct _index_kobject) * num_cache_leaves);
672 667
673 return 0; 668 return 0;
674 669
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index bad8b4420709..065005c3f168 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -116,7 +116,6 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
116 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); 116 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
117} 117}
118 118
119#ifdef CONFIG_HOTPLUG_CPU
120static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
121{ 120{
122 return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 121 return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
@@ -153,7 +152,6 @@ static struct notifier_block thermal_throttle_cpu_notifier =
153{ 152{
154 .notifier_call = thermal_throttle_cpu_callback, 153 .notifier_call = thermal_throttle_cpu_callback,
155}; 154};
156#endif /* CONFIG_HOTPLUG_CPU */
157 155
158static __init int thermal_throttle_init_device(void) 156static __init int thermal_throttle_init_device(void)
159{ 157{
diff --git a/arch/i386/kernel/cpu/mtrr/Makefile b/arch/i386/kernel/cpu/mtrr/Makefile
index a25b701ab84e..191fc0533649 100644
--- a/arch/i386/kernel/cpu/mtrr/Makefile
+++ b/arch/i386/kernel/cpu/mtrr/Makefile
@@ -1,5 +1,3 @@
1obj-y := main.o if.o generic.o state.o 1obj-y := main.o if.o generic.o state.o
2obj-y += amd.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3obj-y += cyrix.o
4obj-y += centaur.o
5 3
diff --git a/arch/i386/kernel/cpu/mtrr/amd.c b/arch/i386/kernel/cpu/mtrr/amd.c
index 1a1e04b6fd00..0949cdbf848a 100644
--- a/arch/i386/kernel/cpu/mtrr/amd.c
+++ b/arch/i386/kernel/cpu/mtrr/amd.c
@@ -7,7 +7,7 @@
7 7
8static void 8static void
9amd_get_mtrr(unsigned int reg, unsigned long *base, 9amd_get_mtrr(unsigned int reg, unsigned long *base,
10 unsigned int *size, mtrr_type * type) 10 unsigned long *size, mtrr_type * type)
11{ 11{
12 unsigned long low, high; 12 unsigned long low, high;
13 13
diff --git a/arch/i386/kernel/cpu/mtrr/centaur.c b/arch/i386/kernel/cpu/mtrr/centaur.c
index 33f00ac314ef..cb9aa3a7a7ab 100644
--- a/arch/i386/kernel/cpu/mtrr/centaur.c
+++ b/arch/i386/kernel/cpu/mtrr/centaur.c
@@ -17,7 +17,7 @@ static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
17 */ 17 */
18 18
19static int 19static int
20centaur_get_free_region(unsigned long base, unsigned long size) 20centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
21/* [SUMMARY] Get a free MTRR. 21/* [SUMMARY] Get a free MTRR.
22 <base> The starting (base) address of the region. 22 <base> The starting (base) address of the region.
23 <size> The size (in bytes) of the region. 23 <size> The size (in bytes) of the region.
@@ -26,10 +26,11 @@ centaur_get_free_region(unsigned long base, unsigned long size)
26{ 26{
27 int i, max; 27 int i, max;
28 mtrr_type ltype; 28 mtrr_type ltype;
29 unsigned long lbase; 29 unsigned long lbase, lsize;
30 unsigned int lsize;
31 30
32 max = num_var_ranges; 31 max = num_var_ranges;
32 if (replace_reg >= 0 && replace_reg < max)
33 return replace_reg;
33 for (i = 0; i < max; ++i) { 34 for (i = 0; i < max; ++i) {
34 if (centaur_mcr_reserved & (1 << i)) 35 if (centaur_mcr_reserved & (1 << i))
35 continue; 36 continue;
@@ -49,7 +50,7 @@ mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
49 50
50static void 51static void
51centaur_get_mcr(unsigned int reg, unsigned long *base, 52centaur_get_mcr(unsigned int reg, unsigned long *base,
52 unsigned int *size, mtrr_type * type) 53 unsigned long *size, mtrr_type * type)
53{ 54{
54 *base = centaur_mcr[reg].high >> PAGE_SHIFT; 55 *base = centaur_mcr[reg].high >> PAGE_SHIFT;
55 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; 56 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c
index 9027a987006b..0737a596db43 100644
--- a/arch/i386/kernel/cpu/mtrr/cyrix.c
+++ b/arch/i386/kernel/cpu/mtrr/cyrix.c
@@ -9,7 +9,7 @@ int arr3_protected;
9 9
10static void 10static void
11cyrix_get_arr(unsigned int reg, unsigned long *base, 11cyrix_get_arr(unsigned int reg, unsigned long *base,
12 unsigned int *size, mtrr_type * type) 12 unsigned long *size, mtrr_type * type)
13{ 13{
14 unsigned long flags; 14 unsigned long flags;
15 unsigned char arr, ccr3, rcr, shift; 15 unsigned char arr, ccr3, rcr, shift;
@@ -77,7 +77,7 @@ cyrix_get_arr(unsigned int reg, unsigned long *base,
77} 77}
78 78
79static int 79static int
80cyrix_get_free_region(unsigned long base, unsigned long size) 80cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
81/* [SUMMARY] Get a free ARR. 81/* [SUMMARY] Get a free ARR.
82 <base> The starting (base) address of the region. 82 <base> The starting (base) address of the region.
83 <size> The size (in bytes) of the region. 83 <size> The size (in bytes) of the region.
@@ -86,9 +86,24 @@ cyrix_get_free_region(unsigned long base, unsigned long size)
86{ 86{
87 int i; 87 int i;
88 mtrr_type ltype; 88 mtrr_type ltype;
89 unsigned long lbase; 89 unsigned long lbase, lsize;
90 unsigned int lsize;
91 90
91 switch (replace_reg) {
92 case 7:
93 if (size < 0x40)
94 break;
95 case 6:
96 case 5:
97 case 4:
98 return replace_reg;
99 case 3:
100 if (arr3_protected)
101 break;
102 case 2:
103 case 1:
104 case 0:
105 return replace_reg;
106 }
92 /* If we are to set up a region >32M then look at ARR7 immediately */ 107 /* If we are to set up a region >32M then look at ARR7 immediately */
93 if (size > 0x2000) { 108 if (size > 0x2000) {
94 cyrix_get_arr(7, &lbase, &lsize, &ltype); 109 cyrix_get_arr(7, &lbase, &lsize, &ltype);
@@ -214,7 +229,7 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
214 229
215typedef struct { 230typedef struct {
216 unsigned long base; 231 unsigned long base;
217 unsigned int size; 232 unsigned long size;
218 mtrr_type type; 233 mtrr_type type;
219} arr_state_t; 234} arr_state_t;
220 235
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index 0b61eed8bbd8..f77fc53db654 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -3,6 +3,7 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/module.h>
6#include <asm/io.h> 7#include <asm/io.h>
7#include <asm/mtrr.h> 8#include <asm/mtrr.h>
8#include <asm/msr.h> 9#include <asm/msr.h>
@@ -15,12 +16,19 @@ struct mtrr_state {
15 struct mtrr_var_range *var_ranges; 16 struct mtrr_var_range *var_ranges;
16 mtrr_type fixed_ranges[NUM_FIXED_RANGES]; 17 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
17 unsigned char enabled; 18 unsigned char enabled;
19 unsigned char have_fixed;
18 mtrr_type def_type; 20 mtrr_type def_type;
19}; 21};
20 22
21static unsigned long smp_changes_mask; 23static unsigned long smp_changes_mask;
22static struct mtrr_state mtrr_state = {}; 24static struct mtrr_state mtrr_state = {};
23 25
26#undef MODULE_PARAM_PREFIX
27#define MODULE_PARAM_PREFIX "mtrr."
28
29static __initdata int mtrr_show;
30module_param_named(show, mtrr_show, bool, 0);
31
24/* Get the MSR pair relating to a var range */ 32/* Get the MSR pair relating to a var range */
25static void __init 33static void __init
26get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) 34get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
@@ -43,6 +51,14 @@ get_fixed_ranges(mtrr_type * frs)
43 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); 51 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
44} 52}
45 53
54static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types)
55{
56 unsigned i;
57
58 for (i = 0; i < 8; ++i, ++types, base += step)
59 printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types));
60}
61
46/* Grab all of the MTRR state for this CPU into *state */ 62/* Grab all of the MTRR state for this CPU into *state */
47void __init get_mtrr_state(void) 63void __init get_mtrr_state(void)
48{ 64{
@@ -58,13 +74,49 @@ void __init get_mtrr_state(void)
58 } 74 }
59 vrs = mtrr_state.var_ranges; 75 vrs = mtrr_state.var_ranges;
60 76
77 rdmsr(MTRRcap_MSR, lo, dummy);
78 mtrr_state.have_fixed = (lo >> 8) & 1;
79
61 for (i = 0; i < num_var_ranges; i++) 80 for (i = 0; i < num_var_ranges; i++)
62 get_mtrr_var_range(i, &vrs[i]); 81 get_mtrr_var_range(i, &vrs[i]);
63 get_fixed_ranges(mtrr_state.fixed_ranges); 82 if (mtrr_state.have_fixed)
83 get_fixed_ranges(mtrr_state.fixed_ranges);
64 84
65 rdmsr(MTRRdefType_MSR, lo, dummy); 85 rdmsr(MTRRdefType_MSR, lo, dummy);
66 mtrr_state.def_type = (lo & 0xff); 86 mtrr_state.def_type = (lo & 0xff);
67 mtrr_state.enabled = (lo & 0xc00) >> 10; 87 mtrr_state.enabled = (lo & 0xc00) >> 10;
88
89 if (mtrr_show) {
90 int high_width;
91
92 printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
93 if (mtrr_state.have_fixed) {
94 printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
95 mtrr_state.enabled & 1 ? "en" : "dis");
96 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
97 for (i = 0; i < 2; ++i)
98 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
99 for (i = 0; i < 8; ++i)
100 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
101 }
102 printk(KERN_INFO "MTRR variable ranges %sabled:\n",
103 mtrr_state.enabled & 2 ? "en" : "dis");
104 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
105 for (i = 0; i < num_var_ranges; ++i) {
106 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
107 printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
108 i,
109 high_width,
110 mtrr_state.var_ranges[i].base_hi,
111 mtrr_state.var_ranges[i].base_lo >> 12,
112 high_width,
113 mtrr_state.var_ranges[i].mask_hi,
114 mtrr_state.var_ranges[i].mask_lo >> 12,
115 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
116 else
117 printk(KERN_INFO "MTRR %u disabled\n", i);
118 }
119 }
68} 120}
69 121
70/* Some BIOS's are fucked and don't set all MTRRs the same! */ 122/* Some BIOS's are fucked and don't set all MTRRs the same! */
@@ -95,7 +147,7 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
95 smp_processor_id(), msr, a, b); 147 smp_processor_id(), msr, a, b);
96} 148}
97 149
98int generic_get_free_region(unsigned long base, unsigned long size) 150int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
99/* [SUMMARY] Get a free MTRR. 151/* [SUMMARY] Get a free MTRR.
100 <base> The starting (base) address of the region. 152 <base> The starting (base) address of the region.
101 <size> The size (in bytes) of the region. 153 <size> The size (in bytes) of the region.
@@ -104,10 +156,11 @@ int generic_get_free_region(unsigned long base, unsigned long size)
104{ 156{
105 int i, max; 157 int i, max;
106 mtrr_type ltype; 158 mtrr_type ltype;
107 unsigned long lbase; 159 unsigned long lbase, lsize;
108 unsigned lsize;
109 160
110 max = num_var_ranges; 161 max = num_var_ranges;
162 if (replace_reg >= 0 && replace_reg < max)
163 return replace_reg;
111 for (i = 0; i < max; ++i) { 164 for (i = 0; i < max; ++i) {
112 mtrr_if->get(i, &lbase, &lsize, &ltype); 165 mtrr_if->get(i, &lbase, &lsize, &ltype);
113 if (lsize == 0) 166 if (lsize == 0)
@@ -117,7 +170,7 @@ int generic_get_free_region(unsigned long base, unsigned long size)
117} 170}
118 171
119static void generic_get_mtrr(unsigned int reg, unsigned long *base, 172static void generic_get_mtrr(unsigned int reg, unsigned long *base,
120 unsigned int *size, mtrr_type * type) 173 unsigned long *size, mtrr_type *type)
121{ 174{
122 unsigned int mask_lo, mask_hi, base_lo, base_hi; 175 unsigned int mask_lo, mask_hi, base_lo, base_hi;
123 176
@@ -202,7 +255,9 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
202 return changed; 255 return changed;
203} 256}
204 257
205static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) 258static u32 deftype_lo, deftype_hi;
259
260static unsigned long set_mtrr_state(void)
206/* [SUMMARY] Set the MTRR state for this CPU. 261/* [SUMMARY] Set the MTRR state for this CPU.
207 <state> The MTRR state information to read. 262 <state> The MTRR state information to read.
208 <ctxt> Some relevant CPU context. 263 <ctxt> Some relevant CPU context.
@@ -217,14 +272,14 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
217 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) 272 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
218 change_mask |= MTRR_CHANGE_MASK_VARIABLE; 273 change_mask |= MTRR_CHANGE_MASK_VARIABLE;
219 274
220 if (set_fixed_ranges(mtrr_state.fixed_ranges)) 275 if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
221 change_mask |= MTRR_CHANGE_MASK_FIXED; 276 change_mask |= MTRR_CHANGE_MASK_FIXED;
222 277
223 /* Set_mtrr_restore restores the old value of MTRRdefType, 278 /* Set_mtrr_restore restores the old value of MTRRdefType,
224 so to set it we fiddle with the saved value */ 279 so to set it we fiddle with the saved value */
225 if ((deftype_lo & 0xff) != mtrr_state.def_type 280 if ((deftype_lo & 0xff) != mtrr_state.def_type
226 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { 281 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
227 deftype_lo |= (mtrr_state.def_type | mtrr_state.enabled << 10); 282 deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10);
228 change_mask |= MTRR_CHANGE_MASK_DEFTYPE; 283 change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
229 } 284 }
230 285
@@ -233,7 +288,6 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
233 288
234 289
235static unsigned long cr4 = 0; 290static unsigned long cr4 = 0;
236static u32 deftype_lo, deftype_hi;
237static DEFINE_SPINLOCK(set_atomicity_lock); 291static DEFINE_SPINLOCK(set_atomicity_lock);
238 292
239/* 293/*
@@ -271,7 +325,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
271 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 325 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
272 326
273 /* Disable MTRRs, and set the default type to uncached */ 327 /* Disable MTRRs, and set the default type to uncached */
274 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi); 328 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
275} 329}
276 330
277static void post_set(void) __releases(set_atomicity_lock) 331static void post_set(void) __releases(set_atomicity_lock)
@@ -300,7 +354,7 @@ static void generic_set_all(void)
300 prepare_set(); 354 prepare_set();
301 355
302 /* Actually set the state */ 356 /* Actually set the state */
303 mask = set_mtrr_state(deftype_lo,deftype_hi); 357 mask = set_mtrr_state();
304 358
305 post_set(); 359 post_set();
306 local_irq_restore(flags); 360 local_irq_restore(flags);
@@ -366,7 +420,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
366 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); 420 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
367 return -EINVAL; 421 return -EINVAL;
368 } 422 }
369 if (!(base + size < 0x70000000 || base > 0x7003FFFF) && 423 if (!(base + size < 0x70000 || base > 0x7003F) &&
370 (type == MTRR_TYPE_WRCOMB 424 (type == MTRR_TYPE_WRCOMB
371 || type == MTRR_TYPE_WRBACK)) { 425 || type == MTRR_TYPE_WRBACK)) {
372 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); 426 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index 5ac051bb9d55..5ae1705eafa6 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -17,7 +17,7 @@ extern unsigned int *usage_table;
17 17
18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) 18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
19 19
20static char *mtrr_strings[MTRR_NUM_TYPES] = 20static const char *const mtrr_strings[MTRR_NUM_TYPES] =
21{ 21{
22 "uncachable", /* 0 */ 22 "uncachable", /* 0 */
23 "write-combining", /* 1 */ 23 "write-combining", /* 1 */
@@ -28,7 +28,7 @@ static char *mtrr_strings[MTRR_NUM_TYPES] =
28 "write-back", /* 6 */ 28 "write-back", /* 6 */
29}; 29};
30 30
31char *mtrr_attrib_to_str(int x) 31const char *mtrr_attrib_to_str(int x)
32{ 32{
33 return (x <= 6) ? mtrr_strings[x] : "?"; 33 return (x <= 6) ? mtrr_strings[x] : "?";
34} 34}
@@ -44,10 +44,9 @@ mtrr_file_add(unsigned long base, unsigned long size,
44 44
45 max = num_var_ranges; 45 max = num_var_ranges;
46 if (fcount == NULL) { 46 if (fcount == NULL) {
47 fcount = kmalloc(max * sizeof *fcount, GFP_KERNEL); 47 fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
48 if (!fcount) 48 if (!fcount)
49 return -ENOMEM; 49 return -ENOMEM;
50 memset(fcount, 0, max * sizeof *fcount);
51 FILE_FCOUNT(file) = fcount; 50 FILE_FCOUNT(file) = fcount;
52 } 51 }
53 if (!page) { 52 if (!page) {
@@ -155,6 +154,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
155{ 154{
156 int err = 0; 155 int err = 0;
157 mtrr_type type; 156 mtrr_type type;
157 unsigned long size;
158 struct mtrr_sentry sentry; 158 struct mtrr_sentry sentry;
159 struct mtrr_gentry gentry; 159 struct mtrr_gentry gentry;
160 void __user *arg = (void __user *) __arg; 160 void __user *arg = (void __user *) __arg;
@@ -235,15 +235,15 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
235 case MTRRIOC_GET_ENTRY: 235 case MTRRIOC_GET_ENTRY:
236 if (gentry.regnum >= num_var_ranges) 236 if (gentry.regnum >= num_var_ranges)
237 return -EINVAL; 237 return -EINVAL;
238 mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); 238 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
239 239
240 /* Hide entries that go above 4GB */ 240 /* Hide entries that go above 4GB */
241 if (gentry.base + gentry.size > 0x100000 241 if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
242 || gentry.size == 0x100000) 242 || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
243 gentry.base = gentry.size = gentry.type = 0; 243 gentry.base = gentry.size = gentry.type = 0;
244 else { 244 else {
245 gentry.base <<= PAGE_SHIFT; 245 gentry.base <<= PAGE_SHIFT;
246 gentry.size <<= PAGE_SHIFT; 246 gentry.size = size << PAGE_SHIFT;
247 gentry.type = type; 247 gentry.type = type;
248 } 248 }
249 249
@@ -273,8 +273,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
273 case MTRRIOC_GET_PAGE_ENTRY: 273 case MTRRIOC_GET_PAGE_ENTRY:
274 if (gentry.regnum >= num_var_ranges) 274 if (gentry.regnum >= num_var_ranges)
275 return -EINVAL; 275 return -EINVAL;
276 mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); 276 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
277 gentry.type = type; 277 /* Hide entries that would overflow */
278 if (size != (__typeof__(gentry.size))size)
279 gentry.base = gentry.size = gentry.type = 0;
280 else {
281 gentry.size = size;
282 gentry.type = type;
283 }
278 break; 284 break;
279 } 285 }
280 286
@@ -353,8 +359,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
353 char factor; 359 char factor;
354 int i, max, len; 360 int i, max, len;
355 mtrr_type type; 361 mtrr_type type;
356 unsigned long base; 362 unsigned long base, size;
357 unsigned int size;
358 363
359 len = 0; 364 len = 0;
360 max = num_var_ranges; 365 max = num_var_ranges;
@@ -373,7 +378,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
373 } 378 }
374 /* RED-PEN: base can be > 32bit */ 379 /* RED-PEN: base can be > 32bit */
375 len += seq_printf(seq, 380 len += seq_printf(seq,
376 "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n", 381 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
377 i, base, base >> (20 - PAGE_SHIFT), size, factor, 382 i, base, base >> (20 - PAGE_SHIFT), size, factor,
378 mtrr_attrib_to_str(type), usage_table[i]); 383 mtrr_attrib_to_str(type), usage_table[i]);
379 } 384 }
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index fff90bda4733..16bb7ea87145 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -59,7 +59,11 @@ struct mtrr_ops * mtrr_if = NULL;
59static void set_mtrr(unsigned int reg, unsigned long base, 59static void set_mtrr(unsigned int reg, unsigned long base,
60 unsigned long size, mtrr_type type); 60 unsigned long size, mtrr_type type);
61 61
62#ifndef CONFIG_X86_64
62extern int arr3_protected; 63extern int arr3_protected;
64#else
65#define arr3_protected 0
66#endif
63 67
64void set_mtrr_ops(struct mtrr_ops * ops) 68void set_mtrr_ops(struct mtrr_ops * ops)
65{ 69{
@@ -168,6 +172,13 @@ static void ipi_handler(void *info)
168 172
169#endif 173#endif
170 174
175static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
176 return type1 == MTRR_TYPE_UNCACHABLE ||
177 type2 == MTRR_TYPE_UNCACHABLE ||
178 (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
179 (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
180}
181
171/** 182/**
172 * set_mtrr - update mtrrs on all processors 183 * set_mtrr - update mtrrs on all processors
173 * @reg: mtrr in question 184 * @reg: mtrr in question
@@ -263,8 +274,8 @@ static void set_mtrr(unsigned int reg, unsigned long base,
263 274
264/** 275/**
265 * mtrr_add_page - Add a memory type region 276 * mtrr_add_page - Add a memory type region
266 * @base: Physical base address of region in pages (4 KB) 277 * @base: Physical base address of region in pages (in units of 4 kB!)
267 * @size: Physical size of region in pages (4 KB) 278 * @size: Physical size of region in pages (4 kB)
268 * @type: Type of MTRR desired 279 * @type: Type of MTRR desired
269 * @increment: If this is true do usage counting on the region 280 * @increment: If this is true do usage counting on the region
270 * 281 *
@@ -300,11 +311,9 @@ static void set_mtrr(unsigned int reg, unsigned long base,
300int mtrr_add_page(unsigned long base, unsigned long size, 311int mtrr_add_page(unsigned long base, unsigned long size,
301 unsigned int type, char increment) 312 unsigned int type, char increment)
302{ 313{
303 int i; 314 int i, replace, error;
304 mtrr_type ltype; 315 mtrr_type ltype;
305 unsigned long lbase; 316 unsigned long lbase, lsize;
306 unsigned int lsize;
307 int error;
308 317
309 if (!mtrr_if) 318 if (!mtrr_if)
310 return -ENXIO; 319 return -ENXIO;
@@ -324,12 +333,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
324 return -ENOSYS; 333 return -ENOSYS;
325 } 334 }
326 335
336 if (!size) {
337 printk(KERN_WARNING "mtrr: zero sized request\n");
338 return -EINVAL;
339 }
340
327 if (base & size_or_mask || size & size_or_mask) { 341 if (base & size_or_mask || size & size_or_mask) {
328 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); 342 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n");
329 return -EINVAL; 343 return -EINVAL;
330 } 344 }
331 345
332 error = -EINVAL; 346 error = -EINVAL;
347 replace = -1;
333 348
334 /* No CPU hotplug when we change MTRR entries */ 349 /* No CPU hotplug when we change MTRR entries */
335 lock_cpu_hotplug(); 350 lock_cpu_hotplug();
@@ -337,21 +352,28 @@ int mtrr_add_page(unsigned long base, unsigned long size,
337 mutex_lock(&mtrr_mutex); 352 mutex_lock(&mtrr_mutex);
338 for (i = 0; i < num_var_ranges; ++i) { 353 for (i = 0; i < num_var_ranges; ++i) {
339 mtrr_if->get(i, &lbase, &lsize, &ltype); 354 mtrr_if->get(i, &lbase, &lsize, &ltype);
340 if (base >= lbase + lsize) 355 if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase)
341 continue;
342 if ((base < lbase) && (base + size <= lbase))
343 continue; 356 continue;
344 /* At this point we know there is some kind of overlap/enclosure */ 357 /* At this point we know there is some kind of overlap/enclosure */
345 if ((base < lbase) || (base + size > lbase + lsize)) { 358 if (base < lbase || base + size - 1 > lbase + lsize - 1) {
359 if (base <= lbase && base + size - 1 >= lbase + lsize - 1) {
360 /* New region encloses an existing region */
361 if (type == ltype) {
362 replace = replace == -1 ? i : -2;
363 continue;
364 }
365 else if (types_compatible(type, ltype))
366 continue;
367 }
346 printk(KERN_WARNING 368 printk(KERN_WARNING
347 "mtrr: 0x%lx000,0x%lx000 overlaps existing" 369 "mtrr: 0x%lx000,0x%lx000 overlaps existing"
348 " 0x%lx000,0x%x000\n", base, size, lbase, 370 " 0x%lx000,0x%lx000\n", base, size, lbase,
349 lsize); 371 lsize);
350 goto out; 372 goto out;
351 } 373 }
352 /* New region is enclosed by an existing region */ 374 /* New region is enclosed by an existing region */
353 if (ltype != type) { 375 if (ltype != type) {
354 if (type == MTRR_TYPE_UNCACHABLE) 376 if (types_compatible(type, ltype))
355 continue; 377 continue;
356 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", 378 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
357 base, size, mtrr_attrib_to_str(ltype), 379 base, size, mtrr_attrib_to_str(ltype),
@@ -364,10 +386,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
364 goto out; 386 goto out;
365 } 387 }
366 /* Search for an empty MTRR */ 388 /* Search for an empty MTRR */
367 i = mtrr_if->get_free_region(base, size); 389 i = mtrr_if->get_free_region(base, size, replace);
368 if (i >= 0) { 390 if (i >= 0) {
369 set_mtrr(i, base, size, type); 391 set_mtrr(i, base, size, type);
370 usage_table[i] = 1; 392 if (likely(replace < 0))
393 usage_table[i] = 1;
394 else {
395 usage_table[i] = usage_table[replace] + !!increment;
396 if (unlikely(replace != i)) {
397 set_mtrr(replace, 0, 0, 0);
398 usage_table[replace] = 0;
399 }
400 }
371 } else 401 } else
372 printk(KERN_INFO "mtrr: no more MTRRs available\n"); 402 printk(KERN_INFO "mtrr: no more MTRRs available\n");
373 error = i; 403 error = i;
@@ -455,8 +485,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
455{ 485{
456 int i, max; 486 int i, max;
457 mtrr_type ltype; 487 mtrr_type ltype;
458 unsigned long lbase; 488 unsigned long lbase, lsize;
459 unsigned int lsize;
460 int error = -EINVAL; 489 int error = -EINVAL;
461 490
462 if (!mtrr_if) 491 if (!mtrr_if)
@@ -544,9 +573,11 @@ extern void centaur_init_mtrr(void);
544 573
545static void __init init_ifs(void) 574static void __init init_ifs(void)
546{ 575{
576#ifndef CONFIG_X86_64
547 amd_init_mtrr(); 577 amd_init_mtrr();
548 cyrix_init_mtrr(); 578 cyrix_init_mtrr();
549 centaur_init_mtrr(); 579 centaur_init_mtrr();
580#endif
550} 581}
551 582
552/* The suspend/resume methods are only for CPU without MTRR. CPU using generic 583/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
@@ -555,7 +586,7 @@ static void __init init_ifs(void)
555struct mtrr_value { 586struct mtrr_value {
556 mtrr_type ltype; 587 mtrr_type ltype;
557 unsigned long lbase; 588 unsigned long lbase;
558 unsigned int lsize; 589 unsigned long lsize;
559}; 590};
560 591
561static struct mtrr_value * mtrr_state; 592static struct mtrr_value * mtrr_state;
@@ -565,10 +596,8 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
565 int i; 596 int i;
566 int size = num_var_ranges * sizeof(struct mtrr_value); 597 int size = num_var_ranges * sizeof(struct mtrr_value);
567 598
568 mtrr_state = kmalloc(size,GFP_ATOMIC); 599 mtrr_state = kzalloc(size,GFP_ATOMIC);
569 if (mtrr_state) 600 if (!mtrr_state)
570 memset(mtrr_state,0,size);
571 else
572 return -ENOMEM; 601 return -ENOMEM;
573 602
574 for (i = 0; i < num_var_ranges; i++) { 603 for (i = 0; i < num_var_ranges; i++) {
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index 99c9f2682041..d61ea9db6cfe 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -43,15 +43,16 @@ struct mtrr_ops {
43 void (*set_all)(void); 43 void (*set_all)(void);
44 44
45 void (*get)(unsigned int reg, unsigned long *base, 45 void (*get)(unsigned int reg, unsigned long *base,
46 unsigned int *size, mtrr_type * type); 46 unsigned long *size, mtrr_type * type);
47 int (*get_free_region) (unsigned long base, unsigned long size); 47 int (*get_free_region)(unsigned long base, unsigned long size,
48 48 int replace_reg);
49 int (*validate_add_page)(unsigned long base, unsigned long size, 49 int (*validate_add_page)(unsigned long base, unsigned long size,
50 unsigned int type); 50 unsigned int type);
51 int (*have_wrcomb)(void); 51 int (*have_wrcomb)(void);
52}; 52};
53 53
54extern int generic_get_free_region(unsigned long base, unsigned long size); 54extern int generic_get_free_region(unsigned long base, unsigned long size,
55 int replace_reg);
55extern int generic_validate_add_page(unsigned long base, unsigned long size, 56extern int generic_validate_add_page(unsigned long base, unsigned long size,
56 unsigned int type); 57 unsigned int type);
57 58
@@ -62,17 +63,17 @@ extern int positive_have_wrcomb(void);
62/* library functions for processor-specific routines */ 63/* library functions for processor-specific routines */
63struct set_mtrr_context { 64struct set_mtrr_context {
64 unsigned long flags; 65 unsigned long flags;
65 unsigned long deftype_lo;
66 unsigned long deftype_hi;
67 unsigned long cr4val; 66 unsigned long cr4val;
68 unsigned long ccr3; 67 u32 deftype_lo;
68 u32 deftype_hi;
69 u32 ccr3;
69}; 70};
70 71
71struct mtrr_var_range { 72struct mtrr_var_range {
72 unsigned long base_lo; 73 u32 base_lo;
73 unsigned long base_hi; 74 u32 base_hi;
74 unsigned long mask_lo; 75 u32 mask_lo;
75 unsigned long mask_hi; 76 u32 mask_hi;
76}; 77};
77 78
78void set_mtrr_done(struct set_mtrr_context *ctxt); 79void set_mtrr_done(struct set_mtrr_context *ctxt);
@@ -92,6 +93,6 @@ extern struct mtrr_ops * mtrr_if;
92extern unsigned int num_var_ranges; 93extern unsigned int num_var_ranges;
93 94
94void mtrr_state_warn(void); 95void mtrr_state_warn(void);
95char *mtrr_attrib_to_str(int x); 96const char *mtrr_attrib_to_str(int x);
96void mtrr_wrmsr(unsigned, unsigned, unsigned); 97void mtrr_wrmsr(unsigned, unsigned, unsigned);
97 98
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 76aac088a323..6624d8583c42 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -152,9 +152,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
152 seq_printf(m, " [%d]", i); 152 seq_printf(m, " [%d]", i);
153 } 153 }
154 154
155 seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n", 155 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
156 c->loops_per_jiffy/(500000/HZ), 156 c->loops_per_jiffy/(500000/HZ),
157 (c->loops_per_jiffy/(5000/HZ)) % 100); 157 (c->loops_per_jiffy/(5000/HZ)) % 100);
158 seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size);
158 159
159 return 0; 160 return 0;
160} 161}
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index ab0c327e79dc..db6dd20c3589 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -34,7 +34,6 @@
34#include <linux/major.h> 34#include <linux/major.h>
35#include <linux/fs.h> 35#include <linux/fs.h>
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/fs.h>
38#include <linux/device.h> 37#include <linux/device.h>
39#include <linux/cpu.h> 38#include <linux/cpu.h>
40#include <linux/notifier.h> 39#include <linux/notifier.h>
@@ -167,7 +166,6 @@ static int cpuid_device_create(int i)
167 return err; 166 return err;
168} 167}
169 168
170#ifdef CONFIG_HOTPLUG_CPU
171static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 169static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
172{ 170{
173 unsigned int cpu = (unsigned long)hcpu; 171 unsigned int cpu = (unsigned long)hcpu;
@@ -187,7 +185,6 @@ static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
187{ 185{
188 .notifier_call = cpuid_class_cpu_callback, 186 .notifier_call = cpuid_class_cpu_callback,
189}; 187};
190#endif /* !CONFIG_HOTPLUG_CPU */
191 188
192static int __init cpuid_init(void) 189static int __init cpuid_init(void)
193{ 190{
diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 144b43288965..a5e0e990ea95 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -31,68 +31,6 @@
31/* This keeps a track of which one is crashing cpu. */ 31/* This keeps a track of which one is crashing cpu. */
32static int crashing_cpu; 32static int crashing_cpu;
33 33
34static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
35 size_t data_len)
36{
37 struct elf_note note;
38
39 note.n_namesz = strlen(name) + 1;
40 note.n_descsz = data_len;
41 note.n_type = type;
42 memcpy(buf, &note, sizeof(note));
43 buf += (sizeof(note) +3)/4;
44 memcpy(buf, name, note.n_namesz);
45 buf += (note.n_namesz + 3)/4;
46 memcpy(buf, data, note.n_descsz);
47 buf += (note.n_descsz + 3)/4;
48
49 return buf;
50}
51
52static void final_note(u32 *buf)
53{
54 struct elf_note note;
55
56 note.n_namesz = 0;
57 note.n_descsz = 0;
58 note.n_type = 0;
59 memcpy(buf, &note, sizeof(note));
60}
61
62static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
63{
64 struct elf_prstatus prstatus;
65 u32 *buf;
66
67 if ((cpu < 0) || (cpu >= NR_CPUS))
68 return;
69
70 /* Using ELF notes here is opportunistic.
71 * I need a well defined structure format
72 * for the data I pass, and I need tags
73 * on the data to indicate what information I have
74 * squirrelled away. ELF notes happen to provide
75 * all of that, so there is no need to invent something new.
76 */
77 buf = (u32*)per_cpu_ptr(crash_notes, cpu);
78 if (!buf)
79 return;
80 memset(&prstatus, 0, sizeof(prstatus));
81 prstatus.pr_pid = current->pid;
82 elf_core_copy_regs(&prstatus.pr_reg, regs);
83 buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
84 sizeof(prstatus));
85 final_note(buf);
86}
87
88static void crash_save_self(struct pt_regs *regs)
89{
90 int cpu;
91
92 cpu = safe_smp_processor_id();
93 crash_save_this_cpu(regs, cpu);
94}
95
96#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 34#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
97static atomic_t waiting_for_crash_ipi; 35static atomic_t waiting_for_crash_ipi;
98 36
@@ -121,7 +59,7 @@ static int crash_nmi_callback(struct notifier_block *self,
121 crash_fixup_ss_esp(&fixed_regs, regs); 59 crash_fixup_ss_esp(&fixed_regs, regs);
122 regs = &fixed_regs; 60 regs = &fixed_regs;
123 } 61 }
124 crash_save_this_cpu(regs, cpu); 62 crash_save_cpu(regs, cpu);
125 disable_local_APIC(); 63 disable_local_APIC();
126 atomic_dec(&waiting_for_crash_ipi); 64 atomic_dec(&waiting_for_crash_ipi);
127 /* Assume hlt works */ 65 /* Assume hlt works */
@@ -195,5 +133,5 @@ void machine_crash_shutdown(struct pt_regs *regs)
195#if defined(CONFIG_X86_IO_APIC) 133#if defined(CONFIG_X86_IO_APIC)
196 disable_IO_APIC(); 134 disable_IO_APIC();
197#endif 135#endif
198 crash_save_self(regs); 136 crash_save_cpu(regs, safe_smp_processor_id());
199} 137}
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
new file mode 100644
index 000000000000..2f7d0a92fd7c
--- /dev/null
+++ b/arch/i386/kernel/e820.c
@@ -0,0 +1,894 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bootmem.h>
5#include <linux/ioport.h>
6#include <linux/string.h>
7#include <linux/kexec.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/efi.h>
11#include <linux/pfn.h>
12#include <linux/uaccess.h>
13
14#include <asm/pgtable.h>
15#include <asm/page.h>
16#include <asm/e820.h>
17
18#ifdef CONFIG_EFI
19int efi_enabled = 0;
20EXPORT_SYMBOL(efi_enabled);
21#endif
22
23struct e820map e820;
24struct change_member {
25 struct e820entry *pbios; /* pointer to original bios entry */
26 unsigned long long addr; /* address for this change point */
27};
28static struct change_member change_point_list[2*E820MAX] __initdata;
29static struct change_member *change_point[2*E820MAX] __initdata;
30static struct e820entry *overlap_list[E820MAX] __initdata;
31static struct e820entry new_bios[E820MAX] __initdata;
32/* For PCI or other memory-mapped resources */
33unsigned long pci_mem_start = 0x10000000;
34#ifdef CONFIG_PCI
35EXPORT_SYMBOL(pci_mem_start);
36#endif
37extern int user_defined_memmap;
38struct resource data_resource = {
39 .name = "Kernel data",
40 .start = 0,
41 .end = 0,
42 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
43};
44
45struct resource code_resource = {
46 .name = "Kernel code",
47 .start = 0,
48 .end = 0,
49 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
50};
51
52static struct resource system_rom_resource = {
53 .name = "System ROM",
54 .start = 0xf0000,
55 .end = 0xfffff,
56 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
57};
58
59static struct resource extension_rom_resource = {
60 .name = "Extension ROM",
61 .start = 0xe0000,
62 .end = 0xeffff,
63 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
64};
65
66static struct resource adapter_rom_resources[] = { {
67 .name = "Adapter ROM",
68 .start = 0xc8000,
69 .end = 0,
70 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
71}, {
72 .name = "Adapter ROM",
73 .start = 0,
74 .end = 0,
75 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
76}, {
77 .name = "Adapter ROM",
78 .start = 0,
79 .end = 0,
80 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
81}, {
82 .name = "Adapter ROM",
83 .start = 0,
84 .end = 0,
85 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
86}, {
87 .name = "Adapter ROM",
88 .start = 0,
89 .end = 0,
90 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
91}, {
92 .name = "Adapter ROM",
93 .start = 0,
94 .end = 0,
95 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
96} };
97
98static struct resource video_rom_resource = {
99 .name = "Video ROM",
100 .start = 0xc0000,
101 .end = 0xc7fff,
102 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
103};
104
105static struct resource video_ram_resource = {
106 .name = "Video RAM area",
107 .start = 0xa0000,
108 .end = 0xbffff,
109 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
110};
111
112static struct resource standard_io_resources[] = { {
113 .name = "dma1",
114 .start = 0x0000,
115 .end = 0x001f,
116 .flags = IORESOURCE_BUSY | IORESOURCE_IO
117}, {
118 .name = "pic1",
119 .start = 0x0020,
120 .end = 0x0021,
121 .flags = IORESOURCE_BUSY | IORESOURCE_IO
122}, {
123 .name = "timer0",
124 .start = 0x0040,
125 .end = 0x0043,
126 .flags = IORESOURCE_BUSY | IORESOURCE_IO
127}, {
128 .name = "timer1",
129 .start = 0x0050,
130 .end = 0x0053,
131 .flags = IORESOURCE_BUSY | IORESOURCE_IO
132}, {
133 .name = "keyboard",
134 .start = 0x0060,
135 .end = 0x006f,
136 .flags = IORESOURCE_BUSY | IORESOURCE_IO
137}, {
138 .name = "dma page reg",
139 .start = 0x0080,
140 .end = 0x008f,
141 .flags = IORESOURCE_BUSY | IORESOURCE_IO
142}, {
143 .name = "pic2",
144 .start = 0x00a0,
145 .end = 0x00a1,
146 .flags = IORESOURCE_BUSY | IORESOURCE_IO
147}, {
148 .name = "dma2",
149 .start = 0x00c0,
150 .end = 0x00df,
151 .flags = IORESOURCE_BUSY | IORESOURCE_IO
152}, {
153 .name = "fpu",
154 .start = 0x00f0,
155 .end = 0x00ff,
156 .flags = IORESOURCE_BUSY | IORESOURCE_IO
157} };
158
159static int romsignature(const unsigned char *x)
160{
161 unsigned short sig;
162 int ret = 0;
163 if (probe_kernel_address((const unsigned short *)x, sig) == 0)
164 ret = (sig == 0xaa55);
165 return ret;
166}
167
168static int __init romchecksum(unsigned char *rom, unsigned long length)
169{
170 unsigned char *p, sum = 0;
171
172 for (p = rom; p < rom + length; p++)
173 sum += *p;
174 return sum == 0;
175}
176
177static void __init probe_roms(void)
178{
179 unsigned long start, length, upper;
180 unsigned char *rom;
181 int i;
182
183 /* video rom */
184 upper = adapter_rom_resources[0].start;
185 for (start = video_rom_resource.start; start < upper; start += 2048) {
186 rom = isa_bus_to_virt(start);
187 if (!romsignature(rom))
188 continue;
189
190 video_rom_resource.start = start;
191
192 /* 0 < length <= 0x7f * 512, historically */
193 length = rom[2] * 512;
194
195 /* if checksum okay, trust length byte */
196 if (length && romchecksum(rom, length))
197 video_rom_resource.end = start + length - 1;
198
199 request_resource(&iomem_resource, &video_rom_resource);
200 break;
201 }
202
203 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
204 if (start < upper)
205 start = upper;
206
207 /* system rom */
208 request_resource(&iomem_resource, &system_rom_resource);
209 upper = system_rom_resource.start;
210
211 /* check for extension rom (ignore length byte!) */
212 rom = isa_bus_to_virt(extension_rom_resource.start);
213 if (romsignature(rom)) {
214 length = extension_rom_resource.end - extension_rom_resource.start + 1;
215 if (romchecksum(rom, length)) {
216 request_resource(&iomem_resource, &extension_rom_resource);
217 upper = extension_rom_resource.start;
218 }
219 }
220
221 /* check for adapter roms on 2k boundaries */
222 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
223 rom = isa_bus_to_virt(start);
224 if (!romsignature(rom))
225 continue;
226
227 /* 0 < length <= 0x7f * 512, historically */
228 length = rom[2] * 512;
229
230 /* but accept any length that fits if checksum okay */
231 if (!length || start + length > upper || !romchecksum(rom, length))
232 continue;
233
234 adapter_rom_resources[i].start = start;
235 adapter_rom_resources[i].end = start + length - 1;
236 request_resource(&iomem_resource, &adapter_rom_resources[i]);
237
238 start = adapter_rom_resources[i++].end & ~2047UL;
239 }
240}
241
242/*
243 * Request address space for all standard RAM and ROM resources
244 * and also for regions reported as reserved by the e820.
245 */
246static void __init
247legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
248{
249 int i;
250
251 probe_roms();
252 for (i = 0; i < e820.nr_map; i++) {
253 struct resource *res;
254#ifndef CONFIG_RESOURCES_64BIT
255 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
256 continue;
257#endif
258 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
259 switch (e820.map[i].type) {
260 case E820_RAM: res->name = "System RAM"; break;
261 case E820_ACPI: res->name = "ACPI Tables"; break;
262 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
263 default: res->name = "reserved";
264 }
265 res->start = e820.map[i].addr;
266 res->end = res->start + e820.map[i].size - 1;
267 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
268 if (request_resource(&iomem_resource, res)) {
269 kfree(res);
270 continue;
271 }
272 if (e820.map[i].type == E820_RAM) {
273 /*
274 * We don't know which RAM region contains kernel data,
275 * so we try it repeatedly and let the resource manager
276 * test it.
277 */
278 request_resource(res, code_resource);
279 request_resource(res, data_resource);
280#ifdef CONFIG_KEXEC
281 request_resource(res, &crashk_res);
282#endif
283 }
284 }
285}
286
287/*
288 * Request address space for all standard resources
289 *
290 * This is called just before pcibios_init(), which is also a
291 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
292 */
293static int __init request_standard_resources(void)
294{
295 int i;
296
297 printk("Setting up standard PCI resources\n");
298 if (efi_enabled)
299 efi_initialize_iomem_resources(&code_resource, &data_resource);
300 else
301 legacy_init_iomem_resources(&code_resource, &data_resource);
302
303 /* EFI systems may still have VGA */
304 request_resource(&iomem_resource, &video_ram_resource);
305
306 /* request I/O space for devices used on all i[345]86 PCs */
307 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
308 request_resource(&ioport_resource, &standard_io_resources[i]);
309 return 0;
310}
311
312subsys_initcall(request_standard_resources);
313
314void __init add_memory_region(unsigned long long start,
315 unsigned long long size, int type)
316{
317 int x;
318
319 if (!efi_enabled) {
320 x = e820.nr_map;
321
322 if (x == E820MAX) {
323 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
324 return;
325 }
326
327 e820.map[x].addr = start;
328 e820.map[x].size = size;
329 e820.map[x].type = type;
330 e820.nr_map++;
331 }
332} /* add_memory_region */
333
334/*
335 * Sanitize the BIOS e820 map.
336 *
337 * Some e820 responses include overlapping entries. The following
338 * replaces the original e820 map with a new one, removing overlaps.
339 *
340 */
341int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
342{
343 struct change_member *change_tmp;
344 unsigned long current_type, last_type;
345 unsigned long long last_addr;
346 int chgidx, still_changing;
347 int overlap_entries;
348 int new_bios_entry;
349 int old_nr, new_nr, chg_nr;
350 int i;
351
352 /*
353 Visually we're performing the following (1,2,3,4 = memory types)...
354
355 Sample memory map (w/overlaps):
356 ____22__________________
357 ______________________4_
358 ____1111________________
359 _44_____________________
360 11111111________________
361 ____________________33__
362 ___________44___________
363 __________33333_________
364 ______________22________
365 ___________________2222_
366 _________111111111______
367 _____________________11_
368 _________________4______
369
370 Sanitized equivalent (no overlap):
371 1_______________________
372 _44_____________________
373 ___1____________________
374 ____22__________________
375 ______11________________
376 _________1______________
377 __________3_____________
378 ___________44___________
379 _____________33_________
380 _______________2________
381 ________________1_______
382 _________________4______
383 ___________________2____
384 ____________________33__
385 ______________________4_
386 */
387 printk("sanitize start\n");
388 /* if there's only one memory region, don't bother */
389 if (*pnr_map < 2) {
390 printk("sanitize bail 0\n");
391 return -1;
392 }
393
394 old_nr = *pnr_map;
395
396 /* bail out if we find any unreasonable addresses in bios map */
397 for (i=0; i<old_nr; i++)
398 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
399 printk("sanitize bail 1\n");
400 return -1;
401 }
402
403 /* create pointers for initial change-point information (for sorting) */
404 for (i=0; i < 2*old_nr; i++)
405 change_point[i] = &change_point_list[i];
406
407 /* record all known change-points (starting and ending addresses),
408 omitting those that are for empty memory regions */
409 chgidx = 0;
410 for (i=0; i < old_nr; i++) {
411 if (biosmap[i].size != 0) {
412 change_point[chgidx]->addr = biosmap[i].addr;
413 change_point[chgidx++]->pbios = &biosmap[i];
414 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
415 change_point[chgidx++]->pbios = &biosmap[i];
416 }
417 }
418 chg_nr = chgidx; /* true number of change-points */
419
420 /* sort change-point list by memory addresses (low -> high) */
421 still_changing = 1;
422 while (still_changing) {
423 still_changing = 0;
424 for (i=1; i < chg_nr; i++) {
425 /* if <current_addr> > <last_addr>, swap */
426 /* or, if current=<start_addr> & last=<end_addr>, swap */
427 if ((change_point[i]->addr < change_point[i-1]->addr) ||
428 ((change_point[i]->addr == change_point[i-1]->addr) &&
429 (change_point[i]->addr == change_point[i]->pbios->addr) &&
430 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
431 )
432 {
433 change_tmp = change_point[i];
434 change_point[i] = change_point[i-1];
435 change_point[i-1] = change_tmp;
436 still_changing=1;
437 }
438 }
439 }
440
441 /* create a new bios memory map, removing overlaps */
442 overlap_entries=0; /* number of entries in the overlap table */
443 new_bios_entry=0; /* index for creating new bios map entries */
444 last_type = 0; /* start with undefined memory type */
445 last_addr = 0; /* start with 0 as last starting address */
446 /* loop through change-points, determining affect on the new bios map */
447 for (chgidx=0; chgidx < chg_nr; chgidx++)
448 {
449 /* keep track of all overlapping bios entries */
450 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
451 {
452 /* add map entry to overlap list (> 1 entry implies an overlap) */
453 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
454 }
455 else
456 {
457 /* remove entry from list (order independent, so swap with last) */
458 for (i=0; i<overlap_entries; i++)
459 {
460 if (overlap_list[i] == change_point[chgidx]->pbios)
461 overlap_list[i] = overlap_list[overlap_entries-1];
462 }
463 overlap_entries--;
464 }
465 /* if there are overlapping entries, decide which "type" to use */
466 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
467 current_type = 0;
468 for (i=0; i<overlap_entries; i++)
469 if (overlap_list[i]->type > current_type)
470 current_type = overlap_list[i]->type;
471 /* continue building up new bios map based on this information */
472 if (current_type != last_type) {
473 if (last_type != 0) {
474 new_bios[new_bios_entry].size =
475 change_point[chgidx]->addr - last_addr;
476 /* move forward only if the new size was non-zero */
477 if (new_bios[new_bios_entry].size != 0)
478 if (++new_bios_entry >= E820MAX)
479 break; /* no more space left for new bios entries */
480 }
481 if (current_type != 0) {
482 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
483 new_bios[new_bios_entry].type = current_type;
484 last_addr=change_point[chgidx]->addr;
485 }
486 last_type = current_type;
487 }
488 }
489 new_nr = new_bios_entry; /* retain count for new bios entries */
490
491 /* copy new bios mapping into original location */
492 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
493 *pnr_map = new_nr;
494
495 printk("sanitize end\n");
496 return 0;
497}
498
499/*
500 * Copy the BIOS e820 map into a safe place.
501 *
502 * Sanity-check it while we're at it..
503 *
504 * If we're lucky and live on a modern system, the setup code
505 * will have given us a memory map that we can use to properly
506 * set up memory. If we aren't, we'll fake a memory map.
507 *
508 * We check to see that the memory map contains at least 2 elements
509 * before we'll use it, because the detection code in setup.S may
510 * not be perfect and most every PC known to man has two memory
511 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
512 * thinkpad 560x, for example, does not cooperate with the memory
513 * detection code.)
514 */
515int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
516{
517 /* Only one memory region (or negative)? Ignore it */
518 if (nr_map < 2)
519 return -1;
520
521 do {
522 unsigned long long start = biosmap->addr;
523 unsigned long long size = biosmap->size;
524 unsigned long long end = start + size;
525 unsigned long type = biosmap->type;
526 printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
527
528 /* Overflow in 64 bits? Ignore the memory map. */
529 if (start > end)
530 return -1;
531
532 /*
533 * Some BIOSes claim RAM in the 640k - 1M region.
534 * Not right. Fix it up.
535 */
536 if (type == E820_RAM) {
537 printk("copy_e820_map() type is E820_RAM\n");
538 if (start < 0x100000ULL && end > 0xA0000ULL) {
539 printk("copy_e820_map() lies in range...\n");
540 if (start < 0xA0000ULL) {
541 printk("copy_e820_map() start < 0xA0000ULL\n");
542 add_memory_region(start, 0xA0000ULL-start, type);
543 }
544 if (end <= 0x100000ULL) {
545 printk("copy_e820_map() end <= 0x100000ULL\n");
546 continue;
547 }
548 start = 0x100000ULL;
549 size = end - start;
550 }
551 }
552 add_memory_region(start, size, type);
553 } while (biosmap++,--nr_map);
554 return 0;
555}
556
557/*
558 * Callback for efi_memory_walk.
559 */
560static int __init
561efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
562{
563 unsigned long *max_pfn = arg, pfn;
564
565 if (start < end) {
566 pfn = PFN_UP(end -1);
567 if (pfn > *max_pfn)
568 *max_pfn = pfn;
569 }
570 return 0;
571}
572
573static int __init
574efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
575{
576 memory_present(0, PFN_UP(start), PFN_DOWN(end));
577 return 0;
578}
579
580/*
581 * Find the highest page frame number we have available
582 */
583void __init find_max_pfn(void)
584{
585 int i;
586
587 max_pfn = 0;
588 if (efi_enabled) {
589 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
590 efi_memmap_walk(efi_memory_present_wrapper, NULL);
591 return;
592 }
593
594 for (i = 0; i < e820.nr_map; i++) {
595 unsigned long start, end;
596 /* RAM? */
597 if (e820.map[i].type != E820_RAM)
598 continue;
599 start = PFN_UP(e820.map[i].addr);
600 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
601 if (start >= end)
602 continue;
603 if (end > max_pfn)
604 max_pfn = end;
605 memory_present(0, start, end);
606 }
607}
608
609/*
610 * Free all available memory for boot time allocation. Used
611 * as a callback function by efi_memory_walk()
612 */
613
614static int __init
615free_available_memory(unsigned long start, unsigned long end, void *arg)
616{
617 /* check max_low_pfn */
618 if (start >= (max_low_pfn << PAGE_SHIFT))
619 return 0;
620 if (end >= (max_low_pfn << PAGE_SHIFT))
621 end = max_low_pfn << PAGE_SHIFT;
622 if (start < end)
623 free_bootmem(start, end - start);
624
625 return 0;
626}
627/*
628 * Register fully available low RAM pages with the bootmem allocator.
629 */
630void __init register_bootmem_low_pages(unsigned long max_low_pfn)
631{
632 int i;
633
634 if (efi_enabled) {
635 efi_memmap_walk(free_available_memory, NULL);
636 return;
637 }
638 for (i = 0; i < e820.nr_map; i++) {
639 unsigned long curr_pfn, last_pfn, size;
640 /*
641 * Reserve usable low memory
642 */
643 if (e820.map[i].type != E820_RAM)
644 continue;
645 /*
646 * We are rounding up the start address of usable memory:
647 */
648 curr_pfn = PFN_UP(e820.map[i].addr);
649 if (curr_pfn >= max_low_pfn)
650 continue;
651 /*
652 * ... and at the end of the usable range downwards:
653 */
654 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
655
656 if (last_pfn > max_low_pfn)
657 last_pfn = max_low_pfn;
658
659 /*
660 * .. finally, did all the rounding and playing
661 * around just make the area go away?
662 */
663 if (last_pfn <= curr_pfn)
664 continue;
665
666 size = last_pfn - curr_pfn;
667 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
668 }
669}
670
671void __init register_memory(void)
672{
673 unsigned long gapstart, gapsize, round;
674 unsigned long long last;
675 int i;
676
677 /*
678 * Search for the bigest gap in the low 32 bits of the e820
679 * memory space.
680 */
681 last = 0x100000000ull;
682 gapstart = 0x10000000;
683 gapsize = 0x400000;
684 i = e820.nr_map;
685 while (--i >= 0) {
686 unsigned long long start = e820.map[i].addr;
687 unsigned long long end = start + e820.map[i].size;
688
689 /*
690 * Since "last" is at most 4GB, we know we'll
691 * fit in 32 bits if this condition is true
692 */
693 if (last > end) {
694 unsigned long gap = last - end;
695
696 if (gap > gapsize) {
697 gapsize = gap;
698 gapstart = end;
699 }
700 }
701 if (start < last)
702 last = start;
703 }
704
705 /*
706 * See how much we want to round up: start off with
707 * rounding to the next 1MB area.
708 */
709 round = 0x100000;
710 while ((gapsize >> 4) > round)
711 round += round;
712 /* Fun with two's complement */
713 pci_mem_start = (gapstart + round) & -round;
714
715 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
716 pci_mem_start, gapstart, gapsize);
717}
718
719void __init print_memory_map(char *who)
720{
721 int i;
722
723 for (i = 0; i < e820.nr_map; i++) {
724 printk(" %s: %016Lx - %016Lx ", who,
725 e820.map[i].addr,
726 e820.map[i].addr + e820.map[i].size);
727 switch (e820.map[i].type) {
728 case E820_RAM: printk("(usable)\n");
729 break;
730 case E820_RESERVED:
731 printk("(reserved)\n");
732 break;
733 case E820_ACPI:
734 printk("(ACPI data)\n");
735 break;
736 case E820_NVS:
737 printk("(ACPI NVS)\n");
738 break;
739 default: printk("type %lu\n", e820.map[i].type);
740 break;
741 }
742 }
743}
744
745static __init __always_inline void efi_limit_regions(unsigned long long size)
746{
747 unsigned long long current_addr = 0;
748 efi_memory_desc_t *md, *next_md;
749 void *p, *p1;
750 int i, j;
751
752 j = 0;
753 p1 = memmap.map;
754 for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
755 md = p;
756 next_md = p1;
757 current_addr = md->phys_addr +
758 PFN_PHYS(md->num_pages);
759 if (is_available_memory(md)) {
760 if (md->phys_addr >= size) continue;
761 memcpy(next_md, md, memmap.desc_size);
762 if (current_addr >= size) {
763 next_md->num_pages -=
764 PFN_UP(current_addr-size);
765 }
766 p1 += memmap.desc_size;
767 next_md = p1;
768 j++;
769 } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
770 EFI_MEMORY_RUNTIME) {
771 /* In order to make runtime services
772 * available we have to include runtime
773 * memory regions in memory map */
774 memcpy(next_md, md, memmap.desc_size);
775 p1 += memmap.desc_size;
776 next_md = p1;
777 j++;
778 }
779 }
780 memmap.nr_map = j;
781 memmap.map_end = memmap.map +
782 (memmap.nr_map * memmap.desc_size);
783}
784
785void __init limit_regions(unsigned long long size)
786{
787 unsigned long long current_addr;
788 int i;
789
790 print_memory_map("limit_regions start");
791 if (efi_enabled) {
792 efi_limit_regions(size);
793 return;
794 }
795 for (i = 0; i < e820.nr_map; i++) {
796 current_addr = e820.map[i].addr + e820.map[i].size;
797 if (current_addr < size)
798 continue;
799
800 if (e820.map[i].type != E820_RAM)
801 continue;
802
803 if (e820.map[i].addr >= size) {
804 /*
805 * This region starts past the end of the
806 * requested size, skip it completely.
807 */
808 e820.nr_map = i;
809 } else {
810 e820.nr_map = i + 1;
811 e820.map[i].size -= current_addr - size;
812 }
813 print_memory_map("limit_regions endfor");
814 return;
815 }
816 print_memory_map("limit_regions endfunc");
817}
818
819 /*
820 * This function checks if the entire range <start,end> is mapped with type.
821 *
822 * Note: this function only works correct if the e820 table is sorted and
823 * not-overlapping, which is the case
824 */
825int __init
826e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
827{
828 u64 start = s;
829 u64 end = e;
830 int i;
831 for (i = 0; i < e820.nr_map; i++) {
832 struct e820entry *ei = &e820.map[i];
833 if (type && ei->type != type)
834 continue;
835 /* is the region (part) in overlap with the current region ?*/
836 if (ei->addr >= end || ei->addr + ei->size <= start)
837 continue;
838 /* if the region is at the beginning of <start,end> we move
839 * start to the end of the region since it's ok until there
840 */
841 if (ei->addr <= start)
842 start = ei->addr + ei->size;
843 /* if start is now at or beyond end, we're done, full
844 * coverage */
845 if (start >= end)
846 return 1; /* we're done */
847 }
848 return 0;
849}
850
851static int __init parse_memmap(char *arg)
852{
853 if (!arg)
854 return -EINVAL;
855
856 if (strcmp(arg, "exactmap") == 0) {
857#ifdef CONFIG_CRASH_DUMP
858 /* If we are doing a crash dump, we
859 * still need to know the real mem
860 * size before original memory map is
861 * reset.
862 */
863 find_max_pfn();
864 saved_max_pfn = max_pfn;
865#endif
866 e820.nr_map = 0;
867 user_defined_memmap = 1;
868 } else {
869 /* If the user specifies memory size, we
870 * limit the BIOS-provided memory map to
871 * that size. exactmap can be used to specify
872 * the exact map. mem=number can be used to
873 * trim the existing memory map.
874 */
875 unsigned long long start_at, mem_size;
876
877 mem_size = memparse(arg, &arg);
878 if (*arg == '@') {
879 start_at = memparse(arg+1, &arg);
880 add_memory_region(start_at, mem_size, E820_RAM);
881 } else if (*arg == '#') {
882 start_at = memparse(arg+1, &arg);
883 add_memory_region(start_at, mem_size, E820_ACPI);
884 } else if (*arg == '$') {
885 start_at = memparse(arg+1, &arg);
886 add_memory_region(start_at, mem_size, E820_RESERVED);
887 } else {
888 limit_regions(mem_size);
889 user_defined_memmap = 1;
890 }
891 }
892 return 0;
893}
894early_param("memmap", parse_memmap);
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 8b40648d0ef0..b92c7f0a358a 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -194,17 +194,24 @@ inline int efi_set_rtc_mmss(unsigned long nowtime)
194 return 0; 194 return 0;
195} 195}
196/* 196/*
197 * This should only be used during kernel init and before runtime 197 * This is used during kernel init before runtime
198 * services have been remapped, therefore, we'll need to call in physical 198 * services have been remapped and also during suspend, therefore,
199 * mode. Note, this call isn't used later, so mark it __init. 199 * we'll need to call both in physical and virtual modes.
200 */ 200 */
201inline unsigned long __init efi_get_time(void) 201inline unsigned long efi_get_time(void)
202{ 202{
203 efi_status_t status; 203 efi_status_t status;
204 efi_time_t eft; 204 efi_time_t eft;
205 efi_time_cap_t cap; 205 efi_time_cap_t cap;
206 206
207 status = phys_efi_get_time(&eft, &cap); 207 if (efi.get_time) {
208 /* if we are in virtual mode use remapped function */
209 status = efi.get_time(&eft, &cap);
210 } else {
211 /* we are in physical mode */
212 status = phys_efi_get_time(&eft, &cap);
213 }
214
208 if (status != EFI_SUCCESS) 215 if (status != EFI_SUCCESS)
209 printk("Oops: efitime: can't read time status: 0x%lx\n",status); 216 printk("Oops: efitime: can't read time status: 0x%lx\n",status);
210 217
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5a63d6fdb70e..de34b7fed3c1 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -30,12 +30,13 @@
30 * 18(%esp) - %eax 30 * 18(%esp) - %eax
31 * 1C(%esp) - %ds 31 * 1C(%esp) - %ds
32 * 20(%esp) - %es 32 * 20(%esp) - %es
33 * 24(%esp) - orig_eax 33 * 24(%esp) - %gs
34 * 28(%esp) - %eip 34 * 28(%esp) - orig_eax
35 * 2C(%esp) - %cs 35 * 2C(%esp) - %eip
36 * 30(%esp) - %eflags 36 * 30(%esp) - %cs
37 * 34(%esp) - %oldesp 37 * 34(%esp) - %eflags
38 * 38(%esp) - %oldss 38 * 38(%esp) - %oldesp
39 * 3C(%esp) - %oldss
39 * 40 *
40 * "current" is in register %ebx during any slow entries. 41 * "current" is in register %ebx during any slow entries.
41 */ 42 */
@@ -48,26 +49,24 @@
48#include <asm/smp.h> 49#include <asm/smp.h>
49#include <asm/page.h> 50#include <asm/page.h>
50#include <asm/desc.h> 51#include <asm/desc.h>
52#include <asm/percpu.h>
51#include <asm/dwarf2.h> 53#include <asm/dwarf2.h>
52#include "irq_vectors.h" 54#include "irq_vectors.h"
53 55
54#define nr_syscalls ((syscall_table_size)/4) 56/*
57 * We use macros for low-level operations which need to be overridden
58 * for paravirtualization. The following will never clobber any registers:
59 * INTERRUPT_RETURN (aka. "iret")
60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
61 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
62 *
63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
65 * Allowing a register to be clobbered can shrink the paravirt replacement
66 * enough to patch inline, increasing performance.
67 */
55 68
56EBX = 0x00 69#define nr_syscalls ((syscall_table_size)/4)
57ECX = 0x04
58EDX = 0x08
59ESI = 0x0C
60EDI = 0x10
61EBP = 0x14
62EAX = 0x18
63DS = 0x1C
64ES = 0x20
65ORIG_EAX = 0x24
66EIP = 0x28
67CS = 0x2C
68EFLAGS = 0x30
69OLDESP = 0x34
70OLDSS = 0x38
71 70
72CF_MASK = 0x00000001 71CF_MASK = 0x00000001
73TF_MASK = 0x00000100 72TF_MASK = 0x00000100
@@ -76,23 +75,16 @@ DF_MASK = 0x00000400
76NT_MASK = 0x00004000 75NT_MASK = 0x00004000
77VM_MASK = 0x00020000 76VM_MASK = 0x00020000
78 77
79/* These are replaces for paravirtualization */
80#define DISABLE_INTERRUPTS cli
81#define ENABLE_INTERRUPTS sti
82#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
83#define INTERRUPT_RETURN iret
84#define GET_CR0_INTO_EAX movl %cr0, %eax
85
86#ifdef CONFIG_PREEMPT 78#ifdef CONFIG_PREEMPT
87#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF 79#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
88#else 80#else
89#define preempt_stop 81#define preempt_stop(clobbers)
90#define resume_kernel restore_nocheck 82#define resume_kernel restore_nocheck
91#endif 83#endif
92 84
93.macro TRACE_IRQS_IRET 85.macro TRACE_IRQS_IRET
94#ifdef CONFIG_TRACE_IRQFLAGS 86#ifdef CONFIG_TRACE_IRQFLAGS
95 testl $IF_MASK,EFLAGS(%esp) # interrupts off? 87 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
96 jz 1f 88 jz 1f
97 TRACE_IRQS_ON 89 TRACE_IRQS_ON
981: 901:
@@ -107,6 +99,9 @@ VM_MASK = 0x00020000
107 99
108#define SAVE_ALL \ 100#define SAVE_ALL \
109 cld; \ 101 cld; \
102 pushl %gs; \
103 CFI_ADJUST_CFA_OFFSET 4;\
104 /*CFI_REL_OFFSET gs, 0;*/\
110 pushl %es; \ 105 pushl %es; \
111 CFI_ADJUST_CFA_OFFSET 4;\ 106 CFI_ADJUST_CFA_OFFSET 4;\
112 /*CFI_REL_OFFSET es, 0;*/\ 107 /*CFI_REL_OFFSET es, 0;*/\
@@ -136,7 +131,9 @@ VM_MASK = 0x00020000
136 CFI_REL_OFFSET ebx, 0;\ 131 CFI_REL_OFFSET ebx, 0;\
137 movl $(__USER_DS), %edx; \ 132 movl $(__USER_DS), %edx; \
138 movl %edx, %ds; \ 133 movl %edx, %ds; \
139 movl %edx, %es; 134 movl %edx, %es; \
135 movl $(__KERNEL_PDA), %edx; \
136 movl %edx, %gs
140 137
141#define RESTORE_INT_REGS \ 138#define RESTORE_INT_REGS \
142 popl %ebx; \ 139 popl %ebx; \
@@ -169,17 +166,22 @@ VM_MASK = 0x00020000
1692: popl %es; \ 1662: popl %es; \
170 CFI_ADJUST_CFA_OFFSET -4;\ 167 CFI_ADJUST_CFA_OFFSET -4;\
171 /*CFI_RESTORE es;*/\ 168 /*CFI_RESTORE es;*/\
172.section .fixup,"ax"; \ 1693: popl %gs; \
1733: movl $0,(%esp); \ 170 CFI_ADJUST_CFA_OFFSET -4;\
174 jmp 1b; \ 171 /*CFI_RESTORE gs;*/\
172.pushsection .fixup,"ax"; \
1754: movl $0,(%esp); \ 1734: movl $0,(%esp); \
174 jmp 1b; \
1755: movl $0,(%esp); \
176 jmp 2b; \ 176 jmp 2b; \
177.previous; \ 1776: movl $0,(%esp); \
178 jmp 3b; \
178.section __ex_table,"a";\ 179.section __ex_table,"a";\
179 .align 4; \ 180 .align 4; \
180 .long 1b,3b; \ 181 .long 1b,4b; \
181 .long 2b,4b; \ 182 .long 2b,5b; \
182.previous 183 .long 3b,6b; \
184.popsection
183 185
184#define RING0_INT_FRAME \ 186#define RING0_INT_FRAME \
185 CFI_STARTPROC simple;\ 187 CFI_STARTPROC simple;\
@@ -198,18 +200,18 @@ VM_MASK = 0x00020000
198#define RING0_PTREGS_FRAME \ 200#define RING0_PTREGS_FRAME \
199 CFI_STARTPROC simple;\ 201 CFI_STARTPROC simple;\
200 CFI_SIGNAL_FRAME;\ 202 CFI_SIGNAL_FRAME;\
201 CFI_DEF_CFA esp, OLDESP-EBX;\ 203 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
202 /*CFI_OFFSET cs, CS-OLDESP;*/\ 204 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
203 CFI_OFFSET eip, EIP-OLDESP;\ 205 CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
204 /*CFI_OFFSET es, ES-OLDESP;*/\ 206 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
205 /*CFI_OFFSET ds, DS-OLDESP;*/\ 207 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
206 CFI_OFFSET eax, EAX-OLDESP;\ 208 CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
207 CFI_OFFSET ebp, EBP-OLDESP;\ 209 CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
208 CFI_OFFSET edi, EDI-OLDESP;\ 210 CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
209 CFI_OFFSET esi, ESI-OLDESP;\ 211 CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
210 CFI_OFFSET edx, EDX-OLDESP;\ 212 CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
211 CFI_OFFSET ecx, ECX-OLDESP;\ 213 CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
212 CFI_OFFSET ebx, EBX-OLDESP 214 CFI_OFFSET ebx, PT_EBX-PT_OLDESP
213 215
214ENTRY(ret_from_fork) 216ENTRY(ret_from_fork)
215 CFI_STARTPROC 217 CFI_STARTPROC
@@ -237,17 +239,18 @@ ENTRY(ret_from_fork)
237 ALIGN 239 ALIGN
238 RING0_PTREGS_FRAME 240 RING0_PTREGS_FRAME
239ret_from_exception: 241ret_from_exception:
240 preempt_stop 242 preempt_stop(CLBR_ANY)
241ret_from_intr: 243ret_from_intr:
242 GET_THREAD_INFO(%ebp) 244 GET_THREAD_INFO(%ebp)
243check_userspace: 245check_userspace:
244 movl EFLAGS(%esp), %eax # mix EFLAGS and CS 246 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
245 movb CS(%esp), %al 247 movb PT_CS(%esp), %al
246 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax 248 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
247 cmpl $USER_RPL, %eax 249 cmpl $USER_RPL, %eax
248 jb resume_kernel # not returning to v8086 or userspace 250 jb resume_kernel # not returning to v8086 or userspace
251
249ENTRY(resume_userspace) 252ENTRY(resume_userspace)
250 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 253 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
251 # setting need_resched or sigpending 254 # setting need_resched or sigpending
252 # between sampling and the iret 255 # between sampling and the iret
253 movl TI_flags(%ebp), %ecx 256 movl TI_flags(%ebp), %ecx
@@ -258,14 +261,14 @@ ENTRY(resume_userspace)
258 261
259#ifdef CONFIG_PREEMPT 262#ifdef CONFIG_PREEMPT
260ENTRY(resume_kernel) 263ENTRY(resume_kernel)
261 DISABLE_INTERRUPTS 264 DISABLE_INTERRUPTS(CLBR_ANY)
262 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 265 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
263 jnz restore_nocheck 266 jnz restore_nocheck
264need_resched: 267need_resched:
265 movl TI_flags(%ebp), %ecx # need_resched set ? 268 movl TI_flags(%ebp), %ecx # need_resched set ?
266 testb $_TIF_NEED_RESCHED, %cl 269 testb $_TIF_NEED_RESCHED, %cl
267 jz restore_all 270 jz restore_all
268 testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? 271 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
269 jz restore_all 272 jz restore_all
270 call preempt_schedule_irq 273 call preempt_schedule_irq
271 jmp need_resched 274 jmp need_resched
@@ -287,7 +290,7 @@ sysenter_past_esp:
287 * No need to follow this irqs on/off section: the syscall 290 * No need to follow this irqs on/off section: the syscall
288 * disabled irqs and here we enable it straight after entry: 291 * disabled irqs and here we enable it straight after entry:
289 */ 292 */
290 ENABLE_INTERRUPTS 293 ENABLE_INTERRUPTS(CLBR_NONE)
291 pushl $(__USER_DS) 294 pushl $(__USER_DS)
292 CFI_ADJUST_CFA_OFFSET 4 295 CFI_ADJUST_CFA_OFFSET 4
293 /*CFI_REL_OFFSET ss, 0*/ 296 /*CFI_REL_OFFSET ss, 0*/
@@ -331,20 +334,27 @@ sysenter_past_esp:
331 cmpl $(nr_syscalls), %eax 334 cmpl $(nr_syscalls), %eax
332 jae syscall_badsys 335 jae syscall_badsys
333 call *sys_call_table(,%eax,4) 336 call *sys_call_table(,%eax,4)
334 movl %eax,EAX(%esp) 337 movl %eax,PT_EAX(%esp)
335 DISABLE_INTERRUPTS 338 DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
336 TRACE_IRQS_OFF 339 TRACE_IRQS_OFF
337 movl TI_flags(%ebp), %ecx 340 movl TI_flags(%ebp), %ecx
338 testw $_TIF_ALLWORK_MASK, %cx 341 testw $_TIF_ALLWORK_MASK, %cx
339 jne syscall_exit_work 342 jne syscall_exit_work
340/* if something modifies registers it must also disable sysexit */ 343/* if something modifies registers it must also disable sysexit */
341 movl EIP(%esp), %edx 344 movl PT_EIP(%esp), %edx
342 movl OLDESP(%esp), %ecx 345 movl PT_OLDESP(%esp), %ecx
343 xorl %ebp,%ebp 346 xorl %ebp,%ebp
344 TRACE_IRQS_ON 347 TRACE_IRQS_ON
3481: mov PT_GS(%esp), %gs
345 ENABLE_INTERRUPTS_SYSEXIT 349 ENABLE_INTERRUPTS_SYSEXIT
346 CFI_ENDPROC 350 CFI_ENDPROC
347 351.pushsection .fixup,"ax"
3522: movl $0,PT_GS(%esp)
353 jmp 1b
354.section __ex_table,"a"
355 .align 4
356 .long 1b,2b
357.popsection
348 358
349 # system call handler stub 359 # system call handler stub
350ENTRY(system_call) 360ENTRY(system_call)
@@ -353,7 +363,7 @@ ENTRY(system_call)
353 CFI_ADJUST_CFA_OFFSET 4 363 CFI_ADJUST_CFA_OFFSET 4
354 SAVE_ALL 364 SAVE_ALL
355 GET_THREAD_INFO(%ebp) 365 GET_THREAD_INFO(%ebp)
356 testl $TF_MASK,EFLAGS(%esp) 366 testl $TF_MASK,PT_EFLAGS(%esp)
357 jz no_singlestep 367 jz no_singlestep
358 orl $_TIF_SINGLESTEP,TI_flags(%ebp) 368 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
359no_singlestep: 369no_singlestep:
@@ -365,9 +375,9 @@ no_singlestep:
365 jae syscall_badsys 375 jae syscall_badsys
366syscall_call: 376syscall_call:
367 call *sys_call_table(,%eax,4) 377 call *sys_call_table(,%eax,4)
368 movl %eax,EAX(%esp) # store the return value 378 movl %eax,PT_EAX(%esp) # store the return value
369syscall_exit: 379syscall_exit:
370 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 380 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
371 # setting need_resched or sigpending 381 # setting need_resched or sigpending
372 # between sampling and the iret 382 # between sampling and the iret
373 TRACE_IRQS_OFF 383 TRACE_IRQS_OFF
@@ -376,12 +386,12 @@ syscall_exit:
376 jne syscall_exit_work 386 jne syscall_exit_work
377 387
378restore_all: 388restore_all:
379 movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS 389 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
380 # Warning: OLDSS(%esp) contains the wrong/random values if we 390 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
381 # are returning to the kernel. 391 # are returning to the kernel.
382 # See comments in process.c:copy_thread() for details. 392 # See comments in process.c:copy_thread() for details.
383 movb OLDSS(%esp), %ah 393 movb PT_OLDSS(%esp), %ah
384 movb CS(%esp), %al 394 movb PT_CS(%esp), %al
385 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax 395 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
386 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax 396 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
387 CFI_REMEMBER_STATE 397 CFI_REMEMBER_STATE
@@ -390,13 +400,13 @@ restore_nocheck:
390 TRACE_IRQS_IRET 400 TRACE_IRQS_IRET
391restore_nocheck_notrace: 401restore_nocheck_notrace:
392 RESTORE_REGS 402 RESTORE_REGS
393 addl $4, %esp 403 addl $4, %esp # skip orig_eax/error_code
394 CFI_ADJUST_CFA_OFFSET -4 404 CFI_ADJUST_CFA_OFFSET -4
3951: INTERRUPT_RETURN 4051: INTERRUPT_RETURN
396.section .fixup,"ax" 406.section .fixup,"ax"
397iret_exc: 407iret_exc:
398 TRACE_IRQS_ON 408 TRACE_IRQS_ON
399 ENABLE_INTERRUPTS 409 ENABLE_INTERRUPTS(CLBR_NONE)
400 pushl $0 # no error code 410 pushl $0 # no error code
401 pushl $do_iret_error 411 pushl $do_iret_error
402 jmp error_code 412 jmp error_code
@@ -408,33 +418,42 @@ iret_exc:
408 418
409 CFI_RESTORE_STATE 419 CFI_RESTORE_STATE
410ldt_ss: 420ldt_ss:
411 larl OLDSS(%esp), %eax 421 larl PT_OLDSS(%esp), %eax
412 jnz restore_nocheck 422 jnz restore_nocheck
413 testl $0x00400000, %eax # returning to 32bit stack? 423 testl $0x00400000, %eax # returning to 32bit stack?
414 jnz restore_nocheck # allright, normal return 424 jnz restore_nocheck # allright, normal return
425
426#ifdef CONFIG_PARAVIRT
427 /*
428 * The kernel can't run on a non-flat stack if paravirt mode
429 * is active. Rather than try to fixup the high bits of
430 * ESP, bypass this code entirely. This may break DOSemu
431 * and/or Wine support in a paravirt VM, although the option
432 * is still available to implement the setting of the high
433 * 16-bits in the INTERRUPT_RETURN paravirt-op.
434 */
435 cmpl $0, paravirt_ops+PARAVIRT_enabled
436 jne restore_nocheck
437#endif
438
415 /* If returning to userspace with 16bit stack, 439 /* If returning to userspace with 16bit stack,
416 * try to fix the higher word of ESP, as the CPU 440 * try to fix the higher word of ESP, as the CPU
417 * won't restore it. 441 * won't restore it.
418 * This is an "official" bug of all the x86-compatible 442 * This is an "official" bug of all the x86-compatible
419 * CPUs, which we can try to work around to make 443 * CPUs, which we can try to work around to make
420 * dosemu and wine happy. */ 444 * dosemu and wine happy. */
421 subl $8, %esp # reserve space for switch16 pointer 445 movl PT_OLDESP(%esp), %eax
422 CFI_ADJUST_CFA_OFFSET 8 446 movl %esp, %edx
423 DISABLE_INTERRUPTS 447 call patch_espfix_desc
448 pushl $__ESPFIX_SS
449 CFI_ADJUST_CFA_OFFSET 4
450 pushl %eax
451 CFI_ADJUST_CFA_OFFSET 4
452 DISABLE_INTERRUPTS(CLBR_EAX)
424 TRACE_IRQS_OFF 453 TRACE_IRQS_OFF
425 movl %esp, %eax 454 lss (%esp), %esp
426 /* Set up the 16bit stack frame with switch32 pointer on top, 455 CFI_ADJUST_CFA_OFFSET -8
427 * and a switch16 pointer on top of the current frame. */ 456 jmp restore_nocheck
428 call setup_x86_bogus_stack
429 CFI_ADJUST_CFA_OFFSET -8 # frame has moved
430 TRACE_IRQS_IRET
431 RESTORE_REGS
432 lss 20+4(%esp), %esp # switch to 16bit stack
4331: INTERRUPT_RETURN
434.section __ex_table,"a"
435 .align 4
436 .long 1b,iret_exc
437.previous
438 CFI_ENDPROC 457 CFI_ENDPROC
439 458
440 # perform work that needs to be done immediately before resumption 459 # perform work that needs to be done immediately before resumption
@@ -445,7 +464,7 @@ work_pending:
445 jz work_notifysig 464 jz work_notifysig
446work_resched: 465work_resched:
447 call schedule 466 call schedule
448 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 467 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
449 # setting need_resched or sigpending 468 # setting need_resched or sigpending
450 # between sampling and the iret 469 # between sampling and the iret
451 TRACE_IRQS_OFF 470 TRACE_IRQS_OFF
@@ -458,7 +477,8 @@ work_resched:
458 477
459work_notifysig: # deal with pending signals and 478work_notifysig: # deal with pending signals and
460 # notify-resume requests 479 # notify-resume requests
461 testl $VM_MASK, EFLAGS(%esp) 480#ifdef CONFIG_VM86
481 testl $VM_MASK, PT_EFLAGS(%esp)
462 movl %esp, %eax 482 movl %esp, %eax
463 jne work_notifysig_v86 # returning to kernel-space or 483 jne work_notifysig_v86 # returning to kernel-space or
464 # vm86-space 484 # vm86-space
@@ -468,29 +488,30 @@ work_notifysig: # deal with pending signals and
468 488
469 ALIGN 489 ALIGN
470work_notifysig_v86: 490work_notifysig_v86:
471#ifdef CONFIG_VM86
472 pushl %ecx # save ti_flags for do_notify_resume 491 pushl %ecx # save ti_flags for do_notify_resume
473 CFI_ADJUST_CFA_OFFSET 4 492 CFI_ADJUST_CFA_OFFSET 4
474 call save_v86_state # %eax contains pt_regs pointer 493 call save_v86_state # %eax contains pt_regs pointer
475 popl %ecx 494 popl %ecx
476 CFI_ADJUST_CFA_OFFSET -4 495 CFI_ADJUST_CFA_OFFSET -4
477 movl %eax, %esp 496 movl %eax, %esp
497#else
498 movl %esp, %eax
499#endif
478 xorl %edx, %edx 500 xorl %edx, %edx
479 call do_notify_resume 501 call do_notify_resume
480 jmp resume_userspace_sig 502 jmp resume_userspace_sig
481#endif
482 503
483 # perform syscall exit tracing 504 # perform syscall exit tracing
484 ALIGN 505 ALIGN
485syscall_trace_entry: 506syscall_trace_entry:
486 movl $-ENOSYS,EAX(%esp) 507 movl $-ENOSYS,PT_EAX(%esp)
487 movl %esp, %eax 508 movl %esp, %eax
488 xorl %edx,%edx 509 xorl %edx,%edx
489 call do_syscall_trace 510 call do_syscall_trace
490 cmpl $0, %eax 511 cmpl $0, %eax
491 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, 512 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
492 # so must skip actual syscall 513 # so must skip actual syscall
493 movl ORIG_EAX(%esp), %eax 514 movl PT_ORIG_EAX(%esp), %eax
494 cmpl $(nr_syscalls), %eax 515 cmpl $(nr_syscalls), %eax
495 jnae syscall_call 516 jnae syscall_call
496 jmp syscall_exit 517 jmp syscall_exit
@@ -501,7 +522,7 @@ syscall_exit_work:
501 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl 522 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
502 jz work_pending 523 jz work_pending
503 TRACE_IRQS_ON 524 TRACE_IRQS_ON
504 ENABLE_INTERRUPTS # could let do_syscall_trace() call 525 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
505 # schedule() instead 526 # schedule() instead
506 movl %esp, %eax 527 movl %esp, %eax
507 movl $1, %edx 528 movl $1, %edx
@@ -515,39 +536,38 @@ syscall_fault:
515 CFI_ADJUST_CFA_OFFSET 4 536 CFI_ADJUST_CFA_OFFSET 4
516 SAVE_ALL 537 SAVE_ALL
517 GET_THREAD_INFO(%ebp) 538 GET_THREAD_INFO(%ebp)
518 movl $-EFAULT,EAX(%esp) 539 movl $-EFAULT,PT_EAX(%esp)
519 jmp resume_userspace 540 jmp resume_userspace
520 541
521syscall_badsys: 542syscall_badsys:
522 movl $-ENOSYS,EAX(%esp) 543 movl $-ENOSYS,PT_EAX(%esp)
523 jmp resume_userspace 544 jmp resume_userspace
524 CFI_ENDPROC 545 CFI_ENDPROC
525 546
526#define FIXUP_ESPFIX_STACK \ 547#define FIXUP_ESPFIX_STACK \
527 movl %esp, %eax; \ 548 /* since we are on a wrong stack, we cant make it a C code :( */ \
528 /* switch to 32bit stack using the pointer on top of 16bit stack */ \ 549 movl %gs:PDA_cpu, %ebx; \
529 lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ 550 PER_CPU(cpu_gdt_descr, %ebx); \
530 /* copy data from 16bit stack to 32bit stack */ \ 551 movl GDS_address(%ebx), %ebx; \
531 call fixup_x86_bogus_stack; \ 552 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
532 /* put ESP to the proper location */ \ 553 addl %esp, %eax; \
533 movl %eax, %esp; 554 pushl $__KERNEL_DS; \
534#define UNWIND_ESPFIX_STACK \ 555 CFI_ADJUST_CFA_OFFSET 4; \
535 pushl %eax; \ 556 pushl %eax; \
536 CFI_ADJUST_CFA_OFFSET 4; \ 557 CFI_ADJUST_CFA_OFFSET 4; \
558 lss (%esp), %esp; \
559 CFI_ADJUST_CFA_OFFSET -8;
560#define UNWIND_ESPFIX_STACK \
537 movl %ss, %eax; \ 561 movl %ss, %eax; \
538 /* see if on 16bit stack */ \ 562 /* see if on espfix stack */ \
539 cmpw $__ESPFIX_SS, %ax; \ 563 cmpw $__ESPFIX_SS, %ax; \
540 je 28f; \ 564 jne 27f; \
54127: popl %eax; \ 565 movl $__KERNEL_DS, %eax; \
542 CFI_ADJUST_CFA_OFFSET -4; \
543.section .fixup,"ax"; \
54428: movl $__KERNEL_DS, %eax; \
545 movl %eax, %ds; \ 566 movl %eax, %ds; \
546 movl %eax, %es; \ 567 movl %eax, %es; \
547 /* switch to 32bit stack */ \ 568 /* switch to normal stack */ \
548 FIXUP_ESPFIX_STACK; \ 569 FIXUP_ESPFIX_STACK; \
549 jmp 27b; \ 57027:;
550.previous
551 571
552/* 572/*
553 * Build the entry stubs and pointer table with 573 * Build the entry stubs and pointer table with
@@ -608,13 +628,16 @@ KPROBE_ENTRY(page_fault)
608 CFI_ADJUST_CFA_OFFSET 4 628 CFI_ADJUST_CFA_OFFSET 4
609 ALIGN 629 ALIGN
610error_code: 630error_code:
631 /* the function address is in %gs's slot on the stack */
632 pushl %es
633 CFI_ADJUST_CFA_OFFSET 4
634 /*CFI_REL_OFFSET es, 0*/
611 pushl %ds 635 pushl %ds
612 CFI_ADJUST_CFA_OFFSET 4 636 CFI_ADJUST_CFA_OFFSET 4
613 /*CFI_REL_OFFSET ds, 0*/ 637 /*CFI_REL_OFFSET ds, 0*/
614 pushl %eax 638 pushl %eax
615 CFI_ADJUST_CFA_OFFSET 4 639 CFI_ADJUST_CFA_OFFSET 4
616 CFI_REL_OFFSET eax, 0 640 CFI_REL_OFFSET eax, 0
617 xorl %eax, %eax
618 pushl %ebp 641 pushl %ebp
619 CFI_ADJUST_CFA_OFFSET 4 642 CFI_ADJUST_CFA_OFFSET 4
620 CFI_REL_OFFSET ebp, 0 643 CFI_REL_OFFSET ebp, 0
@@ -627,7 +650,6 @@ error_code:
627 pushl %edx 650 pushl %edx
628 CFI_ADJUST_CFA_OFFSET 4 651 CFI_ADJUST_CFA_OFFSET 4
629 CFI_REL_OFFSET edx, 0 652 CFI_REL_OFFSET edx, 0
630 decl %eax # eax = -1
631 pushl %ecx 653 pushl %ecx
632 CFI_ADJUST_CFA_OFFSET 4 654 CFI_ADJUST_CFA_OFFSET 4
633 CFI_REL_OFFSET ecx, 0 655 CFI_REL_OFFSET ecx, 0
@@ -635,18 +657,20 @@ error_code:
635 CFI_ADJUST_CFA_OFFSET 4 657 CFI_ADJUST_CFA_OFFSET 4
636 CFI_REL_OFFSET ebx, 0 658 CFI_REL_OFFSET ebx, 0
637 cld 659 cld
638 pushl %es 660 pushl %gs
639 CFI_ADJUST_CFA_OFFSET 4 661 CFI_ADJUST_CFA_OFFSET 4
640 /*CFI_REL_OFFSET es, 0*/ 662 /*CFI_REL_OFFSET gs, 0*/
663 movl $(__KERNEL_PDA), %ecx
664 movl %ecx, %gs
641 UNWIND_ESPFIX_STACK 665 UNWIND_ESPFIX_STACK
642 popl %ecx 666 popl %ecx
643 CFI_ADJUST_CFA_OFFSET -4 667 CFI_ADJUST_CFA_OFFSET -4
644 /*CFI_REGISTER es, ecx*/ 668 /*CFI_REGISTER es, ecx*/
645 movl ES(%esp), %edi # get the function address 669 movl PT_GS(%esp), %edi # get the function address
646 movl ORIG_EAX(%esp), %edx # get the error code 670 movl PT_ORIG_EAX(%esp), %edx # get the error code
647 movl %eax, ORIG_EAX(%esp) 671 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
648 movl %ecx, ES(%esp) 672 mov %ecx, PT_GS(%esp)
649 /*CFI_REL_OFFSET es, ES*/ 673 /*CFI_REL_OFFSET gs, ES*/
650 movl $(__USER_DS), %ecx 674 movl $(__USER_DS), %ecx
651 movl %ecx, %ds 675 movl %ecx, %ds
652 movl %ecx, %es 676 movl %ecx, %es
@@ -682,7 +706,7 @@ ENTRY(device_not_available)
682 GET_CR0_INTO_EAX 706 GET_CR0_INTO_EAX
683 testl $0x4, %eax # EM (math emulation bit) 707 testl $0x4, %eax # EM (math emulation bit)
684 jne device_not_available_emulate 708 jne device_not_available_emulate
685 preempt_stop 709 preempt_stop(CLBR_ANY)
686 call math_state_restore 710 call math_state_restore
687 jmp ret_from_exception 711 jmp ret_from_exception
688device_not_available_emulate: 712device_not_available_emulate:
@@ -754,7 +778,7 @@ KPROBE_ENTRY(nmi)
754 cmpw $__ESPFIX_SS, %ax 778 cmpw $__ESPFIX_SS, %ax
755 popl %eax 779 popl %eax
756 CFI_ADJUST_CFA_OFFSET -4 780 CFI_ADJUST_CFA_OFFSET -4
757 je nmi_16bit_stack 781 je nmi_espfix_stack
758 cmpl $sysenter_entry,(%esp) 782 cmpl $sysenter_entry,(%esp)
759 je nmi_stack_fixup 783 je nmi_stack_fixup
760 pushl %eax 784 pushl %eax
@@ -797,7 +821,7 @@ nmi_debug_stack_check:
797 FIX_STACK(24,nmi_stack_correct, 1) 821 FIX_STACK(24,nmi_stack_correct, 1)
798 jmp nmi_stack_correct 822 jmp nmi_stack_correct
799 823
800nmi_16bit_stack: 824nmi_espfix_stack:
801 /* We have a RING0_INT_FRAME here. 825 /* We have a RING0_INT_FRAME here.
802 * 826 *
803 * create the pointer to lss back 827 * create the pointer to lss back
@@ -806,7 +830,6 @@ nmi_16bit_stack:
806 CFI_ADJUST_CFA_OFFSET 4 830 CFI_ADJUST_CFA_OFFSET 4
807 pushl %esp 831 pushl %esp
808 CFI_ADJUST_CFA_OFFSET 4 832 CFI_ADJUST_CFA_OFFSET 4
809 movzwl %sp, %esp
810 addw $4, (%esp) 833 addw $4, (%esp)
811 /* copy the iret frame of 12 bytes */ 834 /* copy the iret frame of 12 bytes */
812 .rept 3 835 .rept 3
@@ -817,11 +840,11 @@ nmi_16bit_stack:
817 CFI_ADJUST_CFA_OFFSET 4 840 CFI_ADJUST_CFA_OFFSET 4
818 SAVE_ALL 841 SAVE_ALL
819 FIXUP_ESPFIX_STACK # %eax == %esp 842 FIXUP_ESPFIX_STACK # %eax == %esp
820 CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
821 xorl %edx,%edx # zero error code 843 xorl %edx,%edx # zero error code
822 call do_nmi 844 call do_nmi
823 RESTORE_REGS 845 RESTORE_REGS
824 lss 12+4(%esp), %esp # back to 16bit stack 846 lss 12+4(%esp), %esp # back to espfix stack
847 CFI_ADJUST_CFA_OFFSET -24
8251: INTERRUPT_RETURN 8481: INTERRUPT_RETURN
826 CFI_ENDPROC 849 CFI_ENDPROC
827.section __ex_table,"a" 850.section __ex_table,"a"
@@ -830,6 +853,19 @@ nmi_16bit_stack:
830.previous 853.previous
831KPROBE_END(nmi) 854KPROBE_END(nmi)
832 855
856#ifdef CONFIG_PARAVIRT
857ENTRY(native_iret)
8581: iret
859.section __ex_table,"a"
860 .align 4
861 .long 1b,iret_exc
862.previous
863
864ENTRY(native_irq_enable_sysexit)
865 sti
866 sysexit
867#endif
868
833KPROBE_ENTRY(int3) 869KPROBE_ENTRY(int3)
834 RING0_INT_FRAME 870 RING0_INT_FRAME
835 pushl $-1 # mark this as an int 871 pushl $-1 # mark this as an int
@@ -949,26 +985,27 @@ ENTRY(arch_unwind_init_running)
949 movl 4(%esp), %edx 985 movl 4(%esp), %edx
950 movl (%esp), %ecx 986 movl (%esp), %ecx
951 leal 4(%esp), %eax 987 leal 4(%esp), %eax
952 movl %ebx, EBX(%edx) 988 movl %ebx, PT_EBX(%edx)
953 xorl %ebx, %ebx 989 xorl %ebx, %ebx
954 movl %ebx, ECX(%edx) 990 movl %ebx, PT_ECX(%edx)
955 movl %ebx, EDX(%edx) 991 movl %ebx, PT_EDX(%edx)
956 movl %esi, ESI(%edx) 992 movl %esi, PT_ESI(%edx)
957 movl %edi, EDI(%edx) 993 movl %edi, PT_EDI(%edx)
958 movl %ebp, EBP(%edx) 994 movl %ebp, PT_EBP(%edx)
959 movl %ebx, EAX(%edx) 995 movl %ebx, PT_EAX(%edx)
960 movl $__USER_DS, DS(%edx) 996 movl $__USER_DS, PT_DS(%edx)
961 movl $__USER_DS, ES(%edx) 997 movl $__USER_DS, PT_ES(%edx)
962 movl %ebx, ORIG_EAX(%edx) 998 movl $0, PT_GS(%edx)
963 movl %ecx, EIP(%edx) 999 movl %ebx, PT_ORIG_EAX(%edx)
1000 movl %ecx, PT_EIP(%edx)
964 movl 12(%esp), %ecx 1001 movl 12(%esp), %ecx
965 movl $__KERNEL_CS, CS(%edx) 1002 movl $__KERNEL_CS, PT_CS(%edx)
966 movl %ebx, EFLAGS(%edx) 1003 movl %ebx, PT_EFLAGS(%edx)
967 movl %eax, OLDESP(%edx) 1004 movl %eax, PT_OLDESP(%edx)
968 movl 8(%esp), %eax 1005 movl 8(%esp), %eax
969 movl %ecx, 8(%esp) 1006 movl %ecx, 8(%esp)
970 movl EBX(%edx), %ebx 1007 movl PT_EBX(%edx), %ebx
971 movl $__KERNEL_DS, OLDSS(%edx) 1008 movl $__KERNEL_DS, PT_OLDSS(%edx)
972 jmpl *%eax 1009 jmpl *%eax
973 CFI_ENDPROC 1010 CFI_ENDPROC
974ENDPROC(arch_unwind_init_running) 1011ENDPROC(arch_unwind_init_running)
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index ca31f18d277c..edef5084ce17 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -55,6 +55,12 @@
55 */ 55 */
56ENTRY(startup_32) 56ENTRY(startup_32)
57 57
58#ifdef CONFIG_PARAVIRT
59 movl %cs, %eax
60 testl $0x3, %eax
61 jnz startup_paravirt
62#endif
63
58/* 64/*
59 * Set segments to known values. 65 * Set segments to known values.
60 */ 66 */
@@ -302,6 +308,7 @@ is386: movl $2,%ecx # set MP
302 movl %eax,%cr0 308 movl %eax,%cr0
303 309
304 call check_x87 310 call check_x87
311 call setup_pda
305 lgdt cpu_gdt_descr 312 lgdt cpu_gdt_descr
306 lidt idt_descr 313 lidt idt_descr
307 ljmp $(__KERNEL_CS),$1f 314 ljmp $(__KERNEL_CS),$1f
@@ -312,10 +319,13 @@ is386: movl $2,%ecx # set MP
312 movl %eax,%ds 319 movl %eax,%ds
313 movl %eax,%es 320 movl %eax,%es
314 321
315 xorl %eax,%eax # Clear FS/GS and LDT 322 xorl %eax,%eax # Clear FS and LDT
316 movl %eax,%fs 323 movl %eax,%fs
317 movl %eax,%gs
318 lldt %ax 324 lldt %ax
325
326 movl $(__KERNEL_PDA),%eax
327 mov %eax,%gs
328
319 cld # gcc2 wants the direction flag cleared at all times 329 cld # gcc2 wants the direction flag cleared at all times
320 pushl $0 # fake return address for unwinder 330 pushl $0 # fake return address for unwinder
321#ifdef CONFIG_SMP 331#ifdef CONFIG_SMP
@@ -346,6 +356,23 @@ check_x87:
346 ret 356 ret
347 357
348/* 358/*
359 * Point the GDT at this CPU's PDA. On boot this will be
360 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
361 * that CPU's GDT and PDA.
362 */
363setup_pda:
364 /* get the PDA pointer */
365 movl start_pda, %eax
366
367 /* slot the PDA address into the GDT */
368 mov cpu_gdt_descr+2, %ecx
369 mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
370 shr $16, %eax
371 mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
372 mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
373 ret
374
375/*
349 * setup_idt 376 * setup_idt
350 * 377 *
351 * sets up a idt with 256 entries pointing to 378 * sets up a idt with 256 entries pointing to
@@ -465,6 +492,33 @@ ignore_int:
465#endif 492#endif
466 iret 493 iret
467 494
495#ifdef CONFIG_PARAVIRT
496startup_paravirt:
497 cld
498 movl $(init_thread_union+THREAD_SIZE),%esp
499
500 /* We take pains to preserve all the regs. */
501 pushl %edx
502 pushl %ecx
503 pushl %eax
504
505 /* paravirt.o is last in link, and that probe fn never returns */
506 pushl $__start_paravirtprobe
5071:
508 movl 0(%esp), %eax
509 pushl (%eax)
510 movl 8(%esp), %eax
511 call *(%esp)
512 popl %eax
513
514 movl 4(%esp), %eax
515 movl 8(%esp), %ecx
516 movl 12(%esp), %edx
517
518 addl $4, (%esp)
519 jmp 1b
520#endif
521
468/* 522/*
469 * Real beginning of normal "text" segment 523 * Real beginning of normal "text" segment
470 */ 524 */
@@ -484,6 +538,8 @@ ENTRY(empty_zero_page)
484 * This starts the data section. 538 * This starts the data section.
485 */ 539 */
486.data 540.data
541ENTRY(start_pda)
542 .long boot_pda
487 543
488ENTRY(stack_start) 544ENTRY(stack_start)
489 .long init_thread_union+THREAD_SIZE 545 .long init_thread_union+THREAD_SIZE
@@ -525,7 +581,7 @@ idt_descr:
525 581
526# boot GDT descriptor (later on used by CPU#0): 582# boot GDT descriptor (later on used by CPU#0):
527 .word 0 # 32 bit align gdt_desc.address 583 .word 0 # 32 bit align gdt_desc.address
528cpu_gdt_descr: 584ENTRY(cpu_gdt_descr)
529 .word GDT_ENTRIES*8-1 585 .word GDT_ENTRIES*8-1
530 .long cpu_gdt_table 586 .long cpu_gdt_table
531 587
@@ -584,8 +640,8 @@ ENTRY(cpu_gdt_table)
584 .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */ 640 .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */
585 .quad 0x004092000000ffff /* 0xc8 APM DS data */ 641 .quad 0x004092000000ffff /* 0xc8 APM DS data */
586 642
587 .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */ 643 .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */
588 .quad 0x0000000000000000 /* 0xd8 - unused */ 644 .quad 0x00cf92000000ffff /* 0xd8 - PDA */
589 .quad 0x0000000000000000 /* 0xe0 - unused */ 645 .quad 0x0000000000000000 /* 0xe0 - unused */
590 .quad 0x0000000000000000 /* 0xe8 - unused */ 646 .quad 0x0000000000000000 /* 0xe8 - unused */
591 .quad 0x0000000000000000 /* 0xf0 - unused */ 647 .quad 0x0000000000000000 /* 0xf0 - unused */
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 17647a530b2f..45a8685bb60b 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -34,6 +34,7 @@ static int __init init_hpet_clocksource(void)
34 unsigned long hpet_period; 34 unsigned long hpet_period;
35 void __iomem* hpet_base; 35 void __iomem* hpet_base;
36 u64 tmp; 36 u64 tmp;
37 int err;
37 38
38 if (!is_hpet_enabled()) 39 if (!is_hpet_enabled())
39 return -ENODEV; 40 return -ENODEV;
@@ -61,7 +62,11 @@ static int __init init_hpet_clocksource(void)
61 do_div(tmp, FSEC_PER_NSEC); 62 do_div(tmp, FSEC_PER_NSEC);
62 clocksource_hpet.mult = (u32)tmp; 63 clocksource_hpet.mult = (u32)tmp;
63 64
64 return clocksource_register(&clocksource_hpet); 65 err = clocksource_register(&clocksource_hpet);
66 if (err)
67 iounmap(hpet_base);
68
69 return err;
65} 70}
66 71
67module_init(init_hpet_clocksource); 72module_init(init_hpet_clocksource);
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index 62996cd17084..c8d45821c788 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -381,7 +381,10 @@ void __init init_ISA_irqs (void)
381 } 381 }
382} 382}
383 383
384void __init init_IRQ(void) 384/* Overridden in paravirt.c */
385void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
386
387void __init native_init_IRQ(void)
385{ 388{
386 int i; 389 int i;
387 390
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 3b7a63e0ed1a..e21dcde0790e 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -34,6 +34,7 @@
34#include <linux/pci.h> 34#include <linux/pci.h>
35#include <linux/msi.h> 35#include <linux/msi.h>
36#include <linux/htirq.h> 36#include <linux/htirq.h>
37#include <linux/freezer.h>
37 38
38#include <asm/io.h> 39#include <asm/io.h>
39#include <asm/smp.h> 40#include <asm/smp.h>
@@ -153,14 +154,20 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
153 * the interrupt, and we need to make sure the entry is fully populated 154 * the interrupt, and we need to make sure the entry is fully populated
154 * before that happens. 155 * before that happens.
155 */ 156 */
156static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 157static void
158__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
157{ 159{
158 unsigned long flags;
159 union entry_union eu; 160 union entry_union eu;
160 eu.entry = e; 161 eu.entry = e;
161 spin_lock_irqsave(&ioapic_lock, flags);
162 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 162 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
163 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 163 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
164}
165
166static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
167{
168 unsigned long flags;
169 spin_lock_irqsave(&ioapic_lock, flags);
170 __ioapic_write_entry(apic, pin, e);
164 spin_unlock_irqrestore(&ioapic_lock, flags); 171 spin_unlock_irqrestore(&ioapic_lock, flags);
165} 172}
166 173
@@ -836,8 +843,7 @@ static int __init find_isa_irq_pin(int irq, int type)
836 843
837 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 844 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
838 mp_bus_id_to_type[lbus] == MP_BUS_EISA || 845 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
839 mp_bus_id_to_type[lbus] == MP_BUS_MCA || 846 mp_bus_id_to_type[lbus] == MP_BUS_MCA
840 mp_bus_id_to_type[lbus] == MP_BUS_NEC98
841 ) && 847 ) &&
842 (mp_irqs[i].mpc_irqtype == type) && 848 (mp_irqs[i].mpc_irqtype == type) &&
843 (mp_irqs[i].mpc_srcbusirq == irq)) 849 (mp_irqs[i].mpc_srcbusirq == irq))
@@ -856,8 +862,7 @@ static int __init find_isa_irq_apic(int irq, int type)
856 862
857 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 863 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
858 mp_bus_id_to_type[lbus] == MP_BUS_EISA || 864 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
859 mp_bus_id_to_type[lbus] == MP_BUS_MCA || 865 mp_bus_id_to_type[lbus] == MP_BUS_MCA
860 mp_bus_id_to_type[lbus] == MP_BUS_NEC98
861 ) && 866 ) &&
862 (mp_irqs[i].mpc_irqtype == type) && 867 (mp_irqs[i].mpc_irqtype == type) &&
863 (mp_irqs[i].mpc_srcbusirq == irq)) 868 (mp_irqs[i].mpc_srcbusirq == irq))
@@ -987,12 +992,6 @@ static int EISA_ELCR(unsigned int irq)
987#define default_MCA_trigger(idx) (1) 992#define default_MCA_trigger(idx) (1)
988#define default_MCA_polarity(idx) (0) 993#define default_MCA_polarity(idx) (0)
989 994
990/* NEC98 interrupts are always polarity zero edge triggered,
991 * when listed as conforming in the MP table. */
992
993#define default_NEC98_trigger(idx) (0)
994#define default_NEC98_polarity(idx) (0)
995
996static int __init MPBIOS_polarity(int idx) 995static int __init MPBIOS_polarity(int idx)
997{ 996{
998 int bus = mp_irqs[idx].mpc_srcbus; 997 int bus = mp_irqs[idx].mpc_srcbus;
@@ -1027,11 +1026,6 @@ static int __init MPBIOS_polarity(int idx)
1027 polarity = default_MCA_polarity(idx); 1026 polarity = default_MCA_polarity(idx);
1028 break; 1027 break;
1029 } 1028 }
1030 case MP_BUS_NEC98: /* NEC 98 pin */
1031 {
1032 polarity = default_NEC98_polarity(idx);
1033 break;
1034 }
1035 default: 1029 default:
1036 { 1030 {
1037 printk(KERN_WARNING "broken BIOS!!\n"); 1031 printk(KERN_WARNING "broken BIOS!!\n");
@@ -1101,11 +1095,6 @@ static int MPBIOS_trigger(int idx)
1101 trigger = default_MCA_trigger(idx); 1095 trigger = default_MCA_trigger(idx);
1102 break; 1096 break;
1103 } 1097 }
1104 case MP_BUS_NEC98: /* NEC 98 pin */
1105 {
1106 trigger = default_NEC98_trigger(idx);
1107 break;
1108 }
1109 default: 1098 default:
1110 { 1099 {
1111 printk(KERN_WARNING "broken BIOS!!\n"); 1100 printk(KERN_WARNING "broken BIOS!!\n");
@@ -1167,7 +1156,6 @@ static int pin_2_irq(int idx, int apic, int pin)
1167 case MP_BUS_ISA: /* ISA pin */ 1156 case MP_BUS_ISA: /* ISA pin */
1168 case MP_BUS_EISA: 1157 case MP_BUS_EISA:
1169 case MP_BUS_MCA: 1158 case MP_BUS_MCA:
1170 case MP_BUS_NEC98:
1171 { 1159 {
1172 irq = mp_irqs[idx].mpc_srcbusirq; 1160 irq = mp_irqs[idx].mpc_srcbusirq;
1173 break; 1161 break;
@@ -1235,7 +1223,7 @@ static inline int IO_APIC_irq_trigger(int irq)
1235} 1223}
1236 1224
1237/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ 1225/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
1238u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; 1226static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
1239 1227
1240static int __assign_irq_vector(int irq) 1228static int __assign_irq_vector(int irq)
1241{ 1229{
@@ -1360,8 +1348,8 @@ static void __init setup_IO_APIC_irqs(void)
1360 if (!apic && (irq < 16)) 1348 if (!apic && (irq < 16))
1361 disable_8259A_irq(irq); 1349 disable_8259A_irq(irq);
1362 } 1350 }
1363 ioapic_write_entry(apic, pin, entry);
1364 spin_lock_irqsave(&ioapic_lock, flags); 1351 spin_lock_irqsave(&ioapic_lock, flags);
1352 __ioapic_write_entry(apic, pin, entry);
1365 set_native_irq_info(irq, TARGET_CPUS); 1353 set_native_irq_info(irq, TARGET_CPUS);
1366 spin_unlock_irqrestore(&ioapic_lock, flags); 1354 spin_unlock_irqrestore(&ioapic_lock, flags);
1367 } 1355 }
@@ -1926,6 +1914,15 @@ static void __init setup_ioapic_ids_from_mpc(void)
1926static void __init setup_ioapic_ids_from_mpc(void) { } 1914static void __init setup_ioapic_ids_from_mpc(void) { }
1927#endif 1915#endif
1928 1916
1917static int no_timer_check __initdata;
1918
1919static int __init notimercheck(char *s)
1920{
1921 no_timer_check = 1;
1922 return 1;
1923}
1924__setup("no_timer_check", notimercheck);
1925
1929/* 1926/*
1930 * There is a nasty bug in some older SMP boards, their mptable lies 1927 * There is a nasty bug in some older SMP boards, their mptable lies
1931 * about the timer IRQ. We do the following to work around the situation: 1928 * about the timer IRQ. We do the following to work around the situation:
@@ -1934,10 +1931,13 @@ static void __init setup_ioapic_ids_from_mpc(void) { }
1934 * - if this function detects that timer IRQs are defunct, then we fall 1931 * - if this function detects that timer IRQs are defunct, then we fall
1935 * back to ISA timer IRQs 1932 * back to ISA timer IRQs
1936 */ 1933 */
1937static int __init timer_irq_works(void) 1934int __init timer_irq_works(void)
1938{ 1935{
1939 unsigned long t1 = jiffies; 1936 unsigned long t1 = jiffies;
1940 1937
1938 if (no_timer_check)
1939 return 1;
1940
1941 local_irq_enable(); 1941 local_irq_enable();
1942 /* Let ten ticks pass... */ 1942 /* Let ten ticks pass... */
1943 mdelay((10 * 1000) / HZ); 1943 mdelay((10 * 1000) / HZ);
@@ -2161,9 +2161,15 @@ static inline void unlock_ExtINT_logic(void)
2161 unsigned char save_control, save_freq_select; 2161 unsigned char save_control, save_freq_select;
2162 2162
2163 pin = find_isa_irq_pin(8, mp_INT); 2163 pin = find_isa_irq_pin(8, mp_INT);
2164 if (pin == -1) {
2165 WARN_ON_ONCE(1);
2166 return;
2167 }
2164 apic = find_isa_irq_apic(8, mp_INT); 2168 apic = find_isa_irq_apic(8, mp_INT);
2165 if (pin == -1) 2169 if (apic == -1) {
2170 WARN_ON_ONCE(1);
2166 return; 2171 return;
2172 }
2167 2173
2168 entry0 = ioapic_read_entry(apic, pin); 2174 entry0 = ioapic_read_entry(apic, pin);
2169 clear_IO_APIC_pin(apic, pin); 2175 clear_IO_APIC_pin(apic, pin);
@@ -2208,7 +2214,7 @@ int timer_uses_ioapic_pin_0;
2208 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast 2214 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2209 * fanatically on his truly buggy board. 2215 * fanatically on his truly buggy board.
2210 */ 2216 */
2211static inline void check_timer(void) 2217static inline void __init check_timer(void)
2212{ 2218{
2213 int apic1, pin1, apic2, pin2; 2219 int apic1, pin1, apic2, pin2;
2214 int vector; 2220 int vector;
@@ -2856,8 +2862,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
2856 if (!ioapic && (irq < 16)) 2862 if (!ioapic && (irq < 16))
2857 disable_8259A_irq(irq); 2863 disable_8259A_irq(irq);
2858 2864
2859 ioapic_write_entry(ioapic, pin, entry);
2860 spin_lock_irqsave(&ioapic_lock, flags); 2865 spin_lock_irqsave(&ioapic_lock, flags);
2866 __ioapic_write_entry(ioapic, pin, entry);
2861 set_native_irq_info(irq, TARGET_CPUS); 2867 set_native_irq_info(irq, TARGET_CPUS);
2862 spin_unlock_irqrestore(&ioapic_lock, flags); 2868 spin_unlock_irqrestore(&ioapic_lock, flags);
2863 2869
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index fc79e1e859c4..af1d53344993 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -184,7 +184,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
184void __kprobes arch_remove_kprobe(struct kprobe *p) 184void __kprobes arch_remove_kprobe(struct kprobe *p)
185{ 185{
186 mutex_lock(&kprobe_mutex); 186 mutex_lock(&kprobe_mutex);
187 free_insn_slot(p->ainsn.insn); 187 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
188 mutex_unlock(&kprobe_mutex); 188 mutex_unlock(&kprobe_mutex);
189} 189}
190 190
@@ -333,7 +333,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
333 return 1; 333 return 1;
334 334
335ss_probe: 335ss_probe:
336#ifndef CONFIG_PREEMPT 336#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
337 if (p->ainsn.boostable == 1 && !p->post_handler){ 337 if (p->ainsn.boostable == 1 && !p->post_handler){
338 /* Boost up -- we can execute copied instructions directly */ 338 /* Boost up -- we can execute copied instructions directly */
339 reset_current_kprobe(); 339 reset_current_kprobe();
diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c
index 445211eb2d57..b410e5fb034f 100644
--- a/arch/i386/kernel/ldt.c
+++ b/arch/i386/kernel/ldt.c
@@ -160,16 +160,14 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount)
160{ 160{
161 int err; 161 int err;
162 unsigned long size; 162 unsigned long size;
163 void *address;
164 163
165 err = 0; 164 err = 0;
166 address = &default_ldt[0];
167 size = 5*sizeof(struct desc_struct); 165 size = 5*sizeof(struct desc_struct);
168 if (size > bytecount) 166 if (size > bytecount)
169 size = bytecount; 167 size = bytecount;
170 168
171 err = size; 169 err = size;
172 if (copy_to_user(ptr, address, size)) 170 if (clear_user(ptr, size))
173 err = -EFAULT; 171 err = -EFAULT;
174 172
175 return err; 173 return err;
diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c
index eb57a851789d..b83672b89527 100644
--- a/arch/i386/kernel/mca.c
+++ b/arch/i386/kernel/mca.c
@@ -283,10 +283,9 @@ static int __init mca_init(void)
283 bus->f.mca_transform_memory = mca_dummy_transform_memory; 283 bus->f.mca_transform_memory = mca_dummy_transform_memory;
284 284
285 /* get the motherboard device */ 285 /* get the motherboard device */
286 mca_dev = kmalloc(sizeof(struct mca_device), GFP_KERNEL); 286 mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
287 if(unlikely(!mca_dev)) 287 if(unlikely(!mca_dev))
288 goto out_nomem; 288 goto out_nomem;
289 memset(mca_dev, 0, sizeof(struct mca_device));
290 289
291 /* 290 /*
292 * We do not expect many MCA interrupts during initialization, 291 * We do not expect many MCA interrupts during initialization,
@@ -310,11 +309,9 @@ static int __init mca_init(void)
310 mca_dev->slot = MCA_MOTHERBOARD; 309 mca_dev->slot = MCA_MOTHERBOARD;
311 mca_register_device(MCA_PRIMARY_BUS, mca_dev); 310 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
312 311
313 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 312 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
314 if(unlikely(!mca_dev)) 313 if(unlikely(!mca_dev))
315 goto out_unlock_nomem; 314 goto out_unlock_nomem;
316 memset(mca_dev, 0, sizeof(struct mca_device));
317
318 315
319 /* Put motherboard into video setup mode, read integrated video 316 /* Put motherboard into video setup mode, read integrated video
320 * POS registers, and turn motherboard setup off. 317 * POS registers, and turn motherboard setup off.
@@ -349,10 +346,9 @@ static int __init mca_init(void)
349 } 346 }
350 if(which_scsi) { 347 if(which_scsi) {
351 /* found a scsi card */ 348 /* found a scsi card */
352 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 349 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
353 if(unlikely(!mca_dev)) 350 if(unlikely(!mca_dev))
354 goto out_unlock_nomem; 351 goto out_unlock_nomem;
355 memset(mca_dev, 0, sizeof(struct mca_device));
356 352
357 for(j = 0; j < 8; j++) 353 for(j = 0; j < 8; j++)
358 mca_dev->pos[j] = pos[j]; 354 mca_dev->pos[j] = pos[j];
@@ -378,10 +374,9 @@ static int __init mca_init(void)
378 if(!mca_read_and_store_pos(pos)) 374 if(!mca_read_and_store_pos(pos))
379 continue; 375 continue;
380 376
381 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 377 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
382 if(unlikely(!mca_dev)) 378 if(unlikely(!mca_dev))
383 goto out_unlock_nomem; 379 goto out_unlock_nomem;
384 memset(mca_dev, 0, sizeof(struct mca_device));
385 380
386 for(j=0; j<8; j++) 381 for(j=0; j<8; j++)
387 mca_dev->pos[j]=pos[j]; 382 mca_dev->pos[j]=pos[j];
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index 23f5984d0654..972346604f9d 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -703,7 +703,6 @@ static struct sysdev_driver mc_sysdev_driver = {
703 .resume = mc_sysdev_resume, 703 .resume = mc_sysdev_resume,
704}; 704};
705 705
706#ifdef CONFIG_HOTPLUG_CPU
707static __cpuinit int 706static __cpuinit int
708mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) 707mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
709{ 708{
@@ -726,7 +725,6 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
726static struct notifier_block mc_cpu_notifier = { 725static struct notifier_block mc_cpu_notifier = {
727 .notifier_call = mc_cpu_callback, 726 .notifier_call = mc_cpu_callback,
728}; 727};
729#endif
730 728
731static int __init microcode_init (void) 729static int __init microcode_init (void)
732{ 730{
diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c
index 470cf97e7cd3..d7d9c8b23f72 100644
--- a/arch/i386/kernel/module.c
+++ b/arch/i386/kernel/module.c
@@ -108,7 +108,8 @@ int module_finalize(const Elf_Ehdr *hdr,
108 const Elf_Shdr *sechdrs, 108 const Elf_Shdr *sechdrs,
109 struct module *me) 109 struct module *me)
110{ 110{
111 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 111 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
112 *para = NULL;
112 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 113 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
113 114
114 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 115 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -118,6 +119,8 @@ int module_finalize(const Elf_Ehdr *hdr,
118 alt = s; 119 alt = s;
119 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 120 if (!strcmp(".smp_locks", secstrings + s->sh_name))
120 locks= s; 121 locks= s;
122 if (!strcmp(".parainstructions", secstrings + s->sh_name))
123 para = s;
121 } 124 }
122 125
123 if (alt) { 126 if (alt) {
@@ -132,6 +135,12 @@ int module_finalize(const Elf_Ehdr *hdr,
132 lseg, lseg + locks->sh_size, 135 lseg, lseg + locks->sh_size,
133 tseg, tseg + text->sh_size); 136 tseg, tseg + text->sh_size);
134 } 137 }
138
139 if (para) {
140 void *pseg = (void *)para->sh_addr;
141 apply_paravirt(pseg, pseg + para->sh_size);
142 }
143
135 return 0; 144 return 0;
136} 145}
137 146
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 442aaf8c77eb..2ce67228dff8 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -249,8 +249,6 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
249 mp_current_pci_id++; 249 mp_current_pci_id++;
250 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { 250 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
251 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; 251 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
252 } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
253 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
254 } else { 252 } else {
255 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 253 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
256 } 254 }
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index a773f776c9ea..1d1a56cae340 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -195,7 +195,6 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
195{ 195{
196 const u32 __user *tmp = (const u32 __user *)buf; 196 const u32 __user *tmp = (const u32 __user *)buf;
197 u32 data[2]; 197 u32 data[2];
198 size_t rv;
199 u32 reg = *ppos; 198 u32 reg = *ppos;
200 int cpu = iminor(file->f_dentry->d_inode); 199 int cpu = iminor(file->f_dentry->d_inode);
201 int err; 200 int err;
@@ -203,7 +202,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
203 if (count % 8) 202 if (count % 8)
204 return -EINVAL; /* Invalid chunk size */ 203 return -EINVAL; /* Invalid chunk size */
205 204
206 for (rv = 0; count; count -= 8) { 205 for (; count; count -= 8) {
207 if (copy_from_user(&data, tmp, 8)) 206 if (copy_from_user(&data, tmp, 8))
208 return -EFAULT; 207 return -EFAULT;
209 err = do_wrmsr(cpu, reg, data[0], data[1]); 208 err = do_wrmsr(cpu, reg, data[0], data[1]);
@@ -250,7 +249,6 @@ static int msr_device_create(int i)
250 return err; 249 return err;
251} 250}
252 251
253#ifdef CONFIG_HOTPLUG_CPU
254static int msr_class_cpu_callback(struct notifier_block *nfb, 252static int msr_class_cpu_callback(struct notifier_block *nfb,
255 unsigned long action, void *hcpu) 253 unsigned long action, void *hcpu)
256{ 254{
@@ -271,7 +269,6 @@ static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
271{ 269{
272 .notifier_call = msr_class_cpu_callback, 270 .notifier_call = msr_class_cpu_callback,
273}; 271};
274#endif
275 272
276static int __init msr_init(void) 273static int __init msr_init(void)
277{ 274{
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index eaafe233a5da..f5bc7e1be801 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -22,6 +22,7 @@
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/dmi.h> 23#include <linux/dmi.h>
24#include <linux/kprobes.h> 24#include <linux/kprobes.h>
25#include <linux/cpumask.h>
25 26
26#include <asm/smp.h> 27#include <asm/smp.h>
27#include <asm/nmi.h> 28#include <asm/nmi.h>
@@ -42,6 +43,8 @@ int nmi_watchdog_enabled;
42static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner); 43static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
43static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]); 44static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
44 45
46static cpumask_t backtrace_mask = CPU_MASK_NONE;
47
45/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's 48/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
46 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) 49 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
47 */ 50 */
@@ -867,14 +870,16 @@ static unsigned int
867 870
868void touch_nmi_watchdog (void) 871void touch_nmi_watchdog (void)
869{ 872{
870 int i; 873 if (nmi_watchdog > 0) {
874 unsigned cpu;
871 875
872 /* 876 /*
873 * Just reset the alert counters, (other CPUs might be 877 * Just reset the alert counters, (other CPUs might be
874 * spinning on locks we hold): 878 * spinning on locks we hold):
875 */ 879 */
876 for_each_possible_cpu(i) 880 for_each_present_cpu (cpu)
877 alert_counter[i] = 0; 881 alert_counter[cpu] = 0;
882 }
878 883
879 /* 884 /*
880 * Tickle the softlockup detector too: 885 * Tickle the softlockup detector too:
@@ -907,6 +912,16 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
907 touched = 1; 912 touched = 1;
908 } 913 }
909 914
915 if (cpu_isset(cpu, backtrace_mask)) {
916 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
917
918 spin_lock(&lock);
919 printk("NMI backtrace for cpu %d\n", cpu);
920 dump_stack();
921 spin_unlock(&lock);
922 cpu_clear(cpu, backtrace_mask);
923 }
924
910 sum = per_cpu(irq_stat, cpu).apic_timer_irqs; 925 sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
911 926
912 /* if the apic timer isn't firing, this cpu isn't doing much */ 927 /* if the apic timer isn't firing, this cpu isn't doing much */
@@ -1033,6 +1048,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
1033 1048
1034#endif 1049#endif
1035 1050
1051void __trigger_all_cpu_backtrace(void)
1052{
1053 int i;
1054
1055 backtrace_mask = cpu_online_map;
1056 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
1057 for (i = 0; i < 10 * 1000; i++) {
1058 if (cpus_empty(backtrace_mask))
1059 break;
1060 mdelay(1);
1061 }
1062}
1063
1036EXPORT_SYMBOL(nmi_active); 1064EXPORT_SYMBOL(nmi_active);
1037EXPORT_SYMBOL(nmi_watchdog); 1065EXPORT_SYMBOL(nmi_watchdog);
1038EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); 1066EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
new file mode 100644
index 000000000000..3dceab5828f1
--- /dev/null
+++ b/arch/i386/kernel/paravirt.c
@@ -0,0 +1,569 @@
1/* Paravirtualization interfaces
2 Copyright (C) 2006 Rusty Russell IBM Corporation
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18#include <linux/errno.h>
19#include <linux/module.h>
20#include <linux/efi.h>
21#include <linux/bcd.h>
22#include <linux/start_kernel.h>
23
24#include <asm/bug.h>
25#include <asm/paravirt.h>
26#include <asm/desc.h>
27#include <asm/setup.h>
28#include <asm/arch_hooks.h>
29#include <asm/time.h>
30#include <asm/irq.h>
31#include <asm/delay.h>
32#include <asm/fixmap.h>
33#include <asm/apic.h>
34#include <asm/tlbflush.h>
35
36/* nop stub */
37static void native_nop(void)
38{
39}
40
41static void __init default_banner(void)
42{
43 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
44 paravirt_ops.name);
45}
46
47char *memory_setup(void)
48{
49 return paravirt_ops.memory_setup();
50}
51
52/* Simple instruction patching code. */
53#define DEF_NATIVE(name, code) \
54 extern const char start_##name[], end_##name[]; \
55 asm("start_" #name ": " code "; end_" #name ":")
56DEF_NATIVE(cli, "cli");
57DEF_NATIVE(sti, "sti");
58DEF_NATIVE(popf, "push %eax; popf");
59DEF_NATIVE(pushf, "pushf; pop %eax");
60DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli");
61DEF_NATIVE(iret, "iret");
62DEF_NATIVE(sti_sysexit, "sti; sysexit");
63
64static const struct native_insns
65{
66 const char *start, *end;
67} native_insns[] = {
68 [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
69 [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
70 [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
71 [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
72 [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
73 [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
74 [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
75};
76
77static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
78{
79 unsigned int insn_len;
80
81 /* Don't touch it if we don't have a replacement */
82 if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
83 return len;
84
85 insn_len = native_insns[type].end - native_insns[type].start;
86
87 /* Similarly if we can't fit replacement. */
88 if (len < insn_len)
89 return len;
90
91 memcpy(insns, native_insns[type].start, insn_len);
92 return insn_len;
93}
94
95static fastcall unsigned long native_get_debugreg(int regno)
96{
97 unsigned long val = 0; /* Damn you, gcc! */
98
99 switch (regno) {
100 case 0:
101 asm("movl %%db0, %0" :"=r" (val)); break;
102 case 1:
103 asm("movl %%db1, %0" :"=r" (val)); break;
104 case 2:
105 asm("movl %%db2, %0" :"=r" (val)); break;
106 case 3:
107 asm("movl %%db3, %0" :"=r" (val)); break;
108 case 6:
109 asm("movl %%db6, %0" :"=r" (val)); break;
110 case 7:
111 asm("movl %%db7, %0" :"=r" (val)); break;
112 default:
113 BUG();
114 }
115 return val;
116}
117
118static fastcall void native_set_debugreg(int regno, unsigned long value)
119{
120 switch (regno) {
121 case 0:
122 asm("movl %0,%%db0" : /* no output */ :"r" (value));
123 break;
124 case 1:
125 asm("movl %0,%%db1" : /* no output */ :"r" (value));
126 break;
127 case 2:
128 asm("movl %0,%%db2" : /* no output */ :"r" (value));
129 break;
130 case 3:
131 asm("movl %0,%%db3" : /* no output */ :"r" (value));
132 break;
133 case 6:
134 asm("movl %0,%%db6" : /* no output */ :"r" (value));
135 break;
136 case 7:
137 asm("movl %0,%%db7" : /* no output */ :"r" (value));
138 break;
139 default:
140 BUG();
141 }
142}
143
144void init_IRQ(void)
145{
146 paravirt_ops.init_IRQ();
147}
148
149static fastcall void native_clts(void)
150{
151 asm volatile ("clts");
152}
153
154static fastcall unsigned long native_read_cr0(void)
155{
156 unsigned long val;
157 asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
158 return val;
159}
160
161static fastcall void native_write_cr0(unsigned long val)
162{
163 asm volatile("movl %0,%%cr0": :"r" (val));
164}
165
166static fastcall unsigned long native_read_cr2(void)
167{
168 unsigned long val;
169 asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
170 return val;
171}
172
173static fastcall void native_write_cr2(unsigned long val)
174{
175 asm volatile("movl %0,%%cr2": :"r" (val));
176}
177
178static fastcall unsigned long native_read_cr3(void)
179{
180 unsigned long val;
181 asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
182 return val;
183}
184
185static fastcall void native_write_cr3(unsigned long val)
186{
187 asm volatile("movl %0,%%cr3": :"r" (val));
188}
189
190static fastcall unsigned long native_read_cr4(void)
191{
192 unsigned long val;
193 asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
194 return val;
195}
196
197static fastcall unsigned long native_read_cr4_safe(void)
198{
199 unsigned long val;
200 /* This could fault if %cr4 does not exist */
201 asm("1: movl %%cr4, %0 \n"
202 "2: \n"
203 ".section __ex_table,\"a\" \n"
204 ".long 1b,2b \n"
205 ".previous \n"
206 : "=r" (val): "0" (0));
207 return val;
208}
209
210static fastcall void native_write_cr4(unsigned long val)
211{
212 asm volatile("movl %0,%%cr4": :"r" (val));
213}
214
215static fastcall unsigned long native_save_fl(void)
216{
217 unsigned long f;
218 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
219 return f;
220}
221
222static fastcall void native_restore_fl(unsigned long f)
223{
224 asm volatile("pushl %0 ; popfl": /* no output */
225 :"g" (f)
226 :"memory", "cc");
227}
228
229static fastcall void native_irq_disable(void)
230{
231 asm volatile("cli": : :"memory");
232}
233
234static fastcall void native_irq_enable(void)
235{
236 asm volatile("sti": : :"memory");
237}
238
239static fastcall void native_safe_halt(void)
240{
241 asm volatile("sti; hlt": : :"memory");
242}
243
244static fastcall void native_halt(void)
245{
246 asm volatile("hlt": : :"memory");
247}
248
249static fastcall void native_wbinvd(void)
250{
251 asm volatile("wbinvd": : :"memory");
252}
253
254static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
255{
256 unsigned long long val;
257
258 asm volatile("2: rdmsr ; xorl %0,%0\n"
259 "1:\n\t"
260 ".section .fixup,\"ax\"\n\t"
261 "3: movl %3,%0 ; jmp 1b\n\t"
262 ".previous\n\t"
263 ".section __ex_table,\"a\"\n"
264 " .align 4\n\t"
265 " .long 2b,3b\n\t"
266 ".previous"
267 : "=r" (*err), "=A" (val)
268 : "c" (msr), "i" (-EFAULT));
269
270 return val;
271}
272
273static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
274{
275 int err;
276 asm volatile("2: wrmsr ; xorl %0,%0\n"
277 "1:\n\t"
278 ".section .fixup,\"ax\"\n\t"
279 "3: movl %4,%0 ; jmp 1b\n\t"
280 ".previous\n\t"
281 ".section __ex_table,\"a\"\n"
282 " .align 4\n\t"
283 " .long 2b,3b\n\t"
284 ".previous"
285 : "=a" (err)
286 : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
287 "i" (-EFAULT));
288 return err;
289}
290
291static fastcall unsigned long long native_read_tsc(void)
292{
293 unsigned long long val;
294 asm volatile("rdtsc" : "=A" (val));
295 return val;
296}
297
298static fastcall unsigned long long native_read_pmc(void)
299{
300 unsigned long long val;
301 asm volatile("rdpmc" : "=A" (val));
302 return val;
303}
304
305static fastcall void native_load_tr_desc(void)
306{
307 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
308}
309
310static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
311{
312 asm volatile("lgdt %0"::"m" (*dtr));
313}
314
315static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr)
316{
317 asm volatile("lidt %0"::"m" (*dtr));
318}
319
320static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
321{
322 asm ("sgdt %0":"=m" (*dtr));
323}
324
325static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
326{
327 asm ("sidt %0":"=m" (*dtr));
328}
329
330static fastcall unsigned long native_store_tr(void)
331{
332 unsigned long tr;
333 asm ("str %0":"=r" (tr));
334 return tr;
335}
336
337static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
338{
339#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
340 C(0); C(1); C(2);
341#undef C
342}
343
344static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
345{
346 u32 *lp = (u32 *)((char *)dt + entry*8);
347 lp[0] = entry_low;
348 lp[1] = entry_high;
349}
350
351static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
352{
353 native_write_dt_entry(dt, entrynum, low, high);
354}
355
356static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
357{
358 native_write_dt_entry(dt, entrynum, low, high);
359}
360
361static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
362{
363 native_write_dt_entry(dt, entrynum, low, high);
364}
365
366static fastcall void native_load_esp0(struct tss_struct *tss,
367 struct thread_struct *thread)
368{
369 tss->esp0 = thread->esp0;
370
371 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
372 if (unlikely(tss->ss1 != thread->sysenter_cs)) {
373 tss->ss1 = thread->sysenter_cs;
374 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
375 }
376}
377
378static fastcall void native_io_delay(void)
379{
380 asm volatile("outb %al,$0x80");
381}
382
383static fastcall void native_flush_tlb(void)
384{
385 __native_flush_tlb();
386}
387
388/*
389 * Global pages have to be flushed a bit differently. Not a real
390 * performance problem because this does not happen often.
391 */
392static fastcall void native_flush_tlb_global(void)
393{
394 __native_flush_tlb_global();
395}
396
397static fastcall void native_flush_tlb_single(u32 addr)
398{
399 __native_flush_tlb_single(addr);
400}
401
402#ifndef CONFIG_X86_PAE
403static fastcall void native_set_pte(pte_t *ptep, pte_t pteval)
404{
405 *ptep = pteval;
406}
407
408static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
409{
410 *ptep = pteval;
411}
412
413static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
414{
415 *pmdp = pmdval;
416}
417
418#else /* CONFIG_X86_PAE */
419
420static fastcall void native_set_pte(pte_t *ptep, pte_t pte)
421{
422 ptep->pte_high = pte.pte_high;
423 smp_wmb();
424 ptep->pte_low = pte.pte_low;
425}
426
427static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
428{
429 ptep->pte_high = pte.pte_high;
430 smp_wmb();
431 ptep->pte_low = pte.pte_low;
432}
433
434static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
435{
436 ptep->pte_low = 0;
437 smp_wmb();
438 ptep->pte_high = pte.pte_high;
439 smp_wmb();
440 ptep->pte_low = pte.pte_low;
441}
442
443static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
444{
445 set_64bit((unsigned long long *)ptep,pte_val(pteval));
446}
447
448static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
449{
450 set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
451}
452
453static fastcall void native_set_pud(pud_t *pudp, pud_t pudval)
454{
455 *pudp = pudval;
456}
457
458static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
459{
460 ptep->pte_low = 0;
461 smp_wmb();
462 ptep->pte_high = 0;
463}
464
465static fastcall void native_pmd_clear(pmd_t *pmd)
466{
467 u32 *tmp = (u32 *)pmd;
468 *tmp = 0;
469 smp_wmb();
470 *(tmp + 1) = 0;
471}
472#endif /* CONFIG_X86_PAE */
473
474/* These are in entry.S */
475extern fastcall void native_iret(void);
476extern fastcall void native_irq_enable_sysexit(void);
477
478static int __init print_banner(void)
479{
480 paravirt_ops.banner();
481 return 0;
482}
483core_initcall(print_banner);
484
485/* We simply declare start_kernel to be the paravirt probe of last resort. */
486paravirt_probe(start_kernel);
487
488struct paravirt_ops paravirt_ops = {
489 .name = "bare hardware",
490 .paravirt_enabled = 0,
491 .kernel_rpl = 0,
492
493 .patch = native_patch,
494 .banner = default_banner,
495 .arch_setup = native_nop,
496 .memory_setup = machine_specific_memory_setup,
497 .get_wallclock = native_get_wallclock,
498 .set_wallclock = native_set_wallclock,
499 .time_init = time_init_hook,
500 .init_IRQ = native_init_IRQ,
501
502 .cpuid = native_cpuid,
503 .get_debugreg = native_get_debugreg,
504 .set_debugreg = native_set_debugreg,
505 .clts = native_clts,
506 .read_cr0 = native_read_cr0,
507 .write_cr0 = native_write_cr0,
508 .read_cr2 = native_read_cr2,
509 .write_cr2 = native_write_cr2,
510 .read_cr3 = native_read_cr3,
511 .write_cr3 = native_write_cr3,
512 .read_cr4 = native_read_cr4,
513 .read_cr4_safe = native_read_cr4_safe,
514 .write_cr4 = native_write_cr4,
515 .save_fl = native_save_fl,
516 .restore_fl = native_restore_fl,
517 .irq_disable = native_irq_disable,
518 .irq_enable = native_irq_enable,
519 .safe_halt = native_safe_halt,
520 .halt = native_halt,
521 .wbinvd = native_wbinvd,
522 .read_msr = native_read_msr,
523 .write_msr = native_write_msr,
524 .read_tsc = native_read_tsc,
525 .read_pmc = native_read_pmc,
526 .load_tr_desc = native_load_tr_desc,
527 .set_ldt = native_set_ldt,
528 .load_gdt = native_load_gdt,
529 .load_idt = native_load_idt,
530 .store_gdt = native_store_gdt,
531 .store_idt = native_store_idt,
532 .store_tr = native_store_tr,
533 .load_tls = native_load_tls,
534 .write_ldt_entry = native_write_ldt_entry,
535 .write_gdt_entry = native_write_gdt_entry,
536 .write_idt_entry = native_write_idt_entry,
537 .load_esp0 = native_load_esp0,
538
539 .set_iopl_mask = native_set_iopl_mask,
540 .io_delay = native_io_delay,
541 .const_udelay = __const_udelay,
542
543#ifdef CONFIG_X86_LOCAL_APIC
544 .apic_write = native_apic_write,
545 .apic_write_atomic = native_apic_write_atomic,
546 .apic_read = native_apic_read,
547#endif
548
549 .flush_tlb_user = native_flush_tlb,
550 .flush_tlb_kernel = native_flush_tlb_global,
551 .flush_tlb_single = native_flush_tlb_single,
552
553 .set_pte = native_set_pte,
554 .set_pte_at = native_set_pte_at,
555 .set_pmd = native_set_pmd,
556 .pte_update = (void *)native_nop,
557 .pte_update_defer = (void *)native_nop,
558#ifdef CONFIG_X86_PAE
559 .set_pte_atomic = native_set_pte_atomic,
560 .set_pte_present = native_set_pte_present,
561 .set_pud = native_set_pud,
562 .pte_clear = native_pte_clear,
563 .pmd_clear = native_pmd_clear,
564#endif
565
566 .irq_enable_sysexit = native_irq_enable_sysexit,
567 .iret = native_iret,
568};
569EXPORT_SYMBOL(paravirt_ops);
diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c
index 5c8c6ef1fc5e..41af692c1584 100644
--- a/arch/i386/kernel/pci-dma.c
+++ b/arch/i386/kernel/pci-dma.c
@@ -92,14 +92,12 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
92 if (!mem_base) 92 if (!mem_base)
93 goto out; 93 goto out;
94 94
95 dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); 95 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
96 if (!dev->dma_mem) 96 if (!dev->dma_mem)
97 goto out; 97 goto out;
98 memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); 98 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
99 dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
100 if (!dev->dma_mem->bitmap) 99 if (!dev->dma_mem->bitmap)
101 goto free1_out; 100 goto free1_out;
102 memset(dev->dma_mem->bitmap, 0, bitmap_size);
103 101
104 dev->dma_mem->virt_base = mem_base; 102 dev->dma_mem->virt_base = mem_base;
105 dev->dma_mem->device_base = device_addr; 103 dev->dma_mem->device_base = device_addr;
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index dd53c58f64f1..99308510a17c 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -56,6 +56,7 @@
56 56
57#include <asm/tlbflush.h> 57#include <asm/tlbflush.h>
58#include <asm/cpu.h> 58#include <asm/cpu.h>
59#include <asm/pda.h>
59 60
60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
61 62
@@ -99,22 +100,18 @@ EXPORT_SYMBOL(enable_hlt);
99 */ 100 */
100void default_idle(void) 101void default_idle(void)
101{ 102{
102 local_irq_enable();
103
104 if (!hlt_counter && boot_cpu_data.hlt_works_ok) { 103 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
105 current_thread_info()->status &= ~TS_POLLING; 104 current_thread_info()->status &= ~TS_POLLING;
106 smp_mb__after_clear_bit(); 105 smp_mb__after_clear_bit();
107 while (!need_resched()) { 106 local_irq_disable();
108 local_irq_disable(); 107 if (!need_resched())
109 if (!need_resched()) 108 safe_halt(); /* enables interrupts racelessly */
110 safe_halt(); 109 else
111 else 110 local_irq_enable();
112 local_irq_enable();
113 }
114 current_thread_info()->status |= TS_POLLING; 111 current_thread_info()->status |= TS_POLLING;
115 } else { 112 } else {
116 while (!need_resched()) 113 /* loop is done by the caller */
117 cpu_relax(); 114 cpu_relax();
118 } 115 }
119} 116}
120#ifdef CONFIG_APM_MODULE 117#ifdef CONFIG_APM_MODULE
@@ -128,14 +125,7 @@ EXPORT_SYMBOL(default_idle);
128 */ 125 */
129static void poll_idle (void) 126static void poll_idle (void)
130{ 127{
131 local_irq_enable(); 128 cpu_relax();
132
133 asm volatile(
134 "2:"
135 "testl %0, %1;"
136 "rep; nop;"
137 "je 2b;"
138 : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
139} 129}
140 130
141#ifdef CONFIG_HOTPLUG_CPU 131#ifdef CONFIG_HOTPLUG_CPU
@@ -256,8 +246,7 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
256static void mwait_idle(void) 246static void mwait_idle(void)
257{ 247{
258 local_irq_enable(); 248 local_irq_enable();
259 while (!need_resched()) 249 mwait_idle_with_hints(0, 0);
260 mwait_idle_with_hints(0, 0);
261} 250}
262 251
263void __devinit select_idle_routine(const struct cpuinfo_x86 *c) 252void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
@@ -314,8 +303,8 @@ void show_regs(struct pt_regs * regs)
314 regs->eax,regs->ebx,regs->ecx,regs->edx); 303 regs->eax,regs->ebx,regs->ecx,regs->edx);
315 printk("ESI: %08lx EDI: %08lx EBP: %08lx", 304 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
316 regs->esi, regs->edi, regs->ebp); 305 regs->esi, regs->edi, regs->ebp);
317 printk(" DS: %04x ES: %04x\n", 306 printk(" DS: %04x ES: %04x GS: %04x\n",
318 0xffff & regs->xds,0xffff & regs->xes); 307 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
319 308
320 cr0 = read_cr0(); 309 cr0 = read_cr0();
321 cr2 = read_cr2(); 310 cr2 = read_cr2();
@@ -346,6 +335,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
346 335
347 regs.xds = __USER_DS; 336 regs.xds = __USER_DS;
348 regs.xes = __USER_DS; 337 regs.xes = __USER_DS;
338 regs.xgs = __KERNEL_PDA;
349 regs.orig_eax = -1; 339 regs.orig_eax = -1;
350 regs.eip = (unsigned long) kernel_thread_helper; 340 regs.eip = (unsigned long) kernel_thread_helper;
351 regs.xcs = __KERNEL_CS | get_kernel_rpl(); 341 regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -431,7 +421,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
431 p->thread.eip = (unsigned long) ret_from_fork; 421 p->thread.eip = (unsigned long) ret_from_fork;
432 422
433 savesegment(fs,p->thread.fs); 423 savesegment(fs,p->thread.fs);
434 savesegment(gs,p->thread.gs);
435 424
436 tsk = current; 425 tsk = current;
437 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 426 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -508,7 +497,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
508 dump->regs.ds = regs->xds; 497 dump->regs.ds = regs->xds;
509 dump->regs.es = regs->xes; 498 dump->regs.es = regs->xes;
510 savesegment(fs,dump->regs.fs); 499 savesegment(fs,dump->regs.fs);
511 savesegment(gs,dump->regs.gs); 500 dump->regs.gs = regs->xgs;
512 dump->regs.orig_eax = regs->orig_eax; 501 dump->regs.orig_eax = regs->orig_eax;
513 dump->regs.eip = regs->eip; 502 dump->regs.eip = regs->eip;
514 dump->regs.cs = regs->xcs; 503 dump->regs.cs = regs->xcs;
@@ -648,22 +637,27 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
648 637
649 __unlazy_fpu(prev_p); 638 __unlazy_fpu(prev_p);
650 639
640
641 /* we're going to use this soon, after a few expensive things */
642 if (next_p->fpu_counter > 5)
643 prefetch(&next->i387.fxsave);
644
651 /* 645 /*
652 * Reload esp0. 646 * Reload esp0.
653 */ 647 */
654 load_esp0(tss, next); 648 load_esp0(tss, next);
655 649
656 /* 650 /*
657 * Save away %fs and %gs. No need to save %es and %ds, as 651 * Save away %fs. No need to save %gs, as it was saved on the
658 * those are always kernel segments while inside the kernel. 652 * stack on entry. No need to save %es and %ds, as those are
659 * Doing this before setting the new TLS descriptors avoids 653 * always kernel segments while inside the kernel. Doing this
660 * the situation where we temporarily have non-reloadable 654 * before setting the new TLS descriptors avoids the situation
661 * segments in %fs and %gs. This could be an issue if the 655 * where we temporarily have non-reloadable segments in %fs
662 * NMI handler ever used %fs or %gs (it does not today), or 656 * and %gs. This could be an issue if the NMI handler ever
663 * if the kernel is running inside of a hypervisor layer. 657 * used %fs or %gs (it does not today), or if the kernel is
658 * running inside of a hypervisor layer.
664 */ 659 */
665 savesegment(fs, prev->fs); 660 savesegment(fs, prev->fs);
666 savesegment(gs, prev->gs);
667 661
668 /* 662 /*
669 * Load the per-thread Thread-Local Storage descriptor. 663 * Load the per-thread Thread-Local Storage descriptor.
@@ -671,22 +665,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
671 load_TLS(next, cpu); 665 load_TLS(next, cpu);
672 666
673 /* 667 /*
674 * Restore %fs and %gs if needed. 668 * Restore %fs if needed.
675 * 669 *
676 * Glibc normally makes %fs be zero, and %gs is one of 670 * Glibc normally makes %fs be zero.
677 * the TLS segments.
678 */ 671 */
679 if (unlikely(prev->fs | next->fs)) 672 if (unlikely(prev->fs | next->fs))
680 loadsegment(fs, next->fs); 673 loadsegment(fs, next->fs);
681 674
682 if (prev->gs | next->gs) 675 write_pda(pcurrent, next_p);
683 loadsegment(gs, next->gs);
684
685 /*
686 * Restore IOPL if needed.
687 */
688 if (unlikely(prev->iopl != next->iopl))
689 set_iopl_mask(next->iopl);
690 676
691 /* 677 /*
692 * Now maybe handle debug registers and/or IO bitmaps 678 * Now maybe handle debug registers and/or IO bitmaps
@@ -697,6 +683,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
697 683
698 disable_tsc(prev_p, next_p); 684 disable_tsc(prev_p, next_p);
699 685
686 /* If the task has used fpu the last 5 timeslices, just do a full
687 * restore of the math state immediately to avoid the trap; the
688 * chances of needing FPU soon are obviously high now
689 */
690 if (next_p->fpu_counter > 5)
691 math_state_restore();
692
700 return prev_p; 693 return prev_p;
701} 694}
702 695
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 775f50e9395b..f3f94ac5736a 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -94,13 +94,9 @@ static int putreg(struct task_struct *child,
94 return -EIO; 94 return -EIO;
95 child->thread.fs = value; 95 child->thread.fs = value;
96 return 0; 96 return 0;
97 case GS:
98 if (value && (value & 3) != 3)
99 return -EIO;
100 child->thread.gs = value;
101 return 0;
102 case DS: 97 case DS:
103 case ES: 98 case ES:
99 case GS:
104 if (value && (value & 3) != 3) 100 if (value && (value & 3) != 3)
105 return -EIO; 101 return -EIO;
106 value &= 0xffff; 102 value &= 0xffff;
@@ -116,8 +112,8 @@ static int putreg(struct task_struct *child,
116 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; 112 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
117 break; 113 break;
118 } 114 }
119 if (regno > GS*4) 115 if (regno > ES*4)
120 regno -= 2*4; 116 regno -= 1*4;
121 put_stack_long(child, regno - sizeof(struct pt_regs), value); 117 put_stack_long(child, regno - sizeof(struct pt_regs), value);
122 return 0; 118 return 0;
123} 119}
@@ -131,18 +127,16 @@ static unsigned long getreg(struct task_struct *child,
131 case FS: 127 case FS:
132 retval = child->thread.fs; 128 retval = child->thread.fs;
133 break; 129 break;
134 case GS:
135 retval = child->thread.gs;
136 break;
137 case DS: 130 case DS:
138 case ES: 131 case ES:
132 case GS:
139 case SS: 133 case SS:
140 case CS: 134 case CS:
141 retval = 0xffff; 135 retval = 0xffff;
142 /* fall through */ 136 /* fall through */
143 default: 137 default:
144 if (regno > GS*4) 138 if (regno > ES*4)
145 regno -= 2*4; 139 regno -= 1*4;
146 regno = regno - sizeof(struct pt_regs); 140 regno = regno - sizeof(struct pt_regs);
147 retval &= get_stack_long(child, regno); 141 retval &= get_stack_long(child, regno);
148 } 142 }
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index 9f6ab1789bb0..a01320a7b636 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -3,10 +3,23 @@
3 */ 3 */
4#include <linux/pci.h> 4#include <linux/pci.h>
5#include <linux/irq.h> 5#include <linux/irq.h>
6#include <asm/pci-direct.h>
7#include <asm/genapic.h>
8#include <asm/cpu.h>
6 9
7#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) 10#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
11static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
12{
13#ifdef CONFIG_X86_64
14 if (genapic != &apic_flat)
15 panic("APIC mode must be flat on this system\n");
16#elif defined(CONFIG_X86_GENERICARCH)
17 if (genapic != &apic_default)
18 panic("APIC mode must be default(flat) on this system. Use apic=default\n");
19#endif
20}
8 21
9static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) 22void __init quirk_intel_irqbalance(void)
10{ 23{
11 u8 config, rev; 24 u8 config, rev;
12 u32 word; 25 u32 word;
@@ -16,18 +29,18 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
16 * based platforms. 29 * based platforms.
17 * Disable SW irqbalance/affinity on those platforms. 30 * Disable SW irqbalance/affinity on those platforms.
18 */ 31 */
19 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 32 rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
20 if (rev > 0x9) 33 if (rev > 0x9)
21 return; 34 return;
22 35
23 printk(KERN_INFO "Intel E7520/7320/7525 detected."); 36 printk(KERN_INFO "Intel E7520/7320/7525 detected.");
24 37
25 /* enable access to config space*/ 38 /* enable access to config space */
26 pci_read_config_byte(dev, 0xf4, &config); 39 config = read_pci_config_byte(0, 0, 0, 0xf4);
27 pci_write_config_byte(dev, 0xf4, config|0x2); 40 write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
28 41
29 /* read xTPR register */ 42 /* read xTPR register */
30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); 43 word = read_pci_config_16(0, 0, 0x40, 0x4c);
31 44
32 if (!(word & (1 << 13))) { 45 if (!(word & (1 << 13))) {
33 printk(KERN_INFO "Disabling irq balancing and affinity\n"); 46 printk(KERN_INFO "Disabling irq balancing and affinity\n");
@@ -38,13 +51,24 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
38#ifdef CONFIG_PROC_FS 51#ifdef CONFIG_PROC_FS
39 no_irq_affinity = 1; 52 no_irq_affinity = 1;
40#endif 53#endif
54#ifdef CONFIG_HOTPLUG_CPU
55 printk(KERN_INFO "Disabling cpu hotplug control\n");
56 enable_cpu_hotplug = 0;
57#endif
58#ifdef CONFIG_X86_64
59 /* force the genapic selection to flat mode so that
60 * interrupts can be redirected to more than one CPU.
61 */
62 genapic_force = &apic_flat;
63#endif
41 } 64 }
42 65
43 /* put back the original value for config space*/ 66 /* put back the original value for config space */
44 if (!(config & 0x2)) 67 if (!(config & 0x2))
45 pci_write_config_byte(dev, 0xf4, config); 68 write_pci_config_byte(0, 0, 0, 0xf4, config);
46} 69}
47DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); 70DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
48DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); 71DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
49DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); 72DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
73
50#endif 74#endif
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 84278e0093a2..3514b4153f7f 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -12,6 +12,7 @@
12#include <linux/dmi.h> 12#include <linux/dmi.h>
13#include <linux/ctype.h> 13#include <linux/ctype.h>
14#include <linux/pm.h> 14#include <linux/pm.h>
15#include <linux/reboot.h>
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16#include <asm/apic.h> 17#include <asm/apic.h>
17#include <asm/desc.h> 18#include <asm/desc.h>
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 141041dde74d..79df6e612dbd 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -63,9 +63,6 @@
63#include <setup_arch.h> 63#include <setup_arch.h>
64#include <bios_ebda.h> 64#include <bios_ebda.h>
65 65
66/* Forward Declaration. */
67void __init find_max_pfn(void);
68
69/* This value is set up by the early boot code to point to the value 66/* This value is set up by the early boot code to point to the value
70 immediately after the boot time page tables. It contains a *physical* 67 immediately after the boot time page tables. It contains a *physical*
71 address, and must not be in the .bss segment! */ 68 address, and must not be in the .bss segment! */
@@ -76,11 +73,8 @@ int disable_pse __devinitdata = 0;
76/* 73/*
77 * Machine setup.. 74 * Machine setup..
78 */ 75 */
79 76extern struct resource code_resource;
80#ifdef CONFIG_EFI 77extern struct resource data_resource;
81int efi_enabled = 0;
82EXPORT_SYMBOL(efi_enabled);
83#endif
84 78
85/* cpu data as detected by the assembly code in head.S */ 79/* cpu data as detected by the assembly code in head.S */
86struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; 80struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -99,12 +93,6 @@ unsigned int machine_submodel_id;
99unsigned int BIOS_revision; 93unsigned int BIOS_revision;
100unsigned int mca_pentium_flag; 94unsigned int mca_pentium_flag;
101 95
102/* For PCI or other memory-mapped resources */
103unsigned long pci_mem_start = 0x10000000;
104#ifdef CONFIG_PCI
105EXPORT_SYMBOL(pci_mem_start);
106#endif
107
108/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 96/* Boot loader ID as an integer, for the benefit of proc_dointvec */
109int bootloader_type; 97int bootloader_type;
110 98
@@ -134,7 +122,6 @@ struct ist_info ist_info;
134 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) 122 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
135EXPORT_SYMBOL(ist_info); 123EXPORT_SYMBOL(ist_info);
136#endif 124#endif
137struct e820map e820;
138 125
139extern void early_cpu_init(void); 126extern void early_cpu_init(void);
140extern int root_mountflags; 127extern int root_mountflags;
@@ -149,516 +136,6 @@ static char command_line[COMMAND_LINE_SIZE];
149 136
150unsigned char __initdata boot_params[PARAM_SIZE]; 137unsigned char __initdata boot_params[PARAM_SIZE];
151 138
152static struct resource data_resource = {
153 .name = "Kernel data",
154 .start = 0,
155 .end = 0,
156 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
157};
158
159static struct resource code_resource = {
160 .name = "Kernel code",
161 .start = 0,
162 .end = 0,
163 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
164};
165
166static struct resource system_rom_resource = {
167 .name = "System ROM",
168 .start = 0xf0000,
169 .end = 0xfffff,
170 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
171};
172
173static struct resource extension_rom_resource = {
174 .name = "Extension ROM",
175 .start = 0xe0000,
176 .end = 0xeffff,
177 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
178};
179
180static struct resource adapter_rom_resources[] = { {
181 .name = "Adapter ROM",
182 .start = 0xc8000,
183 .end = 0,
184 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
185}, {
186 .name = "Adapter ROM",
187 .start = 0,
188 .end = 0,
189 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
190}, {
191 .name = "Adapter ROM",
192 .start = 0,
193 .end = 0,
194 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
195}, {
196 .name = "Adapter ROM",
197 .start = 0,
198 .end = 0,
199 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
200}, {
201 .name = "Adapter ROM",
202 .start = 0,
203 .end = 0,
204 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
205}, {
206 .name = "Adapter ROM",
207 .start = 0,
208 .end = 0,
209 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
210} };
211
212static struct resource video_rom_resource = {
213 .name = "Video ROM",
214 .start = 0xc0000,
215 .end = 0xc7fff,
216 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
217};
218
219static struct resource video_ram_resource = {
220 .name = "Video RAM area",
221 .start = 0xa0000,
222 .end = 0xbffff,
223 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
224};
225
226static struct resource standard_io_resources[] = { {
227 .name = "dma1",
228 .start = 0x0000,
229 .end = 0x001f,
230 .flags = IORESOURCE_BUSY | IORESOURCE_IO
231}, {
232 .name = "pic1",
233 .start = 0x0020,
234 .end = 0x0021,
235 .flags = IORESOURCE_BUSY | IORESOURCE_IO
236}, {
237 .name = "timer0",
238 .start = 0x0040,
239 .end = 0x0043,
240 .flags = IORESOURCE_BUSY | IORESOURCE_IO
241}, {
242 .name = "timer1",
243 .start = 0x0050,
244 .end = 0x0053,
245 .flags = IORESOURCE_BUSY | IORESOURCE_IO
246}, {
247 .name = "keyboard",
248 .start = 0x0060,
249 .end = 0x006f,
250 .flags = IORESOURCE_BUSY | IORESOURCE_IO
251}, {
252 .name = "dma page reg",
253 .start = 0x0080,
254 .end = 0x008f,
255 .flags = IORESOURCE_BUSY | IORESOURCE_IO
256}, {
257 .name = "pic2",
258 .start = 0x00a0,
259 .end = 0x00a1,
260 .flags = IORESOURCE_BUSY | IORESOURCE_IO
261}, {
262 .name = "dma2",
263 .start = 0x00c0,
264 .end = 0x00df,
265 .flags = IORESOURCE_BUSY | IORESOURCE_IO
266}, {
267 .name = "fpu",
268 .start = 0x00f0,
269 .end = 0x00ff,
270 .flags = IORESOURCE_BUSY | IORESOURCE_IO
271} };
272
273#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
274
275static int __init romchecksum(unsigned char *rom, unsigned long length)
276{
277 unsigned char *p, sum = 0;
278
279 for (p = rom; p < rom + length; p++)
280 sum += *p;
281 return sum == 0;
282}
283
284static void __init probe_roms(void)
285{
286 unsigned long start, length, upper;
287 unsigned char *rom;
288 int i;
289
290 /* video rom */
291 upper = adapter_rom_resources[0].start;
292 for (start = video_rom_resource.start; start < upper; start += 2048) {
293 rom = isa_bus_to_virt(start);
294 if (!romsignature(rom))
295 continue;
296
297 video_rom_resource.start = start;
298
299 /* 0 < length <= 0x7f * 512, historically */
300 length = rom[2] * 512;
301
302 /* if checksum okay, trust length byte */
303 if (length && romchecksum(rom, length))
304 video_rom_resource.end = start + length - 1;
305
306 request_resource(&iomem_resource, &video_rom_resource);
307 break;
308 }
309
310 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
311 if (start < upper)
312 start = upper;
313
314 /* system rom */
315 request_resource(&iomem_resource, &system_rom_resource);
316 upper = system_rom_resource.start;
317
318 /* check for extension rom (ignore length byte!) */
319 rom = isa_bus_to_virt(extension_rom_resource.start);
320 if (romsignature(rom)) {
321 length = extension_rom_resource.end - extension_rom_resource.start + 1;
322 if (romchecksum(rom, length)) {
323 request_resource(&iomem_resource, &extension_rom_resource);
324 upper = extension_rom_resource.start;
325 }
326 }
327
328 /* check for adapter roms on 2k boundaries */
329 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
330 rom = isa_bus_to_virt(start);
331 if (!romsignature(rom))
332 continue;
333
334 /* 0 < length <= 0x7f * 512, historically */
335 length = rom[2] * 512;
336
337 /* but accept any length that fits if checksum okay */
338 if (!length || start + length > upper || !romchecksum(rom, length))
339 continue;
340
341 adapter_rom_resources[i].start = start;
342 adapter_rom_resources[i].end = start + length - 1;
343 request_resource(&iomem_resource, &adapter_rom_resources[i]);
344
345 start = adapter_rom_resources[i++].end & ~2047UL;
346 }
347}
348
349static void __init limit_regions(unsigned long long size)
350{
351 unsigned long long current_addr = 0;
352 int i;
353
354 if (efi_enabled) {
355 efi_memory_desc_t *md;
356 void *p;
357
358 for (p = memmap.map, i = 0; p < memmap.map_end;
359 p += memmap.desc_size, i++) {
360 md = p;
361 current_addr = md->phys_addr + (md->num_pages << 12);
362 if (md->type == EFI_CONVENTIONAL_MEMORY) {
363 if (current_addr >= size) {
364 md->num_pages -=
365 (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
366 memmap.nr_map = i + 1;
367 return;
368 }
369 }
370 }
371 }
372 for (i = 0; i < e820.nr_map; i++) {
373 current_addr = e820.map[i].addr + e820.map[i].size;
374 if (current_addr < size)
375 continue;
376
377 if (e820.map[i].type != E820_RAM)
378 continue;
379
380 if (e820.map[i].addr >= size) {
381 /*
382 * This region starts past the end of the
383 * requested size, skip it completely.
384 */
385 e820.nr_map = i;
386 } else {
387 e820.nr_map = i + 1;
388 e820.map[i].size -= current_addr - size;
389 }
390 return;
391 }
392}
393
394void __init add_memory_region(unsigned long long start,
395 unsigned long long size, int type)
396{
397 int x;
398
399 if (!efi_enabled) {
400 x = e820.nr_map;
401
402 if (x == E820MAX) {
403 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
404 return;
405 }
406
407 e820.map[x].addr = start;
408 e820.map[x].size = size;
409 e820.map[x].type = type;
410 e820.nr_map++;
411 }
412} /* add_memory_region */
413
414#define E820_DEBUG 1
415
416static void __init print_memory_map(char *who)
417{
418 int i;
419
420 for (i = 0; i < e820.nr_map; i++) {
421 printk(" %s: %016Lx - %016Lx ", who,
422 e820.map[i].addr,
423 e820.map[i].addr + e820.map[i].size);
424 switch (e820.map[i].type) {
425 case E820_RAM: printk("(usable)\n");
426 break;
427 case E820_RESERVED:
428 printk("(reserved)\n");
429 break;
430 case E820_ACPI:
431 printk("(ACPI data)\n");
432 break;
433 case E820_NVS:
434 printk("(ACPI NVS)\n");
435 break;
436 default: printk("type %lu\n", e820.map[i].type);
437 break;
438 }
439 }
440}
441
442/*
443 * Sanitize the BIOS e820 map.
444 *
445 * Some e820 responses include overlapping entries. The following
446 * replaces the original e820 map with a new one, removing overlaps.
447 *
448 */
449struct change_member {
450 struct e820entry *pbios; /* pointer to original bios entry */
451 unsigned long long addr; /* address for this change point */
452};
453static struct change_member change_point_list[2*E820MAX] __initdata;
454static struct change_member *change_point[2*E820MAX] __initdata;
455static struct e820entry *overlap_list[E820MAX] __initdata;
456static struct e820entry new_bios[E820MAX] __initdata;
457
458int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
459{
460 struct change_member *change_tmp;
461 unsigned long current_type, last_type;
462 unsigned long long last_addr;
463 int chgidx, still_changing;
464 int overlap_entries;
465 int new_bios_entry;
466 int old_nr, new_nr, chg_nr;
467 int i;
468
469 /*
470 Visually we're performing the following (1,2,3,4 = memory types)...
471
472 Sample memory map (w/overlaps):
473 ____22__________________
474 ______________________4_
475 ____1111________________
476 _44_____________________
477 11111111________________
478 ____________________33__
479 ___________44___________
480 __________33333_________
481 ______________22________
482 ___________________2222_
483 _________111111111______
484 _____________________11_
485 _________________4______
486
487 Sanitized equivalent (no overlap):
488 1_______________________
489 _44_____________________
490 ___1____________________
491 ____22__________________
492 ______11________________
493 _________1______________
494 __________3_____________
495 ___________44___________
496 _____________33_________
497 _______________2________
498 ________________1_______
499 _________________4______
500 ___________________2____
501 ____________________33__
502 ______________________4_
503 */
504
505 /* if there's only one memory region, don't bother */
506 if (*pnr_map < 2)
507 return -1;
508
509 old_nr = *pnr_map;
510
511 /* bail out if we find any unreasonable addresses in bios map */
512 for (i=0; i<old_nr; i++)
513 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
514 return -1;
515
516 /* create pointers for initial change-point information (for sorting) */
517 for (i=0; i < 2*old_nr; i++)
518 change_point[i] = &change_point_list[i];
519
520 /* record all known change-points (starting and ending addresses),
521 omitting those that are for empty memory regions */
522 chgidx = 0;
523 for (i=0; i < old_nr; i++) {
524 if (biosmap[i].size != 0) {
525 change_point[chgidx]->addr = biosmap[i].addr;
526 change_point[chgidx++]->pbios = &biosmap[i];
527 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
528 change_point[chgidx++]->pbios = &biosmap[i];
529 }
530 }
531 chg_nr = chgidx; /* true number of change-points */
532
533 /* sort change-point list by memory addresses (low -> high) */
534 still_changing = 1;
535 while (still_changing) {
536 still_changing = 0;
537 for (i=1; i < chg_nr; i++) {
538 /* if <current_addr> > <last_addr>, swap */
539 /* or, if current=<start_addr> & last=<end_addr>, swap */
540 if ((change_point[i]->addr < change_point[i-1]->addr) ||
541 ((change_point[i]->addr == change_point[i-1]->addr) &&
542 (change_point[i]->addr == change_point[i]->pbios->addr) &&
543 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
544 )
545 {
546 change_tmp = change_point[i];
547 change_point[i] = change_point[i-1];
548 change_point[i-1] = change_tmp;
549 still_changing=1;
550 }
551 }
552 }
553
554 /* create a new bios memory map, removing overlaps */
555 overlap_entries=0; /* number of entries in the overlap table */
556 new_bios_entry=0; /* index for creating new bios map entries */
557 last_type = 0; /* start with undefined memory type */
558 last_addr = 0; /* start with 0 as last starting address */
559 /* loop through change-points, determining affect on the new bios map */
560 for (chgidx=0; chgidx < chg_nr; chgidx++)
561 {
562 /* keep track of all overlapping bios entries */
563 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
564 {
565 /* add map entry to overlap list (> 1 entry implies an overlap) */
566 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
567 }
568 else
569 {
570 /* remove entry from list (order independent, so swap with last) */
571 for (i=0; i<overlap_entries; i++)
572 {
573 if (overlap_list[i] == change_point[chgidx]->pbios)
574 overlap_list[i] = overlap_list[overlap_entries-1];
575 }
576 overlap_entries--;
577 }
578 /* if there are overlapping entries, decide which "type" to use */
579 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
580 current_type = 0;
581 for (i=0; i<overlap_entries; i++)
582 if (overlap_list[i]->type > current_type)
583 current_type = overlap_list[i]->type;
584 /* continue building up new bios map based on this information */
585 if (current_type != last_type) {
586 if (last_type != 0) {
587 new_bios[new_bios_entry].size =
588 change_point[chgidx]->addr - last_addr;
589 /* move forward only if the new size was non-zero */
590 if (new_bios[new_bios_entry].size != 0)
591 if (++new_bios_entry >= E820MAX)
592 break; /* no more space left for new bios entries */
593 }
594 if (current_type != 0) {
595 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
596 new_bios[new_bios_entry].type = current_type;
597 last_addr=change_point[chgidx]->addr;
598 }
599 last_type = current_type;
600 }
601 }
602 new_nr = new_bios_entry; /* retain count for new bios entries */
603
604 /* copy new bios mapping into original location */
605 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
606 *pnr_map = new_nr;
607
608 return 0;
609}
610
611/*
612 * Copy the BIOS e820 map into a safe place.
613 *
614 * Sanity-check it while we're at it..
615 *
616 * If we're lucky and live on a modern system, the setup code
617 * will have given us a memory map that we can use to properly
618 * set up memory. If we aren't, we'll fake a memory map.
619 *
620 * We check to see that the memory map contains at least 2 elements
621 * before we'll use it, because the detection code in setup.S may
622 * not be perfect and most every PC known to man has two memory
623 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
624 * thinkpad 560x, for example, does not cooperate with the memory
625 * detection code.)
626 */
627int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
628{
629 /* Only one memory region (or negative)? Ignore it */
630 if (nr_map < 2)
631 return -1;
632
633 do {
634 unsigned long long start = biosmap->addr;
635 unsigned long long size = biosmap->size;
636 unsigned long long end = start + size;
637 unsigned long type = biosmap->type;
638
639 /* Overflow in 64 bits? Ignore the memory map. */
640 if (start > end)
641 return -1;
642
643 /*
644 * Some BIOSes claim RAM in the 640k - 1M region.
645 * Not right. Fix it up.
646 */
647 if (type == E820_RAM) {
648 if (start < 0x100000ULL && end > 0xA0000ULL) {
649 if (start < 0xA0000ULL)
650 add_memory_region(start, 0xA0000ULL-start, type);
651 if (end <= 0x100000ULL)
652 continue;
653 start = 0x100000ULL;
654 size = end - start;
655 }
656 }
657 add_memory_region(start, size, type);
658 } while (biosmap++,--nr_map);
659 return 0;
660}
661
662#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 139#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
663struct edd edd; 140struct edd edd;
664#ifdef CONFIG_EDD_MODULE 141#ifdef CONFIG_EDD_MODULE
@@ -682,7 +159,7 @@ static inline void copy_edd(void)
682} 159}
683#endif 160#endif
684 161
685static int __initdata user_defined_memmap = 0; 162int __initdata user_defined_memmap = 0;
686 163
687/* 164/*
688 * "mem=nopentium" disables the 4MB page tables. 165 * "mem=nopentium" disables the 4MB page tables.
@@ -719,51 +196,6 @@ static int __init parse_mem(char *arg)
719} 196}
720early_param("mem", parse_mem); 197early_param("mem", parse_mem);
721 198
722static int __init parse_memmap(char *arg)
723{
724 if (!arg)
725 return -EINVAL;
726
727 if (strcmp(arg, "exactmap") == 0) {
728#ifdef CONFIG_CRASH_DUMP
729 /* If we are doing a crash dump, we
730 * still need to know the real mem
731 * size before original memory map is
732 * reset.
733 */
734 find_max_pfn();
735 saved_max_pfn = max_pfn;
736#endif
737 e820.nr_map = 0;
738 user_defined_memmap = 1;
739 } else {
740 /* If the user specifies memory size, we
741 * limit the BIOS-provided memory map to
742 * that size. exactmap can be used to specify
743 * the exact map. mem=number can be used to
744 * trim the existing memory map.
745 */
746 unsigned long long start_at, mem_size;
747
748 mem_size = memparse(arg, &arg);
749 if (*arg == '@') {
750 start_at = memparse(arg+1, &arg);
751 add_memory_region(start_at, mem_size, E820_RAM);
752 } else if (*arg == '#') {
753 start_at = memparse(arg+1, &arg);
754 add_memory_region(start_at, mem_size, E820_ACPI);
755 } else if (*arg == '$') {
756 start_at = memparse(arg+1, &arg);
757 add_memory_region(start_at, mem_size, E820_RESERVED);
758 } else {
759 limit_regions(mem_size);
760 user_defined_memmap = 1;
761 }
762 }
763 return 0;
764}
765early_param("memmap", parse_memmap);
766
767#ifdef CONFIG_PROC_VMCORE 199#ifdef CONFIG_PROC_VMCORE
768/* elfcorehdr= specifies the location of elf core header 200/* elfcorehdr= specifies the location of elf core header
769 * stored by the crashed kernel. 201 * stored by the crashed kernel.
@@ -828,90 +260,6 @@ static int __init parse_reservetop(char *arg)
828early_param("reservetop", parse_reservetop); 260early_param("reservetop", parse_reservetop);
829 261
830/* 262/*
831 * Callback for efi_memory_walk.
832 */
833static int __init
834efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
835{
836 unsigned long *max_pfn = arg, pfn;
837
838 if (start < end) {
839 pfn = PFN_UP(end -1);
840 if (pfn > *max_pfn)
841 *max_pfn = pfn;
842 }
843 return 0;
844}
845
846static int __init
847efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
848{
849 memory_present(0, PFN_UP(start), PFN_DOWN(end));
850 return 0;
851}
852
853 /*
854 * This function checks if the entire range <start,end> is mapped with type.
855 *
856 * Note: this function only works correct if the e820 table is sorted and
857 * not-overlapping, which is the case
858 */
859int __init
860e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
861{
862 u64 start = s;
863 u64 end = e;
864 int i;
865 for (i = 0; i < e820.nr_map; i++) {
866 struct e820entry *ei = &e820.map[i];
867 if (type && ei->type != type)
868 continue;
869 /* is the region (part) in overlap with the current region ?*/
870 if (ei->addr >= end || ei->addr + ei->size <= start)
871 continue;
872 /* if the region is at the beginning of <start,end> we move
873 * start to the end of the region since it's ok until there
874 */
875 if (ei->addr <= start)
876 start = ei->addr + ei->size;
877 /* if start is now at or beyond end, we're done, full
878 * coverage */
879 if (start >= end)
880 return 1; /* we're done */
881 }
882 return 0;
883}
884
885/*
886 * Find the highest page frame number we have available
887 */
888void __init find_max_pfn(void)
889{
890 int i;
891
892 max_pfn = 0;
893 if (efi_enabled) {
894 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
895 efi_memmap_walk(efi_memory_present_wrapper, NULL);
896 return;
897 }
898
899 for (i = 0; i < e820.nr_map; i++) {
900 unsigned long start, end;
901 /* RAM? */
902 if (e820.map[i].type != E820_RAM)
903 continue;
904 start = PFN_UP(e820.map[i].addr);
905 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
906 if (start >= end)
907 continue;
908 if (end > max_pfn)
909 max_pfn = end;
910 memory_present(0, start, end);
911 }
912}
913
914/*
915 * Determine low and high memory ranges: 263 * Determine low and high memory ranges:
916 */ 264 */
917unsigned long __init find_max_low_pfn(void) 265unsigned long __init find_max_low_pfn(void)
@@ -971,68 +319,6 @@ unsigned long __init find_max_low_pfn(void)
971} 319}
972 320
973/* 321/*
974 * Free all available memory for boot time allocation. Used
975 * as a callback function by efi_memory_walk()
976 */
977
978static int __init
979free_available_memory(unsigned long start, unsigned long end, void *arg)
980{
981 /* check max_low_pfn */
982 if (start >= (max_low_pfn << PAGE_SHIFT))
983 return 0;
984 if (end >= (max_low_pfn << PAGE_SHIFT))
985 end = max_low_pfn << PAGE_SHIFT;
986 if (start < end)
987 free_bootmem(start, end - start);
988
989 return 0;
990}
991/*
992 * Register fully available low RAM pages with the bootmem allocator.
993 */
994static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
995{
996 int i;
997
998 if (efi_enabled) {
999 efi_memmap_walk(free_available_memory, NULL);
1000 return;
1001 }
1002 for (i = 0; i < e820.nr_map; i++) {
1003 unsigned long curr_pfn, last_pfn, size;
1004 /*
1005 * Reserve usable low memory
1006 */
1007 if (e820.map[i].type != E820_RAM)
1008 continue;
1009 /*
1010 * We are rounding up the start address of usable memory:
1011 */
1012 curr_pfn = PFN_UP(e820.map[i].addr);
1013 if (curr_pfn >= max_low_pfn)
1014 continue;
1015 /*
1016 * ... and at the end of the usable range downwards:
1017 */
1018 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
1019
1020 if (last_pfn > max_low_pfn)
1021 last_pfn = max_low_pfn;
1022
1023 /*
1024 * .. finally, did all the rounding and playing
1025 * around just make the area go away?
1026 */
1027 if (last_pfn <= curr_pfn)
1028 continue;
1029
1030 size = last_pfn - curr_pfn;
1031 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
1032 }
1033}
1034
1035/*
1036 * workaround for Dell systems that neglect to reserve EBDA 322 * workaround for Dell systems that neglect to reserve EBDA
1037 */ 323 */
1038static void __init reserve_ebda_region(void) 324static void __init reserve_ebda_region(void)
@@ -1118,8 +404,8 @@ void __init setup_bootmem_allocator(void)
1118 * the (very unlikely) case of us accidentally initializing the 404 * the (very unlikely) case of us accidentally initializing the
1119 * bootmem allocator with an invalid RAM area. 405 * bootmem allocator with an invalid RAM area.
1120 */ 406 */
1121 reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) + 407 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
1122 bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START)); 408 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
1123 409
1124 /* 410 /*
1125 * reserve physical page 0 - it's a special BIOS page on many boxes, 411 * reserve physical page 0 - it's a special BIOS page on many boxes,
@@ -1162,8 +448,7 @@ void __init setup_bootmem_allocator(void)
1162 if (LOADER_TYPE && INITRD_START) { 448 if (LOADER_TYPE && INITRD_START) {
1163 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { 449 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
1164 reserve_bootmem(INITRD_START, INITRD_SIZE); 450 reserve_bootmem(INITRD_START, INITRD_SIZE);
1165 initrd_start = 451 initrd_start = INITRD_START + PAGE_OFFSET;
1166 INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
1167 initrd_end = initrd_start+INITRD_SIZE; 452 initrd_end = initrd_start+INITRD_SIZE;
1168 } 453 }
1169 else { 454 else {
@@ -1200,126 +485,6 @@ void __init remapped_pgdat_init(void)
1200 } 485 }
1201} 486}
1202 487
1203/*
1204 * Request address space for all standard RAM and ROM resources
1205 * and also for regions reported as reserved by the e820.
1206 */
1207static void __init
1208legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
1209{
1210 int i;
1211
1212 probe_roms();
1213 for (i = 0; i < e820.nr_map; i++) {
1214 struct resource *res;
1215#ifndef CONFIG_RESOURCES_64BIT
1216 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
1217 continue;
1218#endif
1219 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
1220 switch (e820.map[i].type) {
1221 case E820_RAM: res->name = "System RAM"; break;
1222 case E820_ACPI: res->name = "ACPI Tables"; break;
1223 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
1224 default: res->name = "reserved";
1225 }
1226 res->start = e820.map[i].addr;
1227 res->end = res->start + e820.map[i].size - 1;
1228 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1229 if (request_resource(&iomem_resource, res)) {
1230 kfree(res);
1231 continue;
1232 }
1233 if (e820.map[i].type == E820_RAM) {
1234 /*
1235 * We don't know which RAM region contains kernel data,
1236 * so we try it repeatedly and let the resource manager
1237 * test it.
1238 */
1239 request_resource(res, code_resource);
1240 request_resource(res, data_resource);
1241#ifdef CONFIG_KEXEC
1242 request_resource(res, &crashk_res);
1243#endif
1244 }
1245 }
1246}
1247
1248/*
1249 * Request address space for all standard resources
1250 *
1251 * This is called just before pcibios_init(), which is also a
1252 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
1253 */
1254static int __init request_standard_resources(void)
1255{
1256 int i;
1257
1258 printk("Setting up standard PCI resources\n");
1259 if (efi_enabled)
1260 efi_initialize_iomem_resources(&code_resource, &data_resource);
1261 else
1262 legacy_init_iomem_resources(&code_resource, &data_resource);
1263
1264 /* EFI systems may still have VGA */
1265 request_resource(&iomem_resource, &video_ram_resource);
1266
1267 /* request I/O space for devices used on all i[345]86 PCs */
1268 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
1269 request_resource(&ioport_resource, &standard_io_resources[i]);
1270 return 0;
1271}
1272
1273subsys_initcall(request_standard_resources);
1274
1275static void __init register_memory(void)
1276{
1277 unsigned long gapstart, gapsize, round;
1278 unsigned long long last;
1279 int i;
1280
1281 /*
1282 * Search for the bigest gap in the low 32 bits of the e820
1283 * memory space.
1284 */
1285 last = 0x100000000ull;
1286 gapstart = 0x10000000;
1287 gapsize = 0x400000;
1288 i = e820.nr_map;
1289 while (--i >= 0) {
1290 unsigned long long start = e820.map[i].addr;
1291 unsigned long long end = start + e820.map[i].size;
1292
1293 /*
1294 * Since "last" is at most 4GB, we know we'll
1295 * fit in 32 bits if this condition is true
1296 */
1297 if (last > end) {
1298 unsigned long gap = last - end;
1299
1300 if (gap > gapsize) {
1301 gapsize = gap;
1302 gapstart = end;
1303 }
1304 }
1305 if (start < last)
1306 last = start;
1307 }
1308
1309 /*
1310 * See how much we want to round up: start off with
1311 * rounding to the next 1MB area.
1312 */
1313 round = 0x100000;
1314 while ((gapsize >> 4) > round)
1315 round += round;
1316 /* Fun with two's complement */
1317 pci_mem_start = (gapstart + round) & -round;
1318
1319 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
1320 pci_mem_start, gapstart, gapsize);
1321}
1322
1323#ifdef CONFIG_MCA 488#ifdef CONFIG_MCA
1324static void set_mca_bus(int x) 489static void set_mca_bus(int x)
1325{ 490{
@@ -1329,6 +494,12 @@ static void set_mca_bus(int x)
1329static void set_mca_bus(int x) { } 494static void set_mca_bus(int x) { }
1330#endif 495#endif
1331 496
497/* Overridden in paravirt.c if CONFIG_PARAVIRT */
498char * __attribute__((weak)) memory_setup(void)
499{
500 return machine_specific_memory_setup();
501}
502
1332/* 503/*
1333 * Determine if we were loaded by an EFI loader. If so, then we have also been 504 * Determine if we were loaded by an EFI loader. If so, then we have also been
1334 * passed the efi memmap, systab, etc., so we should use these data structures 505 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -1381,7 +552,7 @@ void __init setup_arch(char **cmdline_p)
1381 efi_init(); 552 efi_init();
1382 else { 553 else {
1383 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 554 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1384 print_memory_map(machine_specific_memory_setup()); 555 print_memory_map(memory_setup());
1385 } 556 }
1386 557
1387 copy_edd(); 558 copy_edd();
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 43002cfb40c4..65d7620eaa09 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -128,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ 128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) 129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
130 130
131 GET_SEG(gs); 131 COPY_SEG(gs);
132 GET_SEG(fs); 132 GET_SEG(fs);
133 COPY_SEG(es); 133 COPY_SEG(es);
134 COPY_SEG(ds); 134 COPY_SEG(ds);
@@ -244,9 +244,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
244{ 244{
245 int tmp, err = 0; 245 int tmp, err = 0;
246 246
247 tmp = 0; 247 err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);
248 savesegment(gs, tmp);
249 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
250 savesegment(fs, tmp); 248 savesegment(fs, tmp);
251 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 249 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
252 250
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 31e5c6573aae..5285aff8367f 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -321,7 +321,6 @@ static inline void leave_mm (unsigned long cpu)
321 321
322fastcall void smp_invalidate_interrupt(struct pt_regs *regs) 322fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
323{ 323{
324 struct pt_regs *old_regs = set_irq_regs(regs);
325 unsigned long cpu; 324 unsigned long cpu;
326 325
327 cpu = get_cpu(); 326 cpu = get_cpu();
@@ -352,7 +351,6 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
352 smp_mb__after_clear_bit(); 351 smp_mb__after_clear_bit();
353out: 352out:
354 put_cpu_no_resched(); 353 put_cpu_no_resched();
355 set_irq_regs(old_regs);
356} 354}
357 355
358static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 356static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
@@ -607,14 +605,11 @@ void smp_send_stop(void)
607 */ 605 */
608fastcall void smp_reschedule_interrupt(struct pt_regs *regs) 606fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
609{ 607{
610 struct pt_regs *old_regs = set_irq_regs(regs);
611 ack_APIC_irq(); 608 ack_APIC_irq();
612 set_irq_regs(old_regs);
613} 609}
614 610
615fastcall void smp_call_function_interrupt(struct pt_regs *regs) 611fastcall void smp_call_function_interrupt(struct pt_regs *regs)
616{ 612{
617 struct pt_regs *old_regs = set_irq_regs(regs);
618 void (*func) (void *info) = call_data->func; 613 void (*func) (void *info) = call_data->func;
619 void *info = call_data->info; 614 void *info = call_data->info;
620 int wait = call_data->wait; 615 int wait = call_data->wait;
@@ -637,7 +632,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
637 mb(); 632 mb();
638 atomic_inc(&call_data->finished); 633 atomic_inc(&call_data->finished);
639 } 634 }
640 set_irq_regs(old_regs);
641} 635}
642 636
643/* 637/*
@@ -699,6 +693,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
699 put_cpu(); 693 put_cpu();
700 return -EBUSY; 694 return -EBUSY;
701 } 695 }
696
697 /* Can deadlock when called with interrupts disabled */
698 WARN_ON(irqs_disabled());
699
702 spin_lock_bh(&call_lock); 700 spin_lock_bh(&call_lock);
703 __smp_call_function_single(cpu, func, info, nonatomic, wait); 701 __smp_call_function_single(cpu, func, info, nonatomic, wait);
704 spin_unlock_bh(&call_lock); 702 spin_unlock_bh(&call_lock);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 02a9b66b6ac3..4bf0e3c83b8b 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -33,6 +33,11 @@
33 * Dave Jones : Report invalid combinations of Athlon CPUs. 33 * Dave Jones : Report invalid combinations of Athlon CPUs.
34* Rusty Russell : Hacked into shape for new "hotplug" boot process. */ 34* Rusty Russell : Hacked into shape for new "hotplug" boot process. */
35 35
36
37/* SMP boot always wants to use real time delay to allow sufficient time for
38 * the APs to come online */
39#define USE_REAL_TIME_DELAY
40
36#include <linux/module.h> 41#include <linux/module.h>
37#include <linux/init.h> 42#include <linux/init.h>
38#include <linux/kernel.h> 43#include <linux/kernel.h>
@@ -52,6 +57,8 @@
52#include <asm/desc.h> 57#include <asm/desc.h>
53#include <asm/arch_hooks.h> 58#include <asm/arch_hooks.h>
54#include <asm/nmi.h> 59#include <asm/nmi.h>
60#include <asm/pda.h>
61#include <asm/genapic.h>
55 62
56#include <mach_apic.h> 63#include <mach_apic.h>
57#include <mach_wakecpu.h> 64#include <mach_wakecpu.h>
@@ -536,11 +543,11 @@ set_cpu_sibling_map(int cpu)
536static void __devinit start_secondary(void *unused) 543static void __devinit start_secondary(void *unused)
537{ 544{
538 /* 545 /*
539 * Dont put anything before smp_callin(), SMP 546 * Don't put *anything* before secondary_cpu_init(), SMP
540 * booting is too fragile that we want to limit the 547 * booting is too fragile that we want to limit the
541 * things done here to the most necessary things. 548 * things done here to the most necessary things.
542 */ 549 */
543 cpu_init(); 550 secondary_cpu_init();
544 preempt_disable(); 551 preempt_disable();
545 smp_callin(); 552 smp_callin();
546 while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) 553 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
@@ -599,13 +606,16 @@ void __devinit initialize_secondary(void)
599 "movl %0,%%esp\n\t" 606 "movl %0,%%esp\n\t"
600 "jmp *%1" 607 "jmp *%1"
601 : 608 :
602 :"r" (current->thread.esp),"r" (current->thread.eip)); 609 :"m" (current->thread.esp),"m" (current->thread.eip));
603} 610}
604 611
612/* Static state in head.S used to set up a CPU */
605extern struct { 613extern struct {
606 void * esp; 614 void * esp;
607 unsigned short ss; 615 unsigned short ss;
608} stack_start; 616} stack_start;
617extern struct i386_pda *start_pda;
618extern struct Xgt_desc_struct cpu_gdt_descr;
609 619
610#ifdef CONFIG_NUMA 620#ifdef CONFIG_NUMA
611 621
@@ -936,9 +946,6 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
936 unsigned long start_eip; 946 unsigned long start_eip;
937 unsigned short nmi_high = 0, nmi_low = 0; 947 unsigned short nmi_high = 0, nmi_low = 0;
938 948
939 ++cpucount;
940 alternatives_smp_switch(1);
941
942 /* 949 /*
943 * We can't use kernel_thread since we must avoid to 950 * We can't use kernel_thread since we must avoid to
944 * reschedule the child. 951 * reschedule the child.
@@ -946,15 +953,30 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
946 idle = alloc_idle_task(cpu); 953 idle = alloc_idle_task(cpu);
947 if (IS_ERR(idle)) 954 if (IS_ERR(idle))
948 panic("failed fork for CPU %d", cpu); 955 panic("failed fork for CPU %d", cpu);
956
957 /* Pre-allocate and initialize the CPU's GDT and PDA so it
958 doesn't have to do any memory allocation during the
959 delicate CPU-bringup phase. */
960 if (!init_gdt(cpu, idle)) {
961 printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
962 return -1; /* ? */
963 }
964
949 idle->thread.eip = (unsigned long) start_secondary; 965 idle->thread.eip = (unsigned long) start_secondary;
950 /* start_eip had better be page-aligned! */ 966 /* start_eip had better be page-aligned! */
951 start_eip = setup_trampoline(); 967 start_eip = setup_trampoline();
952 968
969 ++cpucount;
970 alternatives_smp_switch(1);
971
953 /* So we see what's up */ 972 /* So we see what's up */
954 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); 973 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
955 /* Stack for startup_32 can be just as for start_secondary onwards */ 974 /* Stack for startup_32 can be just as for start_secondary onwards */
956 stack_start.esp = (void *) idle->thread.esp; 975 stack_start.esp = (void *) idle->thread.esp;
957 976
977 start_pda = cpu_pda(cpu);
978 cpu_gdt_descr = per_cpu(cpu_gdt_descr, cpu);
979
958 irq_ctx_init(cpu); 980 irq_ctx_init(cpu);
959 981
960 x86_cpu_to_apicid[cpu] = apicid; 982 x86_cpu_to_apicid[cpu] = apicid;
@@ -1109,34 +1131,15 @@ exit:
1109} 1131}
1110#endif 1132#endif
1111 1133
1112static void smp_tune_scheduling (void) 1134static void smp_tune_scheduling(void)
1113{ 1135{
1114 unsigned long cachesize; /* kB */ 1136 unsigned long cachesize; /* kB */
1115 unsigned long bandwidth = 350; /* MB/s */
1116 /*
1117 * Rough estimation for SMP scheduling, this is the number of
1118 * cycles it takes for a fully memory-limited process to flush
1119 * the SMP-local cache.
1120 *
1121 * (For a P5 this pretty much means we will choose another idle
1122 * CPU almost always at wakeup time (this is due to the small
1123 * L1 cache), on PIIs it's around 50-100 usecs, depending on
1124 * the cache size)
1125 */
1126 1137
1127 if (!cpu_khz) { 1138 if (cpu_khz) {
1128 /*
1129 * this basically disables processor-affinity
1130 * scheduling on SMP without a TSC.
1131 */
1132 return;
1133 } else {
1134 cachesize = boot_cpu_data.x86_cache_size; 1139 cachesize = boot_cpu_data.x86_cache_size;
1135 if (cachesize == -1) { 1140
1136 cachesize = 16; /* Pentiums, 2x8kB cache */ 1141 if (cachesize > 0)
1137 bandwidth = 100; 1142 max_cache_size = cachesize * 1024;
1138 }
1139 max_cache_size = cachesize * 1024;
1140 } 1143 }
1141} 1144}
1142 1145
@@ -1462,6 +1465,12 @@ int __devinit __cpu_up(unsigned int cpu)
1462 cpu_set(cpu, smp_commenced_mask); 1465 cpu_set(cpu, smp_commenced_mask);
1463 while (!cpu_isset(cpu, cpu_online_map)) 1466 while (!cpu_isset(cpu, cpu_online_map))
1464 cpu_relax(); 1467 cpu_relax();
1468
1469#ifdef CONFIG_X86_GENERICARCH
1470 if (num_online_cpus() > 8 && genapic == &apic_default)
1471 panic("Default flat APIC routing can't be used with > 8 cpus\n");
1472#endif
1473
1465 return 0; 1474 return 0;
1466} 1475}
1467 1476
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 713ba39d32c6..7de9117b5a3a 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -27,7 +27,11 @@
27 * Should the kernel map a VDSO page into processes and pass its 27 * Should the kernel map a VDSO page into processes and pass its
28 * address down to glibc upon exec()? 28 * address down to glibc upon exec()?
29 */ 29 */
30#ifdef CONFIG_PARAVIRT
31unsigned int __read_mostly vdso_enabled = 0;
32#else
30unsigned int __read_mostly vdso_enabled = 1; 33unsigned int __read_mostly vdso_enabled = 1;
34#endif
31 35
32EXPORT_SYMBOL_GPL(vdso_enabled); 36EXPORT_SYMBOL_GPL(vdso_enabled);
33 37
@@ -132,7 +136,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
132 goto up_fail; 136 goto up_fail;
133 } 137 }
134 138
135 vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL); 139 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
136 if (!vma) { 140 if (!vma) {
137 ret = -ENOMEM; 141 ret = -ENOMEM;
138 goto up_fail; 142 goto up_fail;
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 78af572fd17c..c505b16c0990 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -56,6 +56,7 @@
56#include <asm/uaccess.h> 56#include <asm/uaccess.h>
57#include <asm/processor.h> 57#include <asm/processor.h>
58#include <asm/timer.h> 58#include <asm/timer.h>
59#include <asm/time.h>
59 60
60#include "mach_time.h" 61#include "mach_time.h"
61 62
@@ -116,10 +117,7 @@ static int set_rtc_mmss(unsigned long nowtime)
116 /* gets recalled with irq locally disabled */ 117 /* gets recalled with irq locally disabled */
117 /* XXX - does irqsave resolve this? -johnstul */ 118 /* XXX - does irqsave resolve this? -johnstul */
118 spin_lock_irqsave(&rtc_lock, flags); 119 spin_lock_irqsave(&rtc_lock, flags);
119 if (efi_enabled) 120 retval = set_wallclock(nowtime);
120 retval = efi_set_rtc_mmss(nowtime);
121 else
122 retval = mach_set_rtc_mmss(nowtime);
123 spin_unlock_irqrestore(&rtc_lock, flags); 121 spin_unlock_irqrestore(&rtc_lock, flags);
124 122
125 return retval; 123 return retval;
@@ -223,10 +221,7 @@ unsigned long get_cmos_time(void)
223 221
224 spin_lock_irqsave(&rtc_lock, flags); 222 spin_lock_irqsave(&rtc_lock, flags);
225 223
226 if (efi_enabled) 224 retval = get_wallclock();
227 retval = efi_get_time();
228 else
229 retval = mach_get_cmos_time();
230 225
231 spin_unlock_irqrestore(&rtc_lock, flags); 226 spin_unlock_irqrestore(&rtc_lock, flags);
232 227
@@ -370,7 +365,7 @@ static void __init hpet_time_init(void)
370 printk("Using HPET for base-timer\n"); 365 printk("Using HPET for base-timer\n");
371 } 366 }
372 367
373 time_init_hook(); 368 do_time_init();
374} 369}
375#endif 370#endif
376 371
@@ -392,5 +387,5 @@ void __init time_init(void)
392 387
393 do_settimeofday(&ts); 388 do_settimeofday(&ts);
394 389
395 time_init_hook(); 390 do_time_init();
396} 391}
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
index 1a2a979cf6a3..1e4702dfcd01 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/i386/kernel/time_hpet.c
@@ -132,14 +132,20 @@ int __init hpet_enable(void)
132 * the single HPET timer for system time. 132 * the single HPET timer for system time.
133 */ 133 */
134#ifdef CONFIG_HPET_EMULATE_RTC 134#ifdef CONFIG_HPET_EMULATE_RTC
135 if (!(id & HPET_ID_NUMBER)) 135 if (!(id & HPET_ID_NUMBER)) {
136 iounmap(hpet_virt_address);
137 hpet_virt_address = NULL;
136 return -1; 138 return -1;
139 }
137#endif 140#endif
138 141
139 142
140 hpet_period = hpet_readl(HPET_PERIOD); 143 hpet_period = hpet_readl(HPET_PERIOD);
141 if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) 144 if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) {
145 iounmap(hpet_virt_address);
146 hpet_virt_address = NULL;
142 return -1; 147 return -1;
148 }
143 149
144 /* 150 /*
145 * 64 bit math 151 * 64 bit math
@@ -156,8 +162,11 @@ int __init hpet_enable(void)
156 162
157 hpet_use_timer = id & HPET_ID_LEGSUP; 163 hpet_use_timer = id & HPET_ID_LEGSUP;
158 164
159 if (hpet_timer_stop_set_go(hpet_tick)) 165 if (hpet_timer_stop_set_go(hpet_tick)) {
166 iounmap(hpet_virt_address);
167 hpet_virt_address = NULL;
160 return -1; 168 return -1;
169 }
161 170
162 use_hpet = 1; 171 use_hpet = 1;
163 172
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 07d6da36a825..79cf608e14ca 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -40,14 +40,18 @@ int arch_register_cpu(int num)
40 * restrictions and assumptions in kernel. This basically 40 * restrictions and assumptions in kernel. This basically
41 * doesnt add a control file, one cannot attempt to offline 41 * doesnt add a control file, one cannot attempt to offline
42 * BSP. 42 * BSP.
43 *
44 * Also certain PCI quirks require not to enable hotplug control
45 * for all CPU's.
43 */ 46 */
44 if (!num) 47 if (num && enable_cpu_hotplug)
45 cpu_devices[num].cpu.no_control = 1; 48 cpu_devices[num].cpu.hotpluggable = 1;
46 49
47 return register_cpu(&cpu_devices[num].cpu, num); 50 return register_cpu(&cpu_devices[num].cpu, num);
48} 51}
49 52
50#ifdef CONFIG_HOTPLUG_CPU 53#ifdef CONFIG_HOTPLUG_CPU
54int enable_cpu_hotplug = 1;
51 55
52void arch_unregister_cpu(int num) { 56void arch_unregister_cpu(int num) {
53 return unregister_cpu(&cpu_devices[num].cpu); 57 return unregister_cpu(&cpu_devices[num].cpu);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index fe9c5e8e7e6f..68de48e498ca 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -29,6 +29,7 @@
29#include <linux/kexec.h> 29#include <linux/kexec.h>
30#include <linux/unwind.h> 30#include <linux/unwind.h>
31#include <linux/uaccess.h> 31#include <linux/uaccess.h>
32#include <linux/nmi.h>
32 33
33#ifdef CONFIG_EISA 34#ifdef CONFIG_EISA
34#include <linux/ioport.h> 35#include <linux/ioport.h>
@@ -61,9 +62,6 @@ int panic_on_unrecovered_nmi;
61 62
62asmlinkage int system_call(void); 63asmlinkage int system_call(void);
63 64
64struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
65 { 0, 0 }, { 0, 0 } };
66
67/* Do we ignore FPU interrupts ? */ 65/* Do we ignore FPU interrupts ? */
68char ignore_fpu_irq = 0; 66char ignore_fpu_irq = 0;
69 67
@@ -94,7 +92,7 @@ asmlinkage void alignment_check(void);
94asmlinkage void spurious_interrupt_bug(void); 92asmlinkage void spurious_interrupt_bug(void);
95asmlinkage void machine_check(void); 93asmlinkage void machine_check(void);
96 94
97static int kstack_depth_to_print = 24; 95int kstack_depth_to_print = 24;
98#ifdef CONFIG_STACK_UNWIND 96#ifdef CONFIG_STACK_UNWIND
99static int call_trace = 1; 97static int call_trace = 1;
100#else 98#else
@@ -163,16 +161,25 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data)
163{ 161{
164 struct ops_and_data *oad = (struct ops_and_data *)data; 162 struct ops_and_data *oad = (struct ops_and_data *)data;
165 int n = 0; 163 int n = 0;
164 unsigned long sp = UNW_SP(info);
166 165
166 if (arch_unw_user_mode(info))
167 return -1;
167 while (unwind(info) == 0 && UNW_PC(info)) { 168 while (unwind(info) == 0 && UNW_PC(info)) {
168 n++; 169 n++;
169 oad->ops->address(oad->data, UNW_PC(info)); 170 oad->ops->address(oad->data, UNW_PC(info));
170 if (arch_unw_user_mode(info)) 171 if (arch_unw_user_mode(info))
171 break; 172 break;
173 if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1))
174 && sp > UNW_SP(info))
175 break;
176 sp = UNW_SP(info);
172 } 177 }
173 return n; 178 return n;
174} 179}
175 180
181#define MSG(msg) ops->warning(data, msg)
182
176void dump_trace(struct task_struct *task, struct pt_regs *regs, 183void dump_trace(struct task_struct *task, struct pt_regs *regs,
177 unsigned long *stack, 184 unsigned long *stack,
178 struct stacktrace_ops *ops, void *data) 185 struct stacktrace_ops *ops, void *data)
@@ -191,29 +198,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
191 if (unwind_init_frame_info(&info, task, regs) == 0) 198 if (unwind_init_frame_info(&info, task, regs) == 0)
192 unw_ret = dump_trace_unwind(&info, &oad); 199 unw_ret = dump_trace_unwind(&info, &oad);
193 } else if (task == current) 200 } else if (task == current)
194 unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); 201 unw_ret = unwind_init_running(&info, dump_trace_unwind,
202 &oad);
195 else { 203 else {
196 if (unwind_init_blocked(&info, task) == 0) 204 if (unwind_init_blocked(&info, task) == 0)
197 unw_ret = dump_trace_unwind(&info, &oad); 205 unw_ret = dump_trace_unwind(&info, &oad);
198 } 206 }
199 if (unw_ret > 0) { 207 if (unw_ret > 0) {
200 if (call_trace == 1 && !arch_unw_user_mode(&info)) { 208 if (call_trace == 1 && !arch_unw_user_mode(&info)) {
201 ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", 209 ops->warning_symbol(data,
210 "DWARF2 unwinder stuck at %s",
202 UNW_PC(&info)); 211 UNW_PC(&info));
203 if (UNW_SP(&info) >= PAGE_OFFSET) { 212 if (UNW_SP(&info) >= PAGE_OFFSET) {
204 ops->warning(data, "Leftover inexact backtrace:\n"); 213 MSG("Leftover inexact backtrace:");
205 stack = (void *)UNW_SP(&info); 214 stack = (void *)UNW_SP(&info);
206 if (!stack) 215 if (!stack)
207 return; 216 return;
208 ebp = UNW_FP(&info); 217 ebp = UNW_FP(&info);
209 } else 218 } else
210 ops->warning(data, "Full inexact backtrace again:\n"); 219 MSG("Full inexact backtrace again:");
211 } else if (call_trace >= 1) 220 } else if (call_trace >= 1)
212 return; 221 return;
213 else 222 else
214 ops->warning(data, "Full inexact backtrace again:\n"); 223 MSG("Full inexact backtrace again:");
215 } else 224 } else
216 ops->warning(data, "Inexact backtrace:\n"); 225 MSG("Inexact backtrace:");
217 } 226 }
218 if (!stack) { 227 if (!stack) {
219 unsigned long dummy; 228 unsigned long dummy;
@@ -247,6 +256,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
247 stack = (unsigned long*)context->previous_esp; 256 stack = (unsigned long*)context->previous_esp;
248 if (!stack) 257 if (!stack)
249 break; 258 break;
259 touch_nmi_watchdog();
250 } 260 }
251} 261}
252EXPORT_SYMBOL(dump_trace); 262EXPORT_SYMBOL(dump_trace);
@@ -379,7 +389,7 @@ void show_registers(struct pt_regs *regs)
379 * time of the fault.. 389 * time of the fault..
380 */ 390 */
381 if (in_kernel) { 391 if (in_kernel) {
382 u8 __user *eip; 392 u8 *eip;
383 int code_bytes = 64; 393 int code_bytes = 64;
384 unsigned char c; 394 unsigned char c;
385 395
@@ -388,18 +398,20 @@ void show_registers(struct pt_regs *regs)
388 398
389 printk(KERN_EMERG "Code: "); 399 printk(KERN_EMERG "Code: ");
390 400
391 eip = (u8 __user *)regs->eip - 43; 401 eip = (u8 *)regs->eip - 43;
392 if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { 402 if (eip < (u8 *)PAGE_OFFSET ||
403 probe_kernel_address(eip, c)) {
393 /* try starting at EIP */ 404 /* try starting at EIP */
394 eip = (u8 __user *)regs->eip; 405 eip = (u8 *)regs->eip;
395 code_bytes = 32; 406 code_bytes = 32;
396 } 407 }
397 for (i = 0; i < code_bytes; i++, eip++) { 408 for (i = 0; i < code_bytes; i++, eip++) {
398 if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { 409 if (eip < (u8 *)PAGE_OFFSET ||
410 probe_kernel_address(eip, c)) {
399 printk(" Bad EIP value."); 411 printk(" Bad EIP value.");
400 break; 412 break;
401 } 413 }
402 if (eip == (u8 __user *)regs->eip) 414 if (eip == (u8 *)regs->eip)
403 printk("<%02x> ", c); 415 printk("<%02x> ", c);
404 else 416 else
405 printk("%02x ", c); 417 printk("%02x ", c);
@@ -415,7 +427,7 @@ static void handle_BUG(struct pt_regs *regs)
415 427
416 if (eip < PAGE_OFFSET) 428 if (eip < PAGE_OFFSET)
417 return; 429 return;
418 if (probe_kernel_address((unsigned short __user *)eip, ud2)) 430 if (probe_kernel_address((unsigned short *)eip, ud2))
419 return; 431 return;
420 if (ud2 != 0x0b0f) 432 if (ud2 != 0x0b0f)
421 return; 433 return;
@@ -428,11 +440,11 @@ static void handle_BUG(struct pt_regs *regs)
428 char *file; 440 char *file;
429 char c; 441 char c;
430 442
431 if (probe_kernel_address((unsigned short __user *)(eip + 2), 443 if (probe_kernel_address((unsigned short *)(eip + 2), line))
432 line))
433 break; 444 break;
434 if (__get_user(file, (char * __user *)(eip + 4)) || 445 if (probe_kernel_address((char **)(eip + 4), file) ||
435 (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) 446 (unsigned long)file < PAGE_OFFSET ||
447 probe_kernel_address(file, c))
436 file = "<bad filename>"; 448 file = "<bad filename>";
437 449
438 printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); 450 printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
@@ -452,7 +464,7 @@ void die(const char * str, struct pt_regs * regs, long err)
452 u32 lock_owner; 464 u32 lock_owner;
453 int lock_owner_depth; 465 int lock_owner_depth;
454 } die = { 466 } die = {
455 .lock = SPIN_LOCK_UNLOCKED, 467 .lock = __SPIN_LOCK_UNLOCKED(die.lock),
456 .lock_owner = -1, 468 .lock_owner = -1,
457 .lock_owner_depth = 0 469 .lock_owner_depth = 0
458 }; 470 };
@@ -707,8 +719,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
707{ 719{
708 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " 720 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
709 "CPU %d.\n", reason, smp_processor_id()); 721 "CPU %d.\n", reason, smp_processor_id());
710 printk(KERN_EMERG "You probably have a hardware problem with your RAM " 722 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
711 "chips\n");
712 if (panic_on_unrecovered_nmi) 723 if (panic_on_unrecovered_nmi)
713 panic("NMI: Not continuing"); 724 panic("NMI: Not continuing");
714 725
@@ -773,7 +784,6 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
773 printk(" on CPU%d, eip %08lx, registers:\n", 784 printk(" on CPU%d, eip %08lx, registers:\n",
774 smp_processor_id(), regs->eip); 785 smp_processor_id(), regs->eip);
775 show_registers(regs); 786 show_registers(regs);
776 printk(KERN_EMERG "console shuts up ...\n");
777 console_silent(); 787 console_silent();
778 spin_unlock(&nmi_print_lock); 788 spin_unlock(&nmi_print_lock);
779 bust_spinlocks(0); 789 bust_spinlocks(0);
@@ -1088,49 +1098,24 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
1088#endif 1098#endif
1089} 1099}
1090 1100
1091fastcall void setup_x86_bogus_stack(unsigned char * stk) 1101fastcall unsigned long patch_espfix_desc(unsigned long uesp,
1092{ 1102 unsigned long kesp)
1093 unsigned long *switch16_ptr, *switch32_ptr;
1094 struct pt_regs *regs;
1095 unsigned long stack_top, stack_bot;
1096 unsigned short iret_frame16_off;
1097 int cpu = smp_processor_id();
1098 /* reserve the space on 32bit stack for the magic switch16 pointer */
1099 memmove(stk, stk + 8, sizeof(struct pt_regs));
1100 switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
1101 regs = (struct pt_regs *)stk;
1102 /* now the switch32 on 16bit stack */
1103 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
1104 stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
1105 switch32_ptr = (unsigned long *)(stack_top - 8);
1106 iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
1107 /* copy iret frame on 16bit stack */
1108 memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
1109 /* fill in the switch pointers */
1110 switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
1111 switch16_ptr[1] = __ESPFIX_SS;
1112 switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
1113 8 - CPU_16BIT_STACK_SIZE;
1114 switch32_ptr[1] = __KERNEL_DS;
1115}
1116
1117fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
1118{ 1103{
1119 unsigned long *switch32_ptr;
1120 unsigned char *stack16, *stack32;
1121 unsigned long stack_top, stack_bot;
1122 int len;
1123 int cpu = smp_processor_id(); 1104 int cpu = smp_processor_id();
1124 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); 1105 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1125 stack_top = stack_bot + CPU_16BIT_STACK_SIZE; 1106 struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
1126 switch32_ptr = (unsigned long *)(stack_top - 8); 1107 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
1127 /* copy the data from 16bit stack to 32bit stack */ 1108 unsigned long new_kesp = kesp - base;
1128 len = CPU_16BIT_STACK_SIZE - 8 - sp; 1109 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
1129 stack16 = (unsigned char *)(stack_bot + sp); 1110 __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
1130 stack32 = (unsigned char *) 1111 /* Set up base for espfix segment */
1131 (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len); 1112 desc &= 0x00f0ff0000000000ULL;
1132 memcpy(stack32, stack16, len); 1113 desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
1133 return stack32; 1114 ((((__u64)base) << 32) & 0xff00000000000000ULL) |
1115 ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
1116 (lim_pages & 0xffff);
1117 *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
1118 return new_kesp;
1134} 1119}
1135 1120
1136/* 1121/*
@@ -1143,7 +1128,7 @@ fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
1143 * Must be called with kernel preemption disabled (in this case, 1128 * Must be called with kernel preemption disabled (in this case,
1144 * local interrupts are disabled at the call-site in entry.S). 1129 * local interrupts are disabled at the call-site in entry.S).
1145 */ 1130 */
1146asmlinkage void math_state_restore(struct pt_regs regs) 1131asmlinkage void math_state_restore(void)
1147{ 1132{
1148 struct thread_info *thread = current_thread_info(); 1133 struct thread_info *thread = current_thread_info();
1149 struct task_struct *tsk = thread->task; 1134 struct task_struct *tsk = thread->task;
@@ -1153,6 +1138,7 @@ asmlinkage void math_state_restore(struct pt_regs regs)
1153 init_fpu(tsk); 1138 init_fpu(tsk);
1154 restore_fpu(tsk); 1139 restore_fpu(tsk);
1155 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 1140 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1141 tsk->fpu_counter++;
1156} 1142}
1157 1143
1158#ifndef CONFIG_MATH_EMULATION 1144#ifndef CONFIG_MATH_EMULATION
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 9810c8c90750..1bbe45dca7a0 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -13,7 +13,6 @@
13 13
14#include <asm/delay.h> 14#include <asm/delay.h>
15#include <asm/tsc.h> 15#include <asm/tsc.h>
16#include <asm/delay.h>
17#include <asm/io.h> 16#include <asm/io.h>
18 17
19#include "mach_timer.h" 18#include "mach_timer.h"
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index cbcd61d6120b..be2f96e67f78 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -43,6 +43,7 @@
43#include <linux/highmem.h> 43#include <linux/highmem.h>
44#include <linux/ptrace.h> 44#include <linux/ptrace.h>
45#include <linux/audit.h> 45#include <linux/audit.h>
46#include <linux/stddef.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48#include <asm/io.h> 49#include <asm/io.h>
@@ -72,10 +73,10 @@
72/* 73/*
73 * 8- and 16-bit register defines.. 74 * 8- and 16-bit register defines..
74 */ 75 */
75#define AL(regs) (((unsigned char *)&((regs)->eax))[0]) 76#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0])
76#define AH(regs) (((unsigned char *)&((regs)->eax))[1]) 77#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1])
77#define IP(regs) (*(unsigned short *)&((regs)->eip)) 78#define IP(regs) (*(unsigned short *)&((regs)->pt.eip))
78#define SP(regs) (*(unsigned short *)&((regs)->esp)) 79#define SP(regs) (*(unsigned short *)&((regs)->pt.esp))
79 80
80/* 81/*
81 * virtual flags (16 and 32-bit versions) 82 * virtual flags (16 and 32-bit versions)
@@ -89,10 +90,37 @@
89#define SAFE_MASK (0xDD5) 90#define SAFE_MASK (0xDD5)
90#define RETURN_MASK (0xDFF) 91#define RETURN_MASK (0xDFF)
91 92
92#define VM86_REGS_PART2 orig_eax 93/* convert kernel_vm86_regs to vm86_regs */
93#define VM86_REGS_SIZE1 \ 94static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
94 ( (unsigned)( & (((struct kernel_vm86_regs *)0)->VM86_REGS_PART2) ) ) 95 const struct kernel_vm86_regs *regs)
95#define VM86_REGS_SIZE2 (sizeof(struct kernel_vm86_regs) - VM86_REGS_SIZE1) 96{
97 int ret = 0;
98
99 /* kernel_vm86_regs is missing xfs, so copy everything up to
100 (but not including) xgs, and then rest after xgs. */
101 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs));
102 ret += copy_to_user(&user->__null_gs, &regs->pt.xgs,
103 sizeof(struct kernel_vm86_regs) -
104 offsetof(struct kernel_vm86_regs, pt.xgs));
105
106 return ret;
107}
108
109/* convert vm86_regs to kernel_vm86_regs */
110static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
111 const struct vm86_regs __user *user,
112 unsigned extra)
113{
114 int ret = 0;
115
116 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs));
117 ret += copy_from_user(&regs->pt.xgs, &user->__null_gs,
118 sizeof(struct kernel_vm86_regs) -
119 offsetof(struct kernel_vm86_regs, pt.xgs) +
120 extra);
121
122 return ret;
123}
96 124
97struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); 125struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
98struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) 126struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
@@ -112,10 +140,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
112 printk("no vm86_info: BAD\n"); 140 printk("no vm86_info: BAD\n");
113 do_exit(SIGSEGV); 141 do_exit(SIGSEGV);
114 } 142 }
115 set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); 143 set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
116 tmp = copy_to_user(&current->thread.vm86_info->regs,regs, VM86_REGS_SIZE1); 144 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
117 tmp += copy_to_user(&current->thread.vm86_info->regs.VM86_REGS_PART2,
118 &regs->VM86_REGS_PART2, VM86_REGS_SIZE2);
119 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap); 145 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
120 if (tmp) { 146 if (tmp) {
121 printk("vm86: could not access userspace vm86_info\n"); 147 printk("vm86: could not access userspace vm86_info\n");
@@ -129,9 +155,11 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
129 current->thread.saved_esp0 = 0; 155 current->thread.saved_esp0 = 0;
130 put_cpu(); 156 put_cpu();
131 157
132 loadsegment(fs, current->thread.saved_fs);
133 loadsegment(gs, current->thread.saved_gs);
134 ret = KVM86->regs32; 158 ret = KVM86->regs32;
159
160 loadsegment(fs, current->thread.saved_fs);
161 ret->xgs = current->thread.saved_gs;
162
135 return ret; 163 return ret;
136} 164}
137 165
@@ -183,9 +211,9 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
183 tsk = current; 211 tsk = current;
184 if (tsk->thread.saved_esp0) 212 if (tsk->thread.saved_esp0)
185 goto out; 213 goto out;
186 tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); 214 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
187 tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, 215 offsetof(struct kernel_vm86_struct, vm86plus) -
188 (long)&info.vm86plus - (long)&info.regs.VM86_REGS_PART2); 216 sizeof(info.regs));
189 ret = -EFAULT; 217 ret = -EFAULT;
190 if (tmp) 218 if (tmp)
191 goto out; 219 goto out;
@@ -233,9 +261,9 @@ asmlinkage int sys_vm86(struct pt_regs regs)
233 if (tsk->thread.saved_esp0) 261 if (tsk->thread.saved_esp0)
234 goto out; 262 goto out;
235 v86 = (struct vm86plus_struct __user *)regs.ecx; 263 v86 = (struct vm86plus_struct __user *)regs.ecx;
236 tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); 264 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
237 tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, 265 offsetof(struct kernel_vm86_struct, regs32) -
238 (long)&info.regs32 - (long)&info.regs.VM86_REGS_PART2); 266 sizeof(info.regs));
239 ret = -EFAULT; 267 ret = -EFAULT;
240 if (tmp) 268 if (tmp)
241 goto out; 269 goto out;
@@ -252,15 +280,15 @@ out:
252static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) 280static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
253{ 281{
254 struct tss_struct *tss; 282 struct tss_struct *tss;
255 long eax;
256/* 283/*
257 * make sure the vm86() system call doesn't try to do anything silly 284 * make sure the vm86() system call doesn't try to do anything silly
258 */ 285 */
259 info->regs.__null_ds = 0; 286 info->regs.pt.xds = 0;
260 info->regs.__null_es = 0; 287 info->regs.pt.xes = 0;
288 info->regs.pt.xgs = 0;
261 289
262/* we are clearing fs,gs later just before "jmp resume_userspace", 290/* we are clearing fs later just before "jmp resume_userspace",
263 * because starting with Linux 2.1.x they aren't no longer saved/restored 291 * because it is not saved/restored.
264 */ 292 */
265 293
266/* 294/*
@@ -268,10 +296,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
268 * has set it up safely, so this makes sure interrupt etc flags are 296 * has set it up safely, so this makes sure interrupt etc flags are
269 * inherited from protected mode. 297 * inherited from protected mode.
270 */ 298 */
271 VEFLAGS = info->regs.eflags; 299 VEFLAGS = info->regs.pt.eflags;
272 info->regs.eflags &= SAFE_MASK; 300 info->regs.pt.eflags &= SAFE_MASK;
273 info->regs.eflags |= info->regs32->eflags & ~SAFE_MASK; 301 info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
274 info->regs.eflags |= VM_MASK; 302 info->regs.pt.eflags |= VM_MASK;
275 303
276 switch (info->cpu_type) { 304 switch (info->cpu_type) {
277 case CPU_286: 305 case CPU_286:
@@ -294,7 +322,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
294 info->regs32->eax = 0; 322 info->regs32->eax = 0;
295 tsk->thread.saved_esp0 = tsk->thread.esp0; 323 tsk->thread.saved_esp0 = tsk->thread.esp0;
296 savesegment(fs, tsk->thread.saved_fs); 324 savesegment(fs, tsk->thread.saved_fs);
297 savesegment(gs, tsk->thread.saved_gs); 325 tsk->thread.saved_gs = info->regs32->xgs;
298 326
299 tss = &per_cpu(init_tss, get_cpu()); 327 tss = &per_cpu(init_tss, get_cpu());
300 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; 328 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -306,19 +334,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
306 tsk->thread.screen_bitmap = info->screen_bitmap; 334 tsk->thread.screen_bitmap = info->screen_bitmap;
307 if (info->flags & VM86_SCREEN_BITMAP) 335 if (info->flags & VM86_SCREEN_BITMAP)
308 mark_screen_rdonly(tsk->mm); 336 mark_screen_rdonly(tsk->mm);
309 __asm__ __volatile__("xorl %eax,%eax; movl %eax,%fs; movl %eax,%gs\n\t");
310 __asm__ __volatile__("movl %%eax, %0\n" :"=r"(eax));
311 337
312 /*call audit_syscall_exit since we do not exit via the normal paths */ 338 /*call audit_syscall_exit since we do not exit via the normal paths */
313 if (unlikely(current->audit_context)) 339 if (unlikely(current->audit_context))
314 audit_syscall_exit(AUDITSC_RESULT(eax), eax); 340 audit_syscall_exit(AUDITSC_RESULT(0), 0);
315 341
316 __asm__ __volatile__( 342 __asm__ __volatile__(
317 "movl %0,%%esp\n\t" 343 "movl %0,%%esp\n\t"
318 "movl %1,%%ebp\n\t" 344 "movl %1,%%ebp\n\t"
345 "mov %2, %%fs\n\t"
319 "jmp resume_userspace" 346 "jmp resume_userspace"
320 : /* no outputs */ 347 : /* no outputs */
321 :"r" (&info->regs), "r" (task_thread_info(tsk))); 348 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
322 /* we never return here */ 349 /* we never return here */
323} 350}
324 351
@@ -348,12 +375,12 @@ static inline void clear_IF(struct kernel_vm86_regs * regs)
348 375
349static inline void clear_TF(struct kernel_vm86_regs * regs) 376static inline void clear_TF(struct kernel_vm86_regs * regs)
350{ 377{
351 regs->eflags &= ~TF_MASK; 378 regs->pt.eflags &= ~TF_MASK;
352} 379}
353 380
354static inline void clear_AC(struct kernel_vm86_regs * regs) 381static inline void clear_AC(struct kernel_vm86_regs * regs)
355{ 382{
356 regs->eflags &= ~AC_MASK; 383 regs->pt.eflags &= ~AC_MASK;
357} 384}
358 385
359/* It is correct to call set_IF(regs) from the set_vflags_* 386/* It is correct to call set_IF(regs) from the set_vflags_*
@@ -370,7 +397,7 @@ static inline void clear_AC(struct kernel_vm86_regs * regs)
370static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) 397static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
371{ 398{
372 set_flags(VEFLAGS, eflags, current->thread.v86mask); 399 set_flags(VEFLAGS, eflags, current->thread.v86mask);
373 set_flags(regs->eflags, eflags, SAFE_MASK); 400 set_flags(regs->pt.eflags, eflags, SAFE_MASK);
374 if (eflags & IF_MASK) 401 if (eflags & IF_MASK)
375 set_IF(regs); 402 set_IF(regs);
376 else 403 else
@@ -380,7 +407,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs
380static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) 407static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
381{ 408{
382 set_flags(VFLAGS, flags, current->thread.v86mask); 409 set_flags(VFLAGS, flags, current->thread.v86mask);
383 set_flags(regs->eflags, flags, SAFE_MASK); 410 set_flags(regs->pt.eflags, flags, SAFE_MASK);
384 if (flags & IF_MASK) 411 if (flags & IF_MASK)
385 set_IF(regs); 412 set_IF(regs);
386 else 413 else
@@ -389,7 +416,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg
389 416
390static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) 417static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
391{ 418{
392 unsigned long flags = regs->eflags & RETURN_MASK; 419 unsigned long flags = regs->pt.eflags & RETURN_MASK;
393 420
394 if (VEFLAGS & VIF_MASK) 421 if (VEFLAGS & VIF_MASK)
395 flags |= IF_MASK; 422 flags |= IF_MASK;
@@ -493,7 +520,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
493 unsigned long __user *intr_ptr; 520 unsigned long __user *intr_ptr;
494 unsigned long segoffs; 521 unsigned long segoffs;
495 522
496 if (regs->cs == BIOSSEG) 523 if (regs->pt.xcs == BIOSSEG)
497 goto cannot_handle; 524 goto cannot_handle;
498 if (is_revectored(i, &KVM86->int_revectored)) 525 if (is_revectored(i, &KVM86->int_revectored))
499 goto cannot_handle; 526 goto cannot_handle;
@@ -505,9 +532,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
505 if ((segoffs >> 16) == BIOSSEG) 532 if ((segoffs >> 16) == BIOSSEG)
506 goto cannot_handle; 533 goto cannot_handle;
507 pushw(ssp, sp, get_vflags(regs), cannot_handle); 534 pushw(ssp, sp, get_vflags(regs), cannot_handle);
508 pushw(ssp, sp, regs->cs, cannot_handle); 535 pushw(ssp, sp, regs->pt.xcs, cannot_handle);
509 pushw(ssp, sp, IP(regs), cannot_handle); 536 pushw(ssp, sp, IP(regs), cannot_handle);
510 regs->cs = segoffs >> 16; 537 regs->pt.xcs = segoffs >> 16;
511 SP(regs) -= 6; 538 SP(regs) -= 6;
512 IP(regs) = segoffs & 0xffff; 539 IP(regs) = segoffs & 0xffff;
513 clear_TF(regs); 540 clear_TF(regs);
@@ -524,7 +551,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno
524 if (VMPI.is_vm86pus) { 551 if (VMPI.is_vm86pus) {
525 if ( (trapno==3) || (trapno==1) ) 552 if ( (trapno==3) || (trapno==1) )
526 return_to_32bit(regs, VM86_TRAP + (trapno << 8)); 553 return_to_32bit(regs, VM86_TRAP + (trapno << 8));
527 do_int(regs, trapno, (unsigned char __user *) (regs->ss << 4), SP(regs)); 554 do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
528 return 0; 555 return 0;
529 } 556 }
530 if (trapno !=1) 557 if (trapno !=1)
@@ -560,10 +587,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
560 handle_vm86_trap(regs, 0, 1); \ 587 handle_vm86_trap(regs, 0, 1); \
561 return; } while (0) 588 return; } while (0)
562 589
563 orig_flags = *(unsigned short *)&regs->eflags; 590 orig_flags = *(unsigned short *)&regs->pt.eflags;
564 591
565 csp = (unsigned char __user *) (regs->cs << 4); 592 csp = (unsigned char __user *) (regs->pt.xcs << 4);
566 ssp = (unsigned char __user *) (regs->ss << 4); 593 ssp = (unsigned char __user *) (regs->pt.xss << 4);
567 sp = SP(regs); 594 sp = SP(regs);
568 ip = IP(regs); 595 ip = IP(regs);
569 596
@@ -650,7 +677,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
650 SP(regs) += 6; 677 SP(regs) += 6;
651 } 678 }
652 IP(regs) = newip; 679 IP(regs) = newip;
653 regs->cs = newcs; 680 regs->pt.xcs = newcs;
654 CHECK_IF_IN_TRAP; 681 CHECK_IF_IN_TRAP;
655 if (data32) { 682 if (data32) {
656 set_vflags_long(newflags, regs); 683 set_vflags_long(newflags, regs);
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index c6f84a0322ba..56e6ad5cb045 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -1,13 +1,26 @@
1/* ld script to make i386 Linux kernel 1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; 2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 *
4 * Don't define absolute symbols until and unless you know that symbol
5 * value is should remain constant even if kernel image is relocated
6 * at run time. Absolute symbols are not relocated. If symbol value should
7 * change if kernel is relocated, make the symbol section relative and
8 * put it inside the section definition.
3 */ 9 */
4 10
11/* Don't define absolute symbols until and unless you know that symbol
12 * value is should remain constant even if kernel image is relocated
13 * at run time. Absolute symbols are not relocated. If symbol value should
14 * change if kernel is relocated, make the symbol section relative and
15 * put it inside the section definition.
16 */
5#define LOAD_OFFSET __PAGE_OFFSET 17#define LOAD_OFFSET __PAGE_OFFSET
6 18
7#include <asm-generic/vmlinux.lds.h> 19#include <asm-generic/vmlinux.lds.h>
8#include <asm/thread_info.h> 20#include <asm/thread_info.h>
9#include <asm/page.h> 21#include <asm/page.h>
10#include <asm/cache.h> 22#include <asm/cache.h>
23#include <asm/boot.h>
11 24
12OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") 25OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
13OUTPUT_ARCH(i386) 26OUTPUT_ARCH(i386)
@@ -21,34 +34,35 @@ PHDRS {
21} 34}
22SECTIONS 35SECTIONS
23{ 36{
24 . = __KERNEL_START; 37 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
25 phys_startup_32 = startup_32 - LOAD_OFFSET; 38 phys_startup_32 = startup_32 - LOAD_OFFSET;
26 /* read-only */ 39 /* read-only */
27 _text = .; /* Text and read-only data */
28 .text : AT(ADDR(.text) - LOAD_OFFSET) { 40 .text : AT(ADDR(.text) - LOAD_OFFSET) {
41 _text = .; /* Text and read-only data */
29 *(.text) 42 *(.text)
30 SCHED_TEXT 43 SCHED_TEXT
31 LOCK_TEXT 44 LOCK_TEXT
32 KPROBES_TEXT 45 KPROBES_TEXT
33 *(.fixup) 46 *(.fixup)
34 *(.gnu.warning) 47 *(.gnu.warning)
35 } :text = 0x9090 48 _etext = .; /* End of text section */
36 49 } :text = 0x9090
37 _etext = .; /* End of text section */
38 50
39 . = ALIGN(16); /* Exception table */ 51 . = ALIGN(16); /* Exception table */
40 __start___ex_table = .; 52 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
41 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } 53 __start___ex_table = .;
42 __stop___ex_table = .; 54 *(__ex_table)
55 __stop___ex_table = .;
56 }
43 57
44 RODATA 58 RODATA
45 59
46 . = ALIGN(4); 60 . = ALIGN(4);
47 __tracedata_start = .;
48 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { 61 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
62 __tracedata_start = .;
49 *(.tracedata) 63 *(.tracedata)
64 __tracedata_end = .;
50 } 65 }
51 __tracedata_end = .;
52 66
53 /* writeable */ 67 /* writeable */
54 . = ALIGN(4096); 68 . = ALIGN(4096);
@@ -57,11 +71,19 @@ SECTIONS
57 CONSTRUCTORS 71 CONSTRUCTORS
58 } :data 72 } :data
59 73
74 .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) {
75 __start_paravirtprobe = .;
76 *(.paravirtprobe)
77 __stop_paravirtprobe = .;
78 }
79
60 . = ALIGN(4096); 80 . = ALIGN(4096);
61 __nosave_begin = .; 81 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
62 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } 82 __nosave_begin = .;
63 . = ALIGN(4096); 83 *(.data.nosave)
64 __nosave_end = .; 84 . = ALIGN(4096);
85 __nosave_end = .;
86 }
65 87
66 . = ALIGN(4096); 88 . = ALIGN(4096);
67 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { 89 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
@@ -75,17 +97,10 @@ SECTIONS
75 97
76 /* rarely changed data like cpu maps */ 98 /* rarely changed data like cpu maps */
77 . = ALIGN(32); 99 . = ALIGN(32);
78 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } 100 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
79 _edata = .; /* End of data section */ 101 *(.data.read_mostly)
80 102 _edata = .; /* End of data section */
81#ifdef CONFIG_STACK_UNWIND
82 . = ALIGN(4);
83 .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
84 __start_unwind = .;
85 *(.eh_frame)
86 __end_unwind = .;
87 } 103 }
88#endif
89 104
90 . = ALIGN(THREAD_SIZE); /* init_task */ 105 . = ALIGN(THREAD_SIZE); /* init_task */
91 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { 106 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
@@ -94,88 +109,102 @@ SECTIONS
94 109
95 /* might get freed after init */ 110 /* might get freed after init */
96 . = ALIGN(4096); 111 . = ALIGN(4096);
97 __smp_alt_begin = .;
98 __smp_alt_instructions = .;
99 .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { 112 .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
113 __smp_alt_begin = .;
114 __smp_alt_instructions = .;
100 *(.smp_altinstructions) 115 *(.smp_altinstructions)
116 __smp_alt_instructions_end = .;
101 } 117 }
102 __smp_alt_instructions_end = .;
103 . = ALIGN(4); 118 . = ALIGN(4);
104 __smp_locks = .;
105 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 119 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
120 __smp_locks = .;
106 *(.smp_locks) 121 *(.smp_locks)
122 __smp_locks_end = .;
107 } 123 }
108 __smp_locks_end = .;
109 .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { 124 .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
110 *(.smp_altinstr_replacement) 125 *(.smp_altinstr_replacement)
126 __smp_alt_end = .;
111 } 127 }
128 /* will be freed after init
129 * Following ALIGN() is required to make sure no other data falls on the
130 * same page where __smp_alt_end is pointing as that page might be freed
131 * after boot. Always make sure that ALIGN() directive is present after
132 * the section which contains __smp_alt_end.
133 */
112 . = ALIGN(4096); 134 . = ALIGN(4096);
113 __smp_alt_end = .;
114 135
115 /* will be freed after init */ 136 /* will be freed after init */
116 . = ALIGN(4096); /* Init code and data */ 137 . = ALIGN(4096); /* Init code and data */
117 __init_begin = .;
118 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 138 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
139 __init_begin = .;
119 _sinittext = .; 140 _sinittext = .;
120 *(.init.text) 141 *(.init.text)
121 _einittext = .; 142 _einittext = .;
122 } 143 }
123 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } 144 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
124 . = ALIGN(16); 145 . = ALIGN(16);
125 __setup_start = .; 146 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
126 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } 147 __setup_start = .;
127 __setup_end = .; 148 *(.init.setup)
128 __initcall_start = .; 149 __setup_end = .;
150 }
129 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { 151 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
152 __initcall_start = .;
130 INITCALLS 153 INITCALLS
154 __initcall_end = .;
131 } 155 }
132 __initcall_end = .;
133 __con_initcall_start = .;
134 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { 156 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
157 __con_initcall_start = .;
135 *(.con_initcall.init) 158 *(.con_initcall.init)
159 __con_initcall_end = .;
136 } 160 }
137 __con_initcall_end = .;
138 SECURITY_INIT 161 SECURITY_INIT
139 . = ALIGN(4); 162 . = ALIGN(4);
140 __alt_instructions = .;
141 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 163 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
164 __alt_instructions = .;
142 *(.altinstructions) 165 *(.altinstructions)
166 __alt_instructions_end = .;
143 } 167 }
144 __alt_instructions_end = .;
145 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { 168 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
146 *(.altinstr_replacement) 169 *(.altinstr_replacement)
147 } 170 }
171 . = ALIGN(4);
172 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
173 __start_parainstructions = .;
174 *(.parainstructions)
175 __stop_parainstructions = .;
176 }
148 /* .exit.text is discard at runtime, not link time, to deal with references 177 /* .exit.text is discard at runtime, not link time, to deal with references
149 from .altinstructions and .eh_frame */ 178 from .altinstructions and .eh_frame */
150 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } 179 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
151 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } 180 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
152 . = ALIGN(4096); 181 . = ALIGN(4096);
153 __initramfs_start = .; 182 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
154 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } 183 __initramfs_start = .;
155 __initramfs_end = .; 184 *(.init.ramfs)
185 __initramfs_end = .;
186 }
156 . = ALIGN(L1_CACHE_BYTES); 187 . = ALIGN(L1_CACHE_BYTES);
157 __per_cpu_start = .; 188 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
158 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } 189 __per_cpu_start = .;
159 __per_cpu_end = .; 190 *(.data.percpu)
191 __per_cpu_end = .;
192 }
160 . = ALIGN(4096); 193 . = ALIGN(4096);
161 __init_end = .;
162 /* freed after init ends here */ 194 /* freed after init ends here */
163 195
164 __bss_start = .; /* BSS */
165 .bss.page_aligned : AT(ADDR(.bss.page_aligned) - LOAD_OFFSET) {
166 *(.bss.page_aligned)
167 }
168 .bss : AT(ADDR(.bss) - LOAD_OFFSET) { 196 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
197 __init_end = .;
198 __bss_start = .; /* BSS */
199 *(.bss.page_aligned)
169 *(.bss) 200 *(.bss)
201 . = ALIGN(4);
202 __bss_stop = .;
203 _end = . ;
204 /* This is where the kernel creates the early boot page tables */
205 . = ALIGN(4096);
206 pg0 = . ;
170 } 207 }
171 . = ALIGN(4);
172 __bss_stop = .;
173
174 _end = . ;
175
176 /* This is where the kernel creates the early boot page tables */
177 . = ALIGN(4096);
178 pg0 = .;
179 208
180 /* Sections to be discarded */ 209 /* Sections to be discarded */
181 /DISCARD/ : { 210 /DISCARD/ : {
diff --git a/arch/i386/mach-generic/probe.c b/arch/i386/mach-generic/probe.c
index 94b1fd9cbe3c..a7b3999bb37a 100644
--- a/arch/i386/mach-generic/probe.c
+++ b/arch/i386/mach-generic/probe.c
@@ -45,7 +45,9 @@ static int __init parse_apic(char *arg)
45 return 0; 45 return 0;
46 } 46 }
47 } 47 }
48 return -ENOENT; 48
49 /* Parsed again by __setup for debug/verbose */
50 return 0;
49} 51}
50early_param("apic", parse_apic); 52early_param("apic", parse_apic);
51 53
diff --git a/arch/i386/mach-voyager/voyager_cat.c b/arch/i386/mach-voyager/voyager_cat.c
index f50c6c6ad680..943a9473b138 100644
--- a/arch/i386/mach-voyager/voyager_cat.c
+++ b/arch/i386/mach-voyager/voyager_cat.c
@@ -776,7 +776,7 @@ voyager_cat_init(void)
776 for(asic=0; asic < (*modpp)->num_asics; asic++) { 776 for(asic=0; asic < (*modpp)->num_asics; asic++) {
777 int j; 777 int j;
778 voyager_asic_t *asicp = *asicpp 778 voyager_asic_t *asicp = *asicpp
779 = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/ 779 = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/
780 voyager_sp_table_t *sp_table; 780 voyager_sp_table_t *sp_table;
781 voyager_at_t *asic_table; 781 voyager_at_t *asic_table;
782 voyager_jtt_t *jtag_table; 782 voyager_jtt_t *jtag_table;
@@ -785,7 +785,6 @@ voyager_cat_init(void)
785 printk("**WARNING** kmalloc failure in cat_init\n"); 785 printk("**WARNING** kmalloc failure in cat_init\n");
786 continue; 786 continue;
787 } 787 }
788 memset(asicp, 0, sizeof(voyager_asic_t));
789 asicpp = &(asicp->next); 788 asicpp = &(asicp->next);
790 asicp->asic_location = asic; 789 asicp->asic_location = asic;
791 sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset); 790 sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset);
@@ -851,8 +850,7 @@ voyager_cat_init(void)
851#endif 850#endif
852 851
853 { 852 {
854 struct resource *res = kmalloc(sizeof(struct resource),GFP_KERNEL); 853 struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL);
855 memset(res, 0, sizeof(struct resource));
856 res->name = kmalloc(128, GFP_KERNEL); 854 res->name = kmalloc(128, GFP_KERNEL);
857 sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i)); 855 sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i));
858 res->start = qic_addr; 856 res->start = qic_addr;
diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c
index f3fea2ad50fe..55428e656a3f 100644
--- a/arch/i386/mach-voyager/voyager_smp.c
+++ b/arch/i386/mach-voyager/voyager_smp.c
@@ -28,6 +28,7 @@
28#include <asm/pgalloc.h> 28#include <asm/pgalloc.h>
29#include <asm/tlbflush.h> 29#include <asm/tlbflush.h>
30#include <asm/arch_hooks.h> 30#include <asm/arch_hooks.h>
31#include <asm/pda.h>
31 32
32/* TLB state -- visible externally, indexed physically */ 33/* TLB state -- visible externally, indexed physically */
33DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 }; 34DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 };
@@ -422,6 +423,7 @@ find_smp_config(void)
422 VOYAGER_SUS_IN_CONTROL_PORT); 423 VOYAGER_SUS_IN_CONTROL_PORT);
423 424
424 current_thread_info()->cpu = boot_cpu_id; 425 current_thread_info()->cpu = boot_cpu_id;
426 write_pda(cpu_number, boot_cpu_id);
425} 427}
426 428
427/* 429/*
@@ -458,7 +460,7 @@ start_secondary(void *unused)
458 /* external functions not defined in the headers */ 460 /* external functions not defined in the headers */
459 extern void calibrate_delay(void); 461 extern void calibrate_delay(void);
460 462
461 cpu_init(); 463 secondary_cpu_init();
462 464
463 /* OK, we're in the routine */ 465 /* OK, we're in the routine */
464 ack_CPI(VIC_CPU_BOOT_CPI); 466 ack_CPI(VIC_CPU_BOOT_CPI);
@@ -578,6 +580,15 @@ do_boot_cpu(__u8 cpu)
578 /* init_tasks (in sched.c) is indexed logically */ 580 /* init_tasks (in sched.c) is indexed logically */
579 stack_start.esp = (void *) idle->thread.esp; 581 stack_start.esp = (void *) idle->thread.esp;
580 582
583 /* Pre-allocate and initialize the CPU's GDT and PDA so it
584 doesn't have to do any memory allocation during the
585 delicate CPU-bringup phase. */
586 if (!init_gdt(cpu, idle)) {
587 printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
588 cpucount--;
589 return;
590 }
591
581 irq_ctx_init(cpu); 592 irq_ctx_init(cpu);
582 593
583 /* Note: Don't modify initial ss override */ 594 /* Note: Don't modify initial ss override */
@@ -1963,4 +1974,5 @@ void __init
1963smp_setup_processor_id(void) 1974smp_setup_processor_id(void)
1964{ 1975{
1965 current_thread_info()->cpu = hard_smp_processor_id(); 1976 current_thread_info()->cpu = hard_smp_processor_id();
1977 write_pda(cpu_number, hard_smp_processor_id());
1966} 1978}
diff --git a/arch/i386/math-emu/fpu_emu.h b/arch/i386/math-emu/fpu_emu.h
index d62b20a3e660..65120f523853 100644
--- a/arch/i386/math-emu/fpu_emu.h
+++ b/arch/i386/math-emu/fpu_emu.h
@@ -57,6 +57,7 @@
57#define TAG_Special Const(2) /* De-normal, + or - infinity, 57#define TAG_Special Const(2) /* De-normal, + or - infinity,
58 or Not a Number */ 58 or Not a Number */
59#define TAG_Empty Const(3) /* empty */ 59#define TAG_Empty Const(3) /* empty */
60#define TAG_Error Const(0x80) /* probably need to abort */
60 61
61#define LOADED_DATA Const(10101) /* Special st() number to identify 62#define LOADED_DATA Const(10101) /* Special st() number to identify
62 loaded data (not on stack). */ 63 loaded data (not on stack). */
diff --git a/arch/i386/math-emu/fpu_entry.c b/arch/i386/math-emu/fpu_entry.c
index d93f16ef828f..ddf8fa3bbd01 100644
--- a/arch/i386/math-emu/fpu_entry.c
+++ b/arch/i386/math-emu/fpu_entry.c
@@ -742,7 +742,8 @@ int save_i387_soft(void *s387, struct _fpstate __user * buf)
742 S387->fcs &= ~0xf8000000; 742 S387->fcs &= ~0xf8000000;
743 S387->fos |= 0xffff0000; 743 S387->fos |= 0xffff0000;
744#endif /* PECULIAR_486 */ 744#endif /* PECULIAR_486 */
745 __copy_to_user(d, &S387->cwd, 7*4); 745 if (__copy_to_user(d, &S387->cwd, 7*4))
746 return -1;
746 RE_ENTRANT_CHECK_ON; 747 RE_ENTRANT_CHECK_ON;
747 748
748 d += 7*4; 749 d += 7*4;
diff --git a/arch/i386/math-emu/fpu_system.h b/arch/i386/math-emu/fpu_system.h
index bf26341c8bde..a3ae28c49ddd 100644
--- a/arch/i386/math-emu/fpu_system.h
+++ b/arch/i386/math-emu/fpu_system.h
@@ -68,6 +68,7 @@
68 68
69#define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \ 69#define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \
70 math_abort(FPU_info,SIGSEGV) 70 math_abort(FPU_info,SIGSEGV)
71#define FPU_abort math_abort(FPU_info, SIGSEGV)
71 72
72#undef FPU_IGNORE_CODE_SEGV 73#undef FPU_IGNORE_CODE_SEGV
73#ifdef FPU_IGNORE_CODE_SEGV 74#ifdef FPU_IGNORE_CODE_SEGV
diff --git a/arch/i386/math-emu/load_store.c b/arch/i386/math-emu/load_store.c
index 85314be2fef8..eebd6fb1c8a8 100644
--- a/arch/i386/math-emu/load_store.c
+++ b/arch/i386/math-emu/load_store.c
@@ -227,6 +227,8 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
227 case 027: /* fild m64int */ 227 case 027: /* fild m64int */
228 clear_C1(); 228 clear_C1();
229 loaded_tag = FPU_load_int64((long long __user *)data_address); 229 loaded_tag = FPU_load_int64((long long __user *)data_address);
230 if (loaded_tag == TAG_Error)
231 return 0;
230 FPU_settag0(loaded_tag); 232 FPU_settag0(loaded_tag);
231 break; 233 break;
232 case 030: /* fstenv m14/28byte */ 234 case 030: /* fstenv m14/28byte */
diff --git a/arch/i386/math-emu/reg_ld_str.c b/arch/i386/math-emu/reg_ld_str.c
index f06ed41d191d..e976caef6498 100644
--- a/arch/i386/math-emu/reg_ld_str.c
+++ b/arch/i386/math-emu/reg_ld_str.c
@@ -244,7 +244,8 @@ int FPU_load_int64(long long __user *_s)
244 244
245 RE_ENTRANT_CHECK_OFF; 245 RE_ENTRANT_CHECK_OFF;
246 FPU_access_ok(VERIFY_READ, _s, 8); 246 FPU_access_ok(VERIFY_READ, _s, 8);
247 copy_from_user(&s,_s,8); 247 if (copy_from_user(&s,_s,8))
248 FPU_abort;
248 RE_ENTRANT_CHECK_ON; 249 RE_ENTRANT_CHECK_ON;
249 250
250 if (s == 0) 251 if (s == 0)
@@ -907,7 +908,8 @@ int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d)
907 908
908 RE_ENTRANT_CHECK_OFF; 909 RE_ENTRANT_CHECK_OFF;
909 FPU_access_ok(VERIFY_WRITE,d,8); 910 FPU_access_ok(VERIFY_WRITE,d,8);
910 copy_to_user(d, &tll, 8); 911 if (copy_to_user(d, &tll, 8))
912 FPU_abort;
911 RE_ENTRANT_CHECK_ON; 913 RE_ENTRANT_CHECK_ON;
912 914
913 return 1; 915 return 1;
@@ -1336,7 +1338,8 @@ u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d)
1336 I387.soft.fcs &= ~0xf8000000; 1338 I387.soft.fcs &= ~0xf8000000;
1337 I387.soft.fos |= 0xffff0000; 1339 I387.soft.fos |= 0xffff0000;
1338#endif /* PECULIAR_486 */ 1340#endif /* PECULIAR_486 */
1339 __copy_to_user(d, &control_word, 7*4); 1341 if (__copy_to_user(d, &control_word, 7*4))
1342 FPU_abort;
1340 RE_ENTRANT_CHECK_ON; 1343 RE_ENTRANT_CHECK_ON;
1341 d += 0x1c; 1344 d += 0x1c;
1342 } 1345 }
@@ -1359,9 +1362,11 @@ void fsave(fpu_addr_modes addr_modes, u_char __user *data_address)
1359 FPU_access_ok(VERIFY_WRITE,d,80); 1362 FPU_access_ok(VERIFY_WRITE,d,80);
1360 1363
1361 /* Copy all registers in stack order. */ 1364 /* Copy all registers in stack order. */
1362 __copy_to_user(d, register_base+offset, other); 1365 if (__copy_to_user(d, register_base+offset, other))
1366 FPU_abort;
1363 if ( offset ) 1367 if ( offset )
1364 __copy_to_user(d+other, register_base, offset); 1368 if (__copy_to_user(d+other, register_base, offset))
1369 FPU_abort;
1365 RE_ENTRANT_CHECK_ON; 1370 RE_ENTRANT_CHECK_ON;
1366 1371
1367 finit(); 1372 finit();
diff --git a/arch/i386/mm/boot_ioremap.c b/arch/i386/mm/boot_ioremap.c
index 4de11f508c3a..4de95a17a7d4 100644
--- a/arch/i386/mm/boot_ioremap.c
+++ b/arch/i386/mm/boot_ioremap.c
@@ -16,6 +16,7 @@
16 */ 16 */
17 17
18#undef CONFIG_X86_PAE 18#undef CONFIG_X86_PAE
19#undef CONFIG_PARAVIRT
19#include <asm/page.h> 20#include <asm/page.h>
20#include <asm/pgtable.h> 21#include <asm/pgtable.h>
21#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index ddbdb0336f28..103b76e56a94 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -168,7 +168,7 @@ static void __init allocate_pgdat(int nid)
168 if (nid && node_has_online_mem(nid)) 168 if (nid && node_has_online_mem(nid))
169 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; 169 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
170 else { 170 else {
171 NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); 171 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
172 min_low_pfn += PFN_UP(sizeof(pg_data_t)); 172 min_low_pfn += PFN_UP(sizeof(pg_data_t));
173 } 173 }
174} 174}
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index 2581575786c1..aaaa4d225f7e 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -22,9 +22,9 @@
22#include <linux/highmem.h> 22#include <linux/highmem.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/kprobes.h> 24#include <linux/kprobes.h>
25#include <linux/uaccess.h>
25 26
26#include <asm/system.h> 27#include <asm/system.h>
27#include <asm/uaccess.h>
28#include <asm/desc.h> 28#include <asm/desc.h>
29#include <asm/kdebug.h> 29#include <asm/kdebug.h>
30#include <asm/segment.h> 30#include <asm/segment.h>
@@ -167,7 +167,7 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs,
167static int __is_prefetch(struct pt_regs *regs, unsigned long addr) 167static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
168{ 168{
169 unsigned long limit; 169 unsigned long limit;
170 unsigned long instr = get_segment_eip (regs, &limit); 170 unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
171 int scan_more = 1; 171 int scan_more = 1;
172 int prefetch = 0; 172 int prefetch = 0;
173 int i; 173 int i;
@@ -177,9 +177,9 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
177 unsigned char instr_hi; 177 unsigned char instr_hi;
178 unsigned char instr_lo; 178 unsigned char instr_lo;
179 179
180 if (instr > limit) 180 if (instr > (unsigned char *)limit)
181 break; 181 break;
182 if (__get_user(opcode, (unsigned char __user *) instr)) 182 if (probe_kernel_address(instr, opcode))
183 break; 183 break;
184 184
185 instr_hi = opcode & 0xf0; 185 instr_hi = opcode & 0xf0;
@@ -204,9 +204,9 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
204 case 0x00: 204 case 0x00:
205 /* Prefetch instruction is 0x0F0D or 0x0F18 */ 205 /* Prefetch instruction is 0x0F0D or 0x0F18 */
206 scan_more = 0; 206 scan_more = 0;
207 if (instr > limit) 207 if (instr > (unsigned char *)limit)
208 break; 208 break;
209 if (__get_user(opcode, (unsigned char __user *) instr)) 209 if (probe_kernel_address(instr, opcode))
210 break; 210 break;
211 prefetch = (instr_lo == 0xF) && 211 prefetch = (instr_lo == 0xF) &&
212 (opcode == 0x0D || opcode == 0x18); 212 (opcode == 0x0D || opcode == 0x18);
diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c
index f9f647cdbc7b..e0fa6cb655a8 100644
--- a/arch/i386/mm/highmem.c
+++ b/arch/i386/mm/highmem.c
@@ -32,7 +32,7 @@ void *kmap_atomic(struct page *page, enum km_type type)
32 unsigned long vaddr; 32 unsigned long vaddr;
33 33
34 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 34 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
35 inc_preempt_count(); 35 pagefault_disable();
36 if (!PageHighMem(page)) 36 if (!PageHighMem(page))
37 return page_address(page); 37 return page_address(page);
38 38
@@ -50,26 +50,22 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
50 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 50 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
51 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); 51 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
52 52
53#ifdef CONFIG_DEBUG_HIGHMEM
54 if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) {
55 dec_preempt_count();
56 preempt_check_resched();
57 return;
58 }
59
60 if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
61 BUG();
62#endif
63 /* 53 /*
64 * Force other mappings to Oops if they'll try to access this pte 54 * Force other mappings to Oops if they'll try to access this pte
65 * without first remap it. Keeping stale mappings around is a bad idea 55 * without first remap it. Keeping stale mappings around is a bad idea
66 * also, in case the page changes cacheability attributes or becomes 56 * also, in case the page changes cacheability attributes or becomes
67 * a protected page in a hypervisor. 57 * a protected page in a hypervisor.
68 */ 58 */
69 kpte_clear_flush(kmap_pte-idx, vaddr); 59 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
60 kpte_clear_flush(kmap_pte-idx, vaddr);
61 else {
62#ifdef CONFIG_DEBUG_HIGHMEM
63 BUG_ON(vaddr < PAGE_OFFSET);
64 BUG_ON(vaddr >= (unsigned long)high_memory);
65#endif
66 }
70 67
71 dec_preempt_count(); 68 pagefault_enable();
72 preempt_check_resched();
73} 69}
74 70
75/* This is the same as kmap_atomic() but can map memory that doesn't 71/* This is the same as kmap_atomic() but can map memory that doesn't
@@ -80,7 +76,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
80 enum fixed_addresses idx; 76 enum fixed_addresses idx;
81 unsigned long vaddr; 77 unsigned long vaddr;
82 78
83 inc_preempt_count(); 79 pagefault_disable();
84 80
85 idx = type + KM_TYPE_NR*smp_processor_id(); 81 idx = type + KM_TYPE_NR*smp_processor_id();
86 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 82 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 1719a8141f81..34728e4afe48 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -17,6 +17,113 @@
17#include <asm/tlb.h> 17#include <asm/tlb.h>
18#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
19 19
20static unsigned long page_table_shareable(struct vm_area_struct *svma,
21 struct vm_area_struct *vma,
22 unsigned long addr, pgoff_t idx)
23{
24 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
25 svma->vm_start;
26 unsigned long sbase = saddr & PUD_MASK;
27 unsigned long s_end = sbase + PUD_SIZE;
28
29 /*
30 * match the virtual addresses, permission and the alignment of the
31 * page table page.
32 */
33 if (pmd_index(addr) != pmd_index(saddr) ||
34 vma->vm_flags != svma->vm_flags ||
35 sbase < svma->vm_start || svma->vm_end < s_end)
36 return 0;
37
38 return saddr;
39}
40
41static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
42{
43 unsigned long base = addr & PUD_MASK;
44 unsigned long end = base + PUD_SIZE;
45
46 /*
47 * check on proper vm_flags and page table alignment
48 */
49 if (vma->vm_flags & VM_MAYSHARE &&
50 vma->vm_start <= base && end <= vma->vm_end)
51 return 1;
52 return 0;
53}
54
55/*
56 * search for a shareable pmd page for hugetlb.
57 */
58static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
59{
60 struct vm_area_struct *vma = find_vma(mm, addr);
61 struct address_space *mapping = vma->vm_file->f_mapping;
62 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
63 vma->vm_pgoff;
64 struct prio_tree_iter iter;
65 struct vm_area_struct *svma;
66 unsigned long saddr;
67 pte_t *spte = NULL;
68
69 if (!vma_shareable(vma, addr))
70 return;
71
72 spin_lock(&mapping->i_mmap_lock);
73 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
74 if (svma == vma)
75 continue;
76
77 saddr = page_table_shareable(svma, vma, addr, idx);
78 if (saddr) {
79 spte = huge_pte_offset(svma->vm_mm, saddr);
80 if (spte) {
81 get_page(virt_to_page(spte));
82 break;
83 }
84 }
85 }
86
87 if (!spte)
88 goto out;
89
90 spin_lock(&mm->page_table_lock);
91 if (pud_none(*pud))
92 pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
93 else
94 put_page(virt_to_page(spte));
95 spin_unlock(&mm->page_table_lock);
96out:
97 spin_unlock(&mapping->i_mmap_lock);
98}
99
100/*
101 * unmap huge page backed by shared pte.
102 *
103 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
104 * indicated by page_count > 1, unmap is achieved by clearing pud and
105 * decrementing the ref count. If count == 1, the pte page is not shared.
106 *
107 * called with vma->vm_mm->page_table_lock held.
108 *
109 * returns: 1 successfully unmapped a shared pte page
110 * 0 the underlying pte page is not shared, or it is the last user
111 */
112int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
113{
114 pgd_t *pgd = pgd_offset(mm, *addr);
115 pud_t *pud = pud_offset(pgd, *addr);
116
117 BUG_ON(page_count(virt_to_page(ptep)) == 0);
118 if (page_count(virt_to_page(ptep)) == 1)
119 return 0;
120
121 pud_clear(pud);
122 put_page(virt_to_page(ptep));
123 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
124 return 1;
125}
126
20pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) 127pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
21{ 128{
22 pgd_t *pgd; 129 pgd_t *pgd;
@@ -25,8 +132,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
25 132
26 pgd = pgd_offset(mm, addr); 133 pgd = pgd_offset(mm, addr);
27 pud = pud_alloc(mm, pgd, addr); 134 pud = pud_alloc(mm, pgd, addr);
28 if (pud) 135 if (pud) {
136 if (pud_none(*pud))
137 huge_pmd_share(mm, addr, pud);
29 pte = (pte_t *) pmd_alloc(mm, pud, addr); 138 pte = (pte_t *) pmd_alloc(mm, pud, addr);
139 }
30 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); 140 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
31 141
32 return pte; 142 return pte;
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 167416155ee4..84697dfc7348 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -192,8 +192,6 @@ static inline int page_kills_ppro(unsigned long pagenr)
192 return 0; 192 return 0;
193} 193}
194 194
195extern int is_available_memory(efi_memory_desc_t *);
196
197int page_is_ram(unsigned long pagenr) 195int page_is_ram(unsigned long pagenr)
198{ 196{
199 int i; 197 int i;
@@ -699,8 +697,8 @@ int remove_memory(u64 start, u64 size)
699#endif 697#endif
700#endif 698#endif
701 699
702kmem_cache_t *pgd_cache; 700struct kmem_cache *pgd_cache;
703kmem_cache_t *pmd_cache; 701struct kmem_cache *pmd_cache;
704 702
705void __init pgtable_cache_init(void) 703void __init pgtable_cache_init(void)
706{ 704{
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 8564b6ae17e3..ad91528bdc14 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -67,11 +67,17 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
67 return base; 67 return base;
68} 68}
69 69
70static void flush_kernel_map(void *dummy) 70static void flush_kernel_map(void *arg)
71{ 71{
72 /* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */ 72 unsigned long adr = (unsigned long)arg;
73 if (boot_cpu_data.x86_model >= 4) 73
74 if (adr && cpu_has_clflush) {
75 int i;
76 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
77 asm volatile("clflush (%0)" :: "r" (adr + i));
78 } else if (boot_cpu_data.x86_model >= 4)
74 wbinvd(); 79 wbinvd();
80
75 /* Flush all to work around Errata in early athlons regarding 81 /* Flush all to work around Errata in early athlons regarding
76 * large page flushing. 82 * large page flushing.
77 */ 83 */
@@ -173,9 +179,9 @@ __change_page_attr(struct page *page, pgprot_t prot)
173 return 0; 179 return 0;
174} 180}
175 181
176static inline void flush_map(void) 182static inline void flush_map(void *adr)
177{ 183{
178 on_each_cpu(flush_kernel_map, NULL, 1, 1); 184 on_each_cpu(flush_kernel_map, adr, 1, 1);
179} 185}
180 186
181/* 187/*
@@ -217,9 +223,13 @@ void global_flush_tlb(void)
217 spin_lock_irq(&cpa_lock); 223 spin_lock_irq(&cpa_lock);
218 list_replace_init(&df_list, &l); 224 list_replace_init(&df_list, &l);
219 spin_unlock_irq(&cpa_lock); 225 spin_unlock_irq(&cpa_lock);
220 flush_map(); 226 if (!cpu_has_clflush)
221 list_for_each_entry_safe(pg, next, &l, lru) 227 flush_map(0);
228 list_for_each_entry_safe(pg, next, &l, lru) {
229 if (cpu_has_clflush)
230 flush_map(page_address(pg));
222 __free_page(pg); 231 __free_page(pg);
232 }
223} 233}
224 234
225#ifdef CONFIG_DEBUG_PAGEALLOC 235#ifdef CONFIG_DEBUG_PAGEALLOC
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 10126e3f8174..f349eaf450b0 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -95,8 +95,11 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
95 return; 95 return;
96 } 96 }
97 pte = pte_offset_kernel(pmd, vaddr); 97 pte = pte_offset_kernel(pmd, vaddr);
98 /* <pfn,flags> stored as-is, to permit clearing entries */ 98 if (pgprot_val(flags))
99 set_pte(pte, pfn_pte(pfn, flags)); 99 /* <pfn,flags> stored as-is, to permit clearing entries */
100 set_pte(pte, pfn_pte(pfn, flags));
101 else
102 pte_clear(&init_mm, vaddr, pte);
100 103
101 /* 104 /*
102 * It's enough to flush this one mapping. 105 * It's enough to flush this one mapping.
@@ -193,7 +196,7 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
193 return pte; 196 return pte;
194} 197}
195 198
196void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) 199void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
197{ 200{
198 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); 201 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
199} 202}
@@ -233,7 +236,7 @@ static inline void pgd_list_del(pgd_t *pgd)
233 set_page_private(next, (unsigned long)pprev); 236 set_page_private(next, (unsigned long)pprev);
234} 237}
235 238
236void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) 239void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
237{ 240{
238 unsigned long flags; 241 unsigned long flags;
239 242
@@ -253,7 +256,7 @@ void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
253} 256}
254 257
255/* never called when PTRS_PER_PMD > 1 */ 258/* never called when PTRS_PER_PMD > 1 */
256void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) 259void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
257{ 260{
258 unsigned long flags; /* can be called from interrupt context */ 261 unsigned long flags; /* can be called from interrupt context */
259 262
diff --git a/arch/i386/pci/early.c b/arch/i386/pci/early.c
index 713d6c866cae..42df4b6606df 100644
--- a/arch/i386/pci/early.c
+++ b/arch/i386/pci/early.c
@@ -45,6 +45,13 @@ void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
45 outl(val, 0xcfc); 45 outl(val, 0xcfc);
46} 46}
47 47
48void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
49{
50 PDprintk("%x writing to %x: %x\n", slot, offset, val);
51 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
52 outb(val, 0xcfc);
53}
54
48int early_pci_allowed(void) 55int early_pci_allowed(void)
49{ 56{
50 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) == 57 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) ==
diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c
index e65551cd8216..f2cb942f8281 100644
--- a/arch/i386/pci/irq.c
+++ b/arch/i386/pci/irq.c
@@ -764,7 +764,7 @@ static void __init pirq_find_router(struct irq_router *r)
764 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", 764 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
765 rt->rtr_vendor, rt->rtr_device); 765 rt->rtr_vendor, rt->rtr_device);
766 766
767 pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn); 767 pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
768 if (!pirq_router_dev) { 768 if (!pirq_router_dev) {
769 DBG(KERN_DEBUG "PCI: Interrupt router not found at " 769 DBG(KERN_DEBUG "PCI: Interrupt router not found at "
770 "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); 770 "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
@@ -784,6 +784,8 @@ static void __init pirq_find_router(struct irq_router *r)
784 pirq_router_dev->vendor, 784 pirq_router_dev->vendor,
785 pirq_router_dev->device, 785 pirq_router_dev->device,
786 pci_name(pirq_router_dev)); 786 pci_name(pirq_router_dev));
787
788 /* The device remains referenced for the kernel lifetime */
787} 789}
788 790
789static struct irq_info *pirq_get_info(struct pci_dev *dev) 791static struct irq_info *pirq_get_info(struct pci_dev *dev)
diff --git a/arch/i386/pci/pcbios.c b/arch/i386/pci/pcbios.c
index ed1512a175ab..5f5193401bea 100644
--- a/arch/i386/pci/pcbios.c
+++ b/arch/i386/pci/pcbios.c
@@ -5,6 +5,7 @@
5#include <linux/pci.h> 5#include <linux/pci.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/uaccess.h>
8#include "pci.h" 9#include "pci.h"
9#include "pci-functions.h" 10#include "pci-functions.h"
10 11
@@ -314,6 +315,10 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
314 for (check = (union bios32 *) __va(0xe0000); 315 for (check = (union bios32 *) __va(0xe0000);
315 check <= (union bios32 *) __va(0xffff0); 316 check <= (union bios32 *) __va(0xffff0);
316 ++check) { 317 ++check) {
318 long sig;
319 if (probe_kernel_address(&check->fields.signature, sig))
320 continue;
321
317 if (check->fields.signature != BIOS32_SIGNATURE) 322 if (check->fields.signature != BIOS32_SIGNATURE)
318 continue; 323 continue;
319 length = check->fields.length * 16; 324 length = check->fields.length * 16;
@@ -331,11 +336,13 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
331 } 336 }
332 DBG("PCI: BIOS32 Service Directory structure at 0x%p\n", check); 337 DBG("PCI: BIOS32 Service Directory structure at 0x%p\n", check);
333 if (check->fields.entry >= 0x100000) { 338 if (check->fields.entry >= 0x100000) {
334 printk("PCI: BIOS32 entry (0x%p) in high memory, cannot use.\n", check); 339 printk("PCI: BIOS32 entry (0x%p) in high memory, "
340 "cannot use.\n", check);
335 return NULL; 341 return NULL;
336 } else { 342 } else {
337 unsigned long bios32_entry = check->fields.entry; 343 unsigned long bios32_entry = check->fields.entry;
338 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", bios32_entry); 344 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
345 bios32_entry);
339 bios32_indirect.address = bios32_entry + PAGE_OFFSET; 346 bios32_indirect.address = bios32_entry + PAGE_OFFSET;
340 if (check_pcibios()) 347 if (check_pcibios())
341 return &pci_bios_access; 348 return &pci_bios_access;
diff --git a/arch/i386/power/Makefile b/arch/i386/power/Makefile
index 8cfa4e8a719d..2de7bbf03cd7 100644
--- a/arch/i386/power/Makefile
+++ b/arch/i386/power/Makefile
@@ -1,2 +1,2 @@
1obj-$(CONFIG_PM) += cpu.o 1obj-$(CONFIG_PM) += cpu.o
2obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o 2obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o suspend.o
diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c
index 5a1abeff033b..2c15500f8713 100644
--- a/arch/i386/power/cpu.c
+++ b/arch/i386/power/cpu.c
@@ -26,8 +26,8 @@ void __save_processor_state(struct saved_context *ctxt)
26 /* 26 /*
27 * descriptor tables 27 * descriptor tables
28 */ 28 */
29 store_gdt(&ctxt->gdt_limit); 29 store_gdt(&ctxt->gdt);
30 store_idt(&ctxt->idt_limit); 30 store_idt(&ctxt->idt);
31 store_tr(ctxt->tr); 31 store_tr(ctxt->tr);
32 32
33 /* 33 /*
@@ -99,8 +99,8 @@ void __restore_processor_state(struct saved_context *ctxt)
99 * now restore the descriptor tables to their proper values 99 * now restore the descriptor tables to their proper values
100 * ltr is done i fix_processor_context(). 100 * ltr is done i fix_processor_context().
101 */ 101 */
102 load_gdt(&ctxt->gdt_limit); 102 load_gdt(&ctxt->gdt);
103 load_idt(&ctxt->idt_limit); 103 load_idt(&ctxt->idt);
104 104
105 /* 105 /*
106 * segment registers 106 * segment registers
diff --git a/arch/i386/power/suspend.c b/arch/i386/power/suspend.c
new file mode 100644
index 000000000000..db5e98d2eb73
--- /dev/null
+++ b/arch/i386/power/suspend.c
@@ -0,0 +1,158 @@
1/*
2 * Suspend support specific for i386 - temporary page tables
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 */
8
9#include <linux/suspend.h>
10#include <linux/bootmem.h>
11
12#include <asm/system.h>
13#include <asm/page.h>
14#include <asm/pgtable.h>
15
16/* Defined in arch/i386/power/swsusp.S */
17extern int restore_image(void);
18
19/* Pointer to the temporary resume page tables */
20pgd_t *resume_pg_dir;
21
22/* The following three functions are based on the analogous code in
23 * arch/i386/mm/init.c
24 */
25
26/*
27 * Create a middle page table on a resume-safe page and put a pointer to it in
28 * the given global directory entry. This only returns the gd entry
29 * in non-PAE compilation mode, since the middle layer is folded.
30 */
31static pmd_t *resume_one_md_table_init(pgd_t *pgd)
32{
33 pud_t *pud;
34 pmd_t *pmd_table;
35
36#ifdef CONFIG_X86_PAE
37 pmd_table = (pmd_t *)get_safe_page(GFP_ATOMIC);
38 if (!pmd_table)
39 return NULL;
40
41 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
42 pud = pud_offset(pgd, 0);
43
44 BUG_ON(pmd_table != pmd_offset(pud, 0));
45#else
46 pud = pud_offset(pgd, 0);
47 pmd_table = pmd_offset(pud, 0);
48#endif
49
50 return pmd_table;
51}
52
53/*
54 * Create a page table on a resume-safe page and place a pointer to it in
55 * a middle page directory entry.
56 */
57static pte_t *resume_one_page_table_init(pmd_t *pmd)
58{
59 if (pmd_none(*pmd)) {
60 pte_t *page_table = (pte_t *)get_safe_page(GFP_ATOMIC);
61 if (!page_table)
62 return NULL;
63
64 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
65
66 BUG_ON(page_table != pte_offset_kernel(pmd, 0));
67
68 return page_table;
69 }
70
71 return pte_offset_kernel(pmd, 0);
72}
73
74/*
75 * This maps the physical memory to kernel virtual address space, a total
76 * of max_low_pfn pages, by creating page tables starting from address
77 * PAGE_OFFSET. The page tables are allocated out of resume-safe pages.
78 */
79static int resume_physical_mapping_init(pgd_t *pgd_base)
80{
81 unsigned long pfn;
82 pgd_t *pgd;
83 pmd_t *pmd;
84 pte_t *pte;
85 int pgd_idx, pmd_idx;
86
87 pgd_idx = pgd_index(PAGE_OFFSET);
88 pgd = pgd_base + pgd_idx;
89 pfn = 0;
90
91 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
92 pmd = resume_one_md_table_init(pgd);
93 if (!pmd)
94 return -ENOMEM;
95
96 if (pfn >= max_low_pfn)
97 continue;
98
99 for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD; pmd++, pmd_idx++) {
100 if (pfn >= max_low_pfn)
101 break;
102
103 /* Map with big pages if possible, otherwise create
104 * normal page tables.
105 * NOTE: We can mark everything as executable here
106 */
107 if (cpu_has_pse) {
108 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
109 pfn += PTRS_PER_PTE;
110 } else {
111 pte_t *max_pte;
112
113 pte = resume_one_page_table_init(pmd);
114 if (!pte)
115 return -ENOMEM;
116
117 max_pte = pte + PTRS_PER_PTE;
118 for (; pte < max_pte; pte++, pfn++) {
119 if (pfn >= max_low_pfn)
120 break;
121
122 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
123 }
124 }
125 }
126 }
127 return 0;
128}
129
130static inline void resume_init_first_level_page_table(pgd_t *pg_dir)
131{
132#ifdef CONFIG_X86_PAE
133 int i;
134
135 /* Init entries of the first-level page table to the zero page */
136 for (i = 0; i < PTRS_PER_PGD; i++)
137 set_pgd(pg_dir + i,
138 __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
139#endif
140}
141
142int swsusp_arch_resume(void)
143{
144 int error;
145
146 resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
147 if (!resume_pg_dir)
148 return -ENOMEM;
149
150 resume_init_first_level_page_table(resume_pg_dir);
151 error = resume_physical_mapping_init(resume_pg_dir);
152 if (error)
153 return error;
154
155 /* We have got enough memory and from now on we cannot recover */
156 restore_image();
157 return 0;
158}
diff --git a/arch/i386/power/swsusp.S b/arch/i386/power/swsusp.S
index 8a2b50a0aaad..53662e05b393 100644
--- a/arch/i386/power/swsusp.S
+++ b/arch/i386/power/swsusp.S
@@ -28,8 +28,9 @@ ENTRY(swsusp_arch_suspend)
28 call swsusp_save 28 call swsusp_save
29 ret 29 ret
30 30
31ENTRY(swsusp_arch_resume) 31ENTRY(restore_image)
32 movl $swsusp_pg_dir-__PAGE_OFFSET, %ecx 32 movl resume_pg_dir, %ecx
33 subl $__PAGE_OFFSET, %ecx
33 movl %ecx, %cr3 34 movl %ecx, %cr3
34 35
35 movl restore_pblist, %edx 36 movl restore_pblist, %edx
@@ -51,6 +52,10 @@ copy_loop:
51 .p2align 4,,7 52 .p2align 4,,7
52 53
53done: 54done:
55 /* go back to the original page tables */
56 movl $swapper_pg_dir, %ecx
57 subl $__PAGE_OFFSET, %ecx
58 movl %ecx, %cr3
54 /* Flush TLB, including "global" things (vmalloc) */ 59 /* Flush TLB, including "global" things (vmalloc) */
55 movl mmu_cr4_features, %eax 60 movl mmu_cr4_features, %eax
56 movl %eax, %edx 61 movl %eax, %edx