aboutsummaryrefslogtreecommitdiffstats
path: root/arch/i386
diff options
context:
space:
mode:
authorDave Jones <davej@redhat.com>2006-12-12 17:41:41 -0500
committerDave Jones <davej@redhat.com>2006-12-12 17:41:41 -0500
commitc4366889dda8110247be59ca41fddb82951a8c26 (patch)
tree705c1a996bed8fd48ce94ff33ec9fd00f9b94875 /arch/i386
parentdb2fb9db5735cc532fd4fc55e94b9a3c3750378e (diff)
parente1036502e5263851259d147771226161e5ccc85a (diff)
Merge ../linus
Conflicts: drivers/cpufreq/cpufreq.c
Diffstat (limited to 'arch/i386')
-rw-r--r--arch/i386/Kconfig78
-rw-r--r--arch/i386/Kconfig.cpu27
-rw-r--r--arch/i386/Kconfig.debug10
-rw-r--r--arch/i386/Makefile16
-rw-r--r--arch/i386/Makefile.cpu1
-rw-r--r--arch/i386/boot/compressed/Makefile28
-rw-r--r--arch/i386/boot/compressed/head.S185
-rw-r--r--arch/i386/boot/compressed/misc.c264
-rw-r--r--arch/i386/boot/compressed/relocs.c625
-rw-r--r--arch/i386/boot/compressed/vmlinux.lds43
-rw-r--r--arch/i386/boot/compressed/vmlinux.scr3
-rw-r--r--arch/i386/boot/setup.S42
-rw-r--r--arch/i386/defconfig64
-rw-r--r--arch/i386/kernel/Makefile5
-rw-r--r--arch/i386/kernel/acpi/boot.c20
-rw-r--r--arch/i386/kernel/acpi/cstate.c7
-rw-r--r--arch/i386/kernel/acpi/earlyquirk.c29
-rw-r--r--arch/i386/kernel/alternative.c68
-rw-r--r--arch/i386/kernel/apic.c22
-rw-r--r--arch/i386/kernel/apm.c42
-rw-r--r--arch/i386/kernel/asm-offsets.c39
-rw-r--r--arch/i386/kernel/cpu/amd.c5
-rw-r--r--arch/i386/kernel/cpu/common.c249
-rw-r--r--arch/i386/kernel/cpu/intel.c12
-rw-r--r--arch/i386/kernel/cpu/intel_cacheinfo.c11
-rw-r--r--arch/i386/kernel/cpu/mcheck/non-fatal.c6
-rw-r--r--arch/i386/kernel/cpu/mcheck/therm_throt.c3
-rw-r--r--arch/i386/kernel/cpu/mtrr/Makefile4
-rw-r--r--arch/i386/kernel/cpu/mtrr/amd.c2
-rw-r--r--arch/i386/kernel/cpu/mtrr/centaur.c9
-rw-r--r--arch/i386/kernel/cpu/mtrr/cyrix.c25
-rw-r--r--arch/i386/kernel/cpu/mtrr/generic.c78
-rw-r--r--arch/i386/kernel/cpu/mtrr/if.c31
-rw-r--r--arch/i386/kernel/cpu/mtrr/main.c71
-rw-r--r--arch/i386/kernel/cpu/mtrr/mtrr.h25
-rw-r--r--arch/i386/kernel/cpu/proc.c3
-rw-r--r--arch/i386/kernel/cpuid.c27
-rw-r--r--arch/i386/kernel/crash.c66
-rw-r--r--arch/i386/kernel/e820.c894
-rw-r--r--arch/i386/kernel/efi.c17
-rw-r--r--arch/i386/kernel/entry.S331
-rw-r--r--arch/i386/kernel/head.S68
-rw-r--r--arch/i386/kernel/hpet.c7
-rw-r--r--arch/i386/kernel/i8253.c2
-rw-r--r--arch/i386/kernel/i8259.c12
-rw-r--r--arch/i386/kernel/io_apic.c179
-rw-r--r--arch/i386/kernel/irq.c2
-rw-r--r--arch/i386/kernel/kprobes.c26
-rw-r--r--arch/i386/kernel/ldt.c4
-rw-r--r--arch/i386/kernel/mca.c13
-rw-r--r--arch/i386/kernel/microcode.c4
-rw-r--r--arch/i386/kernel/module.c15
-rw-r--r--arch/i386/kernel/mpparse.c2
-rw-r--r--arch/i386/kernel/msr.c31
-rw-r--r--arch/i386/kernel/nmi.c60
-rw-r--r--arch/i386/kernel/paravirt.c569
-rw-r--r--arch/i386/kernel/pci-dma.c10
-rw-r--r--arch/i386/kernel/process.c91
-rw-r--r--arch/i386/kernel/ptrace.c18
-rw-r--r--arch/i386/kernel/quirks.c69
-rw-r--r--arch/i386/kernel/reboot.c1
-rw-r--r--arch/i386/kernel/setup.c855
-rw-r--r--arch/i386/kernel/signal.c6
-rw-r--r--arch/i386/kernel/smp.c10
-rw-r--r--arch/i386/kernel/smpboot.c84
-rw-r--r--arch/i386/kernel/sysenter.c6
-rw-r--r--arch/i386/kernel/time.c15
-rw-r--r--arch/i386/kernel/time_hpet.c15
-rw-r--r--arch/i386/kernel/topology.c8
-rw-r--r--arch/i386/kernel/traps.c163
-rw-r--r--arch/i386/kernel/tsc.c11
-rw-r--r--arch/i386/kernel/vm86.c121
-rw-r--r--arch/i386/kernel/vmlinux.lds.S161
-rw-r--r--arch/i386/lib/usercopy.c3
-rw-r--r--arch/i386/mach-generic/probe.c4
-rw-r--r--arch/i386/mach-visws/setup.c3
-rw-r--r--arch/i386/mach-visws/visws_apic.c7
-rw-r--r--arch/i386/mach-voyager/voyager_cat.c6
-rw-r--r--arch/i386/mach-voyager/voyager_smp.c14
-rw-r--r--arch/i386/math-emu/fpu_emu.h1
-rw-r--r--arch/i386/math-emu/fpu_entry.c3
-rw-r--r--arch/i386/math-emu/fpu_system.h1
-rw-r--r--arch/i386/math-emu/load_store.c2
-rw-r--r--arch/i386/math-emu/reg_ld_str.c15
-rw-r--r--arch/i386/mm/boot_ioremap.c1
-rw-r--r--arch/i386/mm/discontig.c2
-rw-r--r--arch/i386/mm/fault.c12
-rw-r--r--arch/i386/mm/highmem.c26
-rw-r--r--arch/i386/mm/hugetlbpage.c112
-rw-r--r--arch/i386/mm/init.c6
-rw-r--r--arch/i386/mm/pageattr.c24
-rw-r--r--arch/i386/mm/pgtable.c13
-rw-r--r--arch/i386/pci/common.c60
-rw-r--r--arch/i386/pci/early.c7
-rw-r--r--arch/i386/pci/fixup.c62
-rw-r--r--arch/i386/pci/i386.c73
-rw-r--r--arch/i386/pci/irq.c14
-rw-r--r--arch/i386/pci/mmconfig.c35
-rw-r--r--arch/i386/pci/pcbios.c11
-rw-r--r--arch/i386/pci/pci.h8
-rw-r--r--arch/i386/power/Makefile2
-rw-r--r--arch/i386/power/cpu.c8
-rw-r--r--arch/i386/power/suspend.c158
-rw-r--r--arch/i386/power/swsusp.S9
104 files changed, 4608 insertions, 2219 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 8ff1c6fb5aa1..0d67a0a1151e 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -49,6 +49,11 @@ config GENERIC_IOMAP
49 bool 49 bool
50 default y 50 default y
51 51
52config GENERIC_BUG
53 bool
54 default y
55 depends on BUG
56
52config GENERIC_HWEIGHT 57config GENERIC_HWEIGHT
53 bool 58 bool
54 default y 59 default y
@@ -182,6 +187,18 @@ config X86_ES7000
182 187
183endchoice 188endchoice
184 189
190config PARAVIRT
191 bool "Paravirtualization support (EXPERIMENTAL)"
192 depends on EXPERIMENTAL
193 depends on !(X86_VISWS || X86_VOYAGER)
194 help
195 Paravirtualization is a way of running multiple instances of
196 Linux on the same machine, under a hypervisor. This option
197 changes the kernel so it can modify itself when it is run
198 under a hypervisor, improving performance significantly.
199 However, when run without a hypervisor the kernel is
200 theoretically slower. If in doubt, say N.
201
185config ACPI_SRAT 202config ACPI_SRAT
186 bool 203 bool
187 default y 204 default y
@@ -443,7 +460,8 @@ source "drivers/firmware/Kconfig"
443 460
444choice 461choice
445 prompt "High Memory Support" 462 prompt "High Memory Support"
446 default NOHIGHMEM 463 default HIGHMEM4G if !X86_NUMAQ
464 default HIGHMEM64G if X86_NUMAQ
447 465
448config NOHIGHMEM 466config NOHIGHMEM
449 bool "off" 467 bool "off"
@@ -710,20 +728,6 @@ config BOOT_IOREMAP
710 depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) 728 depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI))
711 default y 729 default y
712 730
713config REGPARM
714 bool "Use register arguments"
715 default y
716 help
717 Compile the kernel with -mregparm=3. This instructs gcc to use
718 a more efficient function call ABI which passes the first three
719 arguments of a function call via registers, which results in denser
720 and faster code.
721
722 If this option is disabled, then the default ABI of passing
723 arguments via the stack is used.
724
725 If unsure, say Y.
726
727config SECCOMP 731config SECCOMP
728 bool "Enable seccomp to safely compute untrusted bytecode" 732 bool "Enable seccomp to safely compute untrusted bytecode"
729 depends on PROC_FS 733 depends on PROC_FS
@@ -773,23 +777,39 @@ config CRASH_DUMP
773 PHYSICAL_START. 777 PHYSICAL_START.
774 For more details see Documentation/kdump/kdump.txt 778 For more details see Documentation/kdump/kdump.txt
775 779
776config PHYSICAL_START 780config RELOCATABLE
777 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 781 bool "Build a relocatable kernel(EXPERIMENTAL)"
782 depends on EXPERIMENTAL
783 help
784 This builds a kernel image that retains relocation information
785 so it can be loaded someplace besides the default 1MB.
786 The relocations tend to make the kernel binary about 10% larger,
787 but are discarded at runtime.
788
789 One use is for the kexec on panic case where the recovery kernel
790 must live at a different physical address than the primary
791 kernel.
778 792
779 default "0x1000000" if CRASH_DUMP 793config PHYSICAL_ALIGN
794 hex "Alignment value to which kernel should be aligned"
780 default "0x100000" 795 default "0x100000"
796 range 0x2000 0x400000
781 help 797 help
782 This gives the physical address where the kernel is loaded. Normally 798 This value puts the alignment restrictions on physical address
783 for regular kernels this value is 0x100000 (1MB). But in the case 799 where kernel is loaded and run from. Kernel is compiled for an
784 of kexec on panic the fail safe kernel needs to run at a different 800 address which meets above alignment restriction.
785 address than the panic-ed kernel. This option is used to set the load 801
786 address for kernels used to capture crash dump on being kexec'ed 802 If bootloader loads the kernel at a non-aligned address and
787 after panic. The default value for crash dump kernels is 803 CONFIG_RELOCATABLE is set, kernel will move itself to nearest
788 0x1000000 (16MB). This can also be set based on the "X" value as 804 address aligned to above value and run from there.
789 specified in the "crashkernel=YM@XM" command line boot parameter 805
790 passed to the panic-ed kernel. Typically this parameter is set as 806 If bootloader loads the kernel at a non-aligned address and
791 crashkernel=64M@16M. Please take a look at 807 CONFIG_RELOCATABLE is not set, kernel will ignore the run time
792 Documentation/kdump/kdump.txt for more details about crash dumps. 808 load address and decompress itself to the address it has been
809 compiled for and run from there. The address for which kernel is
810 compiled already meets above alignment restrictions. Hence the
811 end result is that kernel runs from a physical address meeting
812 above alignment restrictions.
793 813
794 Don't change this unless you know what you are doing. 814 Don't change this unless you know what you are doing.
795 815
diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu
index fc4f2abccf06..2aecfba4ac4f 100644
--- a/arch/i386/Kconfig.cpu
+++ b/arch/i386/Kconfig.cpu
@@ -103,8 +103,15 @@ config MPENTIUMM
103 Select this for Intel Pentium M (not Pentium-4 M) 103 Select this for Intel Pentium M (not Pentium-4 M)
104 notebook chips. 104 notebook chips.
105 105
106config MCORE2
107 bool "Core 2/newer Xeon"
108 help
109 Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and 53xx)
110 CPUs. You can distinguish newer from older Xeons by the CPU family
111 in /proc/cpuinfo. Newer ones have 6.
112
106config MPENTIUM4 113config MPENTIUM4
107 bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/Xeon" 114 bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon"
108 help 115 help
109 Select this for Intel Pentium 4 chips. This includes the 116 Select this for Intel Pentium 4 chips. This includes the
110 Pentium 4, P4-based Celeron and Xeon, and Pentium-4 M 117 Pentium 4, P4-based Celeron and Xeon, and Pentium-4 M
@@ -229,7 +236,7 @@ config X86_L1_CACHE_SHIFT
229 default "7" if MPENTIUM4 || X86_GENERIC 236 default "7" if MPENTIUM4 || X86_GENERIC
230 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 237 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
231 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 238 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
232 default "6" if MK7 || MK8 || MPENTIUMM 239 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2
233 240
234config RWSEM_GENERIC_SPINLOCK 241config RWSEM_GENERIC_SPINLOCK
235 bool 242 bool
@@ -241,6 +248,14 @@ config RWSEM_XCHGADD_ALGORITHM
241 depends on !M386 248 depends on !M386
242 default y 249 default y
243 250
251config ARCH_HAS_ILOG2_U32
252 bool
253 default n
254
255config ARCH_HAS_ILOG2_U64
256 bool
257 default n
258
244config GENERIC_CALIBRATE_DELAY 259config GENERIC_CALIBRATE_DELAY
245 bool 260 bool
246 default y 261 default y
@@ -287,17 +302,17 @@ config X86_ALIGNMENT_16
287 302
288config X86_GOOD_APIC 303config X86_GOOD_APIC
289 bool 304 bool
290 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON 305 depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2
291 default y 306 default y
292 307
293config X86_INTEL_USERCOPY 308config X86_INTEL_USERCOPY
294 bool 309 bool
295 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON 310 depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
296 default y 311 default y
297 312
298config X86_USE_PPRO_CHECKSUM 313config X86_USE_PPRO_CHECKSUM
299 bool 314 bool
300 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX 315 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
301 default y 316 default y
302 317
303config X86_USE_3DNOW 318config X86_USE_3DNOW
@@ -312,5 +327,5 @@ config X86_OOSTORE
312 327
313config X86_TSC 328config X86_TSC
314 bool 329 bool
315 depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX) && !X86_NUMAQ 330 depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ
316 default y 331 default y
diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug
index b31c0802e1cc..f68cc6f215f8 100644
--- a/arch/i386/Kconfig.debug
+++ b/arch/i386/Kconfig.debug
@@ -85,4 +85,14 @@ config DOUBLEFAULT
85 option saves about 4k and might cause you much additional grey 85 option saves about 4k and might cause you much additional grey
86 hair. 86 hair.
87 87
88config DEBUG_PARAVIRT
89 bool "Enable some paravirtualization debugging"
90 default y
91 depends on PARAVIRT && DEBUG_KERNEL
92 help
93 Currently deliberately clobbers regs which are allowed to be
94 clobbered in inlined paravirt hooks, even in native mode.
95 If turning this off solves a problem, then DISABLE_INTERRUPTS() or
96 ENABLE_INTERRUPTS() is lying about what registers can be clobbered.
97
88endmenu 98endmenu
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 7cc0b189b82b..f7ac1aea1d8a 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -26,10 +26,12 @@ endif
26 26
27LDFLAGS := -m elf_i386 27LDFLAGS := -m elf_i386
28OBJCOPYFLAGS := -O binary -R .note -R .comment -S 28OBJCOPYFLAGS := -O binary -R .note -R .comment -S
29LDFLAGS_vmlinux := 29ifdef CONFIG_RELOCATABLE
30LDFLAGS_vmlinux := --emit-relocs
31endif
30CHECKFLAGS += -D__i386__ 32CHECKFLAGS += -D__i386__
31 33
32CFLAGS += -pipe -msoft-float 34CFLAGS += -pipe -msoft-float -mregparm=3
33 35
34# prevent gcc from keeping the stack 16 byte aligned 36# prevent gcc from keeping the stack 16 byte aligned
35CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) 37CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
@@ -37,11 +39,13 @@ CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
37# CPU-specific tuning. Anything which can be shared with UML should go here. 39# CPU-specific tuning. Anything which can be shared with UML should go here.
38include $(srctree)/arch/i386/Makefile.cpu 40include $(srctree)/arch/i386/Makefile.cpu
39 41
40cflags-$(CONFIG_REGPARM) += -mregparm=3
41
42# temporary until string.h is fixed 42# temporary until string.h is fixed
43cflags-y += -ffreestanding 43cflags-y += -ffreestanding
44 44
45# this works around some issues with generating unwind tables in older gccs
46# newer gccs do it by default
47cflags-y += -maccumulate-outgoing-args
48
45# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use 49# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
46# a lot more stack due to the lack of sharing of stacklots: 50# a lot more stack due to the lack of sharing of stacklots:
47CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-option,-fno-unit-at-a-time); fi ;) 51CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-option,-fno-unit-at-a-time); fi ;)
@@ -51,8 +55,8 @@ cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
51AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) 55AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
52 56
53# is .cfi_signal_frame supported too? 57# is .cfi_signal_frame supported too?
54cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) 58cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
55AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) 59AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
56 60
57CFLAGS += $(cflags-y) 61CFLAGS += $(cflags-y)
58 62
diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu
index a11befba26d5..a32c031c90d7 100644
--- a/arch/i386/Makefile.cpu
+++ b/arch/i386/Makefile.cpu
@@ -32,6 +32,7 @@ cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586)
32cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) 32cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
33cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 33cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
34cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) 34cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
35cflags-$(CONFIG_MCORE2) += -march=i686 $(call cc-option,-mtune=core2,$(call cc-option,-mtune=generic,-mtune=i686))
35 36
36# AMD Elan support 37# AMD Elan support
37cflags-$(CONFIG_X86_ELAN) += -march=i486 38cflags-$(CONFIG_X86_ELAN) += -march=i486
diff --git a/arch/i386/boot/compressed/Makefile b/arch/i386/boot/compressed/Makefile
index 258ea95224f6..a661217f33ec 100644
--- a/arch/i386/boot/compressed/Makefile
+++ b/arch/i386/boot/compressed/Makefile
@@ -4,22 +4,42 @@
4# create a compressed vmlinux image from the original vmlinux 4# create a compressed vmlinux image from the original vmlinux
5# 5#
6 6
7targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o 7targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o \
8 vmlinux.bin.all vmlinux.relocs
8EXTRA_AFLAGS := -traditional 9EXTRA_AFLAGS := -traditional
9 10
10LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 11LDFLAGS_vmlinux := -T
12CFLAGS_misc.o += -fPIC
13hostprogs-y := relocs
11 14
12$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE 15$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
13 $(call if_changed,ld) 16 $(call if_changed,ld)
14 @: 17 @:
15 18
16$(obj)/vmlinux.bin: vmlinux FORCE 19$(obj)/vmlinux.bin: vmlinux FORCE
17 $(call if_changed,objcopy) 20 $(call if_changed,objcopy)
18 21
22quiet_cmd_relocs = RELOCS $@
23 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
24$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
25 $(call if_changed,relocs)
26
27vmlinux.bin.all-y := $(obj)/vmlinux.bin
28vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
29quiet_cmd_relocbin = BUILD $@
30 cmd_relocbin = cat $(filter-out FORCE,$^) > $@
31$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
32 $(call if_changed,relocbin)
33
34ifdef CONFIG_RELOCATABLE
35$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
36 $(call if_changed,gzip)
37else
19$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE 38$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
20 $(call if_changed,gzip) 39 $(call if_changed,gzip)
40endif
21 41
22LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T 42LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
23 43
24$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE 44$(obj)/piggy.o: $(src)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
25 $(call if_changed,ld) 45 $(call if_changed,ld)
diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S
index b5893e4ecd37..f395a4bb38bb 100644
--- a/arch/i386/boot/compressed/head.S
+++ b/arch/i386/boot/compressed/head.S
@@ -26,9 +26,11 @@
26#include <linux/linkage.h> 26#include <linux/linkage.h>
27#include <asm/segment.h> 27#include <asm/segment.h>
28#include <asm/page.h> 28#include <asm/page.h>
29#include <asm/boot.h>
29 30
31.section ".text.head"
30 .globl startup_32 32 .globl startup_32
31 33
32startup_32: 34startup_32:
33 cld 35 cld
34 cli 36 cli
@@ -37,93 +39,142 @@ startup_32:
37 movl %eax,%es 39 movl %eax,%es
38 movl %eax,%fs 40 movl %eax,%fs
39 movl %eax,%gs 41 movl %eax,%gs
42 movl %eax,%ss
40 43
41 lss stack_start,%esp 44/* Calculate the delta between where we were compiled to run
42 xorl %eax,%eax 45 * at and where we were actually loaded at. This can only be done
431: incl %eax # check that A20 really IS enabled 46 * with a short local call on x86. Nothing else will tell us what
44 movl %eax,0x000000 # loop forever if it isn't 47 * address we are running at. The reserved chunk of the real-mode
45 cmpl %eax,0x100000 48 * data at 0x34-0x3f are used as the stack for this calculation.
46 je 1b 49 * Only 4 bytes are needed.
50 */
51 leal 0x40(%esi), %esp
52 call 1f
531: popl %ebp
54 subl $1b, %ebp
55
56/* %ebp contains the address we are loaded at by the boot loader and %ebx
57 * contains the address where we should move the kernel image temporarily
58 * for safe in-place decompression.
59 */
60
61#ifdef CONFIG_RELOCATABLE
62 movl %ebp, %ebx
63 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
64 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
65#else
66 movl $LOAD_PHYSICAL_ADDR, %ebx
67#endif
68
69 /* Replace the compressed data size with the uncompressed size */
70 subl input_len(%ebp), %ebx
71 movl output_len(%ebp), %eax
72 addl %eax, %ebx
73 /* Add 8 bytes for every 32K input block */
74 shrl $12, %eax
75 addl %eax, %ebx
76 /* Add 32K + 18 bytes of extra slack */
77 addl $(32768 + 18), %ebx
78 /* Align on a 4K boundary */
79 addl $4095, %ebx
80 andl $~4095, %ebx
81
82/* Copy the compressed kernel to the end of our buffer
83 * where decompression in place becomes safe.
84 */
85 pushl %esi
86 leal _end(%ebp), %esi
87 leal _end(%ebx), %edi
88 movl $(_end - startup_32), %ecx
89 std
90 rep
91 movsb
92 cld
93 popl %esi
94
95/* Compute the kernel start address.
96 */
97#ifdef CONFIG_RELOCATABLE
98 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
99 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
100#else
101 movl $LOAD_PHYSICAL_ADDR, %ebp
102#endif
47 103
48/* 104/*
49 * Initialize eflags. Some BIOS's leave bits like NT set. This would 105 * Jump to the relocated address.
50 * confuse the debugger if this code is traced.
51 * XXX - best to initialize before switching to protected mode.
52 */ 106 */
53 pushl $0 107 leal relocated(%ebx), %eax
54 popfl 108 jmp *%eax
109.section ".text"
110relocated:
111
55/* 112/*
56 * Clear BSS 113 * Clear BSS
57 */ 114 */
58 xorl %eax,%eax 115 xorl %eax,%eax
59 movl $_edata,%edi 116 leal _edata(%ebx),%edi
60 movl $_end,%ecx 117 leal _end(%ebx), %ecx
61 subl %edi,%ecx 118 subl %edi,%ecx
62 cld 119 cld
63 rep 120 rep
64 stosb 121 stosb
122
123/*
124 * Setup the stack for the decompressor
125 */
126 leal stack_end(%ebx), %esp
127
65/* 128/*
66 * Do the decompression, and jump to the new kernel.. 129 * Do the decompression, and jump to the new kernel..
67 */ 130 */
68 subl $16,%esp # place for structure on the stack 131 movl output_len(%ebx), %eax
69 movl %esp,%eax 132 pushl %eax
133 pushl %ebp # output address
134 movl input_len(%ebx), %eax
135 pushl %eax # input_len
136 leal input_data(%ebx), %eax
137 pushl %eax # input_data
138 leal _end(%ebx), %eax
139 pushl %eax # end of the image as third argument
70 pushl %esi # real mode pointer as second arg 140 pushl %esi # real mode pointer as second arg
71 pushl %eax # address of structure as first arg
72 call decompress_kernel 141 call decompress_kernel
73 orl %eax,%eax 142 addl $20, %esp
74 jnz 3f 143 popl %ecx
75 popl %esi # discard address
76 popl %esi # real mode pointer
77 xorl %ebx,%ebx
78 ljmp $(__BOOT_CS), $__PHYSICAL_START
79 144
145#if CONFIG_RELOCATABLE
146/* Find the address of the relocations.
147 */
148 movl %ebp, %edi
149 addl %ecx, %edi
150
151/* Calculate the delta between where vmlinux was compiled to run
152 * and where it was actually loaded.
153 */
154 movl %ebp, %ebx
155 subl $LOAD_PHYSICAL_ADDR, %ebx
156 jz 2f /* Nothing to be done if loaded at compiled addr. */
80/* 157/*
81 * We come here, if we were loaded high. 158 * Process relocations.
82 * We need to move the move-in-place routine down to 0x1000
83 * and then start it with the buffer addresses in registers,
84 * which we got from the stack.
85 */ 159 */
863: 160
87 movl $move_routine_start,%esi 1611: subl $4, %edi
88 movl $0x1000,%edi 162 movl 0(%edi), %ecx
89 movl $move_routine_end,%ecx 163 testl %ecx, %ecx
90 subl %esi,%ecx 164 jz 2f
91 addl $3,%ecx 165 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
92 shrl $2,%ecx 166 jmp 1b
93 cld 1672:
94 rep 168#endif
95 movsl
96
97 popl %esi # discard the address
98 popl %ebx # real mode pointer
99 popl %esi # low_buffer_start
100 popl %ecx # lcount
101 popl %edx # high_buffer_start
102 popl %eax # hcount
103 movl $__PHYSICAL_START,%edi
104 cli # make sure we don't get interrupted
105 ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine
106 169
107/* 170/*
108 * Routine (template) for moving the decompressed kernel in place, 171 * Jump to the decompressed kernel.
109 * if we were high loaded. This _must_ PIC-code !
110 */ 172 */
111move_routine_start:
112 movl %ecx,%ebp
113 shrl $2,%ecx
114 rep
115 movsl
116 movl %ebp,%ecx
117 andl $3,%ecx
118 rep
119 movsb
120 movl %edx,%esi
121 movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0
122 addl $3,%ecx
123 shrl $2,%ecx
124 rep
125 movsl
126 movl %ebx,%esi # Restore setup pointer
127 xorl %ebx,%ebx 173 xorl %ebx,%ebx
128 ljmp $(__BOOT_CS), $__PHYSICAL_START 174 jmp *%ebp
129move_routine_end: 175
176.bss
177.balign 4
178stack:
179 .fill 4096, 1, 0
180stack_end:
diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c
index b2ccd543410d..1ce7017fd627 100644
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -9,11 +9,94 @@
9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 9 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
10 */ 10 */
11 11
12#undef CONFIG_PARAVIRT
12#include <linux/linkage.h> 13#include <linux/linkage.h>
13#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
14#include <linux/screen_info.h> 15#include <linux/screen_info.h>
15#include <asm/io.h> 16#include <asm/io.h>
16#include <asm/page.h> 17#include <asm/page.h>
18#include <asm/boot.h>
19
20/* WARNING!!
21 * This code is compiled with -fPIC and it is relocated dynamically
22 * at run time, but no relocation processing is performed.
23 * This means that it is not safe to place pointers in static structures.
24 */
25
26/*
27 * Getting to provable safe in place decompression is hard.
28 * Worst case behaviours need to be analized.
29 * Background information:
30 *
31 * The file layout is:
32 * magic[2]
33 * method[1]
34 * flags[1]
35 * timestamp[4]
36 * extraflags[1]
37 * os[1]
38 * compressed data blocks[N]
39 * crc[4] orig_len[4]
40 *
41 * resulting in 18 bytes of non compressed data overhead.
42 *
43 * Files divided into blocks
44 * 1 bit (last block flag)
45 * 2 bits (block type)
46 *
47 * 1 block occurs every 32K -1 bytes or when 50% compression has been achieved.
48 * The smallest block type encoding is always used.
49 *
50 * stored:
51 * 32 bits length in bytes.
52 *
53 * fixed:
54 * magic fixed tree.
55 * symbols.
56 *
57 * dynamic:
58 * dynamic tree encoding.
59 * symbols.
60 *
61 *
62 * The buffer for decompression in place is the length of the
63 * uncompressed data, plus a small amount extra to keep the algorithm safe.
64 * The compressed data is placed at the end of the buffer. The output
65 * pointer is placed at the start of the buffer and the input pointer
66 * is placed where the compressed data starts. Problems will occur
67 * when the output pointer overruns the input pointer.
68 *
69 * The output pointer can only overrun the input pointer if the input
70 * pointer is moving faster than the output pointer. A condition only
71 * triggered by data whose compressed form is larger than the uncompressed
72 * form.
73 *
74 * The worst case at the block level is a growth of the compressed data
75 * of 5 bytes per 32767 bytes.
76 *
77 * The worst case internal to a compressed block is very hard to figure.
78 * The worst case can at least be bounded by having one bit that represents
79 * 32764 bytes and then all of the rest of the bytes representing the very
80 * very last byte.
81 *
82 * All of which is enough to compute an amount of extra data that is required
83 * to be safe. To avoid problems at the block level allocating 5 extra bytes
84 * per 32767 bytes of data is sufficient. To avoind problems internal to a block
85 * adding an extra 32767 bytes (the worst case uncompressed block size) is
86 * sufficient, to ensure that in the worst case the decompressed data for
87 * block will stop the byte before the compressed data for a block begins.
88 * To avoid problems with the compressed data's meta information an extra 18
89 * bytes are needed. Leading to the formula:
90 *
91 * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
92 *
93 * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
94 * Adding 32768 instead of 32767 just makes for round numbers.
95 * Adding the decompressor_size is necessary as it must live after all
96 * of the data as well. Last I measured the decompressor is about 14K.
97 * 10K of actual data and 4K of bss.
98 *
99 */
17 100
18/* 101/*
19 * gzip declarations 102 * gzip declarations
@@ -30,15 +113,20 @@ typedef unsigned char uch;
30typedef unsigned short ush; 113typedef unsigned short ush;
31typedef unsigned long ulg; 114typedef unsigned long ulg;
32 115
33#define WSIZE 0x8000 /* Window size must be at least 32k, */ 116#define WSIZE 0x80000000 /* Window size must be at least 32k,
34 /* and a power of two */ 117 * and a power of two
118 * We don't actually have a window just
119 * a huge output buffer so I report
120 * a 2G windows size, as that should
121 * always be larger than our output buffer.
122 */
35 123
36static uch *inbuf; /* input buffer */ 124static uch *inbuf; /* input buffer */
37static uch window[WSIZE]; /* Sliding window buffer */ 125static uch *window; /* Sliding window buffer, (and final output buffer) */
38 126
39static unsigned insize = 0; /* valid bytes in inbuf */ 127static unsigned insize; /* valid bytes in inbuf */
40static unsigned inptr = 0; /* index of next byte to be processed in inbuf */ 128static unsigned inptr; /* index of next byte to be processed in inbuf */
41static unsigned outcnt = 0; /* bytes in output buffer */ 129static unsigned outcnt; /* bytes in output buffer */
42 130
43/* gzip flag byte */ 131/* gzip flag byte */
44#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ 132#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
@@ -89,8 +177,6 @@ extern unsigned char input_data[];
89extern int input_len; 177extern int input_len;
90 178
91static long bytes_out = 0; 179static long bytes_out = 0;
92static uch *output_data;
93static unsigned long output_ptr = 0;
94 180
95static void *malloc(int size); 181static void *malloc(int size);
96static void free(void *where); 182static void free(void *where);
@@ -100,24 +186,17 @@ static void *memcpy(void *dest, const void *src, unsigned n);
100 186
101static void putstr(const char *); 187static void putstr(const char *);
102 188
103extern int end; 189static unsigned long free_mem_ptr;
104static long free_mem_ptr = (long)&end; 190static unsigned long free_mem_end_ptr;
105static long free_mem_end_ptr;
106 191
107#define INPLACE_MOVE_ROUTINE 0x1000
108#define LOW_BUFFER_START 0x2000
109#define LOW_BUFFER_MAX 0x90000
110#define HEAP_SIZE 0x3000 192#define HEAP_SIZE 0x3000
111static unsigned int low_buffer_end, low_buffer_size;
112static int high_loaded =0;
113static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
114 193
115static char *vidmem = (char *)0xb8000; 194static char *vidmem = (char *)0xb8000;
116static int vidport; 195static int vidport;
117static int lines, cols; 196static int lines, cols;
118 197
119#ifdef CONFIG_X86_NUMAQ 198#ifdef CONFIG_X86_NUMAQ
120static void * xquad_portio = NULL; 199void *xquad_portio;
121#endif 200#endif
122 201
123#include "../../../../lib/inflate.c" 202#include "../../../../lib/inflate.c"
@@ -151,7 +230,7 @@ static void gzip_mark(void **ptr)
151 230
152static void gzip_release(void **ptr) 231static void gzip_release(void **ptr)
153{ 232{
154 free_mem_ptr = (long) *ptr; 233 free_mem_ptr = (unsigned long) *ptr;
155} 234}
156 235
157static void scroll(void) 236static void scroll(void)
@@ -179,7 +258,7 @@ static void putstr(const char *s)
179 y--; 258 y--;
180 } 259 }
181 } else { 260 } else {
182 vidmem [ ( x + cols * y ) * 2 ] = c; 261 vidmem [ ( x + cols * y ) * 2 ] = c;
183 if ( ++x >= cols ) { 262 if ( ++x >= cols ) {
184 x = 0; 263 x = 0;
185 if ( ++y >= lines ) { 264 if ( ++y >= lines ) {
@@ -224,58 +303,31 @@ static void* memcpy(void* dest, const void* src, unsigned n)
224 */ 303 */
225static int fill_inbuf(void) 304static int fill_inbuf(void)
226{ 305{
227 if (insize != 0) { 306 error("ran out of input data");
228 error("ran out of input data"); 307 return 0;
229 }
230
231 inbuf = input_data;
232 insize = input_len;
233 inptr = 1;
234 return inbuf[0];
235} 308}
236 309
237/* =========================================================================== 310/* ===========================================================================
238 * Write the output window window[0..outcnt-1] and update crc and bytes_out. 311 * Write the output window window[0..outcnt-1] and update crc and bytes_out.
239 * (Used for the decompressed data only.) 312 * (Used for the decompressed data only.)
240 */ 313 */
241static void flush_window_low(void)
242{
243 ulg c = crc; /* temporary variable */
244 unsigned n;
245 uch *in, *out, ch;
246
247 in = window;
248 out = &output_data[output_ptr];
249 for (n = 0; n < outcnt; n++) {
250 ch = *out++ = *in++;
251 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
252 }
253 crc = c;
254 bytes_out += (ulg)outcnt;
255 output_ptr += (ulg)outcnt;
256 outcnt = 0;
257}
258
259static void flush_window_high(void)
260{
261 ulg c = crc; /* temporary variable */
262 unsigned n;
263 uch *in, ch;
264 in = window;
265 for (n = 0; n < outcnt; n++) {
266 ch = *output_data++ = *in++;
267 if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start;
268 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
269 }
270 crc = c;
271 bytes_out += (ulg)outcnt;
272 outcnt = 0;
273}
274
275static void flush_window(void) 314static void flush_window(void)
276{ 315{
277 if (high_loaded) flush_window_high(); 316 /* With my window equal to my output buffer
278 else flush_window_low(); 317 * I only need to compute the crc here.
318 */
319 ulg c = crc; /* temporary variable */
320 unsigned n;
321 uch *in, ch;
322
323 in = window;
324 for (n = 0; n < outcnt; n++) {
325 ch = *in++;
326 c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
327 }
328 crc = c;
329 bytes_out += (ulg)outcnt;
330 outcnt = 0;
279} 331}
280 332
281static void error(char *x) 333static void error(char *x)
@@ -287,66 +339,8 @@ static void error(char *x)
287 while(1); /* Halt */ 339 while(1); /* Halt */
288} 340}
289 341
290#define STACK_SIZE (4096) 342asmlinkage void decompress_kernel(void *rmode, unsigned long end,
291 343 uch *input_data, unsigned long input_len, uch *output)
292long user_stack [STACK_SIZE];
293
294struct {
295 long * a;
296 short b;
297 } stack_start = { & user_stack [STACK_SIZE] , __BOOT_DS };
298
299static void setup_normal_output_buffer(void)
300{
301#ifdef STANDARD_MEMORY_BIOS_CALL
302 if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory");
303#else
304 if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");
305#endif
306 output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */
307 free_mem_end_ptr = (long)real_mode;
308}
309
310struct moveparams {
311 uch *low_buffer_start; int lcount;
312 uch *high_buffer_start; int hcount;
313};
314
315static void setup_output_buffer_if_we_run_high(struct moveparams *mv)
316{
317 high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);
318#ifdef STANDARD_MEMORY_BIOS_CALL
319 if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
320#else
321 if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
322#endif
323 mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START;
324 low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX
325 ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff;
326 low_buffer_size = low_buffer_end - LOW_BUFFER_START;
327 high_loaded = 1;
328 free_mem_end_ptr = (long)high_buffer_start;
329 if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
330 high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size);
331 mv->hcount = 0; /* say: we need not to move high_buffer */
332 }
333 else mv->hcount = -1;
334 mv->high_buffer_start = high_buffer_start;
335}
336
337static void close_output_buffer_if_we_run_high(struct moveparams *mv)
338{
339 if (bytes_out > low_buffer_size) {
340 mv->lcount = low_buffer_size;
341 if (mv->hcount)
342 mv->hcount = bytes_out - low_buffer_size;
343 } else {
344 mv->lcount = bytes_out;
345 mv->hcount = 0;
346 }
347}
348
349asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode)
350{ 344{
351 real_mode = rmode; 345 real_mode = rmode;
352 346
@@ -361,13 +355,25 @@ asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode)
361 lines = RM_SCREEN_INFO.orig_video_lines; 355 lines = RM_SCREEN_INFO.orig_video_lines;
362 cols = RM_SCREEN_INFO.orig_video_cols; 356 cols = RM_SCREEN_INFO.orig_video_cols;
363 357
364 if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); 358 window = output; /* Output buffer (Normally at 1M) */
365 else setup_output_buffer_if_we_run_high(mv); 359 free_mem_ptr = end; /* Heap */
360 free_mem_end_ptr = end + HEAP_SIZE;
361 inbuf = input_data; /* Input buffer */
362 insize = input_len;
363 inptr = 0;
364
365 if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1))
366 error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
367 if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff))
368 error("Destination address too large");
369#ifndef CONFIG_RELOCATABLE
370 if ((u32)output != LOAD_PHYSICAL_ADDR)
371 error("Wrong destination address");
372#endif
366 373
367 makecrc(); 374 makecrc();
368 putstr("Uncompressing Linux... "); 375 putstr("Uncompressing Linux... ");
369 gunzip(); 376 gunzip();
370 putstr("Ok, booting the kernel.\n"); 377 putstr("Ok, booting the kernel.\n");
371 if (high_loaded) close_output_buffer_if_we_run_high(mv); 378 return;
372 return high_loaded;
373} 379}
diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c
new file mode 100644
index 000000000000..468da89153c4
--- /dev/null
+++ b/arch/i386/boot/compressed/relocs.c
@@ -0,0 +1,625 @@
1#include <stdio.h>
2#include <stdarg.h>
3#include <stdlib.h>
4#include <stdint.h>
5#include <string.h>
6#include <errno.h>
7#include <unistd.h>
8#include <elf.h>
9#include <byteswap.h>
10#define USE_BSD
11#include <endian.h>
12
13#define MAX_SHDRS 100
14static Elf32_Ehdr ehdr;
15static Elf32_Shdr shdr[MAX_SHDRS];
16static Elf32_Sym *symtab[MAX_SHDRS];
17static Elf32_Rel *reltab[MAX_SHDRS];
18static char *strtab[MAX_SHDRS];
19static unsigned long reloc_count, reloc_idx;
20static unsigned long *relocs;
21
22/*
23 * The following symbols have been audited. Their values are constant and do
24 * not change if bzImage is loaded at a different physical address than
25 * the address for which it has been compiled. Don't warn user about
26 * absolute relocations present w.r.t these symbols.
27 */
static const char *safe_abs_relocs[] = {
	"__kernel_vsyscall",
	"__kernel_rt_sigreturn",
	"__kernel_sigreturn",
	"SYSENTER_RETURN",
};

/*
 * Return 1 when sym_name is exactly equal to one of the entries of
 * safe_abs_relocs, 0 otherwise.
 */
static int is_safe_abs_reloc(const char* sym_name)
{
	const char **entry = safe_abs_relocs;
	const char **stop = entry +
		sizeof(safe_abs_relocs) / sizeof(safe_abs_relocs[0]);

	while (entry != stop) {
		if (strcmp(sym_name, *entry) == 0)
			return 1;	/* Match found */
		entry++;
	}
	return 0;
}
48
/*
 * Print a printf-style diagnostic to stderr and terminate the program
 * with exit status 1.  Does not return.
 *
 * fmt is const-qualified because it is only read; every caller passes a
 * string literal.
 */
static void die(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
	exit(1);
}
57
/*
 * Translate an ELF symbol type (STT_*) into the spelling of its constant.
 * Values past the end of the table yield a fixed "unknown" string.
 */
static const char *sym_type(unsigned type)
{
#define SYM_TYPE(X) [X] = #X
	static const char *type_name[] = {
		SYM_TYPE(STT_NOTYPE),
		SYM_TYPE(STT_OBJECT),
		SYM_TYPE(STT_FUNC),
		SYM_TYPE(STT_SECTION),
		SYM_TYPE(STT_FILE),
		SYM_TYPE(STT_COMMON),
		SYM_TYPE(STT_TLS),
	};
#undef SYM_TYPE
	const unsigned known = sizeof(type_name) / sizeof(type_name[0]);

	return (type < known) ? type_name[type] : "unknown sym type name";
}
77
/*
 * Translate an ELF symbol binding (STB_*) into the spelling of its
 * constant.  Values past the end of the table yield a fixed "unknown"
 * string.
 */
static const char *sym_bind(unsigned bind)
{
#define SYM_BIND(X) [X] = #X
	static const char *bind_name[] = {
		SYM_BIND(STB_LOCAL),
		SYM_BIND(STB_GLOBAL),
		SYM_BIND(STB_WEAK),
	};
#undef SYM_BIND
	const unsigned known = sizeof(bind_name) / sizeof(bind_name[0]);

	return (bind < known) ? bind_name[bind] : "unknown sym bind name";
}
93
/*
 * Translate an ELF symbol visibility (STV_*) into the spelling of its
 * constant.  Values past the end of the table yield a fixed "unknown"
 * string.
 */
static const char *sym_visibility(unsigned visibility)
{
#define SYM_VISIBILITY(X) [X] = #X
	static const char *visibility_name[] = {
		SYM_VISIBILITY(STV_DEFAULT),
		SYM_VISIBILITY(STV_INTERNAL),
		SYM_VISIBILITY(STV_HIDDEN),
		SYM_VISIBILITY(STV_PROTECTED),
	};
#undef SYM_VISIBILITY
	const unsigned known =
		sizeof(visibility_name) / sizeof(visibility_name[0]);

	return (visibility < known) ? visibility_name[visibility]
				    : "unknown sym visibility name";
}
110
/*
 * Translate an i386 relocation type (R_386_*) into the spelling of its
 * constant.  Values past the end of the table yield a fixed "unknown"
 * string.
 */
static const char *rel_type(unsigned type)
{
#define REL_TYPE(X) [X] = #X
	static const char *type_name[] = {
		REL_TYPE(R_386_NONE),
		REL_TYPE(R_386_32),
		REL_TYPE(R_386_PC32),
		REL_TYPE(R_386_GOT32),
		REL_TYPE(R_386_PLT32),
		REL_TYPE(R_386_COPY),
		REL_TYPE(R_386_GLOB_DAT),
		REL_TYPE(R_386_JMP_SLOT),
		REL_TYPE(R_386_RELATIVE),
		REL_TYPE(R_386_GOTOFF),
		REL_TYPE(R_386_GOTPC),
	};
#undef REL_TYPE
	const unsigned known = sizeof(type_name) / sizeof(type_name[0]);

	return (type < known) ? type_name[type] : "unknown type rel type name";
}
134
135static const char *sec_name(unsigned shndx)
136{
137 const char *sec_strtab;
138 const char *name;
139 sec_strtab = strtab[ehdr.e_shstrndx];
140 name = "<noname>";
141 if (shndx < ehdr.e_shnum) {
142 name = sec_strtab + shdr[shndx].sh_name;
143 }
144 else if (shndx == SHN_ABS) {
145 name = "ABSOLUTE";
146 }
147 else if (shndx == SHN_COMMON) {
148 name = "COMMON";
149 }
150 return name;
151}
152
/*
 * Return a printable name for sym.
 *
 * A symbol with a non-zero st_name is named by the string at that offset
 * in sym_strtab.  A symbol without a name of its own is reported under
 * the name of the section it refers to.
 *
 * sec_name() takes a section header index, so st_shndx is passed
 * directly.  Passing shdr[st_shndx].sh_name instead would hand it a
 * string-table offset and would also index shdr[] with reserved values
 * such as SHN_ABS, far beyond the array.
 */
static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
{
	if (sym->st_name)
		return sym_strtab + sym->st_name;

	return sec_name(sym->st_shndx);
}
165
166
167
/*
 * The ELF images handled here store multi-byte fields little-endian
 * (read_ehdr() rejects anything that is not ELFDATA2LSB).  The fields are
 * decoded byte by byte, so the result is correct on any build host and
 * does not depend on whether <endian.h> exposes BYTE_ORDER: with that
 * macro undefined, both "#if BYTE_ORDER == ..." tests would evaluate
 * 0 == 0 and select the byte-swapping variant.
 */

/* Convert a 16-bit little-endian ELF field to host byte order. */
static uint16_t elf16_to_cpu(uint16_t val)
{
	unsigned char raw[sizeof(val)];

	memcpy(raw, &val, sizeof(val));
	return (uint16_t)(raw[0] | (raw[1] << 8));
}

/* Convert a 32-bit little-endian ELF field to host byte order. */
static uint32_t elf32_to_cpu(uint32_t val)
{
	unsigned char raw[sizeof(val)];

	memcpy(raw, &val, sizeof(val));
	return (uint32_t)raw[0] |
	       ((uint32_t)raw[1] << 8) |
	       ((uint32_t)raw[2] << 16) |
	       ((uint32_t)raw[3] << 24);
}
186
187static void read_ehdr(FILE *fp)
188{
189 if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
190 die("Cannot read ELF header: %s\n",
191 strerror(errno));
192 }
193 if (memcmp(ehdr.e_ident, ELFMAG, 4) != 0) {
194 die("No ELF magic\n");
195 }
196 if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) {
197 die("Not a 32 bit executable\n");
198 }
199 if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
200 die("Not a LSB ELF executable\n");
201 }
202 if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
203 die("Unknown ELF version\n");
204 }
205 /* Convert the fields to native endian */
206 ehdr.e_type = elf16_to_cpu(ehdr.e_type);
207 ehdr.e_machine = elf16_to_cpu(ehdr.e_machine);
208 ehdr.e_version = elf32_to_cpu(ehdr.e_version);
209 ehdr.e_entry = elf32_to_cpu(ehdr.e_entry);
210 ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff);
211 ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff);
212 ehdr.e_flags = elf32_to_cpu(ehdr.e_flags);
213 ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize);
214 ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize);
215 ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum);
216 ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize);
217 ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum);
218 ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx);
219
220 if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) {
221 die("Unsupported ELF header type\n");
222 }
223 if (ehdr.e_machine != EM_386) {
224 die("Not for x86\n");
225 }
226 if (ehdr.e_version != EV_CURRENT) {
227 die("Unknown ELF version\n");
228 }
229 if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) {
230 die("Bad Elf header size\n");
231 }
232 if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) {
233 die("Bad program header entry\n");
234 }
235 if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) {
236 die("Bad section header entry\n");
237 }
238 if (ehdr.e_shstrndx >= ehdr.e_shnum) {
239 die("String table index out of bounds\n");
240 }
241}
242
243static void read_shdrs(FILE *fp)
244{
245 int i;
246 if (ehdr.e_shnum > MAX_SHDRS) {
247 die("%d section headers supported: %d\n",
248 ehdr.e_shnum, MAX_SHDRS);
249 }
250 if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
251 die("Seek to %d failed: %s\n",
252 ehdr.e_shoff, strerror(errno));
253 }
254 if (fread(&shdr, sizeof(shdr[0]), ehdr.e_shnum, fp) != ehdr.e_shnum) {
255 die("Cannot read ELF section headers: %s\n",
256 strerror(errno));
257 }
258 for(i = 0; i < ehdr.e_shnum; i++) {
259 shdr[i].sh_name = elf32_to_cpu(shdr[i].sh_name);
260 shdr[i].sh_type = elf32_to_cpu(shdr[i].sh_type);
261 shdr[i].sh_flags = elf32_to_cpu(shdr[i].sh_flags);
262 shdr[i].sh_addr = elf32_to_cpu(shdr[i].sh_addr);
263 shdr[i].sh_offset = elf32_to_cpu(shdr[i].sh_offset);
264 shdr[i].sh_size = elf32_to_cpu(shdr[i].sh_size);
265 shdr[i].sh_link = elf32_to_cpu(shdr[i].sh_link);
266 shdr[i].sh_info = elf32_to_cpu(shdr[i].sh_info);
267 shdr[i].sh_addralign = elf32_to_cpu(shdr[i].sh_addralign);
268 shdr[i].sh_entsize = elf32_to_cpu(shdr[i].sh_entsize);
269 }
270
271}
272
273static void read_strtabs(FILE *fp)
274{
275 int i;
276 for(i = 0; i < ehdr.e_shnum; i++) {
277 if (shdr[i].sh_type != SHT_STRTAB) {
278 continue;
279 }
280 strtab[i] = malloc(shdr[i].sh_size);
281 if (!strtab[i]) {
282 die("malloc of %d bytes for strtab failed\n",
283 shdr[i].sh_size);
284 }
285 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
286 die("Seek to %d failed: %s\n",
287 shdr[i].sh_offset, strerror(errno));
288 }
289 if (fread(strtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
290 die("Cannot read symbol table: %s\n",
291 strerror(errno));
292 }
293 }
294}
295
296static void read_symtabs(FILE *fp)
297{
298 int i,j;
299 for(i = 0; i < ehdr.e_shnum; i++) {
300 if (shdr[i].sh_type != SHT_SYMTAB) {
301 continue;
302 }
303 symtab[i] = malloc(shdr[i].sh_size);
304 if (!symtab[i]) {
305 die("malloc of %d bytes for symtab failed\n",
306 shdr[i].sh_size);
307 }
308 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
309 die("Seek to %d failed: %s\n",
310 shdr[i].sh_offset, strerror(errno));
311 }
312 if (fread(symtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
313 die("Cannot read symbol table: %s\n",
314 strerror(errno));
315 }
316 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[i][0]); j++) {
317 symtab[i][j].st_name = elf32_to_cpu(symtab[i][j].st_name);
318 symtab[i][j].st_value = elf32_to_cpu(symtab[i][j].st_value);
319 symtab[i][j].st_size = elf32_to_cpu(symtab[i][j].st_size);
320 symtab[i][j].st_shndx = elf16_to_cpu(symtab[i][j].st_shndx);
321 }
322 }
323}
324
325
326static void read_relocs(FILE *fp)
327{
328 int i,j;
329 for(i = 0; i < ehdr.e_shnum; i++) {
330 if (shdr[i].sh_type != SHT_REL) {
331 continue;
332 }
333 reltab[i] = malloc(shdr[i].sh_size);
334 if (!reltab[i]) {
335 die("malloc of %d bytes for relocs failed\n",
336 shdr[i].sh_size);
337 }
338 if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) {
339 die("Seek to %d failed: %s\n",
340 shdr[i].sh_offset, strerror(errno));
341 }
342 if (fread(reltab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) {
343 die("Cannot read symbol table: %s\n",
344 strerror(errno));
345 }
346 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
347 reltab[i][j].r_offset = elf32_to_cpu(reltab[i][j].r_offset);
348 reltab[i][j].r_info = elf32_to_cpu(reltab[i][j].r_info);
349 }
350 }
351}
352
353
354static void print_absolute_symbols(void)
355{
356 int i;
357 printf("Absolute symbols\n");
358 printf(" Num: Value Size Type Bind Visibility Name\n");
359 for(i = 0; i < ehdr.e_shnum; i++) {
360 char *sym_strtab;
361 Elf32_Sym *sh_symtab;
362 int j;
363 if (shdr[i].sh_type != SHT_SYMTAB) {
364 continue;
365 }
366 sh_symtab = symtab[i];
367 sym_strtab = strtab[shdr[i].sh_link];
368 for(j = 0; j < shdr[i].sh_size/sizeof(symtab[0][0]); j++) {
369 Elf32_Sym *sym;
370 const char *name;
371 sym = &symtab[i][j];
372 name = sym_name(sym_strtab, sym);
373 if (sym->st_shndx != SHN_ABS) {
374 continue;
375 }
376 printf("%5d %08x %5d %10s %10s %12s %s\n",
377 j, sym->st_value, sym->st_size,
378 sym_type(ELF32_ST_TYPE(sym->st_info)),
379 sym_bind(ELF32_ST_BIND(sym->st_info)),
380 sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)),
381 name);
382 }
383 }
384 printf("\n");
385}
386
387static void print_absolute_relocs(void)
388{
389 int i, printed = 0;
390
391 for(i = 0; i < ehdr.e_shnum; i++) {
392 char *sym_strtab;
393 Elf32_Sym *sh_symtab;
394 unsigned sec_applies, sec_symtab;
395 int j;
396 if (shdr[i].sh_type != SHT_REL) {
397 continue;
398 }
399 sec_symtab = shdr[i].sh_link;
400 sec_applies = shdr[i].sh_info;
401 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) {
402 continue;
403 }
404 sh_symtab = symtab[sec_symtab];
405 sym_strtab = strtab[shdr[sec_symtab].sh_link];
406 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
407 Elf32_Rel *rel;
408 Elf32_Sym *sym;
409 const char *name;
410 rel = &reltab[i][j];
411 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
412 name = sym_name(sym_strtab, sym);
413 if (sym->st_shndx != SHN_ABS) {
414 continue;
415 }
416
417 /* Absolute symbols are not relocated if bzImage is
418 * loaded at a non-compiled address. Display a warning
419 * to user at compile time about the absolute
420 * relocations present.
421 *
422 * User need to audit the code to make sure
423 * some symbols which should have been section
424 * relative have not become absolute because of some
425 * linker optimization or wrong programming usage.
426 *
427 * Before warning check if this absolute symbol
428 * relocation is harmless.
429 */
430 if (is_safe_abs_reloc(name))
431 continue;
432
433 if (!printed) {
434 printf("WARNING: Absolute relocations"
435 " present\n");
436 printf("Offset Info Type Sym.Value "
437 "Sym.Name\n");
438 printed = 1;
439 }
440
441 printf("%08x %08x %10s %08x %s\n",
442 rel->r_offset,
443 rel->r_info,
444 rel_type(ELF32_R_TYPE(rel->r_info)),
445 sym->st_value,
446 name);
447 }
448 }
449
450 if (printed)
451 printf("\n");
452}
453
454static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
455{
456 int i;
457 /* Walk through the relocations */
458 for(i = 0; i < ehdr.e_shnum; i++) {
459 char *sym_strtab;
460 Elf32_Sym *sh_symtab;
461 unsigned sec_applies, sec_symtab;
462 int j;
463 if (shdr[i].sh_type != SHT_REL) {
464 continue;
465 }
466 sec_symtab = shdr[i].sh_link;
467 sec_applies = shdr[i].sh_info;
468 if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) {
469 continue;
470 }
471 sh_symtab = symtab[sec_symtab];
472 sym_strtab = strtab[shdr[sec_symtab].sh_link];
473 for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) {
474 Elf32_Rel *rel;
475 Elf32_Sym *sym;
476 unsigned r_type;
477 rel = &reltab[i][j];
478 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
479 r_type = ELF32_R_TYPE(rel->r_info);
480 /* Don't visit relocations to absolute symbols */
481 if (sym->st_shndx == SHN_ABS) {
482 continue;
483 }
484 if (r_type == R_386_PC32) {
485 /* PC relative relocations don't need to be adjusted */
486 }
487 else if (r_type == R_386_32) {
488 /* Visit relocations that need to be adjusted */
489 visit(rel, sym);
490 }
491 else {
492 die("Unsupported relocation type: %d\n", r_type);
493 }
494 }
495 }
496}
497
498static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
499{
500 reloc_count += 1;
501}
502
503static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym)
504{
505 /* Remember the address that needs to be adjusted. */
506 relocs[reloc_idx++] = rel->r_offset;
507}
508
/*
 * qsort() comparator for unsigned long values: returns -1, 0 or 1 when
 * the first value is smaller than, equal to, or larger than the second.
 */
static int cmp_relocs(const void *va, const void *vb)
{
	const unsigned long lhs = *(const unsigned long *)va;
	const unsigned long rhs = *(const unsigned long *)vb;

	if (lhs < rhs)
		return -1;
	return lhs > rhs;
}
515
516static void emit_relocs(int as_text)
517{
518 int i;
519 /* Count how many relocations I have and allocate space for them. */
520 reloc_count = 0;
521 walk_relocs(count_reloc);
522 relocs = malloc(reloc_count * sizeof(relocs[0]));
523 if (!relocs) {
524 die("malloc of %d entries for relocs failed\n",
525 reloc_count);
526 }
527 /* Collect up the relocations */
528 reloc_idx = 0;
529 walk_relocs(collect_reloc);
530
531 /* Order the relocations for more efficient processing */
532 qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs);
533
534 /* Print the relocations */
535 if (as_text) {
536 /* Print the relocations in a form suitable that
537 * gas will like.
538 */
539 printf(".section \".data.reloc\",\"a\"\n");
540 printf(".balign 4\n");
541 for(i = 0; i < reloc_count; i++) {
542 printf("\t .long 0x%08lx\n", relocs[i]);
543 }
544 printf("\n");
545 }
546 else {
547 unsigned char buf[4];
548 buf[0] = buf[1] = buf[2] = buf[3] = 0;
549 /* Print a stop */
550 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
551 /* Now print each relocation */
552 for(i = 0; i < reloc_count; i++) {
553 buf[0] = (relocs[i] >> 0) & 0xff;
554 buf[1] = (relocs[i] >> 8) & 0xff;
555 buf[2] = (relocs[i] >> 16) & 0xff;
556 buf[3] = (relocs[i] >> 24) & 0xff;
557 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]);
558 }
559 }
560}
561
/*
 * Report the command-line synopsis through die(), which writes it to
 * stderr and exits with status 1.
 */
static void usage(void)
{
	die("relocs [--abs-syms |--abs-relocs | --text] vmlinux\n");
}
566
/*
 * Entry point: relocs [--abs-syms | --abs-relocs | --text] vmlinux
 *
 * Reads the ELF header, section headers, string tables, symbol tables
 * and relocation tables of the named file, then either
 *   --abs-syms    prints the absolute symbols,
 *   --abs-relocs  prints a warning table of absolute relocations, or
 *   (default)     emits the relocation list, as assembler text when
 *                 --text is given and as raw binary otherwise.
 *
 * Any unknown option, a second file name, or a missing file name ends
 * in usage().
 */
int main(int argc, char **argv)
{
	int show_absolute_syms, show_absolute_relocs;
	int as_text;
	const char *fname;
	FILE *fp;
	int i;

	show_absolute_syms = 0;
	show_absolute_relocs = 0;
	as_text = 0;
	fname = NULL;
	for (i = 1; i < argc; i++) {
		char *arg = argv[i];
		if (*arg == '-') {
			/*
			 * Compare the argument being examined, not argv[1],
			 * so that options are recognised in any position.
			 */
			if (strcmp(arg, "--abs-syms") == 0) {
				show_absolute_syms = 1;
				continue;
			}
			if (strcmp(arg, "--abs-relocs") == 0) {
				show_absolute_relocs = 1;
				continue;
			}
			if (strcmp(arg, "--text") == 0) {
				as_text = 1;
				continue;
			}
		}
		else if (!fname) {
			fname = arg;
			continue;
		}
		usage();
	}
	if (!fname) {
		usage();
	}
	fp = fopen(fname, "r");
	if (!fp) {
		die("Cannot open %s: %s\n",
			fname, strerror(errno));
	}
	read_ehdr(fp);
	read_shdrs(fp);
	read_strtabs(fp);
	read_symtabs(fp);
	read_relocs(fp);
	/* Everything needed is in memory now; the file is no longer used. */
	fclose(fp);
	if (show_absolute_syms) {
		print_absolute_symbols();
		return 0;
	}
	if (show_absolute_relocs) {
		print_absolute_relocs();
		return 0;
	}
	emit_relocs(as_text);
	return 0;
}
diff --git a/arch/i386/boot/compressed/vmlinux.lds b/arch/i386/boot/compressed/vmlinux.lds
new file mode 100644
index 000000000000..cc4854f6c6c1
--- /dev/null
+++ b/arch/i386/boot/compressed/vmlinux.lds
@@ -0,0 +1,43 @@
1OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
2OUTPUT_ARCH(i386)
3ENTRY(startup_32)
4SECTIONS
5{
6 /* Be careful: parts of head.S assume startup_32 is at
7 * address 0.
8 */
9 . = 0 ;
10 .text.head : {
11 _head = . ;
12 *(.text.head)
13 _ehead = . ;
14 }
15 .data.compressed : {
16 *(.data.compressed)
17 }
18 .text : {
19 _text = .; /* Text */
20 *(.text)
21 *(.text.*)
22 _etext = . ;
23 }
24 .rodata : {
25 _rodata = . ;
26 *(.rodata) /* read-only data */
27 *(.rodata.*)
28 _erodata = . ;
29 }
30 .data : {
31 _data = . ;
32 *(.data)
33 *(.data.*)
34 _edata = . ;
35 }
36 .bss : {
37 _bss = . ;
38 *(.bss)
39 *(.bss.*)
40 *(COMMON)
41 _end = . ;
42 }
43}
diff --git a/arch/i386/boot/compressed/vmlinux.scr b/arch/i386/boot/compressed/vmlinux.scr
index 1ed9d791f863..707a88f7f29e 100644
--- a/arch/i386/boot/compressed/vmlinux.scr
+++ b/arch/i386/boot/compressed/vmlinux.scr
@@ -1,9 +1,10 @@
1SECTIONS 1SECTIONS
2{ 2{
3 .data : { 3 .data.compressed : {
4 input_len = .; 4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .; 5 LONG(input_data_end - input_data) input_data = .;
6 *(.data) 6 *(.data)
7 output_len = . - 4;
7 input_data_end = .; 8 input_data_end = .;
8 } 9 }
9} 10}
diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S
index 3aec4538a113..06edf1c66242 100644
--- a/arch/i386/boot/setup.S
+++ b/arch/i386/boot/setup.S
@@ -81,7 +81,7 @@ start:
81# This is the setup header, and it must start at %cs:2 (old 0x9020:2) 81# This is the setup header, and it must start at %cs:2 (old 0x9020:2)
82 82
83 .ascii "HdrS" # header signature 83 .ascii "HdrS" # header signature
84 .word 0x0204 # header version number (>= 0x0105) 84 .word 0x0205 # header version number (>= 0x0105)
85 # or else old loadlin-1.5 will fail) 85 # or else old loadlin-1.5 will fail)
86realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 86realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
87start_sys_seg: .word SYSSEG 87start_sys_seg: .word SYSSEG
@@ -160,6 +160,17 @@ ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff
160 # The highest safe address for 160 # The highest safe address for
161 # the contents of an initrd 161 # the contents of an initrd
162 162
163kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment
164 #required for protected mode
165 #kernel
166#ifdef CONFIG_RELOCATABLE
167relocatable_kernel: .byte 1
168#else
169relocatable_kernel: .byte 0
170#endif
171pad2: .byte 0
172pad3: .word 0
173
163trampoline: call start_of_setup 174trampoline: call start_of_setup
164 .align 16 175 .align 16
165 # The offset at this point is 0x240 176 # The offset at this point is 0x240
@@ -588,11 +599,6 @@ rmodeswtch_normal:
588 call default_switch 599 call default_switch
589 600
590rmodeswtch_end: 601rmodeswtch_end:
591# we get the code32 start address and modify the below 'jmpi'
592# (loader may have changed it)
593 movl %cs:code32_start, %eax
594 movl %eax, %cs:code32
595
596# Now we move the system to its rightful place ... but we check if we have a 602# Now we move the system to its rightful place ... but we check if we have a
597# big-kernel. In that case we *must* not move it ... 603# big-kernel. In that case we *must* not move it ...
598 testb $LOADED_HIGH, %cs:loadflags 604 testb $LOADED_HIGH, %cs:loadflags
@@ -788,11 +794,12 @@ a20_err_msg:
788a20_done: 794a20_done:
789 795
790#endif /* CONFIG_X86_VOYAGER */ 796#endif /* CONFIG_X86_VOYAGER */
791# set up gdt and idt 797# set up gdt and idt and 32bit start address
792 lidt idt_48 # load idt with 0,0 798 lidt idt_48 # load idt with 0,0
793 xorl %eax, %eax # Compute gdt_base 799 xorl %eax, %eax # Compute gdt_base
794 movw %ds, %ax # (Convert %ds:gdt to a linear ptr) 800 movw %ds, %ax # (Convert %ds:gdt to a linear ptr)
795 shll $4, %eax 801 shll $4, %eax
802 addl %eax, code32
796 addl $gdt, %eax 803 addl $gdt, %eax
797 movl %eax, (gdt_48+2) 804 movl %eax, (gdt_48+2)
798 lgdt gdt_48 # load gdt with whatever is 805 lgdt gdt_48 # load gdt with whatever is
@@ -851,9 +858,26 @@ flush_instr:
851# Manual, Mixing 16-bit and 32-bit code, page 16-6) 858# Manual, Mixing 16-bit and 32-bit code, page 16-6)
852 859
853 .byte 0x66, 0xea # prefix + jmpi-opcode 860 .byte 0x66, 0xea # prefix + jmpi-opcode
854code32: .long 0x1000 # will be set to 0x100000 861code32: .long startup_32 # will be set to %cs+startup_32
855 # for big kernels
856 .word __BOOT_CS 862 .word __BOOT_CS
863.code32
864startup_32:
865 movl $(__BOOT_DS), %eax
866 movl %eax, %ds
867 movl %eax, %es
868 movl %eax, %fs
869 movl %eax, %gs
870 movl %eax, %ss
871
872 xorl %eax, %eax
8731: incl %eax # check that A20 really IS enabled
874 movl %eax, 0x00000000 # loop forever if it isn't
875 cmpl %eax, 0x00100000
876 je 1b
877
878 # Jump to the 32bit entry point
879 jmpl *(code32_start - start + (DELTA_INITSEG << 4))(%esi)
880.code16
857 881
858# Here's a bunch of information about your current kernel.. 882# Here's a bunch of information about your current kernel..
859kernel_version: .ascii UTS_RELEASE 883kernel_version: .ascii UTS_RELEASE
diff --git a/arch/i386/defconfig b/arch/i386/defconfig
index 60c0c02574f0..3265208e5899 100644
--- a/arch/i386/defconfig
+++ b/arch/i386/defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.19-rc1 3# Linux kernel version: 2.6.19-git14
4# Thu Oct 5 13:04:53 2006 4# Sat Dec 9 21:23:14 2006
5# 5#
6CONFIG_X86_32=y 6CONFIG_X86_32=y
7CONFIG_GENERIC_TIME=y 7CONFIG_GENERIC_TIME=y
@@ -12,6 +12,7 @@ CONFIG_X86=y
12CONFIG_MMU=y 12CONFIG_MMU=y
13CONFIG_GENERIC_ISA_DMA=y 13CONFIG_GENERIC_ISA_DMA=y
14CONFIG_GENERIC_IOMAP=y 14CONFIG_GENERIC_IOMAP=y
15CONFIG_GENERIC_BUG=y
15CONFIG_GENERIC_HWEIGHT=y 16CONFIG_GENERIC_HWEIGHT=y
16CONFIG_ARCH_MAY_HAVE_PC_FDC=y 17CONFIG_ARCH_MAY_HAVE_PC_FDC=y
17CONFIG_DMI=y 18CONFIG_DMI=y
@@ -40,13 +41,14 @@ CONFIG_POSIX_MQUEUE=y
40CONFIG_IKCONFIG=y 41CONFIG_IKCONFIG=y
41CONFIG_IKCONFIG_PROC=y 42CONFIG_IKCONFIG_PROC=y
42# CONFIG_CPUSETS is not set 43# CONFIG_CPUSETS is not set
44CONFIG_SYSFS_DEPRECATED=y
43# CONFIG_RELAY is not set 45# CONFIG_RELAY is not set
44CONFIG_INITRAMFS_SOURCE="" 46CONFIG_INITRAMFS_SOURCE=""
45CONFIG_CC_OPTIMIZE_FOR_SIZE=y 47CONFIG_CC_OPTIMIZE_FOR_SIZE=y
46CONFIG_SYSCTL=y 48CONFIG_SYSCTL=y
47# CONFIG_EMBEDDED is not set 49# CONFIG_EMBEDDED is not set
48CONFIG_UID16=y 50CONFIG_UID16=y
49# CONFIG_SYSCTL_SYSCALL is not set 51CONFIG_SYSCTL_SYSCALL=y
50CONFIG_KALLSYMS=y 52CONFIG_KALLSYMS=y
51CONFIG_KALLSYMS_ALL=y 53CONFIG_KALLSYMS_ALL=y
52# CONFIG_KALLSYMS_EXTRA_PASS is not set 54# CONFIG_KALLSYMS_EXTRA_PASS is not set
@@ -110,6 +112,7 @@ CONFIG_SMP=y
110# CONFIG_X86_VISWS is not set 112# CONFIG_X86_VISWS is not set
111CONFIG_X86_GENERICARCH=y 113CONFIG_X86_GENERICARCH=y
112# CONFIG_X86_ES7000 is not set 114# CONFIG_X86_ES7000 is not set
115# CONFIG_PARAVIRT is not set
113CONFIG_X86_CYCLONE_TIMER=y 116CONFIG_X86_CYCLONE_TIMER=y
114# CONFIG_M386 is not set 117# CONFIG_M386 is not set
115# CONFIG_M486 is not set 118# CONFIG_M486 is not set
@@ -120,6 +123,7 @@ CONFIG_X86_CYCLONE_TIMER=y
120# CONFIG_MPENTIUMII is not set 123# CONFIG_MPENTIUMII is not set
121CONFIG_MPENTIUMIII=y 124CONFIG_MPENTIUMIII=y
122# CONFIG_MPENTIUMM is not set 125# CONFIG_MPENTIUMM is not set
126# CONFIG_MCORE2 is not set
123# CONFIG_MPENTIUM4 is not set 127# CONFIG_MPENTIUM4 is not set
124# CONFIG_MK6 is not set 128# CONFIG_MK6 is not set
125# CONFIG_MK7 is not set 129# CONFIG_MK7 is not set
@@ -138,6 +142,8 @@ CONFIG_X86_CMPXCHG=y
138CONFIG_X86_XADD=y 142CONFIG_X86_XADD=y
139CONFIG_X86_L1_CACHE_SHIFT=7 143CONFIG_X86_L1_CACHE_SHIFT=7
140CONFIG_RWSEM_XCHGADD_ALGORITHM=y 144CONFIG_RWSEM_XCHGADD_ALGORITHM=y
145# CONFIG_ARCH_HAS_ILOG2_U32 is not set
146# CONFIG_ARCH_HAS_ILOG2_U64 is not set
141CONFIG_GENERIC_CALIBRATE_DELAY=y 147CONFIG_GENERIC_CALIBRATE_DELAY=y
142CONFIG_X86_WP_WORKS_OK=y 148CONFIG_X86_WP_WORKS_OK=y
143CONFIG_X86_INVLPG=y 149CONFIG_X86_INVLPG=y
@@ -197,15 +203,16 @@ CONFIG_RESOURCES_64BIT=y
197CONFIG_MTRR=y 203CONFIG_MTRR=y
198# CONFIG_EFI is not set 204# CONFIG_EFI is not set
199# CONFIG_IRQBALANCE is not set 205# CONFIG_IRQBALANCE is not set
200CONFIG_REGPARM=y
201CONFIG_SECCOMP=y 206CONFIG_SECCOMP=y
202# CONFIG_HZ_100 is not set 207# CONFIG_HZ_100 is not set
203CONFIG_HZ_250=y 208CONFIG_HZ_250=y
209# CONFIG_HZ_300 is not set
204# CONFIG_HZ_1000 is not set 210# CONFIG_HZ_1000 is not set
205CONFIG_HZ=250 211CONFIG_HZ=250
206# CONFIG_KEXEC is not set 212# CONFIG_KEXEC is not set
207# CONFIG_CRASH_DUMP is not set 213# CONFIG_CRASH_DUMP is not set
208CONFIG_PHYSICAL_START=0x100000 214# CONFIG_RELOCATABLE is not set
215CONFIG_PHYSICAL_ALIGN=0x100000
209# CONFIG_HOTPLUG_CPU is not set 216# CONFIG_HOTPLUG_CPU is not set
210CONFIG_COMPAT_VDSO=y 217CONFIG_COMPAT_VDSO=y
211CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y 218CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
@@ -367,6 +374,7 @@ CONFIG_INET_TCP_DIAG=y
367# CONFIG_TCP_CONG_ADVANCED is not set 374# CONFIG_TCP_CONG_ADVANCED is not set
368CONFIG_TCP_CONG_CUBIC=y 375CONFIG_TCP_CONG_CUBIC=y
369CONFIG_DEFAULT_TCP_CONG="cubic" 376CONFIG_DEFAULT_TCP_CONG="cubic"
377# CONFIG_TCP_MD5SIG is not set
370CONFIG_IPV6=y 378CONFIG_IPV6=y
371# CONFIG_IPV6_PRIVACY is not set 379# CONFIG_IPV6_PRIVACY is not set
372# CONFIG_IPV6_ROUTER_PREF is not set 380# CONFIG_IPV6_ROUTER_PREF is not set
@@ -380,8 +388,8 @@ CONFIG_INET6_XFRM_MODE_TRANSPORT=y
380CONFIG_INET6_XFRM_MODE_TUNNEL=y 388CONFIG_INET6_XFRM_MODE_TUNNEL=y
381# CONFIG_INET6_XFRM_MODE_BEET is not set 389# CONFIG_INET6_XFRM_MODE_BEET is not set
382# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set 390# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
391CONFIG_IPV6_SIT=y
383# CONFIG_IPV6_TUNNEL is not set 392# CONFIG_IPV6_TUNNEL is not set
384# CONFIG_IPV6_SUBTREES is not set
385# CONFIG_IPV6_MULTIPLE_TABLES is not set 393# CONFIG_IPV6_MULTIPLE_TABLES is not set
386# CONFIG_NETWORK_SECMARK is not set 394# CONFIG_NETWORK_SECMARK is not set
387# CONFIG_NETFILTER is not set 395# CONFIG_NETFILTER is not set
@@ -483,6 +491,13 @@ CONFIG_BLK_DEV_INITRD=y
483# CONFIG_ATA_OVER_ETH is not set 491# CONFIG_ATA_OVER_ETH is not set
484 492
485# 493#
494# Misc devices
495#
496# CONFIG_IBM_ASM is not set
497# CONFIG_SGI_IOC4 is not set
498# CONFIG_TIFM_CORE is not set
499
500#
486# ATA/ATAPI/MFM/RLL support 501# ATA/ATAPI/MFM/RLL support
487# 502#
488CONFIG_IDE=y 503CONFIG_IDE=y
@@ -552,6 +567,7 @@ CONFIG_IDEDMA_AUTO=y
552# 567#
553# CONFIG_RAID_ATTRS is not set 568# CONFIG_RAID_ATTRS is not set
554CONFIG_SCSI=y 569CONFIG_SCSI=y
570# CONFIG_SCSI_TGT is not set
555CONFIG_SCSI_NETLINK=y 571CONFIG_SCSI_NETLINK=y
556# CONFIG_SCSI_PROC_FS is not set 572# CONFIG_SCSI_PROC_FS is not set
557 573
@@ -572,6 +588,7 @@ CONFIG_CHR_DEV_SG=y
572# CONFIG_SCSI_MULTI_LUN is not set 588# CONFIG_SCSI_MULTI_LUN is not set
573# CONFIG_SCSI_CONSTANTS is not set 589# CONFIG_SCSI_CONSTANTS is not set
574# CONFIG_SCSI_LOGGING is not set 590# CONFIG_SCSI_LOGGING is not set
591# CONFIG_SCSI_SCAN_ASYNC is not set
575 592
576# 593#
577# SCSI Transports 594# SCSI Transports
@@ -631,6 +648,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0
631# CONFIG_SCSI_DC390T is not set 648# CONFIG_SCSI_DC390T is not set
632# CONFIG_SCSI_NSP32 is not set 649# CONFIG_SCSI_NSP32 is not set
633# CONFIG_SCSI_DEBUG is not set 650# CONFIG_SCSI_DEBUG is not set
651# CONFIG_SCSI_SRP is not set
634 652
635# 653#
636# Serial ATA (prod) and Parallel ATA (experimental) drivers 654# Serial ATA (prod) and Parallel ATA (experimental) drivers
@@ -670,6 +688,7 @@ CONFIG_SATA_INTEL_COMBINED=y
670# CONFIG_PATA_IT821X is not set 688# CONFIG_PATA_IT821X is not set
671# CONFIG_PATA_JMICRON is not set 689# CONFIG_PATA_JMICRON is not set
672# CONFIG_PATA_TRIFLEX is not set 690# CONFIG_PATA_TRIFLEX is not set
691# CONFIG_PATA_MARVELL is not set
673# CONFIG_PATA_MPIIX is not set 692# CONFIG_PATA_MPIIX is not set
674# CONFIG_PATA_OLDPIIX is not set 693# CONFIG_PATA_OLDPIIX is not set
675# CONFIG_PATA_NETCELL is not set 694# CONFIG_PATA_NETCELL is not set
@@ -843,6 +862,7 @@ CONFIG_BNX2=y
843# CONFIG_IXGB is not set 862# CONFIG_IXGB is not set
844# CONFIG_S2IO is not set 863# CONFIG_S2IO is not set
845# CONFIG_MYRI10GE is not set 864# CONFIG_MYRI10GE is not set
865# CONFIG_NETXEN_NIC is not set
846 866
847# 867#
848# Token Ring devices 868# Token Ring devices
@@ -977,10 +997,6 @@ CONFIG_RTC=y
977# CONFIG_R3964 is not set 997# CONFIG_R3964 is not set
978# CONFIG_APPLICOM is not set 998# CONFIG_APPLICOM is not set
979# CONFIG_SONYPI is not set 999# CONFIG_SONYPI is not set
980
981#
982# Ftape, the floppy tape device driver
983#
984CONFIG_AGP=y 1000CONFIG_AGP=y
985# CONFIG_AGP_ALI is not set 1001# CONFIG_AGP_ALI is not set
986# CONFIG_AGP_ATI is not set 1002# CONFIG_AGP_ATI is not set
@@ -1024,6 +1040,7 @@ CONFIG_HANGCHECK_TIMER=y
1024# 1040#
1025# Dallas's 1-wire bus 1041# Dallas's 1-wire bus
1026# 1042#
1043# CONFIG_W1 is not set
1027 1044
1028# 1045#
1029# Hardware Monitoring support 1046# Hardware Monitoring support
@@ -1032,12 +1049,6 @@ CONFIG_HANGCHECK_TIMER=y
1032# CONFIG_HWMON_VID is not set 1049# CONFIG_HWMON_VID is not set
1033 1050
1034# 1051#
1035# Misc devices
1036#
1037# CONFIG_IBM_ASM is not set
1038# CONFIG_TIFM_CORE is not set
1039
1040#
1041# Multimedia devices 1052# Multimedia devices
1042# 1053#
1043# CONFIG_VIDEO_DEV is not set 1054# CONFIG_VIDEO_DEV is not set
@@ -1078,10 +1089,7 @@ CONFIG_SOUND=y
1078# Open Sound System 1089# Open Sound System
1079# 1090#
1080CONFIG_SOUND_PRIME=y 1091CONFIG_SOUND_PRIME=y
1081CONFIG_OSS_OBSOLETE_DRIVER=y
1082# CONFIG_SOUND_BT878 is not set 1092# CONFIG_SOUND_BT878 is not set
1083# CONFIG_SOUND_EMU10K1 is not set
1084# CONFIG_SOUND_FUSION is not set
1085# CONFIG_SOUND_ES1371 is not set 1093# CONFIG_SOUND_ES1371 is not set
1086CONFIG_SOUND_ICH=y 1094CONFIG_SOUND_ICH=y
1087# CONFIG_SOUND_TRIDENT is not set 1095# CONFIG_SOUND_TRIDENT is not set
@@ -1091,6 +1099,11 @@ CONFIG_SOUND_ICH=y
1091# CONFIG_SOUND_OSS is not set 1099# CONFIG_SOUND_OSS is not set
1092 1100
1093# 1101#
1102# HID Devices
1103#
1104CONFIG_HID=y
1105
1106#
1094# USB support 1107# USB support
1095# 1108#
1096CONFIG_USB_ARCH_HAS_HCD=y 1109CONFIG_USB_ARCH_HAS_HCD=y
@@ -1106,6 +1119,7 @@ CONFIG_USB_DEVICEFS=y
1106# CONFIG_USB_BANDWIDTH is not set 1119# CONFIG_USB_BANDWIDTH is not set
1107# CONFIG_USB_DYNAMIC_MINORS is not set 1120# CONFIG_USB_DYNAMIC_MINORS is not set
1108# CONFIG_USB_SUSPEND is not set 1121# CONFIG_USB_SUSPEND is not set
1122# CONFIG_USB_MULTITHREAD_PROBE is not set
1109# CONFIG_USB_OTG is not set 1123# CONFIG_USB_OTG is not set
1110 1124
1111# 1125#
@@ -1153,8 +1167,7 @@ CONFIG_USB_STORAGE=y
1153# USB Input Devices 1167# USB Input Devices
1154# 1168#
1155CONFIG_USB_HID=y 1169CONFIG_USB_HID=y
1156CONFIG_USB_HIDINPUT=y 1170# CONFIG_USB_HID_POWERBOOK is not set
1157# CONFIG_USB_HIDINPUT_POWERBOOK is not set
1158# CONFIG_HID_FF is not set 1171# CONFIG_HID_FF is not set
1159# CONFIG_USB_HIDDEV is not set 1172# CONFIG_USB_HIDDEV is not set
1160# CONFIG_USB_AIPTEK is not set 1173# CONFIG_USB_AIPTEK is not set
@@ -1169,7 +1182,6 @@ CONFIG_USB_HIDINPUT=y
1169# CONFIG_USB_ATI_REMOTE2 is not set 1182# CONFIG_USB_ATI_REMOTE2 is not set
1170# CONFIG_USB_KEYSPAN_REMOTE is not set 1183# CONFIG_USB_KEYSPAN_REMOTE is not set
1171# CONFIG_USB_APPLETOUCH is not set 1184# CONFIG_USB_APPLETOUCH is not set
1172# CONFIG_USB_TRANCEVIBRATOR is not set
1173 1185
1174# 1186#
1175# USB Imaging devices 1187# USB Imaging devices
@@ -1184,6 +1196,7 @@ CONFIG_USB_HIDINPUT=y
1184# CONFIG_USB_KAWETH is not set 1196# CONFIG_USB_KAWETH is not set
1185# CONFIG_USB_PEGASUS is not set 1197# CONFIG_USB_PEGASUS is not set
1186# CONFIG_USB_RTL8150 is not set 1198# CONFIG_USB_RTL8150 is not set
1199# CONFIG_USB_USBNET_MII is not set
1187# CONFIG_USB_USBNET is not set 1200# CONFIG_USB_USBNET is not set
1188CONFIG_USB_MON=y 1201CONFIG_USB_MON=y
1189 1202
@@ -1215,6 +1228,7 @@ CONFIG_USB_MON=y
1215# CONFIG_USB_APPLEDISPLAY is not set 1228# CONFIG_USB_APPLEDISPLAY is not set
1216# CONFIG_USB_SISUSBVGA is not set 1229# CONFIG_USB_SISUSBVGA is not set
1217# CONFIG_USB_LD is not set 1230# CONFIG_USB_LD is not set
1231# CONFIG_USB_TRANCEVIBRATOR is not set
1218# CONFIG_USB_TEST is not set 1232# CONFIG_USB_TEST is not set
1219 1233
1220# 1234#
@@ -1284,6 +1298,7 @@ CONFIG_EXT3_FS=y
1284CONFIG_EXT3_FS_XATTR=y 1298CONFIG_EXT3_FS_XATTR=y
1285CONFIG_EXT3_FS_POSIX_ACL=y 1299CONFIG_EXT3_FS_POSIX_ACL=y
1286# CONFIG_EXT3_FS_SECURITY is not set 1300# CONFIG_EXT3_FS_SECURITY is not set
1301# CONFIG_EXT4DEV_FS is not set
1287CONFIG_JBD=y 1302CONFIG_JBD=y
1288# CONFIG_JBD_DEBUG is not set 1303# CONFIG_JBD_DEBUG is not set
1289CONFIG_FS_MBCACHE=y 1304CONFIG_FS_MBCACHE=y
@@ -1307,6 +1322,7 @@ CONFIG_DNOTIFY=y
1307# CONFIG_AUTOFS_FS is not set 1322# CONFIG_AUTOFS_FS is not set
1308CONFIG_AUTOFS4_FS=y 1323CONFIG_AUTOFS4_FS=y
1309# CONFIG_FUSE_FS is not set 1324# CONFIG_FUSE_FS is not set
1325CONFIG_GENERIC_ACL=y
1310 1326
1311# 1327#
1312# CD-ROM/DVD Filesystems 1328# CD-ROM/DVD Filesystems
@@ -1384,7 +1400,6 @@ CONFIG_SUNRPC=y
1384# CONFIG_CODA_FS is not set 1400# CONFIG_CODA_FS is not set
1385# CONFIG_AFS_FS is not set 1401# CONFIG_AFS_FS is not set
1386# CONFIG_9P_FS is not set 1402# CONFIG_9P_FS is not set
1387CONFIG_GENERIC_ACL=y
1388 1403
1389# 1404#
1390# Partition Types 1405# Partition Types
@@ -1439,6 +1454,7 @@ CONFIG_NLS_UTF8=y
1439# 1454#
1440# Distributed Lock Manager 1455# Distributed Lock Manager
1441# 1456#
1457# CONFIG_DLM is not set
1442 1458
1443# 1459#
1444# Instrumentation Support 1460# Instrumentation Support
@@ -1480,6 +1496,7 @@ CONFIG_DEBUG_BUGVERBOSE=y
1480CONFIG_UNWIND_INFO=y 1496CONFIG_UNWIND_INFO=y
1481CONFIG_STACK_UNWIND=y 1497CONFIG_STACK_UNWIND=y
1482# CONFIG_FORCED_INLINING is not set 1498# CONFIG_FORCED_INLINING is not set
1499# CONFIG_HEADERS_CHECK is not set
1483# CONFIG_RCU_TORTURE_TEST is not set 1500# CONFIG_RCU_TORTURE_TEST is not set
1484# CONFIG_LKDTM is not set 1501# CONFIG_LKDTM is not set
1485CONFIG_EARLY_PRINTK=y 1502CONFIG_EARLY_PRINTK=y
@@ -1505,6 +1522,7 @@ CONFIG_DOUBLEFAULT=y
1505# 1522#
1506# Library routines 1523# Library routines
1507# 1524#
1525CONFIG_BITREVERSE=y
1508# CONFIG_CRC_CCITT is not set 1526# CONFIG_CRC_CCITT is not set
1509# CONFIG_CRC16 is not set 1527# CONFIG_CRC16 is not set
1510CONFIG_CRC32=y 1528CONFIG_CRC32=y
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 1a884b6e6e5c..1e8988e558c5 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -6,7 +6,7 @@ extra-y := head.o init_task.o vmlinux.lds
6 6
7obj-y := process.o signal.o entry.o traps.o irq.o \ 7obj-y := process.o signal.o entry.o traps.o irq.o \
8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ 8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
9 pci-dma.o i386_ksyms.o i387.o bootflag.o \ 9 pci-dma.o i386_ksyms.o i387.o bootflag.o e820.o\
10 quirks.o i8237.o topology.o alternative.o i8253.o tsc.o 10 quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
11 11
12obj-$(CONFIG_STACKTRACE) += stacktrace.o 12obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -40,6 +40,9 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
40obj-$(CONFIG_HPET_TIMER) += hpet.o 40obj-$(CONFIG_HPET_TIMER) += hpet.o
41obj-$(CONFIG_K8_NB) += k8.o 41obj-$(CONFIG_K8_NB) += k8.o
42 42
43# Make sure this is linked after any other paravirt_ops structs: see head.S
44obj-$(CONFIG_PARAVIRT) += paravirt.o
45
43EXTRA_AFLAGS := -traditional 46EXTRA_AFLAGS := -traditional
44 47
45obj-$(CONFIG_SCx200) += scx200.o 48obj-$(CONFIG_SCx200) += scx200.o
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index ab974ff97073..c8f96cff07c6 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -70,7 +70,7 @@ static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return
70 70
71#define PREFIX "ACPI: " 71#define PREFIX "ACPI: "
72 72
73int acpi_noirq __initdata; /* skip ACPI IRQ initialization */ 73int acpi_noirq; /* skip ACPI IRQ initialization */
74int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */ 74int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */
75int acpi_ht __initdata = 1; /* enable HT */ 75int acpi_ht __initdata = 1; /* enable HT */
76 76
@@ -82,6 +82,7 @@ EXPORT_SYMBOL(acpi_strict);
82acpi_interrupt_flags acpi_sci_flags __initdata; 82acpi_interrupt_flags acpi_sci_flags __initdata;
83int acpi_sci_override_gsi __initdata; 83int acpi_sci_override_gsi __initdata;
84int acpi_skip_timer_override __initdata; 84int acpi_skip_timer_override __initdata;
85int acpi_use_timer_override __initdata;
85 86
86#ifdef CONFIG_X86_LOCAL_APIC 87#ifdef CONFIG_X86_LOCAL_APIC
87static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; 88static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
@@ -332,7 +333,7 @@ acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end)
332/* 333/*
333 * Parse Interrupt Source Override for the ACPI SCI 334 * Parse Interrupt Source Override for the ACPI SCI
334 */ 335 */
335static void acpi_sci_ioapic_setup(u32 bus_irq, u32 gsi, u16 polarity, u16 trigger) 336static void acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
336{ 337{
337 if (trigger == 0) /* compatible SCI trigger is level */ 338 if (trigger == 0) /* compatible SCI trigger is level */
338 trigger = 3; 339 trigger = 3;
@@ -352,13 +353,13 @@ static void acpi_sci_ioapic_setup(u32 bus_irq, u32 gsi, u16 polarity, u16 trigge
352 * If GSI is < 16, this will update its flags, 353 * If GSI is < 16, this will update its flags,
353 * else it will create a new mp_irqs[] entry. 354 * else it will create a new mp_irqs[] entry.
354 */ 355 */
355 mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); 356 mp_override_legacy_irq(gsi, polarity, trigger, gsi);
356 357
357 /* 358 /*
358 * stash over-ride to indicate we've been here 359 * stash over-ride to indicate we've been here
359 * and for later update of acpi_fadt 360 * and for later update of acpi_fadt
360 */ 361 */
361 acpi_sci_override_gsi = bus_irq; 362 acpi_sci_override_gsi = gsi;
362 return; 363 return;
363} 364}
364 365
@@ -376,7 +377,7 @@ acpi_parse_int_src_ovr(acpi_table_entry_header * header,
376 acpi_table_print_madt_entry(header); 377 acpi_table_print_madt_entry(header);
377 378
378 if (intsrc->bus_irq == acpi_fadt.sci_int) { 379 if (intsrc->bus_irq == acpi_fadt.sci_int) {
379 acpi_sci_ioapic_setup(intsrc->bus_irq, intsrc->global_irq, 380 acpi_sci_ioapic_setup(intsrc->global_irq,
380 intsrc->flags.polarity, 381 intsrc->flags.polarity,
381 intsrc->flags.trigger); 382 intsrc->flags.trigger);
382 return 0; 383 return 0;
@@ -879,7 +880,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
879 * pretend we got one so we can set the SCI flags. 880 * pretend we got one so we can set the SCI flags.
880 */ 881 */
881 if (!acpi_sci_override_gsi) 882 if (!acpi_sci_override_gsi)
882 acpi_sci_ioapic_setup(acpi_fadt.sci_int, acpi_fadt.sci_int, 0, 0); 883 acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
883 884
884 /* Fill in identity legacy mapings where no override */ 885 /* Fill in identity legacy mapings where no override */
885 mp_config_acpi_legacy_irqs(); 886 mp_config_acpi_legacy_irqs();
@@ -1300,6 +1301,13 @@ static int __init parse_acpi_skip_timer_override(char *arg)
1300 return 0; 1301 return 0;
1301} 1302}
1302early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override); 1303early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override);
1304
1305static int __init parse_acpi_use_timer_override(char *arg)
1306{
1307 acpi_use_timer_override = 1;
1308 return 0;
1309}
1310early_param("acpi_use_timer_override", parse_acpi_use_timer_override);
1303#endif /* CONFIG_X86_IO_APIC */ 1311#endif /* CONFIG_X86_IO_APIC */
1304 1312
1305static int __init setup_acpi_sci(char *s) 1313static int __init setup_acpi_sci(char *s)
diff --git a/arch/i386/kernel/acpi/cstate.c b/arch/i386/kernel/acpi/cstate.c
index 20563e52c622..12e937c1ce4b 100644
--- a/arch/i386/kernel/acpi/cstate.c
+++ b/arch/i386/kernel/acpi/cstate.c
@@ -11,6 +11,7 @@
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/acpi.h> 12#include <linux/acpi.h>
13#include <linux/cpu.h> 13#include <linux/cpu.h>
14#include <linux/sched.h>
14 15
15#include <acpi/processor.h> 16#include <acpi/processor.h>
16#include <asm/acpi.h> 17#include <asm/acpi.h>
@@ -155,10 +156,8 @@ static int __init ffh_cstate_init(void)
155 156
156static void __exit ffh_cstate_exit(void) 157static void __exit ffh_cstate_exit(void)
157{ 158{
158 if (cpu_cstate_entry) { 159 free_percpu(cpu_cstate_entry);
159 free_percpu(cpu_cstate_entry); 160 cpu_cstate_entry = NULL;
160 cpu_cstate_entry = NULL;
161 }
162} 161}
163 162
164arch_initcall(ffh_cstate_init); 163arch_initcall(ffh_cstate_init);
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index fe799b11ac0a..4b60af7f91dd 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,6 +10,7 @@
10#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
11#include <asm/acpi.h> 11#include <asm/acpi.h>
12#include <asm/apic.h> 12#include <asm/apic.h>
13#include <asm/irq.h>
13 14
14#ifdef CONFIG_ACPI 15#ifdef CONFIG_ACPI
15 16
@@ -27,11 +28,17 @@ static int __init check_bridge(int vendor, int device)
27#ifdef CONFIG_ACPI 28#ifdef CONFIG_ACPI
28 /* According to Nvidia all timer overrides are bogus unless HPET 29 /* According to Nvidia all timer overrides are bogus unless HPET
29 is enabled. */ 30 is enabled. */
30 if (vendor == PCI_VENDOR_ID_NVIDIA) { 31 if (!acpi_use_timer_override && vendor == PCI_VENDOR_ID_NVIDIA) {
31 nvidia_hpet_detected = 0; 32 nvidia_hpet_detected = 0;
32 acpi_table_parse(ACPI_HPET, nvidia_hpet_check); 33 acpi_table_parse(ACPI_HPET, nvidia_hpet_check);
33 if (nvidia_hpet_detected == 0) { 34 if (nvidia_hpet_detected == 0) {
34 acpi_skip_timer_override = 1; 35 acpi_skip_timer_override = 1;
36 printk(KERN_INFO "Nvidia board "
37 "detected. Ignoring ACPI "
38 "timer override.\n");
39 printk(KERN_INFO "If you got timer trouble "
40 "try acpi_use_timer_override\n");
41
35 } 42 }
36 } 43 }
37#endif 44#endif
@@ -43,6 +50,24 @@ static int __init check_bridge(int vendor, int device)
43 return 0; 50 return 0;
44} 51}
45 52
53static void check_intel(void)
54{
55 u16 vendor, device;
56
57 vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
58
59 if (vendor != PCI_VENDOR_ID_INTEL)
60 return;
61
62 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
63#ifdef CONFIG_SMP
64 if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
65 device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
66 device == PCI_DEVICE_ID_INTEL_E7525_MCH)
67 quirk_intel_irqbalance();
68#endif
69}
70
46void __init check_acpi_pci(void) 71void __init check_acpi_pci(void)
47{ 72{
48 int num, slot, func; 73 int num, slot, func;
@@ -54,6 +79,8 @@ void __init check_acpi_pci(void)
54 if (!early_pci_allowed()) 79 if (!early_pci_allowed())
55 return; 80 return;
56 81
82 check_intel();
83
57 /* Poor man's PCI discovery */ 84 /* Poor man's PCI discovery */
58 for (num = 0; num < 32; num++) { 85 for (num = 0; num < 32; num++) {
59 for (slot = 0; slot < 32; slot++) { 86 for (slot = 0; slot < 32; slot++) {
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 28ab80649764..9eca21b49f6b 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -1,4 +1,5 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/sched.h>
2#include <linux/spinlock.h> 3#include <linux/spinlock.h>
3#include <linux/list.h> 4#include <linux/list.h>
4#include <asm/alternative.h> 5#include <asm/alternative.h>
@@ -123,6 +124,20 @@ static unsigned char** find_nop_table(void)
123 124
124#endif /* CONFIG_X86_64 */ 125#endif /* CONFIG_X86_64 */
125 126
127static void nop_out(void *insns, unsigned int len)
128{
129 unsigned char **noptable = find_nop_table();
130
131 while (len > 0) {
132 unsigned int noplen = len;
133 if (noplen > ASM_NOP_MAX)
134 noplen = ASM_NOP_MAX;
135 memcpy(insns, noptable[noplen], noplen);
136 insns += noplen;
137 len -= noplen;
138 }
139}
140
126extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 141extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
127extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; 142extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
128extern u8 *__smp_locks[], *__smp_locks_end[]; 143extern u8 *__smp_locks[], *__smp_locks_end[];
@@ -137,10 +152,9 @@ extern u8 __smp_alt_begin[], __smp_alt_end[];
137 152
138void apply_alternatives(struct alt_instr *start, struct alt_instr *end) 153void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
139{ 154{
140 unsigned char **noptable = find_nop_table();
141 struct alt_instr *a; 155 struct alt_instr *a;
142 u8 *instr; 156 u8 *instr;
143 int diff, i, k; 157 int diff;
144 158
145 DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); 159 DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
146 for (a = start; a < end; a++) { 160 for (a = start; a < end; a++) {
@@ -158,13 +172,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
158#endif 172#endif
159 memcpy(instr, a->replacement, a->replacementlen); 173 memcpy(instr, a->replacement, a->replacementlen);
160 diff = a->instrlen - a->replacementlen; 174 diff = a->instrlen - a->replacementlen;
161 /* Pad the rest with nops */ 175 nop_out(instr + a->replacementlen, diff);
162 for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
163 k = diff;
164 if (k > ASM_NOP_MAX)
165 k = ASM_NOP_MAX;
166 memcpy(a->instr + i, noptable[k], k);
167 }
168 } 176 }
169} 177}
170 178
@@ -208,7 +216,6 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
208 216
209static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 217static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
210{ 218{
211 unsigned char **noptable = find_nop_table();
212 u8 **ptr; 219 u8 **ptr;
213 220
214 for (ptr = start; ptr < end; ptr++) { 221 for (ptr = start; ptr < end; ptr++) {
@@ -216,7 +223,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
216 continue; 223 continue;
217 if (*ptr > text_end) 224 if (*ptr > text_end)
218 continue; 225 continue;
219 **ptr = noptable[1][0]; 226 nop_out(*ptr, 1);
220 }; 227 };
221} 228}
222 229
@@ -342,8 +349,43 @@ void alternatives_smp_switch(int smp)
342 349
343#endif 350#endif
344 351
352#ifdef CONFIG_PARAVIRT
353void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
354{
355 struct paravirt_patch *p;
356
357 for (p = start; p < end; p++) {
358 unsigned int used;
359
360 used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
361 p->len);
362#ifdef CONFIG_DEBUG_PARAVIRT
363 {
364 int i;
365 /* Deliberately clobber regs using "not %reg" to find bugs. */
366 for (i = 0; i < 3; i++) {
367 if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
368 memcpy(p->instr + used, "\xf7\xd0", 2);
369 p->instr[used+1] |= i;
370 used += 2;
371 }
372 }
373 }
374#endif
375 /* Pad the rest with nops */
376 nop_out(p->instr + used, p->len - used);
377 }
378
379 /* Sync to be conservative, in case we patched following instructions */
380 sync_core();
381}
382extern struct paravirt_patch __start_parainstructions[],
383 __stop_parainstructions[];
384#endif /* CONFIG_PARAVIRT */
385
345void __init alternative_instructions(void) 386void __init alternative_instructions(void)
346{ 387{
388 unsigned long flags;
347 if (no_replacement) { 389 if (no_replacement) {
348 printk(KERN_INFO "(SMP-)alternatives turned off\n"); 390 printk(KERN_INFO "(SMP-)alternatives turned off\n");
349 free_init_pages("SMP alternatives", 391 free_init_pages("SMP alternatives",
@@ -351,6 +393,8 @@ void __init alternative_instructions(void)
351 (unsigned long)__smp_alt_end); 393 (unsigned long)__smp_alt_end);
352 return; 394 return;
353 } 395 }
396
397 local_irq_save(flags);
354 apply_alternatives(__alt_instructions, __alt_instructions_end); 398 apply_alternatives(__alt_instructions, __alt_instructions_end);
355 399
356 /* switch to patch-once-at-boottime-only mode and free the 400 /* switch to patch-once-at-boottime-only mode and free the
@@ -386,4 +430,6 @@ void __init alternative_instructions(void)
386 alternatives_smp_switch(0); 430 alternatives_smp_switch(0);
387 } 431 }
388#endif 432#endif
433 apply_paravirt(__start_parainstructions, __stop_parainstructions);
434 local_irq_restore(flags);
389} 435}
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 2fd4b7d927c2..776d9be26af9 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -647,23 +647,30 @@ static struct {
647static int lapic_suspend(struct sys_device *dev, pm_message_t state) 647static int lapic_suspend(struct sys_device *dev, pm_message_t state)
648{ 648{
649 unsigned long flags; 649 unsigned long flags;
650 int maxlvt;
650 651
651 if (!apic_pm_state.active) 652 if (!apic_pm_state.active)
652 return 0; 653 return 0;
653 654
655 maxlvt = get_maxlvt();
656
654 apic_pm_state.apic_id = apic_read(APIC_ID); 657 apic_pm_state.apic_id = apic_read(APIC_ID);
655 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); 658 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
656 apic_pm_state.apic_ldr = apic_read(APIC_LDR); 659 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
657 apic_pm_state.apic_dfr = apic_read(APIC_DFR); 660 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
658 apic_pm_state.apic_spiv = apic_read(APIC_SPIV); 661 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
659 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); 662 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
660 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); 663 if (maxlvt >= 4)
664 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
661 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); 665 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
662 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); 666 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
663 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 667 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
664 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 668 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
665 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 669 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
666 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 670#ifdef CONFIG_X86_MCE_P4THERMAL
671 if (maxlvt >= 5)
672 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
673#endif
667 674
668 local_irq_save(flags); 675 local_irq_save(flags);
669 disable_local_APIC(); 676 disable_local_APIC();
@@ -675,10 +682,13 @@ static int lapic_resume(struct sys_device *dev)
675{ 682{
676 unsigned int l, h; 683 unsigned int l, h;
677 unsigned long flags; 684 unsigned long flags;
685 int maxlvt;
678 686
679 if (!apic_pm_state.active) 687 if (!apic_pm_state.active)
680 return 0; 688 return 0;
681 689
690 maxlvt = get_maxlvt();
691
682 local_irq_save(flags); 692 local_irq_save(flags);
683 693
684 /* 694 /*
@@ -700,8 +710,12 @@ static int lapic_resume(struct sys_device *dev)
700 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 710 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
701 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 711 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
702 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 712 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
703 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 713#ifdef CONFIG_X86_MCE_P4THERMAL
704 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); 714 if (maxlvt >= 5)
715 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
716#endif
717 if (maxlvt >= 4)
718 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
705 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); 719 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
706 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); 720 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
707 apic_write(APIC_TMICT, apic_pm_state.apic_tmict); 721 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index b42f2d914af3..a97847da9ed5 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -198,7 +198,7 @@
198 * (APM) BIOS Interface Specification, Revision 1.2, February 1996. 198 * (APM) BIOS Interface Specification, Revision 1.2, February 1996.
199 * 199 *
200 * [This document is available from Microsoft at: 200 * [This document is available from Microsoft at:
201 * http://www.microsoft.com/hwdev/busbios/amp_12.htm] 201 * http://www.microsoft.com/whdc/archive/amp_12.mspx]
202 */ 202 */
203 203
204#include <linux/module.h> 204#include <linux/module.h>
@@ -231,6 +231,7 @@
231#include <asm/uaccess.h> 231#include <asm/uaccess.h>
232#include <asm/desc.h> 232#include <asm/desc.h>
233#include <asm/i8253.h> 233#include <asm/i8253.h>
234#include <asm/paravirt.h>
234 235
235#include "io_ports.h" 236#include "io_ports.h"
236 237
@@ -540,11 +541,30 @@ static inline void apm_restore_cpus(cpumask_t mask)
540 * Also, we KNOW that for the non error case of apm_bios_call, there 541 * Also, we KNOW that for the non error case of apm_bios_call, there
541 * is no useful data returned in the low order 8 bits of eax. 542 * is no useful data returned in the low order 8 bits of eax.
542 */ 543 */
543#define APM_DO_CLI \ 544
544 if (apm_info.allow_ints) \ 545static inline unsigned long __apm_irq_save(void)
545 local_irq_enable(); \ 546{
546 else \ 547 unsigned long flags;
548 local_save_flags(flags);
549 if (apm_info.allow_ints) {
550 if (irqs_disabled_flags(flags))
551 local_irq_enable();
552 } else
553 local_irq_disable();
554
555 return flags;
556}
557
558#define apm_irq_save(flags) \
559 do { flags = __apm_irq_save(); } while (0)
560
561static inline void apm_irq_restore(unsigned long flags)
562{
563 if (irqs_disabled_flags(flags))
547 local_irq_disable(); 564 local_irq_disable();
565 else if (irqs_disabled())
566 local_irq_enable();
567}
548 568
549#ifdef APM_ZERO_SEGS 569#ifdef APM_ZERO_SEGS
550# define APM_DECL_SEGS \ 570# define APM_DECL_SEGS \
@@ -596,12 +616,11 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
596 save_desc_40 = gdt[0x40 / 8]; 616 save_desc_40 = gdt[0x40 / 8];
597 gdt[0x40 / 8] = bad_bios_desc; 617 gdt[0x40 / 8] = bad_bios_desc;
598 618
599 local_save_flags(flags); 619 apm_irq_save(flags);
600 APM_DO_CLI;
601 APM_DO_SAVE_SEGS; 620 APM_DO_SAVE_SEGS;
602 apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi); 621 apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi);
603 APM_DO_RESTORE_SEGS; 622 APM_DO_RESTORE_SEGS;
604 local_irq_restore(flags); 623 apm_irq_restore(flags);
605 gdt[0x40 / 8] = save_desc_40; 624 gdt[0x40 / 8] = save_desc_40;
606 put_cpu(); 625 put_cpu();
607 apm_restore_cpus(cpus); 626 apm_restore_cpus(cpus);
@@ -640,12 +659,11 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
640 save_desc_40 = gdt[0x40 / 8]; 659 save_desc_40 = gdt[0x40 / 8];
641 gdt[0x40 / 8] = bad_bios_desc; 660 gdt[0x40 / 8] = bad_bios_desc;
642 661
643 local_save_flags(flags); 662 apm_irq_save(flags);
644 APM_DO_CLI;
645 APM_DO_SAVE_SEGS; 663 APM_DO_SAVE_SEGS;
646 error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax); 664 error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax);
647 APM_DO_RESTORE_SEGS; 665 APM_DO_RESTORE_SEGS;
648 local_irq_restore(flags); 666 apm_irq_restore(flags);
649 gdt[0x40 / 8] = save_desc_40; 667 gdt[0x40 / 8] = save_desc_40;
650 put_cpu(); 668 put_cpu();
651 apm_restore_cpus(cpus); 669 apm_restore_cpus(cpus);
@@ -2218,7 +2236,7 @@ static int __init apm_init(void)
2218 2236
2219 dmi_check_system(apm_dmi_table); 2237 dmi_check_system(apm_dmi_table);
2220 2238
2221 if (apm_info.bios.version == 0) { 2239 if (apm_info.bios.version == 0 || paravirt_enabled()) {
2222 printk(KERN_INFO "apm: BIOS not found.\n"); 2240 printk(KERN_INFO "apm: BIOS not found.\n");
2223 return -ENODEV; 2241 return -ENODEV;
2224 } 2242 }
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index c80271f8f084..1b2f3cd33270 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -15,6 +15,7 @@
15#include <asm/processor.h> 15#include <asm/processor.h>
16#include <asm/thread_info.h> 16#include <asm/thread_info.h>
17#include <asm/elf.h> 17#include <asm/elf.h>
18#include <asm/pda.h>
18 19
19#define DEFINE(sym, val) \ 20#define DEFINE(sym, val) \
20 asm volatile("\n->" #sym " %0 " #val : : "i" (val)) 21 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -51,13 +52,35 @@ void foo(void)
51 OFFSET(TI_exec_domain, thread_info, exec_domain); 52 OFFSET(TI_exec_domain, thread_info, exec_domain);
52 OFFSET(TI_flags, thread_info, flags); 53 OFFSET(TI_flags, thread_info, flags);
53 OFFSET(TI_status, thread_info, status); 54 OFFSET(TI_status, thread_info, status);
54 OFFSET(TI_cpu, thread_info, cpu);
55 OFFSET(TI_preempt_count, thread_info, preempt_count); 55 OFFSET(TI_preempt_count, thread_info, preempt_count);
56 OFFSET(TI_addr_limit, thread_info, addr_limit); 56 OFFSET(TI_addr_limit, thread_info, addr_limit);
57 OFFSET(TI_restart_block, thread_info, restart_block); 57 OFFSET(TI_restart_block, thread_info, restart_block);
58 OFFSET(TI_sysenter_return, thread_info, sysenter_return); 58 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
59 BLANK(); 59 BLANK();
60 60
61 OFFSET(GDS_size, Xgt_desc_struct, size);
62 OFFSET(GDS_address, Xgt_desc_struct, address);
63 OFFSET(GDS_pad, Xgt_desc_struct, pad);
64 BLANK();
65
66 OFFSET(PT_EBX, pt_regs, ebx);
67 OFFSET(PT_ECX, pt_regs, ecx);
68 OFFSET(PT_EDX, pt_regs, edx);
69 OFFSET(PT_ESI, pt_regs, esi);
70 OFFSET(PT_EDI, pt_regs, edi);
71 OFFSET(PT_EBP, pt_regs, ebp);
72 OFFSET(PT_EAX, pt_regs, eax);
73 OFFSET(PT_DS, pt_regs, xds);
74 OFFSET(PT_ES, pt_regs, xes);
75 OFFSET(PT_GS, pt_regs, xgs);
76 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
77 OFFSET(PT_EIP, pt_regs, eip);
78 OFFSET(PT_CS, pt_regs, xcs);
79 OFFSET(PT_EFLAGS, pt_regs, eflags);
80 OFFSET(PT_OLDESP, pt_regs, esp);
81 OFFSET(PT_OLDSS, pt_regs, xss);
82 BLANK();
83
61 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); 84 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
62 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 85 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
63 BLANK(); 86 BLANK();
@@ -74,4 +97,18 @@ void foo(void)
74 DEFINE(VDSO_PRELINK, VDSO_PRELINK); 97 DEFINE(VDSO_PRELINK, VDSO_PRELINK);
75 98
76 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); 99 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
100
101 BLANK();
102 OFFSET(PDA_cpu, i386_pda, cpu_number);
103 OFFSET(PDA_pcurrent, i386_pda, pcurrent);
104
105#ifdef CONFIG_PARAVIRT
106 BLANK();
107 OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
108 OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
109 OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
110 OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
111 OFFSET(PARAVIRT_iret, paravirt_ops, iret);
112 OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
113#endif
77} 114}
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index e4758095d87a..41cfea57232b 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -104,10 +104,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
104 f_vide(); 104 f_vide();
105 rdtscl(d2); 105 rdtscl(d2);
106 d = d2-d; 106 d = d2-d;
107 107
108 /* Knock these two lines out if it debugs out ok */
109 printk(KERN_INFO "AMD K6 stepping B detected - ");
110 /* -- cut here -- */
111 if (d > 20*K6_BUG_LOOP) 108 if (d > 20*K6_BUG_LOOP)
112 printk("system stability may be impaired when more than 32 MB are used.\n"); 109 printk("system stability may be impaired when more than 32 MB are used.\n");
113 else 110 else
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index d9f3e3c31f05..1b34c56f8123 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,14 +18,15 @@
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <mach_apic.h> 19#include <mach_apic.h>
20#endif 20#endif
21#include <asm/pda.h>
21 22
22#include "cpu.h" 23#include "cpu.h"
23 24
24DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); 25DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
25EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); 26EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
26 27
27DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); 28struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
28EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); 29EXPORT_SYMBOL(_cpu_pda);
29 30
30static int cachesize_override __cpuinitdata = -1; 31static int cachesize_override __cpuinitdata = -1;
31static int disable_x86_fxsr __cpuinitdata; 32static int disable_x86_fxsr __cpuinitdata;
@@ -235,29 +236,14 @@ static int __cpuinit have_cpuid_p(void)
235 return flag_is_changeable_p(X86_EFLAGS_ID); 236 return flag_is_changeable_p(X86_EFLAGS_ID);
236} 237}
237 238
238/* Do minimum CPU detection early. 239void __init cpu_detect(struct cpuinfo_x86 *c)
239 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
240 The others are not touched to avoid unwanted side effects.
241
242 WARNING: this function is only called on the BP. Don't add code here
243 that is supposed to run on all CPUs. */
244static void __init early_cpu_detect(void)
245{ 240{
246 struct cpuinfo_x86 *c = &boot_cpu_data;
247
248 c->x86_cache_alignment = 32;
249
250 if (!have_cpuid_p())
251 return;
252
253 /* Get vendor name */ 241 /* Get vendor name */
254 cpuid(0x00000000, &c->cpuid_level, 242 cpuid(0x00000000, &c->cpuid_level,
255 (int *)&c->x86_vendor_id[0], 243 (int *)&c->x86_vendor_id[0],
256 (int *)&c->x86_vendor_id[8], 244 (int *)&c->x86_vendor_id[8],
257 (int *)&c->x86_vendor_id[4]); 245 (int *)&c->x86_vendor_id[4]);
258 246
259 get_cpu_vendor(c, 1);
260
261 c->x86 = 4; 247 c->x86 = 4;
262 if (c->cpuid_level >= 0x00000001) { 248 if (c->cpuid_level >= 0x00000001) {
263 u32 junk, tfms, cap0, misc; 249 u32 junk, tfms, cap0, misc;
@@ -274,6 +260,26 @@ static void __init early_cpu_detect(void)
274 } 260 }
275} 261}
276 262
263/* Do minimum CPU detection early.
264 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
265 The others are not touched to avoid unwanted side effects.
266
267 WARNING: this function is only called on the BP. Don't add code here
268 that is supposed to run on all CPUs. */
269static void __init early_cpu_detect(void)
270{
271 struct cpuinfo_x86 *c = &boot_cpu_data;
272
273 c->x86_cache_alignment = 32;
274
275 if (!have_cpuid_p())
276 return;
277
278 cpu_detect(c);
279
280 get_cpu_vendor(c, 1);
281}
282
277static void __cpuinit generic_identify(struct cpuinfo_x86 * c) 283static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
278{ 284{
279 u32 tfms, xlvl; 285 u32 tfms, xlvl;
@@ -308,6 +314,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
308#else 314#else
309 c->apicid = (ebx >> 24) & 0xFF; 315 c->apicid = (ebx >> 24) & 0xFF;
310#endif 316#endif
317 if (c->x86_capability[0] & (1<<19))
318 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
311 } else { 319 } else {
312 /* Have CPUID level 0 only - unheard of */ 320 /* Have CPUID level 0 only - unheard of */
313 c->x86 = 4; 321 c->x86 = 4;
@@ -372,6 +380,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
372 c->x86_vendor_id[0] = '\0'; /* Unset */ 380 c->x86_vendor_id[0] = '\0'; /* Unset */
373 c->x86_model_id[0] = '\0'; /* Unset */ 381 c->x86_model_id[0] = '\0'; /* Unset */
374 c->x86_max_cores = 1; 382 c->x86_max_cores = 1;
383 c->x86_clflush_size = 32;
375 memset(&c->x86_capability, 0, sizeof c->x86_capability); 384 memset(&c->x86_capability, 0, sizeof c->x86_capability);
376 385
377 if (!have_cpuid_p()) { 386 if (!have_cpuid_p()) {
@@ -591,42 +600,24 @@ void __init early_cpu_init(void)
591 disable_pse = 1; 600 disable_pse = 1;
592#endif 601#endif
593} 602}
594/* 603
595 * cpu_init() initializes state that is per-CPU. Some data is already 604/* Make sure %gs is initialized properly in idle threads */
596 * initialized (naturally) in the bootstrap process, such as the GDT 605struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
597 * and IDT. We reload them nevertheless, this function acts as a
598 * 'CPU state barrier', nothing should get across.
599 */
600void __cpuinit cpu_init(void)
601{ 606{
602 int cpu = smp_processor_id(); 607 memset(regs, 0, sizeof(struct pt_regs));
603 struct tss_struct * t = &per_cpu(init_tss, cpu); 608 regs->xgs = __KERNEL_PDA;
604 struct thread_struct *thread = &current->thread; 609 return regs;
605 struct desc_struct *gdt; 610}
606 __u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
607 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
608 611
609 if (cpu_test_and_set(cpu, cpu_initialized)) { 612static __cpuinit int alloc_gdt(int cpu)
610 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); 613{
611 for (;;) local_irq_enable(); 614 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
612 } 615 struct desc_struct *gdt;
613 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 616 struct i386_pda *pda;
614 617
615 if (cpu_has_vme || cpu_has_tsc || cpu_has_de) 618 gdt = (struct desc_struct *)cpu_gdt_descr->address;
616 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 619 pda = cpu_pda(cpu);
617 if (tsc_disable && cpu_has_tsc) {
618 printk(KERN_NOTICE "Disabling TSC...\n");
619 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
620 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
621 set_in_cr4(X86_CR4_TSD);
622 }
623 620
624 /* The CPU hotplug case */
625 if (cpu_gdt_descr->address) {
626 gdt = (struct desc_struct *)cpu_gdt_descr->address;
627 memset(gdt, 0, PAGE_SIZE);
628 goto old_gdt;
629 }
630 /* 621 /*
631 * This is a horrible hack to allocate the GDT. The problem 622 * This is a horrible hack to allocate the GDT. The problem
632 * is that cpu_init() is called really early for the boot CPU 623 * is that cpu_init() is called really early for the boot CPU
@@ -634,43 +625,130 @@ void __cpuinit cpu_init(void)
634 * CPUs, when bootmem will have gone away 625 * CPUs, when bootmem will have gone away
635 */ 626 */
636 if (NODE_DATA(0)->bdata->node_bootmem_map) { 627 if (NODE_DATA(0)->bdata->node_bootmem_map) {
637 gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); 628 BUG_ON(gdt != NULL || pda != NULL);
638 /* alloc_bootmem_pages panics on failure, so no check */ 629
630 gdt = alloc_bootmem_pages(PAGE_SIZE);
631 pda = alloc_bootmem(sizeof(*pda));
632 /* alloc_bootmem(_pages) panics on failure, so no check */
633
639 memset(gdt, 0, PAGE_SIZE); 634 memset(gdt, 0, PAGE_SIZE);
635 memset(pda, 0, sizeof(*pda));
640 } else { 636 } else {
641 gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); 637 /* GDT and PDA might already have been allocated if
642 if (unlikely(!gdt)) { 638 this is a CPU hotplug re-insertion. */
643 printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); 639 if (gdt == NULL)
644 for (;;) 640 gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
645 local_irq_enable(); 641
642 if (pda == NULL)
643 pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
644
645 if (unlikely(!gdt || !pda)) {
646 free_pages((unsigned long)gdt, 0);
647 kfree(pda);
648 return 0;
646 } 649 }
647 } 650 }
648old_gdt: 651
652 cpu_gdt_descr->address = (unsigned long)gdt;
653 cpu_pda(cpu) = pda;
654
655 return 1;
656}
657
658/* Initial PDA used by boot CPU */
659struct i386_pda boot_pda = {
660 ._pda = &boot_pda,
661 .cpu_number = 0,
662 .pcurrent = &init_task,
663};
664
665static inline void set_kernel_gs(void)
666{
667 /* Set %gs for this CPU's PDA. Memory clobber is to create a
668 barrier with respect to any PDA operations, so the compiler
669 doesn't move any before here. */
670 asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
671}
672
673/* Initialize the CPU's GDT and PDA. The boot CPU does this for
674 itself, but secondaries find this done for them. */
675__cpuinit int init_gdt(int cpu, struct task_struct *idle)
676{
677 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
678 struct desc_struct *gdt;
679 struct i386_pda *pda;
680
681 /* For non-boot CPUs, the GDT and PDA should already have been
682 allocated. */
683 if (!alloc_gdt(cpu)) {
684 printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
685 return 0;
686 }
687
688 gdt = (struct desc_struct *)cpu_gdt_descr->address;
689 pda = cpu_pda(cpu);
690
691 BUG_ON(gdt == NULL || pda == NULL);
692
649 /* 693 /*
650 * Initialize the per-CPU GDT with the boot GDT, 694 * Initialize the per-CPU GDT with the boot GDT,
651 * and set up the GDT descriptor: 695 * and set up the GDT descriptor:
652 */ 696 */
653 memcpy(gdt, cpu_gdt_table, GDT_SIZE); 697 memcpy(gdt, cpu_gdt_table, GDT_SIZE);
698 cpu_gdt_descr->size = GDT_SIZE - 1;
654 699
655 /* Set up GDT entry for 16bit stack */ 700 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
656 *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= 701 (u32 *)&gdt[GDT_ENTRY_PDA].b,
657 ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | 702 (unsigned long)pda, sizeof(*pda) - 1,
658 ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | 703 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
659 (CPU_16BIT_STACK_SIZE - 1);
660 704
661 cpu_gdt_descr->size = GDT_SIZE - 1; 705 memset(pda, 0, sizeof(*pda));
662 cpu_gdt_descr->address = (unsigned long)gdt; 706 pda->_pda = pda;
707 pda->cpu_number = cpu;
708 pda->pcurrent = idle;
709
710 return 1;
711}
712
713/* Common CPU init for both boot and secondary CPUs */
714static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
715{
716 struct tss_struct * t = &per_cpu(init_tss, cpu);
717 struct thread_struct *thread = &curr->thread;
718 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
663 719
720 /* Reinit these anyway, even if they've already been done (on
721 the boot CPU, this will transition from the boot gdt+pda to
722 the real ones). */
664 load_gdt(cpu_gdt_descr); 723 load_gdt(cpu_gdt_descr);
724 set_kernel_gs();
725
726 if (cpu_test_and_set(cpu, cpu_initialized)) {
727 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
728 for (;;) local_irq_enable();
729 }
730
731 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
732
733 if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
734 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
735 if (tsc_disable && cpu_has_tsc) {
736 printk(KERN_NOTICE "Disabling TSC...\n");
737 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
738 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
739 set_in_cr4(X86_CR4_TSD);
740 }
741
665 load_idt(&idt_descr); 742 load_idt(&idt_descr);
666 743
667 /* 744 /*
668 * Set up and load the per-CPU TSS and LDT 745 * Set up and load the per-CPU TSS and LDT
669 */ 746 */
670 atomic_inc(&init_mm.mm_count); 747 atomic_inc(&init_mm.mm_count);
671 current->active_mm = &init_mm; 748 curr->active_mm = &init_mm;
672 BUG_ON(current->mm); 749 if (curr->mm)
673 enter_lazy_tlb(&init_mm, current); 750 BUG();
751 enter_lazy_tlb(&init_mm, curr);
674 752
675 load_esp0(t, thread); 753 load_esp0(t, thread);
676 set_tss_desc(cpu,t); 754 set_tss_desc(cpu,t);
@@ -682,8 +760,8 @@ old_gdt:
682 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); 760 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
683#endif 761#endif
684 762
685 /* Clear %fs and %gs. */ 763 /* Clear %fs. */
686 asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0)); 764 asm volatile ("mov %0, %%fs" : : "r" (0));
687 765
688 /* Clear all 6 debug registers: */ 766 /* Clear all 6 debug registers: */
689 set_debugreg(0, 0); 767 set_debugreg(0, 0);
@@ -701,6 +779,37 @@ old_gdt:
701 mxcsr_feature_mask_init(); 779 mxcsr_feature_mask_init();
702} 780}
703 781
782/* Entrypoint to initialize secondary CPU */
783void __cpuinit secondary_cpu_init(void)
784{
785 int cpu = smp_processor_id();
786 struct task_struct *curr = current;
787
788 _cpu_init(cpu, curr);
789}
790
791/*
792 * cpu_init() initializes state that is per-CPU. Some data is already
793 * initialized (naturally) in the bootstrap process, such as the GDT
794 * and IDT. We reload them nevertheless, this function acts as a
795 * 'CPU state barrier', nothing should get across.
796 */
797void __cpuinit cpu_init(void)
798{
799 int cpu = smp_processor_id();
800 struct task_struct *curr = current;
801
802 /* Set up the real GDT and PDA, so we can transition from the
803 boot versions. */
804 if (!init_gdt(cpu, curr)) {
805 /* failed to allocate something; not much we can do... */
806 for (;;)
807 local_irq_enable();
808 }
809
810 _cpu_init(cpu, curr);
811}
812
704#ifdef CONFIG_HOTPLUG_CPU 813#ifdef CONFIG_HOTPLUG_CPU
705void __cpuinit cpu_uninit(void) 814void __cpuinit cpu_uninit(void)
706{ 815{
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 94a95aa5227e..56fe26584957 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
107 * Note that the workaround only should be initialized once... 107 * Note that the workaround only should be initialized once...
108 */ 108 */
109 c->f00f_bug = 0; 109 c->f00f_bug = 0;
110 if ( c->x86 == 5 ) { 110 if (!paravirt_enabled() && c->x86 == 5) {
111 static int f00f_workaround_enabled = 0; 111 static int f00f_workaround_enabled = 0;
112 112
113 c->f00f_bug = 1; 113 c->f00f_bug = 1;
@@ -195,8 +195,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
195 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 195 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
196 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 196 (c->x86 == 0x6 && c->x86_model >= 0x0e))
197 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); 197 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
198}
199 198
199 if (cpu_has_ds) {
200 unsigned int l1;
201 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
202 if (!(l1 & (1<<11)))
203 set_bit(X86_FEATURE_BTS, c->x86_capability);
204 if (!(l1 & (1<<12)))
205 set_bit(X86_FEATURE_PEBS, c->x86_capability);
206 }
207}
200 208
201static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) 209static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
202{ 210{
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 5c43be47587f..80b4c5d421b1 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -480,12 +480,10 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
480 if (num_cache_leaves == 0) 480 if (num_cache_leaves == 0)
481 return -ENOENT; 481 return -ENOENT;
482 482
483 cpuid4_info[cpu] = kmalloc( 483 cpuid4_info[cpu] = kzalloc(
484 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); 484 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
485 if (unlikely(cpuid4_info[cpu] == NULL)) 485 if (unlikely(cpuid4_info[cpu] == NULL))
486 return -ENOMEM; 486 return -ENOMEM;
487 memset(cpuid4_info[cpu], 0,
488 sizeof(struct _cpuid4_info) * num_cache_leaves);
489 487
490 oldmask = current->cpus_allowed; 488 oldmask = current->cpus_allowed;
491 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); 489 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu));
@@ -658,17 +656,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
658 return -ENOENT; 656 return -ENOENT;
659 657
660 /* Allocate all required memory */ 658 /* Allocate all required memory */
661 cache_kobject[cpu] = kmalloc(sizeof(struct kobject), GFP_KERNEL); 659 cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL);
662 if (unlikely(cache_kobject[cpu] == NULL)) 660 if (unlikely(cache_kobject[cpu] == NULL))
663 goto err_out; 661 goto err_out;
664 memset(cache_kobject[cpu], 0, sizeof(struct kobject));
665 662
666 index_kobject[cpu] = kmalloc( 663 index_kobject[cpu] = kzalloc(
667 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); 664 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
668 if (unlikely(index_kobject[cpu] == NULL)) 665 if (unlikely(index_kobject[cpu] == NULL))
669 goto err_out; 666 goto err_out;
670 memset(index_kobject[cpu], 0,
671 sizeof(struct _index_kobject) * num_cache_leaves);
672 667
673 return 0; 668 return 0;
674 669
diff --git a/arch/i386/kernel/cpu/mcheck/non-fatal.c b/arch/i386/kernel/cpu/mcheck/non-fatal.c
index 1f9153ae5b03..6b5d3518a1c0 100644
--- a/arch/i386/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/i386/kernel/cpu/mcheck/non-fatal.c
@@ -51,10 +51,10 @@ static void mce_checkregs (void *info)
51 } 51 }
52} 52}
53 53
54static void mce_work_fn(void *data); 54static void mce_work_fn(struct work_struct *work);
55static DECLARE_WORK(mce_work, mce_work_fn, NULL); 55static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
56 56
57static void mce_work_fn(void *data) 57static void mce_work_fn(struct work_struct *work)
58{ 58{
59 on_each_cpu(mce_checkregs, NULL, 1, 1); 59 on_each_cpu(mce_checkregs, NULL, 1, 1);
60 schedule_delayed_work(&mce_work, MCE_RATE); 60 schedule_delayed_work(&mce_work, MCE_RATE);
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index 2d8703b7ce65..065005c3f168 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -20,6 +20,7 @@
20#include <linux/cpu.h> 20#include <linux/cpu.h>
21#include <asm/cpu.h> 21#include <asm/cpu.h>
22#include <linux/notifier.h> 22#include <linux/notifier.h>
23#include <linux/jiffies.h>
23#include <asm/therm_throt.h> 24#include <asm/therm_throt.h>
24 25
25/* How long to wait between reporting thermal events */ 26/* How long to wait between reporting thermal events */
@@ -115,7 +116,6 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
115 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); 116 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
116} 117}
117 118
118#ifdef CONFIG_HOTPLUG_CPU
119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
120{ 120{
121 return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 121 return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
@@ -152,7 +152,6 @@ static struct notifier_block thermal_throttle_cpu_notifier =
152{ 152{
153 .notifier_call = thermal_throttle_cpu_callback, 153 .notifier_call = thermal_throttle_cpu_callback,
154}; 154};
155#endif /* CONFIG_HOTPLUG_CPU */
156 155
157static __init int thermal_throttle_init_device(void) 156static __init int thermal_throttle_init_device(void)
158{ 157{
diff --git a/arch/i386/kernel/cpu/mtrr/Makefile b/arch/i386/kernel/cpu/mtrr/Makefile
index a25b701ab84e..191fc0533649 100644
--- a/arch/i386/kernel/cpu/mtrr/Makefile
+++ b/arch/i386/kernel/cpu/mtrr/Makefile
@@ -1,5 +1,3 @@
1obj-y := main.o if.o generic.o state.o 1obj-y := main.o if.o generic.o state.o
2obj-y += amd.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3obj-y += cyrix.o
4obj-y += centaur.o
5 3
diff --git a/arch/i386/kernel/cpu/mtrr/amd.c b/arch/i386/kernel/cpu/mtrr/amd.c
index 1a1e04b6fd00..0949cdbf848a 100644
--- a/arch/i386/kernel/cpu/mtrr/amd.c
+++ b/arch/i386/kernel/cpu/mtrr/amd.c
@@ -7,7 +7,7 @@
7 7
8static void 8static void
9amd_get_mtrr(unsigned int reg, unsigned long *base, 9amd_get_mtrr(unsigned int reg, unsigned long *base,
10 unsigned int *size, mtrr_type * type) 10 unsigned long *size, mtrr_type * type)
11{ 11{
12 unsigned long low, high; 12 unsigned long low, high;
13 13
diff --git a/arch/i386/kernel/cpu/mtrr/centaur.c b/arch/i386/kernel/cpu/mtrr/centaur.c
index 33f00ac314ef..cb9aa3a7a7ab 100644
--- a/arch/i386/kernel/cpu/mtrr/centaur.c
+++ b/arch/i386/kernel/cpu/mtrr/centaur.c
@@ -17,7 +17,7 @@ static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
17 */ 17 */
18 18
19static int 19static int
20centaur_get_free_region(unsigned long base, unsigned long size) 20centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
21/* [SUMMARY] Get a free MTRR. 21/* [SUMMARY] Get a free MTRR.
22 <base> The starting (base) address of the region. 22 <base> The starting (base) address of the region.
23 <size> The size (in bytes) of the region. 23 <size> The size (in bytes) of the region.
@@ -26,10 +26,11 @@ centaur_get_free_region(unsigned long base, unsigned long size)
26{ 26{
27 int i, max; 27 int i, max;
28 mtrr_type ltype; 28 mtrr_type ltype;
29 unsigned long lbase; 29 unsigned long lbase, lsize;
30 unsigned int lsize;
31 30
32 max = num_var_ranges; 31 max = num_var_ranges;
32 if (replace_reg >= 0 && replace_reg < max)
33 return replace_reg;
33 for (i = 0; i < max; ++i) { 34 for (i = 0; i < max; ++i) {
34 if (centaur_mcr_reserved & (1 << i)) 35 if (centaur_mcr_reserved & (1 << i))
35 continue; 36 continue;
@@ -49,7 +50,7 @@ mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
49 50
50static void 51static void
51centaur_get_mcr(unsigned int reg, unsigned long *base, 52centaur_get_mcr(unsigned int reg, unsigned long *base,
52 unsigned int *size, mtrr_type * type) 53 unsigned long *size, mtrr_type * type)
53{ 54{
54 *base = centaur_mcr[reg].high >> PAGE_SHIFT; 55 *base = centaur_mcr[reg].high >> PAGE_SHIFT;
55 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; 56 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c
index 9027a987006b..0737a596db43 100644
--- a/arch/i386/kernel/cpu/mtrr/cyrix.c
+++ b/arch/i386/kernel/cpu/mtrr/cyrix.c
@@ -9,7 +9,7 @@ int arr3_protected;
9 9
10static void 10static void
11cyrix_get_arr(unsigned int reg, unsigned long *base, 11cyrix_get_arr(unsigned int reg, unsigned long *base,
12 unsigned int *size, mtrr_type * type) 12 unsigned long *size, mtrr_type * type)
13{ 13{
14 unsigned long flags; 14 unsigned long flags;
15 unsigned char arr, ccr3, rcr, shift; 15 unsigned char arr, ccr3, rcr, shift;
@@ -77,7 +77,7 @@ cyrix_get_arr(unsigned int reg, unsigned long *base,
77} 77}
78 78
79static int 79static int
80cyrix_get_free_region(unsigned long base, unsigned long size) 80cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
81/* [SUMMARY] Get a free ARR. 81/* [SUMMARY] Get a free ARR.
82 <base> The starting (base) address of the region. 82 <base> The starting (base) address of the region.
83 <size> The size (in bytes) of the region. 83 <size> The size (in bytes) of the region.
@@ -86,9 +86,24 @@ cyrix_get_free_region(unsigned long base, unsigned long size)
86{ 86{
87 int i; 87 int i;
88 mtrr_type ltype; 88 mtrr_type ltype;
89 unsigned long lbase; 89 unsigned long lbase, lsize;
90 unsigned int lsize;
91 90
91 switch (replace_reg) {
92 case 7:
93 if (size < 0x40)
94 break;
95 case 6:
96 case 5:
97 case 4:
98 return replace_reg;
99 case 3:
100 if (arr3_protected)
101 break;
102 case 2:
103 case 1:
104 case 0:
105 return replace_reg;
106 }
92 /* If we are to set up a region >32M then look at ARR7 immediately */ 107 /* If we are to set up a region >32M then look at ARR7 immediately */
93 if (size > 0x2000) { 108 if (size > 0x2000) {
94 cyrix_get_arr(7, &lbase, &lsize, &ltype); 109 cyrix_get_arr(7, &lbase, &lsize, &ltype);
@@ -214,7 +229,7 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
214 229
215typedef struct { 230typedef struct {
216 unsigned long base; 231 unsigned long base;
217 unsigned int size; 232 unsigned long size;
218 mtrr_type type; 233 mtrr_type type;
219} arr_state_t; 234} arr_state_t;
220 235
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index 0b61eed8bbd8..f77fc53db654 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -3,6 +3,7 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/module.h>
6#include <asm/io.h> 7#include <asm/io.h>
7#include <asm/mtrr.h> 8#include <asm/mtrr.h>
8#include <asm/msr.h> 9#include <asm/msr.h>
@@ -15,12 +16,19 @@ struct mtrr_state {
15 struct mtrr_var_range *var_ranges; 16 struct mtrr_var_range *var_ranges;
16 mtrr_type fixed_ranges[NUM_FIXED_RANGES]; 17 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
17 unsigned char enabled; 18 unsigned char enabled;
19 unsigned char have_fixed;
18 mtrr_type def_type; 20 mtrr_type def_type;
19}; 21};
20 22
21static unsigned long smp_changes_mask; 23static unsigned long smp_changes_mask;
22static struct mtrr_state mtrr_state = {}; 24static struct mtrr_state mtrr_state = {};
23 25
26#undef MODULE_PARAM_PREFIX
27#define MODULE_PARAM_PREFIX "mtrr."
28
29static __initdata int mtrr_show;
30module_param_named(show, mtrr_show, bool, 0);
31
24/* Get the MSR pair relating to a var range */ 32/* Get the MSR pair relating to a var range */
25static void __init 33static void __init
26get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) 34get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
@@ -43,6 +51,14 @@ get_fixed_ranges(mtrr_type * frs)
43 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); 51 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
44} 52}
45 53
54static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types)
55{
56 unsigned i;
57
58 for (i = 0; i < 8; ++i, ++types, base += step)
59 printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types));
60}
61
46/* Grab all of the MTRR state for this CPU into *state */ 62/* Grab all of the MTRR state for this CPU into *state */
47void __init get_mtrr_state(void) 63void __init get_mtrr_state(void)
48{ 64{
@@ -58,13 +74,49 @@ void __init get_mtrr_state(void)
58 } 74 }
59 vrs = mtrr_state.var_ranges; 75 vrs = mtrr_state.var_ranges;
60 76
77 rdmsr(MTRRcap_MSR, lo, dummy);
78 mtrr_state.have_fixed = (lo >> 8) & 1;
79
61 for (i = 0; i < num_var_ranges; i++) 80 for (i = 0; i < num_var_ranges; i++)
62 get_mtrr_var_range(i, &vrs[i]); 81 get_mtrr_var_range(i, &vrs[i]);
63 get_fixed_ranges(mtrr_state.fixed_ranges); 82 if (mtrr_state.have_fixed)
83 get_fixed_ranges(mtrr_state.fixed_ranges);
64 84
65 rdmsr(MTRRdefType_MSR, lo, dummy); 85 rdmsr(MTRRdefType_MSR, lo, dummy);
66 mtrr_state.def_type = (lo & 0xff); 86 mtrr_state.def_type = (lo & 0xff);
67 mtrr_state.enabled = (lo & 0xc00) >> 10; 87 mtrr_state.enabled = (lo & 0xc00) >> 10;
88
89 if (mtrr_show) {
90 int high_width;
91
92 printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
93 if (mtrr_state.have_fixed) {
94 printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
95 mtrr_state.enabled & 1 ? "en" : "dis");
96 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
97 for (i = 0; i < 2; ++i)
98 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
99 for (i = 0; i < 8; ++i)
100 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
101 }
102 printk(KERN_INFO "MTRR variable ranges %sabled:\n",
103 mtrr_state.enabled & 2 ? "en" : "dis");
104 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
105 for (i = 0; i < num_var_ranges; ++i) {
106 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
107 printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
108 i,
109 high_width,
110 mtrr_state.var_ranges[i].base_hi,
111 mtrr_state.var_ranges[i].base_lo >> 12,
112 high_width,
113 mtrr_state.var_ranges[i].mask_hi,
114 mtrr_state.var_ranges[i].mask_lo >> 12,
115 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
116 else
117 printk(KERN_INFO "MTRR %u disabled\n", i);
118 }
119 }
68} 120}
69 121
70/* Some BIOS's are fucked and don't set all MTRRs the same! */ 122/* Some BIOS's are fucked and don't set all MTRRs the same! */
@@ -95,7 +147,7 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
95 smp_processor_id(), msr, a, b); 147 smp_processor_id(), msr, a, b);
96} 148}
97 149
98int generic_get_free_region(unsigned long base, unsigned long size) 150int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
99/* [SUMMARY] Get a free MTRR. 151/* [SUMMARY] Get a free MTRR.
100 <base> The starting (base) address of the region. 152 <base> The starting (base) address of the region.
101 <size> The size (in bytes) of the region. 153 <size> The size (in bytes) of the region.
@@ -104,10 +156,11 @@ int generic_get_free_region(unsigned long base, unsigned long size)
104{ 156{
105 int i, max; 157 int i, max;
106 mtrr_type ltype; 158 mtrr_type ltype;
107 unsigned long lbase; 159 unsigned long lbase, lsize;
108 unsigned lsize;
109 160
110 max = num_var_ranges; 161 max = num_var_ranges;
162 if (replace_reg >= 0 && replace_reg < max)
163 return replace_reg;
111 for (i = 0; i < max; ++i) { 164 for (i = 0; i < max; ++i) {
112 mtrr_if->get(i, &lbase, &lsize, &ltype); 165 mtrr_if->get(i, &lbase, &lsize, &ltype);
113 if (lsize == 0) 166 if (lsize == 0)
@@ -117,7 +170,7 @@ int generic_get_free_region(unsigned long base, unsigned long size)
117} 170}
118 171
119static void generic_get_mtrr(unsigned int reg, unsigned long *base, 172static void generic_get_mtrr(unsigned int reg, unsigned long *base,
120 unsigned int *size, mtrr_type * type) 173 unsigned long *size, mtrr_type *type)
121{ 174{
122 unsigned int mask_lo, mask_hi, base_lo, base_hi; 175 unsigned int mask_lo, mask_hi, base_lo, base_hi;
123 176
@@ -202,7 +255,9 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
202 return changed; 255 return changed;
203} 256}
204 257
205static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) 258static u32 deftype_lo, deftype_hi;
259
260static unsigned long set_mtrr_state(void)
206/* [SUMMARY] Set the MTRR state for this CPU. 261/* [SUMMARY] Set the MTRR state for this CPU.
207 <state> The MTRR state information to read. 262 <state> The MTRR state information to read.
208 <ctxt> Some relevant CPU context. 263 <ctxt> Some relevant CPU context.
@@ -217,14 +272,14 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
217 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) 272 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
218 change_mask |= MTRR_CHANGE_MASK_VARIABLE; 273 change_mask |= MTRR_CHANGE_MASK_VARIABLE;
219 274
220 if (set_fixed_ranges(mtrr_state.fixed_ranges)) 275 if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
221 change_mask |= MTRR_CHANGE_MASK_FIXED; 276 change_mask |= MTRR_CHANGE_MASK_FIXED;
222 277
223 /* Set_mtrr_restore restores the old value of MTRRdefType, 278 /* Set_mtrr_restore restores the old value of MTRRdefType,
224 so to set it we fiddle with the saved value */ 279 so to set it we fiddle with the saved value */
225 if ((deftype_lo & 0xff) != mtrr_state.def_type 280 if ((deftype_lo & 0xff) != mtrr_state.def_type
226 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { 281 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
227 deftype_lo |= (mtrr_state.def_type | mtrr_state.enabled << 10); 282 deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10);
228 change_mask |= MTRR_CHANGE_MASK_DEFTYPE; 283 change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
229 } 284 }
230 285
@@ -233,7 +288,6 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
233 288
234 289
235static unsigned long cr4 = 0; 290static unsigned long cr4 = 0;
236static u32 deftype_lo, deftype_hi;
237static DEFINE_SPINLOCK(set_atomicity_lock); 291static DEFINE_SPINLOCK(set_atomicity_lock);
238 292
239/* 293/*
@@ -271,7 +325,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
271 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 325 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
272 326
273 /* Disable MTRRs, and set the default type to uncached */ 327 /* Disable MTRRs, and set the default type to uncached */
274 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi); 328 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
275} 329}
276 330
277static void post_set(void) __releases(set_atomicity_lock) 331static void post_set(void) __releases(set_atomicity_lock)
@@ -300,7 +354,7 @@ static void generic_set_all(void)
300 prepare_set(); 354 prepare_set();
301 355
302 /* Actually set the state */ 356 /* Actually set the state */
303 mask = set_mtrr_state(deftype_lo,deftype_hi); 357 mask = set_mtrr_state();
304 358
305 post_set(); 359 post_set();
306 local_irq_restore(flags); 360 local_irq_restore(flags);
@@ -366,7 +420,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
366 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); 420 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
367 return -EINVAL; 421 return -EINVAL;
368 } 422 }
369 if (!(base + size < 0x70000000 || base > 0x7003FFFF) && 423 if (!(base + size < 0x70000 || base > 0x7003F) &&
370 (type == MTRR_TYPE_WRCOMB 424 (type == MTRR_TYPE_WRCOMB
371 || type == MTRR_TYPE_WRBACK)) { 425 || type == MTRR_TYPE_WRBACK)) {
372 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); 426 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index 5ac051bb9d55..5ae1705eafa6 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -17,7 +17,7 @@ extern unsigned int *usage_table;
17 17
18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) 18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
19 19
20static char *mtrr_strings[MTRR_NUM_TYPES] = 20static const char *const mtrr_strings[MTRR_NUM_TYPES] =
21{ 21{
22 "uncachable", /* 0 */ 22 "uncachable", /* 0 */
23 "write-combining", /* 1 */ 23 "write-combining", /* 1 */
@@ -28,7 +28,7 @@ static char *mtrr_strings[MTRR_NUM_TYPES] =
28 "write-back", /* 6 */ 28 "write-back", /* 6 */
29}; 29};
30 30
31char *mtrr_attrib_to_str(int x) 31const char *mtrr_attrib_to_str(int x)
32{ 32{
33 return (x <= 6) ? mtrr_strings[x] : "?"; 33 return (x <= 6) ? mtrr_strings[x] : "?";
34} 34}
@@ -44,10 +44,9 @@ mtrr_file_add(unsigned long base, unsigned long size,
44 44
45 max = num_var_ranges; 45 max = num_var_ranges;
46 if (fcount == NULL) { 46 if (fcount == NULL) {
47 fcount = kmalloc(max * sizeof *fcount, GFP_KERNEL); 47 fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
48 if (!fcount) 48 if (!fcount)
49 return -ENOMEM; 49 return -ENOMEM;
50 memset(fcount, 0, max * sizeof *fcount);
51 FILE_FCOUNT(file) = fcount; 50 FILE_FCOUNT(file) = fcount;
52 } 51 }
53 if (!page) { 52 if (!page) {
@@ -155,6 +154,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
155{ 154{
156 int err = 0; 155 int err = 0;
157 mtrr_type type; 156 mtrr_type type;
157 unsigned long size;
158 struct mtrr_sentry sentry; 158 struct mtrr_sentry sentry;
159 struct mtrr_gentry gentry; 159 struct mtrr_gentry gentry;
160 void __user *arg = (void __user *) __arg; 160 void __user *arg = (void __user *) __arg;
@@ -235,15 +235,15 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
235 case MTRRIOC_GET_ENTRY: 235 case MTRRIOC_GET_ENTRY:
236 if (gentry.regnum >= num_var_ranges) 236 if (gentry.regnum >= num_var_ranges)
237 return -EINVAL; 237 return -EINVAL;
238 mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); 238 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
239 239
240 /* Hide entries that go above 4GB */ 240 /* Hide entries that go above 4GB */
241 if (gentry.base + gentry.size > 0x100000 241 if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
242 || gentry.size == 0x100000) 242 || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
243 gentry.base = gentry.size = gentry.type = 0; 243 gentry.base = gentry.size = gentry.type = 0;
244 else { 244 else {
245 gentry.base <<= PAGE_SHIFT; 245 gentry.base <<= PAGE_SHIFT;
246 gentry.size <<= PAGE_SHIFT; 246 gentry.size = size << PAGE_SHIFT;
247 gentry.type = type; 247 gentry.type = type;
248 } 248 }
249 249
@@ -273,8 +273,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
273 case MTRRIOC_GET_PAGE_ENTRY: 273 case MTRRIOC_GET_PAGE_ENTRY:
274 if (gentry.regnum >= num_var_ranges) 274 if (gentry.regnum >= num_var_ranges)
275 return -EINVAL; 275 return -EINVAL;
276 mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); 276 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
277 gentry.type = type; 277 /* Hide entries that would overflow */
278 if (size != (__typeof__(gentry.size))size)
279 gentry.base = gentry.size = gentry.type = 0;
280 else {
281 gentry.size = size;
282 gentry.type = type;
283 }
278 break; 284 break;
279 } 285 }
280 286
@@ -353,8 +359,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
353 char factor; 359 char factor;
354 int i, max, len; 360 int i, max, len;
355 mtrr_type type; 361 mtrr_type type;
356 unsigned long base; 362 unsigned long base, size;
357 unsigned int size;
358 363
359 len = 0; 364 len = 0;
360 max = num_var_ranges; 365 max = num_var_ranges;
@@ -373,7 +378,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
373 } 378 }
374 /* RED-PEN: base can be > 32bit */ 379 /* RED-PEN: base can be > 32bit */
375 len += seq_printf(seq, 380 len += seq_printf(seq,
376 "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n", 381 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
377 i, base, base >> (20 - PAGE_SHIFT), size, factor, 382 i, base, base >> (20 - PAGE_SHIFT), size, factor,
378 mtrr_attrib_to_str(type), usage_table[i]); 383 mtrr_attrib_to_str(type), usage_table[i]);
379 } 384 }
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index fff90bda4733..16bb7ea87145 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -59,7 +59,11 @@ struct mtrr_ops * mtrr_if = NULL;
59static void set_mtrr(unsigned int reg, unsigned long base, 59static void set_mtrr(unsigned int reg, unsigned long base,
60 unsigned long size, mtrr_type type); 60 unsigned long size, mtrr_type type);
61 61
62#ifndef CONFIG_X86_64
62extern int arr3_protected; 63extern int arr3_protected;
64#else
65#define arr3_protected 0
66#endif
63 67
64void set_mtrr_ops(struct mtrr_ops * ops) 68void set_mtrr_ops(struct mtrr_ops * ops)
65{ 69{
@@ -168,6 +172,13 @@ static void ipi_handler(void *info)
168 172
169#endif 173#endif
170 174
175static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
176 return type1 == MTRR_TYPE_UNCACHABLE ||
177 type2 == MTRR_TYPE_UNCACHABLE ||
178 (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
179 (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
180}
181
171/** 182/**
172 * set_mtrr - update mtrrs on all processors 183 * set_mtrr - update mtrrs on all processors
173 * @reg: mtrr in question 184 * @reg: mtrr in question
@@ -263,8 +274,8 @@ static void set_mtrr(unsigned int reg, unsigned long base,
263 274
264/** 275/**
265 * mtrr_add_page - Add a memory type region 276 * mtrr_add_page - Add a memory type region
266 * @base: Physical base address of region in pages (4 KB) 277 * @base: Physical base address of region in pages (in units of 4 kB!)
267 * @size: Physical size of region in pages (4 KB) 278 * @size: Physical size of region in pages (4 kB)
268 * @type: Type of MTRR desired 279 * @type: Type of MTRR desired
269 * @increment: If this is true do usage counting on the region 280 * @increment: If this is true do usage counting on the region
270 * 281 *
@@ -300,11 +311,9 @@ static void set_mtrr(unsigned int reg, unsigned long base,
300int mtrr_add_page(unsigned long base, unsigned long size, 311int mtrr_add_page(unsigned long base, unsigned long size,
301 unsigned int type, char increment) 312 unsigned int type, char increment)
302{ 313{
303 int i; 314 int i, replace, error;
304 mtrr_type ltype; 315 mtrr_type ltype;
305 unsigned long lbase; 316 unsigned long lbase, lsize;
306 unsigned int lsize;
307 int error;
308 317
309 if (!mtrr_if) 318 if (!mtrr_if)
310 return -ENXIO; 319 return -ENXIO;
@@ -324,12 +333,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
324 return -ENOSYS; 333 return -ENOSYS;
325 } 334 }
326 335
336 if (!size) {
337 printk(KERN_WARNING "mtrr: zero sized request\n");
338 return -EINVAL;
339 }
340
327 if (base & size_or_mask || size & size_or_mask) { 341 if (base & size_or_mask || size & size_or_mask) {
328 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); 342 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n");
329 return -EINVAL; 343 return -EINVAL;
330 } 344 }
331 345
332 error = -EINVAL; 346 error = -EINVAL;
347 replace = -1;
333 348
334 /* No CPU hotplug when we change MTRR entries */ 349 /* No CPU hotplug when we change MTRR entries */
335 lock_cpu_hotplug(); 350 lock_cpu_hotplug();
@@ -337,21 +352,28 @@ int mtrr_add_page(unsigned long base, unsigned long size,
337 mutex_lock(&mtrr_mutex); 352 mutex_lock(&mtrr_mutex);
338 for (i = 0; i < num_var_ranges; ++i) { 353 for (i = 0; i < num_var_ranges; ++i) {
339 mtrr_if->get(i, &lbase, &lsize, &ltype); 354 mtrr_if->get(i, &lbase, &lsize, &ltype);
340 if (base >= lbase + lsize) 355 if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase)
341 continue;
342 if ((base < lbase) && (base + size <= lbase))
343 continue; 356 continue;
344 /* At this point we know there is some kind of overlap/enclosure */ 357 /* At this point we know there is some kind of overlap/enclosure */
345 if ((base < lbase) || (base + size > lbase + lsize)) { 358 if (base < lbase || base + size - 1 > lbase + lsize - 1) {
359 if (base <= lbase && base + size - 1 >= lbase + lsize - 1) {
360 /* New region encloses an existing region */
361 if (type == ltype) {
362 replace = replace == -1 ? i : -2;
363 continue;
364 }
365 else if (types_compatible(type, ltype))
366 continue;
367 }
346 printk(KERN_WARNING 368 printk(KERN_WARNING
347 "mtrr: 0x%lx000,0x%lx000 overlaps existing" 369 "mtrr: 0x%lx000,0x%lx000 overlaps existing"
348 " 0x%lx000,0x%x000\n", base, size, lbase, 370 " 0x%lx000,0x%lx000\n", base, size, lbase,
349 lsize); 371 lsize);
350 goto out; 372 goto out;
351 } 373 }
352 /* New region is enclosed by an existing region */ 374 /* New region is enclosed by an existing region */
353 if (ltype != type) { 375 if (ltype != type) {
354 if (type == MTRR_TYPE_UNCACHABLE) 376 if (types_compatible(type, ltype))
355 continue; 377 continue;
356 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", 378 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
357 base, size, mtrr_attrib_to_str(ltype), 379 base, size, mtrr_attrib_to_str(ltype),
@@ -364,10 +386,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
364 goto out; 386 goto out;
365 } 387 }
366 /* Search for an empty MTRR */ 388 /* Search for an empty MTRR */
367 i = mtrr_if->get_free_region(base, size); 389 i = mtrr_if->get_free_region(base, size, replace);
368 if (i >= 0) { 390 if (i >= 0) {
369 set_mtrr(i, base, size, type); 391 set_mtrr(i, base, size, type);
370 usage_table[i] = 1; 392 if (likely(replace < 0))
393 usage_table[i] = 1;
394 else {
395 usage_table[i] = usage_table[replace] + !!increment;
396 if (unlikely(replace != i)) {
397 set_mtrr(replace, 0, 0, 0);
398 usage_table[replace] = 0;
399 }
400 }
371 } else 401 } else
372 printk(KERN_INFO "mtrr: no more MTRRs available\n"); 402 printk(KERN_INFO "mtrr: no more MTRRs available\n");
373 error = i; 403 error = i;
@@ -455,8 +485,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
455{ 485{
456 int i, max; 486 int i, max;
457 mtrr_type ltype; 487 mtrr_type ltype;
458 unsigned long lbase; 488 unsigned long lbase, lsize;
459 unsigned int lsize;
460 int error = -EINVAL; 489 int error = -EINVAL;
461 490
462 if (!mtrr_if) 491 if (!mtrr_if)
@@ -544,9 +573,11 @@ extern void centaur_init_mtrr(void);
544 573
545static void __init init_ifs(void) 574static void __init init_ifs(void)
546{ 575{
576#ifndef CONFIG_X86_64
547 amd_init_mtrr(); 577 amd_init_mtrr();
548 cyrix_init_mtrr(); 578 cyrix_init_mtrr();
549 centaur_init_mtrr(); 579 centaur_init_mtrr();
580#endif
550} 581}
551 582
552/* The suspend/resume methods are only for CPU without MTRR. CPU using generic 583/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
@@ -555,7 +586,7 @@ static void __init init_ifs(void)
555struct mtrr_value { 586struct mtrr_value {
556 mtrr_type ltype; 587 mtrr_type ltype;
557 unsigned long lbase; 588 unsigned long lbase;
558 unsigned int lsize; 589 unsigned long lsize;
559}; 590};
560 591
561static struct mtrr_value * mtrr_state; 592static struct mtrr_value * mtrr_state;
@@ -565,10 +596,8 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
565 int i; 596 int i;
566 int size = num_var_ranges * sizeof(struct mtrr_value); 597 int size = num_var_ranges * sizeof(struct mtrr_value);
567 598
568 mtrr_state = kmalloc(size,GFP_ATOMIC); 599 mtrr_state = kzalloc(size,GFP_ATOMIC);
569 if (mtrr_state) 600 if (!mtrr_state)
570 memset(mtrr_state,0,size);
571 else
572 return -ENOMEM; 601 return -ENOMEM;
573 602
574 for (i = 0; i < num_var_ranges; i++) { 603 for (i = 0; i < num_var_ranges; i++) {
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index 99c9f2682041..d61ea9db6cfe 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -43,15 +43,16 @@ struct mtrr_ops {
43 void (*set_all)(void); 43 void (*set_all)(void);
44 44
45 void (*get)(unsigned int reg, unsigned long *base, 45 void (*get)(unsigned int reg, unsigned long *base,
46 unsigned int *size, mtrr_type * type); 46 unsigned long *size, mtrr_type * type);
47 int (*get_free_region) (unsigned long base, unsigned long size); 47 int (*get_free_region)(unsigned long base, unsigned long size,
48 48 int replace_reg);
49 int (*validate_add_page)(unsigned long base, unsigned long size, 49 int (*validate_add_page)(unsigned long base, unsigned long size,
50 unsigned int type); 50 unsigned int type);
51 int (*have_wrcomb)(void); 51 int (*have_wrcomb)(void);
52}; 52};
53 53
54extern int generic_get_free_region(unsigned long base, unsigned long size); 54extern int generic_get_free_region(unsigned long base, unsigned long size,
55 int replace_reg);
55extern int generic_validate_add_page(unsigned long base, unsigned long size, 56extern int generic_validate_add_page(unsigned long base, unsigned long size,
56 unsigned int type); 57 unsigned int type);
57 58
@@ -62,17 +63,17 @@ extern int positive_have_wrcomb(void);
62/* library functions for processor-specific routines */ 63/* library functions for processor-specific routines */
63struct set_mtrr_context { 64struct set_mtrr_context {
64 unsigned long flags; 65 unsigned long flags;
65 unsigned long deftype_lo;
66 unsigned long deftype_hi;
67 unsigned long cr4val; 66 unsigned long cr4val;
68 unsigned long ccr3; 67 u32 deftype_lo;
68 u32 deftype_hi;
69 u32 ccr3;
69}; 70};
70 71
71struct mtrr_var_range { 72struct mtrr_var_range {
72 unsigned long base_lo; 73 u32 base_lo;
73 unsigned long base_hi; 74 u32 base_hi;
74 unsigned long mask_lo; 75 u32 mask_lo;
75 unsigned long mask_hi; 76 u32 mask_hi;
76}; 77};
77 78
78void set_mtrr_done(struct set_mtrr_context *ctxt); 79void set_mtrr_done(struct set_mtrr_context *ctxt);
@@ -92,6 +93,6 @@ extern struct mtrr_ops * mtrr_if;
92extern unsigned int num_var_ranges; 93extern unsigned int num_var_ranges;
93 94
94void mtrr_state_warn(void); 95void mtrr_state_warn(void);
95char *mtrr_attrib_to_str(int x); 96const char *mtrr_attrib_to_str(int x);
96void mtrr_wrmsr(unsigned, unsigned, unsigned); 97void mtrr_wrmsr(unsigned, unsigned, unsigned);
97 98
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 76aac088a323..6624d8583c42 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -152,9 +152,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
152 seq_printf(m, " [%d]", i); 152 seq_printf(m, " [%d]", i);
153 } 153 }
154 154
155 seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n", 155 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
156 c->loops_per_jiffy/(500000/HZ), 156 c->loops_per_jiffy/(500000/HZ),
157 (c->loops_per_jiffy/(5000/HZ)) % 100); 157 (c->loops_per_jiffy/(5000/HZ)) % 100);
158 seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size);
158 159
159 return 0; 160 return 0;
160} 161}
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index fde8bea85cee..51130b39cd2e 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -34,7 +34,6 @@
34#include <linux/major.h> 34#include <linux/major.h>
35#include <linux/fs.h> 35#include <linux/fs.h>
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/fs.h>
38#include <linux/device.h> 37#include <linux/device.h>
39#include <linux/cpu.h> 38#include <linux/cpu.h>
40#include <linux/notifier.h> 39#include <linux/notifier.h>
@@ -117,7 +116,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
117 char __user *tmp = buf; 116 char __user *tmp = buf;
118 u32 data[4]; 117 u32 data[4];
119 u32 reg = *ppos; 118 u32 reg = *ppos;
120 int cpu = iminor(file->f_dentry->d_inode); 119 int cpu = iminor(file->f_path.dentry->d_inode);
121 120
122 if (count % 16) 121 if (count % 16)
123 return -EINVAL; /* Invalid chunk size */ 122 return -EINVAL; /* Invalid chunk size */
@@ -135,7 +134,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
135 134
136static int cpuid_open(struct inode *inode, struct file *file) 135static int cpuid_open(struct inode *inode, struct file *file)
137{ 136{
138 unsigned int cpu = iminor(file->f_dentry->d_inode); 137 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
139 struct cpuinfo_x86 *c = &(cpu_data)[cpu]; 138 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
140 139
141 if (cpu >= NR_CPUS || !cpu_online(cpu)) 140 if (cpu >= NR_CPUS || !cpu_online(cpu))
@@ -156,28 +155,27 @@ static struct file_operations cpuid_fops = {
156 .open = cpuid_open, 155 .open = cpuid_open,
157}; 156};
158 157
159static int cpuid_class_device_create(int i) 158static int cpuid_device_create(int i)
160{ 159{
161 int err = 0; 160 int err = 0;
162 struct class_device *class_err; 161 struct device *dev;
163 162
164 class_err = class_device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), NULL, "cpu%d",i); 163 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), "cpu%d",i);
165 if (IS_ERR(class_err)) 164 if (IS_ERR(dev))
166 err = PTR_ERR(class_err); 165 err = PTR_ERR(dev);
167 return err; 166 return err;
168} 167}
169 168
170#ifdef CONFIG_HOTPLUG_CPU
171static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 169static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
172{ 170{
173 unsigned int cpu = (unsigned long)hcpu; 171 unsigned int cpu = (unsigned long)hcpu;
174 172
175 switch (action) { 173 switch (action) {
176 case CPU_ONLINE: 174 case CPU_ONLINE:
177 cpuid_class_device_create(cpu); 175 cpuid_device_create(cpu);
178 break; 176 break;
179 case CPU_DEAD: 177 case CPU_DEAD:
180 class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); 178 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
181 break; 179 break;
182 } 180 }
183 return NOTIFY_OK; 181 return NOTIFY_OK;
@@ -187,7 +185,6 @@ static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
187{ 185{
188 .notifier_call = cpuid_class_cpu_callback, 186 .notifier_call = cpuid_class_cpu_callback,
189}; 187};
190#endif /* !CONFIG_HOTPLUG_CPU */
191 188
192static int __init cpuid_init(void) 189static int __init cpuid_init(void)
193{ 190{
@@ -206,7 +203,7 @@ static int __init cpuid_init(void)
206 goto out_chrdev; 203 goto out_chrdev;
207 } 204 }
208 for_each_online_cpu(i) { 205 for_each_online_cpu(i) {
209 err = cpuid_class_device_create(i); 206 err = cpuid_device_create(i);
210 if (err != 0) 207 if (err != 0)
211 goto out_class; 208 goto out_class;
212 } 209 }
@@ -218,7 +215,7 @@ static int __init cpuid_init(void)
218out_class: 215out_class:
219 i = 0; 216 i = 0;
220 for_each_online_cpu(i) { 217 for_each_online_cpu(i) {
221 class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i)); 218 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i));
222 } 219 }
223 class_destroy(cpuid_class); 220 class_destroy(cpuid_class);
224out_chrdev: 221out_chrdev:
@@ -232,7 +229,7 @@ static void __exit cpuid_exit(void)
232 int cpu = 0; 229 int cpu = 0;
233 230
234 for_each_online_cpu(cpu) 231 for_each_online_cpu(cpu)
235 class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); 232 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
236 class_destroy(cpuid_class); 233 class_destroy(cpuid_class);
237 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 234 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
238 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); 235 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 144b43288965..a5e0e990ea95 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -31,68 +31,6 @@
31/* This keeps a track of which one is crashing cpu. */ 31/* This keeps a track of which one is crashing cpu. */
32static int crashing_cpu; 32static int crashing_cpu;
33 33
34static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
35 size_t data_len)
36{
37 struct elf_note note;
38
39 note.n_namesz = strlen(name) + 1;
40 note.n_descsz = data_len;
41 note.n_type = type;
42 memcpy(buf, &note, sizeof(note));
43 buf += (sizeof(note) +3)/4;
44 memcpy(buf, name, note.n_namesz);
45 buf += (note.n_namesz + 3)/4;
46 memcpy(buf, data, note.n_descsz);
47 buf += (note.n_descsz + 3)/4;
48
49 return buf;
50}
51
52static void final_note(u32 *buf)
53{
54 struct elf_note note;
55
56 note.n_namesz = 0;
57 note.n_descsz = 0;
58 note.n_type = 0;
59 memcpy(buf, &note, sizeof(note));
60}
61
62static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
63{
64 struct elf_prstatus prstatus;
65 u32 *buf;
66
67 if ((cpu < 0) || (cpu >= NR_CPUS))
68 return;
69
70 /* Using ELF notes here is opportunistic.
71 * I need a well defined structure format
72 * for the data I pass, and I need tags
73 * on the data to indicate what information I have
74 * squirrelled away. ELF notes happen to provide
75 * all of that, so there is no need to invent something new.
76 */
77 buf = (u32*)per_cpu_ptr(crash_notes, cpu);
78 if (!buf)
79 return;
80 memset(&prstatus, 0, sizeof(prstatus));
81 prstatus.pr_pid = current->pid;
82 elf_core_copy_regs(&prstatus.pr_reg, regs);
83 buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
84 sizeof(prstatus));
85 final_note(buf);
86}
87
88static void crash_save_self(struct pt_regs *regs)
89{
90 int cpu;
91
92 cpu = safe_smp_processor_id();
93 crash_save_this_cpu(regs, cpu);
94}
95
96#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 34#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
97static atomic_t waiting_for_crash_ipi; 35static atomic_t waiting_for_crash_ipi;
98 36
@@ -121,7 +59,7 @@ static int crash_nmi_callback(struct notifier_block *self,
121 crash_fixup_ss_esp(&fixed_regs, regs); 59 crash_fixup_ss_esp(&fixed_regs, regs);
122 regs = &fixed_regs; 60 regs = &fixed_regs;
123 } 61 }
124 crash_save_this_cpu(regs, cpu); 62 crash_save_cpu(regs, cpu);
125 disable_local_APIC(); 63 disable_local_APIC();
126 atomic_dec(&waiting_for_crash_ipi); 64 atomic_dec(&waiting_for_crash_ipi);
127 /* Assume hlt works */ 65 /* Assume hlt works */
@@ -195,5 +133,5 @@ void machine_crash_shutdown(struct pt_regs *regs)
195#if defined(CONFIG_X86_IO_APIC) 133#if defined(CONFIG_X86_IO_APIC)
196 disable_IO_APIC(); 134 disable_IO_APIC();
197#endif 135#endif
198 crash_save_self(regs); 136 crash_save_cpu(regs, safe_smp_processor_id());
199} 137}
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
new file mode 100644
index 000000000000..2f7d0a92fd7c
--- /dev/null
+++ b/arch/i386/kernel/e820.c
@@ -0,0 +1,894 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bootmem.h>
5#include <linux/ioport.h>
6#include <linux/string.h>
7#include <linux/kexec.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/efi.h>
11#include <linux/pfn.h>
12#include <linux/uaccess.h>
13
14#include <asm/pgtable.h>
15#include <asm/page.h>
16#include <asm/e820.h>
17
18#ifdef CONFIG_EFI
19int efi_enabled = 0;
20EXPORT_SYMBOL(efi_enabled);
21#endif
22
23struct e820map e820;
24struct change_member {
25 struct e820entry *pbios; /* pointer to original bios entry */
26 unsigned long long addr; /* address for this change point */
27};
28static struct change_member change_point_list[2*E820MAX] __initdata;
29static struct change_member *change_point[2*E820MAX] __initdata;
30static struct e820entry *overlap_list[E820MAX] __initdata;
31static struct e820entry new_bios[E820MAX] __initdata;
32/* For PCI or other memory-mapped resources */
33unsigned long pci_mem_start = 0x10000000;
34#ifdef CONFIG_PCI
35EXPORT_SYMBOL(pci_mem_start);
36#endif
37extern int user_defined_memmap;
38struct resource data_resource = {
39 .name = "Kernel data",
40 .start = 0,
41 .end = 0,
42 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
43};
44
45struct resource code_resource = {
46 .name = "Kernel code",
47 .start = 0,
48 .end = 0,
49 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
50};
51
52static struct resource system_rom_resource = {
53 .name = "System ROM",
54 .start = 0xf0000,
55 .end = 0xfffff,
56 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
57};
58
59static struct resource extension_rom_resource = {
60 .name = "Extension ROM",
61 .start = 0xe0000,
62 .end = 0xeffff,
63 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
64};
65
66static struct resource adapter_rom_resources[] = { {
67 .name = "Adapter ROM",
68 .start = 0xc8000,
69 .end = 0,
70 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
71}, {
72 .name = "Adapter ROM",
73 .start = 0,
74 .end = 0,
75 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
76}, {
77 .name = "Adapter ROM",
78 .start = 0,
79 .end = 0,
80 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
81}, {
82 .name = "Adapter ROM",
83 .start = 0,
84 .end = 0,
85 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
86}, {
87 .name = "Adapter ROM",
88 .start = 0,
89 .end = 0,
90 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
91}, {
92 .name = "Adapter ROM",
93 .start = 0,
94 .end = 0,
95 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
96} };
97
98static struct resource video_rom_resource = {
99 .name = "Video ROM",
100 .start = 0xc0000,
101 .end = 0xc7fff,
102 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
103};
104
105static struct resource video_ram_resource = {
106 .name = "Video RAM area",
107 .start = 0xa0000,
108 .end = 0xbffff,
109 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
110};
111
112static struct resource standard_io_resources[] = { {
113 .name = "dma1",
114 .start = 0x0000,
115 .end = 0x001f,
116 .flags = IORESOURCE_BUSY | IORESOURCE_IO
117}, {
118 .name = "pic1",
119 .start = 0x0020,
120 .end = 0x0021,
121 .flags = IORESOURCE_BUSY | IORESOURCE_IO
122}, {
123 .name = "timer0",
124 .start = 0x0040,
125 .end = 0x0043,
126 .flags = IORESOURCE_BUSY | IORESOURCE_IO
127}, {
128 .name = "timer1",
129 .start = 0x0050,
130 .end = 0x0053,
131 .flags = IORESOURCE_BUSY | IORESOURCE_IO
132}, {
133 .name = "keyboard",
134 .start = 0x0060,
135 .end = 0x006f,
136 .flags = IORESOURCE_BUSY | IORESOURCE_IO
137}, {
138 .name = "dma page reg",
139 .start = 0x0080,
140 .end = 0x008f,
141 .flags = IORESOURCE_BUSY | IORESOURCE_IO
142}, {
143 .name = "pic2",
144 .start = 0x00a0,
145 .end = 0x00a1,
146 .flags = IORESOURCE_BUSY | IORESOURCE_IO
147}, {
148 .name = "dma2",
149 .start = 0x00c0,
150 .end = 0x00df,
151 .flags = IORESOURCE_BUSY | IORESOURCE_IO
152}, {
153 .name = "fpu",
154 .start = 0x00f0,
155 .end = 0x00ff,
156 .flags = IORESOURCE_BUSY | IORESOURCE_IO
157} };
158
/*
 * Return 1 when the 16-bit word at @x reads as 0xaa55, otherwise 0.
 *
 * The word is fetched with probe_kernel_address(); a non-zero result
 * from that call is treated as "no signature".
 */
static int romsignature(const unsigned char *x)
{
	unsigned short sig;

	if (probe_kernel_address((const unsigned short *)x, sig) != 0)
		return 0;

	return sig == 0xaa55;
}
167
168static int __init romchecksum(unsigned char *rom, unsigned long length)
169{
170 unsigned char *p, sum = 0;
171
172 for (p = rom; p < rom + length; p++)
173 sum += *p;
174 return sum == 0;
175}
176
177static void __init probe_roms(void)
178{
179 unsigned long start, length, upper;
180 unsigned char *rom;
181 int i;
182
183 /* video rom */
184 upper = adapter_rom_resources[0].start;
185 for (start = video_rom_resource.start; start < upper; start += 2048) {
186 rom = isa_bus_to_virt(start);
187 if (!romsignature(rom))
188 continue;
189
190 video_rom_resource.start = start;
191
192 /* 0 < length <= 0x7f * 512, historically */
193 length = rom[2] * 512;
194
195 /* if checksum okay, trust length byte */
196 if (length && romchecksum(rom, length))
197 video_rom_resource.end = start + length - 1;
198
199 request_resource(&iomem_resource, &video_rom_resource);
200 break;
201 }
202
203 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
204 if (start < upper)
205 start = upper;
206
207 /* system rom */
208 request_resource(&iomem_resource, &system_rom_resource);
209 upper = system_rom_resource.start;
210
211 /* check for extension rom (ignore length byte!) */
212 rom = isa_bus_to_virt(extension_rom_resource.start);
213 if (romsignature(rom)) {
214 length = extension_rom_resource.end - extension_rom_resource.start + 1;
215 if (romchecksum(rom, length)) {
216 request_resource(&iomem_resource, &extension_rom_resource);
217 upper = extension_rom_resource.start;
218 }
219 }
220
221 /* check for adapter roms on 2k boundaries */
222 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
223 rom = isa_bus_to_virt(start);
224 if (!romsignature(rom))
225 continue;
226
227 /* 0 < length <= 0x7f * 512, historically */
228 length = rom[2] * 512;
229
230 /* but accept any length that fits if checksum okay */
231 if (!length || start + length > upper || !romchecksum(rom, length))
232 continue;
233
234 adapter_rom_resources[i].start = start;
235 adapter_rom_resources[i].end = start + length - 1;
236 request_resource(&iomem_resource, &adapter_rom_resources[i]);
237
238 start = adapter_rom_resources[i++].end & ~2047UL;
239 }
240}
241
242/*
243 * Request address space for all standard RAM and ROM resources
244 * and also for regions reported as reserved by the e820.
245 */
246static void __init
247legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
248{
249 int i;
250
251 probe_roms();
252 for (i = 0; i < e820.nr_map; i++) {
253 struct resource *res;
254#ifndef CONFIG_RESOURCES_64BIT
255 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
256 continue;
257#endif
258 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
259 switch (e820.map[i].type) {
260 case E820_RAM: res->name = "System RAM"; break;
261 case E820_ACPI: res->name = "ACPI Tables"; break;
262 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
263 default: res->name = "reserved";
264 }
265 res->start = e820.map[i].addr;
266 res->end = res->start + e820.map[i].size - 1;
267 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
268 if (request_resource(&iomem_resource, res)) {
269 kfree(res);
270 continue;
271 }
272 if (e820.map[i].type == E820_RAM) {
273 /*
274 * We don't know which RAM region contains kernel data,
275 * so we try it repeatedly and let the resource manager
276 * test it.
277 */
278 request_resource(res, code_resource);
279 request_resource(res, data_resource);
280#ifdef CONFIG_KEXEC
281 request_resource(res, &crashk_res);
282#endif
283 }
284 }
285}
286
287/*
288 * Request address space for all standard resources
289 *
290 * This is called just before pcibios_init(), which is also a
291 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
292 */
293static int __init request_standard_resources(void)
294{
295 int i;
296
297 printk("Setting up standard PCI resources\n");
298 if (efi_enabled)
299 efi_initialize_iomem_resources(&code_resource, &data_resource);
300 else
301 legacy_init_iomem_resources(&code_resource, &data_resource);
302
303 /* EFI systems may still have VGA */
304 request_resource(&iomem_resource, &video_ram_resource);
305
306 /* request I/O space for devices used on all i[345]86 PCs */
307 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
308 request_resource(&ioport_resource, &standard_io_resources[i]);
309 return 0;
310}
311
312subsys_initcall(request_standard_resources);
313
314void __init add_memory_region(unsigned long long start,
315 unsigned long long size, int type)
316{
317 int x;
318
319 if (!efi_enabled) {
320 x = e820.nr_map;
321
322 if (x == E820MAX) {
323 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
324 return;
325 }
326
327 e820.map[x].addr = start;
328 e820.map[x].size = size;
329 e820.map[x].type = type;
330 e820.nr_map++;
331 }
332} /* add_memory_region */
333
334/*
335 * Sanitize the BIOS e820 map.
336 *
337 * Some e820 responses include overlapping entries. The following
338 * replaces the original e820 map with a new one, removing overlaps.
339 *
340 */
341int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
342{
343 struct change_member *change_tmp;
344 unsigned long current_type, last_type;
345 unsigned long long last_addr;
346 int chgidx, still_changing;
347 int overlap_entries;
348 int new_bios_entry;
349 int old_nr, new_nr, chg_nr;
350 int i;
351
352 /*
353 Visually we're performing the following (1,2,3,4 = memory types)...
354
355 Sample memory map (w/overlaps):
356 ____22__________________
357 ______________________4_
358 ____1111________________
359 _44_____________________
360 11111111________________
361 ____________________33__
362 ___________44___________
363 __________33333_________
364 ______________22________
365 ___________________2222_
366 _________111111111______
367 _____________________11_
368 _________________4______
369
370 Sanitized equivalent (no overlap):
371 1_______________________
372 _44_____________________
373 ___1____________________
374 ____22__________________
375 ______11________________
376 _________1______________
377 __________3_____________
378 ___________44___________
379 _____________33_________
380 _______________2________
381 ________________1_______
382 _________________4______
383 ___________________2____
384 ____________________33__
385 ______________________4_
386 */
387 printk("sanitize start\n");
388 /* if there's only one memory region, don't bother */
389 if (*pnr_map < 2) {
390 printk("sanitize bail 0\n");
391 return -1;
392 }
393
394 old_nr = *pnr_map;
395
396 /* bail out if we find any unreasonable addresses in bios map */
397 for (i=0; i<old_nr; i++)
398 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
399 printk("sanitize bail 1\n");
400 return -1;
401 }
402
403 /* create pointers for initial change-point information (for sorting) */
404 for (i=0; i < 2*old_nr; i++)
405 change_point[i] = &change_point_list[i];
406
407 /* record all known change-points (starting and ending addresses),
408 omitting those that are for empty memory regions */
409 chgidx = 0;
410 for (i=0; i < old_nr; i++) {
411 if (biosmap[i].size != 0) {
412 change_point[chgidx]->addr = biosmap[i].addr;
413 change_point[chgidx++]->pbios = &biosmap[i];
414 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
415 change_point[chgidx++]->pbios = &biosmap[i];
416 }
417 }
418 chg_nr = chgidx; /* true number of change-points */
419
420 /* sort change-point list by memory addresses (low -> high) */
421 still_changing = 1;
422 while (still_changing) {
423 still_changing = 0;
424 for (i=1; i < chg_nr; i++) {
425 /* if <current_addr> > <last_addr>, swap */
426 /* or, if current=<start_addr> & last=<end_addr>, swap */
427 if ((change_point[i]->addr < change_point[i-1]->addr) ||
428 ((change_point[i]->addr == change_point[i-1]->addr) &&
429 (change_point[i]->addr == change_point[i]->pbios->addr) &&
430 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
431 )
432 {
433 change_tmp = change_point[i];
434 change_point[i] = change_point[i-1];
435 change_point[i-1] = change_tmp;
436 still_changing=1;
437 }
438 }
439 }
440
441 /* create a new bios memory map, removing overlaps */
442 overlap_entries=0; /* number of entries in the overlap table */
443 new_bios_entry=0; /* index for creating new bios map entries */
444 last_type = 0; /* start with undefined memory type */
445 last_addr = 0; /* start with 0 as last starting address */
446 /* loop through change-points, determining affect on the new bios map */
447 for (chgidx=0; chgidx < chg_nr; chgidx++)
448 {
449 /* keep track of all overlapping bios entries */
450 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
451 {
452 /* add map entry to overlap list (> 1 entry implies an overlap) */
453 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
454 }
455 else
456 {
457 /* remove entry from list (order independent, so swap with last) */
458 for (i=0; i<overlap_entries; i++)
459 {
460 if (overlap_list[i] == change_point[chgidx]->pbios)
461 overlap_list[i] = overlap_list[overlap_entries-1];
462 }
463 overlap_entries--;
464 }
465 /* if there are overlapping entries, decide which "type" to use */
466 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
467 current_type = 0;
468 for (i=0; i<overlap_entries; i++)
469 if (overlap_list[i]->type > current_type)
470 current_type = overlap_list[i]->type;
471 /* continue building up new bios map based on this information */
472 if (current_type != last_type) {
473 if (last_type != 0) {
474 new_bios[new_bios_entry].size =
475 change_point[chgidx]->addr - last_addr;
476 /* move forward only if the new size was non-zero */
477 if (new_bios[new_bios_entry].size != 0)
478 if (++new_bios_entry >= E820MAX)
479 break; /* no more space left for new bios entries */
480 }
481 if (current_type != 0) {
482 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
483 new_bios[new_bios_entry].type = current_type;
484 last_addr=change_point[chgidx]->addr;
485 }
486 last_type = current_type;
487 }
488 }
489 new_nr = new_bios_entry; /* retain count for new bios entries */
490
491 /* copy new bios mapping into original location */
492 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
493 *pnr_map = new_nr;
494
495 printk("sanitize end\n");
496 return 0;
497}
498
499/*
500 * Copy the BIOS e820 map into a safe place.
501 *
502 * Sanity-check it while we're at it..
503 *
504 * If we're lucky and live on a modern system, the setup code
505 * will have given us a memory map that we can use to properly
506 * set up memory. If we aren't, we'll fake a memory map.
507 *
508 * We check to see that the memory map contains at least 2 elements
509 * before we'll use it, because the detection code in setup.S may
510 * not be perfect and most every PC known to man has two memory
511 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
512 * thinkpad 560x, for example, does not cooperate with the memory
513 * detection code.)
514 */
515int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
516{
517 /* Only one memory region (or negative)? Ignore it */
518 if (nr_map < 2)
519 return -1;
520
521 do {
522 unsigned long long start = biosmap->addr;
523 unsigned long long size = biosmap->size;
524 unsigned long long end = start + size;
525 unsigned long type = biosmap->type;
526 printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
527
528 /* Overflow in 64 bits? Ignore the memory map. */
529 if (start > end)
530 return -1;
531
532 /*
533 * Some BIOSes claim RAM in the 640k - 1M region.
534 * Not right. Fix it up.
535 */
536 if (type == E820_RAM) {
537 printk("copy_e820_map() type is E820_RAM\n");
538 if (start < 0x100000ULL && end > 0xA0000ULL) {
539 printk("copy_e820_map() lies in range...\n");
540 if (start < 0xA0000ULL) {
541 printk("copy_e820_map() start < 0xA0000ULL\n");
542 add_memory_region(start, 0xA0000ULL-start, type);
543 }
544 if (end <= 0x100000ULL) {
545 printk("copy_e820_map() end <= 0x100000ULL\n");
546 continue;
547 }
548 start = 0x100000ULL;
549 size = end - start;
550 }
551 }
552 add_memory_region(start, size, type);
553 } while (biosmap++,--nr_map);
554 return 0;
555}
556
557/*
558 * Callback for efi_memory_walk.
559 */
560static int __init
561efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
562{
563 unsigned long *max_pfn = arg, pfn;
564
565 if (start < end) {
566 pfn = PFN_UP(end -1);
567 if (pfn > *max_pfn)
568 *max_pfn = pfn;
569 }
570 return 0;
571}
572
573static int __init
574efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
575{
576 memory_present(0, PFN_UP(start), PFN_DOWN(end));
577 return 0;
578}
579
580/*
581 * Find the highest page frame number we have available
582 */
583void __init find_max_pfn(void)
584{
585 int i;
586
587 max_pfn = 0;
588 if (efi_enabled) {
589 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
590 efi_memmap_walk(efi_memory_present_wrapper, NULL);
591 return;
592 }
593
594 for (i = 0; i < e820.nr_map; i++) {
595 unsigned long start, end;
596 /* RAM? */
597 if (e820.map[i].type != E820_RAM)
598 continue;
599 start = PFN_UP(e820.map[i].addr);
600 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
601 if (start >= end)
602 continue;
603 if (end > max_pfn)
604 max_pfn = end;
605 memory_present(0, start, end);
606 }
607}
608
609/*
610 * Free all available memory for boot time allocation. Used
611 * as a callback function by efi_memory_walk()
612 */
613
614static int __init
615free_available_memory(unsigned long start, unsigned long end, void *arg)
616{
617 /* check max_low_pfn */
618 if (start >= (max_low_pfn << PAGE_SHIFT))
619 return 0;
620 if (end >= (max_low_pfn << PAGE_SHIFT))
621 end = max_low_pfn << PAGE_SHIFT;
622 if (start < end)
623 free_bootmem(start, end - start);
624
625 return 0;
626}
627/*
628 * Register fully available low RAM pages with the bootmem allocator.
629 */
630void __init register_bootmem_low_pages(unsigned long max_low_pfn)
631{
632 int i;
633
634 if (efi_enabled) {
635 efi_memmap_walk(free_available_memory, NULL);
636 return;
637 }
638 for (i = 0; i < e820.nr_map; i++) {
639 unsigned long curr_pfn, last_pfn, size;
640 /*
641 * Reserve usable low memory
642 */
643 if (e820.map[i].type != E820_RAM)
644 continue;
645 /*
646 * We are rounding up the start address of usable memory:
647 */
648 curr_pfn = PFN_UP(e820.map[i].addr);
649 if (curr_pfn >= max_low_pfn)
650 continue;
651 /*
652 * ... and at the end of the usable range downwards:
653 */
654 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
655
656 if (last_pfn > max_low_pfn)
657 last_pfn = max_low_pfn;
658
659 /*
660 * .. finally, did all the rounding and playing
661 * around just make the area go away?
662 */
663 if (last_pfn <= curr_pfn)
664 continue;
665
666 size = last_pfn - curr_pfn;
667 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
668 }
669}
670
671void __init register_memory(void)
672{
673 unsigned long gapstart, gapsize, round;
674 unsigned long long last;
675 int i;
676
677 /*
678 * Search for the bigest gap in the low 32 bits of the e820
679 * memory space.
680 */
681 last = 0x100000000ull;
682 gapstart = 0x10000000;
683 gapsize = 0x400000;
684 i = e820.nr_map;
685 while (--i >= 0) {
686 unsigned long long start = e820.map[i].addr;
687 unsigned long long end = start + e820.map[i].size;
688
689 /*
690 * Since "last" is at most 4GB, we know we'll
691 * fit in 32 bits if this condition is true
692 */
693 if (last > end) {
694 unsigned long gap = last - end;
695
696 if (gap > gapsize) {
697 gapsize = gap;
698 gapstart = end;
699 }
700 }
701 if (start < last)
702 last = start;
703 }
704
705 /*
706 * See how much we want to round up: start off with
707 * rounding to the next 1MB area.
708 */
709 round = 0x100000;
710 while ((gapsize >> 4) > round)
711 round += round;
712 /* Fun with two's complement */
713 pci_mem_start = (gapstart + round) & -round;
714
715 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
716 pci_mem_start, gapstart, gapsize);
717}
718
719void __init print_memory_map(char *who)
720{
721 int i;
722
723 for (i = 0; i < e820.nr_map; i++) {
724 printk(" %s: %016Lx - %016Lx ", who,
725 e820.map[i].addr,
726 e820.map[i].addr + e820.map[i].size);
727 switch (e820.map[i].type) {
728 case E820_RAM: printk("(usable)\n");
729 break;
730 case E820_RESERVED:
731 printk("(reserved)\n");
732 break;
733 case E820_ACPI:
734 printk("(ACPI data)\n");
735 break;
736 case E820_NVS:
737 printk("(ACPI NVS)\n");
738 break;
739 default: printk("type %lu\n", e820.map[i].type);
740 break;
741 }
742 }
743}
744
745static __init __always_inline void efi_limit_regions(unsigned long long size)
746{
747 unsigned long long current_addr = 0;
748 efi_memory_desc_t *md, *next_md;
749 void *p, *p1;
750 int i, j;
751
752 j = 0;
753 p1 = memmap.map;
754 for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
755 md = p;
756 next_md = p1;
757 current_addr = md->phys_addr +
758 PFN_PHYS(md->num_pages);
759 if (is_available_memory(md)) {
760 if (md->phys_addr >= size) continue;
761 memcpy(next_md, md, memmap.desc_size);
762 if (current_addr >= size) {
763 next_md->num_pages -=
764 PFN_UP(current_addr-size);
765 }
766 p1 += memmap.desc_size;
767 next_md = p1;
768 j++;
769 } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
770 EFI_MEMORY_RUNTIME) {
771 /* In order to make runtime services
772 * available we have to include runtime
773 * memory regions in memory map */
774 memcpy(next_md, md, memmap.desc_size);
775 p1 += memmap.desc_size;
776 next_md = p1;
777 j++;
778 }
779 }
780 memmap.nr_map = j;
781 memmap.map_end = memmap.map +
782 (memmap.nr_map * memmap.desc_size);
783}
784
785void __init limit_regions(unsigned long long size)
786{
787 unsigned long long current_addr;
788 int i;
789
790 print_memory_map("limit_regions start");
791 if (efi_enabled) {
792 efi_limit_regions(size);
793 return;
794 }
795 for (i = 0; i < e820.nr_map; i++) {
796 current_addr = e820.map[i].addr + e820.map[i].size;
797 if (current_addr < size)
798 continue;
799
800 if (e820.map[i].type != E820_RAM)
801 continue;
802
803 if (e820.map[i].addr >= size) {
804 /*
805 * This region starts past the end of the
806 * requested size, skip it completely.
807 */
808 e820.nr_map = i;
809 } else {
810 e820.nr_map = i + 1;
811 e820.map[i].size -= current_addr - size;
812 }
813 print_memory_map("limit_regions endfor");
814 return;
815 }
816 print_memory_map("limit_regions endfunc");
817}
818
819 /*
820 * This function checks if the entire range <start,end> is mapped with type.
821 *
822 * Note: this function only works correct if the e820 table is sorted and
823 * not-overlapping, which is the case
824 */
825int __init
826e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
827{
828 u64 start = s;
829 u64 end = e;
830 int i;
831 for (i = 0; i < e820.nr_map; i++) {
832 struct e820entry *ei = &e820.map[i];
833 if (type && ei->type != type)
834 continue;
835 /* is the region (part) in overlap with the current region ?*/
836 if (ei->addr >= end || ei->addr + ei->size <= start)
837 continue;
838 /* if the region is at the beginning of <start,end> we move
839 * start to the end of the region since it's ok until there
840 */
841 if (ei->addr <= start)
842 start = ei->addr + ei->size;
843 /* if start is now at or beyond end, we're done, full
844 * coverage */
845 if (start >= end)
846 return 1; /* we're done */
847 }
848 return 0;
849}
850
851static int __init parse_memmap(char *arg)
852{
853 if (!arg)
854 return -EINVAL;
855
856 if (strcmp(arg, "exactmap") == 0) {
857#ifdef CONFIG_CRASH_DUMP
858 /* If we are doing a crash dump, we
859 * still need to know the real mem
860 * size before original memory map is
861 * reset.
862 */
863 find_max_pfn();
864 saved_max_pfn = max_pfn;
865#endif
866 e820.nr_map = 0;
867 user_defined_memmap = 1;
868 } else {
869 /* If the user specifies memory size, we
870 * limit the BIOS-provided memory map to
871 * that size. exactmap can be used to specify
872 * the exact map. mem=number can be used to
873 * trim the existing memory map.
874 */
875 unsigned long long start_at, mem_size;
876
877 mem_size = memparse(arg, &arg);
878 if (*arg == '@') {
879 start_at = memparse(arg+1, &arg);
880 add_memory_region(start_at, mem_size, E820_RAM);
881 } else if (*arg == '#') {
882 start_at = memparse(arg+1, &arg);
883 add_memory_region(start_at, mem_size, E820_ACPI);
884 } else if (*arg == '$') {
885 start_at = memparse(arg+1, &arg);
886 add_memory_region(start_at, mem_size, E820_RESERVED);
887 } else {
888 limit_regions(mem_size);
889 user_defined_memmap = 1;
890 }
891 }
892 return 0;
893}
894early_param("memmap", parse_memmap);
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 8b40648d0ef0..b92c7f0a358a 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -194,17 +194,24 @@ inline int efi_set_rtc_mmss(unsigned long nowtime)
194 return 0; 194 return 0;
195} 195}
196/* 196/*
197 * This should only be used during kernel init and before runtime 197 * This is used during kernel init before runtime
198 * services have been remapped, therefore, we'll need to call in physical 198 * services have been remapped and also during suspend, therefore,
199 * mode. Note, this call isn't used later, so mark it __init. 199 * we'll need to call both in physical and virtual modes.
200 */ 200 */
201inline unsigned long __init efi_get_time(void) 201inline unsigned long efi_get_time(void)
202{ 202{
203 efi_status_t status; 203 efi_status_t status;
204 efi_time_t eft; 204 efi_time_t eft;
205 efi_time_cap_t cap; 205 efi_time_cap_t cap;
206 206
207 status = phys_efi_get_time(&eft, &cap); 207 if (efi.get_time) {
208 /* if we are in virtual mode use remapped function */
209 status = efi.get_time(&eft, &cap);
210 } else {
211 /* we are in physical mode */
212 status = phys_efi_get_time(&eft, &cap);
213 }
214
208 if (status != EFI_SUCCESS) 215 if (status != EFI_SUCCESS)
209 printk("Oops: efitime: can't read time status: 0x%lx\n",status); 216 printk("Oops: efitime: can't read time status: 0x%lx\n",status);
210 217
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5a63d6fdb70e..de34b7fed3c1 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -30,12 +30,13 @@
30 * 18(%esp) - %eax 30 * 18(%esp) - %eax
31 * 1C(%esp) - %ds 31 * 1C(%esp) - %ds
32 * 20(%esp) - %es 32 * 20(%esp) - %es
33 * 24(%esp) - orig_eax 33 * 24(%esp) - %gs
34 * 28(%esp) - %eip 34 * 28(%esp) - orig_eax
35 * 2C(%esp) - %cs 35 * 2C(%esp) - %eip
36 * 30(%esp) - %eflags 36 * 30(%esp) - %cs
37 * 34(%esp) - %oldesp 37 * 34(%esp) - %eflags
38 * 38(%esp) - %oldss 38 * 38(%esp) - %oldesp
39 * 3C(%esp) - %oldss
39 * 40 *
40 * "current" is in register %ebx during any slow entries. 41 * "current" is in register %ebx during any slow entries.
41 */ 42 */
@@ -48,26 +49,24 @@
48#include <asm/smp.h> 49#include <asm/smp.h>
49#include <asm/page.h> 50#include <asm/page.h>
50#include <asm/desc.h> 51#include <asm/desc.h>
52#include <asm/percpu.h>
51#include <asm/dwarf2.h> 53#include <asm/dwarf2.h>
52#include "irq_vectors.h" 54#include "irq_vectors.h"
53 55
54#define nr_syscalls ((syscall_table_size)/4) 56/*
57 * We use macros for low-level operations which need to be overridden
58 * for paravirtualization. The following will never clobber any registers:
59 * INTERRUPT_RETURN (aka. "iret")
60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
61 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
62 *
63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
65 * Allowing a register to be clobbered can shrink the paravirt replacement
66 * enough to patch inline, increasing performance.
67 */
55 68
56EBX = 0x00 69#define nr_syscalls ((syscall_table_size)/4)
57ECX = 0x04
58EDX = 0x08
59ESI = 0x0C
60EDI = 0x10
61EBP = 0x14
62EAX = 0x18
63DS = 0x1C
64ES = 0x20
65ORIG_EAX = 0x24
66EIP = 0x28
67CS = 0x2C
68EFLAGS = 0x30
69OLDESP = 0x34
70OLDSS = 0x38
71 70
72CF_MASK = 0x00000001 71CF_MASK = 0x00000001
73TF_MASK = 0x00000100 72TF_MASK = 0x00000100
@@ -76,23 +75,16 @@ DF_MASK = 0x00000400
76NT_MASK = 0x00004000 75NT_MASK = 0x00004000
77VM_MASK = 0x00020000 76VM_MASK = 0x00020000
78 77
79/* These are replaces for paravirtualization */
80#define DISABLE_INTERRUPTS cli
81#define ENABLE_INTERRUPTS sti
82#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
83#define INTERRUPT_RETURN iret
84#define GET_CR0_INTO_EAX movl %cr0, %eax
85
86#ifdef CONFIG_PREEMPT 78#ifdef CONFIG_PREEMPT
87#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF 79#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
88#else 80#else
89#define preempt_stop 81#define preempt_stop(clobbers)
90#define resume_kernel restore_nocheck 82#define resume_kernel restore_nocheck
91#endif 83#endif
92 84
93.macro TRACE_IRQS_IRET 85.macro TRACE_IRQS_IRET
94#ifdef CONFIG_TRACE_IRQFLAGS 86#ifdef CONFIG_TRACE_IRQFLAGS
95 testl $IF_MASK,EFLAGS(%esp) # interrupts off? 87 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
96 jz 1f 88 jz 1f
97 TRACE_IRQS_ON 89 TRACE_IRQS_ON
981: 901:
@@ -107,6 +99,9 @@ VM_MASK = 0x00020000
107 99
108#define SAVE_ALL \ 100#define SAVE_ALL \
109 cld; \ 101 cld; \
102 pushl %gs; \
103 CFI_ADJUST_CFA_OFFSET 4;\
104 /*CFI_REL_OFFSET gs, 0;*/\
110 pushl %es; \ 105 pushl %es; \
111 CFI_ADJUST_CFA_OFFSET 4;\ 106 CFI_ADJUST_CFA_OFFSET 4;\
112 /*CFI_REL_OFFSET es, 0;*/\ 107 /*CFI_REL_OFFSET es, 0;*/\
@@ -136,7 +131,9 @@ VM_MASK = 0x00020000
136 CFI_REL_OFFSET ebx, 0;\ 131 CFI_REL_OFFSET ebx, 0;\
137 movl $(__USER_DS), %edx; \ 132 movl $(__USER_DS), %edx; \
138 movl %edx, %ds; \ 133 movl %edx, %ds; \
139 movl %edx, %es; 134 movl %edx, %es; \
135 movl $(__KERNEL_PDA), %edx; \
136 movl %edx, %gs
140 137
141#define RESTORE_INT_REGS \ 138#define RESTORE_INT_REGS \
142 popl %ebx; \ 139 popl %ebx; \
@@ -169,17 +166,22 @@ VM_MASK = 0x00020000
1692: popl %es; \ 1662: popl %es; \
170 CFI_ADJUST_CFA_OFFSET -4;\ 167 CFI_ADJUST_CFA_OFFSET -4;\
171 /*CFI_RESTORE es;*/\ 168 /*CFI_RESTORE es;*/\
172.section .fixup,"ax"; \ 1693: popl %gs; \
1733: movl $0,(%esp); \ 170 CFI_ADJUST_CFA_OFFSET -4;\
174 jmp 1b; \ 171 /*CFI_RESTORE gs;*/\
172.pushsection .fixup,"ax"; \
1754: movl $0,(%esp); \ 1734: movl $0,(%esp); \
174 jmp 1b; \
1755: movl $0,(%esp); \
176 jmp 2b; \ 176 jmp 2b; \
177.previous; \ 1776: movl $0,(%esp); \
178 jmp 3b; \
178.section __ex_table,"a";\ 179.section __ex_table,"a";\
179 .align 4; \ 180 .align 4; \
180 .long 1b,3b; \ 181 .long 1b,4b; \
181 .long 2b,4b; \ 182 .long 2b,5b; \
182.previous 183 .long 3b,6b; \
184.popsection
183 185
184#define RING0_INT_FRAME \ 186#define RING0_INT_FRAME \
185 CFI_STARTPROC simple;\ 187 CFI_STARTPROC simple;\
@@ -198,18 +200,18 @@ VM_MASK = 0x00020000
198#define RING0_PTREGS_FRAME \ 200#define RING0_PTREGS_FRAME \
199 CFI_STARTPROC simple;\ 201 CFI_STARTPROC simple;\
200 CFI_SIGNAL_FRAME;\ 202 CFI_SIGNAL_FRAME;\
201 CFI_DEF_CFA esp, OLDESP-EBX;\ 203 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
202 /*CFI_OFFSET cs, CS-OLDESP;*/\ 204 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
203 CFI_OFFSET eip, EIP-OLDESP;\ 205 CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
204 /*CFI_OFFSET es, ES-OLDESP;*/\ 206 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
205 /*CFI_OFFSET ds, DS-OLDESP;*/\ 207 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
206 CFI_OFFSET eax, EAX-OLDESP;\ 208 CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
207 CFI_OFFSET ebp, EBP-OLDESP;\ 209 CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
208 CFI_OFFSET edi, EDI-OLDESP;\ 210 CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
209 CFI_OFFSET esi, ESI-OLDESP;\ 211 CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
210 CFI_OFFSET edx, EDX-OLDESP;\ 212 CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
211 CFI_OFFSET ecx, ECX-OLDESP;\ 213 CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
212 CFI_OFFSET ebx, EBX-OLDESP 214 CFI_OFFSET ebx, PT_EBX-PT_OLDESP
213 215
214ENTRY(ret_from_fork) 216ENTRY(ret_from_fork)
215 CFI_STARTPROC 217 CFI_STARTPROC
@@ -237,17 +239,18 @@ ENTRY(ret_from_fork)
237 ALIGN 239 ALIGN
238 RING0_PTREGS_FRAME 240 RING0_PTREGS_FRAME
239ret_from_exception: 241ret_from_exception:
240 preempt_stop 242 preempt_stop(CLBR_ANY)
241ret_from_intr: 243ret_from_intr:
242 GET_THREAD_INFO(%ebp) 244 GET_THREAD_INFO(%ebp)
243check_userspace: 245check_userspace:
244 movl EFLAGS(%esp), %eax # mix EFLAGS and CS 246 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
245 movb CS(%esp), %al 247 movb PT_CS(%esp), %al
246 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax 248 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
247 cmpl $USER_RPL, %eax 249 cmpl $USER_RPL, %eax
248 jb resume_kernel # not returning to v8086 or userspace 250 jb resume_kernel # not returning to v8086 or userspace
251
249ENTRY(resume_userspace) 252ENTRY(resume_userspace)
250 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 253 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
251 # setting need_resched or sigpending 254 # setting need_resched or sigpending
252 # between sampling and the iret 255 # between sampling and the iret
253 movl TI_flags(%ebp), %ecx 256 movl TI_flags(%ebp), %ecx
@@ -258,14 +261,14 @@ ENTRY(resume_userspace)
258 261
259#ifdef CONFIG_PREEMPT 262#ifdef CONFIG_PREEMPT
260ENTRY(resume_kernel) 263ENTRY(resume_kernel)
261 DISABLE_INTERRUPTS 264 DISABLE_INTERRUPTS(CLBR_ANY)
262 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 265 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
263 jnz restore_nocheck 266 jnz restore_nocheck
264need_resched: 267need_resched:
265 movl TI_flags(%ebp), %ecx # need_resched set ? 268 movl TI_flags(%ebp), %ecx # need_resched set ?
266 testb $_TIF_NEED_RESCHED, %cl 269 testb $_TIF_NEED_RESCHED, %cl
267 jz restore_all 270 jz restore_all
268 testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? 271 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
269 jz restore_all 272 jz restore_all
270 call preempt_schedule_irq 273 call preempt_schedule_irq
271 jmp need_resched 274 jmp need_resched
@@ -287,7 +290,7 @@ sysenter_past_esp:
287 * No need to follow this irqs on/off section: the syscall 290 * No need to follow this irqs on/off section: the syscall
288 * disabled irqs and here we enable it straight after entry: 291 * disabled irqs and here we enable it straight after entry:
289 */ 292 */
290 ENABLE_INTERRUPTS 293 ENABLE_INTERRUPTS(CLBR_NONE)
291 pushl $(__USER_DS) 294 pushl $(__USER_DS)
292 CFI_ADJUST_CFA_OFFSET 4 295 CFI_ADJUST_CFA_OFFSET 4
293 /*CFI_REL_OFFSET ss, 0*/ 296 /*CFI_REL_OFFSET ss, 0*/
@@ -331,20 +334,27 @@ sysenter_past_esp:
331 cmpl $(nr_syscalls), %eax 334 cmpl $(nr_syscalls), %eax
332 jae syscall_badsys 335 jae syscall_badsys
333 call *sys_call_table(,%eax,4) 336 call *sys_call_table(,%eax,4)
334 movl %eax,EAX(%esp) 337 movl %eax,PT_EAX(%esp)
335 DISABLE_INTERRUPTS 338 DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
336 TRACE_IRQS_OFF 339 TRACE_IRQS_OFF
337 movl TI_flags(%ebp), %ecx 340 movl TI_flags(%ebp), %ecx
338 testw $_TIF_ALLWORK_MASK, %cx 341 testw $_TIF_ALLWORK_MASK, %cx
339 jne syscall_exit_work 342 jne syscall_exit_work
340/* if something modifies registers it must also disable sysexit */ 343/* if something modifies registers it must also disable sysexit */
341 movl EIP(%esp), %edx 344 movl PT_EIP(%esp), %edx
342 movl OLDESP(%esp), %ecx 345 movl PT_OLDESP(%esp), %ecx
343 xorl %ebp,%ebp 346 xorl %ebp,%ebp
344 TRACE_IRQS_ON 347 TRACE_IRQS_ON
3481: mov PT_GS(%esp), %gs
345 ENABLE_INTERRUPTS_SYSEXIT 349 ENABLE_INTERRUPTS_SYSEXIT
346 CFI_ENDPROC 350 CFI_ENDPROC
347 351.pushsection .fixup,"ax"
3522: movl $0,PT_GS(%esp)
353 jmp 1b
354.section __ex_table,"a"
355 .align 4
356 .long 1b,2b
357.popsection
348 358
349 # system call handler stub 359 # system call handler stub
350ENTRY(system_call) 360ENTRY(system_call)
@@ -353,7 +363,7 @@ ENTRY(system_call)
353 CFI_ADJUST_CFA_OFFSET 4 363 CFI_ADJUST_CFA_OFFSET 4
354 SAVE_ALL 364 SAVE_ALL
355 GET_THREAD_INFO(%ebp) 365 GET_THREAD_INFO(%ebp)
356 testl $TF_MASK,EFLAGS(%esp) 366 testl $TF_MASK,PT_EFLAGS(%esp)
357 jz no_singlestep 367 jz no_singlestep
358 orl $_TIF_SINGLESTEP,TI_flags(%ebp) 368 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
359no_singlestep: 369no_singlestep:
@@ -365,9 +375,9 @@ no_singlestep:
365 jae syscall_badsys 375 jae syscall_badsys
366syscall_call: 376syscall_call:
367 call *sys_call_table(,%eax,4) 377 call *sys_call_table(,%eax,4)
368 movl %eax,EAX(%esp) # store the return value 378 movl %eax,PT_EAX(%esp) # store the return value
369syscall_exit: 379syscall_exit:
370 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 380 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
371 # setting need_resched or sigpending 381 # setting need_resched or sigpending
372 # between sampling and the iret 382 # between sampling and the iret
373 TRACE_IRQS_OFF 383 TRACE_IRQS_OFF
@@ -376,12 +386,12 @@ syscall_exit:
376 jne syscall_exit_work 386 jne syscall_exit_work
377 387
378restore_all: 388restore_all:
379 movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS 389 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
380 # Warning: OLDSS(%esp) contains the wrong/random values if we 390 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
381 # are returning to the kernel. 391 # are returning to the kernel.
382 # See comments in process.c:copy_thread() for details. 392 # See comments in process.c:copy_thread() for details.
383 movb OLDSS(%esp), %ah 393 movb PT_OLDSS(%esp), %ah
384 movb CS(%esp), %al 394 movb PT_CS(%esp), %al
385 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax 395 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
386 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax 396 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
387 CFI_REMEMBER_STATE 397 CFI_REMEMBER_STATE
@@ -390,13 +400,13 @@ restore_nocheck:
390 TRACE_IRQS_IRET 400 TRACE_IRQS_IRET
391restore_nocheck_notrace: 401restore_nocheck_notrace:
392 RESTORE_REGS 402 RESTORE_REGS
393 addl $4, %esp 403 addl $4, %esp # skip orig_eax/error_code
394 CFI_ADJUST_CFA_OFFSET -4 404 CFI_ADJUST_CFA_OFFSET -4
3951: INTERRUPT_RETURN 4051: INTERRUPT_RETURN
396.section .fixup,"ax" 406.section .fixup,"ax"
397iret_exc: 407iret_exc:
398 TRACE_IRQS_ON 408 TRACE_IRQS_ON
399 ENABLE_INTERRUPTS 409 ENABLE_INTERRUPTS(CLBR_NONE)
400 pushl $0 # no error code 410 pushl $0 # no error code
401 pushl $do_iret_error 411 pushl $do_iret_error
402 jmp error_code 412 jmp error_code
@@ -408,33 +418,42 @@ iret_exc:
408 418
409 CFI_RESTORE_STATE 419 CFI_RESTORE_STATE
410ldt_ss: 420ldt_ss:
411 larl OLDSS(%esp), %eax 421 larl PT_OLDSS(%esp), %eax
412 jnz restore_nocheck 422 jnz restore_nocheck
413 testl $0x00400000, %eax # returning to 32bit stack? 423 testl $0x00400000, %eax # returning to 32bit stack?
414 jnz restore_nocheck # allright, normal return 424 jnz restore_nocheck # allright, normal return
425
426#ifdef CONFIG_PARAVIRT
427 /*
428 * The kernel can't run on a non-flat stack if paravirt mode
429 * is active. Rather than try to fixup the high bits of
430 * ESP, bypass this code entirely. This may break DOSemu
431 * and/or Wine support in a paravirt VM, although the option
432 * is still available to implement the setting of the high
433 * 16-bits in the INTERRUPT_RETURN paravirt-op.
434 */
435 cmpl $0, paravirt_ops+PARAVIRT_enabled
436 jne restore_nocheck
437#endif
438
415 /* If returning to userspace with 16bit stack, 439 /* If returning to userspace with 16bit stack,
416 * try to fix the higher word of ESP, as the CPU 440 * try to fix the higher word of ESP, as the CPU
417 * won't restore it. 441 * won't restore it.
418 * This is an "official" bug of all the x86-compatible 442 * This is an "official" bug of all the x86-compatible
419 * CPUs, which we can try to work around to make 443 * CPUs, which we can try to work around to make
420 * dosemu and wine happy. */ 444 * dosemu and wine happy. */
421 subl $8, %esp # reserve space for switch16 pointer 445 movl PT_OLDESP(%esp), %eax
422 CFI_ADJUST_CFA_OFFSET 8 446 movl %esp, %edx
423 DISABLE_INTERRUPTS 447 call patch_espfix_desc
448 pushl $__ESPFIX_SS
449 CFI_ADJUST_CFA_OFFSET 4
450 pushl %eax
451 CFI_ADJUST_CFA_OFFSET 4
452 DISABLE_INTERRUPTS(CLBR_EAX)
424 TRACE_IRQS_OFF 453 TRACE_IRQS_OFF
425 movl %esp, %eax 454 lss (%esp), %esp
426 /* Set up the 16bit stack frame with switch32 pointer on top, 455 CFI_ADJUST_CFA_OFFSET -8
427 * and a switch16 pointer on top of the current frame. */ 456 jmp restore_nocheck
428 call setup_x86_bogus_stack
429 CFI_ADJUST_CFA_OFFSET -8 # frame has moved
430 TRACE_IRQS_IRET
431 RESTORE_REGS
432 lss 20+4(%esp), %esp # switch to 16bit stack
4331: INTERRUPT_RETURN
434.section __ex_table,"a"
435 .align 4
436 .long 1b,iret_exc
437.previous
438 CFI_ENDPROC 457 CFI_ENDPROC
439 458
440 # perform work that needs to be done immediately before resumption 459 # perform work that needs to be done immediately before resumption
@@ -445,7 +464,7 @@ work_pending:
445 jz work_notifysig 464 jz work_notifysig
446work_resched: 465work_resched:
447 call schedule 466 call schedule
448 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 467 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
449 # setting need_resched or sigpending 468 # setting need_resched or sigpending
450 # between sampling and the iret 469 # between sampling and the iret
451 TRACE_IRQS_OFF 470 TRACE_IRQS_OFF
@@ -458,7 +477,8 @@ work_resched:
458 477
459work_notifysig: # deal with pending signals and 478work_notifysig: # deal with pending signals and
460 # notify-resume requests 479 # notify-resume requests
461 testl $VM_MASK, EFLAGS(%esp) 480#ifdef CONFIG_VM86
481 testl $VM_MASK, PT_EFLAGS(%esp)
462 movl %esp, %eax 482 movl %esp, %eax
463 jne work_notifysig_v86 # returning to kernel-space or 483 jne work_notifysig_v86 # returning to kernel-space or
464 # vm86-space 484 # vm86-space
@@ -468,29 +488,30 @@ work_notifysig: # deal with pending signals and
468 488
469 ALIGN 489 ALIGN
470work_notifysig_v86: 490work_notifysig_v86:
471#ifdef CONFIG_VM86
472 pushl %ecx # save ti_flags for do_notify_resume 491 pushl %ecx # save ti_flags for do_notify_resume
473 CFI_ADJUST_CFA_OFFSET 4 492 CFI_ADJUST_CFA_OFFSET 4
474 call save_v86_state # %eax contains pt_regs pointer 493 call save_v86_state # %eax contains pt_regs pointer
475 popl %ecx 494 popl %ecx
476 CFI_ADJUST_CFA_OFFSET -4 495 CFI_ADJUST_CFA_OFFSET -4
477 movl %eax, %esp 496 movl %eax, %esp
497#else
498 movl %esp, %eax
499#endif
478 xorl %edx, %edx 500 xorl %edx, %edx
479 call do_notify_resume 501 call do_notify_resume
480 jmp resume_userspace_sig 502 jmp resume_userspace_sig
481#endif
482 503
483 # perform syscall exit tracing 504 # perform syscall exit tracing
484 ALIGN 505 ALIGN
485syscall_trace_entry: 506syscall_trace_entry:
486 movl $-ENOSYS,EAX(%esp) 507 movl $-ENOSYS,PT_EAX(%esp)
487 movl %esp, %eax 508 movl %esp, %eax
488 xorl %edx,%edx 509 xorl %edx,%edx
489 call do_syscall_trace 510 call do_syscall_trace
490 cmpl $0, %eax 511 cmpl $0, %eax
491 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, 512 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
492 # so must skip actual syscall 513 # so must skip actual syscall
493 movl ORIG_EAX(%esp), %eax 514 movl PT_ORIG_EAX(%esp), %eax
494 cmpl $(nr_syscalls), %eax 515 cmpl $(nr_syscalls), %eax
495 jnae syscall_call 516 jnae syscall_call
496 jmp syscall_exit 517 jmp syscall_exit
@@ -501,7 +522,7 @@ syscall_exit_work:
501 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl 522 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
502 jz work_pending 523 jz work_pending
503 TRACE_IRQS_ON 524 TRACE_IRQS_ON
504 ENABLE_INTERRUPTS # could let do_syscall_trace() call 525 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
505 # schedule() instead 526 # schedule() instead
506 movl %esp, %eax 527 movl %esp, %eax
507 movl $1, %edx 528 movl $1, %edx
@@ -515,39 +536,38 @@ syscall_fault:
515 CFI_ADJUST_CFA_OFFSET 4 536 CFI_ADJUST_CFA_OFFSET 4
516 SAVE_ALL 537 SAVE_ALL
517 GET_THREAD_INFO(%ebp) 538 GET_THREAD_INFO(%ebp)
518 movl $-EFAULT,EAX(%esp) 539 movl $-EFAULT,PT_EAX(%esp)
519 jmp resume_userspace 540 jmp resume_userspace
520 541
521syscall_badsys: 542syscall_badsys:
522 movl $-ENOSYS,EAX(%esp) 543 movl $-ENOSYS,PT_EAX(%esp)
523 jmp resume_userspace 544 jmp resume_userspace
524 CFI_ENDPROC 545 CFI_ENDPROC
525 546
526#define FIXUP_ESPFIX_STACK \ 547#define FIXUP_ESPFIX_STACK \
527 movl %esp, %eax; \ 548 /* since we are on a wrong stack, we cant make it a C code :( */ \
528 /* switch to 32bit stack using the pointer on top of 16bit stack */ \ 549 movl %gs:PDA_cpu, %ebx; \
529 lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ 550 PER_CPU(cpu_gdt_descr, %ebx); \
530 /* copy data from 16bit stack to 32bit stack */ \ 551 movl GDS_address(%ebx), %ebx; \
531 call fixup_x86_bogus_stack; \ 552 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
532 /* put ESP to the proper location */ \ 553 addl %esp, %eax; \
533 movl %eax, %esp; 554 pushl $__KERNEL_DS; \
534#define UNWIND_ESPFIX_STACK \ 555 CFI_ADJUST_CFA_OFFSET 4; \
535 pushl %eax; \ 556 pushl %eax; \
536 CFI_ADJUST_CFA_OFFSET 4; \ 557 CFI_ADJUST_CFA_OFFSET 4; \
558 lss (%esp), %esp; \
559 CFI_ADJUST_CFA_OFFSET -8;
560#define UNWIND_ESPFIX_STACK \
537 movl %ss, %eax; \ 561 movl %ss, %eax; \
538 /* see if on 16bit stack */ \ 562 /* see if on espfix stack */ \
539 cmpw $__ESPFIX_SS, %ax; \ 563 cmpw $__ESPFIX_SS, %ax; \
540 je 28f; \ 564 jne 27f; \
54127: popl %eax; \ 565 movl $__KERNEL_DS, %eax; \
542 CFI_ADJUST_CFA_OFFSET -4; \
543.section .fixup,"ax"; \
54428: movl $__KERNEL_DS, %eax; \
545 movl %eax, %ds; \ 566 movl %eax, %ds; \
546 movl %eax, %es; \ 567 movl %eax, %es; \
547 /* switch to 32bit stack */ \ 568 /* switch to normal stack */ \
548 FIXUP_ESPFIX_STACK; \ 569 FIXUP_ESPFIX_STACK; \
549 jmp 27b; \ 57027:;
550.previous
551 571
552/* 572/*
553 * Build the entry stubs and pointer table with 573 * Build the entry stubs and pointer table with
@@ -608,13 +628,16 @@ KPROBE_ENTRY(page_fault)
608 CFI_ADJUST_CFA_OFFSET 4 628 CFI_ADJUST_CFA_OFFSET 4
609 ALIGN 629 ALIGN
610error_code: 630error_code:
631 /* the function address is in %gs's slot on the stack */
632 pushl %es
633 CFI_ADJUST_CFA_OFFSET 4
634 /*CFI_REL_OFFSET es, 0*/
611 pushl %ds 635 pushl %ds
612 CFI_ADJUST_CFA_OFFSET 4 636 CFI_ADJUST_CFA_OFFSET 4
613 /*CFI_REL_OFFSET ds, 0*/ 637 /*CFI_REL_OFFSET ds, 0*/
614 pushl %eax 638 pushl %eax
615 CFI_ADJUST_CFA_OFFSET 4 639 CFI_ADJUST_CFA_OFFSET 4
616 CFI_REL_OFFSET eax, 0 640 CFI_REL_OFFSET eax, 0
617 xorl %eax, %eax
618 pushl %ebp 641 pushl %ebp
619 CFI_ADJUST_CFA_OFFSET 4 642 CFI_ADJUST_CFA_OFFSET 4
620 CFI_REL_OFFSET ebp, 0 643 CFI_REL_OFFSET ebp, 0
@@ -627,7 +650,6 @@ error_code:
627 pushl %edx 650 pushl %edx
628 CFI_ADJUST_CFA_OFFSET 4 651 CFI_ADJUST_CFA_OFFSET 4
629 CFI_REL_OFFSET edx, 0 652 CFI_REL_OFFSET edx, 0
630 decl %eax # eax = -1
631 pushl %ecx 653 pushl %ecx
632 CFI_ADJUST_CFA_OFFSET 4 654 CFI_ADJUST_CFA_OFFSET 4
633 CFI_REL_OFFSET ecx, 0 655 CFI_REL_OFFSET ecx, 0
@@ -635,18 +657,20 @@ error_code:
635 CFI_ADJUST_CFA_OFFSET 4 657 CFI_ADJUST_CFA_OFFSET 4
636 CFI_REL_OFFSET ebx, 0 658 CFI_REL_OFFSET ebx, 0
637 cld 659 cld
638 pushl %es 660 pushl %gs
639 CFI_ADJUST_CFA_OFFSET 4 661 CFI_ADJUST_CFA_OFFSET 4
640 /*CFI_REL_OFFSET es, 0*/ 662 /*CFI_REL_OFFSET gs, 0*/
663 movl $(__KERNEL_PDA), %ecx
664 movl %ecx, %gs
641 UNWIND_ESPFIX_STACK 665 UNWIND_ESPFIX_STACK
642 popl %ecx 666 popl %ecx
643 CFI_ADJUST_CFA_OFFSET -4 667 CFI_ADJUST_CFA_OFFSET -4
644 /*CFI_REGISTER es, ecx*/ 668 /*CFI_REGISTER es, ecx*/
645 movl ES(%esp), %edi # get the function address 669 movl PT_GS(%esp), %edi # get the function address
646 movl ORIG_EAX(%esp), %edx # get the error code 670 movl PT_ORIG_EAX(%esp), %edx # get the error code
647 movl %eax, ORIG_EAX(%esp) 671 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
648 movl %ecx, ES(%esp) 672 mov %ecx, PT_GS(%esp)
649 /*CFI_REL_OFFSET es, ES*/ 673 /*CFI_REL_OFFSET gs, ES*/
650 movl $(__USER_DS), %ecx 674 movl $(__USER_DS), %ecx
651 movl %ecx, %ds 675 movl %ecx, %ds
652 movl %ecx, %es 676 movl %ecx, %es
@@ -682,7 +706,7 @@ ENTRY(device_not_available)
682 GET_CR0_INTO_EAX 706 GET_CR0_INTO_EAX
683 testl $0x4, %eax # EM (math emulation bit) 707 testl $0x4, %eax # EM (math emulation bit)
684 jne device_not_available_emulate 708 jne device_not_available_emulate
685 preempt_stop 709 preempt_stop(CLBR_ANY)
686 call math_state_restore 710 call math_state_restore
687 jmp ret_from_exception 711 jmp ret_from_exception
688device_not_available_emulate: 712device_not_available_emulate:
@@ -754,7 +778,7 @@ KPROBE_ENTRY(nmi)
754 cmpw $__ESPFIX_SS, %ax 778 cmpw $__ESPFIX_SS, %ax
755 popl %eax 779 popl %eax
756 CFI_ADJUST_CFA_OFFSET -4 780 CFI_ADJUST_CFA_OFFSET -4
757 je nmi_16bit_stack 781 je nmi_espfix_stack
758 cmpl $sysenter_entry,(%esp) 782 cmpl $sysenter_entry,(%esp)
759 je nmi_stack_fixup 783 je nmi_stack_fixup
760 pushl %eax 784 pushl %eax
@@ -797,7 +821,7 @@ nmi_debug_stack_check:
797 FIX_STACK(24,nmi_stack_correct, 1) 821 FIX_STACK(24,nmi_stack_correct, 1)
798 jmp nmi_stack_correct 822 jmp nmi_stack_correct
799 823
800nmi_16bit_stack: 824nmi_espfix_stack:
801 /* We have a RING0_INT_FRAME here. 825 /* We have a RING0_INT_FRAME here.
802 * 826 *
803 * create the pointer to lss back 827 * create the pointer to lss back
@@ -806,7 +830,6 @@ nmi_16bit_stack:
806 CFI_ADJUST_CFA_OFFSET 4 830 CFI_ADJUST_CFA_OFFSET 4
807 pushl %esp 831 pushl %esp
808 CFI_ADJUST_CFA_OFFSET 4 832 CFI_ADJUST_CFA_OFFSET 4
809 movzwl %sp, %esp
810 addw $4, (%esp) 833 addw $4, (%esp)
811 /* copy the iret frame of 12 bytes */ 834 /* copy the iret frame of 12 bytes */
812 .rept 3 835 .rept 3
@@ -817,11 +840,11 @@ nmi_16bit_stack:
817 CFI_ADJUST_CFA_OFFSET 4 840 CFI_ADJUST_CFA_OFFSET 4
818 SAVE_ALL 841 SAVE_ALL
819 FIXUP_ESPFIX_STACK # %eax == %esp 842 FIXUP_ESPFIX_STACK # %eax == %esp
820 CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
821 xorl %edx,%edx # zero error code 843 xorl %edx,%edx # zero error code
822 call do_nmi 844 call do_nmi
823 RESTORE_REGS 845 RESTORE_REGS
824 lss 12+4(%esp), %esp # back to 16bit stack 846 lss 12+4(%esp), %esp # back to espfix stack
847 CFI_ADJUST_CFA_OFFSET -24
8251: INTERRUPT_RETURN 8481: INTERRUPT_RETURN
826 CFI_ENDPROC 849 CFI_ENDPROC
827.section __ex_table,"a" 850.section __ex_table,"a"
@@ -830,6 +853,19 @@ nmi_16bit_stack:
830.previous 853.previous
831KPROBE_END(nmi) 854KPROBE_END(nmi)
832 855
856#ifdef CONFIG_PARAVIRT
857ENTRY(native_iret)
8581: iret
859.section __ex_table,"a"
860 .align 4
861 .long 1b,iret_exc
862.previous
863
864ENTRY(native_irq_enable_sysexit)
865 sti
866 sysexit
867#endif
868
833KPROBE_ENTRY(int3) 869KPROBE_ENTRY(int3)
834 RING0_INT_FRAME 870 RING0_INT_FRAME
835 pushl $-1 # mark this as an int 871 pushl $-1 # mark this as an int
@@ -949,26 +985,27 @@ ENTRY(arch_unwind_init_running)
949 movl 4(%esp), %edx 985 movl 4(%esp), %edx
950 movl (%esp), %ecx 986 movl (%esp), %ecx
951 leal 4(%esp), %eax 987 leal 4(%esp), %eax
952 movl %ebx, EBX(%edx) 988 movl %ebx, PT_EBX(%edx)
953 xorl %ebx, %ebx 989 xorl %ebx, %ebx
954 movl %ebx, ECX(%edx) 990 movl %ebx, PT_ECX(%edx)
955 movl %ebx, EDX(%edx) 991 movl %ebx, PT_EDX(%edx)
956 movl %esi, ESI(%edx) 992 movl %esi, PT_ESI(%edx)
957 movl %edi, EDI(%edx) 993 movl %edi, PT_EDI(%edx)
958 movl %ebp, EBP(%edx) 994 movl %ebp, PT_EBP(%edx)
959 movl %ebx, EAX(%edx) 995 movl %ebx, PT_EAX(%edx)
960 movl $__USER_DS, DS(%edx) 996 movl $__USER_DS, PT_DS(%edx)
961 movl $__USER_DS, ES(%edx) 997 movl $__USER_DS, PT_ES(%edx)
962 movl %ebx, ORIG_EAX(%edx) 998 movl $0, PT_GS(%edx)
963 movl %ecx, EIP(%edx) 999 movl %ebx, PT_ORIG_EAX(%edx)
1000 movl %ecx, PT_EIP(%edx)
964 movl 12(%esp), %ecx 1001 movl 12(%esp), %ecx
965 movl $__KERNEL_CS, CS(%edx) 1002 movl $__KERNEL_CS, PT_CS(%edx)
966 movl %ebx, EFLAGS(%edx) 1003 movl %ebx, PT_EFLAGS(%edx)
967 movl %eax, OLDESP(%edx) 1004 movl %eax, PT_OLDESP(%edx)
968 movl 8(%esp), %eax 1005 movl 8(%esp), %eax
969 movl %ecx, 8(%esp) 1006 movl %ecx, 8(%esp)
970 movl EBX(%edx), %ebx 1007 movl PT_EBX(%edx), %ebx
971 movl $__KERNEL_DS, OLDSS(%edx) 1008 movl $__KERNEL_DS, PT_OLDSS(%edx)
972 jmpl *%eax 1009 jmpl *%eax
973 CFI_ENDPROC 1010 CFI_ENDPROC
974ENDPROC(arch_unwind_init_running) 1011ENDPROC(arch_unwind_init_running)
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index be9d883c62ce..edef5084ce17 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -55,6 +55,12 @@
55 */ 55 */
56ENTRY(startup_32) 56ENTRY(startup_32)
57 57
58#ifdef CONFIG_PARAVIRT
59 movl %cs, %eax
60 testl $0x3, %eax
61 jnz startup_paravirt
62#endif
63
58/* 64/*
59 * Set segments to known values. 65 * Set segments to known values.
60 */ 66 */
@@ -302,6 +308,7 @@ is386: movl $2,%ecx # set MP
302 movl %eax,%cr0 308 movl %eax,%cr0
303 309
304 call check_x87 310 call check_x87
311 call setup_pda
305 lgdt cpu_gdt_descr 312 lgdt cpu_gdt_descr
306 lidt idt_descr 313 lidt idt_descr
307 ljmp $(__KERNEL_CS),$1f 314 ljmp $(__KERNEL_CS),$1f
@@ -312,12 +319,15 @@ is386: movl $2,%ecx # set MP
312 movl %eax,%ds 319 movl %eax,%ds
313 movl %eax,%es 320 movl %eax,%es
314 321
315 xorl %eax,%eax # Clear FS/GS and LDT 322 xorl %eax,%eax # Clear FS and LDT
316 movl %eax,%fs 323 movl %eax,%fs
317 movl %eax,%gs
318 lldt %ax 324 lldt %ax
325
326 movl $(__KERNEL_PDA),%eax
327 mov %eax,%gs
328
319 cld # gcc2 wants the direction flag cleared at all times 329 cld # gcc2 wants the direction flag cleared at all times
320 pushl %eax # fake return address 330 pushl $0 # fake return address for unwinder
321#ifdef CONFIG_SMP 331#ifdef CONFIG_SMP
322 movb ready, %cl 332 movb ready, %cl
323 movb $1, ready 333 movb $1, ready
@@ -346,6 +356,23 @@ check_x87:
346 ret 356 ret
347 357
348/* 358/*
359 * Point the GDT at this CPU's PDA. On boot this will be
360 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
361 * that CPU's GDT and PDA.
362 */
363setup_pda:
364 /* get the PDA pointer */
365 movl start_pda, %eax
366
367 /* slot the PDA address into the GDT */
368 mov cpu_gdt_descr+2, %ecx
369 mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
370 shr $16, %eax
371 mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
372 mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
373 ret
374
375/*
349 * setup_idt 376 * setup_idt
350 * 377 *
351 * sets up a idt with 256 entries pointing to 378 * sets up a idt with 256 entries pointing to
@@ -465,6 +492,33 @@ ignore_int:
465#endif 492#endif
466 iret 493 iret
467 494
495#ifdef CONFIG_PARAVIRT
496startup_paravirt:
497 cld
498 movl $(init_thread_union+THREAD_SIZE),%esp
499
500 /* We take pains to preserve all the regs. */
501 pushl %edx
502 pushl %ecx
503 pushl %eax
504
505 /* paravirt.o is last in link, and that probe fn never returns */
506 pushl $__start_paravirtprobe
5071:
508 movl 0(%esp), %eax
509 pushl (%eax)
510 movl 8(%esp), %eax
511 call *(%esp)
512 popl %eax
513
514 movl 4(%esp), %eax
515 movl 8(%esp), %ecx
516 movl 12(%esp), %edx
517
518 addl $4, (%esp)
519 jmp 1b
520#endif
521
468/* 522/*
469 * Real beginning of normal "text" segment 523 * Real beginning of normal "text" segment
470 */ 524 */
@@ -484,6 +538,8 @@ ENTRY(empty_zero_page)
484 * This starts the data section. 538 * This starts the data section.
485 */ 539 */
486.data 540.data
541ENTRY(start_pda)
542 .long boot_pda
487 543
488ENTRY(stack_start) 544ENTRY(stack_start)
489 .long init_thread_union+THREAD_SIZE 545 .long init_thread_union+THREAD_SIZE
@@ -525,7 +581,7 @@ idt_descr:
525 581
526# boot GDT descriptor (later on used by CPU#0): 582# boot GDT descriptor (later on used by CPU#0):
527 .word 0 # 32 bit align gdt_desc.address 583 .word 0 # 32 bit align gdt_desc.address
528cpu_gdt_descr: 584ENTRY(cpu_gdt_descr)
529 .word GDT_ENTRIES*8-1 585 .word GDT_ENTRIES*8-1
530 .long cpu_gdt_table 586 .long cpu_gdt_table
531 587
@@ -584,8 +640,8 @@ ENTRY(cpu_gdt_table)
584 .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */ 640 .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */
585 .quad 0x004092000000ffff /* 0xc8 APM DS data */ 641 .quad 0x004092000000ffff /* 0xc8 APM DS data */
586 642
587 .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */ 643 .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */
588 .quad 0x0000000000000000 /* 0xd8 - unused */ 644 .quad 0x00cf92000000ffff /* 0xd8 - PDA */
589 .quad 0x0000000000000000 /* 0xe0 - unused */ 645 .quad 0x0000000000000000 /* 0xe0 - unused */
590 .quad 0x0000000000000000 /* 0xe8 - unused */ 646 .quad 0x0000000000000000 /* 0xe8 - unused */
591 .quad 0x0000000000000000 /* 0xf0 - unused */ 647 .quad 0x0000000000000000 /* 0xf0 - unused */
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 17647a530b2f..45a8685bb60b 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -34,6 +34,7 @@ static int __init init_hpet_clocksource(void)
34 unsigned long hpet_period; 34 unsigned long hpet_period;
35 void __iomem* hpet_base; 35 void __iomem* hpet_base;
36 u64 tmp; 36 u64 tmp;
37 int err;
37 38
38 if (!is_hpet_enabled()) 39 if (!is_hpet_enabled())
39 return -ENODEV; 40 return -ENODEV;
@@ -61,7 +62,11 @@ static int __init init_hpet_clocksource(void)
61 do_div(tmp, FSEC_PER_NSEC); 62 do_div(tmp, FSEC_PER_NSEC);
62 clocksource_hpet.mult = (u32)tmp; 63 clocksource_hpet.mult = (u32)tmp;
63 64
64 return clocksource_register(&clocksource_hpet); 65 err = clocksource_register(&clocksource_hpet);
66 if (err)
67 iounmap(hpet_base);
68
69 return err;
65} 70}
66 71
67module_init(init_hpet_clocksource); 72module_init(init_hpet_clocksource);
diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c
index 477b24daff53..9a0060b92e32 100644
--- a/arch/i386/kernel/i8253.c
+++ b/arch/i386/kernel/i8253.c
@@ -109,7 +109,7 @@ static struct clocksource clocksource_pit = {
109 109
110static int __init init_pit_clocksource(void) 110static int __init init_pit_clocksource(void)
111{ 111{
112 if (num_possible_cpus() > 4) /* PIT does not scale! */ 112 if (num_possible_cpus() > 1) /* PIT does not scale! */
113 return 0; 113 return 0;
114 114
115 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); 115 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index d53eafb6daa7..c8d45821c788 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -113,7 +113,8 @@ void make_8259A_irq(unsigned int irq)
113{ 113{
114 disable_irq_nosync(irq); 114 disable_irq_nosync(irq);
115 io_apic_irqs &= ~(1<<irq); 115 io_apic_irqs &= ~(1<<irq);
116 set_irq_chip_and_handler(irq, &i8259A_chip, handle_level_irq); 116 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
117 "XT");
117 enable_irq(irq); 118 enable_irq(irq);
118} 119}
119 120
@@ -369,8 +370,8 @@ void __init init_ISA_irqs (void)
369 /* 370 /*
370 * 16 old-style INTA-cycle interrupts: 371 * 16 old-style INTA-cycle interrupts:
371 */ 372 */
372 set_irq_chip_and_handler(i, &i8259A_chip, 373 set_irq_chip_and_handler_name(i, &i8259A_chip,
373 handle_level_irq); 374 handle_level_irq, "XT");
374 } else { 375 } else {
375 /* 376 /*
376 * 'high' PCI IRQs filled in on demand 377 * 'high' PCI IRQs filled in on demand
@@ -380,7 +381,10 @@ void __init init_ISA_irqs (void)
380 } 381 }
381} 382}
382 383
383void __init init_IRQ(void) 384/* Overridden in paravirt.c */
385void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
386
387void __init native_init_IRQ(void)
384{ 388{
385 int i; 389 int i;
386 390
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 27bceaf5ce40..2424cc9c7b3d 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -34,6 +34,7 @@
34#include <linux/pci.h> 34#include <linux/pci.h>
35#include <linux/msi.h> 35#include <linux/msi.h>
36#include <linux/htirq.h> 36#include <linux/htirq.h>
37#include <linux/freezer.h>
37 38
38#include <asm/io.h> 39#include <asm/io.h>
39#include <asm/smp.h> 40#include <asm/smp.h>
@@ -91,6 +92,46 @@ static struct irq_pin_list {
91 int apic, pin, next; 92 int apic, pin, next;
92} irq_2_pin[PIN_MAP_SIZE]; 93} irq_2_pin[PIN_MAP_SIZE];
93 94
95struct io_apic {
96 unsigned int index;
97 unsigned int unused[3];
98 unsigned int data;
99};
100
101static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
102{
103 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
104 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
105}
106
107static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
108{
109 struct io_apic __iomem *io_apic = io_apic_base(apic);
110 writel(reg, &io_apic->index);
111 return readl(&io_apic->data);
112}
113
114static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
115{
116 struct io_apic __iomem *io_apic = io_apic_base(apic);
117 writel(reg, &io_apic->index);
118 writel(value, &io_apic->data);
119}
120
121/*
122 * Re-write a value: to be used for read-modify-write
123 * cycles where the read already set up the index register.
124 *
125 * Older SiS APIC requires we rewrite the index register
126 */
127static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
128{
129 volatile struct io_apic *io_apic = io_apic_base(apic);
130 if (sis_apic_bug)
131 writel(reg, &io_apic->index);
132 writel(value, &io_apic->data);
133}
134
94union entry_union { 135union entry_union {
95 struct { u32 w1, w2; }; 136 struct { u32 w1, w2; };
96 struct IO_APIC_route_entry entry; 137 struct IO_APIC_route_entry entry;
@@ -107,11 +148,39 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
107 return eu.entry; 148 return eu.entry;
108} 149}
109 150
110static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 151/*
152 * When we write a new IO APIC routing entry, we need to write the high
153 * word first! If the mask bit in the low word is clear, we will enable
154 * the interrupt, and we need to make sure the entry is fully populated
155 * before that happens.
156 */
157static void
158__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
111{ 159{
112 unsigned long flags;
113 union entry_union eu; 160 union entry_union eu;
114 eu.entry = e; 161 eu.entry = e;
162 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
163 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
164}
165
166static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
167{
168 unsigned long flags;
169 spin_lock_irqsave(&ioapic_lock, flags);
170 __ioapic_write_entry(apic, pin, e);
171 spin_unlock_irqrestore(&ioapic_lock, flags);
172}
173
174/*
175 * When we mask an IO APIC routing entry, we need to write the low
176 * word first, in order to set the mask bit before we change the
177 * high bits!
178 */
179static void ioapic_mask_entry(int apic, int pin)
180{
181 unsigned long flags;
182 union entry_union eu = { .entry.mask = 1 };
183
115 spin_lock_irqsave(&ioapic_lock, flags); 184 spin_lock_irqsave(&ioapic_lock, flags);
116 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 185 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
117 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 186 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
@@ -234,9 +303,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
234 /* 303 /*
235 * Disable it in the IO-APIC irq-routing table: 304 * Disable it in the IO-APIC irq-routing table:
236 */ 305 */
237 memset(&entry, 0, sizeof(entry)); 306 ioapic_mask_entry(apic, pin);
238 entry.mask = 1;
239 ioapic_write_entry(apic, pin, entry);
240} 307}
241 308
242static void clear_IO_APIC (void) 309static void clear_IO_APIC (void)
@@ -776,8 +843,7 @@ static int __init find_isa_irq_pin(int irq, int type)
776 843
777 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 844 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
778 mp_bus_id_to_type[lbus] == MP_BUS_EISA || 845 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
779 mp_bus_id_to_type[lbus] == MP_BUS_MCA || 846 mp_bus_id_to_type[lbus] == MP_BUS_MCA
780 mp_bus_id_to_type[lbus] == MP_BUS_NEC98
781 ) && 847 ) &&
782 (mp_irqs[i].mpc_irqtype == type) && 848 (mp_irqs[i].mpc_irqtype == type) &&
783 (mp_irqs[i].mpc_srcbusirq == irq)) 849 (mp_irqs[i].mpc_srcbusirq == irq))
@@ -796,8 +862,7 @@ static int __init find_isa_irq_apic(int irq, int type)
796 862
797 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 863 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
798 mp_bus_id_to_type[lbus] == MP_BUS_EISA || 864 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
799 mp_bus_id_to_type[lbus] == MP_BUS_MCA || 865 mp_bus_id_to_type[lbus] == MP_BUS_MCA
800 mp_bus_id_to_type[lbus] == MP_BUS_NEC98
801 ) && 866 ) &&
802 (mp_irqs[i].mpc_irqtype == type) && 867 (mp_irqs[i].mpc_irqtype == type) &&
803 (mp_irqs[i].mpc_srcbusirq == irq)) 868 (mp_irqs[i].mpc_srcbusirq == irq))
@@ -927,12 +992,6 @@ static int EISA_ELCR(unsigned int irq)
927#define default_MCA_trigger(idx) (1) 992#define default_MCA_trigger(idx) (1)
928#define default_MCA_polarity(idx) (0) 993#define default_MCA_polarity(idx) (0)
929 994
930/* NEC98 interrupts are always polarity zero edge triggered,
931 * when listed as conforming in the MP table. */
932
933#define default_NEC98_trigger(idx) (0)
934#define default_NEC98_polarity(idx) (0)
935
936static int __init MPBIOS_polarity(int idx) 995static int __init MPBIOS_polarity(int idx)
937{ 996{
938 int bus = mp_irqs[idx].mpc_srcbus; 997 int bus = mp_irqs[idx].mpc_srcbus;
@@ -967,11 +1026,6 @@ static int __init MPBIOS_polarity(int idx)
967 polarity = default_MCA_polarity(idx); 1026 polarity = default_MCA_polarity(idx);
968 break; 1027 break;
969 } 1028 }
970 case MP_BUS_NEC98: /* NEC 98 pin */
971 {
972 polarity = default_NEC98_polarity(idx);
973 break;
974 }
975 default: 1029 default:
976 { 1030 {
977 printk(KERN_WARNING "broken BIOS!!\n"); 1031 printk(KERN_WARNING "broken BIOS!!\n");
@@ -1041,11 +1095,6 @@ static int MPBIOS_trigger(int idx)
1041 trigger = default_MCA_trigger(idx); 1095 trigger = default_MCA_trigger(idx);
1042 break; 1096 break;
1043 } 1097 }
1044 case MP_BUS_NEC98: /* NEC 98 pin */
1045 {
1046 trigger = default_NEC98_trigger(idx);
1047 break;
1048 }
1049 default: 1098 default:
1050 { 1099 {
1051 printk(KERN_WARNING "broken BIOS!!\n"); 1100 printk(KERN_WARNING "broken BIOS!!\n");
@@ -1107,7 +1156,6 @@ static int pin_2_irq(int idx, int apic, int pin)
1107 case MP_BUS_ISA: /* ISA pin */ 1156 case MP_BUS_ISA: /* ISA pin */
1108 case MP_BUS_EISA: 1157 case MP_BUS_EISA:
1109 case MP_BUS_MCA: 1158 case MP_BUS_MCA:
1110 case MP_BUS_NEC98:
1111 { 1159 {
1112 irq = mp_irqs[idx].mpc_srcbusirq; 1160 irq = mp_irqs[idx].mpc_srcbusirq;
1113 break; 1161 break;
@@ -1175,7 +1223,7 @@ static inline int IO_APIC_irq_trigger(int irq)
1175} 1223}
1176 1224
1177/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ 1225/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
1178u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; 1226static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
1179 1227
1180static int __assign_irq_vector(int irq) 1228static int __assign_irq_vector(int irq)
1181{ 1229{
@@ -1225,11 +1273,13 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1225{ 1273{
1226 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1274 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1227 trigger == IOAPIC_LEVEL) 1275 trigger == IOAPIC_LEVEL)
1228 set_irq_chip_and_handler(irq, &ioapic_chip, 1276 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1229 handle_fasteoi_irq); 1277 handle_fasteoi_irq, "fasteoi");
1230 else 1278 else {
1231 set_irq_chip_and_handler(irq, &ioapic_chip, 1279 irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
1232 handle_edge_irq); 1280 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1281 handle_edge_irq, "edge");
1282 }
1233 set_intr_gate(vector, interrupt[irq]); 1283 set_intr_gate(vector, interrupt[irq]);
1234} 1284}
1235 1285
@@ -1298,8 +1348,8 @@ static void __init setup_IO_APIC_irqs(void)
1298 if (!apic && (irq < 16)) 1348 if (!apic && (irq < 16))
1299 disable_8259A_irq(irq); 1349 disable_8259A_irq(irq);
1300 } 1350 }
1301 ioapic_write_entry(apic, pin, entry);
1302 spin_lock_irqsave(&ioapic_lock, flags); 1351 spin_lock_irqsave(&ioapic_lock, flags);
1352 __ioapic_write_entry(apic, pin, entry);
1303 set_native_irq_info(irq, TARGET_CPUS); 1353 set_native_irq_info(irq, TARGET_CPUS);
1304 spin_unlock_irqrestore(&ioapic_lock, flags); 1354 spin_unlock_irqrestore(&ioapic_lock, flags);
1305 } 1355 }
@@ -1864,6 +1914,15 @@ static void __init setup_ioapic_ids_from_mpc(void)
1864static void __init setup_ioapic_ids_from_mpc(void) { } 1914static void __init setup_ioapic_ids_from_mpc(void) { }
1865#endif 1915#endif
1866 1916
1917static int no_timer_check __initdata;
1918
1919static int __init notimercheck(char *s)
1920{
1921 no_timer_check = 1;
1922 return 1;
1923}
1924__setup("no_timer_check", notimercheck);
1925
1867/* 1926/*
1868 * There is a nasty bug in some older SMP boards, their mptable lies 1927 * There is a nasty bug in some older SMP boards, their mptable lies
1869 * about the timer IRQ. We do the following to work around the situation: 1928 * about the timer IRQ. We do the following to work around the situation:
@@ -1872,10 +1931,13 @@ static void __init setup_ioapic_ids_from_mpc(void) { }
1872 * - if this function detects that timer IRQs are defunct, then we fall 1931 * - if this function detects that timer IRQs are defunct, then we fall
1873 * back to ISA timer IRQs 1932 * back to ISA timer IRQs
1874 */ 1933 */
1875static int __init timer_irq_works(void) 1934int __init timer_irq_works(void)
1876{ 1935{
1877 unsigned long t1 = jiffies; 1936 unsigned long t1 = jiffies;
1878 1937
1938 if (no_timer_check)
1939 return 1;
1940
1879 local_irq_enable(); 1941 local_irq_enable();
1880 /* Let ten ticks pass... */ 1942 /* Let ten ticks pass... */
1881 mdelay((10 * 1000) / HZ); 1943 mdelay((10 * 1000) / HZ);
@@ -2099,9 +2161,15 @@ static inline void unlock_ExtINT_logic(void)
2099 unsigned char save_control, save_freq_select; 2161 unsigned char save_control, save_freq_select;
2100 2162
2101 pin = find_isa_irq_pin(8, mp_INT); 2163 pin = find_isa_irq_pin(8, mp_INT);
2164 if (pin == -1) {
2165 WARN_ON_ONCE(1);
2166 return;
2167 }
2102 apic = find_isa_irq_apic(8, mp_INT); 2168 apic = find_isa_irq_apic(8, mp_INT);
2103 if (pin == -1) 2169 if (apic == -1) {
2170 WARN_ON_ONCE(1);
2104 return; 2171 return;
2172 }
2105 2173
2106 entry0 = ioapic_read_entry(apic, pin); 2174 entry0 = ioapic_read_entry(apic, pin);
2107 clear_IO_APIC_pin(apic, pin); 2175 clear_IO_APIC_pin(apic, pin);
@@ -2146,7 +2214,7 @@ int timer_uses_ioapic_pin_0;
2146 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast 2214 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2147 * fanatically on his truly buggy board. 2215 * fanatically on his truly buggy board.
2148 */ 2216 */
2149static inline void check_timer(void) 2217static inline void __init check_timer(void)
2150{ 2218{
2151 int apic1, pin1, apic2, pin2; 2219 int apic1, pin1, apic2, pin2;
2152 int vector; 2220 int vector;
@@ -2235,7 +2303,8 @@ static inline void check_timer(void)
2235 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 2303 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
2236 2304
2237 disable_8259A_irq(0); 2305 disable_8259A_irq(0);
2238 set_irq_chip_and_handler(0, &lapic_chip, handle_fasteoi_irq); 2306 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
2307 "fasteio");
2239 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ 2308 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2240 enable_8259A_irq(0); 2309 enable_8259A_irq(0);
2241 2310
@@ -2416,7 +2485,7 @@ device_initcall(ioapic_init_sysfs);
2416int create_irq(void) 2485int create_irq(void)
2417{ 2486{
2418 /* Allocate an unused irq */ 2487 /* Allocate an unused irq */
2419 int irq, new, vector; 2488 int irq, new, vector = 0;
2420 unsigned long flags; 2489 unsigned long flags;
2421 2490
2422 irq = -ENOSPC; 2491 irq = -ENOSPC;
@@ -2541,7 +2610,8 @@ int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
2541 2610
2542 write_msi_msg(irq, &msg); 2611 write_msi_msg(irq, &msg);
2543 2612
2544 set_irq_chip_and_handler(irq, &msi_chip, handle_edge_irq); 2613 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
2614 "edge");
2545 2615
2546 return 0; 2616 return 0;
2547} 2617}
@@ -2562,18 +2632,16 @@ void arch_teardown_msi_irq(unsigned int irq)
2562 2632
2563static void target_ht_irq(unsigned int irq, unsigned int dest) 2633static void target_ht_irq(unsigned int irq, unsigned int dest)
2564{ 2634{
2565 u32 low, high; 2635 struct ht_irq_msg msg;
2566 low = read_ht_irq_low(irq); 2636 fetch_ht_irq_msg(irq, &msg);
2567 high = read_ht_irq_high(irq);
2568 2637
2569 low &= ~(HT_IRQ_LOW_DEST_ID_MASK); 2638 msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
2570 high &= ~(HT_IRQ_HIGH_DEST_ID_MASK); 2639 msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2571 2640
2572 low |= HT_IRQ_LOW_DEST_ID(dest); 2641 msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
2573 high |= HT_IRQ_HIGH_DEST_ID(dest); 2642 msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2574 2643
2575 write_ht_irq_low(irq, low); 2644 write_ht_irq_msg(irq, &msg);
2576 write_ht_irq_high(irq, high);
2577} 2645}
2578 2646
2579static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) 2647static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
@@ -2611,7 +2679,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2611 2679
2612 vector = assign_irq_vector(irq); 2680 vector = assign_irq_vector(irq);
2613 if (vector >= 0) { 2681 if (vector >= 0) {
2614 u32 low, high; 2682 struct ht_irq_msg msg;
2615 unsigned dest; 2683 unsigned dest;
2616 cpumask_t tmp; 2684 cpumask_t tmp;
2617 2685
@@ -2619,9 +2687,10 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2619 cpu_set(vector >> 8, tmp); 2687 cpu_set(vector >> 8, tmp);
2620 dest = cpu_mask_to_apicid(tmp); 2688 dest = cpu_mask_to_apicid(tmp);
2621 2689
2622 high = HT_IRQ_HIGH_DEST_ID(dest); 2690 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2623 2691
2624 low = HT_IRQ_LOW_BASE | 2692 msg.address_lo =
2693 HT_IRQ_LOW_BASE |
2625 HT_IRQ_LOW_DEST_ID(dest) | 2694 HT_IRQ_LOW_DEST_ID(dest) |
2626 HT_IRQ_LOW_VECTOR(vector) | 2695 HT_IRQ_LOW_VECTOR(vector) |
2627 ((INT_DEST_MODE == 0) ? 2696 ((INT_DEST_MODE == 0) ?
@@ -2633,10 +2702,10 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2633 HT_IRQ_LOW_MT_ARBITRATED) | 2702 HT_IRQ_LOW_MT_ARBITRATED) |
2634 HT_IRQ_LOW_IRQ_MASKED; 2703 HT_IRQ_LOW_IRQ_MASKED;
2635 2704
2636 write_ht_irq_low(irq, low); 2705 write_ht_irq_msg(irq, &msg);
2637 write_ht_irq_high(irq, high);
2638 2706
2639 set_irq_chip_and_handler(irq, &ht_irq_chip, handle_edge_irq); 2707 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2708 handle_edge_irq, "edge");
2640 } 2709 }
2641 return vector; 2710 return vector;
2642} 2711}
@@ -2793,8 +2862,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
2793 if (!ioapic && (irq < 16)) 2862 if (!ioapic && (irq < 16))
2794 disable_8259A_irq(irq); 2863 disable_8259A_irq(irq);
2795 2864
2796 ioapic_write_entry(ioapic, pin, entry);
2797 spin_lock_irqsave(&ioapic_lock, flags); 2865 spin_lock_irqsave(&ioapic_lock, flags);
2866 __ioapic_write_entry(ioapic, pin, entry);
2798 set_native_irq_info(irq, TARGET_CPUS); 2867 set_native_irq_info(irq, TARGET_CPUS);
2799 spin_unlock_irqrestore(&ioapic_lock, flags); 2868 spin_unlock_irqrestore(&ioapic_lock, flags);
2800 2869
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 8cfc7dbec7b9..3201d421090a 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -258,7 +258,7 @@ int show_interrupts(struct seq_file *p, void *v)
258 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); 258 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
259#endif 259#endif
260 seq_printf(p, " %8s", irq_desc[i].chip->name); 260 seq_printf(p, " %8s", irq_desc[i].chip->name);
261 seq_printf(p, "-%s", handle_irq_name(irq_desc[i].handle_irq)); 261 seq_printf(p, "-%-8s", irq_desc[i].name);
262 seq_printf(p, " %s", action->name); 262 seq_printf(p, " %s", action->name);
263 263
264 for (action=action->next; action; action = action->next) 264 for (action=action->next; action; action = action->next)
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index d98e44b16fe2..af1d53344993 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -184,7 +184,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
184void __kprobes arch_remove_kprobe(struct kprobe *p) 184void __kprobes arch_remove_kprobe(struct kprobe *p)
185{ 185{
186 mutex_lock(&kprobe_mutex); 186 mutex_lock(&kprobe_mutex);
187 free_insn_slot(p->ainsn.insn); 187 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
188 mutex_unlock(&kprobe_mutex); 188 mutex_unlock(&kprobe_mutex);
189} 189}
190 190
@@ -333,7 +333,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
333 return 1; 333 return 1;
334 334
335ss_probe: 335ss_probe:
336#ifndef CONFIG_PREEMPT 336#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
337 if (p->ainsn.boostable == 1 && !p->post_handler){ 337 if (p->ainsn.boostable == 1 && !p->post_handler){
338 /* Boost up -- we can execute copied instructions directly */ 338 /* Boost up -- we can execute copied instructions directly */
339 reset_current_kprobe(); 339 reset_current_kprobe();
@@ -361,8 +361,11 @@ no_kprobe:
361 asm volatile ( ".global kretprobe_trampoline\n" 361 asm volatile ( ".global kretprobe_trampoline\n"
362 "kretprobe_trampoline: \n" 362 "kretprobe_trampoline: \n"
363 " pushf\n" 363 " pushf\n"
364 /* skip cs, eip, orig_eax, es, ds */ 364 /* skip cs, eip, orig_eax */
365 " subl $20, %esp\n" 365 " subl $12, %esp\n"
366 " pushl %gs\n"
367 " pushl %ds\n"
368 " pushl %es\n"
366 " pushl %eax\n" 369 " pushl %eax\n"
367 " pushl %ebp\n" 370 " pushl %ebp\n"
368 " pushl %edi\n" 371 " pushl %edi\n"
@@ -373,10 +376,10 @@ no_kprobe:
373 " movl %esp, %eax\n" 376 " movl %esp, %eax\n"
374 " call trampoline_handler\n" 377 " call trampoline_handler\n"
375 /* move eflags to cs */ 378 /* move eflags to cs */
376 " movl 48(%esp), %edx\n" 379 " movl 52(%esp), %edx\n"
377 " movl %edx, 44(%esp)\n" 380 " movl %edx, 48(%esp)\n"
378 /* save true return address on eflags */ 381 /* save true return address on eflags */
379 " movl %eax, 48(%esp)\n" 382 " movl %eax, 52(%esp)\n"
380 " popl %ebx\n" 383 " popl %ebx\n"
381 " popl %ecx\n" 384 " popl %ecx\n"
382 " popl %edx\n" 385 " popl %edx\n"
@@ -384,8 +387,8 @@ no_kprobe:
384 " popl %edi\n" 387 " popl %edi\n"
385 " popl %ebp\n" 388 " popl %ebp\n"
386 " popl %eax\n" 389 " popl %eax\n"
387 /* skip eip, orig_eax, es, ds */ 390 /* skip eip, orig_eax, es, ds, gs */
388 " addl $16, %esp\n" 391 " addl $20, %esp\n"
389 " popf\n" 392 " popf\n"
390 " ret\n"); 393 " ret\n");
391} 394}
@@ -404,6 +407,10 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
404 INIT_HLIST_HEAD(&empty_rp); 407 INIT_HLIST_HEAD(&empty_rp);
405 spin_lock_irqsave(&kretprobe_lock, flags); 408 spin_lock_irqsave(&kretprobe_lock, flags);
406 head = kretprobe_inst_table_head(current); 409 head = kretprobe_inst_table_head(current);
410 /* fixup registers */
411 regs->xcs = __KERNEL_CS;
412 regs->eip = trampoline_address;
413 regs->orig_eax = 0xffffffff;
407 414
408 /* 415 /*
409 * It is possible to have multiple instances associated with a given 416 * It is possible to have multiple instances associated with a given
@@ -425,6 +432,7 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
425 432
426 if (ri->rp && ri->rp->handler){ 433 if (ri->rp && ri->rp->handler){
427 __get_cpu_var(current_kprobe) = &ri->rp->kp; 434 __get_cpu_var(current_kprobe) = &ri->rp->kp;
435 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
428 ri->rp->handler(ri, regs); 436 ri->rp->handler(ri, regs);
429 __get_cpu_var(current_kprobe) = NULL; 437 __get_cpu_var(current_kprobe) = NULL;
430 } 438 }
diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c
index 445211eb2d57..b410e5fb034f 100644
--- a/arch/i386/kernel/ldt.c
+++ b/arch/i386/kernel/ldt.c
@@ -160,16 +160,14 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount)
160{ 160{
161 int err; 161 int err;
162 unsigned long size; 162 unsigned long size;
163 void *address;
164 163
165 err = 0; 164 err = 0;
166 address = &default_ldt[0];
167 size = 5*sizeof(struct desc_struct); 165 size = 5*sizeof(struct desc_struct);
168 if (size > bytecount) 166 if (size > bytecount)
169 size = bytecount; 167 size = bytecount;
170 168
171 err = size; 169 err = size;
172 if (copy_to_user(ptr, address, size)) 170 if (clear_user(ptr, size))
173 err = -EFAULT; 171 err = -EFAULT;
174 172
175 return err; 173 return err;
diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c
index eb57a851789d..b83672b89527 100644
--- a/arch/i386/kernel/mca.c
+++ b/arch/i386/kernel/mca.c
@@ -283,10 +283,9 @@ static int __init mca_init(void)
283 bus->f.mca_transform_memory = mca_dummy_transform_memory; 283 bus->f.mca_transform_memory = mca_dummy_transform_memory;
284 284
285 /* get the motherboard device */ 285 /* get the motherboard device */
286 mca_dev = kmalloc(sizeof(struct mca_device), GFP_KERNEL); 286 mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
287 if(unlikely(!mca_dev)) 287 if(unlikely(!mca_dev))
288 goto out_nomem; 288 goto out_nomem;
289 memset(mca_dev, 0, sizeof(struct mca_device));
290 289
291 /* 290 /*
292 * We do not expect many MCA interrupts during initialization, 291 * We do not expect many MCA interrupts during initialization,
@@ -310,11 +309,9 @@ static int __init mca_init(void)
310 mca_dev->slot = MCA_MOTHERBOARD; 309 mca_dev->slot = MCA_MOTHERBOARD;
311 mca_register_device(MCA_PRIMARY_BUS, mca_dev); 310 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
312 311
313 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 312 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
314 if(unlikely(!mca_dev)) 313 if(unlikely(!mca_dev))
315 goto out_unlock_nomem; 314 goto out_unlock_nomem;
316 memset(mca_dev, 0, sizeof(struct mca_device));
317
318 315
319 /* Put motherboard into video setup mode, read integrated video 316 /* Put motherboard into video setup mode, read integrated video
320 * POS registers, and turn motherboard setup off. 317 * POS registers, and turn motherboard setup off.
@@ -349,10 +346,9 @@ static int __init mca_init(void)
349 } 346 }
350 if(which_scsi) { 347 if(which_scsi) {
351 /* found a scsi card */ 348 /* found a scsi card */
352 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 349 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
353 if(unlikely(!mca_dev)) 350 if(unlikely(!mca_dev))
354 goto out_unlock_nomem; 351 goto out_unlock_nomem;
355 memset(mca_dev, 0, sizeof(struct mca_device));
356 352
357 for(j = 0; j < 8; j++) 353 for(j = 0; j < 8; j++)
358 mca_dev->pos[j] = pos[j]; 354 mca_dev->pos[j] = pos[j];
@@ -378,10 +374,9 @@ static int __init mca_init(void)
378 if(!mca_read_and_store_pos(pos)) 374 if(!mca_read_and_store_pos(pos))
379 continue; 375 continue;
380 376
381 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 377 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
382 if(unlikely(!mca_dev)) 378 if(unlikely(!mca_dev))
383 goto out_unlock_nomem; 379 goto out_unlock_nomem;
384 memset(mca_dev, 0, sizeof(struct mca_device));
385 380
386 for(j=0; j<8; j++) 381 for(j=0; j<8; j++)
387 mca_dev->pos[j]=pos[j]; 382 mca_dev->pos[j]=pos[j];
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index c4d0291b519f..972346604f9d 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -577,7 +577,7 @@ static void microcode_init_cpu(int cpu)
577 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 577 set_cpus_allowed(current, cpumask_of_cpu(cpu));
578 mutex_lock(&microcode_mutex); 578 mutex_lock(&microcode_mutex);
579 collect_cpu_info(cpu); 579 collect_cpu_info(cpu);
580 if (uci->valid) 580 if (uci->valid && system_state == SYSTEM_RUNNING)
581 cpu_request_microcode(cpu); 581 cpu_request_microcode(cpu);
582 mutex_unlock(&microcode_mutex); 582 mutex_unlock(&microcode_mutex);
583 set_cpus_allowed(current, old); 583 set_cpus_allowed(current, old);
@@ -703,7 +703,6 @@ static struct sysdev_driver mc_sysdev_driver = {
703 .resume = mc_sysdev_resume, 703 .resume = mc_sysdev_resume,
704}; 704};
705 705
706#ifdef CONFIG_HOTPLUG_CPU
707static __cpuinit int 706static __cpuinit int
708mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) 707mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
709{ 708{
@@ -726,7 +725,6 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
726static struct notifier_block mc_cpu_notifier = { 725static struct notifier_block mc_cpu_notifier = {
727 .notifier_call = mc_cpu_callback, 726 .notifier_call = mc_cpu_callback,
728}; 727};
729#endif
730 728
731static int __init microcode_init (void) 729static int __init microcode_init (void)
732{ 730{
diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c
index 470cf97e7cd3..3db0a5442eb1 100644
--- a/arch/i386/kernel/module.c
+++ b/arch/i386/kernel/module.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/string.h> 22#include <linux/string.h>
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/bug.h>
24 25
25#if 0 26#if 0
26#define DEBUGP printk 27#define DEBUGP printk
@@ -108,7 +109,8 @@ int module_finalize(const Elf_Ehdr *hdr,
108 const Elf_Shdr *sechdrs, 109 const Elf_Shdr *sechdrs,
109 struct module *me) 110 struct module *me)
110{ 111{
111 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 112 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
113 *para = NULL;
112 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
113 115
114 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -118,6 +120,8 @@ int module_finalize(const Elf_Ehdr *hdr,
118 alt = s; 120 alt = s;
119 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
120 locks= s; 122 locks= s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s;
121 } 125 }
122 126
123 if (alt) { 127 if (alt) {
@@ -132,10 +136,17 @@ int module_finalize(const Elf_Ehdr *hdr,
132 lseg, lseg + locks->sh_size, 136 lseg, lseg + locks->sh_size,
133 tseg, tseg + text->sh_size); 137 tseg, tseg + text->sh_size);
134 } 138 }
135 return 0; 139
140 if (para) {
141 void *pseg = (void *)para->sh_addr;
142 apply_paravirt(pseg, pseg + para->sh_size);
143 }
144
145 return module_bug_finalize(hdr, sechdrs, me);
136} 146}
137 147
138void module_arch_cleanup(struct module *mod) 148void module_arch_cleanup(struct module *mod)
139{ 149{
140 alternatives_smp_module_del(mod); 150 alternatives_smp_module_del(mod);
151 module_bug_cleanup(mod);
141} 152}
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 442aaf8c77eb..2ce67228dff8 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -249,8 +249,6 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
249 mp_current_pci_id++; 249 mp_current_pci_id++;
250 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { 250 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
251 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; 251 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
252 } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
253 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
254 } else { 252 } else {
255 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 253 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
256 } 254 }
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index d535cdbbfd25..4a472a17d1c6 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -172,7 +172,7 @@ static ssize_t msr_read(struct file *file, char __user * buf,
172 u32 __user *tmp = (u32 __user *) buf; 172 u32 __user *tmp = (u32 __user *) buf;
173 u32 data[2]; 173 u32 data[2];
174 u32 reg = *ppos; 174 u32 reg = *ppos;
175 int cpu = iminor(file->f_dentry->d_inode); 175 int cpu = iminor(file->f_path.dentry->d_inode);
176 int err; 176 int err;
177 177
178 if (count % 8) 178 if (count % 8)
@@ -195,15 +195,14 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
195{ 195{
196 const u32 __user *tmp = (const u32 __user *)buf; 196 const u32 __user *tmp = (const u32 __user *)buf;
197 u32 data[2]; 197 u32 data[2];
198 size_t rv;
199 u32 reg = *ppos; 198 u32 reg = *ppos;
200 int cpu = iminor(file->f_dentry->d_inode); 199 int cpu = iminor(file->f_path.dentry->d_inode);
201 int err; 200 int err;
202 201
203 if (count % 8) 202 if (count % 8)
204 return -EINVAL; /* Invalid chunk size */ 203 return -EINVAL; /* Invalid chunk size */
205 204
206 for (rv = 0; count; count -= 8) { 205 for (; count; count -= 8) {
207 if (copy_from_user(&data, tmp, 8)) 206 if (copy_from_user(&data, tmp, 8))
208 return -EFAULT; 207 return -EFAULT;
209 err = do_wrmsr(cpu, reg, data[0], data[1]); 208 err = do_wrmsr(cpu, reg, data[0], data[1]);
@@ -217,7 +216,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
217 216
218static int msr_open(struct inode *inode, struct file *file) 217static int msr_open(struct inode *inode, struct file *file)
219{ 218{
220 unsigned int cpu = iminor(file->f_dentry->d_inode); 219 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
221 struct cpuinfo_x86 *c = &(cpu_data)[cpu]; 220 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
222 221
223 if (cpu >= NR_CPUS || !cpu_online(cpu)) 222 if (cpu >= NR_CPUS || !cpu_online(cpu))
@@ -239,18 +238,17 @@ static struct file_operations msr_fops = {
239 .open = msr_open, 238 .open = msr_open,
240}; 239};
241 240
242static int msr_class_device_create(int i) 241static int msr_device_create(int i)
243{ 242{
244 int err = 0; 243 int err = 0;
245 struct class_device *class_err; 244 struct device *dev;
246 245
247 class_err = class_device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), NULL, "msr%d",i); 246 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i);
248 if (IS_ERR(class_err)) 247 if (IS_ERR(dev))
249 err = PTR_ERR(class_err); 248 err = PTR_ERR(dev);
250 return err; 249 return err;
251} 250}
252 251
253#ifdef CONFIG_HOTPLUG_CPU
254static int msr_class_cpu_callback(struct notifier_block *nfb, 252static int msr_class_cpu_callback(struct notifier_block *nfb,
255 unsigned long action, void *hcpu) 253 unsigned long action, void *hcpu)
256{ 254{
@@ -258,10 +256,10 @@ static int msr_class_cpu_callback(struct notifier_block *nfb,
258 256
259 switch (action) { 257 switch (action) {
260 case CPU_ONLINE: 258 case CPU_ONLINE:
261 msr_class_device_create(cpu); 259 msr_device_create(cpu);
262 break; 260 break;
263 case CPU_DEAD: 261 case CPU_DEAD:
264 class_device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); 262 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
265 break; 263 break;
266 } 264 }
267 return NOTIFY_OK; 265 return NOTIFY_OK;
@@ -271,7 +269,6 @@ static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
271{ 269{
272 .notifier_call = msr_class_cpu_callback, 270 .notifier_call = msr_class_cpu_callback,
273}; 271};
274#endif
275 272
276static int __init msr_init(void) 273static int __init msr_init(void)
277{ 274{
@@ -290,7 +287,7 @@ static int __init msr_init(void)
290 goto out_chrdev; 287 goto out_chrdev;
291 } 288 }
292 for_each_online_cpu(i) { 289 for_each_online_cpu(i) {
293 err = msr_class_device_create(i); 290 err = msr_device_create(i);
294 if (err != 0) 291 if (err != 0)
295 goto out_class; 292 goto out_class;
296 } 293 }
@@ -302,7 +299,7 @@ static int __init msr_init(void)
302out_class: 299out_class:
303 i = 0; 300 i = 0;
304 for_each_online_cpu(i) 301 for_each_online_cpu(i)
305 class_device_destroy(msr_class, MKDEV(MSR_MAJOR, i)); 302 device_destroy(msr_class, MKDEV(MSR_MAJOR, i));
306 class_destroy(msr_class); 303 class_destroy(msr_class);
307out_chrdev: 304out_chrdev:
308 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 305 unregister_chrdev(MSR_MAJOR, "cpu/msr");
@@ -314,7 +311,7 @@ static void __exit msr_exit(void)
314{ 311{
315 int cpu = 0; 312 int cpu = 0;
316 for_each_online_cpu(cpu) 313 for_each_online_cpu(cpu)
317 class_device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); 314 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
318 class_destroy(msr_class); 315 class_destroy(msr_class);
319 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 316 unregister_chrdev(MSR_MAJOR, "cpu/msr");
320 unregister_hotcpu_notifier(&msr_class_cpu_notifier); 317 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 3e8e3adb0489..a5e34d655965 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -22,6 +22,7 @@
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/dmi.h> 23#include <linux/dmi.h>
24#include <linux/kprobes.h> 24#include <linux/kprobes.h>
25#include <linux/cpumask.h>
25 26
26#include <asm/smp.h> 27#include <asm/smp.h>
27#include <asm/nmi.h> 28#include <asm/nmi.h>
@@ -42,6 +43,8 @@ int nmi_watchdog_enabled;
42static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner); 43static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
43static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]); 44static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
44 45
46static cpumask_t backtrace_mask = CPU_MASK_NONE;
47
45/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's 48/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
46 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) 49 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
47 */ 50 */
@@ -192,6 +195,8 @@ static __cpuinit inline int nmi_known_cpu(void)
192 return 0; 195 return 0;
193} 196}
194 197
198static int endflag __initdata = 0;
199
195#ifdef CONFIG_SMP 200#ifdef CONFIG_SMP
196/* The performance counters used by NMI_LOCAL_APIC don't trigger when 201/* The performance counters used by NMI_LOCAL_APIC don't trigger when
197 * the CPU is idle. To make sure the NMI watchdog really ticks on all 202 * the CPU is idle. To make sure the NMI watchdog really ticks on all
@@ -199,7 +204,6 @@ static __cpuinit inline int nmi_known_cpu(void)
199 */ 204 */
200static __init void nmi_cpu_busy(void *data) 205static __init void nmi_cpu_busy(void *data)
201{ 206{
202 volatile int *endflag = data;
203 local_irq_enable_in_hardirq(); 207 local_irq_enable_in_hardirq();
204 /* Intentionally don't use cpu_relax here. This is 208 /* Intentionally don't use cpu_relax here. This is
205 to make sure that the performance counter really ticks, 209 to make sure that the performance counter really ticks,
@@ -207,23 +211,22 @@ static __init void nmi_cpu_busy(void *data)
207 pause instruction. On a real HT machine this is fine because 211 pause instruction. On a real HT machine this is fine because
208 all other CPUs are busy with "useless" delay loops and don't 212 all other CPUs are busy with "useless" delay loops and don't
209 care if they get somewhat less cycles. */ 213 care if they get somewhat less cycles. */
210 while (*endflag == 0) 214 while (endflag == 0)
211 barrier(); 215 mb();
212} 216}
213#endif 217#endif
214 218
215static int __init check_nmi_watchdog(void) 219static int __init check_nmi_watchdog(void)
216{ 220{
217 volatile int endflag = 0;
218 unsigned int *prev_nmi_count; 221 unsigned int *prev_nmi_count;
219 int cpu; 222 int cpu;
220 223
221 /* Enable NMI watchdog for newer systems. 224 /* Enable NMI watchdog for newer systems.
222 Actually it should be safe for most systems before 2004 too except 225 Probably safe on most older systems too, but let's be careful.
223 for some IBM systems that corrupt registers when NMI happens 226 IBM ThinkPads use INT10 inside SMM and that allows early NMI inside SMM
224 during SMM. Unfortunately we don't have more exact information 227 which hangs the system. Disable watchdog for all thinkpads */
225 on these and use this coarse check. */ 228 if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004 &&
226 if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004) 229 !dmi_name_in_vendors("ThinkPad"))
227 nmi_watchdog = NMI_LOCAL_APIC; 230 nmi_watchdog = NMI_LOCAL_APIC;
228 231
229 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT)) 232 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
@@ -867,14 +870,16 @@ static unsigned int
867 870
868void touch_nmi_watchdog (void) 871void touch_nmi_watchdog (void)
869{ 872{
870 int i; 873 if (nmi_watchdog > 0) {
874 unsigned cpu;
871 875
872 /* 876 /*
873 * Just reset the alert counters, (other CPUs might be 877 * Just reset the alert counters, (other CPUs might be
874 * spinning on locks we hold): 878 * spinning on locks we hold):
875 */ 879 */
876 for_each_possible_cpu(i) 880 for_each_present_cpu (cpu)
877 alert_counter[i] = 0; 881 alert_counter[cpu] = 0;
882 }
878 883
879 /* 884 /*
880 * Tickle the softlockup detector too: 885 * Tickle the softlockup detector too:
@@ -907,6 +912,16 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
907 touched = 1; 912 touched = 1;
908 } 913 }
909 914
915 if (cpu_isset(cpu, backtrace_mask)) {
916 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
917
918 spin_lock(&lock);
919 printk("NMI backtrace for cpu %d\n", cpu);
920 dump_stack();
921 spin_unlock(&lock);
922 cpu_clear(cpu, backtrace_mask);
923 }
924
910 sum = per_cpu(irq_stat, cpu).apic_timer_irqs; 925 sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
911 926
912 /* if the apic timer isn't firing, this cpu isn't doing much */ 927 /* if the apic timer isn't firing, this cpu isn't doing much */
@@ -1033,6 +1048,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
1033 1048
1034#endif 1049#endif
1035 1050
1051void __trigger_all_cpu_backtrace(void)
1052{
1053 int i;
1054
1055 backtrace_mask = cpu_online_map;
1056 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
1057 for (i = 0; i < 10 * 1000; i++) {
1058 if (cpus_empty(backtrace_mask))
1059 break;
1060 mdelay(1);
1061 }
1062}
1063
1036EXPORT_SYMBOL(nmi_active); 1064EXPORT_SYMBOL(nmi_active);
1037EXPORT_SYMBOL(nmi_watchdog); 1065EXPORT_SYMBOL(nmi_watchdog);
1038EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); 1066EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
new file mode 100644
index 000000000000..3dceab5828f1
--- /dev/null
+++ b/arch/i386/kernel/paravirt.c
@@ -0,0 +1,569 @@
1/* Paravirtualization interfaces
2 Copyright (C) 2006 Rusty Russell IBM Corporation
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18#include <linux/errno.h>
19#include <linux/module.h>
20#include <linux/efi.h>
21#include <linux/bcd.h>
22#include <linux/start_kernel.h>
23
24#include <asm/bug.h>
25#include <asm/paravirt.h>
26#include <asm/desc.h>
27#include <asm/setup.h>
28#include <asm/arch_hooks.h>
29#include <asm/time.h>
30#include <asm/irq.h>
31#include <asm/delay.h>
32#include <asm/fixmap.h>
33#include <asm/apic.h>
34#include <asm/tlbflush.h>
35
/*
 * No-op stub: takes nothing, does nothing.  Used below as the
 * placeholder for paravirt_ops hooks that need no native action.
 */
static void native_nop(void)
{
	return;
}
40
41static void __init default_banner(void)
42{
43 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
44 paravirt_ops.name);
45}
46
47char *memory_setup(void)
48{
49 return paravirt_ops.memory_setup();
50}
51
52/* Simple instruction patching code. */
53#define DEF_NATIVE(name, code) \
54 extern const char start_##name[], end_##name[]; \
55 asm("start_" #name ": " code "; end_" #name ":")
56DEF_NATIVE(cli, "cli");
57DEF_NATIVE(sti, "sti");
58DEF_NATIVE(popf, "push %eax; popf");
59DEF_NATIVE(pushf, "pushf; pop %eax");
60DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli");
61DEF_NATIVE(iret, "iret");
62DEF_NATIVE(sti_sysexit, "sti; sysexit");
63
64static const struct native_insns
65{
66 const char *start, *end;
67} native_insns[] = {
68 [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
69 [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
70 [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
71 [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
72 [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
73 [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
74 [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
75};
76
77static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
78{
79 unsigned int insn_len;
80
81 /* Don't touch it if we don't have a replacement */
82 if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
83 return len;
84
85 insn_len = native_insns[type].end - native_insns[type].start;
86
87 /* Similarly if we can't fit replacement. */
88 if (len < insn_len)
89 return len;
90
91 memcpy(insns, native_insns[type].start, insn_len);
92 return insn_len;
93}
94
95static fastcall unsigned long native_get_debugreg(int regno)
96{
97 unsigned long val = 0; /* Damn you, gcc! */
98
99 switch (regno) {
100 case 0:
101 asm("movl %%db0, %0" :"=r" (val)); break;
102 case 1:
103 asm("movl %%db1, %0" :"=r" (val)); break;
104 case 2:
105 asm("movl %%db2, %0" :"=r" (val)); break;
106 case 3:
107 asm("movl %%db3, %0" :"=r" (val)); break;
108 case 6:
109 asm("movl %%db6, %0" :"=r" (val)); break;
110 case 7:
111 asm("movl %%db7, %0" :"=r" (val)); break;
112 default:
113 BUG();
114 }
115 return val;
116}
117
118static fastcall void native_set_debugreg(int regno, unsigned long value)
119{
120 switch (regno) {
121 case 0:
122 asm("movl %0,%%db0" : /* no output */ :"r" (value));
123 break;
124 case 1:
125 asm("movl %0,%%db1" : /* no output */ :"r" (value));
126 break;
127 case 2:
128 asm("movl %0,%%db2" : /* no output */ :"r" (value));
129 break;
130 case 3:
131 asm("movl %0,%%db3" : /* no output */ :"r" (value));
132 break;
133 case 6:
134 asm("movl %0,%%db6" : /* no output */ :"r" (value));
135 break;
136 case 7:
137 asm("movl %0,%%db7" : /* no output */ :"r" (value));
138 break;
139 default:
140 BUG();
141 }
142}
143
144void init_IRQ(void)
145{
146 paravirt_ops.init_IRQ();
147}
148
149static fastcall void native_clts(void)
150{
151 asm volatile ("clts");
152}
153
154static fastcall unsigned long native_read_cr0(void)
155{
156 unsigned long val;
157 asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
158 return val;
159}
160
161static fastcall void native_write_cr0(unsigned long val)
162{
163 asm volatile("movl %0,%%cr0": :"r" (val));
164}
165
166static fastcall unsigned long native_read_cr2(void)
167{
168 unsigned long val;
169 asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
170 return val;
171}
172
173static fastcall void native_write_cr2(unsigned long val)
174{
175 asm volatile("movl %0,%%cr2": :"r" (val));
176}
177
178static fastcall unsigned long native_read_cr3(void)
179{
180 unsigned long val;
181 asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
182 return val;
183}
184
185static fastcall void native_write_cr3(unsigned long val)
186{
187 asm volatile("movl %0,%%cr3": :"r" (val));
188}
189
190static fastcall unsigned long native_read_cr4(void)
191{
192 unsigned long val;
193 asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
194 return val;
195}
196
197static fastcall unsigned long native_read_cr4_safe(void)
198{
199 unsigned long val;
200 /* This could fault if %cr4 does not exist */
201 asm("1: movl %%cr4, %0 \n"
202 "2: \n"
203 ".section __ex_table,\"a\" \n"
204 ".long 1b,2b \n"
205 ".previous \n"
206 : "=r" (val): "0" (0));
207 return val;
208}
209
210static fastcall void native_write_cr4(unsigned long val)
211{
212 asm volatile("movl %0,%%cr4": :"r" (val));
213}
214
215static fastcall unsigned long native_save_fl(void)
216{
217 unsigned long f;
218 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
219 return f;
220}
221
222static fastcall void native_restore_fl(unsigned long f)
223{
224 asm volatile("pushl %0 ; popfl": /* no output */
225 :"g" (f)
226 :"memory", "cc");
227}
228
229static fastcall void native_irq_disable(void)
230{
231 asm volatile("cli": : :"memory");
232}
233
234static fastcall void native_irq_enable(void)
235{
236 asm volatile("sti": : :"memory");
237}
238
239static fastcall void native_safe_halt(void)
240{
241 asm volatile("sti; hlt": : :"memory");
242}
243
244static fastcall void native_halt(void)
245{
246 asm volatile("hlt": : :"memory");
247}
248
249static fastcall void native_wbinvd(void)
250{
251 asm volatile("wbinvd": : :"memory");
252}
253
254static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
255{
256 unsigned long long val;
257
258 asm volatile("2: rdmsr ; xorl %0,%0\n"
259 "1:\n\t"
260 ".section .fixup,\"ax\"\n\t"
261 "3: movl %3,%0 ; jmp 1b\n\t"
262 ".previous\n\t"
263 ".section __ex_table,\"a\"\n"
264 " .align 4\n\t"
265 " .long 2b,3b\n\t"
266 ".previous"
267 : "=r" (*err), "=A" (val)
268 : "c" (msr), "i" (-EFAULT));
269
270 return val;
271}
272
273static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
274{
275 int err;
276 asm volatile("2: wrmsr ; xorl %0,%0\n"
277 "1:\n\t"
278 ".section .fixup,\"ax\"\n\t"
279 "3: movl %4,%0 ; jmp 1b\n\t"
280 ".previous\n\t"
281 ".section __ex_table,\"a\"\n"
282 " .align 4\n\t"
283 " .long 2b,3b\n\t"
284 ".previous"
285 : "=a" (err)
286 : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
287 "i" (-EFAULT));
288 return err;
289}
290
291static fastcall unsigned long long native_read_tsc(void)
292{
293 unsigned long long val;
294 asm volatile("rdtsc" : "=A" (val));
295 return val;
296}
297
298static fastcall unsigned long long native_read_pmc(void)
299{
300 unsigned long long val;
301 asm volatile("rdpmc" : "=A" (val));
302 return val;
303}
304
305static fastcall void native_load_tr_desc(void)
306{
307 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
308}
309
310static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
311{
312 asm volatile("lgdt %0"::"m" (*dtr));
313}
314
315static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr)
316{
317 asm volatile("lidt %0"::"m" (*dtr));
318}
319
320static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
321{
322 asm ("sgdt %0":"=m" (*dtr));
323}
324
325static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
326{
327 asm ("sidt %0":"=m" (*dtr));
328}
329
330static fastcall unsigned long native_store_tr(void)
331{
332 unsigned long tr;
333 asm ("str %0":"=r" (tr));
334 return tr;
335}
336
337static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
338{
339#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
340 C(0); C(1); C(2);
341#undef C
342}
343
344static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
345{
346 u32 *lp = (u32 *)((char *)dt + entry*8);
347 lp[0] = entry_low;
348 lp[1] = entry_high;
349}
350
351static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
352{
353 native_write_dt_entry(dt, entrynum, low, high);
354}
355
356static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
357{
358 native_write_dt_entry(dt, entrynum, low, high);
359}
360
361static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
362{
363 native_write_dt_entry(dt, entrynum, low, high);
364}
365
366static fastcall void native_load_esp0(struct tss_struct *tss,
367 struct thread_struct *thread)
368{
369 tss->esp0 = thread->esp0;
370
371 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
372 if (unlikely(tss->ss1 != thread->sysenter_cs)) {
373 tss->ss1 = thread->sysenter_cs;
374 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
375 }
376}
377
378static fastcall void native_io_delay(void)
379{
380 asm volatile("outb %al,$0x80");
381}
382
383static fastcall void native_flush_tlb(void)
384{
385 __native_flush_tlb();
386}
387
388/*
389 * Global pages have to be flushed a bit differently. Not a real
390 * performance problem because this does not happen often.
391 */
392static fastcall void native_flush_tlb_global(void)
393{
394 __native_flush_tlb_global();
395}
396
397static fastcall void native_flush_tlb_single(u32 addr)
398{
399 __native_flush_tlb_single(addr);
400}
401
402#ifndef CONFIG_X86_PAE
403static fastcall void native_set_pte(pte_t *ptep, pte_t pteval)
404{
405 *ptep = pteval;
406}
407
408static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
409{
410 *ptep = pteval;
411}
412
413static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
414{
415 *pmdp = pmdval;
416}
417
418#else /* CONFIG_X86_PAE */
419
420static fastcall void native_set_pte(pte_t *ptep, pte_t pte)
421{
422 ptep->pte_high = pte.pte_high;
423 smp_wmb();
424 ptep->pte_low = pte.pte_low;
425}
426
427static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
428{
429 ptep->pte_high = pte.pte_high;
430 smp_wmb();
431 ptep->pte_low = pte.pte_low;
432}
433
434static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
435{
436 ptep->pte_low = 0;
437 smp_wmb();
438 ptep->pte_high = pte.pte_high;
439 smp_wmb();
440 ptep->pte_low = pte.pte_low;
441}
442
443static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
444{
445 set_64bit((unsigned long long *)ptep,pte_val(pteval));
446}
447
448static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
449{
450 set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
451}
452
453static fastcall void native_set_pud(pud_t *pudp, pud_t pudval)
454{
455 *pudp = pudval;
456}
457
458static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
459{
460 ptep->pte_low = 0;
461 smp_wmb();
462 ptep->pte_high = 0;
463}
464
465static fastcall void native_pmd_clear(pmd_t *pmd)
466{
467 u32 *tmp = (u32 *)pmd;
468 *tmp = 0;
469 smp_wmb();
470 *(tmp + 1) = 0;
471}
472#endif /* CONFIG_X86_PAE */
473
474/* These are in entry.S */
475extern fastcall void native_iret(void);
476extern fastcall void native_irq_enable_sysexit(void);
477
478static int __init print_banner(void)
479{
480 paravirt_ops.banner();
481 return 0;
482}
483core_initcall(print_banner);
484
485/* We simply declare start_kernel to be the paravirt probe of last resort. */
486paravirt_probe(start_kernel);
487
488struct paravirt_ops paravirt_ops = {
489 .name = "bare hardware",
490 .paravirt_enabled = 0,
491 .kernel_rpl = 0,
492
493 .patch = native_patch,
494 .banner = default_banner,
495 .arch_setup = native_nop,
496 .memory_setup = machine_specific_memory_setup,
497 .get_wallclock = native_get_wallclock,
498 .set_wallclock = native_set_wallclock,
499 .time_init = time_init_hook,
500 .init_IRQ = native_init_IRQ,
501
502 .cpuid = native_cpuid,
503 .get_debugreg = native_get_debugreg,
504 .set_debugreg = native_set_debugreg,
505 .clts = native_clts,
506 .read_cr0 = native_read_cr0,
507 .write_cr0 = native_write_cr0,
508 .read_cr2 = native_read_cr2,
509 .write_cr2 = native_write_cr2,
510 .read_cr3 = native_read_cr3,
511 .write_cr3 = native_write_cr3,
512 .read_cr4 = native_read_cr4,
513 .read_cr4_safe = native_read_cr4_safe,
514 .write_cr4 = native_write_cr4,
515 .save_fl = native_save_fl,
516 .restore_fl = native_restore_fl,
517 .irq_disable = native_irq_disable,
518 .irq_enable = native_irq_enable,
519 .safe_halt = native_safe_halt,
520 .halt = native_halt,
521 .wbinvd = native_wbinvd,
522 .read_msr = native_read_msr,
523 .write_msr = native_write_msr,
524 .read_tsc = native_read_tsc,
525 .read_pmc = native_read_pmc,
526 .load_tr_desc = native_load_tr_desc,
527 .set_ldt = native_set_ldt,
528 .load_gdt = native_load_gdt,
529 .load_idt = native_load_idt,
530 .store_gdt = native_store_gdt,
531 .store_idt = native_store_idt,
532 .store_tr = native_store_tr,
533 .load_tls = native_load_tls,
534 .write_ldt_entry = native_write_ldt_entry,
535 .write_gdt_entry = native_write_gdt_entry,
536 .write_idt_entry = native_write_idt_entry,
537 .load_esp0 = native_load_esp0,
538
539 .set_iopl_mask = native_set_iopl_mask,
540 .io_delay = native_io_delay,
541 .const_udelay = __const_udelay,
542
543#ifdef CONFIG_X86_LOCAL_APIC
544 .apic_write = native_apic_write,
545 .apic_write_atomic = native_apic_write_atomic,
546 .apic_read = native_apic_read,
547#endif
548
549 .flush_tlb_user = native_flush_tlb,
550 .flush_tlb_kernel = native_flush_tlb_global,
551 .flush_tlb_single = native_flush_tlb_single,
552
553 .set_pte = native_set_pte,
554 .set_pte_at = native_set_pte_at,
555 .set_pmd = native_set_pmd,
556 .pte_update = (void *)native_nop,
557 .pte_update_defer = (void *)native_nop,
558#ifdef CONFIG_X86_PAE
559 .set_pte_atomic = native_set_pte_atomic,
560 .set_pte_present = native_set_pte_present,
561 .set_pud = native_set_pud,
562 .pte_clear = native_pte_clear,
563 .pmd_clear = native_pmd_clear,
564#endif
565
566 .irq_enable_sysexit = native_irq_enable_sysexit,
567 .iret = native_iret,
568};
569EXPORT_SYMBOL(paravirt_ops);
diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c
index 25fe66853934..41af692c1584 100644
--- a/arch/i386/kernel/pci-dma.c
+++ b/arch/i386/kernel/pci-dma.c
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(dma_free_coherent);
75int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, 75int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
76 dma_addr_t device_addr, size_t size, int flags) 76 dma_addr_t device_addr, size_t size, int flags)
77{ 77{
78 void __iomem *mem_base; 78 void __iomem *mem_base = NULL;
79 int pages = size >> PAGE_SHIFT; 79 int pages = size >> PAGE_SHIFT;
80 int bitmap_size = (pages + 31)/32; 80 int bitmap_size = (pages + 31)/32;
81 81
@@ -92,14 +92,12 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
92 if (!mem_base) 92 if (!mem_base)
93 goto out; 93 goto out;
94 94
95 dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); 95 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
96 if (!dev->dma_mem) 96 if (!dev->dma_mem)
97 goto out; 97 goto out;
98 memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); 98 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
99 dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
100 if (!dev->dma_mem->bitmap) 99 if (!dev->dma_mem->bitmap)
101 goto free1_out; 100 goto free1_out;
102 memset(dev->dma_mem->bitmap, 0, bitmap_size);
103 101
104 dev->dma_mem->virt_base = mem_base; 102 dev->dma_mem->virt_base = mem_base;
105 dev->dma_mem->device_base = device_addr; 103 dev->dma_mem->device_base = device_addr;
@@ -114,6 +112,8 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
114 free1_out: 112 free1_out:
115 kfree(dev->dma_mem->bitmap); 113 kfree(dev->dma_mem->bitmap);
116 out: 114 out:
115 if (mem_base)
116 iounmap(mem_base);
117 return 0; 117 return 0;
118} 118}
119EXPORT_SYMBOL(dma_declare_coherent_memory); 119EXPORT_SYMBOL(dma_declare_coherent_memory);
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 57d375900afb..99308510a17c 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -56,6 +56,7 @@
56 56
57#include <asm/tlbflush.h> 57#include <asm/tlbflush.h>
58#include <asm/cpu.h> 58#include <asm/cpu.h>
59#include <asm/pda.h>
59 60
60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
61 62
@@ -99,22 +100,18 @@ EXPORT_SYMBOL(enable_hlt);
99 */ 100 */
100void default_idle(void) 101void default_idle(void)
101{ 102{
102 local_irq_enable();
103
104 if (!hlt_counter && boot_cpu_data.hlt_works_ok) { 103 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
105 current_thread_info()->status &= ~TS_POLLING; 104 current_thread_info()->status &= ~TS_POLLING;
106 smp_mb__after_clear_bit(); 105 smp_mb__after_clear_bit();
107 while (!need_resched()) { 106 local_irq_disable();
108 local_irq_disable(); 107 if (!need_resched())
109 if (!need_resched()) 108 safe_halt(); /* enables interrupts racelessly */
110 safe_halt(); 109 else
111 else 110 local_irq_enable();
112 local_irq_enable();
113 }
114 current_thread_info()->status |= TS_POLLING; 111 current_thread_info()->status |= TS_POLLING;
115 } else { 112 } else {
116 while (!need_resched()) 113 /* loop is done by the caller */
117 cpu_relax(); 114 cpu_relax();
118 } 115 }
119} 116}
120#ifdef CONFIG_APM_MODULE 117#ifdef CONFIG_APM_MODULE
@@ -128,14 +125,7 @@ EXPORT_SYMBOL(default_idle);
128 */ 125 */
129static void poll_idle (void) 126static void poll_idle (void)
130{ 127{
131 local_irq_enable(); 128 cpu_relax();
132
133 asm volatile(
134 "2:"
135 "testl %0, %1;"
136 "rep; nop;"
137 "je 2b;"
138 : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
139} 129}
140 130
141#ifdef CONFIG_HOTPLUG_CPU 131#ifdef CONFIG_HOTPLUG_CPU
@@ -205,7 +195,7 @@ void cpu_idle(void)
205void cpu_idle_wait(void) 195void cpu_idle_wait(void)
206{ 196{
207 unsigned int cpu, this_cpu = get_cpu(); 197 unsigned int cpu, this_cpu = get_cpu();
208 cpumask_t map; 198 cpumask_t map, tmp = current->cpus_allowed;
209 199
210 set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); 200 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
211 put_cpu(); 201 put_cpu();
@@ -227,6 +217,8 @@ void cpu_idle_wait(void)
227 } 217 }
228 cpus_and(map, map, cpu_online_map); 218 cpus_and(map, map, cpu_online_map);
229 } while (!cpus_empty(map)); 219 } while (!cpus_empty(map));
220
221 set_cpus_allowed(current, tmp);
230} 222}
231EXPORT_SYMBOL_GPL(cpu_idle_wait); 223EXPORT_SYMBOL_GPL(cpu_idle_wait);
232 224
@@ -254,8 +246,7 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
254static void mwait_idle(void) 246static void mwait_idle(void)
255{ 247{
256 local_irq_enable(); 248 local_irq_enable();
257 while (!need_resched()) 249 mwait_idle_with_hints(0, 0);
258 mwait_idle_with_hints(0, 0);
259} 250}
260 251
261void __devinit select_idle_routine(const struct cpuinfo_x86 *c) 252void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
@@ -312,8 +303,8 @@ void show_regs(struct pt_regs * regs)
312 regs->eax,regs->ebx,regs->ecx,regs->edx); 303 regs->eax,regs->ebx,regs->ecx,regs->edx);
313 printk("ESI: %08lx EDI: %08lx EBP: %08lx", 304 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
314 regs->esi, regs->edi, regs->ebp); 305 regs->esi, regs->edi, regs->ebp);
315 printk(" DS: %04x ES: %04x\n", 306 printk(" DS: %04x ES: %04x GS: %04x\n",
316 0xffff & regs->xds,0xffff & regs->xes); 307 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
317 308
318 cr0 = read_cr0(); 309 cr0 = read_cr0();
319 cr2 = read_cr2(); 310 cr2 = read_cr2();
@@ -336,7 +327,6 @@ extern void kernel_thread_helper(void);
336int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) 327int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
337{ 328{
338 struct pt_regs regs; 329 struct pt_regs regs;
339 int err;
340 330
341 memset(&regs, 0, sizeof(regs)); 331 memset(&regs, 0, sizeof(regs));
342 332
@@ -345,16 +335,14 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
345 335
346 regs.xds = __USER_DS; 336 regs.xds = __USER_DS;
347 regs.xes = __USER_DS; 337 regs.xes = __USER_DS;
338 regs.xgs = __KERNEL_PDA;
348 regs.orig_eax = -1; 339 regs.orig_eax = -1;
349 regs.eip = (unsigned long) kernel_thread_helper; 340 regs.eip = (unsigned long) kernel_thread_helper;
350 regs.xcs = __KERNEL_CS | get_kernel_rpl(); 341 regs.xcs = __KERNEL_CS | get_kernel_rpl();
351 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; 342 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
352 343
353 /* Ok, create the new process.. */ 344 /* Ok, create the new process.. */
354 err = do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); 345 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
355 if (err == 0) /* terminate kernel stack */
356 task_pt_regs(current)->eip = 0;
357 return err;
358} 346}
359EXPORT_SYMBOL(kernel_thread); 347EXPORT_SYMBOL(kernel_thread);
360 348
@@ -433,7 +421,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
433 p->thread.eip = (unsigned long) ret_from_fork; 421 p->thread.eip = (unsigned long) ret_from_fork;
434 422
435 savesegment(fs,p->thread.fs); 423 savesegment(fs,p->thread.fs);
436 savesegment(gs,p->thread.gs);
437 424
438 tsk = current; 425 tsk = current;
439 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 426 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -510,7 +497,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
510 dump->regs.ds = regs->xds; 497 dump->regs.ds = regs->xds;
511 dump->regs.es = regs->xes; 498 dump->regs.es = regs->xes;
512 savesegment(fs,dump->regs.fs); 499 savesegment(fs,dump->regs.fs);
513 savesegment(gs,dump->regs.gs); 500 dump->regs.gs = regs->xgs;
514 dump->regs.orig_eax = regs->orig_eax; 501 dump->regs.orig_eax = regs->orig_eax;
515 dump->regs.eip = regs->eip; 502 dump->regs.eip = regs->eip;
516 dump->regs.cs = regs->xcs; 503 dump->regs.cs = regs->xcs;
@@ -650,22 +637,27 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
650 637
651 __unlazy_fpu(prev_p); 638 __unlazy_fpu(prev_p);
652 639
640
641 /* we're going to use this soon, after a few expensive things */
642 if (next_p->fpu_counter > 5)
643 prefetch(&next->i387.fxsave);
644
653 /* 645 /*
654 * Reload esp0. 646 * Reload esp0.
655 */ 647 */
656 load_esp0(tss, next); 648 load_esp0(tss, next);
657 649
658 /* 650 /*
659 * Save away %fs and %gs. No need to save %es and %ds, as 651 * Save away %fs. No need to save %gs, as it was saved on the
660 * those are always kernel segments while inside the kernel. 652 * stack on entry. No need to save %es and %ds, as those are
661 * Doing this before setting the new TLS descriptors avoids 653 * always kernel segments while inside the kernel. Doing this
662 * the situation where we temporarily have non-reloadable 654 * before setting the new TLS descriptors avoids the situation
663 * segments in %fs and %gs. This could be an issue if the 655 * where we temporarily have non-reloadable segments in %fs
664 * NMI handler ever used %fs or %gs (it does not today), or 656 * and %gs. This could be an issue if the NMI handler ever
665 * if the kernel is running inside of a hypervisor layer. 657 * used %fs or %gs (it does not today), or if the kernel is
658 * running inside of a hypervisor layer.
666 */ 659 */
667 savesegment(fs, prev->fs); 660 savesegment(fs, prev->fs);
668 savesegment(gs, prev->gs);
669 661
670 /* 662 /*
671 * Load the per-thread Thread-Local Storage descriptor. 663 * Load the per-thread Thread-Local Storage descriptor.
@@ -673,22 +665,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
673 load_TLS(next, cpu); 665 load_TLS(next, cpu);
674 666
675 /* 667 /*
676 * Restore %fs and %gs if needed. 668 * Restore %fs if needed.
677 * 669 *
678 * Glibc normally makes %fs be zero, and %gs is one of 670 * Glibc normally makes %fs be zero.
679 * the TLS segments.
680 */ 671 */
681 if (unlikely(prev->fs | next->fs)) 672 if (unlikely(prev->fs | next->fs))
682 loadsegment(fs, next->fs); 673 loadsegment(fs, next->fs);
683 674
684 if (prev->gs | next->gs) 675 write_pda(pcurrent, next_p);
685 loadsegment(gs, next->gs);
686
687 /*
688 * Restore IOPL if needed.
689 */
690 if (unlikely(prev->iopl != next->iopl))
691 set_iopl_mask(next->iopl);
692 676
693 /* 677 /*
694 * Now maybe handle debug registers and/or IO bitmaps 678 * Now maybe handle debug registers and/or IO bitmaps
@@ -699,6 +683,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
699 683
700 disable_tsc(prev_p, next_p); 684 disable_tsc(prev_p, next_p);
701 685
686 /* If the task has used fpu the last 5 timeslices, just do a full
687 * restore of the math state immediately to avoid the trap; the
688 * chances of needing FPU soon are obviously high now
689 */
690 if (next_p->fpu_counter > 5)
691 math_state_restore();
692
702 return prev_p; 693 return prev_p;
703} 694}
704 695
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 775f50e9395b..f3f94ac5736a 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -94,13 +94,9 @@ static int putreg(struct task_struct *child,
94 return -EIO; 94 return -EIO;
95 child->thread.fs = value; 95 child->thread.fs = value;
96 return 0; 96 return 0;
97 case GS:
98 if (value && (value & 3) != 3)
99 return -EIO;
100 child->thread.gs = value;
101 return 0;
102 case DS: 97 case DS:
103 case ES: 98 case ES:
99 case GS:
104 if (value && (value & 3) != 3) 100 if (value && (value & 3) != 3)
105 return -EIO; 101 return -EIO;
106 value &= 0xffff; 102 value &= 0xffff;
@@ -116,8 +112,8 @@ static int putreg(struct task_struct *child,
116 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; 112 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
117 break; 113 break;
118 } 114 }
119 if (regno > GS*4) 115 if (regno > ES*4)
120 regno -= 2*4; 116 regno -= 1*4;
121 put_stack_long(child, regno - sizeof(struct pt_regs), value); 117 put_stack_long(child, regno - sizeof(struct pt_regs), value);
122 return 0; 118 return 0;
123} 119}
@@ -131,18 +127,16 @@ static unsigned long getreg(struct task_struct *child,
131 case FS: 127 case FS:
132 retval = child->thread.fs; 128 retval = child->thread.fs;
133 break; 129 break;
134 case GS:
135 retval = child->thread.gs;
136 break;
137 case DS: 130 case DS:
138 case ES: 131 case ES:
132 case GS:
139 case SS: 133 case SS:
140 case CS: 134 case CS:
141 retval = 0xffff; 135 retval = 0xffff;
142 /* fall through */ 136 /* fall through */
143 default: 137 default:
144 if (regno > GS*4) 138 if (regno > ES*4)
145 regno -= 2*4; 139 regno -= 1*4;
146 regno = regno - sizeof(struct pt_regs); 140 regno = regno - sizeof(struct pt_regs);
147 retval &= get_stack_long(child, regno); 141 retval &= get_stack_long(child, regno);
148 } 142 }
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index 9f6ab1789bb0..34874c398b44 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -3,10 +3,12 @@
3 */ 3 */
4#include <linux/pci.h> 4#include <linux/pci.h>
5#include <linux/irq.h> 5#include <linux/irq.h>
6#include <asm/pci-direct.h>
7#include <asm/genapic.h>
8#include <asm/cpu.h>
6 9
7#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) 10#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
8 11static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
9static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
10{ 12{
11 u8 config, rev; 13 u8 config, rev;
12 u32 word; 14 u32 word;
@@ -14,14 +16,12 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
14 /* BIOS may enable hardware IRQ balancing for 16 /* BIOS may enable hardware IRQ balancing for
15 * E7520/E7320/E7525(revision ID 0x9 and below) 17 * E7520/E7320/E7525(revision ID 0x9 and below)
16 * based platforms. 18 * based platforms.
17 * Disable SW irqbalance/affinity on those platforms. 19 * For those platforms, make sure that the genapic is set to 'flat'
18 */ 20 */
19 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 21 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
20 if (rev > 0x9) 22 if (rev > 0x9)
21 return; 23 return;
22 24
23 printk(KERN_INFO "Intel E7520/7320/7525 detected.");
24
25 /* enable access to config space*/ 25 /* enable access to config space*/
26 pci_read_config_byte(dev, 0xf4, &config); 26 pci_read_config_byte(dev, 0xf4, &config);
27 pci_write_config_byte(dev, 0xf4, config|0x2); 27 pci_write_config_byte(dev, 0xf4, config|0x2);
@@ -30,6 +30,44 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); 30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
31 31
32 if (!(word & (1 << 13))) { 32 if (!(word & (1 << 13))) {
33#ifdef CONFIG_X86_64
34 if (genapic != &apic_flat)
35 panic("APIC mode must be flat on this system\n");
36#elif defined(CONFIG_X86_GENERICARCH)
37 if (genapic != &apic_default)
38 panic("APIC mode must be default(flat) on this system. Use apic=default\n");
39#endif
40 }
41
42 /* put back the original value for config space*/
43 if (!(config & 0x2))
44 pci_write_config_byte(dev, 0xf4, config);
45}
46
47void __init quirk_intel_irqbalance(void)
48{
49 u8 config, rev;
50 u32 word;
51
52 /* BIOS may enable hardware IRQ balancing for
53 * E7520/E7320/E7525(revision ID 0x9 and below)
54 * based platforms.
55 * Disable SW irqbalance/affinity on those platforms.
56 */
57 rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
58 if (rev > 0x9)
59 return;
60
61 printk(KERN_INFO "Intel E7520/7320/7525 detected.");
62
63 /* enable access to config space */
64 config = read_pci_config_byte(0, 0, 0, 0xf4);
65 write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
66
67 /* read xTPR register */
68 word = read_pci_config_16(0, 0, 0x40, 0x4c);
69
70 if (!(word & (1 << 13))) {
33 printk(KERN_INFO "Disabling irq balancing and affinity\n"); 71 printk(KERN_INFO "Disabling irq balancing and affinity\n");
34#ifdef CONFIG_IRQBALANCE 72#ifdef CONFIG_IRQBALANCE
35 irqbalance_disable(""); 73 irqbalance_disable("");
@@ -38,13 +76,24 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
38#ifdef CONFIG_PROC_FS 76#ifdef CONFIG_PROC_FS
39 no_irq_affinity = 1; 77 no_irq_affinity = 1;
40#endif 78#endif
79#ifdef CONFIG_HOTPLUG_CPU
80 printk(KERN_INFO "Disabling cpu hotplug control\n");
81 enable_cpu_hotplug = 0;
82#endif
83#ifdef CONFIG_X86_64
84 /* force the genapic selection to flat mode so that
85 * interrupts can be redirected to more than one CPU.
86 */
87 genapic_force = &apic_flat;
88#endif
41 } 89 }
42 90
43 /* put back the original value for config space*/ 91 /* put back the original value for config space */
44 if (!(config & 0x2)) 92 if (!(config & 0x2))
45 pci_write_config_byte(dev, 0xf4, config); 93 write_pci_config_byte(0, 0, 0, 0xf4, config);
46} 94}
47DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); 95DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
48DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); 96DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
49DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); 97DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
98
50#endif 99#endif
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 84278e0093a2..3514b4153f7f 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -12,6 +12,7 @@
12#include <linux/dmi.h> 12#include <linux/dmi.h>
13#include <linux/ctype.h> 13#include <linux/ctype.h>
14#include <linux/pm.h> 14#include <linux/pm.h>
15#include <linux/reboot.h>
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16#include <asm/apic.h> 17#include <asm/apic.h>
17#include <asm/desc.h> 18#include <asm/desc.h>
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 519e63c3c130..79df6e612dbd 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -63,9 +63,6 @@
63#include <setup_arch.h> 63#include <setup_arch.h>
64#include <bios_ebda.h> 64#include <bios_ebda.h>
65 65
66/* Forward Declaration. */
67void __init find_max_pfn(void);
68
69/* This value is set up by the early boot code to point to the value 66/* This value is set up by the early boot code to point to the value
70 immediately after the boot time page tables. It contains a *physical* 67 immediately after the boot time page tables. It contains a *physical*
71 address, and must not be in the .bss segment! */ 68 address, and must not be in the .bss segment! */
@@ -76,11 +73,8 @@ int disable_pse __devinitdata = 0;
76/* 73/*
77 * Machine setup.. 74 * Machine setup..
78 */ 75 */
79 76extern struct resource code_resource;
80#ifdef CONFIG_EFI 77extern struct resource data_resource;
81int efi_enabled = 0;
82EXPORT_SYMBOL(efi_enabled);
83#endif
84 78
85/* cpu data as detected by the assembly code in head.S */ 79/* cpu data as detected by the assembly code in head.S */
86struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; 80struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -99,12 +93,6 @@ unsigned int machine_submodel_id;
99unsigned int BIOS_revision; 93unsigned int BIOS_revision;
100unsigned int mca_pentium_flag; 94unsigned int mca_pentium_flag;
101 95
102/* For PCI or other memory-mapped resources */
103unsigned long pci_mem_start = 0x10000000;
104#ifdef CONFIG_PCI
105EXPORT_SYMBOL(pci_mem_start);
106#endif
107
108/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 96/* Boot loader ID as an integer, for the benefit of proc_dointvec */
109int bootloader_type; 97int bootloader_type;
110 98
@@ -134,7 +122,6 @@ struct ist_info ist_info;
134 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) 122 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
135EXPORT_SYMBOL(ist_info); 123EXPORT_SYMBOL(ist_info);
136#endif 124#endif
137struct e820map e820;
138 125
139extern void early_cpu_init(void); 126extern void early_cpu_init(void);
140extern int root_mountflags; 127extern int root_mountflags;
@@ -149,516 +136,6 @@ static char command_line[COMMAND_LINE_SIZE];
149 136
150unsigned char __initdata boot_params[PARAM_SIZE]; 137unsigned char __initdata boot_params[PARAM_SIZE];
151 138
152static struct resource data_resource = {
153 .name = "Kernel data",
154 .start = 0,
155 .end = 0,
156 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
157};
158
159static struct resource code_resource = {
160 .name = "Kernel code",
161 .start = 0,
162 .end = 0,
163 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
164};
165
166static struct resource system_rom_resource = {
167 .name = "System ROM",
168 .start = 0xf0000,
169 .end = 0xfffff,
170 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
171};
172
173static struct resource extension_rom_resource = {
174 .name = "Extension ROM",
175 .start = 0xe0000,
176 .end = 0xeffff,
177 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
178};
179
180static struct resource adapter_rom_resources[] = { {
181 .name = "Adapter ROM",
182 .start = 0xc8000,
183 .end = 0,
184 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
185}, {
186 .name = "Adapter ROM",
187 .start = 0,
188 .end = 0,
189 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
190}, {
191 .name = "Adapter ROM",
192 .start = 0,
193 .end = 0,
194 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
195}, {
196 .name = "Adapter ROM",
197 .start = 0,
198 .end = 0,
199 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
200}, {
201 .name = "Adapter ROM",
202 .start = 0,
203 .end = 0,
204 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
205}, {
206 .name = "Adapter ROM",
207 .start = 0,
208 .end = 0,
209 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
210} };
211
212static struct resource video_rom_resource = {
213 .name = "Video ROM",
214 .start = 0xc0000,
215 .end = 0xc7fff,
216 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
217};
218
219static struct resource video_ram_resource = {
220 .name = "Video RAM area",
221 .start = 0xa0000,
222 .end = 0xbffff,
223 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
224};
225
226static struct resource standard_io_resources[] = { {
227 .name = "dma1",
228 .start = 0x0000,
229 .end = 0x001f,
230 .flags = IORESOURCE_BUSY | IORESOURCE_IO
231}, {
232 .name = "pic1",
233 .start = 0x0020,
234 .end = 0x0021,
235 .flags = IORESOURCE_BUSY | IORESOURCE_IO
236}, {
237 .name = "timer0",
238 .start = 0x0040,
239 .end = 0x0043,
240 .flags = IORESOURCE_BUSY | IORESOURCE_IO
241}, {
242 .name = "timer1",
243 .start = 0x0050,
244 .end = 0x0053,
245 .flags = IORESOURCE_BUSY | IORESOURCE_IO
246}, {
247 .name = "keyboard",
248 .start = 0x0060,
249 .end = 0x006f,
250 .flags = IORESOURCE_BUSY | IORESOURCE_IO
251}, {
252 .name = "dma page reg",
253 .start = 0x0080,
254 .end = 0x008f,
255 .flags = IORESOURCE_BUSY | IORESOURCE_IO
256}, {
257 .name = "pic2",
258 .start = 0x00a0,
259 .end = 0x00a1,
260 .flags = IORESOURCE_BUSY | IORESOURCE_IO
261}, {
262 .name = "dma2",
263 .start = 0x00c0,
264 .end = 0x00df,
265 .flags = IORESOURCE_BUSY | IORESOURCE_IO
266}, {
267 .name = "fpu",
268 .start = 0x00f0,
269 .end = 0x00ff,
270 .flags = IORESOURCE_BUSY | IORESOURCE_IO
271} };
272
273#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
274
275static int __init romchecksum(unsigned char *rom, unsigned long length)
276{
277 unsigned char *p, sum = 0;
278
279 for (p = rom; p < rom + length; p++)
280 sum += *p;
281 return sum == 0;
282}
283
284static void __init probe_roms(void)
285{
286 unsigned long start, length, upper;
287 unsigned char *rom;
288 int i;
289
290 /* video rom */
291 upper = adapter_rom_resources[0].start;
292 for (start = video_rom_resource.start; start < upper; start += 2048) {
293 rom = isa_bus_to_virt(start);
294 if (!romsignature(rom))
295 continue;
296
297 video_rom_resource.start = start;
298
299 /* 0 < length <= 0x7f * 512, historically */
300 length = rom[2] * 512;
301
302 /* if checksum okay, trust length byte */
303 if (length && romchecksum(rom, length))
304 video_rom_resource.end = start + length - 1;
305
306 request_resource(&iomem_resource, &video_rom_resource);
307 break;
308 }
309
310 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
311 if (start < upper)
312 start = upper;
313
314 /* system rom */
315 request_resource(&iomem_resource, &system_rom_resource);
316 upper = system_rom_resource.start;
317
318 /* check for extension rom (ignore length byte!) */
319 rom = isa_bus_to_virt(extension_rom_resource.start);
320 if (romsignature(rom)) {
321 length = extension_rom_resource.end - extension_rom_resource.start + 1;
322 if (romchecksum(rom, length)) {
323 request_resource(&iomem_resource, &extension_rom_resource);
324 upper = extension_rom_resource.start;
325 }
326 }
327
328 /* check for adapter roms on 2k boundaries */
329 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
330 rom = isa_bus_to_virt(start);
331 if (!romsignature(rom))
332 continue;
333
334 /* 0 < length <= 0x7f * 512, historically */
335 length = rom[2] * 512;
336
337 /* but accept any length that fits if checksum okay */
338 if (!length || start + length > upper || !romchecksum(rom, length))
339 continue;
340
341 adapter_rom_resources[i].start = start;
342 adapter_rom_resources[i].end = start + length - 1;
343 request_resource(&iomem_resource, &adapter_rom_resources[i]);
344
345 start = adapter_rom_resources[i++].end & ~2047UL;
346 }
347}
348
349static void __init limit_regions(unsigned long long size)
350{
351 unsigned long long current_addr = 0;
352 int i;
353
354 if (efi_enabled) {
355 efi_memory_desc_t *md;
356 void *p;
357
358 for (p = memmap.map, i = 0; p < memmap.map_end;
359 p += memmap.desc_size, i++) {
360 md = p;
361 current_addr = md->phys_addr + (md->num_pages << 12);
362 if (md->type == EFI_CONVENTIONAL_MEMORY) {
363 if (current_addr >= size) {
364 md->num_pages -=
365 (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
366 memmap.nr_map = i + 1;
367 return;
368 }
369 }
370 }
371 }
372 for (i = 0; i < e820.nr_map; i++) {
373 current_addr = e820.map[i].addr + e820.map[i].size;
374 if (current_addr < size)
375 continue;
376
377 if (e820.map[i].type != E820_RAM)
378 continue;
379
380 if (e820.map[i].addr >= size) {
381 /*
382 * This region starts past the end of the
383 * requested size, skip it completely.
384 */
385 e820.nr_map = i;
386 } else {
387 e820.nr_map = i + 1;
388 e820.map[i].size -= current_addr - size;
389 }
390 return;
391 }
392}
393
394void __init add_memory_region(unsigned long long start,
395 unsigned long long size, int type)
396{
397 int x;
398
399 if (!efi_enabled) {
400 x = e820.nr_map;
401
402 if (x == E820MAX) {
403 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
404 return;
405 }
406
407 e820.map[x].addr = start;
408 e820.map[x].size = size;
409 e820.map[x].type = type;
410 e820.nr_map++;
411 }
412} /* add_memory_region */
413
414#define E820_DEBUG 1
415
416static void __init print_memory_map(char *who)
417{
418 int i;
419
420 for (i = 0; i < e820.nr_map; i++) {
421 printk(" %s: %016Lx - %016Lx ", who,
422 e820.map[i].addr,
423 e820.map[i].addr + e820.map[i].size);
424 switch (e820.map[i].type) {
425 case E820_RAM: printk("(usable)\n");
426 break;
427 case E820_RESERVED:
428 printk("(reserved)\n");
429 break;
430 case E820_ACPI:
431 printk("(ACPI data)\n");
432 break;
433 case E820_NVS:
434 printk("(ACPI NVS)\n");
435 break;
436 default: printk("type %lu\n", e820.map[i].type);
437 break;
438 }
439 }
440}
441
442/*
443 * Sanitize the BIOS e820 map.
444 *
445 * Some e820 responses include overlapping entries. The following
446 * replaces the original e820 map with a new one, removing overlaps.
447 *
448 */
449struct change_member {
450 struct e820entry *pbios; /* pointer to original bios entry */
451 unsigned long long addr; /* address for this change point */
452};
453static struct change_member change_point_list[2*E820MAX] __initdata;
454static struct change_member *change_point[2*E820MAX] __initdata;
455static struct e820entry *overlap_list[E820MAX] __initdata;
456static struct e820entry new_bios[E820MAX] __initdata;
457
458int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
459{
460 struct change_member *change_tmp;
461 unsigned long current_type, last_type;
462 unsigned long long last_addr;
463 int chgidx, still_changing;
464 int overlap_entries;
465 int new_bios_entry;
466 int old_nr, new_nr, chg_nr;
467 int i;
468
469 /*
470 Visually we're performing the following (1,2,3,4 = memory types)...
471
472 Sample memory map (w/overlaps):
473 ____22__________________
474 ______________________4_
475 ____1111________________
476 _44_____________________
477 11111111________________
478 ____________________33__
479 ___________44___________
480 __________33333_________
481 ______________22________
482 ___________________2222_
483 _________111111111______
484 _____________________11_
485 _________________4______
486
487 Sanitized equivalent (no overlap):
488 1_______________________
489 _44_____________________
490 ___1____________________
491 ____22__________________
492 ______11________________
493 _________1______________
494 __________3_____________
495 ___________44___________
496 _____________33_________
497 _______________2________
498 ________________1_______
499 _________________4______
500 ___________________2____
501 ____________________33__
502 ______________________4_
503 */
504
505 /* if there's only one memory region, don't bother */
506 if (*pnr_map < 2)
507 return -1;
508
509 old_nr = *pnr_map;
510
511 /* bail out if we find any unreasonable addresses in bios map */
512 for (i=0; i<old_nr; i++)
513 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
514 return -1;
515
516 /* create pointers for initial change-point information (for sorting) */
517 for (i=0; i < 2*old_nr; i++)
518 change_point[i] = &change_point_list[i];
519
520 /* record all known change-points (starting and ending addresses),
521 omitting those that are for empty memory regions */
522 chgidx = 0;
523 for (i=0; i < old_nr; i++) {
524 if (biosmap[i].size != 0) {
525 change_point[chgidx]->addr = biosmap[i].addr;
526 change_point[chgidx++]->pbios = &biosmap[i];
527 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
528 change_point[chgidx++]->pbios = &biosmap[i];
529 }
530 }
531 chg_nr = chgidx; /* true number of change-points */
532
533 /* sort change-point list by memory addresses (low -> high) */
534 still_changing = 1;
535 while (still_changing) {
536 still_changing = 0;
537 for (i=1; i < chg_nr; i++) {
538 /* if <current_addr> > <last_addr>, swap */
539 /* or, if current=<start_addr> & last=<end_addr>, swap */
540 if ((change_point[i]->addr < change_point[i-1]->addr) ||
541 ((change_point[i]->addr == change_point[i-1]->addr) &&
542 (change_point[i]->addr == change_point[i]->pbios->addr) &&
543 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
544 )
545 {
546 change_tmp = change_point[i];
547 change_point[i] = change_point[i-1];
548 change_point[i-1] = change_tmp;
549 still_changing=1;
550 }
551 }
552 }
553
554 /* create a new bios memory map, removing overlaps */
555 overlap_entries=0; /* number of entries in the overlap table */
556 new_bios_entry=0; /* index for creating new bios map entries */
557 last_type = 0; /* start with undefined memory type */
558 last_addr = 0; /* start with 0 as last starting address */
559 /* loop through change-points, determining affect on the new bios map */
560 for (chgidx=0; chgidx < chg_nr; chgidx++)
561 {
562 /* keep track of all overlapping bios entries */
563 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
564 {
565 /* add map entry to overlap list (> 1 entry implies an overlap) */
566 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
567 }
568 else
569 {
570 /* remove entry from list (order independent, so swap with last) */
571 for (i=0; i<overlap_entries; i++)
572 {
573 if (overlap_list[i] == change_point[chgidx]->pbios)
574 overlap_list[i] = overlap_list[overlap_entries-1];
575 }
576 overlap_entries--;
577 }
578 /* if there are overlapping entries, decide which "type" to use */
579 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
580 current_type = 0;
581 for (i=0; i<overlap_entries; i++)
582 if (overlap_list[i]->type > current_type)
583 current_type = overlap_list[i]->type;
584 /* continue building up new bios map based on this information */
585 if (current_type != last_type) {
586 if (last_type != 0) {
587 new_bios[new_bios_entry].size =
588 change_point[chgidx]->addr - last_addr;
589 /* move forward only if the new size was non-zero */
590 if (new_bios[new_bios_entry].size != 0)
591 if (++new_bios_entry >= E820MAX)
592 break; /* no more space left for new bios entries */
593 }
594 if (current_type != 0) {
595 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
596 new_bios[new_bios_entry].type = current_type;
597 last_addr=change_point[chgidx]->addr;
598 }
599 last_type = current_type;
600 }
601 }
602 new_nr = new_bios_entry; /* retain count for new bios entries */
603
604 /* copy new bios mapping into original location */
605 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
606 *pnr_map = new_nr;
607
608 return 0;
609}
610
611/*
612 * Copy the BIOS e820 map into a safe place.
613 *
614 * Sanity-check it while we're at it..
615 *
616 * If we're lucky and live on a modern system, the setup code
617 * will have given us a memory map that we can use to properly
618 * set up memory. If we aren't, we'll fake a memory map.
619 *
620 * We check to see that the memory map contains at least 2 elements
621 * before we'll use it, because the detection code in setup.S may
622 * not be perfect and most every PC known to man has two memory
623 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
624 * thinkpad 560x, for example, does not cooperate with the memory
625 * detection code.)
626 */
627int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
628{
629 /* Only one memory region (or negative)? Ignore it */
630 if (nr_map < 2)
631 return -1;
632
633 do {
634 unsigned long long start = biosmap->addr;
635 unsigned long long size = biosmap->size;
636 unsigned long long end = start + size;
637 unsigned long type = biosmap->type;
638
639 /* Overflow in 64 bits? Ignore the memory map. */
640 if (start > end)
641 return -1;
642
643 /*
644 * Some BIOSes claim RAM in the 640k - 1M region.
645 * Not right. Fix it up.
646 */
647 if (type == E820_RAM) {
648 if (start < 0x100000ULL && end > 0xA0000ULL) {
649 if (start < 0xA0000ULL)
650 add_memory_region(start, 0xA0000ULL-start, type);
651 if (end <= 0x100000ULL)
652 continue;
653 start = 0x100000ULL;
654 size = end - start;
655 }
656 }
657 add_memory_region(start, size, type);
658 } while (biosmap++,--nr_map);
659 return 0;
660}
661
662#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 139#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
663struct edd edd; 140struct edd edd;
664#ifdef CONFIG_EDD_MODULE 141#ifdef CONFIG_EDD_MODULE
@@ -682,7 +159,7 @@ static inline void copy_edd(void)
682} 159}
683#endif 160#endif
684 161
685static int __initdata user_defined_memmap = 0; 162int __initdata user_defined_memmap = 0;
686 163
687/* 164/*
688 * "mem=nopentium" disables the 4MB page tables. 165 * "mem=nopentium" disables the 4MB page tables.
@@ -719,51 +196,6 @@ static int __init parse_mem(char *arg)
719} 196}
720early_param("mem", parse_mem); 197early_param("mem", parse_mem);
721 198
722static int __init parse_memmap(char *arg)
723{
724 if (!arg)
725 return -EINVAL;
726
727 if (strcmp(arg, "exactmap") == 0) {
728#ifdef CONFIG_CRASH_DUMP
729 /* If we are doing a crash dump, we
730 * still need to know the real mem
731 * size before original memory map is
732 * reset.
733 */
734 find_max_pfn();
735 saved_max_pfn = max_pfn;
736#endif
737 e820.nr_map = 0;
738 user_defined_memmap = 1;
739 } else {
740 /* If the user specifies memory size, we
741 * limit the BIOS-provided memory map to
742 * that size. exactmap can be used to specify
743 * the exact map. mem=number can be used to
744 * trim the existing memory map.
745 */
746 unsigned long long start_at, mem_size;
747
748 mem_size = memparse(arg, &arg);
749 if (*arg == '@') {
750 start_at = memparse(arg+1, &arg);
751 add_memory_region(start_at, mem_size, E820_RAM);
752 } else if (*arg == '#') {
753 start_at = memparse(arg+1, &arg);
754 add_memory_region(start_at, mem_size, E820_ACPI);
755 } else if (*arg == '$') {
756 start_at = memparse(arg+1, &arg);
757 add_memory_region(start_at, mem_size, E820_RESERVED);
758 } else {
759 limit_regions(mem_size);
760 user_defined_memmap = 1;
761 }
762 }
763 return 0;
764}
765early_param("memmap", parse_memmap);
766
767#ifdef CONFIG_PROC_VMCORE 199#ifdef CONFIG_PROC_VMCORE
768/* elfcorehdr= specifies the location of elf core header 200/* elfcorehdr= specifies the location of elf core header
769 * stored by the crashed kernel. 201 * stored by the crashed kernel.
@@ -828,90 +260,6 @@ static int __init parse_reservetop(char *arg)
828early_param("reservetop", parse_reservetop); 260early_param("reservetop", parse_reservetop);
829 261
830/* 262/*
831 * Callback for efi_memory_walk.
832 */
833static int __init
834efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
835{
836 unsigned long *max_pfn = arg, pfn;
837
838 if (start < end) {
839 pfn = PFN_UP(end -1);
840 if (pfn > *max_pfn)
841 *max_pfn = pfn;
842 }
843 return 0;
844}
845
846static int __init
847efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
848{
849 memory_present(0, start, end);
850 return 0;
851}
852
853 /*
854 * This function checks if the entire range <start,end> is mapped with type.
855 *
856 * Note: this function only works correct if the e820 table is sorted and
857 * not-overlapping, which is the case
858 */
859int __init
860e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
861{
862 u64 start = s;
863 u64 end = e;
864 int i;
865 for (i = 0; i < e820.nr_map; i++) {
866 struct e820entry *ei = &e820.map[i];
867 if (type && ei->type != type)
868 continue;
869 /* is the region (part) in overlap with the current region ?*/
870 if (ei->addr >= end || ei->addr + ei->size <= start)
871 continue;
872 /* if the region is at the beginning of <start,end> we move
873 * start to the end of the region since it's ok until there
874 */
875 if (ei->addr <= start)
876 start = ei->addr + ei->size;
877 /* if start is now at or beyond end, we're done, full
878 * coverage */
879 if (start >= end)
880 return 1; /* we're done */
881 }
882 return 0;
883}
884
885/*
886 * Find the highest page frame number we have available
887 */
888void __init find_max_pfn(void)
889{
890 int i;
891
892 max_pfn = 0;
893 if (efi_enabled) {
894 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
895 efi_memmap_walk(efi_memory_present_wrapper, NULL);
896 return;
897 }
898
899 for (i = 0; i < e820.nr_map; i++) {
900 unsigned long start, end;
901 /* RAM? */
902 if (e820.map[i].type != E820_RAM)
903 continue;
904 start = PFN_UP(e820.map[i].addr);
905 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
906 if (start >= end)
907 continue;
908 if (end > max_pfn)
909 max_pfn = end;
910 memory_present(0, start, end);
911 }
912}
913
914/*
915 * Determine low and high memory ranges: 263 * Determine low and high memory ranges:
916 */ 264 */
917unsigned long __init find_max_low_pfn(void) 265unsigned long __init find_max_low_pfn(void)
@@ -971,68 +319,6 @@ unsigned long __init find_max_low_pfn(void)
971} 319}
972 320
973/* 321/*
974 * Free all available memory for boot time allocation. Used
975 * as a callback function by efi_memory_walk()
976 */
977
978static int __init
979free_available_memory(unsigned long start, unsigned long end, void *arg)
980{
981 /* check max_low_pfn */
982 if (start >= (max_low_pfn << PAGE_SHIFT))
983 return 0;
984 if (end >= (max_low_pfn << PAGE_SHIFT))
985 end = max_low_pfn << PAGE_SHIFT;
986 if (start < end)
987 free_bootmem(start, end - start);
988
989 return 0;
990}
991/*
992 * Register fully available low RAM pages with the bootmem allocator.
993 */
994static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
995{
996 int i;
997
998 if (efi_enabled) {
999 efi_memmap_walk(free_available_memory, NULL);
1000 return;
1001 }
1002 for (i = 0; i < e820.nr_map; i++) {
1003 unsigned long curr_pfn, last_pfn, size;
1004 /*
1005 * Reserve usable low memory
1006 */
1007 if (e820.map[i].type != E820_RAM)
1008 continue;
1009 /*
1010 * We are rounding up the start address of usable memory:
1011 */
1012 curr_pfn = PFN_UP(e820.map[i].addr);
1013 if (curr_pfn >= max_low_pfn)
1014 continue;
1015 /*
1016 * ... and at the end of the usable range downwards:
1017 */
1018 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
1019
1020 if (last_pfn > max_low_pfn)
1021 last_pfn = max_low_pfn;
1022
1023 /*
1024 * .. finally, did all the rounding and playing
1025 * around just make the area go away?
1026 */
1027 if (last_pfn <= curr_pfn)
1028 continue;
1029
1030 size = last_pfn - curr_pfn;
1031 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
1032 }
1033}
1034
1035/*
1036 * workaround for Dell systems that neglect to reserve EBDA 322 * workaround for Dell systems that neglect to reserve EBDA
1037 */ 323 */
1038static void __init reserve_ebda_region(void) 324static void __init reserve_ebda_region(void)
@@ -1118,8 +404,8 @@ void __init setup_bootmem_allocator(void)
1118 * the (very unlikely) case of us accidentally initializing the 404 * the (very unlikely) case of us accidentally initializing the
1119 * bootmem allocator with an invalid RAM area. 405 * bootmem allocator with an invalid RAM area.
1120 */ 406 */
1121 reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) + 407 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
1122 bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START)); 408 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
1123 409
1124 /* 410 /*
1125 * reserve physical page 0 - it's a special BIOS page on many boxes, 411 * reserve physical page 0 - it's a special BIOS page on many boxes,
@@ -1162,8 +448,7 @@ void __init setup_bootmem_allocator(void)
1162 if (LOADER_TYPE && INITRD_START) { 448 if (LOADER_TYPE && INITRD_START) {
1163 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { 449 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
1164 reserve_bootmem(INITRD_START, INITRD_SIZE); 450 reserve_bootmem(INITRD_START, INITRD_SIZE);
1165 initrd_start = 451 initrd_start = INITRD_START + PAGE_OFFSET;
1166 INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
1167 initrd_end = initrd_start+INITRD_SIZE; 452 initrd_end = initrd_start+INITRD_SIZE;
1168 } 453 }
1169 else { 454 else {
@@ -1200,126 +485,6 @@ void __init remapped_pgdat_init(void)
1200 } 485 }
1201} 486}
1202 487
1203/*
1204 * Request address space for all standard RAM and ROM resources
1205 * and also for regions reported as reserved by the e820.
1206 */
1207static void __init
1208legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
1209{
1210 int i;
1211
1212 probe_roms();
1213 for (i = 0; i < e820.nr_map; i++) {
1214 struct resource *res;
1215#ifndef CONFIG_RESOURCES_64BIT
1216 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
1217 continue;
1218#endif
1219 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
1220 switch (e820.map[i].type) {
1221 case E820_RAM: res->name = "System RAM"; break;
1222 case E820_ACPI: res->name = "ACPI Tables"; break;
1223 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
1224 default: res->name = "reserved";
1225 }
1226 res->start = e820.map[i].addr;
1227 res->end = res->start + e820.map[i].size - 1;
1228 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1229 if (request_resource(&iomem_resource, res)) {
1230 kfree(res);
1231 continue;
1232 }
1233 if (e820.map[i].type == E820_RAM) {
1234 /*
1235 * We don't know which RAM region contains kernel data,
1236 * so we try it repeatedly and let the resource manager
1237 * test it.
1238 */
1239 request_resource(res, code_resource);
1240 request_resource(res, data_resource);
1241#ifdef CONFIG_KEXEC
1242 request_resource(res, &crashk_res);
1243#endif
1244 }
1245 }
1246}
1247
1248/*
1249 * Request address space for all standard resources
1250 *
1251 * This is called just before pcibios_init(), which is also a
1252 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
1253 */
1254static int __init request_standard_resources(void)
1255{
1256 int i;
1257
1258 printk("Setting up standard PCI resources\n");
1259 if (efi_enabled)
1260 efi_initialize_iomem_resources(&code_resource, &data_resource);
1261 else
1262 legacy_init_iomem_resources(&code_resource, &data_resource);
1263
1264 /* EFI systems may still have VGA */
1265 request_resource(&iomem_resource, &video_ram_resource);
1266
1267 /* request I/O space for devices used on all i[345]86 PCs */
1268 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
1269 request_resource(&ioport_resource, &standard_io_resources[i]);
1270 return 0;
1271}
1272
1273subsys_initcall(request_standard_resources);
1274
1275static void __init register_memory(void)
1276{
1277 unsigned long gapstart, gapsize, round;
1278 unsigned long long last;
1279 int i;
1280
1281 /*
1282 * Search for the bigest gap in the low 32 bits of the e820
1283 * memory space.
1284 */
1285 last = 0x100000000ull;
1286 gapstart = 0x10000000;
1287 gapsize = 0x400000;
1288 i = e820.nr_map;
1289 while (--i >= 0) {
1290 unsigned long long start = e820.map[i].addr;
1291 unsigned long long end = start + e820.map[i].size;
1292
1293 /*
1294 * Since "last" is at most 4GB, we know we'll
1295 * fit in 32 bits if this condition is true
1296 */
1297 if (last > end) {
1298 unsigned long gap = last - end;
1299
1300 if (gap > gapsize) {
1301 gapsize = gap;
1302 gapstart = end;
1303 }
1304 }
1305 if (start < last)
1306 last = start;
1307 }
1308
1309 /*
1310 * See how much we want to round up: start off with
1311 * rounding to the next 1MB area.
1312 */
1313 round = 0x100000;
1314 while ((gapsize >> 4) > round)
1315 round += round;
1316 /* Fun with two's complement */
1317 pci_mem_start = (gapstart + round) & -round;
1318
1319 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
1320 pci_mem_start, gapstart, gapsize);
1321}
1322
1323#ifdef CONFIG_MCA 488#ifdef CONFIG_MCA
1324static void set_mca_bus(int x) 489static void set_mca_bus(int x)
1325{ 490{
@@ -1329,6 +494,12 @@ static void set_mca_bus(int x)
1329static void set_mca_bus(int x) { } 494static void set_mca_bus(int x) { }
1330#endif 495#endif
1331 496
497/* Overridden in paravirt.c if CONFIG_PARAVIRT */
498char * __attribute__((weak)) memory_setup(void)
499{
500 return machine_specific_memory_setup();
501}
502
1332/* 503/*
1333 * Determine if we were loaded by an EFI loader. If so, then we have also been 504 * Determine if we were loaded by an EFI loader. If so, then we have also been
1334 * passed the efi memmap, systab, etc., so we should use these data structures 505 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -1381,7 +552,7 @@ void __init setup_arch(char **cmdline_p)
1381 efi_init(); 552 efi_init();
1382 else { 553 else {
1383 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 554 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1384 print_memory_map(machine_specific_memory_setup()); 555 print_memory_map(memory_setup());
1385 } 556 }
1386 557
1387 copy_edd(); 558 copy_edd();
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 43002cfb40c4..65d7620eaa09 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -128,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ 128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) 129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
130 130
131 GET_SEG(gs); 131 COPY_SEG(gs);
132 GET_SEG(fs); 132 GET_SEG(fs);
133 COPY_SEG(es); 133 COPY_SEG(es);
134 COPY_SEG(ds); 134 COPY_SEG(ds);
@@ -244,9 +244,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
244{ 244{
245 int tmp, err = 0; 245 int tmp, err = 0;
246 246
247 tmp = 0; 247 err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);
248 savesegment(gs, tmp);
249 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
250 savesegment(fs, tmp); 248 savesegment(fs, tmp);
251 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 249 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
252 250
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 31e5c6573aae..5285aff8367f 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -321,7 +321,6 @@ static inline void leave_mm (unsigned long cpu)
321 321
322fastcall void smp_invalidate_interrupt(struct pt_regs *regs) 322fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
323{ 323{
324 struct pt_regs *old_regs = set_irq_regs(regs);
325 unsigned long cpu; 324 unsigned long cpu;
326 325
327 cpu = get_cpu(); 326 cpu = get_cpu();
@@ -352,7 +351,6 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
352 smp_mb__after_clear_bit(); 351 smp_mb__after_clear_bit();
353out: 352out:
354 put_cpu_no_resched(); 353 put_cpu_no_resched();
355 set_irq_regs(old_regs);
356} 354}
357 355
358static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 356static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
@@ -607,14 +605,11 @@ void smp_send_stop(void)
607 */ 605 */
608fastcall void smp_reschedule_interrupt(struct pt_regs *regs) 606fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
609{ 607{
610 struct pt_regs *old_regs = set_irq_regs(regs);
611 ack_APIC_irq(); 608 ack_APIC_irq();
612 set_irq_regs(old_regs);
613} 609}
614 610
615fastcall void smp_call_function_interrupt(struct pt_regs *regs) 611fastcall void smp_call_function_interrupt(struct pt_regs *regs)
616{ 612{
617 struct pt_regs *old_regs = set_irq_regs(regs);
618 void (*func) (void *info) = call_data->func; 613 void (*func) (void *info) = call_data->func;
619 void *info = call_data->info; 614 void *info = call_data->info;
620 int wait = call_data->wait; 615 int wait = call_data->wait;
@@ -637,7 +632,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
637 mb(); 632 mb();
638 atomic_inc(&call_data->finished); 633 atomic_inc(&call_data->finished);
639 } 634 }
640 set_irq_regs(old_regs);
641} 635}
642 636
643/* 637/*
@@ -699,6 +693,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
699 put_cpu(); 693 put_cpu();
700 return -EBUSY; 694 return -EBUSY;
701 } 695 }
696
697 /* Can deadlock when called with interrupts disabled */
698 WARN_ON(irqs_disabled());
699
702 spin_lock_bh(&call_lock); 700 spin_lock_bh(&call_lock);
703 __smp_call_function_single(cpu, func, info, nonatomic, wait); 701 __smp_call_function_single(cpu, func, info, nonatomic, wait);
704 spin_unlock_bh(&call_lock); 702 spin_unlock_bh(&call_lock);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 4bb8b77cd65b..b0f84e5778ad 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -33,6 +33,11 @@
33 * Dave Jones : Report invalid combinations of Athlon CPUs. 33 * Dave Jones : Report invalid combinations of Athlon CPUs.
34* Rusty Russell : Hacked into shape for new "hotplug" boot process. */ 34* Rusty Russell : Hacked into shape for new "hotplug" boot process. */
35 35
36
37/* SMP boot always wants to use real time delay to allow sufficient time for
38 * the APs to come online */
39#define USE_REAL_TIME_DELAY
40
36#include <linux/module.h> 41#include <linux/module.h>
37#include <linux/init.h> 42#include <linux/init.h>
38#include <linux/kernel.h> 43#include <linux/kernel.h>
@@ -52,6 +57,8 @@
52#include <asm/desc.h> 57#include <asm/desc.h>
53#include <asm/arch_hooks.h> 58#include <asm/arch_hooks.h>
54#include <asm/nmi.h> 59#include <asm/nmi.h>
60#include <asm/pda.h>
61#include <asm/genapic.h>
55 62
56#include <mach_apic.h> 63#include <mach_apic.h>
57#include <mach_wakecpu.h> 64#include <mach_wakecpu.h>
@@ -62,7 +69,7 @@ static int __devinitdata smp_b_stepping;
62 69
63/* Number of siblings per CPU package */ 70/* Number of siblings per CPU package */
64int smp_num_siblings = 1; 71int smp_num_siblings = 1;
65#ifdef CONFIG_X86_HT 72#ifdef CONFIG_SMP
66EXPORT_SYMBOL(smp_num_siblings); 73EXPORT_SYMBOL(smp_num_siblings);
67#endif 74#endif
68 75
@@ -536,11 +543,11 @@ set_cpu_sibling_map(int cpu)
536static void __devinit start_secondary(void *unused) 543static void __devinit start_secondary(void *unused)
537{ 544{
538 /* 545 /*
539 * Dont put anything before smp_callin(), SMP 546 * Don't put *anything* before secondary_cpu_init(), SMP
540 * booting is too fragile that we want to limit the 547 * booting is too fragile that we want to limit the
541 * things done here to the most necessary things. 548 * things done here to the most necessary things.
542 */ 549 */
543 cpu_init(); 550 secondary_cpu_init();
544 preempt_disable(); 551 preempt_disable();
545 smp_callin(); 552 smp_callin();
546 while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) 553 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
@@ -599,13 +606,16 @@ void __devinit initialize_secondary(void)
599 "movl %0,%%esp\n\t" 606 "movl %0,%%esp\n\t"
600 "jmp *%1" 607 "jmp *%1"
601 : 608 :
602 :"r" (current->thread.esp),"r" (current->thread.eip)); 609 :"m" (current->thread.esp),"m" (current->thread.eip));
603} 610}
604 611
612/* Static state in head.S used to set up a CPU */
605extern struct { 613extern struct {
606 void * esp; 614 void * esp;
607 unsigned short ss; 615 unsigned short ss;
608} stack_start; 616} stack_start;
617extern struct i386_pda *start_pda;
618extern struct Xgt_desc_struct cpu_gdt_descr;
609 619
610#ifdef CONFIG_NUMA 620#ifdef CONFIG_NUMA
611 621
@@ -936,9 +946,6 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
936 unsigned long start_eip; 946 unsigned long start_eip;
937 unsigned short nmi_high = 0, nmi_low = 0; 947 unsigned short nmi_high = 0, nmi_low = 0;
938 948
939 ++cpucount;
940 alternatives_smp_switch(1);
941
942 /* 949 /*
943 * We can't use kernel_thread since we must avoid to 950 * We can't use kernel_thread since we must avoid to
944 * reschedule the child. 951 * reschedule the child.
@@ -946,15 +953,30 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
946 idle = alloc_idle_task(cpu); 953 idle = alloc_idle_task(cpu);
947 if (IS_ERR(idle)) 954 if (IS_ERR(idle))
948 panic("failed fork for CPU %d", cpu); 955 panic("failed fork for CPU %d", cpu);
956
957 /* Pre-allocate and initialize the CPU's GDT and PDA so it
958 doesn't have to do any memory allocation during the
959 delicate CPU-bringup phase. */
960 if (!init_gdt(cpu, idle)) {
961 printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
962 return -1; /* ? */
963 }
964
949 idle->thread.eip = (unsigned long) start_secondary; 965 idle->thread.eip = (unsigned long) start_secondary;
950 /* start_eip had better be page-aligned! */ 966 /* start_eip had better be page-aligned! */
951 start_eip = setup_trampoline(); 967 start_eip = setup_trampoline();
952 968
969 ++cpucount;
970 alternatives_smp_switch(1);
971
953 /* So we see what's up */ 972 /* So we see what's up */
954 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); 973 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
955 /* Stack for startup_32 can be just as for start_secondary onwards */ 974 /* Stack for startup_32 can be just as for start_secondary onwards */
956 stack_start.esp = (void *) idle->thread.esp; 975 stack_start.esp = (void *) idle->thread.esp;
957 976
977 start_pda = cpu_pda(cpu);
978 cpu_gdt_descr = per_cpu(cpu_gdt_descr, cpu);
979
958 irq_ctx_init(cpu); 980 irq_ctx_init(cpu);
959 981
960 x86_cpu_to_apicid[cpu] = apicid; 982 x86_cpu_to_apicid[cpu] = apicid;
@@ -1049,13 +1071,15 @@ void cpu_exit_clear(void)
1049 1071
1050struct warm_boot_cpu_info { 1072struct warm_boot_cpu_info {
1051 struct completion *complete; 1073 struct completion *complete;
1074 struct work_struct task;
1052 int apicid; 1075 int apicid;
1053 int cpu; 1076 int cpu;
1054}; 1077};
1055 1078
1056static void __cpuinit do_warm_boot_cpu(void *p) 1079static void __cpuinit do_warm_boot_cpu(struct work_struct *work)
1057{ 1080{
1058 struct warm_boot_cpu_info *info = p; 1081 struct warm_boot_cpu_info *info =
1082 container_of(work, struct warm_boot_cpu_info, task);
1059 do_boot_cpu(info->apicid, info->cpu); 1083 do_boot_cpu(info->apicid, info->cpu);
1060 complete(info->complete); 1084 complete(info->complete);
1061} 1085}
@@ -1064,7 +1088,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1064{ 1088{
1065 DECLARE_COMPLETION_ONSTACK(done); 1089 DECLARE_COMPLETION_ONSTACK(done);
1066 struct warm_boot_cpu_info info; 1090 struct warm_boot_cpu_info info;
1067 struct work_struct task;
1068 int apicid, ret; 1091 int apicid, ret;
1069 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); 1092 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1070 1093
@@ -1089,15 +1112,15 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1089 info.complete = &done; 1112 info.complete = &done;
1090 info.apicid = apicid; 1113 info.apicid = apicid;
1091 info.cpu = cpu; 1114 info.cpu = cpu;
1092 INIT_WORK(&task, do_warm_boot_cpu, &info); 1115 INIT_WORK(&info.task, do_warm_boot_cpu);
1093 1116
1094 tsc_sync_disabled = 1; 1117 tsc_sync_disabled = 1;
1095 1118
1096 /* init low mem mapping */ 1119 /* init low mem mapping */
1097 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 1120 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
1098 KERNEL_PGD_PTRS); 1121 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
1099 flush_tlb_all(); 1122 flush_tlb_all();
1100 schedule_work(&task); 1123 schedule_work(&info.task);
1101 wait_for_completion(&done); 1124 wait_for_completion(&done);
1102 1125
1103 tsc_sync_disabled = 0; 1126 tsc_sync_disabled = 0;
@@ -1108,34 +1131,15 @@ exit:
1108} 1131}
1109#endif 1132#endif
1110 1133
1111static void smp_tune_scheduling (void) 1134static void smp_tune_scheduling(void)
1112{ 1135{
1113 unsigned long cachesize; /* kB */ 1136 unsigned long cachesize; /* kB */
1114 unsigned long bandwidth = 350; /* MB/s */
1115 /*
1116 * Rough estimation for SMP scheduling, this is the number of
1117 * cycles it takes for a fully memory-limited process to flush
1118 * the SMP-local cache.
1119 *
1120 * (For a P5 this pretty much means we will choose another idle
1121 * CPU almost always at wakeup time (this is due to the small
1122 * L1 cache), on PIIs it's around 50-100 usecs, depending on
1123 * the cache size)
1124 */
1125 1137
1126 if (!cpu_khz) { 1138 if (cpu_khz) {
1127 /*
1128 * this basically disables processor-affinity
1129 * scheduling on SMP without a TSC.
1130 */
1131 return;
1132 } else {
1133 cachesize = boot_cpu_data.x86_cache_size; 1139 cachesize = boot_cpu_data.x86_cache_size;
1134 if (cachesize == -1) { 1140
1135 cachesize = 16; /* Pentiums, 2x8kB cache */ 1141 if (cachesize > 0)
1136 bandwidth = 100; 1142 max_cache_size = cachesize * 1024;
1137 }
1138 max_cache_size = cachesize * 1024;
1139 } 1143 }
1140} 1144}
1141 1145
@@ -1461,6 +1465,12 @@ int __devinit __cpu_up(unsigned int cpu)
1461 cpu_set(cpu, smp_commenced_mask); 1465 cpu_set(cpu, smp_commenced_mask);
1462 while (!cpu_isset(cpu, cpu_online_map)) 1466 while (!cpu_isset(cpu, cpu_online_map))
1463 cpu_relax(); 1467 cpu_relax();
1468
1469#ifdef CONFIG_X86_GENERICARCH
1470 if (num_online_cpus() > 8 && genapic == &apic_default)
1471 panic("Default flat APIC routing can't be used with > 8 cpus\n");
1472#endif
1473
1464 return 0; 1474 return 0;
1465} 1475}
1466 1476
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 713ba39d32c6..7de9117b5a3a 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -27,7 +27,11 @@
27 * Should the kernel map a VDSO page into processes and pass its 27 * Should the kernel map a VDSO page into processes and pass its
28 * address down to glibc upon exec()? 28 * address down to glibc upon exec()?
29 */ 29 */
30#ifdef CONFIG_PARAVIRT
31unsigned int __read_mostly vdso_enabled = 0;
32#else
30unsigned int __read_mostly vdso_enabled = 1; 33unsigned int __read_mostly vdso_enabled = 1;
34#endif
31 35
32EXPORT_SYMBOL_GPL(vdso_enabled); 36EXPORT_SYMBOL_GPL(vdso_enabled);
33 37
@@ -132,7 +136,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
132 goto up_fail; 136 goto up_fail;
133 } 137 }
134 138
135 vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL); 139 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
136 if (!vma) { 140 if (!vma) {
137 ret = -ENOMEM; 141 ret = -ENOMEM;
138 goto up_fail; 142 goto up_fail;
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 78af572fd17c..c505b16c0990 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -56,6 +56,7 @@
56#include <asm/uaccess.h> 56#include <asm/uaccess.h>
57#include <asm/processor.h> 57#include <asm/processor.h>
58#include <asm/timer.h> 58#include <asm/timer.h>
59#include <asm/time.h>
59 60
60#include "mach_time.h" 61#include "mach_time.h"
61 62
@@ -116,10 +117,7 @@ static int set_rtc_mmss(unsigned long nowtime)
116 /* gets recalled with irq locally disabled */ 117 /* gets recalled with irq locally disabled */
117 /* XXX - does irqsave resolve this? -johnstul */ 118 /* XXX - does irqsave resolve this? -johnstul */
118 spin_lock_irqsave(&rtc_lock, flags); 119 spin_lock_irqsave(&rtc_lock, flags);
119 if (efi_enabled) 120 retval = set_wallclock(nowtime);
120 retval = efi_set_rtc_mmss(nowtime);
121 else
122 retval = mach_set_rtc_mmss(nowtime);
123 spin_unlock_irqrestore(&rtc_lock, flags); 121 spin_unlock_irqrestore(&rtc_lock, flags);
124 122
125 return retval; 123 return retval;
@@ -223,10 +221,7 @@ unsigned long get_cmos_time(void)
223 221
224 spin_lock_irqsave(&rtc_lock, flags); 222 spin_lock_irqsave(&rtc_lock, flags);
225 223
226 if (efi_enabled) 224 retval = get_wallclock();
227 retval = efi_get_time();
228 else
229 retval = mach_get_cmos_time();
230 225
231 spin_unlock_irqrestore(&rtc_lock, flags); 226 spin_unlock_irqrestore(&rtc_lock, flags);
232 227
@@ -370,7 +365,7 @@ static void __init hpet_time_init(void)
370 printk("Using HPET for base-timer\n"); 365 printk("Using HPET for base-timer\n");
371 } 366 }
372 367
373 time_init_hook(); 368 do_time_init();
374} 369}
375#endif 370#endif
376 371
@@ -392,5 +387,5 @@ void __init time_init(void)
392 387
393 do_settimeofday(&ts); 388 do_settimeofday(&ts);
394 389
395 time_init_hook(); 390 do_time_init();
396} 391}
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
index 1a2a979cf6a3..1e4702dfcd01 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/i386/kernel/time_hpet.c
@@ -132,14 +132,20 @@ int __init hpet_enable(void)
132 * the single HPET timer for system time. 132 * the single HPET timer for system time.
133 */ 133 */
134#ifdef CONFIG_HPET_EMULATE_RTC 134#ifdef CONFIG_HPET_EMULATE_RTC
135 if (!(id & HPET_ID_NUMBER)) 135 if (!(id & HPET_ID_NUMBER)) {
136 iounmap(hpet_virt_address);
137 hpet_virt_address = NULL;
136 return -1; 138 return -1;
139 }
137#endif 140#endif
138 141
139 142
140 hpet_period = hpet_readl(HPET_PERIOD); 143 hpet_period = hpet_readl(HPET_PERIOD);
141 if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) 144 if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) {
145 iounmap(hpet_virt_address);
146 hpet_virt_address = NULL;
142 return -1; 147 return -1;
148 }
143 149
144 /* 150 /*
145 * 64 bit math 151 * 64 bit math
@@ -156,8 +162,11 @@ int __init hpet_enable(void)
156 162
157 hpet_use_timer = id & HPET_ID_LEGSUP; 163 hpet_use_timer = id & HPET_ID_LEGSUP;
158 164
159 if (hpet_timer_stop_set_go(hpet_tick)) 165 if (hpet_timer_stop_set_go(hpet_tick)) {
166 iounmap(hpet_virt_address);
167 hpet_virt_address = NULL;
160 return -1; 168 return -1;
169 }
161 170
162 use_hpet = 1; 171 use_hpet = 1;
163 172
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 07d6da36a825..79cf608e14ca 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -40,14 +40,18 @@ int arch_register_cpu(int num)
40 * restrictions and assumptions in kernel. This basically 40 * restrictions and assumptions in kernel. This basically
41 * doesnt add a control file, one cannot attempt to offline 41 * doesnt add a control file, one cannot attempt to offline
42 * BSP. 42 * BSP.
43 *
44 * Also certain PCI quirks require not to enable hotplug control
45 * for all CPU's.
43 */ 46 */
44 if (!num) 47 if (num && enable_cpu_hotplug)
45 cpu_devices[num].cpu.no_control = 1; 48 cpu_devices[num].cpu.hotpluggable = 1;
46 49
47 return register_cpu(&cpu_devices[num].cpu, num); 50 return register_cpu(&cpu_devices[num].cpu, num);
48} 51}
49 52
50#ifdef CONFIG_HOTPLUG_CPU 53#ifdef CONFIG_HOTPLUG_CPU
54int enable_cpu_hotplug = 1;
51 55
52void arch_unregister_cpu(int num) { 56void arch_unregister_cpu(int num) {
53 return unregister_cpu(&cpu_devices[num].cpu); 57 return unregister_cpu(&cpu_devices[num].cpu);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 00489b706d27..2b30dbf8d117 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -29,6 +29,8 @@
29#include <linux/kexec.h> 29#include <linux/kexec.h>
30#include <linux/unwind.h> 30#include <linux/unwind.h>
31#include <linux/uaccess.h> 31#include <linux/uaccess.h>
32#include <linux/nmi.h>
33#include <linux/bug.h>
32 34
33#ifdef CONFIG_EISA 35#ifdef CONFIG_EISA
34#include <linux/ioport.h> 36#include <linux/ioport.h>
@@ -61,9 +63,6 @@ int panic_on_unrecovered_nmi;
61 63
62asmlinkage int system_call(void); 64asmlinkage int system_call(void);
63 65
64struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
65 { 0, 0 }, { 0, 0 } };
66
67/* Do we ignore FPU interrupts ? */ 66/* Do we ignore FPU interrupts ? */
68char ignore_fpu_irq = 0; 67char ignore_fpu_irq = 0;
69 68
@@ -94,7 +93,7 @@ asmlinkage void alignment_check(void);
94asmlinkage void spurious_interrupt_bug(void); 93asmlinkage void spurious_interrupt_bug(void);
95asmlinkage void machine_check(void); 94asmlinkage void machine_check(void);
96 95
97static int kstack_depth_to_print = 24; 96int kstack_depth_to_print = 24;
98#ifdef CONFIG_STACK_UNWIND 97#ifdef CONFIG_STACK_UNWIND
99static int call_trace = 1; 98static int call_trace = 1;
100#else 99#else
@@ -129,15 +128,19 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
129 128
130#ifdef CONFIG_FRAME_POINTER 129#ifdef CONFIG_FRAME_POINTER
131 while (valid_stack_ptr(tinfo, (void *)ebp)) { 130 while (valid_stack_ptr(tinfo, (void *)ebp)) {
131 unsigned long new_ebp;
132 addr = *(unsigned long *)(ebp + 4); 132 addr = *(unsigned long *)(ebp + 4);
133 ops->address(data, addr); 133 ops->address(data, addr);
134 /* 134 /*
135 * break out of recursive entries (such as 135 * break out of recursive entries (such as
136 * end_of_stack_stop_unwind_function): 136 * end_of_stack_stop_unwind_function). Also,
137 * we can never allow a frame pointer to
138 * move downwards!
137 */ 139 */
138 if (ebp == *(unsigned long *)ebp) 140 new_ebp = *(unsigned long *)ebp;
141 if (new_ebp <= ebp)
139 break; 142 break;
140 ebp = *(unsigned long *)ebp; 143 ebp = new_ebp;
141 } 144 }
142#else 145#else
143 while (valid_stack_ptr(tinfo, stack)) { 146 while (valid_stack_ptr(tinfo, stack)) {
@@ -159,16 +162,25 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data)
159{ 162{
160 struct ops_and_data *oad = (struct ops_and_data *)data; 163 struct ops_and_data *oad = (struct ops_and_data *)data;
161 int n = 0; 164 int n = 0;
165 unsigned long sp = UNW_SP(info);
162 166
167 if (arch_unw_user_mode(info))
168 return -1;
163 while (unwind(info) == 0 && UNW_PC(info)) { 169 while (unwind(info) == 0 && UNW_PC(info)) {
164 n++; 170 n++;
165 oad->ops->address(oad->data, UNW_PC(info)); 171 oad->ops->address(oad->data, UNW_PC(info));
166 if (arch_unw_user_mode(info)) 172 if (arch_unw_user_mode(info))
167 break; 173 break;
174 if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1))
175 && sp > UNW_SP(info))
176 break;
177 sp = UNW_SP(info);
168 } 178 }
169 return n; 179 return n;
170} 180}
171 181
182#define MSG(msg) ops->warning(data, msg)
183
172void dump_trace(struct task_struct *task, struct pt_regs *regs, 184void dump_trace(struct task_struct *task, struct pt_regs *regs,
173 unsigned long *stack, 185 unsigned long *stack,
174 struct stacktrace_ops *ops, void *data) 186 struct stacktrace_ops *ops, void *data)
@@ -187,29 +199,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
187 if (unwind_init_frame_info(&info, task, regs) == 0) 199 if (unwind_init_frame_info(&info, task, regs) == 0)
188 unw_ret = dump_trace_unwind(&info, &oad); 200 unw_ret = dump_trace_unwind(&info, &oad);
189 } else if (task == current) 201 } else if (task == current)
190 unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); 202 unw_ret = unwind_init_running(&info, dump_trace_unwind,
203 &oad);
191 else { 204 else {
192 if (unwind_init_blocked(&info, task) == 0) 205 if (unwind_init_blocked(&info, task) == 0)
193 unw_ret = dump_trace_unwind(&info, &oad); 206 unw_ret = dump_trace_unwind(&info, &oad);
194 } 207 }
195 if (unw_ret > 0) { 208 if (unw_ret > 0) {
196 if (call_trace == 1 && !arch_unw_user_mode(&info)) { 209 if (call_trace == 1 && !arch_unw_user_mode(&info)) {
197 ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", 210 ops->warning_symbol(data,
211 "DWARF2 unwinder stuck at %s",
198 UNW_PC(&info)); 212 UNW_PC(&info));
199 if (UNW_SP(&info) >= PAGE_OFFSET) { 213 if (UNW_SP(&info) >= PAGE_OFFSET) {
200 ops->warning(data, "Leftover inexact backtrace:\n"); 214 MSG("Leftover inexact backtrace:");
201 stack = (void *)UNW_SP(&info); 215 stack = (void *)UNW_SP(&info);
202 if (!stack) 216 if (!stack)
203 return; 217 return;
204 ebp = UNW_FP(&info); 218 ebp = UNW_FP(&info);
205 } else 219 } else
206 ops->warning(data, "Full inexact backtrace again:\n"); 220 MSG("Full inexact backtrace again:");
207 } else if (call_trace >= 1) 221 } else if (call_trace >= 1)
208 return; 222 return;
209 else 223 else
210 ops->warning(data, "Full inexact backtrace again:\n"); 224 MSG("Full inexact backtrace again:");
211 } else 225 } else
212 ops->warning(data, "Inexact backtrace:\n"); 226 MSG("Inexact backtrace:");
213 } 227 }
214 if (!stack) { 228 if (!stack) {
215 unsigned long dummy; 229 unsigned long dummy;
@@ -243,6 +257,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
243 stack = (unsigned long*)context->previous_esp; 257 stack = (unsigned long*)context->previous_esp;
244 if (!stack) 258 if (!stack)
245 break; 259 break;
260 touch_nmi_watchdog();
246 } 261 }
247} 262}
248EXPORT_SYMBOL(dump_trace); 263EXPORT_SYMBOL(dump_trace);
@@ -375,7 +390,7 @@ void show_registers(struct pt_regs *regs)
375 * time of the fault.. 390 * time of the fault..
376 */ 391 */
377 if (in_kernel) { 392 if (in_kernel) {
378 u8 __user *eip; 393 u8 *eip;
379 int code_bytes = 64; 394 int code_bytes = 64;
380 unsigned char c; 395 unsigned char c;
381 396
@@ -384,18 +399,20 @@ void show_registers(struct pt_regs *regs)
384 399
385 printk(KERN_EMERG "Code: "); 400 printk(KERN_EMERG "Code: ");
386 401
387 eip = (u8 __user *)regs->eip - 43; 402 eip = (u8 *)regs->eip - 43;
388 if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { 403 if (eip < (u8 *)PAGE_OFFSET ||
404 probe_kernel_address(eip, c)) {
389 /* try starting at EIP */ 405 /* try starting at EIP */
390 eip = (u8 __user *)regs->eip; 406 eip = (u8 *)regs->eip;
391 code_bytes = 32; 407 code_bytes = 32;
392 } 408 }
393 for (i = 0; i < code_bytes; i++, eip++) { 409 for (i = 0; i < code_bytes; i++, eip++) {
394 if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { 410 if (eip < (u8 *)PAGE_OFFSET ||
411 probe_kernel_address(eip, c)) {
395 printk(" Bad EIP value."); 412 printk(" Bad EIP value.");
396 break; 413 break;
397 } 414 }
398 if (eip == (u8 __user *)regs->eip) 415 if (eip == (u8 *)regs->eip)
399 printk("<%02x> ", c); 416 printk("<%02x> ", c);
400 else 417 else
401 printk("%02x ", c); 418 printk("%02x ", c);
@@ -404,43 +421,22 @@ void show_registers(struct pt_regs *regs)
404 printk("\n"); 421 printk("\n");
405} 422}
406 423
407static void handle_BUG(struct pt_regs *regs) 424int is_valid_bugaddr(unsigned long eip)
408{ 425{
409 unsigned long eip = regs->eip;
410 unsigned short ud2; 426 unsigned short ud2;
411 427
412 if (eip < PAGE_OFFSET) 428 if (eip < PAGE_OFFSET)
413 return; 429 return 0;
414 if (probe_kernel_address((unsigned short __user *)eip, ud2)) 430 if (probe_kernel_address((unsigned short *)eip, ud2))
415 return; 431 return 0;
416 if (ud2 != 0x0b0f)
417 return;
418
419 printk(KERN_EMERG "------------[ cut here ]------------\n");
420
421#ifdef CONFIG_DEBUG_BUGVERBOSE
422 do {
423 unsigned short line;
424 char *file;
425 char c;
426
427 if (probe_kernel_address((unsigned short __user *)(eip + 2),
428 line))
429 break;
430 if (__get_user(file, (char * __user *)(eip + 4)) ||
431 (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
432 file = "<bad filename>";
433 432
434 printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); 433 return ud2 == 0x0b0f;
435 return;
436 } while (0);
437#endif
438 printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n");
439} 434}
440 435
441/* This is gone through when something in the kernel 436/*
442 * has done something bad and is about to be terminated. 437 * This is gone through when something in the kernel has done something bad and
443*/ 438 * is about to be terminated.
439 */
444void die(const char * str, struct pt_regs * regs, long err) 440void die(const char * str, struct pt_regs * regs, long err)
445{ 441{
446 static struct { 442 static struct {
@@ -448,7 +444,7 @@ void die(const char * str, struct pt_regs * regs, long err)
448 u32 lock_owner; 444 u32 lock_owner;
449 int lock_owner_depth; 445 int lock_owner_depth;
450 } die = { 446 } die = {
451 .lock = SPIN_LOCK_UNLOCKED, 447 .lock = __SPIN_LOCK_UNLOCKED(die.lock),
452 .lock_owner = -1, 448 .lock_owner = -1,
453 .lock_owner_depth = 0 449 .lock_owner_depth = 0
454 }; 450 };
@@ -472,7 +468,8 @@ void die(const char * str, struct pt_regs * regs, long err)
472 unsigned long esp; 468 unsigned long esp;
473 unsigned short ss; 469 unsigned short ss;
474 470
475 handle_BUG(regs); 471 report_bug(regs->eip);
472
476 printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); 473 printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
477#ifdef CONFIG_PREEMPT 474#ifdef CONFIG_PREEMPT
478 printk(KERN_EMERG "PREEMPT "); 475 printk(KERN_EMERG "PREEMPT ");
@@ -703,8 +700,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
703{ 700{
704 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " 701 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
705 "CPU %d.\n", reason, smp_processor_id()); 702 "CPU %d.\n", reason, smp_processor_id());
706 printk(KERN_EMERG "You probably have a hardware problem with your RAM " 703 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
707 "chips\n");
708 if (panic_on_unrecovered_nmi) 704 if (panic_on_unrecovered_nmi)
709 panic("NMI: Not continuing"); 705 panic("NMI: Not continuing");
710 706
@@ -769,7 +765,6 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
769 printk(" on CPU%d, eip %08lx, registers:\n", 765 printk(" on CPU%d, eip %08lx, registers:\n",
770 smp_processor_id(), regs->eip); 766 smp_processor_id(), regs->eip);
771 show_registers(regs); 767 show_registers(regs);
772 printk(KERN_EMERG "console shuts up ...\n");
773 console_silent(); 768 console_silent();
774 spin_unlock(&nmi_print_lock); 769 spin_unlock(&nmi_print_lock);
775 bust_spinlocks(0); 770 bust_spinlocks(0);
@@ -1084,49 +1079,24 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
1084#endif 1079#endif
1085} 1080}
1086 1081
1087fastcall void setup_x86_bogus_stack(unsigned char * stk) 1082fastcall unsigned long patch_espfix_desc(unsigned long uesp,
1088{ 1083 unsigned long kesp)
1089 unsigned long *switch16_ptr, *switch32_ptr;
1090 struct pt_regs *regs;
1091 unsigned long stack_top, stack_bot;
1092 unsigned short iret_frame16_off;
1093 int cpu = smp_processor_id();
1094 /* reserve the space on 32bit stack for the magic switch16 pointer */
1095 memmove(stk, stk + 8, sizeof(struct pt_regs));
1096 switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
1097 regs = (struct pt_regs *)stk;
1098 /* now the switch32 on 16bit stack */
1099 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
1100 stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
1101 switch32_ptr = (unsigned long *)(stack_top - 8);
1102 iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
1103 /* copy iret frame on 16bit stack */
1104 memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
1105 /* fill in the switch pointers */
1106 switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
1107 switch16_ptr[1] = __ESPFIX_SS;
1108 switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
1109 8 - CPU_16BIT_STACK_SIZE;
1110 switch32_ptr[1] = __KERNEL_DS;
1111}
1112
1113fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
1114{ 1084{
1115 unsigned long *switch32_ptr;
1116 unsigned char *stack16, *stack32;
1117 unsigned long stack_top, stack_bot;
1118 int len;
1119 int cpu = smp_processor_id(); 1085 int cpu = smp_processor_id();
1120 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); 1086 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1121 stack_top = stack_bot + CPU_16BIT_STACK_SIZE; 1087 struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
1122 switch32_ptr = (unsigned long *)(stack_top - 8); 1088 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
1123 /* copy the data from 16bit stack to 32bit stack */ 1089 unsigned long new_kesp = kesp - base;
1124 len = CPU_16BIT_STACK_SIZE - 8 - sp; 1090 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
1125 stack16 = (unsigned char *)(stack_bot + sp); 1091 __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
1126 stack32 = (unsigned char *) 1092 /* Set up base for espfix segment */
1127 (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len); 1093 desc &= 0x00f0ff0000000000ULL;
1128 memcpy(stack32, stack16, len); 1094 desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
1129 return stack32; 1095 ((((__u64)base) << 32) & 0xff00000000000000ULL) |
1096 ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
1097 (lim_pages & 0xffff);
1098 *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
1099 return new_kesp;
1130} 1100}
1131 1101
1132/* 1102/*
@@ -1139,7 +1109,7 @@ fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
1139 * Must be called with kernel preemption disabled (in this case, 1109 * Must be called with kernel preemption disabled (in this case,
1140 * local interrupts are disabled at the call-site in entry.S). 1110 * local interrupts are disabled at the call-site in entry.S).
1141 */ 1111 */
1142asmlinkage void math_state_restore(struct pt_regs regs) 1112asmlinkage void math_state_restore(void)
1143{ 1113{
1144 struct thread_info *thread = current_thread_info(); 1114 struct thread_info *thread = current_thread_info();
1145 struct task_struct *tsk = thread->task; 1115 struct task_struct *tsk = thread->task;
@@ -1149,6 +1119,7 @@ asmlinkage void math_state_restore(struct pt_regs regs)
1149 init_fpu(tsk); 1119 init_fpu(tsk);
1150 restore_fpu(tsk); 1120 restore_fpu(tsk);
1151 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 1121 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1122 tsk->fpu_counter++;
1152} 1123}
1153 1124
1154#ifndef CONFIG_MATH_EMULATION 1125#ifndef CONFIG_MATH_EMULATION
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index b8fa0a8b2e47..1bbe45dca7a0 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -13,7 +13,6 @@
13 13
14#include <asm/delay.h> 14#include <asm/delay.h>
15#include <asm/tsc.h> 15#include <asm/tsc.h>
16#include <asm/delay.h>
17#include <asm/io.h> 16#include <asm/io.h>
18 17
19#include "mach_timer.h" 18#include "mach_timer.h"
@@ -217,7 +216,7 @@ static unsigned int cpufreq_delayed_issched = 0;
217static unsigned int cpufreq_init = 0; 216static unsigned int cpufreq_init = 0;
218static struct work_struct cpufreq_delayed_get_work; 217static struct work_struct cpufreq_delayed_get_work;
219 218
220static void handle_cpufreq_delayed_get(void *v) 219static void handle_cpufreq_delayed_get(struct work_struct *work)
221{ 220{
222 unsigned int cpu; 221 unsigned int cpu;
223 222
@@ -306,7 +305,7 @@ static int __init cpufreq_tsc(void)
306{ 305{
307 int ret; 306 int ret;
308 307
309 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); 308 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
310 ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, 309 ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
311 CPUFREQ_TRANSITION_NOTIFIER); 310 CPUFREQ_TRANSITION_NOTIFIER);
312 if (!ret) 311 if (!ret)
@@ -349,8 +348,8 @@ static int tsc_update_callback(void)
349 int change = 0; 348 int change = 0;
350 349
351 /* check to see if we should switch to the safe clocksource: */ 350 /* check to see if we should switch to the safe clocksource: */
352 if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { 351 if (clocksource_tsc.rating != 0 && check_tsc_unstable()) {
353 clocksource_tsc.rating = 50; 352 clocksource_tsc.rating = 0;
354 clocksource_reselect(); 353 clocksource_reselect();
355 change = 1; 354 change = 1;
356 } 355 }
@@ -461,7 +460,7 @@ static int __init init_tsc_clocksource(void)
461 clocksource_tsc.shift); 460 clocksource_tsc.shift);
462 /* lower the rating if we already know its unstable: */ 461 /* lower the rating if we already know its unstable: */
463 if (check_tsc_unstable()) 462 if (check_tsc_unstable())
464 clocksource_tsc.rating = 50; 463 clocksource_tsc.rating = 0;
465 464
466 init_timer(&verify_tsc_freq_timer); 465 init_timer(&verify_tsc_freq_timer);
467 verify_tsc_freq_timer.function = verify_tsc_freq; 466 verify_tsc_freq_timer.function = verify_tsc_freq;
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index cbcd61d6120b..be2f96e67f78 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -43,6 +43,7 @@
43#include <linux/highmem.h> 43#include <linux/highmem.h>
44#include <linux/ptrace.h> 44#include <linux/ptrace.h>
45#include <linux/audit.h> 45#include <linux/audit.h>
46#include <linux/stddef.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48#include <asm/io.h> 49#include <asm/io.h>
@@ -72,10 +73,10 @@
72/* 73/*
73 * 8- and 16-bit register defines.. 74 * 8- and 16-bit register defines..
74 */ 75 */
75#define AL(regs) (((unsigned char *)&((regs)->eax))[0]) 76#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0])
76#define AH(regs) (((unsigned char *)&((regs)->eax))[1]) 77#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1])
77#define IP(regs) (*(unsigned short *)&((regs)->eip)) 78#define IP(regs) (*(unsigned short *)&((regs)->pt.eip))
78#define SP(regs) (*(unsigned short *)&((regs)->esp)) 79#define SP(regs) (*(unsigned short *)&((regs)->pt.esp))
79 80
80/* 81/*
81 * virtual flags (16 and 32-bit versions) 82 * virtual flags (16 and 32-bit versions)
@@ -89,10 +90,37 @@
89#define SAFE_MASK (0xDD5) 90#define SAFE_MASK (0xDD5)
90#define RETURN_MASK (0xDFF) 91#define RETURN_MASK (0xDFF)
91 92
92#define VM86_REGS_PART2 orig_eax 93/* convert kernel_vm86_regs to vm86_regs */
93#define VM86_REGS_SIZE1 \ 94static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
94 ( (unsigned)( & (((struct kernel_vm86_regs *)0)->VM86_REGS_PART2) ) ) 95 const struct kernel_vm86_regs *regs)
95#define VM86_REGS_SIZE2 (sizeof(struct kernel_vm86_regs) - VM86_REGS_SIZE1) 96{
97 int ret = 0;
98
99 /* kernel_vm86_regs is missing xfs, so copy everything up to
100 (but not including) xgs, and then rest after xgs. */
101 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs));
102 ret += copy_to_user(&user->__null_gs, &regs->pt.xgs,
103 sizeof(struct kernel_vm86_regs) -
104 offsetof(struct kernel_vm86_regs, pt.xgs));
105
106 return ret;
107}
108
109/* convert vm86_regs to kernel_vm86_regs */
110static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
111 const struct vm86_regs __user *user,
112 unsigned extra)
113{
114 int ret = 0;
115
116 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs));
117 ret += copy_from_user(&regs->pt.xgs, &user->__null_gs,
118 sizeof(struct kernel_vm86_regs) -
119 offsetof(struct kernel_vm86_regs, pt.xgs) +
120 extra);
121
122 return ret;
123}
96 124
97struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); 125struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
98struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) 126struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
@@ -112,10 +140,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
112 printk("no vm86_info: BAD\n"); 140 printk("no vm86_info: BAD\n");
113 do_exit(SIGSEGV); 141 do_exit(SIGSEGV);
114 } 142 }
115 set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); 143 set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
116 tmp = copy_to_user(&current->thread.vm86_info->regs,regs, VM86_REGS_SIZE1); 144 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
117 tmp += copy_to_user(&current->thread.vm86_info->regs.VM86_REGS_PART2,
118 &regs->VM86_REGS_PART2, VM86_REGS_SIZE2);
119 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap); 145 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
120 if (tmp) { 146 if (tmp) {
121 printk("vm86: could not access userspace vm86_info\n"); 147 printk("vm86: could not access userspace vm86_info\n");
@@ -129,9 +155,11 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
129 current->thread.saved_esp0 = 0; 155 current->thread.saved_esp0 = 0;
130 put_cpu(); 156 put_cpu();
131 157
132 loadsegment(fs, current->thread.saved_fs);
133 loadsegment(gs, current->thread.saved_gs);
134 ret = KVM86->regs32; 158 ret = KVM86->regs32;
159
160 loadsegment(fs, current->thread.saved_fs);
161 ret->xgs = current->thread.saved_gs;
162
135 return ret; 163 return ret;
136} 164}
137 165
@@ -183,9 +211,9 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
183 tsk = current; 211 tsk = current;
184 if (tsk->thread.saved_esp0) 212 if (tsk->thread.saved_esp0)
185 goto out; 213 goto out;
186 tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); 214 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
187 tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, 215 offsetof(struct kernel_vm86_struct, vm86plus) -
188 (long)&info.vm86plus - (long)&info.regs.VM86_REGS_PART2); 216 sizeof(info.regs));
189 ret = -EFAULT; 217 ret = -EFAULT;
190 if (tmp) 218 if (tmp)
191 goto out; 219 goto out;
@@ -233,9 +261,9 @@ asmlinkage int sys_vm86(struct pt_regs regs)
233 if (tsk->thread.saved_esp0) 261 if (tsk->thread.saved_esp0)
234 goto out; 262 goto out;
235 v86 = (struct vm86plus_struct __user *)regs.ecx; 263 v86 = (struct vm86plus_struct __user *)regs.ecx;
236 tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); 264 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
237 tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, 265 offsetof(struct kernel_vm86_struct, regs32) -
238 (long)&info.regs32 - (long)&info.regs.VM86_REGS_PART2); 266 sizeof(info.regs));
239 ret = -EFAULT; 267 ret = -EFAULT;
240 if (tmp) 268 if (tmp)
241 goto out; 269 goto out;
@@ -252,15 +280,15 @@ out:
252static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) 280static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
253{ 281{
254 struct tss_struct *tss; 282 struct tss_struct *tss;
255 long eax;
256/* 283/*
257 * make sure the vm86() system call doesn't try to do anything silly 284 * make sure the vm86() system call doesn't try to do anything silly
258 */ 285 */
259 info->regs.__null_ds = 0; 286 info->regs.pt.xds = 0;
260 info->regs.__null_es = 0; 287 info->regs.pt.xes = 0;
288 info->regs.pt.xgs = 0;
261 289
262/* we are clearing fs,gs later just before "jmp resume_userspace", 290/* we are clearing fs later just before "jmp resume_userspace",
263 * because starting with Linux 2.1.x they aren't no longer saved/restored 291 * because it is not saved/restored.
264 */ 292 */
265 293
266/* 294/*
@@ -268,10 +296,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
268 * has set it up safely, so this makes sure interrupt etc flags are 296 * has set it up safely, so this makes sure interrupt etc flags are
269 * inherited from protected mode. 297 * inherited from protected mode.
270 */ 298 */
271 VEFLAGS = info->regs.eflags; 299 VEFLAGS = info->regs.pt.eflags;
272 info->regs.eflags &= SAFE_MASK; 300 info->regs.pt.eflags &= SAFE_MASK;
273 info->regs.eflags |= info->regs32->eflags & ~SAFE_MASK; 301 info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
274 info->regs.eflags |= VM_MASK; 302 info->regs.pt.eflags |= VM_MASK;
275 303
276 switch (info->cpu_type) { 304 switch (info->cpu_type) {
277 case CPU_286: 305 case CPU_286:
@@ -294,7 +322,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
294 info->regs32->eax = 0; 322 info->regs32->eax = 0;
295 tsk->thread.saved_esp0 = tsk->thread.esp0; 323 tsk->thread.saved_esp0 = tsk->thread.esp0;
296 savesegment(fs, tsk->thread.saved_fs); 324 savesegment(fs, tsk->thread.saved_fs);
297 savesegment(gs, tsk->thread.saved_gs); 325 tsk->thread.saved_gs = info->regs32->xgs;
298 326
299 tss = &per_cpu(init_tss, get_cpu()); 327 tss = &per_cpu(init_tss, get_cpu());
300 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; 328 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -306,19 +334,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
306 tsk->thread.screen_bitmap = info->screen_bitmap; 334 tsk->thread.screen_bitmap = info->screen_bitmap;
307 if (info->flags & VM86_SCREEN_BITMAP) 335 if (info->flags & VM86_SCREEN_BITMAP)
308 mark_screen_rdonly(tsk->mm); 336 mark_screen_rdonly(tsk->mm);
309 __asm__ __volatile__("xorl %eax,%eax; movl %eax,%fs; movl %eax,%gs\n\t");
310 __asm__ __volatile__("movl %%eax, %0\n" :"=r"(eax));
311 337
312 /*call audit_syscall_exit since we do not exit via the normal paths */ 338 /*call audit_syscall_exit since we do not exit via the normal paths */
313 if (unlikely(current->audit_context)) 339 if (unlikely(current->audit_context))
314 audit_syscall_exit(AUDITSC_RESULT(eax), eax); 340 audit_syscall_exit(AUDITSC_RESULT(0), 0);
315 341
316 __asm__ __volatile__( 342 __asm__ __volatile__(
317 "movl %0,%%esp\n\t" 343 "movl %0,%%esp\n\t"
318 "movl %1,%%ebp\n\t" 344 "movl %1,%%ebp\n\t"
345 "mov %2, %%fs\n\t"
319 "jmp resume_userspace" 346 "jmp resume_userspace"
320 : /* no outputs */ 347 : /* no outputs */
321 :"r" (&info->regs), "r" (task_thread_info(tsk))); 348 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
322 /* we never return here */ 349 /* we never return here */
323} 350}
324 351
@@ -348,12 +375,12 @@ static inline void clear_IF(struct kernel_vm86_regs * regs)
348 375
349static inline void clear_TF(struct kernel_vm86_regs * regs) 376static inline void clear_TF(struct kernel_vm86_regs * regs)
350{ 377{
351 regs->eflags &= ~TF_MASK; 378 regs->pt.eflags &= ~TF_MASK;
352} 379}
353 380
354static inline void clear_AC(struct kernel_vm86_regs * regs) 381static inline void clear_AC(struct kernel_vm86_regs * regs)
355{ 382{
356 regs->eflags &= ~AC_MASK; 383 regs->pt.eflags &= ~AC_MASK;
357} 384}
358 385
359/* It is correct to call set_IF(regs) from the set_vflags_* 386/* It is correct to call set_IF(regs) from the set_vflags_*
@@ -370,7 +397,7 @@ static inline void clear_AC(struct kernel_vm86_regs * regs)
370static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) 397static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
371{ 398{
372 set_flags(VEFLAGS, eflags, current->thread.v86mask); 399 set_flags(VEFLAGS, eflags, current->thread.v86mask);
373 set_flags(regs->eflags, eflags, SAFE_MASK); 400 set_flags(regs->pt.eflags, eflags, SAFE_MASK);
374 if (eflags & IF_MASK) 401 if (eflags & IF_MASK)
375 set_IF(regs); 402 set_IF(regs);
376 else 403 else
@@ -380,7 +407,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs
380static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) 407static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
381{ 408{
382 set_flags(VFLAGS, flags, current->thread.v86mask); 409 set_flags(VFLAGS, flags, current->thread.v86mask);
383 set_flags(regs->eflags, flags, SAFE_MASK); 410 set_flags(regs->pt.eflags, flags, SAFE_MASK);
384 if (flags & IF_MASK) 411 if (flags & IF_MASK)
385 set_IF(regs); 412 set_IF(regs);
386 else 413 else
@@ -389,7 +416,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg
389 416
390static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) 417static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
391{ 418{
392 unsigned long flags = regs->eflags & RETURN_MASK; 419 unsigned long flags = regs->pt.eflags & RETURN_MASK;
393 420
394 if (VEFLAGS & VIF_MASK) 421 if (VEFLAGS & VIF_MASK)
395 flags |= IF_MASK; 422 flags |= IF_MASK;
@@ -493,7 +520,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
493 unsigned long __user *intr_ptr; 520 unsigned long __user *intr_ptr;
494 unsigned long segoffs; 521 unsigned long segoffs;
495 522
496 if (regs->cs == BIOSSEG) 523 if (regs->pt.xcs == BIOSSEG)
497 goto cannot_handle; 524 goto cannot_handle;
498 if (is_revectored(i, &KVM86->int_revectored)) 525 if (is_revectored(i, &KVM86->int_revectored))
499 goto cannot_handle; 526 goto cannot_handle;
@@ -505,9 +532,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
505 if ((segoffs >> 16) == BIOSSEG) 532 if ((segoffs >> 16) == BIOSSEG)
506 goto cannot_handle; 533 goto cannot_handle;
507 pushw(ssp, sp, get_vflags(regs), cannot_handle); 534 pushw(ssp, sp, get_vflags(regs), cannot_handle);
508 pushw(ssp, sp, regs->cs, cannot_handle); 535 pushw(ssp, sp, regs->pt.xcs, cannot_handle);
509 pushw(ssp, sp, IP(regs), cannot_handle); 536 pushw(ssp, sp, IP(regs), cannot_handle);
510 regs->cs = segoffs >> 16; 537 regs->pt.xcs = segoffs >> 16;
511 SP(regs) -= 6; 538 SP(regs) -= 6;
512 IP(regs) = segoffs & 0xffff; 539 IP(regs) = segoffs & 0xffff;
513 clear_TF(regs); 540 clear_TF(regs);
@@ -524,7 +551,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno
524 if (VMPI.is_vm86pus) { 551 if (VMPI.is_vm86pus) {
525 if ( (trapno==3) || (trapno==1) ) 552 if ( (trapno==3) || (trapno==1) )
526 return_to_32bit(regs, VM86_TRAP + (trapno << 8)); 553 return_to_32bit(regs, VM86_TRAP + (trapno << 8));
527 do_int(regs, trapno, (unsigned char __user *) (regs->ss << 4), SP(regs)); 554 do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
528 return 0; 555 return 0;
529 } 556 }
530 if (trapno !=1) 557 if (trapno !=1)
@@ -560,10 +587,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
560 handle_vm86_trap(regs, 0, 1); \ 587 handle_vm86_trap(regs, 0, 1); \
561 return; } while (0) 588 return; } while (0)
562 589
563 orig_flags = *(unsigned short *)&regs->eflags; 590 orig_flags = *(unsigned short *)&regs->pt.eflags;
564 591
565 csp = (unsigned char __user *) (regs->cs << 4); 592 csp = (unsigned char __user *) (regs->pt.xcs << 4);
566 ssp = (unsigned char __user *) (regs->ss << 4); 593 ssp = (unsigned char __user *) (regs->pt.xss << 4);
567 sp = SP(regs); 594 sp = SP(regs);
568 ip = IP(regs); 595 ip = IP(regs);
569 596
@@ -650,7 +677,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
650 SP(regs) += 6; 677 SP(regs) += 6;
651 } 678 }
652 IP(regs) = newip; 679 IP(regs) = newip;
653 regs->cs = newcs; 680 regs->pt.xcs = newcs;
654 CHECK_IF_IN_TRAP; 681 CHECK_IF_IN_TRAP;
655 if (data32) { 682 if (data32) {
656 set_vflags_long(newflags, regs); 683 set_vflags_long(newflags, regs);
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 1e7ac1c44ddc..a53c8b1854b5 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -1,18 +1,32 @@
1/* ld script to make i386 Linux kernel 1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; 2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 *
4 * Don't define absolute symbols until and unless you know that symbol
5 * value is should remain constant even if kernel image is relocated
6 * at run time. Absolute symbols are not relocated. If symbol value should
7 * change if kernel is relocated, make the symbol section relative and
8 * put it inside the section definition.
3 */ 9 */
4 10
11/* Don't define absolute symbols until and unless you know that symbol
12 * value is should remain constant even if kernel image is relocated
13 * at run time. Absolute symbols are not relocated. If symbol value should
14 * change if kernel is relocated, make the symbol section relative and
15 * put it inside the section definition.
16 */
5#define LOAD_OFFSET __PAGE_OFFSET 17#define LOAD_OFFSET __PAGE_OFFSET
6 18
7#include <asm-generic/vmlinux.lds.h> 19#include <asm-generic/vmlinux.lds.h>
8#include <asm/thread_info.h> 20#include <asm/thread_info.h>
9#include <asm/page.h> 21#include <asm/page.h>
10#include <asm/cache.h> 22#include <asm/cache.h>
23#include <asm/boot.h>
11 24
12OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") 25OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
13OUTPUT_ARCH(i386) 26OUTPUT_ARCH(i386)
14ENTRY(phys_startup_32) 27ENTRY(phys_startup_32)
15jiffies = jiffies_64; 28jiffies = jiffies_64;
29_proxy_pda = 0;
16 30
17PHDRS { 31PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */ 32 text PT_LOAD FLAGS(5); /* R_E */
@@ -21,46 +35,58 @@ PHDRS {
21} 35}
22SECTIONS 36SECTIONS
23{ 37{
24 . = __KERNEL_START; 38 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
25 phys_startup_32 = startup_32 - LOAD_OFFSET; 39 phys_startup_32 = startup_32 - LOAD_OFFSET;
26 /* read-only */ 40 /* read-only */
27 _text = .; /* Text and read-only data */
28 .text : AT(ADDR(.text) - LOAD_OFFSET) { 41 .text : AT(ADDR(.text) - LOAD_OFFSET) {
42 _text = .; /* Text and read-only data */
29 *(.text) 43 *(.text)
30 SCHED_TEXT 44 SCHED_TEXT
31 LOCK_TEXT 45 LOCK_TEXT
32 KPROBES_TEXT 46 KPROBES_TEXT
33 *(.fixup) 47 *(.fixup)
34 *(.gnu.warning) 48 *(.gnu.warning)
35 } :text = 0x9090 49 _etext = .; /* End of text section */
36 50 } :text = 0x9090
37 _etext = .; /* End of text section */
38 51
39 . = ALIGN(16); /* Exception table */ 52 . = ALIGN(16); /* Exception table */
40 __start___ex_table = .; 53 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
41 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } 54 __start___ex_table = .;
42 __stop___ex_table = .; 55 *(__ex_table)
56 __stop___ex_table = .;
57 }
43 58
44 RODATA 59 RODATA
45 60
61 BUG_TABLE
62
46 . = ALIGN(4); 63 . = ALIGN(4);
47 __tracedata_start = .;
48 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { 64 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
65 __tracedata_start = .;
49 *(.tracedata) 66 *(.tracedata)
67 __tracedata_end = .;
50 } 68 }
51 __tracedata_end = .;
52 69
53 /* writeable */ 70 /* writeable */
71 . = ALIGN(4096);
54 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ 72 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
55 *(.data) 73 *(.data)
56 CONSTRUCTORS 74 CONSTRUCTORS
57 } :data 75 } :data
58 76
77 .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) {
78 __start_paravirtprobe = .;
79 *(.paravirtprobe)
80 __stop_paravirtprobe = .;
81 }
82
59 . = ALIGN(4096); 83 . = ALIGN(4096);
60 __nosave_begin = .; 84 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
61 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } 85 __nosave_begin = .;
62 . = ALIGN(4096); 86 *(.data.nosave)
63 __nosave_end = .; 87 . = ALIGN(4096);
88 __nosave_end = .;
89 }
64 90
65 . = ALIGN(4096); 91 . = ALIGN(4096);
66 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { 92 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
@@ -74,17 +100,10 @@ SECTIONS
74 100
75 /* rarely changed data like cpu maps */ 101 /* rarely changed data like cpu maps */
76 . = ALIGN(32); 102 . = ALIGN(32);
77 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } 103 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
78 _edata = .; /* End of data section */ 104 *(.data.read_mostly)
79 105 _edata = .; /* End of data section */
80#ifdef CONFIG_STACK_UNWIND
81 . = ALIGN(4);
82 .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
83 __start_unwind = .;
84 *(.eh_frame)
85 __end_unwind = .;
86 } 106 }
87#endif
88 107
89 . = ALIGN(THREAD_SIZE); /* init_task */ 108 . = ALIGN(THREAD_SIZE); /* init_task */
90 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { 109 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
@@ -93,94 +112,102 @@ SECTIONS
93 112
94 /* might get freed after init */ 113 /* might get freed after init */
95 . = ALIGN(4096); 114 . = ALIGN(4096);
96 __smp_alt_begin = .;
97 __smp_alt_instructions = .;
98 .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { 115 .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
116 __smp_alt_begin = .;
117 __smp_alt_instructions = .;
99 *(.smp_altinstructions) 118 *(.smp_altinstructions)
119 __smp_alt_instructions_end = .;
100 } 120 }
101 __smp_alt_instructions_end = .;
102 . = ALIGN(4); 121 . = ALIGN(4);
103 __smp_locks = .;
104 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 122 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
123 __smp_locks = .;
105 *(.smp_locks) 124 *(.smp_locks)
125 __smp_locks_end = .;
106 } 126 }
107 __smp_locks_end = .;
108 .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { 127 .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
109 *(.smp_altinstr_replacement) 128 *(.smp_altinstr_replacement)
129 __smp_alt_end = .;
110 } 130 }
131 /* will be freed after init
132 * Following ALIGN() is required to make sure no other data falls on the
133 * same page where __smp_alt_end is pointing as that page might be freed
134 * after boot. Always make sure that ALIGN() directive is present after
135 * the section which contains __smp_alt_end.
136 */
111 . = ALIGN(4096); 137 . = ALIGN(4096);
112 __smp_alt_end = .;
113 138
114 /* will be freed after init */ 139 /* will be freed after init */
115 . = ALIGN(4096); /* Init code and data */ 140 . = ALIGN(4096); /* Init code and data */
116 __init_begin = .;
117 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 141 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
142 __init_begin = .;
118 _sinittext = .; 143 _sinittext = .;
119 *(.init.text) 144 *(.init.text)
120 _einittext = .; 145 _einittext = .;
121 } 146 }
122 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } 147 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
123 . = ALIGN(16); 148 . = ALIGN(16);
124 __setup_start = .; 149 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
125 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } 150 __setup_start = .;
126 __setup_end = .; 151 *(.init.setup)
127 __initcall_start = .; 152 __setup_end = .;
153 }
128 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { 154 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
129 *(.initcall1.init) 155 __initcall_start = .;
130 *(.initcall2.init) 156 INITCALLS
131 *(.initcall3.init) 157 __initcall_end = .;
132 *(.initcall4.init) 158 }
133 *(.initcall5.init)
134 *(.initcall6.init)
135 *(.initcall7.init)
136 }
137 __initcall_end = .;
138 __con_initcall_start = .;
139 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { 159 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
160 __con_initcall_start = .;
140 *(.con_initcall.init) 161 *(.con_initcall.init)
162 __con_initcall_end = .;
141 } 163 }
142 __con_initcall_end = .;
143 SECURITY_INIT 164 SECURITY_INIT
144 . = ALIGN(4); 165 . = ALIGN(4);
145 __alt_instructions = .;
146 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 166 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
167 __alt_instructions = .;
147 *(.altinstructions) 168 *(.altinstructions)
169 __alt_instructions_end = .;
148 } 170 }
149 __alt_instructions_end = .;
150 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { 171 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
151 *(.altinstr_replacement) 172 *(.altinstr_replacement)
152 } 173 }
174 . = ALIGN(4);
175 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
176 __start_parainstructions = .;
177 *(.parainstructions)
178 __stop_parainstructions = .;
179 }
153 /* .exit.text is discard at runtime, not link time, to deal with references 180 /* .exit.text is discard at runtime, not link time, to deal with references
154 from .altinstructions and .eh_frame */ 181 from .altinstructions and .eh_frame */
155 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } 182 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
156 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } 183 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
157 . = ALIGN(4096); 184 . = ALIGN(4096);
158 __initramfs_start = .; 185 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
159 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } 186 __initramfs_start = .;
160 __initramfs_end = .; 187 *(.init.ramfs)
188 __initramfs_end = .;
189 }
161 . = ALIGN(L1_CACHE_BYTES); 190 . = ALIGN(L1_CACHE_BYTES);
162 __per_cpu_start = .; 191 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
163 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } 192 __per_cpu_start = .;
164 __per_cpu_end = .; 193 *(.data.percpu)
194 __per_cpu_end = .;
195 }
165 . = ALIGN(4096); 196 . = ALIGN(4096);
166 __init_end = .;
167 /* freed after init ends here */ 197 /* freed after init ends here */
168 198
169 __bss_start = .; /* BSS */
170 .bss.page_aligned : AT(ADDR(.bss.page_aligned) - LOAD_OFFSET) {
171 *(.bss.page_aligned)
172 }
173 .bss : AT(ADDR(.bss) - LOAD_OFFSET) { 199 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
200 __init_end = .;
201 __bss_start = .; /* BSS */
202 *(.bss.page_aligned)
174 *(.bss) 203 *(.bss)
204 . = ALIGN(4);
205 __bss_stop = .;
206 _end = . ;
207 /* This is where the kernel creates the early boot page tables */
208 . = ALIGN(4096);
209 pg0 = . ;
175 } 210 }
176 . = ALIGN(4);
177 __bss_stop = .;
178
179 _end = . ;
180
181 /* This is where the kernel creates the early boot page tables */
182 . = ALIGN(4096);
183 pg0 = .;
184 211
185 /* Sections to be discarded */ 212 /* Sections to be discarded */
186 /DISCARD/ : { 213 /DISCARD/ : {
diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c
index 258df6b4d7d7..d22cfc9d656c 100644
--- a/arch/i386/lib/usercopy.c
+++ b/arch/i386/lib/usercopy.c
@@ -9,6 +9,7 @@
9#include <linux/highmem.h> 9#include <linux/highmem.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/backing-dev.h>
12#include <asm/uaccess.h> 13#include <asm/uaccess.h>
13#include <asm/mmx.h> 14#include <asm/mmx.h>
14 15
@@ -741,7 +742,7 @@ survive:
741 742
742 if (retval == -ENOMEM && is_init(current)) { 743 if (retval == -ENOMEM && is_init(current)) {
743 up_read(&current->mm->mmap_sem); 744 up_read(&current->mm->mmap_sem);
744 blk_congestion_wait(WRITE, HZ/50); 745 congestion_wait(WRITE, HZ/50);
745 goto survive; 746 goto survive;
746 } 747 }
747 748
diff --git a/arch/i386/mach-generic/probe.c b/arch/i386/mach-generic/probe.c
index 94b1fd9cbe3c..a7b3999bb37a 100644
--- a/arch/i386/mach-generic/probe.c
+++ b/arch/i386/mach-generic/probe.c
@@ -45,7 +45,9 @@ static int __init parse_apic(char *arg)
45 return 0; 45 return 0;
46 } 46 }
47 } 47 }
48 return -ENOENT; 48
49 /* Parsed again by __setup for debug/verbose */
50 return 0;
49} 51}
50early_param("apic", parse_apic); 52early_param("apic", parse_apic);
51 53
diff --git a/arch/i386/mach-visws/setup.c b/arch/i386/mach-visws/setup.c
index 885c7cbfd478..233ee20907b9 100644
--- a/arch/i386/mach-visws/setup.c
+++ b/arch/i386/mach-visws/setup.c
@@ -6,6 +6,7 @@
6#include <linux/smp.h> 6#include <linux/smp.h>
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/interrupt.h> 8#include <linux/interrupt.h>
9#include <linux/module.h>
9 10
10#include <asm/fixmap.h> 11#include <asm/fixmap.h>
11#include <asm/arch_hooks.h> 12#include <asm/arch_hooks.h>
@@ -142,6 +143,8 @@ void __init time_init_hook(void)
142 143
143unsigned long sgivwfb_mem_phys; 144unsigned long sgivwfb_mem_phys;
144unsigned long sgivwfb_mem_size; 145unsigned long sgivwfb_mem_size;
146EXPORT_SYMBOL(sgivwfb_mem_phys);
147EXPORT_SYMBOL(sgivwfb_mem_size);
145 148
146long long mem_size __initdata = 0; 149long long mem_size __initdata = 0;
147 150
diff --git a/arch/i386/mach-visws/visws_apic.c b/arch/i386/mach-visws/visws_apic.c
index 07097ed48890..38c2b13124d9 100644
--- a/arch/i386/mach-visws/visws_apic.c
+++ b/arch/i386/mach-visws/visws_apic.c
@@ -122,7 +122,7 @@ static void end_cobalt_irq(unsigned int irq)
122 spin_unlock_irqrestore(&cobalt_lock, flags); 122 spin_unlock_irqrestore(&cobalt_lock, flags);
123} 123}
124 124
125static struct hw_interrupt_type cobalt_irq_type = { 125static struct irq_chip cobalt_irq_type = {
126 .typename = "Cobalt-APIC", 126 .typename = "Cobalt-APIC",
127 .startup = startup_cobalt_irq, 127 .startup = startup_cobalt_irq,
128 .shutdown = disable_cobalt_irq, 128 .shutdown = disable_cobalt_irq,
@@ -159,7 +159,7 @@ static void end_piix4_master_irq(unsigned int irq)
159 spin_unlock_irqrestore(&cobalt_lock, flags); 159 spin_unlock_irqrestore(&cobalt_lock, flags);
160} 160}
161 161
162static struct hw_interrupt_type piix4_master_irq_type = { 162static struct irq_chip piix4_master_irq_type = {
163 .typename = "PIIX4-master", 163 .typename = "PIIX4-master",
164 .startup = startup_piix4_master_irq, 164 .startup = startup_piix4_master_irq,
165 .ack = ack_cobalt_irq, 165 .ack = ack_cobalt_irq,
@@ -167,9 +167,8 @@ static struct hw_interrupt_type piix4_master_irq_type = {
167}; 167};
168 168
169 169
170static struct hw_interrupt_type piix4_virtual_irq_type = { 170static struct irq_chip piix4_virtual_irq_type = {
171 .typename = "PIIX4-virtual", 171 .typename = "PIIX4-virtual",
172 .startup = startup_8259A_irq,
173 .shutdown = disable_8259A_irq, 172 .shutdown = disable_8259A_irq,
174 .enable = enable_8259A_irq, 173 .enable = enable_8259A_irq,
175 .disable = disable_8259A_irq, 174 .disable = disable_8259A_irq,
diff --git a/arch/i386/mach-voyager/voyager_cat.c b/arch/i386/mach-voyager/voyager_cat.c
index f50c6c6ad680..943a9473b138 100644
--- a/arch/i386/mach-voyager/voyager_cat.c
+++ b/arch/i386/mach-voyager/voyager_cat.c
@@ -776,7 +776,7 @@ voyager_cat_init(void)
776 for(asic=0; asic < (*modpp)->num_asics; asic++) { 776 for(asic=0; asic < (*modpp)->num_asics; asic++) {
777 int j; 777 int j;
778 voyager_asic_t *asicp = *asicpp 778 voyager_asic_t *asicp = *asicpp
779 = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/ 779 = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/
780 voyager_sp_table_t *sp_table; 780 voyager_sp_table_t *sp_table;
781 voyager_at_t *asic_table; 781 voyager_at_t *asic_table;
782 voyager_jtt_t *jtag_table; 782 voyager_jtt_t *jtag_table;
@@ -785,7 +785,6 @@ voyager_cat_init(void)
785 printk("**WARNING** kmalloc failure in cat_init\n"); 785 printk("**WARNING** kmalloc failure in cat_init\n");
786 continue; 786 continue;
787 } 787 }
788 memset(asicp, 0, sizeof(voyager_asic_t));
789 asicpp = &(asicp->next); 788 asicpp = &(asicp->next);
790 asicp->asic_location = asic; 789 asicp->asic_location = asic;
791 sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset); 790 sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset);
@@ -851,8 +850,7 @@ voyager_cat_init(void)
851#endif 850#endif
852 851
853 { 852 {
854 struct resource *res = kmalloc(sizeof(struct resource),GFP_KERNEL); 853 struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL);
855 memset(res, 0, sizeof(struct resource));
856 res->name = kmalloc(128, GFP_KERNEL); 854 res->name = kmalloc(128, GFP_KERNEL);
857 sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i)); 855 sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i));
858 res->start = qic_addr; 856 res->start = qic_addr;
diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c
index f3fea2ad50fe..55428e656a3f 100644
--- a/arch/i386/mach-voyager/voyager_smp.c
+++ b/arch/i386/mach-voyager/voyager_smp.c
@@ -28,6 +28,7 @@
28#include <asm/pgalloc.h> 28#include <asm/pgalloc.h>
29#include <asm/tlbflush.h> 29#include <asm/tlbflush.h>
30#include <asm/arch_hooks.h> 30#include <asm/arch_hooks.h>
31#include <asm/pda.h>
31 32
32/* TLB state -- visible externally, indexed physically */ 33/* TLB state -- visible externally, indexed physically */
33DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 }; 34DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 };
@@ -422,6 +423,7 @@ find_smp_config(void)
422 VOYAGER_SUS_IN_CONTROL_PORT); 423 VOYAGER_SUS_IN_CONTROL_PORT);
423 424
424 current_thread_info()->cpu = boot_cpu_id; 425 current_thread_info()->cpu = boot_cpu_id;
426 write_pda(cpu_number, boot_cpu_id);
425} 427}
426 428
427/* 429/*
@@ -458,7 +460,7 @@ start_secondary(void *unused)
458 /* external functions not defined in the headers */ 460 /* external functions not defined in the headers */
459 extern void calibrate_delay(void); 461 extern void calibrate_delay(void);
460 462
461 cpu_init(); 463 secondary_cpu_init();
462 464
463 /* OK, we're in the routine */ 465 /* OK, we're in the routine */
464 ack_CPI(VIC_CPU_BOOT_CPI); 466 ack_CPI(VIC_CPU_BOOT_CPI);
@@ -578,6 +580,15 @@ do_boot_cpu(__u8 cpu)
578 /* init_tasks (in sched.c) is indexed logically */ 580 /* init_tasks (in sched.c) is indexed logically */
579 stack_start.esp = (void *) idle->thread.esp; 581 stack_start.esp = (void *) idle->thread.esp;
580 582
583 /* Pre-allocate and initialize the CPU's GDT and PDA so it
584 doesn't have to do any memory allocation during the
585 delicate CPU-bringup phase. */
586 if (!init_gdt(cpu, idle)) {
587 printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
588 cpucount--;
589 return;
590 }
591
581 irq_ctx_init(cpu); 592 irq_ctx_init(cpu);
582 593
583 /* Note: Don't modify initial ss override */ 594 /* Note: Don't modify initial ss override */
@@ -1963,4 +1974,5 @@ void __init
1963smp_setup_processor_id(void) 1974smp_setup_processor_id(void)
1964{ 1975{
1965 current_thread_info()->cpu = hard_smp_processor_id(); 1976 current_thread_info()->cpu = hard_smp_processor_id();
1977 write_pda(cpu_number, hard_smp_processor_id());
1966} 1978}
diff --git a/arch/i386/math-emu/fpu_emu.h b/arch/i386/math-emu/fpu_emu.h
index d62b20a3e660..65120f523853 100644
--- a/arch/i386/math-emu/fpu_emu.h
+++ b/arch/i386/math-emu/fpu_emu.h
@@ -57,6 +57,7 @@
57#define TAG_Special Const(2) /* De-normal, + or - infinity, 57#define TAG_Special Const(2) /* De-normal, + or - infinity,
58 or Not a Number */ 58 or Not a Number */
59#define TAG_Empty Const(3) /* empty */ 59#define TAG_Empty Const(3) /* empty */
60#define TAG_Error Const(0x80) /* probably need to abort */
60 61
61#define LOADED_DATA Const(10101) /* Special st() number to identify 62#define LOADED_DATA Const(10101) /* Special st() number to identify
62 loaded data (not on stack). */ 63 loaded data (not on stack). */
diff --git a/arch/i386/math-emu/fpu_entry.c b/arch/i386/math-emu/fpu_entry.c
index d93f16ef828f..ddf8fa3bbd01 100644
--- a/arch/i386/math-emu/fpu_entry.c
+++ b/arch/i386/math-emu/fpu_entry.c
@@ -742,7 +742,8 @@ int save_i387_soft(void *s387, struct _fpstate __user * buf)
742 S387->fcs &= ~0xf8000000; 742 S387->fcs &= ~0xf8000000;
743 S387->fos |= 0xffff0000; 743 S387->fos |= 0xffff0000;
744#endif /* PECULIAR_486 */ 744#endif /* PECULIAR_486 */
745 __copy_to_user(d, &S387->cwd, 7*4); 745 if (__copy_to_user(d, &S387->cwd, 7*4))
746 return -1;
746 RE_ENTRANT_CHECK_ON; 747 RE_ENTRANT_CHECK_ON;
747 748
748 d += 7*4; 749 d += 7*4;
diff --git a/arch/i386/math-emu/fpu_system.h b/arch/i386/math-emu/fpu_system.h
index bf26341c8bde..a3ae28c49ddd 100644
--- a/arch/i386/math-emu/fpu_system.h
+++ b/arch/i386/math-emu/fpu_system.h
@@ -68,6 +68,7 @@
68 68
69#define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \ 69#define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \
70 math_abort(FPU_info,SIGSEGV) 70 math_abort(FPU_info,SIGSEGV)
71#define FPU_abort math_abort(FPU_info, SIGSEGV)
71 72
72#undef FPU_IGNORE_CODE_SEGV 73#undef FPU_IGNORE_CODE_SEGV
73#ifdef FPU_IGNORE_CODE_SEGV 74#ifdef FPU_IGNORE_CODE_SEGV
diff --git a/arch/i386/math-emu/load_store.c b/arch/i386/math-emu/load_store.c
index 85314be2fef8..eebd6fb1c8a8 100644
--- a/arch/i386/math-emu/load_store.c
+++ b/arch/i386/math-emu/load_store.c
@@ -227,6 +227,8 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
227 case 027: /* fild m64int */ 227 case 027: /* fild m64int */
228 clear_C1(); 228 clear_C1();
229 loaded_tag = FPU_load_int64((long long __user *)data_address); 229 loaded_tag = FPU_load_int64((long long __user *)data_address);
230 if (loaded_tag == TAG_Error)
231 return 0;
230 FPU_settag0(loaded_tag); 232 FPU_settag0(loaded_tag);
231 break; 233 break;
232 case 030: /* fstenv m14/28byte */ 234 case 030: /* fstenv m14/28byte */
diff --git a/arch/i386/math-emu/reg_ld_str.c b/arch/i386/math-emu/reg_ld_str.c
index f06ed41d191d..e976caef6498 100644
--- a/arch/i386/math-emu/reg_ld_str.c
+++ b/arch/i386/math-emu/reg_ld_str.c
@@ -244,7 +244,8 @@ int FPU_load_int64(long long __user *_s)
244 244
245 RE_ENTRANT_CHECK_OFF; 245 RE_ENTRANT_CHECK_OFF;
246 FPU_access_ok(VERIFY_READ, _s, 8); 246 FPU_access_ok(VERIFY_READ, _s, 8);
247 copy_from_user(&s,_s,8); 247 if (copy_from_user(&s,_s,8))
248 FPU_abort;
248 RE_ENTRANT_CHECK_ON; 249 RE_ENTRANT_CHECK_ON;
249 250
250 if (s == 0) 251 if (s == 0)
@@ -907,7 +908,8 @@ int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d)
907 908
908 RE_ENTRANT_CHECK_OFF; 909 RE_ENTRANT_CHECK_OFF;
909 FPU_access_ok(VERIFY_WRITE,d,8); 910 FPU_access_ok(VERIFY_WRITE,d,8);
910 copy_to_user(d, &tll, 8); 911 if (copy_to_user(d, &tll, 8))
912 FPU_abort;
911 RE_ENTRANT_CHECK_ON; 913 RE_ENTRANT_CHECK_ON;
912 914
913 return 1; 915 return 1;
@@ -1336,7 +1338,8 @@ u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d)
1336 I387.soft.fcs &= ~0xf8000000; 1338 I387.soft.fcs &= ~0xf8000000;
1337 I387.soft.fos |= 0xffff0000; 1339 I387.soft.fos |= 0xffff0000;
1338#endif /* PECULIAR_486 */ 1340#endif /* PECULIAR_486 */
1339 __copy_to_user(d, &control_word, 7*4); 1341 if (__copy_to_user(d, &control_word, 7*4))
1342 FPU_abort;
1340 RE_ENTRANT_CHECK_ON; 1343 RE_ENTRANT_CHECK_ON;
1341 d += 0x1c; 1344 d += 0x1c;
1342 } 1345 }
@@ -1359,9 +1362,11 @@ void fsave(fpu_addr_modes addr_modes, u_char __user *data_address)
1359 FPU_access_ok(VERIFY_WRITE,d,80); 1362 FPU_access_ok(VERIFY_WRITE,d,80);
1360 1363
1361 /* Copy all registers in stack order. */ 1364 /* Copy all registers in stack order. */
1362 __copy_to_user(d, register_base+offset, other); 1365 if (__copy_to_user(d, register_base+offset, other))
1366 FPU_abort;
1363 if ( offset ) 1367 if ( offset )
1364 __copy_to_user(d+other, register_base, offset); 1368 if (__copy_to_user(d+other, register_base, offset))
1369 FPU_abort;
1365 RE_ENTRANT_CHECK_ON; 1370 RE_ENTRANT_CHECK_ON;
1366 1371
1367 finit(); 1372 finit();
diff --git a/arch/i386/mm/boot_ioremap.c b/arch/i386/mm/boot_ioremap.c
index 4de11f508c3a..4de95a17a7d4 100644
--- a/arch/i386/mm/boot_ioremap.c
+++ b/arch/i386/mm/boot_ioremap.c
@@ -16,6 +16,7 @@
16 */ 16 */
17 17
18#undef CONFIG_X86_PAE 18#undef CONFIG_X86_PAE
19#undef CONFIG_PARAVIRT
19#include <asm/page.h> 20#include <asm/page.h>
20#include <asm/pgtable.h> 21#include <asm/pgtable.h>
21#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index ddbdb0336f28..103b76e56a94 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -168,7 +168,7 @@ static void __init allocate_pgdat(int nid)
168 if (nid && node_has_online_mem(nid)) 168 if (nid && node_has_online_mem(nid))
169 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; 169 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
170 else { 170 else {
171 NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); 171 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
172 min_low_pfn += PFN_UP(sizeof(pg_data_t)); 172 min_low_pfn += PFN_UP(sizeof(pg_data_t));
173 } 173 }
174} 174}
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index 2581575786c1..aaaa4d225f7e 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -22,9 +22,9 @@
22#include <linux/highmem.h> 22#include <linux/highmem.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/kprobes.h> 24#include <linux/kprobes.h>
25#include <linux/uaccess.h>
25 26
26#include <asm/system.h> 27#include <asm/system.h>
27#include <asm/uaccess.h>
28#include <asm/desc.h> 28#include <asm/desc.h>
29#include <asm/kdebug.h> 29#include <asm/kdebug.h>
30#include <asm/segment.h> 30#include <asm/segment.h>
@@ -167,7 +167,7 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs,
167static int __is_prefetch(struct pt_regs *regs, unsigned long addr) 167static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
168{ 168{
169 unsigned long limit; 169 unsigned long limit;
170 unsigned long instr = get_segment_eip (regs, &limit); 170 unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
171 int scan_more = 1; 171 int scan_more = 1;
172 int prefetch = 0; 172 int prefetch = 0;
173 int i; 173 int i;
@@ -177,9 +177,9 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
177 unsigned char instr_hi; 177 unsigned char instr_hi;
178 unsigned char instr_lo; 178 unsigned char instr_lo;
179 179
180 if (instr > limit) 180 if (instr > (unsigned char *)limit)
181 break; 181 break;
182 if (__get_user(opcode, (unsigned char __user *) instr)) 182 if (probe_kernel_address(instr, opcode))
183 break; 183 break;
184 184
185 instr_hi = opcode & 0xf0; 185 instr_hi = opcode & 0xf0;
@@ -204,9 +204,9 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
204 case 0x00: 204 case 0x00:
205 /* Prefetch instruction is 0x0F0D or 0x0F18 */ 205 /* Prefetch instruction is 0x0F0D or 0x0F18 */
206 scan_more = 0; 206 scan_more = 0;
207 if (instr > limit) 207 if (instr > (unsigned char *)limit)
208 break; 208 break;
209 if (__get_user(opcode, (unsigned char __user *) instr)) 209 if (probe_kernel_address(instr, opcode))
210 break; 210 break;
211 prefetch = (instr_lo == 0xF) && 211 prefetch = (instr_lo == 0xF) &&
212 (opcode == 0x0D || opcode == 0x18); 212 (opcode == 0x0D || opcode == 0x18);
diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c
index f9f647cdbc7b..e0fa6cb655a8 100644
--- a/arch/i386/mm/highmem.c
+++ b/arch/i386/mm/highmem.c
@@ -32,7 +32,7 @@ void *kmap_atomic(struct page *page, enum km_type type)
32 unsigned long vaddr; 32 unsigned long vaddr;
33 33
34 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 34 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
35 inc_preempt_count(); 35 pagefault_disable();
36 if (!PageHighMem(page)) 36 if (!PageHighMem(page))
37 return page_address(page); 37 return page_address(page);
38 38
@@ -50,26 +50,22 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
50 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 50 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
51 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); 51 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
52 52
53#ifdef CONFIG_DEBUG_HIGHMEM
54 if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) {
55 dec_preempt_count();
56 preempt_check_resched();
57 return;
58 }
59
60 if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
61 BUG();
62#endif
63 /* 53 /*
64 * Force other mappings to Oops if they'll try to access this pte 54 * Force other mappings to Oops if they'll try to access this pte
65 * without first remap it. Keeping stale mappings around is a bad idea 55 * without first remap it. Keeping stale mappings around is a bad idea
66 * also, in case the page changes cacheability attributes or becomes 56 * also, in case the page changes cacheability attributes or becomes
67 * a protected page in a hypervisor. 57 * a protected page in a hypervisor.
68 */ 58 */
69 kpte_clear_flush(kmap_pte-idx, vaddr); 59 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
60 kpte_clear_flush(kmap_pte-idx, vaddr);
61 else {
62#ifdef CONFIG_DEBUG_HIGHMEM
63 BUG_ON(vaddr < PAGE_OFFSET);
64 BUG_ON(vaddr >= (unsigned long)high_memory);
65#endif
66 }
70 67
71 dec_preempt_count(); 68 pagefault_enable();
72 preempt_check_resched();
73} 69}
74 70
75/* This is the same as kmap_atomic() but can map memory that doesn't 71/* This is the same as kmap_atomic() but can map memory that doesn't
@@ -80,7 +76,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
80 enum fixed_addresses idx; 76 enum fixed_addresses idx;
81 unsigned long vaddr; 77 unsigned long vaddr;
82 78
83 inc_preempt_count(); 79 pagefault_disable();
84 80
85 idx = type + KM_TYPE_NR*smp_processor_id(); 81 idx = type + KM_TYPE_NR*smp_processor_id();
86 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 82 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 1719a8141f81..34728e4afe48 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -17,6 +17,113 @@
17#include <asm/tlb.h> 17#include <asm/tlb.h>
18#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
19 19
20static unsigned long page_table_shareable(struct vm_area_struct *svma,
21 struct vm_area_struct *vma,
22 unsigned long addr, pgoff_t idx)
23{
24 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
25 svma->vm_start;
26 unsigned long sbase = saddr & PUD_MASK;
27 unsigned long s_end = sbase + PUD_SIZE;
28
29 /*
30 * match the virtual addresses, permission and the alignment of the
31 * page table page.
32 */
33 if (pmd_index(addr) != pmd_index(saddr) ||
34 vma->vm_flags != svma->vm_flags ||
35 sbase < svma->vm_start || svma->vm_end < s_end)
36 return 0;
37
38 return saddr;
39}
40
41static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
42{
43 unsigned long base = addr & PUD_MASK;
44 unsigned long end = base + PUD_SIZE;
45
46 /*
47 * check on proper vm_flags and page table alignment
48 */
49 if (vma->vm_flags & VM_MAYSHARE &&
50 vma->vm_start <= base && end <= vma->vm_end)
51 return 1;
52 return 0;
53}
54
55/*
56 * search for a shareable pmd page for hugetlb.
57 */
58static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
59{
60 struct vm_area_struct *vma = find_vma(mm, addr);
61 struct address_space *mapping = vma->vm_file->f_mapping;
62 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
63 vma->vm_pgoff;
64 struct prio_tree_iter iter;
65 struct vm_area_struct *svma;
66 unsigned long saddr;
67 pte_t *spte = NULL;
68
69 if (!vma_shareable(vma, addr))
70 return;
71
72 spin_lock(&mapping->i_mmap_lock);
73 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
74 if (svma == vma)
75 continue;
76
77 saddr = page_table_shareable(svma, vma, addr, idx);
78 if (saddr) {
79 spte = huge_pte_offset(svma->vm_mm, saddr);
80 if (spte) {
81 get_page(virt_to_page(spte));
82 break;
83 }
84 }
85 }
86
87 if (!spte)
88 goto out;
89
90 spin_lock(&mm->page_table_lock);
91 if (pud_none(*pud))
92 pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
93 else
94 put_page(virt_to_page(spte));
95 spin_unlock(&mm->page_table_lock);
96out:
97 spin_unlock(&mapping->i_mmap_lock);
98}
99
100/*
101 * unmap huge page backed by shared pte.
102 *
103 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
104 * indicated by page_count > 1, unmap is achieved by clearing pud and
105 * decrementing the ref count. If count == 1, the pte page is not shared.
106 *
107 * called with vma->vm_mm->page_table_lock held.
108 *
109 * returns: 1 successfully unmapped a shared pte page
110 * 0 the underlying pte page is not shared, or it is the last user
111 */
112int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
113{
114 pgd_t *pgd = pgd_offset(mm, *addr);
115 pud_t *pud = pud_offset(pgd, *addr);
116
117 BUG_ON(page_count(virt_to_page(ptep)) == 0);
118 if (page_count(virt_to_page(ptep)) == 1)
119 return 0;
120
121 pud_clear(pud);
122 put_page(virt_to_page(ptep));
123 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
124 return 1;
125}
126
20pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) 127pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
21{ 128{
22 pgd_t *pgd; 129 pgd_t *pgd;
@@ -25,8 +132,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
25 132
26 pgd = pgd_offset(mm, addr); 133 pgd = pgd_offset(mm, addr);
27 pud = pud_alloc(mm, pgd, addr); 134 pud = pud_alloc(mm, pgd, addr);
28 if (pud) 135 if (pud) {
136 if (pud_none(*pud))
137 huge_pmd_share(mm, addr, pud);
29 pte = (pte_t *) pmd_alloc(mm, pud, addr); 138 pte = (pte_t *) pmd_alloc(mm, pud, addr);
139 }
30 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); 140 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
31 141
32 return pte; 142 return pte;
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 167416155ee4..84697dfc7348 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -192,8 +192,6 @@ static inline int page_kills_ppro(unsigned long pagenr)
192 return 0; 192 return 0;
193} 193}
194 194
195extern int is_available_memory(efi_memory_desc_t *);
196
197int page_is_ram(unsigned long pagenr) 195int page_is_ram(unsigned long pagenr)
198{ 196{
199 int i; 197 int i;
@@ -699,8 +697,8 @@ int remove_memory(u64 start, u64 size)
699#endif 697#endif
700#endif 698#endif
701 699
702kmem_cache_t *pgd_cache; 700struct kmem_cache *pgd_cache;
703kmem_cache_t *pmd_cache; 701struct kmem_cache *pmd_cache;
704 702
705void __init pgtable_cache_init(void) 703void __init pgtable_cache_init(void)
706{ 704{
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 8564b6ae17e3..ad91528bdc14 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -67,11 +67,17 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
67 return base; 67 return base;
68} 68}
69 69
70static void flush_kernel_map(void *dummy) 70static void flush_kernel_map(void *arg)
71{ 71{
72 /* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */ 72 unsigned long adr = (unsigned long)arg;
73 if (boot_cpu_data.x86_model >= 4) 73
74 if (adr && cpu_has_clflush) {
75 int i;
76 for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
77 asm volatile("clflush (%0)" :: "r" (adr + i));
78 } else if (boot_cpu_data.x86_model >= 4)
74 wbinvd(); 79 wbinvd();
80
75 /* Flush all to work around Errata in early athlons regarding 81 /* Flush all to work around Errata in early athlons regarding
76 * large page flushing. 82 * large page flushing.
77 */ 83 */
@@ -173,9 +179,9 @@ __change_page_attr(struct page *page, pgprot_t prot)
173 return 0; 179 return 0;
174} 180}
175 181
176static inline void flush_map(void) 182static inline void flush_map(void *adr)
177{ 183{
178 on_each_cpu(flush_kernel_map, NULL, 1, 1); 184 on_each_cpu(flush_kernel_map, adr, 1, 1);
179} 185}
180 186
181/* 187/*
@@ -217,9 +223,13 @@ void global_flush_tlb(void)
217 spin_lock_irq(&cpa_lock); 223 spin_lock_irq(&cpa_lock);
218 list_replace_init(&df_list, &l); 224 list_replace_init(&df_list, &l);
219 spin_unlock_irq(&cpa_lock); 225 spin_unlock_irq(&cpa_lock);
220 flush_map(); 226 if (!cpu_has_clflush)
221 list_for_each_entry_safe(pg, next, &l, lru) 227 flush_map(0);
228 list_for_each_entry_safe(pg, next, &l, lru) {
229 if (cpu_has_clflush)
230 flush_map(page_address(pg));
222 __free_page(pg); 231 __free_page(pg);
232 }
223} 233}
224 234
225#ifdef CONFIG_DEBUG_PAGEALLOC 235#ifdef CONFIG_DEBUG_PAGEALLOC
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 10126e3f8174..f349eaf450b0 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -95,8 +95,11 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
95 return; 95 return;
96 } 96 }
97 pte = pte_offset_kernel(pmd, vaddr); 97 pte = pte_offset_kernel(pmd, vaddr);
98 /* <pfn,flags> stored as-is, to permit clearing entries */ 98 if (pgprot_val(flags))
99 set_pte(pte, pfn_pte(pfn, flags)); 99 /* <pfn,flags> stored as-is, to permit clearing entries */
100 set_pte(pte, pfn_pte(pfn, flags));
101 else
102 pte_clear(&init_mm, vaddr, pte);
100 103
101 /* 104 /*
102 * It's enough to flush this one mapping. 105 * It's enough to flush this one mapping.
@@ -193,7 +196,7 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
193 return pte; 196 return pte;
194} 197}
195 198
196void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) 199void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
197{ 200{
198 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); 201 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
199} 202}
@@ -233,7 +236,7 @@ static inline void pgd_list_del(pgd_t *pgd)
233 set_page_private(next, (unsigned long)pprev); 236 set_page_private(next, (unsigned long)pprev);
234} 237}
235 238
236void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) 239void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
237{ 240{
238 unsigned long flags; 241 unsigned long flags;
239 242
@@ -253,7 +256,7 @@ void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
253} 256}
254 257
255/* never called when PTRS_PER_PMD > 1 */ 258/* never called when PTRS_PER_PMD > 1 */
256void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) 259void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
257{ 260{
258 unsigned long flags; /* can be called from interrupt context */ 261 unsigned long flags; /* can be called from interrupt context */
259 262
diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c
index 68bce194e688..53ca6e897984 100644
--- a/arch/i386/pci/common.c
+++ b/arch/i386/pci/common.c
@@ -20,6 +20,7 @@
20unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | 20unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
21 PCI_PROBE_MMCONF; 21 PCI_PROBE_MMCONF;
22 22
23static int pci_bf_sort;
23int pci_routeirq; 24int pci_routeirq;
24int pcibios_last_bus = -1; 25int pcibios_last_bus = -1;
25unsigned long pirq_table_addr; 26unsigned long pirq_table_addr;
@@ -118,6 +119,20 @@ void __devinit pcibios_fixup_bus(struct pci_bus *b)
118} 119}
119 120
120/* 121/*
122 * Only use DMI information to set this if nothing was passed
123 * on the kernel command line (which was parsed earlier).
124 */
125
126static int __devinit set_bf_sort(struct dmi_system_id *d)
127{
128 if (pci_bf_sort == pci_bf_sort_default) {
129 pci_bf_sort = pci_dmi_bf;
130 printk(KERN_INFO "PCI: %s detected, enabling pci=bfsort.\n", d->ident);
131 }
132 return 0;
133}
134
135/*
121 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus) 136 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
122 */ 137 */
123#ifdef __i386__ 138#ifdef __i386__
@@ -130,11 +145,11 @@ static int __devinit assign_all_busses(struct dmi_system_id *d)
130} 145}
131#endif 146#endif
132 147
148static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
149#ifdef __i386__
133/* 150/*
134 * Laptops which need pci=assign-busses to see Cardbus cards 151 * Laptops which need pci=assign-busses to see Cardbus cards
135 */ 152 */
136static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
137#ifdef __i386__
138 { 153 {
139 .callback = assign_all_busses, 154 .callback = assign_all_busses,
140 .ident = "Samsung X20 Laptop", 155 .ident = "Samsung X20 Laptop",
@@ -144,6 +159,38 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
144 }, 159 },
145 }, 160 },
146#endif /* __i386__ */ 161#endif /* __i386__ */
162 {
163 .callback = set_bf_sort,
164 .ident = "Dell PowerEdge 1950",
165 .matches = {
166 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
167 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1950"),
168 },
169 },
170 {
171 .callback = set_bf_sort,
172 .ident = "Dell PowerEdge 1955",
173 .matches = {
174 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
175 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1955"),
176 },
177 },
178 {
179 .callback = set_bf_sort,
180 .ident = "Dell PowerEdge 2900",
181 .matches = {
182 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
183 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2900"),
184 },
185 },
186 {
187 .callback = set_bf_sort,
188 .ident = "Dell PowerEdge 2950",
189 .matches = {
190 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
191 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2950"),
192 },
193 },
147 {} 194 {}
148}; 195};
149 196
@@ -189,6 +236,8 @@ static int __init pcibios_init(void)
189 236
190 pcibios_resource_survey(); 237 pcibios_resource_survey();
191 238
239 if (pci_bf_sort >= pci_force_bf)
240 pci_sort_breadthfirst();
192#ifdef CONFIG_PCI_BIOS 241#ifdef CONFIG_PCI_BIOS
193 if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT)) 242 if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT))
194 pcibios_sort(); 243 pcibios_sort();
@@ -203,6 +252,12 @@ char * __devinit pcibios_setup(char *str)
203 if (!strcmp(str, "off")) { 252 if (!strcmp(str, "off")) {
204 pci_probe = 0; 253 pci_probe = 0;
205 return NULL; 254 return NULL;
255 } else if (!strcmp(str, "bfsort")) {
256 pci_bf_sort = pci_force_bf;
257 return NULL;
258 } else if (!strcmp(str, "nobfsort")) {
259 pci_bf_sort = pci_force_nobf;
260 return NULL;
206 } 261 }
207#ifdef CONFIG_PCI_BIOS 262#ifdef CONFIG_PCI_BIOS
208 else if (!strcmp(str, "bios")) { 263 else if (!strcmp(str, "bios")) {
@@ -288,7 +343,6 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
288 343
289void pcibios_disable_device (struct pci_dev *dev) 344void pcibios_disable_device (struct pci_dev *dev)
290{ 345{
291 pcibios_disable_resources(dev);
292 if (pcibios_disable_irq) 346 if (pcibios_disable_irq)
293 pcibios_disable_irq(dev); 347 pcibios_disable_irq(dev);
294} 348}
diff --git a/arch/i386/pci/early.c b/arch/i386/pci/early.c
index 713d6c866cae..42df4b6606df 100644
--- a/arch/i386/pci/early.c
+++ b/arch/i386/pci/early.c
@@ -45,6 +45,13 @@ void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
45 outl(val, 0xcfc); 45 outl(val, 0xcfc);
46} 46}
47 47
48void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
49{
50 PDprintk("%x writing to %x: %x\n", slot, offset, val);
51 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
52 outb(val, 0xcfc);
53}
54
48int early_pci_allowed(void) 55int early_pci_allowed(void)
49{ 56{
50 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) == 57 return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) ==
diff --git a/arch/i386/pci/fixup.c b/arch/i386/pci/fixup.c
index b60d7e8689ed..cde1170b01a1 100644
--- a/arch/i386/pci/fixup.c
+++ b/arch/i386/pci/fixup.c
@@ -74,52 +74,6 @@ static void __devinit pci_fixup_ncr53c810(struct pci_dev *d)
74} 74}
75DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C810, pci_fixup_ncr53c810); 75DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C810, pci_fixup_ncr53c810);
76 76
77static void __devinit pci_fixup_ide_bases(struct pci_dev *d)
78{
79 int i;
80
81 /*
82 * PCI IDE controllers use non-standard I/O port decoding, respect it.
83 */
84 if ((d->class >> 8) != PCI_CLASS_STORAGE_IDE)
85 return;
86 DBG("PCI: IDE base address fixup for %s\n", pci_name(d));
87 for(i=0; i<4; i++) {
88 struct resource *r = &d->resource[i];
89 if ((r->start & ~0x80) == 0x374) {
90 r->start |= 2;
91 r->end = r->start;
92 }
93 }
94}
95DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases);
96
97static void __devinit pci_fixup_ide_trash(struct pci_dev *d)
98{
99 int i;
100
101 /*
102 * Runs the fixup only for the first IDE controller
103 * (Shai Fultheim - shai@ftcon.com)
104 */
105 static int called = 0;
106 if (called)
107 return;
108 called = 1;
109
110 /*
111 * There exist PCI IDE controllers which have utter garbage
112 * in first four base registers. Ignore that.
113 */
114 DBG("PCI: IDE base address trash cleared for %s\n", pci_name(d));
115 for(i=0; i<4; i++)
116 d->resource[i].start = d->resource[i].end = d->resource[i].flags = 0;
117}
118DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5513, pci_fixup_ide_trash);
119DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_10, pci_fixup_ide_trash);
120DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_11, pci_fixup_ide_trash);
121DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_9, pci_fixup_ide_trash);
122
123static void __devinit pci_fixup_latency(struct pci_dev *d) 77static void __devinit pci_fixup_latency(struct pci_dev *d)
124{ 78{
125 /* 79 /*
@@ -348,8 +302,8 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_r
348 * From information provided by "Jon Smirl" <jonsmirl@gmail.com> 302 * From information provided by "Jon Smirl" <jonsmirl@gmail.com>
349 * 303 *
350 * The standard boot ROM sequence for an x86 machine uses the BIOS 304 * The standard boot ROM sequence for an x86 machine uses the BIOS
351 * to select an initial video card for boot display. This boot video 305 * to select an initial video card for boot display. This boot video
352 * card will have it's BIOS copied to C0000 in system RAM. 306 * card will have it's BIOS copied to C0000 in system RAM.
353 * IORESOURCE_ROM_SHADOW is used to associate the boot video 307 * IORESOURCE_ROM_SHADOW is used to associate the boot video
354 * card with this copy. On laptops this copy has to be used since 308 * card with this copy. On laptops this copy has to be used since
355 * the main ROM may be compressed or combined with another image. 309 * the main ROM may be compressed or combined with another image.
@@ -371,7 +325,17 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
371 bus = pdev->bus; 325 bus = pdev->bus;
372 while (bus) { 326 while (bus) {
373 bridge = bus->self; 327 bridge = bus->self;
374 if (bridge) { 328
329 /*
330 * From information provided by
331 * "David Miller" <davem@davemloft.net>
332 * The bridge control register is valid for PCI header
333 * type BRIDGE, or CARDBUS. Host to PCI controllers use
334 * PCI header type NORMAL.
335 */
336 if (bridge
337 &&((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE)
338 ||(bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) {
375 pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, 339 pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
376 &config); 340 &config);
377 if (!(config & PCI_BRIDGE_CTL_VGA)) 341 if (!(config & PCI_BRIDGE_CTL_VGA))
diff --git a/arch/i386/pci/i386.c b/arch/i386/pci/i386.c
index 10154a2cac68..43005f044424 100644
--- a/arch/i386/pci/i386.c
+++ b/arch/i386/pci/i386.c
@@ -104,16 +104,24 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
104 /* Depth-First Search on bus tree */ 104 /* Depth-First Search on bus tree */
105 list_for_each_entry(bus, bus_list, node) { 105 list_for_each_entry(bus, bus_list, node) {
106 if ((dev = bus->self)) { 106 if ((dev = bus->self)) {
107 for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) { 107 for (idx = PCI_BRIDGE_RESOURCES;
108 idx < PCI_NUM_RESOURCES; idx++) {
108 r = &dev->resource[idx]; 109 r = &dev->resource[idx];
109 if (!r->flags) 110 if (!r->flags)
110 continue; 111 continue;
111 pr = pci_find_parent_resource(dev, r); 112 pr = pci_find_parent_resource(dev, r);
112 if (!r->start || !pr || request_resource(pr, r) < 0) { 113 if (!r->start || !pr ||
113 printk(KERN_ERR "PCI: Cannot allocate resource region %d of bridge %s\n", idx, pci_name(dev)); 114 request_resource(pr, r) < 0) {
114 /* Something is wrong with the region. 115 printk(KERN_ERR "PCI: Cannot allocate "
115 Invalidate the resource to prevent child 116 "resource region %d "
116 resource allocations in this range. */ 117 "of bridge %s\n",
118 idx, pci_name(dev));
119 /*
120 * Something is wrong with the region.
121 * Invalidate the resource to prevent
122 * child resource allocations in this
123 * range.
124 */
117 r->flags = 0; 125 r->flags = 0;
118 } 126 }
119 } 127 }
@@ -131,7 +139,7 @@ static void __init pcibios_allocate_resources(int pass)
131 139
132 for_each_pci_dev(dev) { 140 for_each_pci_dev(dev) {
133 pci_read_config_word(dev, PCI_COMMAND, &command); 141 pci_read_config_word(dev, PCI_COMMAND, &command);
134 for(idx = 0; idx < 6; idx++) { 142 for (idx = 0; idx < PCI_ROM_RESOURCE; idx++) {
135 r = &dev->resource[idx]; 143 r = &dev->resource[idx];
136 if (r->parent) /* Already allocated */ 144 if (r->parent) /* Already allocated */
137 continue; 145 continue;
@@ -142,11 +150,15 @@ static void __init pcibios_allocate_resources(int pass)
142 else 150 else
143 disabled = !(command & PCI_COMMAND_MEMORY); 151 disabled = !(command & PCI_COMMAND_MEMORY);
144 if (pass == disabled) { 152 if (pass == disabled) {
145 DBG("PCI: Resource %08lx-%08lx (f=%lx, d=%d, p=%d)\n", 153 DBG("PCI: Resource %08lx-%08lx "
154 "(f=%lx, d=%d, p=%d)\n",
146 r->start, r->end, r->flags, disabled, pass); 155 r->start, r->end, r->flags, disabled, pass);
147 pr = pci_find_parent_resource(dev, r); 156 pr = pci_find_parent_resource(dev, r);
148 if (!pr || request_resource(pr, r) < 0) { 157 if (!pr || request_resource(pr, r) < 0) {
149 printk(KERN_ERR "PCI: Cannot allocate resource region %d of device %s\n", idx, pci_name(dev)); 158 printk(KERN_ERR "PCI: Cannot allocate "
159 "resource region %d "
160 "of device %s\n",
161 idx, pci_name(dev));
150 /* We'll assign a new address later */ 162 /* We'll assign a new address later */
151 r->end -= r->start; 163 r->end -= r->start;
152 r->start = 0; 164 r->start = 0;
@@ -156,12 +168,16 @@ static void __init pcibios_allocate_resources(int pass)
156 if (!pass) { 168 if (!pass) {
157 r = &dev->resource[PCI_ROM_RESOURCE]; 169 r = &dev->resource[PCI_ROM_RESOURCE];
158 if (r->flags & IORESOURCE_ROM_ENABLE) { 170 if (r->flags & IORESOURCE_ROM_ENABLE) {
159 /* Turn the ROM off, leave the resource region, but keep it unregistered. */ 171 /* Turn the ROM off, leave the resource region,
172 * but keep it unregistered. */
160 u32 reg; 173 u32 reg;
161 DBG("PCI: Switching off ROM of %s\n", pci_name(dev)); 174 DBG("PCI: Switching off ROM of %s\n",
175 pci_name(dev));
162 r->flags &= ~IORESOURCE_ROM_ENABLE; 176 r->flags &= ~IORESOURCE_ROM_ENABLE;
163 pci_read_config_dword(dev, dev->rom_base_reg, &reg); 177 pci_read_config_dword(dev,
164 pci_write_config_dword(dev, dev->rom_base_reg, reg & ~PCI_ROM_ADDRESS_ENABLE); 178 dev->rom_base_reg, &reg);
179 pci_write_config_dword(dev, dev->rom_base_reg,
180 reg & ~PCI_ROM_ADDRESS_ENABLE);
165 } 181 }
166 } 182 }
167 } 183 }
@@ -173,9 +189,11 @@ static int __init pcibios_assign_resources(void)
173 struct resource *r, *pr; 189 struct resource *r, *pr;
174 190
175 if (!(pci_probe & PCI_ASSIGN_ROMS)) { 191 if (!(pci_probe & PCI_ASSIGN_ROMS)) {
176 /* Try to use BIOS settings for ROMs, otherwise let 192 /*
177 pci_assign_unassigned_resources() allocate the new 193 * Try to use BIOS settings for ROMs, otherwise let
178 addresses. */ 194 * pci_assign_unassigned_resources() allocate the new
195 * addresses.
196 */
179 for_each_pci_dev(dev) { 197 for_each_pci_dev(dev) {
180 r = &dev->resource[PCI_ROM_RESOURCE]; 198 r = &dev->resource[PCI_ROM_RESOURCE];
181 if (!r->flags || !r->start) 199 if (!r->flags || !r->start)
@@ -215,9 +233,9 @@ int pcibios_enable_resources(struct pci_dev *dev, int mask)
215 233
216 pci_read_config_word(dev, PCI_COMMAND, &cmd); 234 pci_read_config_word(dev, PCI_COMMAND, &cmd);
217 old_cmd = cmd; 235 old_cmd = cmd;
218 for(idx = 0; idx < PCI_NUM_RESOURCES; idx++) { 236 for (idx = 0; idx < PCI_NUM_RESOURCES; idx++) {
219 /* Only set up the requested stuff */ 237 /* Only set up the requested stuff */
220 if (!(mask & (1<<idx))) 238 if (!(mask & (1 << idx)))
221 continue; 239 continue;
222 240
223 r = &dev->resource[idx]; 241 r = &dev->resource[idx];
@@ -227,7 +245,9 @@ int pcibios_enable_resources(struct pci_dev *dev, int mask)
227 (!(r->flags & IORESOURCE_ROM_ENABLE))) 245 (!(r->flags & IORESOURCE_ROM_ENABLE)))
228 continue; 246 continue;
229 if (!r->start && r->end) { 247 if (!r->start && r->end) {
230 printk(KERN_ERR "PCI: Device %s not available because of resource collisions\n", pci_name(dev)); 248 printk(KERN_ERR "PCI: Device %s not available "
249 "because of resource collisions\n",
250 pci_name(dev));
231 return -EINVAL; 251 return -EINVAL;
232 } 252 }
233 if (r->flags & IORESOURCE_IO) 253 if (r->flags & IORESOURCE_IO)
@@ -236,21 +256,13 @@ int pcibios_enable_resources(struct pci_dev *dev, int mask)
236 cmd |= PCI_COMMAND_MEMORY; 256 cmd |= PCI_COMMAND_MEMORY;
237 } 257 }
238 if (cmd != old_cmd) { 258 if (cmd != old_cmd) {
239 printk("PCI: Enabling device %s (%04x -> %04x)\n", pci_name(dev), old_cmd, cmd); 259 printk("PCI: Enabling device %s (%04x -> %04x)\n",
260 pci_name(dev), old_cmd, cmd);
240 pci_write_config_word(dev, PCI_COMMAND, cmd); 261 pci_write_config_word(dev, PCI_COMMAND, cmd);
241 } 262 }
242 return 0; 263 return 0;
243} 264}
244 265
245void pcibios_disable_resources(struct pci_dev *dev)
246{
247 u16 cmd;
248
249 pci_read_config_word(dev, PCI_COMMAND, &cmd);
250 cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
251 pci_write_config_word(dev, PCI_COMMAND, cmd);
252}
253
254/* 266/*
255 * If we set up a device for bus mastering, we need to check the latency 267 * If we set up a device for bus mastering, we need to check the latency
256 * timer as certain crappy BIOSes forget to set it properly. 268 * timer as certain crappy BIOSes forget to set it properly.
@@ -267,7 +279,8 @@ void pcibios_set_master(struct pci_dev *dev)
267 lat = pcibios_max_latency; 279 lat = pcibios_max_latency;
268 else 280 else
269 return; 281 return;
270 printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n", pci_name(dev), lat); 282 printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n",
283 pci_name(dev), lat);
271 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); 284 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
272} 285}
273 286
diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c
index dbc4aae91959..f2cb942f8281 100644
--- a/arch/i386/pci/irq.c
+++ b/arch/i386/pci/irq.c
@@ -255,13 +255,13 @@ static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
255 */ 255 */
256static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) 256static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
257{ 257{
258 static const unsigned int pirqmap[4] = { 3, 2, 5, 1 }; 258 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
259 return read_config_nybble(router, 0x55, pirqmap[pirq-1]); 259 return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
260} 260}
261 261
262static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 262static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
263{ 263{
264 static const unsigned int pirqmap[4] = { 3, 2, 5, 1 }; 264 static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
265 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq); 265 write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
266 return 1; 266 return 1;
267} 267}
@@ -543,6 +543,12 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
543 case PCI_DEVICE_ID_INTEL_ICH8_2: 543 case PCI_DEVICE_ID_INTEL_ICH8_2:
544 case PCI_DEVICE_ID_INTEL_ICH8_3: 544 case PCI_DEVICE_ID_INTEL_ICH8_3:
545 case PCI_DEVICE_ID_INTEL_ICH8_4: 545 case PCI_DEVICE_ID_INTEL_ICH8_4:
546 case PCI_DEVICE_ID_INTEL_ICH9_0:
547 case PCI_DEVICE_ID_INTEL_ICH9_1:
548 case PCI_DEVICE_ID_INTEL_ICH9_2:
549 case PCI_DEVICE_ID_INTEL_ICH9_3:
550 case PCI_DEVICE_ID_INTEL_ICH9_4:
551 case PCI_DEVICE_ID_INTEL_ICH9_5:
546 r->name = "PIIX/ICH"; 552 r->name = "PIIX/ICH";
547 r->get = pirq_piix_get; 553 r->get = pirq_piix_get;
548 r->set = pirq_piix_set; 554 r->set = pirq_piix_set;
@@ -758,7 +764,7 @@ static void __init pirq_find_router(struct irq_router *r)
758 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", 764 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
759 rt->rtr_vendor, rt->rtr_device); 765 rt->rtr_vendor, rt->rtr_device);
760 766
761 pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn); 767 pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
762 if (!pirq_router_dev) { 768 if (!pirq_router_dev) {
763 DBG(KERN_DEBUG "PCI: Interrupt router not found at " 769 DBG(KERN_DEBUG "PCI: Interrupt router not found at "
764 "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); 770 "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
@@ -778,6 +784,8 @@ static void __init pirq_find_router(struct irq_router *r)
778 pirq_router_dev->vendor, 784 pirq_router_dev->vendor,
779 pirq_router_dev->device, 785 pirq_router_dev->device,
780 pci_name(pirq_router_dev)); 786 pci_name(pirq_router_dev));
787
788 /* The device remains referenced for the kernel lifetime */
781} 789}
782 790
783static struct irq_info *pirq_get_info(struct pci_dev *dev) 791static struct irq_info *pirq_get_info(struct pci_dev *dev)
diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c
index d0c3da3aa2aa..c6b6d9bbc453 100644
--- a/arch/i386/pci/mmconfig.c
+++ b/arch/i386/pci/mmconfig.c
@@ -154,38 +154,6 @@ static struct pci_raw_ops pci_mmcfg = {
154 .write = pci_mmcfg_write, 154 .write = pci_mmcfg_write,
155}; 155};
156 156
157
158static __init void pci_mmcfg_insert_resources(void)
159{
160#define PCI_MMCFG_RESOURCE_NAME_LEN 19
161 int i;
162 struct resource *res;
163 char *names;
164 unsigned num_buses;
165
166 res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
167 pci_mmcfg_config_num, GFP_KERNEL);
168
169 if (!res) {
170 printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
171 return;
172 }
173
174 names = (void *)&res[pci_mmcfg_config_num];
175 for (i = 0; i < pci_mmcfg_config_num; i++, res++) {
176 num_buses = pci_mmcfg_config[i].end_bus_number -
177 pci_mmcfg_config[i].start_bus_number + 1;
178 res->name = names;
179 snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u",
180 pci_mmcfg_config[i].pci_segment_group_number);
181 res->start = pci_mmcfg_config[i].base_address;
182 res->end = res->start + (num_buses << 20) - 1;
183 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
184 insert_resource(&iomem_resource, res);
185 names += PCI_MMCFG_RESOURCE_NAME_LEN;
186 }
187}
188
189/* K8 systems have some devices (typically in the builtin northbridge) 157/* K8 systems have some devices (typically in the builtin northbridge)
190 that are only accessible using type1 158 that are only accessible using type1
191 Normally this can be expressed in the MCFG by not listing them 159 Normally this can be expressed in the MCFG by not listing them
@@ -222,8 +190,6 @@ static __init void unreachable_devices(void)
222 } 190 }
223} 191}
224 192
225
226
227void __init pci_mmcfg_init(int type) 193void __init pci_mmcfg_init(int type)
228{ 194{
229 if ((pci_probe & PCI_PROBE_MMCONF) == 0) 195 if ((pci_probe & PCI_PROBE_MMCONF) == 0)
@@ -251,5 +217,4 @@ void __init pci_mmcfg_init(int type)
251 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 217 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
252 218
253 unreachable_devices(); 219 unreachable_devices();
254 pci_mmcfg_insert_resources();
255} 220}
diff --git a/arch/i386/pci/pcbios.c b/arch/i386/pci/pcbios.c
index ed1512a175ab..5f5193401bea 100644
--- a/arch/i386/pci/pcbios.c
+++ b/arch/i386/pci/pcbios.c
@@ -5,6 +5,7 @@
5#include <linux/pci.h> 5#include <linux/pci.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/uaccess.h>
8#include "pci.h" 9#include "pci.h"
9#include "pci-functions.h" 10#include "pci-functions.h"
10 11
@@ -314,6 +315,10 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
314 for (check = (union bios32 *) __va(0xe0000); 315 for (check = (union bios32 *) __va(0xe0000);
315 check <= (union bios32 *) __va(0xffff0); 316 check <= (union bios32 *) __va(0xffff0);
316 ++check) { 317 ++check) {
318 long sig;
319 if (probe_kernel_address(&check->fields.signature, sig))
320 continue;
321
317 if (check->fields.signature != BIOS32_SIGNATURE) 322 if (check->fields.signature != BIOS32_SIGNATURE)
318 continue; 323 continue;
319 length = check->fields.length * 16; 324 length = check->fields.length * 16;
@@ -331,11 +336,13 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
331 } 336 }
332 DBG("PCI: BIOS32 Service Directory structure at 0x%p\n", check); 337 DBG("PCI: BIOS32 Service Directory structure at 0x%p\n", check);
333 if (check->fields.entry >= 0x100000) { 338 if (check->fields.entry >= 0x100000) {
334 printk("PCI: BIOS32 entry (0x%p) in high memory, cannot use.\n", check); 339 printk("PCI: BIOS32 entry (0x%p) in high memory, "
340 "cannot use.\n", check);
335 return NULL; 341 return NULL;
336 } else { 342 } else {
337 unsigned long bios32_entry = check->fields.entry; 343 unsigned long bios32_entry = check->fields.entry;
338 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", bios32_entry); 344 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
345 bios32_entry);
339 bios32_indirect.address = bios32_entry + PAGE_OFFSET; 346 bios32_indirect.address = bios32_entry + PAGE_OFFSET;
340 if (check_pcibios()) 347 if (check_pcibios())
341 return &pci_bios_access; 348 return &pci_bios_access;
diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h
index 1814f74569c6..a0a25180b61a 100644
--- a/arch/i386/pci/pci.h
+++ b/arch/i386/pci/pci.h
@@ -30,13 +30,19 @@
30extern unsigned int pci_probe; 30extern unsigned int pci_probe;
31extern unsigned long pirq_table_addr; 31extern unsigned long pirq_table_addr;
32 32
33enum pci_bf_sort_state {
34 pci_bf_sort_default,
35 pci_force_nobf,
36 pci_force_bf,
37 pci_dmi_bf,
38};
39
33/* pci-i386.c */ 40/* pci-i386.c */
34 41
35extern unsigned int pcibios_max_latency; 42extern unsigned int pcibios_max_latency;
36 43
37void pcibios_resource_survey(void); 44void pcibios_resource_survey(void);
38int pcibios_enable_resources(struct pci_dev *, int); 45int pcibios_enable_resources(struct pci_dev *, int);
39void pcibios_disable_resources(struct pci_dev *);
40 46
41/* pci-pc.c */ 47/* pci-pc.c */
42 48
diff --git a/arch/i386/power/Makefile b/arch/i386/power/Makefile
index 8cfa4e8a719d..2de7bbf03cd7 100644
--- a/arch/i386/power/Makefile
+++ b/arch/i386/power/Makefile
@@ -1,2 +1,2 @@
1obj-$(CONFIG_PM) += cpu.o 1obj-$(CONFIG_PM) += cpu.o
2obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o 2obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o suspend.o
diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c
index 5a1abeff033b..2c15500f8713 100644
--- a/arch/i386/power/cpu.c
+++ b/arch/i386/power/cpu.c
@@ -26,8 +26,8 @@ void __save_processor_state(struct saved_context *ctxt)
26 /* 26 /*
27 * descriptor tables 27 * descriptor tables
28 */ 28 */
29 store_gdt(&ctxt->gdt_limit); 29 store_gdt(&ctxt->gdt);
30 store_idt(&ctxt->idt_limit); 30 store_idt(&ctxt->idt);
31 store_tr(ctxt->tr); 31 store_tr(ctxt->tr);
32 32
33 /* 33 /*
@@ -99,8 +99,8 @@ void __restore_processor_state(struct saved_context *ctxt)
99 * now restore the descriptor tables to their proper values 99 * now restore the descriptor tables to their proper values
100 * ltr is done in fix_processor_context(). 100 * ltr is done in fix_processor_context().
101 */ 101 */
102 load_gdt(&ctxt->gdt_limit); 102 load_gdt(&ctxt->gdt);
103 load_idt(&ctxt->idt_limit); 103 load_idt(&ctxt->idt);
104 104
105 /* 105 /*
106 * segment registers 106 * segment registers
diff --git a/arch/i386/power/suspend.c b/arch/i386/power/suspend.c
new file mode 100644
index 000000000000..db5e98d2eb73
--- /dev/null
+++ b/arch/i386/power/suspend.c
@@ -0,0 +1,158 @@
1/*
2 * Suspend support specific to i386 - temporary page tables
3 *
4 * Distribute under GPLv2
5 *
6 * Copyright (c) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 */
8
9#include <linux/suspend.h>
10#include <linux/bootmem.h>
11
12#include <asm/system.h>
13#include <asm/page.h>
14#include <asm/pgtable.h>
15
16/* Defined in arch/i386/power/swsusp.S */
17extern int restore_image(void);
18
19/* Pointer to the temporary resume page tables */
20pgd_t *resume_pg_dir;
21
22/* The following three functions are based on the analogous code in
23 * arch/i386/mm/init.c
24 */
25
26/*
27 * Create a middle page table on a resume-safe page and put a pointer to it in
28 * the given global directory entry. This only returns the gd entry
29 * in non-PAE compilation mode, since the middle layer is folded.
30 */
31static pmd_t *resume_one_md_table_init(pgd_t *pgd)
32{
33 pud_t *pud;
34 pmd_t *pmd_table;
35
36#ifdef CONFIG_X86_PAE
37 pmd_table = (pmd_t *)get_safe_page(GFP_ATOMIC);
38 if (!pmd_table)
39 return NULL;
40
41 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
42 pud = pud_offset(pgd, 0);
43
44 BUG_ON(pmd_table != pmd_offset(pud, 0));
45#else
46 pud = pud_offset(pgd, 0);
47 pmd_table = pmd_offset(pud, 0);
48#endif
49
50 return pmd_table;
51}
52
53/*
54 * Create a page table on a resume-safe page and place a pointer to it in
55 * a middle page directory entry.
56 */
57static pte_t *resume_one_page_table_init(pmd_t *pmd)
58{
59 if (pmd_none(*pmd)) {
60 pte_t *page_table = (pte_t *)get_safe_page(GFP_ATOMIC);
61 if (!page_table)
62 return NULL;
63
64 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
65
66 BUG_ON(page_table != pte_offset_kernel(pmd, 0));
67
68 return page_table;
69 }
70
71 return pte_offset_kernel(pmd, 0);
72}
73
74/*
75 * This maps the physical memory to kernel virtual address space, a total
76 * of max_low_pfn pages, by creating page tables starting from address
77 * PAGE_OFFSET. The page tables are allocated out of resume-safe pages.
78 */
79static int resume_physical_mapping_init(pgd_t *pgd_base)
80{
81 unsigned long pfn;
82 pgd_t *pgd;
83 pmd_t *pmd;
84 pte_t *pte;
85 int pgd_idx, pmd_idx;
86
87 pgd_idx = pgd_index(PAGE_OFFSET);
88 pgd = pgd_base + pgd_idx;
89 pfn = 0;
90
91 for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
92 pmd = resume_one_md_table_init(pgd);
93 if (!pmd)
94 return -ENOMEM;
95
96 if (pfn >= max_low_pfn)
97 continue;
98
99 for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD; pmd++, pmd_idx++) {
100 if (pfn >= max_low_pfn)
101 break;
102
103 /* Map with big pages if possible, otherwise create
104 * normal page tables.
105 * NOTE: We can mark everything as executable here
106 */
107 if (cpu_has_pse) {
108 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
109 pfn += PTRS_PER_PTE;
110 } else {
111 pte_t *max_pte;
112
113 pte = resume_one_page_table_init(pmd);
114 if (!pte)
115 return -ENOMEM;
116
117 max_pte = pte + PTRS_PER_PTE;
118 for (; pte < max_pte; pte++, pfn++) {
119 if (pfn >= max_low_pfn)
120 break;
121
122 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
123 }
124 }
125 }
126 }
127 return 0;
128}
129
130static inline void resume_init_first_level_page_table(pgd_t *pg_dir)
131{
132#ifdef CONFIG_X86_PAE
133 int i;
134
135 /* Init entries of the first-level page table to the zero page */
136 for (i = 0; i < PTRS_PER_PGD; i++)
137 set_pgd(pg_dir + i,
138 __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
139#endif
140}
141
142int swsusp_arch_resume(void)
143{
144 int error;
145
146 resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
147 if (!resume_pg_dir)
148 return -ENOMEM;
149
150 resume_init_first_level_page_table(resume_pg_dir);
151 error = resume_physical_mapping_init(resume_pg_dir);
152 if (error)
153 return error;
154
155 /* We have got enough memory and from now on we cannot recover */
156 restore_image();
157 return 0;
158}
diff --git a/arch/i386/power/swsusp.S b/arch/i386/power/swsusp.S
index 8a2b50a0aaad..53662e05b393 100644
--- a/arch/i386/power/swsusp.S
+++ b/arch/i386/power/swsusp.S
@@ -28,8 +28,9 @@ ENTRY(swsusp_arch_suspend)
28 call swsusp_save 28 call swsusp_save
29 ret 29 ret
30 30
31ENTRY(swsusp_arch_resume) 31ENTRY(restore_image)
32 movl $swsusp_pg_dir-__PAGE_OFFSET, %ecx 32 movl resume_pg_dir, %ecx
33 subl $__PAGE_OFFSET, %ecx
33 movl %ecx, %cr3 34 movl %ecx, %cr3
34 35
35 movl restore_pblist, %edx 36 movl restore_pblist, %edx
@@ -51,6 +52,10 @@ copy_loop:
51 .p2align 4,,7 52 .p2align 4,,7
52 53
53done: 54done:
55 /* go back to the original page tables */
56 movl $swapper_pg_dir, %ecx
57 subl $__PAGE_OFFSET, %ecx
58 movl %ecx, %cr3
54 /* Flush TLB, including "global" things (vmalloc) */ 59 /* Flush TLB, including "global" things (vmalloc) */
55 movl mmu_cr4_features, %eax 60 movl mmu_cr4_features, %eax
56 movl %eax, %edx 61 movl %eax, %edx